Coverage Report

Created: 2025-07-11 06:53

/src/htslib/header.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
Copyright (c) 2018-2020, 2023, 2025 Genome Research Ltd.
3
Authors: James Bonfield <jkb@sanger.ac.uk>, Valeriu Ohan <vo2@sanger.ac.uk>
4
5
Redistribution and use in source and binary forms, with or without
6
modification, are permitted provided that the following conditions are met:
7
8
   1. Redistributions of source code must retain the above copyright notice,
9
this list of conditions and the following disclaimer.
10
11
   2. Redistributions in binary form must reproduce the above copyright notice,
12
this list of conditions and the following disclaimer in the documentation
13
and/or other materials provided with the distribution.
14
15
   3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
16
Institute nor the names of its contributors may be used to endorse or promote
17
products derived from this software without specific prior written permission.
18
19
THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
20
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
23
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
*/
30
31
#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
32
#include <config.h>
33
34
#include <string.h>
35
#include <assert.h>
36
#include <errno.h>
37
#include "textutils_internal.h"
38
#include "header.h"
39
40
// Hash table for removing multiple lines from the header
41
KHASH_SET_INIT_STR(rm)
42
// Used for long refs in SAM files
43
KHASH_DECLARE(s2i, kh_cstr_t, int64_t)
44
45
typedef khash_t(rm) rmhash_t;
46
47
static int sam_hdr_link_pg(sam_hdr_t *bh);
48
49
static int sam_hrecs_vupdate(sam_hrecs_t *hrecs, sam_hrec_type_t *type, va_list ap);
50
static int sam_hrecs_update(sam_hrecs_t *hrecs, sam_hrec_type_t *type, ...);
51
52
53
3.74k
#define MAX_ERROR_QUOTE 320 // Prevent over-long error messages
54
3.20k
static void sam_hrecs_error(const char *msg, const char *line, size_t len, size_t lno) {
55
3.20k
    int j;
56
57
3.20k
    if (len > MAX_ERROR_QUOTE)
58
539
        len = MAX_ERROR_QUOTE;
59
115k
    for (j = 0; j < len && line[j] != '\n'; j++)
60
112k
        ;
61
3.20k
    hts_log_error("%s at line %zd: \"%.*s\"", msg, lno, j, line);
62
3.20k
}
63
64
/* ==== Static methods ==== */
65
66
43.3k
static int sam_hrecs_init_type_order(sam_hrecs_t *hrecs, char *type_list) {
67
43.3k
    if (!hrecs)
68
0
        return -1;
69
70
43.3k
    if (!type_list) {
71
43.3k
        hrecs->type_count = 5;
72
43.3k
        hrecs->type_order = calloc(hrecs->type_count, 3);
73
43.3k
        if (!hrecs->type_order)
74
0
            return -1;
75
43.3k
        memcpy(hrecs->type_order[0], "HD", 2);
76
43.3k
        memcpy(hrecs->type_order[1], "SQ", 2);
77
43.3k
        memcpy(hrecs->type_order[2], "RG", 2);
78
43.3k
        memcpy(hrecs->type_order[3], "PG", 2);
79
43.3k
        memcpy(hrecs->type_order[4], "CO", 2);
80
43.3k
    }
81
82
43.3k
    return 0;
83
43.3k
}
84
85
22.9k
static int sam_hrecs_add_ref_altnames(sam_hrecs_t *hrecs, int nref, const char *list) {
86
22.9k
    const char *token;
87
22.9k
    ks_tokaux_t aux;
88
89
22.9k
    if (!list)
90
22.4k
        return 0;
91
92
6.13k
    for (token = kstrtok(list, ",", &aux); token; token = kstrtok(NULL, NULL, &aux)) {
93
5.71k
        if (aux.p == token)
94
3.74k
            continue;
95
96
1.97k
        char *name = string_ndup(hrecs->str_pool, token, aux.p - token);
97
1.97k
        if (!name)
98
0
            return -1;
99
1.97k
        int r;
100
1.97k
        khint_t k = kh_put(m_s2i, hrecs->ref_hash, name, &r);
101
1.97k
        if (r < 0) return -1;
102
103
1.97k
        if (r > 0)
104
932
            kh_val(hrecs->ref_hash, k) = nref;
105
1.04k
        else if (kh_val(hrecs->ref_hash, k) != nref)
106
65
            hts_log_warning("Duplicate entry AN:\"%s\" in sam header", name);
107
1.97k
    }
108
109
420
    return 0;
110
420
}
111
112
0
static void sam_hrecs_remove_ref_altnames(sam_hrecs_t *hrecs, int expected, const char *list) {
113
0
    const char *token, *sn;
114
0
    ks_tokaux_t aux;
115
0
    kstring_t str = KS_INITIALIZE;
116
117
0
    if (expected < 0 || expected >= hrecs->nref)
118
0
        return;
119
0
    sn = hrecs->ref[expected].name;
120
121
0
    for (token = kstrtok(list, ",", &aux); token; token = kstrtok(NULL, NULL, &aux)) {
122
0
        kputsn(token, aux.p - token, ks_clear(&str));
123
0
        khint_t k = kh_get(m_s2i, hrecs->ref_hash, str.s);
124
0
        if (k != kh_end(hrecs->ref_hash)
125
0
            && kh_val(hrecs->ref_hash, k) == expected
126
0
            && strcmp(sn, str.s) != 0)
127
0
            kh_del(m_s2i, hrecs->ref_hash, k);
128
0
    }
129
130
0
    free(str.s);
131
0
}
132
133
/* Updates the hash tables in the sam_hrecs_t structure.
134
 *
135
 * Returns 0 on success;
136
 *        -1 on failure
137
 */
138
static int sam_hrecs_update_hashes(sam_hrecs_t *hrecs,
139
                                   khint32_t type,
140
95.2k
                                   sam_hrec_type_t *h_type) {
141
    /* Add to reference hash? */
142
95.2k
    if (type == TYPEKEY("SQ")) {
143
23.4k
        sam_hrec_tag_t *tag = h_type->tag;
144
23.4k
        int nref = hrecs->nref;
145
23.4k
        const char *name = NULL;
146
23.4k
        const char *altnames = NULL;
147
23.4k
        hts_pos_t len = -1;
148
23.4k
        int r, invLN = 0;
149
23.4k
        khint_t k;
150
151
103k
        while (tag) {
152
80.1k
            if (tag->str[0] == 'S' && tag->str[1] == 'N') {
153
32.9k
                assert(tag->len >= 3);
154
32.9k
                name = tag->str+3;
155
47.2k
            } else if (tag->str[0] == 'L' && tag->str[1] == 'N') {
156
28.7k
                assert(tag->len >= 3);
157
28.7k
                hts_pos_t tmp = len;
158
28.7k
                len = strtoll(tag->str+3, NULL, 10);
159
28.7k
                if (tmp != -1 && tmp != len) {  //duplicate and different LN
160
218
                    invLN = 1;
161
218
                }
162
28.7k
            } else if (tag->str[0] == 'A' && tag->str[1] == 'N') {
163
4.29k
                assert(tag->len >= 3);
164
4.29k
                altnames = tag->str+3;
165
4.29k
            }
166
80.1k
            tag = tag->next;
167
80.1k
        }
168
169
23.4k
        if (!name) {
170
185
            hts_log_error("Header includes @SQ line with no SN: tag");
171
185
            return -1; // SN should be present, according to spec.
172
185
        }
173
174
23.2k
        if (len == -1) {
175
118
            hts_log_error("Header includes @SQ line \"%s\" with no LN: tag",
176
118
                          name);
177
118
            return -1; // LN should be present, according to spec.
178
118
        }
179
180
23.1k
        if (invLN) {
181
190
            hts_log_error("Header includes @SQ line \"%s\" with multiple LN:"
182
190
            " tag with different values.", name);
183
190
            return -1; // LN should not be duplicated or be same
184
190
        }
185
186
        // Seen already?
187
22.9k
        k = kh_get(m_s2i, hrecs->ref_hash, name);
188
22.9k
        if (k < kh_end(hrecs->ref_hash)) {
189
20.4k
            nref = kh_val(hrecs->ref_hash, k);
190
20.4k
            int ref_changed_flag = 0;
191
192
            // Check for hash entry added by sam_hrecs_refs_from_targets_array()
193
20.4k
            if (hrecs->ref[nref].ty == NULL) {
194
                // Attach header line to existing stub entry.
195
20.3k
                hrecs->ref[nref].ty = h_type;
196
                // Check lengths match; correct if not.
197
20.3k
                if (len != hrecs->ref[nref].len) {
198
701
                    char tmp[32];
199
701
                    snprintf(tmp, sizeof(tmp), "%" PRIhts_pos,
200
701
                             hrecs->ref[nref].len);
201
701
                    if (sam_hrecs_update(hrecs, h_type, "LN", tmp, NULL) < 0)
202
0
                        return -1;
203
701
                    ref_changed_flag = 1;
204
701
                }
205
20.3k
                if (sam_hrecs_add_ref_altnames(hrecs, nref, altnames) < 0)
206
0
                    return -1;
207
208
20.3k
                if (ref_changed_flag && (hrecs->refs_changed < 0 || hrecs->refs_changed > nref))
209
626
                    hrecs->refs_changed = nref;
210
20.3k
                return 0;
211
20.3k
            }
212
213
            // Check to see if an existing entry is being updated
214
49
            if (hrecs->ref[nref].ty == h_type) {
215
0
                if (hrecs->ref[nref].len != len) {
216
0
                    hrecs->ref[nref].len = len;
217
0
                    ref_changed_flag = 1;
218
0
                }
219
0
                if (!hrecs->ref[nref].name || strcmp(hrecs->ref[nref].name, name)) {
220
0
                    hrecs->ref[nref].name = name;
221
0
                    ref_changed_flag = 1;
222
0
                }
223
0
                if (sam_hrecs_add_ref_altnames(hrecs, nref, altnames) < 0)
224
0
                    return -1;
225
226
0
                if (ref_changed_flag && (hrecs->refs_changed < 0 || hrecs->refs_changed > nref))
227
0
                    hrecs->refs_changed = nref;
228
0
                return 0;
229
0
            }
230
231
            // If here, the name is a duplicate.
232
            // Check to see if it matches the SN: tag from the earlier record.
233
49
            if (strcmp(hrecs->ref[nref].name, name) == 0) {
234
34
                hts_log_error("Duplicate entry \"%s\" in sam header",
235
34
                                name);
236
34
                return -1;
237
34
            }
238
239
            // Clash with an already-seen altname
240
            // As SN: should be preferred to AN: add this as a new
241
            // record and update the hash entry to point to it.
242
15
            hts_log_warning("Ref name SN:\"%s\" is a duplicate of an existing AN key", name);
243
15
            nref = hrecs->nref;
244
15
        }
245
246
2.56k
        if (nref == hrecs->ref_sz) {
247
1.61k
            size_t new_sz = hrecs->ref_sz >= 4 ? hrecs->ref_sz + (hrecs->ref_sz / 4) : 32;
248
1.61k
            sam_hrec_sq_t *new_ref = realloc(hrecs->ref, sizeof(*hrecs->ref) * new_sz);
249
1.61k
            if (!new_ref)
250
0
                return -1;
251
1.61k
            hrecs->ref = new_ref;
252
1.61k
            hrecs->ref_sz = new_sz;
253
1.61k
        }
254
255
2.56k
        hrecs->ref[nref].name = name;
256
2.56k
        hrecs->ref[nref].len  = len;
257
2.56k
        hrecs->ref[nref].ty = h_type;
258
259
2.56k
        k = kh_put(m_s2i, hrecs->ref_hash, hrecs->ref[nref].name, &r);
260
2.56k
        if (-1 == r) return -1;
261
2.56k
        kh_val(hrecs->ref_hash, k) = nref;
262
263
2.56k
        if (sam_hrecs_add_ref_altnames(hrecs, nref, altnames) < 0)
264
0
            return -1;
265
266
2.56k
        if (hrecs->refs_changed < 0 || hrecs->refs_changed > hrecs->nref)
267
1.60k
            hrecs->refs_changed = hrecs->nref;
268
2.56k
        hrecs->nref++;
269
2.56k
    }
270
271
    /* Add to read-group hash? */
272
74.3k
    if (type == TYPEKEY("RG")) {
273
53.0k
        sam_hrec_tag_t *tag = sam_hrecs_find_key(h_type, "ID", NULL);
274
53.0k
        int nrg = hrecs->nrg, r;
275
53.0k
        khint_t k;
276
277
53.0k
        if (!tag) {
278
73
            hts_log_error("Header includes @RG line with no ID: tag");
279
73
            return -1;  // ID should be present, according to spec.
280
73
        }
281
52.9k
        assert(tag->str && tag->len >= 3);
282
283
        // Seen already?
284
52.9k
        k = kh_get(m_s2i, hrecs->rg_hash, tag->str + 3);
285
52.9k
        if (k < kh_end(hrecs->rg_hash)) {
286
51.2k
            nrg = kh_val(hrecs->rg_hash, k);
287
51.2k
            assert(hrecs->rg[nrg].ty != NULL);
288
51.2k
            if (hrecs->rg[nrg].ty != h_type) {
289
51.2k
                hts_log_warning("Duplicate entry \"%s\" in sam header",
290
51.2k
                                tag->str + 3);
291
51.2k
            } else {
292
0
                hrecs->rg[nrg].name = tag->str + 3;
293
0
                hrecs->rg[nrg].name_len = tag->len - 3;
294
0
            }
295
51.2k
            return 0;
296
51.2k
        }
297
298
1.70k
        if (nrg == hrecs->rg_sz) {
299
891
            size_t new_sz = hrecs->rg_sz >= 4 ? hrecs->rg_sz + hrecs->rg_sz / 4 : 4;
300
891
            sam_hrec_rg_t *new_rg = realloc(hrecs->rg, sizeof(*hrecs->rg) * new_sz);
301
891
            if (!new_rg)
302
0
                return -1;
303
891
            hrecs->rg = new_rg;
304
891
            hrecs->rg_sz = new_sz;
305
891
        }
306
307
1.70k
        hrecs->rg[nrg].name = tag->str + 3;
308
1.70k
        hrecs->rg[nrg].name_len = tag->len - 3;
309
1.70k
        hrecs->rg[nrg].ty   = h_type;
310
1.70k
        hrecs->rg[nrg].id   = nrg;
311
312
1.70k
        k = kh_put(m_s2i, hrecs->rg_hash, hrecs->rg[nrg].name, &r);
313
1.70k
        if (-1 == r) return -1;
314
1.70k
        kh_val(hrecs->rg_hash, k) = nrg;
315
316
1.70k
        hrecs->nrg++;
317
1.70k
    }
318
319
    /* Add to program hash? */
320
23.0k
    if (type == TYPEKEY("PG")) {
321
9.24k
        sam_hrec_tag_t *tag;
322
9.24k
        sam_hrec_pg_t *new_pg;
323
9.24k
        int npg = hrecs->npg;
324
325
9.24k
        if (npg == hrecs->pg_sz) {
326
2.34k
            size_t new_sz = hrecs->pg_sz >= 4 ? hrecs->pg_sz + hrecs->pg_sz / 4 : 4;
327
2.34k
            new_pg = realloc(hrecs->pg, sizeof(*hrecs->pg) * new_sz);
328
2.34k
            if (!new_pg)
329
0
                return -1;
330
2.34k
            hrecs->pg = new_pg;
331
2.34k
            hrecs->pg_sz = new_sz;
332
2.34k
        }
333
334
9.24k
        tag = h_type->tag;
335
9.24k
        hrecs->pg[npg].name = NULL;
336
9.24k
        hrecs->pg[npg].name_len = 0;
337
9.24k
        hrecs->pg[npg].ty  = h_type;
338
9.24k
        hrecs->pg[npg].id   = npg;
339
9.24k
        hrecs->pg[npg].prev_id = -1;
340
341
45.6k
        while (tag) {
342
36.4k
            if (tag->str[0] == 'I' && tag->str[1] == 'D') {
343
                /* Avoid duplicate ID tags coming from other applications */
344
18.0k
                if (!hrecs->pg[npg].name) {
345
9.08k
                    assert(tag->len >= 3);
346
9.08k
                    hrecs->pg[npg].name = tag->str + 3;
347
9.08k
                    hrecs->pg[npg].name_len = tag->len - 3;
348
9.08k
                } else {
349
9.00k
                    hts_log_warning("PG line with multiple ID tags. The first encountered was preferred - ID:%s", hrecs->pg[npg].name);
350
9.00k
                }
351
18.3k
            } else if (tag->str[0] == 'P' && tag->str[1] == 'P') {
352
                // Resolve later if needed
353
11.3k
                khint_t k;
354
11.3k
                k = kh_get(m_s2i, hrecs->pg_hash, tag->str+3);
355
356
11.3k
                if (k != kh_end(hrecs->pg_hash)) {
357
4.94k
                    int p_id = kh_val(hrecs->pg_hash, k);
358
4.94k
                    hrecs->pg[npg].prev_id = hrecs->pg[p_id].id;
359
360
                    /* Unmark previous entry as a PG termination */
361
4.94k
                    if (hrecs->npg_end > 0 &&
362
4.94k
                        hrecs->pg_end[hrecs->npg_end-1] == p_id) {
363
1.56k
                        hrecs->npg_end--;
364
3.38k
                    } else {
365
3.38k
                        int i;
366
295k
                        for (i = 0; i < hrecs->npg_end; i++) {
367
291k
                            if (hrecs->pg_end[i] == p_id) {
368
894
                                memmove(&hrecs->pg_end[i], &hrecs->pg_end[i+1],
369
894
                                        (hrecs->npg_end-i-1)*sizeof(*hrecs->pg_end));
370
894
                                hrecs->npg_end--;
371
894
                            }
372
291k
                        }
373
3.38k
                    }
374
6.43k
                } else {
375
6.43k
                    hrecs->pg[npg].prev_id = -1;
376
6.43k
                }
377
11.3k
            }
378
36.4k
            tag = tag->next;
379
36.4k
        }
380
381
9.24k
        if (hrecs->pg[npg].name) {
382
9.08k
            khint_t k;
383
9.08k
            int r;
384
9.08k
            k = kh_put(m_s2i, hrecs->pg_hash, hrecs->pg[npg].name, &r);
385
9.08k
            if (-1 == r) return -1;
386
9.08k
            kh_val(hrecs->pg_hash, k) = npg;
387
9.08k
        } else {
388
158
            return -1; // ID should be present, according to spec.
389
158
        }
390
391
        /* Add to npg_end[] array. Remove later if we find a PP line */
392
9.08k
        if (hrecs->npg_end >= hrecs->npg_end_alloc) {
393
1.44k
            int *new_pg_end;
394
1.44k
            int  new_alloc = hrecs->npg_end_alloc ? hrecs->npg_end_alloc*2 : 4;
395
396
1.44k
            new_pg_end = realloc(hrecs->pg_end, new_alloc * sizeof(int));
397
1.44k
            if (!new_pg_end)
398
0
                return -1;
399
1.44k
            hrecs->npg_end_alloc = new_alloc;
400
1.44k
            hrecs->pg_end = new_pg_end;
401
1.44k
        }
402
9.08k
        hrecs->pg_end[hrecs->npg_end++] = npg;
403
404
9.08k
        hrecs->npg++;
405
9.08k
    }
406
407
22.9k
    return 0;
408
23.0k
}
409
410
0
static int sam_hrecs_remove_hash_entry(sam_hrecs_t *hrecs, khint32_t type, sam_hrec_type_t *h_type) {
411
0
    if (!hrecs || !h_type)
412
0
        return -1;
413
414
0
    sam_hrec_tag_t *tag;
415
0
    const char *key = NULL;
416
0
    khint_t k;
417
418
    /* Remove name and any alternative names from reference hash */
419
0
    if (type == TYPEKEY("SQ")) {
420
0
        const char *altnames = NULL;
421
422
0
        tag = h_type->tag;
423
424
0
        while (tag) {
425
0
            if (tag->str[0] == 'S' && tag->str[1] == 'N') {
426
0
                assert(tag->len >= 3);
427
0
                key = tag->str + 3;
428
0
            } else if (tag->str[0] == 'A' && tag->str[1] == 'N') {
429
0
                assert(tag->len >= 3);
430
0
                altnames = tag->str + 3;
431
0
            }
432
0
            tag = tag->next;
433
0
        }
434
435
0
        if (key) {
436
0
            k = kh_get(m_s2i, hrecs->ref_hash, key);
437
0
            if (k != kh_end(hrecs->ref_hash)) {
438
0
                int idx = kh_val(hrecs->ref_hash, k);
439
0
                if (idx + 1 < hrecs->nref)
440
0
                    memmove(&hrecs->ref[idx], &hrecs->ref[idx+1],
441
0
                            sizeof(sam_hrec_sq_t)*(hrecs->nref - idx - 1));
442
0
                if (altnames)
443
0
                    sam_hrecs_remove_ref_altnames(hrecs, idx, altnames);
444
0
                kh_del(m_s2i, hrecs->ref_hash, k);
445
0
                hrecs->nref--;
446
0
                if (hrecs->refs_changed < 0 || hrecs->refs_changed > idx)
447
0
                    hrecs->refs_changed = idx;
448
0
                for (k = 0; k < kh_end(hrecs->ref_hash); k++) {
449
0
                    if (kh_exist(hrecs->ref_hash, k)
450
0
                        && kh_value(hrecs->ref_hash, k) > idx) {
451
0
                        kh_value(hrecs->ref_hash, k)--;
452
0
                    }
453
0
                }
454
0
            }
455
0
        }
456
0
    }
457
458
    /* Remove from read-group hash */
459
0
    if (type == TYPEKEY("RG")) {
460
0
        tag = h_type->tag;
461
462
0
        while (tag) {
463
0
            if (tag->str[0] == 'I' && tag->str[1] == 'D') {
464
0
                assert(tag->len >= 3);
465
0
                key = tag->str + 3;
466
0
                k = kh_get(m_s2i, hrecs->rg_hash, key);
467
0
                if (k != kh_end(hrecs->rg_hash)) {
468
0
                    int idx = kh_val(hrecs->rg_hash, k);
469
0
                    if (idx + 1 < hrecs->nrg)
470
0
                        memmove(&hrecs->rg[idx], &hrecs->rg[idx+1], sizeof(sam_hrec_rg_t)*(hrecs->nrg - idx - 1));
471
0
                    kh_del(m_s2i, hrecs->rg_hash, k);
472
0
                    hrecs->nrg--;
473
0
                    for (k = 0; k < kh_end(hrecs->rg_hash); k++) {
474
0
                        if (kh_exist(hrecs->rg_hash, k)
475
0
                            && kh_value(hrecs->rg_hash, k) > idx) {
476
0
                            kh_value(hrecs->rg_hash, k)--;
477
0
                        }
478
0
                    }
479
0
                }
480
0
                break;
481
0
            }
482
0
            tag = tag->next;
483
0
        }
484
0
    }
485
486
0
    return 0;
487
0
}
488
489
/** Add a header record to the global line ordering
490
 *
491
 * If @p after is not NULL, the new record will be inserted after this one,
492
 * otherwise it will go at the end.
493
 *
494
 * An exception is an HD record, which will always be put first unless
495
 * one is already present.
496
 */
497
static void sam_hrecs_global_list_add(sam_hrecs_t *hrecs,
498
                                      sam_hrec_type_t *h_type,
499
97.8k
                                      sam_hrec_type_t *after) {
500
97.8k
    const khint32_t hd_type = TYPEKEY("HD");
501
97.8k
    int update_first_line = 0;
502
503
    // First line seen
504
97.8k
    if (!hrecs->first_line) {
505
17.9k
        hrecs->first_line = h_type->global_next = h_type->global_prev = h_type;
506
17.9k
        return;
507
17.9k
    }
508
509
    // @HD goes at the top (unless there's one already)
510
79.9k
    if (h_type->type == hd_type && hrecs->first_line->type != hd_type) {
511
16
        after = hrecs->first_line->global_prev;
512
16
        update_first_line = 1;
513
16
    }
514
515
    // If no instructions given, put it at the end
516
79.9k
    if (!after)
517
71.9k
        after = hrecs->first_line->global_prev;
518
519
79.9k
    h_type->global_prev = after;
520
79.9k
    h_type->global_next = after->global_next;
521
79.9k
    h_type->global_prev->global_next = h_type;
522
79.9k
    h_type->global_next->global_prev = h_type;
523
524
79.9k
    if (update_first_line)
525
16
        hrecs->first_line = h_type;
526
79.9k
}
527
528
/*! Add header record with a va_list interface.
529
 *
530
 * Adds a single record to a SAM header.
531
 *
532
 * This takes a header record type, a va_list argument and one or more
533
 * key,value pairs, ending with the NULL key.
534
 *
535
 * Eg. sam_hrecs_vadd(h, "SQ", args, "ID", "foo", "LN", "100", NULL).
536
 *
537
 * The purpose of the additional va_list parameter is to permit other
538
 * varargs functions to call this while including their own additional
539
 * parameters; an example is in sam_hdr_add_pg().
540
 *
541
 * Note: this function invokes va_arg at least once, making the value
542
 * of ap indeterminate after the return. The caller should call
543
 * va_start/va_end before/after calling this function or use va_copy.
544
 *
545
 * @return
546
 * Returns >= 0 on success;
547
 *        -1 on failure
548
 */
549
8.70k
static int sam_hrecs_vadd(sam_hrecs_t *hrecs, const char *type, va_list ap, ...) {
550
8.70k
    va_list args;
551
8.70k
    sam_hrec_type_t *h_type;
552
8.70k
    sam_hrec_tag_t *h_tag, *last=NULL;
553
8.70k
    int new;
554
8.70k
    khint32_t type_i = TYPEKEY(type), k;
555
556
8.70k
    if (!strncmp(type, "HD", 2) && (h_type = sam_hrecs_find_type_id(hrecs, "HD", NULL, NULL)))
557
0
        return sam_hrecs_vupdate(hrecs, h_type, ap);
558
559
8.70k
    if (!(h_type = pool_alloc(hrecs->type_pool)))
560
0
        return -1;
561
8.70k
    k = kh_put(sam_hrecs_t, hrecs->h, type_i, &new);
562
8.70k
    if (new < 0)
563
0
        return -1;
564
565
8.70k
    h_type->type = type_i;
566
567
    // Form the ring, either with self or other lines of this type
568
8.70k
    if (!new) {
569
8.01k
        sam_hrec_type_t *t = kh_val(hrecs->h, k), *p;
570
8.01k
        p = t->prev;
571
572
8.01k
        assert(p->next == t);
573
8.01k
        p->next = h_type;
574
8.01k
        h_type->prev = p;
575
576
8.01k
        t->prev = h_type;
577
8.01k
        h_type->next = t;
578
8.01k
    } else {
579
687
        kh_val(hrecs->h, k) = h_type;
580
687
        h_type->prev = h_type->next = h_type;
581
687
    }
582
8.70k
    h_type->tag = NULL;
583
584
    // Add to global line ordering after any existing line of the same type,
585
    // or at the end if no line of this type exists yet.
586
8.70k
    sam_hrecs_global_list_add(hrecs, h_type, !new ? h_type->prev : NULL);
587
588
    // Check linked-list invariants
589
8.70k
    assert(h_type->prev->next == h_type);
590
8.70k
    assert(h_type->next->prev == h_type);
591
8.70k
    assert(h_type->global_prev->global_next == h_type);
592
8.70k
    assert(h_type->global_next->global_prev == h_type);
593
594
    // Any ... varargs
595
8.70k
    va_start(args, ap);
596
8.70k
    for (;;) {
597
8.70k
        char *key, *val = NULL, *str;
598
599
8.70k
        if (!(key = (char *)va_arg(args, char *)))
600
8.70k
            break;
601
0
        if (strncmp(type, "CO", 2) && !(val = (char *)va_arg(args, char *)))
602
0
            break;
603
0
        if (*val == '\0')
604
0
            continue;
605
606
0
        if (!(h_tag = pool_alloc(hrecs->tag_pool)))
607
0
            return -1;
608
609
0
        if (strncmp(type, "CO", 2)) {
610
0
            h_tag->len = 3 + strlen(val);
611
0
            str = string_alloc(hrecs->str_pool, h_tag->len+1);
612
0
            if (!str || snprintf(str, h_tag->len+1, "%2.2s:%s", key, val) < 0)
613
0
                return -1;
614
0
            h_tag->str = str;
615
0
        } else {
616
0
            h_tag->len = strlen(key);
617
0
            h_tag->str = string_ndup(hrecs->str_pool, key, h_tag->len);
618
0
            if (!h_tag->str)
619
0
                return -1;
620
0
        }
621
622
0
        h_tag->next = NULL;
623
0
        if (last)
624
0
            last->next = h_tag;
625
0
        else
626
0
            h_type->tag = h_tag;
627
628
0
        last = h_tag;
629
0
    }
630
8.70k
    va_end(args);
631
632
    // Plus the specified va_list params
633
26.1k
    for (;;) {
634
26.1k
        char *key, *val = NULL, *str;
635
636
26.1k
        if (!(key = (char *)va_arg(ap, char *)))
637
8.70k
            break;
638
17.4k
        if (strncmp(type, "CO", 2) && !(val = (char *)va_arg(ap, char *)))
639
0
            break;
640
641
17.4k
        if (!(h_tag = pool_alloc(hrecs->tag_pool)))
642
0
            return -1;
643
644
17.4k
        if (strncmp(type, "CO", 2)) {
645
17.4k
            h_tag->len = 3 + strlen(val);
646
17.4k
            str = string_alloc(hrecs->str_pool, h_tag->len+1);
647
17.4k
            if (!str || snprintf(str, h_tag->len+1, "%2.2s:%s", key, val) < 0)
648
0
                return -1;
649
17.4k
            h_tag->str = str;
650
17.4k
        } else {
651
0
            h_tag->len = strlen(key);
652
0
            h_tag->str = string_ndup(hrecs->str_pool, key, h_tag->len);
653
0
            if (!h_tag->str)
654
0
                return -1;
655
0
        }
656
657
17.4k
        h_tag->next = NULL;
658
17.4k
        if (last)
659
8.70k
            last->next = h_tag;
660
8.70k
        else
661
8.70k
            h_type->tag = h_tag;
662
663
17.4k
        last = h_tag;
664
17.4k
    }
665
666
8.70k
    if (-1 == sam_hrecs_update_hashes(hrecs, TYPEKEY(type), h_type))
667
0
        return -1;
668
669
8.70k
    if (!strncmp(type, "PG", 2))
670
0
        hrecs->pgs_changed = 1;
671
672
8.70k
    hrecs->dirty = 1;
673
674
8.70k
    return 0;
675
8.70k
}
676
677
// As sam_hrecs_vadd(), but without the extra va_list parameter
678
8.70k
static int sam_hrecs_add(sam_hrecs_t *hrecs, const char *type, ...) {
679
8.70k
    va_list args;
680
8.70k
    int res;
681
8.70k
    va_start(args, type);
682
8.70k
    res = sam_hrecs_vadd(hrecs, type, args, NULL);
683
8.70k
    va_end(args);
684
8.70k
    return res;
685
8.70k
}
686
687
/*
688
 * Function for deallocating a list of tags
689
 */
690
691
0
static void sam_hrecs_free_tags(sam_hrecs_t *hrecs, sam_hrec_tag_t *tag) {
692
0
    if (!hrecs || !tag)
693
0
        return;
694
0
    if (tag->next)
695
0
        sam_hrecs_free_tags(hrecs, tag->next);
696
697
0
    pool_free(hrecs->tag_pool, tag);
698
0
}
699
700
0
static int sam_hrecs_remove_line(sam_hrecs_t *hrecs, const char *type_name, sam_hrec_type_t *type_found, int remove_hash) {
701
0
    if (!hrecs || !type_name || !type_found)
702
0
        return -1;
703
704
0
    khint32_t itype = TYPEKEY(type_name);
705
0
    khint_t k = kh_get(sam_hrecs_t, hrecs->h, itype);
706
0
    if (k == kh_end(hrecs->h))
707
0
        return -1;
708
709
    // Remove from global list (remembering it could be the only line)
710
0
    if (hrecs->first_line == type_found) {
711
0
        hrecs->first_line = (type_found->global_next != type_found
712
0
                             ? type_found->global_next : NULL);
713
0
    }
714
0
    type_found->global_next->global_prev = type_found->global_prev;
715
0
    type_found->global_prev->global_next = type_found->global_next;
716
717
    /* single element in the list */
718
0
    if (type_found->prev == type_found || type_found->next == type_found) {
719
0
        kh_del(sam_hrecs_t, hrecs->h, k);
720
0
    } else {
721
0
        type_found->prev->next = type_found->next;
722
0
        type_found->next->prev = type_found->prev;
723
0
        if (kh_val(hrecs->h, k) == type_found) { //first element
724
0
            kh_val(hrecs->h, k) = type_found->next;
725
0
        }
726
0
    }
727
728
0
    if (remove_hash && (!strncmp(type_name, "SQ", 2) || !strncmp(type_name, "RG", 2)))
729
0
        sam_hrecs_remove_hash_entry(hrecs, itype, type_found);
730
731
0
    sam_hrecs_free_tags(hrecs, type_found->tag);
732
0
    pool_free(hrecs->type_pool, type_found);
733
734
0
    hrecs->dirty = 1;
735
736
0
    return 0;
737
0
}
738
739
// Paste together a line from the parsed data structures
740
63.3k
static int build_header_line(const sam_hrec_type_t *ty, kstring_t *ks) {
741
63.3k
    sam_hrec_tag_t *tag;
742
63.3k
    int r = 0;
743
63.3k
    char c[2]= { ty->type >> 8, ty->type & 0xff };
744
745
63.3k
    r |= (kputc_('@', ks) == EOF);
746
63.3k
    r |= (kputsn(c, 2, ks) == EOF);
747
183k
    for (tag = ty->tag; tag; tag = tag->next) {
748
120k
        r |= (kputc_('\t', ks) == EOF);
749
120k
        r |= (kputsn(tag->str, tag->len, ks) == EOF);
750
120k
    }
751
752
63.3k
    return r;
753
63.3k
}
754
755
10.8k
static int sam_hrecs_rebuild_lines(const sam_hrecs_t *hrecs, kstring_t *ks) {
756
10.8k
    const sam_hrec_type_t *t1, *t2;
757
758
10.8k
    if (!hrecs->first_line)
759
0
        return kputsn("", 0, ks) >= 0 ? 0 : -1;
760
761
10.8k
    t1 = t2 = hrecs->first_line;
762
63.3k
    do {
763
63.3k
        if (build_header_line(t1, ks) != 0)
764
0
            return -1;
765
63.3k
        if (kputc('\n', ks) < 0)
766
0
            return -1;
767
768
63.3k
        t1 = t1->global_next;
769
63.3k
    } while (t1 != t2);
770
771
10.8k
    return 0;
772
10.8k
}
773
774
18.4k
static int sam_hrecs_parse_lines(sam_hrecs_t *hrecs, const char *hdr, size_t len) {
775
18.4k
    size_t i, lno;
776
777
18.4k
    if (!hrecs || len > SSIZE_MAX)
778
0
        return -1;
779
780
18.4k
    if (!len)
781
0
        len = strlen(hdr);
782
783
18.4k
    if (len < 3) {
784
310
        if (len == 0 || *hdr == '\0') return 0;
785
58
        sam_hrecs_error("Header line too short", hdr, len, 1);
786
58
        return -1;
787
310
    }
788
789
104k
    for (i = 0, lno = 1; i < len - 3 && hdr[i] != '\0'; i++, lno++) {
790
90.1k
        khint32_t type;
791
90.1k
        khint_t k;
792
793
90.1k
        int l_start = i, new;
794
90.1k
        sam_hrec_type_t *h_type;
795
90.1k
        sam_hrec_tag_t *h_tag, *last;
796
797
90.1k
        if (hdr[i] != '@') {
798
404
            sam_hrecs_error("Header line does not start with '@'",
799
404
                          &hdr[l_start], len - l_start, lno);
800
404
            return -1;
801
404
        }
802
803
89.7k
        if (!isalpha_c(hdr[i+1]) || !isalpha_c(hdr[i+2])) {
804
125
            sam_hrecs_error("Header line does not have a two character key",
805
125
                          &hdr[l_start], len - l_start, lno);
806
125
            return -1;
807
125
        }
808
89.5k
        type = TYPEKEY(&hdr[i+1]);
809
810
89.5k
        i += 3;
811
89.5k
        if (i == len || hdr[i] == '\n')
812
417
            continue;
813
814
        // Add the header line type
815
89.1k
        if (!(h_type = pool_alloc(hrecs->type_pool)))
816
0
            return -1;
817
89.1k
        k = kh_put(sam_hrecs_t, hrecs->h, type, &new);
818
89.1k
        if (new < 0)
819
0
            return -1;
820
821
89.1k
        h_type->type = type;
822
823
        // Add to end of global list
824
89.1k
        sam_hrecs_global_list_add(hrecs, h_type, NULL);
825
826
        // Form the ring, either with self or other lines of this type
827
89.1k
        if (!new) {
828
68.9k
            sam_hrec_type_t *t = kh_val(hrecs->h, k), *p;
829
68.9k
            p = t->prev;
830
831
68.9k
            assert(p->next == t);
832
68.9k
            p->next = h_type;
833
68.9k
            h_type->prev = p;
834
835
68.9k
            t->prev = h_type;
836
68.9k
            h_type->next = t;
837
68.9k
        } else {
838
20.2k
            kh_val(hrecs->h, k) = h_type;
839
20.2k
            h_type->prev = h_type->next = h_type;
840
20.2k
        }
841
842
        // Parse the tags on this line
843
89.1k
        last = NULL;
844
89.1k
        if (type == TYPEKEY("CO")) {
845
7.78k
            size_t j;
846
847
7.78k
            if (i == len || hdr[i] != '\t') {
848
74
                sam_hrecs_error("Missing tab",
849
74
                              &hdr[l_start], len - l_start, lno);
850
74
                return -1;
851
74
            }
852
853
7.75M
            for (j = ++i; j < len && hdr[j] != '\0' && hdr[j] != '\n'; j++)
854
7.75M
                ;
855
856
7.71k
            if (!(h_type->tag = h_tag = pool_alloc(hrecs->tag_pool)))
857
0
                return -1;
858
7.71k
            h_tag->str = string_ndup(hrecs->str_pool, &hdr[i], j-i);
859
7.71k
            h_tag->len = j-i;
860
7.71k
            h_tag->next = NULL;
861
7.71k
            if (!h_tag->str)
862
0
                return -1;
863
864
7.71k
            i = j;
865
866
81.3k
        } else {
867
174k
            do {
868
174k
                size_t j;
869
870
174k
                if (i == len || hdr[i] != '\t') {
871
220
                    sam_hrecs_error("Missing tab",
872
220
                                  &hdr[l_start], len - l_start, lno);
873
220
                    return -1;
874
220
                }
875
876
168M
                for (j = ++i; j < len && hdr[j] != '\0' && hdr[j] != '\n' && hdr[j] != '\t'; j++)
877
168M
                    ;
878
879
173k
                if (j - i < 3 || hdr[i + 2] != ':') {
880
2.32k
                    sam_hrecs_error("Malformed key:value pair",
881
2.32k
                                   &hdr[l_start], len - l_start, lno);
882
2.32k
                    return -1;
883
2.32k
                }
884
885
171k
                if (!(h_tag = pool_alloc(hrecs->tag_pool)))
886
0
                    return -1;
887
171k
                h_tag->str = string_ndup(hrecs->str_pool, &hdr[i], j-i);
888
171k
                h_tag->len = j-i;
889
171k
                h_tag->next = NULL;
890
171k
                if (!h_tag->str)
891
0
                    return -1;
892
893
171k
                if (last)
894
92.4k
                    last->next = h_tag;
895
79.1k
                else
896
79.1k
                    h_type->tag = h_tag;
897
898
171k
                last = h_tag;
899
171k
                i = j;
900
171k
            } while (i < len && hdr[i] != '\0' && hdr[i] != '\n');
901
81.3k
        }
902
903
        /* Update RG/SQ hashes */
904
86.5k
        if (-1 == sam_hrecs_update_hashes(hrecs, type, h_type))
905
758
            return -1;
906
86.5k
    }
907
908
14.1k
    return 0;
909
18.0k
}
910
911
/*! Update sam_hdr_t target_name and target_len arrays
912
 *
913
 *  @return 0 on success; -1 on failure
914
 */
915
int sam_hdr_update_target_arrays(sam_hdr_t *bh, const sam_hrecs_t *hrecs,
916
15.4k
                                 int refs_changed) {
917
15.4k
    if (!bh || !hrecs)
918
0
        return -1;
919
920
15.4k
    if (refs_changed < 0)
921
0
        return 0;
922
923
    // Grow arrays if necessary
924
15.4k
    if (bh->n_targets < hrecs->nref) {
925
4.16k
        char **new_names = realloc(bh->target_name,
926
4.16k
                                   hrecs->nref * sizeof(*new_names));
927
4.16k
        if (!new_names)
928
0
            return -1;
929
4.16k
        bh->target_name = new_names;
930
4.16k
        uint32_t *new_lens = realloc(bh->target_len,
931
4.16k
                                     hrecs->nref * sizeof(*new_lens));
932
4.16k
        if (!new_lens)
933
0
            return -1;
934
4.16k
        bh->target_len = new_lens;
935
4.16k
    }
936
937
    // Update names and lengths where changed
938
    // hrecs->refs_changed is the first ref that has been updated, so ones
939
    // before that can be skipped.
940
15.4k
    int i;
941
15.4k
    khint_t k;
942
15.4k
    khash_t(s2i) *long_refs = (khash_t(s2i) *) bh->sdict;
943
25.9k
    for (i = refs_changed; i < hrecs->nref; i++) {
944
10.4k
        if (i >= bh->n_targets
945
10.4k
            || strcmp(bh->target_name[i], hrecs->ref[i].name) != 0) {
946
8.64k
            if (i < bh->n_targets)
947
0
                free(bh->target_name[i]);
948
8.64k
            bh->target_name[i] = strdup(hrecs->ref[i].name);
949
8.64k
            if (!bh->target_name[i])
950
0
                return -1;
951
8.64k
        }
952
10.4k
        if (hrecs->ref[i].len < UINT32_MAX) {
953
7.88k
            bh->target_len[i] = hrecs->ref[i].len;
954
955
7.88k
            if (!long_refs)
956
5.39k
                continue;
957
958
            // Check if we have an old length, if so remove it.
959
2.49k
            k = kh_get(s2i, long_refs, bh->target_name[i]);
960
2.49k
            if (k < kh_end(long_refs))
961
2.49k
                kh_del(s2i, long_refs, k);
962
2.57k
        } else {
963
2.57k
            bh->target_len[i] = UINT32_MAX;
964
2.57k
            if (bh->hrecs != hrecs) {
965
                // Called from sam_hdr_dup; need to add sdict entries
966
1.43k
                if (!long_refs) {
967
461
                    if (!(bh->sdict = long_refs = kh_init(s2i)))
968
0
                        return -1;
969
461
                }
970
971
                // Add / update length
972
1.43k
                int absent;
973
1.43k
                k = kh_put(s2i, long_refs, bh->target_name[i], &absent);
974
1.43k
                if (absent < 0)
975
0
                    return -1;
976
1.43k
                kh_val(long_refs, k) = hrecs->ref[i].len;
977
1.43k
            }
978
2.57k
        }
979
10.4k
    }
980
981
    // Free up any names that have been removed
982
15.4k
    for (; i < bh->n_targets; i++) {
983
0
        if (long_refs) {
984
0
            k = kh_get(s2i, long_refs, bh->target_name[i]);
985
0
            if (k < kh_end(long_refs))
986
0
                kh_del(s2i, long_refs, k);
987
0
        }
988
0
        free(bh->target_name[i]);
989
0
    }
990
991
15.4k
    bh->n_targets = hrecs->nref;
992
15.4k
    return 0;
993
15.4k
}
994
995
2.01k
static int rebuild_target_arrays(sam_hdr_t *bh) {
996
2.01k
    if (!bh || !bh->hrecs)
997
0
        return -1;
998
999
2.01k
    sam_hrecs_t *hrecs = bh->hrecs;
1000
2.01k
    if (hrecs->refs_changed < 0)
1001
0
        return 0;
1002
1003
2.01k
    if (sam_hdr_update_target_arrays(bh, hrecs, hrecs->refs_changed) != 0)
1004
0
        return -1;
1005
1006
2.01k
    hrecs->refs_changed = -1;
1007
2.01k
    return 0;
1008
2.01k
}
1009
1010
/// Populate hrecs refs array from header target_name, target_len arrays
1011
/**
1012
 * @return 0 on success; -1 on failure
1013
 *
1014
 * Pre-fills the refs hash from the target arrays.  For BAM files this
1015
 * will ensure that they are in the correct order as the target arrays
1016
 * are the canonical source for converting target ids to names and lengths.
1017
 *
1018
 * The added entries do not link to a header line. sam_hrecs_update_hashes()
1019
 * will add the links later for lines found in the text header.
1020
 *
1021
 * This should be called before the text header is parsed.
1022
 */
1023
static int sam_hrecs_refs_from_targets_array(sam_hrecs_t *hrecs,
1024
9.68k
                                             const sam_hdr_t *bh) {
1025
9.68k
    int32_t tid = 0;
1026
1027
9.68k
    if (!hrecs || !bh)
1028
0
        return -1;
1029
1030
    // This should always be called before parsing the text header
1031
    // so the ref array should start off empty, and we don't have to try
1032
    // to reconcile any existing data.
1033
9.68k
    if (hrecs->nref > 0) {
1034
0
        hts_log_error("Called with non-empty ref array");
1035
0
        return -1;
1036
0
    }
1037
1038
9.68k
    if (hrecs->ref_sz < bh->n_targets) {
1039
9.68k
        sam_hrec_sq_t *new_ref = realloc(hrecs->ref,
1040
9.68k
                                         bh->n_targets * sizeof(*new_ref));
1041
9.68k
        if (!new_ref)
1042
0
            return -1;
1043
1044
9.68k
        hrecs->ref = new_ref;
1045
9.68k
        hrecs->ref_sz = bh->n_targets;
1046
9.68k
    }
1047
1048
38.2k
    for (tid = 0; tid < bh->n_targets; tid++) {
1049
28.5k
        khint_t k;
1050
28.5k
        int r;
1051
28.5k
        hrecs->ref[tid].name = string_dup(hrecs->str_pool, bh->target_name[tid]);
1052
28.5k
        if (!hrecs->ref[tid].name) goto fail;
1053
28.5k
        if (bh->target_len[tid] < UINT32_MAX || !bh->sdict) {
1054
21.8k
            hrecs->ref[tid].len  = bh->target_len[tid];
1055
21.8k
        } else {
1056
6.76k
            khash_t(s2i) *long_refs = (khash_t(s2i) *) bh->sdict;
1057
6.76k
            k = kh_get(s2i, long_refs, hrecs->ref[tid].name);
1058
6.76k
            if (k < kh_end(long_refs)) {
1059
6.50k
                hrecs->ref[tid].len = kh_val(long_refs, k);
1060
6.50k
            } else {
1061
263
                hrecs->ref[tid].len = UINT32_MAX;
1062
263
            }
1063
6.76k
        }
1064
28.5k
        hrecs->ref[tid].ty   = NULL;
1065
28.5k
        k = kh_put(m_s2i, hrecs->ref_hash, hrecs->ref[tid].name, &r);
1066
28.5k
        if (r < 0) goto fail;
1067
28.5k
        if (r == 0) {
1068
32
            hts_log_error("Duplicate entry \"%s\" in target list",
1069
32
                            hrecs->ref[tid].name);
1070
32
            return -1;
1071
28.5k
        } else {
1072
28.5k
            kh_val(hrecs->ref_hash, k) = tid;
1073
28.5k
        }
1074
28.5k
    }
1075
9.65k
    hrecs->nref = bh->n_targets;
1076
9.65k
    return 0;
1077
1078
0
 fail: {
1079
0
        int32_t i;
1080
0
        hts_log_error("%s", strerror(errno));
1081
0
        for (i = 0; i < tid; i++) {
1082
0
            khint_t k;
1083
0
            if (!hrecs->ref[i].name) continue;
1084
0
            k = kh_get(m_s2i, hrecs->ref_hash, hrecs->ref[tid].name);
1085
0
            if (k < kh_end(hrecs->ref_hash)) kh_del(m_s2i, hrecs->ref_hash, k);
1086
0
        }
1087
0
        hrecs->nref = 0;
1088
0
        return -1;
1089
9.68k
    }
1090
9.68k
}
1091
1092
/*
1093
 * Add SQ header records for any references in the hrecs->ref array that
1094
 * were added by sam_hrecs_refs_from_targets_array() but have not
1095
 * been linked to an @SQ line by sam_hrecs_update_hashes() yet.
1096
 *
1097
 * This may be needed either because:
1098
 *
1099
 *   - A bam file was read that had entries in its refs list with no
1100
 *     corresponding @SQ line.
1101
 *
1102
 *   - A program constructed a sam_hdr_t which has target_name and target_len
1103
 *     array entries with no corresponding @SQ line in text.
1104
 */
1105
39.3k
static int add_stub_ref_sq_lines(sam_hrecs_t *hrecs) {
1106
39.3k
    int tid;
1107
39.3k
    char len[32];
1108
1109
60.4k
    for (tid = 0; tid < hrecs->nref; tid++) {
1110
21.0k
        if (hrecs->ref[tid].ty == NULL) {
1111
8.70k
            snprintf(len, sizeof(len), "%"PRIhts_pos, hrecs->ref[tid].len);
1112
8.70k
            if (sam_hrecs_add(hrecs, "SQ",
1113
8.70k
                              "SN", hrecs->ref[tid].name,
1114
8.70k
                              "LN", len, NULL) != 0)
1115
0
                return -1;
1116
1117
            // Check that the stub has actually been filled
1118
8.70k
            if(hrecs->ref[tid].ty == NULL) {
1119
0
                hts_log_error("Reference stub with tid=%d, name=\"%s\", len=%"PRIhts_pos" could not be filled",
1120
0
                        tid, hrecs->ref[tid].name, hrecs->ref[tid].len);
1121
0
                return -1;
1122
0
            }
1123
8.70k
        }
1124
21.0k
    }
1125
39.3k
    return 0;
1126
39.3k
}
1127
1128
43.3k
int sam_hdr_fill_hrecs(sam_hdr_t *bh) {
1129
43.3k
    sam_hrecs_t *hrecs = sam_hrecs_new();
1130
1131
43.3k
    if (!hrecs)
1132
0
        return -1;
1133
1134
43.3k
    if (bh->target_name && bh->target_len && bh->n_targets > 0) {
1135
9.68k
        if (sam_hrecs_refs_from_targets_array(hrecs, bh) != 0) {
1136
32
            sam_hrecs_free(hrecs);
1137
32
            return -1;
1138
32
        }
1139
9.68k
    }
1140
1141
    // Parse existing header text
1142
43.2k
    if (bh->text && bh->l_text > 0) {
1143
17.1k
        if (sam_hrecs_parse_lines(hrecs, bh->text, bh->l_text) != 0) {
1144
3.89k
            sam_hrecs_free(hrecs);
1145
3.89k
            return -1;
1146
3.89k
        }
1147
17.1k
    }
1148
1149
39.3k
    if (add_stub_ref_sq_lines(hrecs) < 0) {
1150
0
        sam_hrecs_free(hrecs);
1151
0
        return -1;
1152
0
    }
1153
1154
39.3k
    bh->hrecs = hrecs;
1155
1156
39.3k
    if (hrecs->refs_changed >= 0 && rebuild_target_arrays(bh) != 0)
1157
0
        return -1;
1158
1159
39.3k
    return 0;
1160
39.3k
}
1161
1162
/** Remove outdated header text
1163
1164
    @param bh     BAM header
1165
1166
    This is called when API functions have changed the header so that the
1167
    text version is no longer valid.
1168
 */
1169
1.21k
static void redact_header_text(sam_hdr_t *bh) {
1170
1.21k
    assert(bh->hrecs && bh->hrecs->dirty);
1171
1.21k
    bh->l_text = 0;
1172
1.21k
    free(bh->text);
1173
1.21k
    bh->text = NULL;
1174
1.21k
}
1175
1176
/** Find nth header record of a given type
1177
1178
    @param type   Header type (SQ, RG etc.)
1179
    @param idx    0-based index
1180
1181
    @return sam_hrec_type_t pointer to the record on success
1182
            NULL if no record exists with the given type and index
1183
 */
1184
1185
static sam_hrec_type_t *sam_hrecs_find_type_pos(sam_hrecs_t *hrecs,
1186
0
                                                const char *type, int idx) {
1187
0
    sam_hrec_type_t *first, *itr;
1188
1189
0
    if (idx < 0)
1190
0
        return NULL;
1191
1192
0
    if (type[0] == 'S' && type[1] == 'Q')
1193
0
        return idx < hrecs->nref ? hrecs->ref[idx].ty : NULL;
1194
1195
0
    if (type[0] == 'R' && type[1] == 'G')
1196
0
        return idx < hrecs->nrg ? hrecs->rg[idx].ty : NULL;
1197
1198
0
    if (type[0] == 'P' && type[1] == 'G')
1199
0
        return idx < hrecs->npg ? hrecs->pg[idx].ty : NULL;
1200
1201
0
    first = itr = sam_hrecs_find_type_id(hrecs, type, NULL, NULL);
1202
0
    if (!first)
1203
0
        return NULL;
1204
1205
0
    while (idx > 0) {
1206
0
        itr = itr->next;
1207
0
        if (itr == first)
1208
0
            break;
1209
0
        --idx;
1210
0
    }
1211
1212
0
    return idx == 0 ? itr : NULL;
1213
0
}
1214
1215
/* ==== Public methods ==== */
1216
1217
7.93k
size_t sam_hdr_length(sam_hdr_t *bh) {
1218
7.93k
    if (!bh || -1 == sam_hdr_rebuild(bh))
1219
0
        return SIZE_MAX;
1220
1221
7.93k
    return bh->l_text;
1222
7.93k
}
1223
1224
3.29k
const char *sam_hdr_str(sam_hdr_t *bh) {
1225
3.29k
    if (!bh || -1 == sam_hdr_rebuild(bh))
1226
0
        return NULL;
1227
1228
3.29k
    return bh->text;
1229
3.29k
}
1230
1231
0
int sam_hdr_nref(const sam_hdr_t *bh) {
1232
0
    if (!bh)
1233
0
        return -1;
1234
1235
0
    return bh->hrecs ? bh->hrecs->nref : bh->n_targets;
1236
0
}
1237
1238
/*
1239
 * Reconstructs the text representation from the header hash table.
1240
 * Returns 0 on success
1241
 *        -1 on failure
1242
 */
1243
11.2k
int sam_hdr_rebuild(sam_hdr_t *bh) {
1244
11.2k
    sam_hrecs_t *hrecs;
1245
11.2k
    if (!bh)
1246
0
        return -1;
1247
1248
11.2k
    if (!(hrecs = bh->hrecs))
1249
0
        return bh->text ? 0 : -1;
1250
1251
11.2k
    if (hrecs->refs_changed >= 0) {
1252
0
        if (rebuild_target_arrays(bh) < 0) {
1253
0
            hts_log_error("Header target array rebuild has failed");
1254
0
            return -1;
1255
0
        }
1256
0
    }
1257
1258
    /* If header text wasn't changed or header is empty, don't rebuild it. */
1259
11.2k
    if (!hrecs->dirty)
1260
10.9k
        return 0;
1261
1262
230
    if (hrecs->pgs_changed && sam_hdr_link_pg(bh) < 0) {
1263
0
        hts_log_error("Linking @PG lines has failed");
1264
0
        return -1;
1265
0
    }
1266
1267
230
    kstring_t ks = KS_INITIALIZE;
1268
230
    if (sam_hrecs_rebuild_text(hrecs, &ks) != 0) {
1269
0
        ks_free(&ks);
1270
0
        hts_log_error("Header text rebuild has failed");
1271
0
        return -1;
1272
0
    }
1273
1274
230
    hrecs->dirty = 0;
1275
1276
    /* Sync */
1277
230
    free(bh->text);
1278
230
    bh->l_text = ks_len(&ks);
1279
230
    bh->text = ks_release(&ks);
1280
1281
230
    return 0;
1282
230
}
1283
1284
/*
1285
 * Appends a formatted line to an existing SAM header.
1286
 * Line is a full SAM header record, eg "@SQ\tSN:foo\tLN:100", with
1287
 * optional new-line. If it contains more than 1 line then multiple lines
1288
 * will be added in order.
1289
 *
1290
 * Input text is of maximum length len or as terminated earlier by a NUL.
1291
 * len may be 0 if unknown, in which case lines must be NUL-terminated.
1292
 *
1293
 * Returns 0 on success
1294
 *        -1 on failure
1295
 */
1296
7.30k
int sam_hdr_add_lines(sam_hdr_t *bh, const char *lines, size_t len) {
1297
7.30k
    sam_hrecs_t *hrecs;
1298
1299
7.30k
    if (!bh || !lines)
1300
0
        return -1;
1301
1302
7.30k
    if (len == 0 && *lines == '\0')
1303
6.02k
        return 0;
1304
1305
1.28k
    if (!(hrecs = bh->hrecs)) {
1306
1.28k
        if (sam_hdr_fill_hrecs(bh) != 0)
1307
0
            return -1;
1308
1.28k
        hrecs = bh->hrecs;
1309
1.28k
    }
1310
1311
1.28k
    if (sam_hrecs_parse_lines(hrecs, lines, len) != 0)
1312
73
        return -1;
1313
1314
1.21k
    if (hrecs->refs_changed >= 0 && rebuild_target_arrays(bh) != 0)
1315
0
        return -1;
1316
1317
1.21k
    hrecs->dirty = 1;
1318
1.21k
    redact_header_text(bh);
1319
1320
1.21k
    return 0;
1321
1.21k
}
1322
1323
/*
1324
 * Adds a single line to a SAM header.
1325
 * Specify type and one or more key,value pairs, ending with the NULL key.
1326
 * Eg. sam_hdr_add_line(h, "SQ", "ID", "foo", "LN", "100", NULL).
1327
 *
1328
 * Returns 0 on success
1329
 *        -1 on failure
1330
 */
1331
0
int sam_hdr_add_line(sam_hdr_t *bh, const char *type, ...) {
1332
0
    va_list args;
1333
0
    sam_hrecs_t *hrecs;
1334
1335
0
    if (!bh || !type)
1336
0
        return -1;
1337
1338
0
    if (!(hrecs = bh->hrecs)) {
1339
0
        if (sam_hdr_fill_hrecs(bh) != 0)
1340
0
            return -1;
1341
0
        hrecs = bh->hrecs;
1342
0
    }
1343
1344
0
    va_start(args, type);
1345
0
    int ret = sam_hrecs_vadd(hrecs, type, args, NULL);
1346
0
    va_end(args);
1347
1348
0
    if (ret == 0) {
1349
0
        if (hrecs->refs_changed >= 0 && rebuild_target_arrays(bh) != 0)
1350
0
            return -1;
1351
1352
0
        if (hrecs->dirty)
1353
0
            redact_header_text(bh);
1354
0
    }
1355
1356
0
    return ret;
1357
0
}
1358
1359
/*
1360
 * Returns a complete line of formatted text for a specific head type/ID
1361
 * combination. If ID_key is NULL then it returns the first line of the specified
1362
 * type.
1363
 */
1364
int sam_hdr_find_line_id(sam_hdr_t *bh, const char *type,
1365
0
                      const char *ID_key, const char *ID_val, kstring_t *ks) {
1366
0
    sam_hrecs_t *hrecs;
1367
0
    if (!bh || !type)
1368
0
        return -2;
1369
1370
0
    if (!(hrecs = bh->hrecs)) {
1371
0
        if (sam_hdr_fill_hrecs(bh) != 0)
1372
0
            return -2;
1373
0
        hrecs = bh->hrecs;
1374
0
    }
1375
1376
0
    sam_hrec_type_t *ty = sam_hrecs_find_type_id(hrecs, type, ID_key, ID_val);
1377
0
    if (!ty)
1378
0
        return -1;
1379
1380
0
    ks->l = 0;
1381
0
    if (build_header_line(ty, ks) < 0) {
1382
0
        return -2;
1383
0
    }
1384
1385
0
    return 0;
1386
0
}
1387
1388
int sam_hdr_find_line_pos(sam_hdr_t *bh, const char *type,
1389
0
                          int pos, kstring_t *ks) {
1390
0
    sam_hrecs_t *hrecs;
1391
0
    if (!bh || !type)
1392
0
        return -2;
1393
1394
0
    if (!(hrecs = bh->hrecs)) {
1395
0
        if (sam_hdr_fill_hrecs(bh) != 0)
1396
0
            return -2;
1397
0
        hrecs = bh->hrecs;
1398
0
    }
1399
1400
0
    sam_hrec_type_t *ty = sam_hrecs_find_type_pos(hrecs, type, pos);
1401
0
    if (!ty)
1402
0
        return -1;
1403
1404
0
    ks->l = 0;
1405
0
    if (build_header_line(ty, ks) < 0) {
1406
0
        return -2;
1407
0
    }
1408
1409
0
    return 0;
1410
0
}
1411
1412
/*
1413
 * Remove a line from the header by specifying a tag:value that uniquely
1414
 * identifies a line, i.e. the @SQ line containing "SN:ref1".
1415
 * @SQ line is uniquely identified by SN tag.
1416
 * @RG line is uniquely identified by ID tag.
1417
 * @PG line is uniquely identified by ID tag.
1418
 *
1419
 * Returns 0 on success and -1 on error
1420
 */
1421
1422
0
int sam_hdr_remove_line_id(sam_hdr_t *bh, const char *type, const char *ID_key, const char *ID_value) {
1423
0
    sam_hrecs_t *hrecs;
1424
0
    if (!bh || !type)
1425
0
        return -1;
1426
1427
0
    if (!(hrecs = bh->hrecs)) {
1428
0
        if (sam_hdr_fill_hrecs(bh) != 0)
1429
0
            return -1;
1430
0
        hrecs = bh->hrecs;
1431
0
    }
1432
1433
0
    if (!strncmp(type, "PG", 2)) {
1434
0
        hts_log_warning("Removing PG lines is not supported!");
1435
0
        return -1;
1436
0
    }
1437
1438
0
    sam_hrec_type_t *type_found = sam_hrecs_find_type_id(hrecs, type, ID_key, ID_value);
1439
0
    if (!type_found)
1440
0
        return 0;
1441
1442
0
    int ret = sam_hrecs_remove_line(hrecs, type, type_found, 1);
1443
0
    if (ret == 0) {
1444
0
        if (hrecs->refs_changed >= 0 && rebuild_target_arrays(bh) != 0)
1445
0
            return -1;
1446
1447
0
        if (hrecs->dirty)
1448
0
            redact_header_text(bh);
1449
0
    }
1450
1451
0
    return ret;
1452
0
}
1453
1454
/*
1455
 * Remove a line from the header by specifying the position in the type
1456
 * group, i.e. 3rd @SQ line.
1457
 *
1458
 * Returns 0 on success and -1 on error
1459
 */
1460
1461
0
int sam_hdr_remove_line_pos(sam_hdr_t *bh, const char *type, int position) {
1462
0
    sam_hrecs_t *hrecs;
1463
0
    if (!bh || !type || position < 0)
1464
0
        return -1;
1465
1466
0
    if (!(hrecs = bh->hrecs)) {
1467
0
        if (sam_hdr_fill_hrecs(bh) != 0)
1468
0
            return -1;
1469
0
        hrecs = bh->hrecs;
1470
0
    }
1471
1472
0
    if (!strncmp(type, "PG", 2)) {
1473
0
        hts_log_warning("Removing PG lines is not supported!");
1474
0
        return -1;
1475
0
    }
1476
1477
0
    sam_hrec_type_t *type_found = sam_hrecs_find_type_pos(hrecs, type,
1478
0
                                                          position);
1479
0
    if (!type_found)
1480
0
        return -1;
1481
1482
0
    int ret = sam_hrecs_remove_line(hrecs, type, type_found, 1);
1483
0
    if (ret == 0) {
1484
0
        if (hrecs->refs_changed >= 0 && rebuild_target_arrays(bh) != 0)
1485
0
            return -1;
1486
1487
0
        if (hrecs->dirty)
1488
0
            redact_header_text(bh);
1489
0
    }
1490
1491
0
    return ret;
1492
0
}
1493
1494
/*
1495
 * Check if sam_hdr_update_line() is being used to change the name of
1496
 * a record, and if the new name is going to clash with an existing one.
1497
 *
1498
 * If ap includes repeated keys, we go with the last one as sam_hrecs_vupdate()
1499
 * will go through them all and leave the final one in place.
1500
 *
1501
 * Returns 0 if the name does not change
1502
 *         1 if the name changes but does not clash
1503
 *        -1 if the name changes and the new one is already in use
1504
 */
1505
static int check_for_name_update(sam_hrecs_t *hrecs, sam_hrec_type_t *rec,
1506
                                 va_list ap, const char **old_name,
1507
                                 const char **new_name,
1508
                                 char id_tag_out[3],
1509
0
                                 khash_t(m_s2i) **hash_out) {
1510
0
    char *key, *val;
1511
0
    const char *id_tag;
1512
0
    sam_hrec_tag_t *tag, *prev;
1513
0
    khash_t(m_s2i) *hash;
1514
0
    khint_t k;
1515
0
    int ret = 0;
1516
1517
0
    if        (rec->type == TYPEKEY("SQ")) {
1518
0
        id_tag = "SN"; hash = hrecs->ref_hash;
1519
0
    } else if (rec->type == TYPEKEY("RG")) {
1520
0
        id_tag = "ID"; hash = hrecs->rg_hash;
1521
0
    } else if (rec->type == TYPEKEY("PG")) {
1522
0
        id_tag = "ID"; hash = hrecs->pg_hash;
1523
0
    } else {
1524
0
        return 0;
1525
0
    }
1526
1527
0
    memcpy(id_tag_out, id_tag, 3);
1528
0
    *hash_out = hash;
1529
1530
0
    tag = sam_hrecs_find_key(rec, id_tag, &prev);
1531
0
    if (!tag)
1532
0
        return 0;
1533
0
    assert(tag->len >= 3);
1534
0
    *old_name = tag->str + 3;
1535
1536
0
    while ((key = va_arg(ap, char *)) != NULL) {
1537
0
        val = va_arg(ap, char *);
1538
0
        if (!val) val = "";
1539
0
        if (strcmp(key, id_tag) != 0) continue;
1540
0
        if (strcmp(val, tag->str + 3) == 0) { ret = 0; continue; }
1541
0
        k = kh_get(m_s2i, hash, val);
1542
0
        ret = k < kh_end(hash) ? -1 : 1;
1543
0
        *new_name = val;
1544
0
    }
1545
0
    return ret;
1546
0
}
1547
1548
int sam_hdr_update_line(sam_hdr_t *bh, const char *type,
1549
0
        const char *ID_key, const char *ID_value, ...) {
1550
0
    sam_hrecs_t *hrecs;
1551
0
    if (!bh)
1552
0
        return -1;
1553
1554
0
    if (!(hrecs = bh->hrecs)) {
1555
0
        if (sam_hdr_fill_hrecs(bh) != 0)
1556
0
            return -1;
1557
0
        hrecs = bh->hrecs;
1558
0
    }
1559
1560
0
    int ret, rename;
1561
0
    sam_hrec_type_t *ty = sam_hrecs_find_type_id(hrecs, type, ID_key, ID_value);
1562
0
    if (!ty)
1563
0
        return -1;
1564
1565
0
    va_list args;
1566
0
    const char *old_name = "?", *new_name = "?";
1567
0
    char id_tag[3];
1568
0
    khash_t(m_s2i) *hash = NULL;
1569
0
    va_start(args, ID_value);
1570
0
    rename = check_for_name_update(hrecs, ty, args,
1571
0
                                   &old_name, &new_name, id_tag, &hash);
1572
0
    va_end(args);
1573
0
    if (rename < 0) {
1574
0
        hts_log_error("Cannot rename @%s \"%s\" to \"%s\" : already exists",
1575
0
                      type, old_name, new_name);
1576
0
        return -1;
1577
0
    }
1578
0
    if (rename > 0 && TYPEKEY(type) == TYPEKEY("PG")) {
1579
        // This is just too complicated
1580
0
        hts_log_error("Renaming @PG records is not supported");
1581
0
        return -1;
1582
0
    }
1583
0
    va_start(args, ID_value);
1584
0
    ret = sam_hrecs_vupdate(hrecs, ty, args);
1585
0
    va_end(args);
1586
1587
0
    if (ret)
1588
0
        return ret;
1589
1590
    // TODO Account for @SQ-AN altnames
1591
1592
0
    if (rename) {
1593
        // Adjust the hash table to point to the new name
1594
        // sam_hrecs_update_hashes() should sort out everything else
1595
0
        khint_t k = kh_get(m_s2i, hash, old_name);
1596
0
        sam_hrec_tag_t *new_tag = sam_hrecs_find_key(ty, id_tag, NULL);
1597
0
        int r, pos;
1598
0
        assert(k < kh_end(hash));        // Or we wouldn't have found it earlier
1599
0
        assert(new_tag && new_tag->str); // id_tag should exist
1600
0
        assert(new_tag->len > 3);
1601
0
        pos = kh_val(hash, k);
1602
0
        kh_del(m_s2i, hash, k);
1603
0
        k = kh_put(m_s2i, hash, new_tag->str + 3, &r);
1604
0
        if (r < 1) {
1605
0
            hts_log_error("Failed to rename item in hash table");
1606
0
            return -1;
1607
0
        }
1608
0
        kh_val(hash, k) = pos;
1609
0
    }
1610
1611
0
    ret = sam_hrecs_update_hashes(hrecs, TYPEKEY(type), ty);
1612
1613
0
    if (!ret && hrecs->refs_changed >= 0)
1614
0
        ret = rebuild_target_arrays(bh);
1615
1616
0
    if (!ret && hrecs->dirty)
1617
0
        redact_header_text(bh);
1618
1619
0
    return ret;
1620
0
}
1621
1622
0
static int rebuild_hash(sam_hrecs_t *hrecs, const char *type) {
1623
0
    sam_hrec_type_t *head, *step;
1624
0
    khiter_t k;
1625
1626
0
    if (strncmp(type, "SQ", 2) == 0) {
1627
0
        hrecs->nref = 0;
1628
0
        kh_clear(m_s2i, hrecs->ref_hash);
1629
0
    } else if (strncmp(type, "RG", 2) == 0) {
1630
0
        hrecs->nrg = 0;
1631
0
        kh_clear(m_s2i, hrecs->rg_hash);
1632
0
    }
1633
1634
0
    k = kh_get(sam_hrecs_t, hrecs->h, TYPEKEY(type));
1635
1636
0
    if (k != kh_end(hrecs->h)) { // something to rebuild
1637
0
        head = kh_val(hrecs->h, k);
1638
0
        step = head;
1639
1640
0
        do {
1641
0
            if (sam_hrecs_update_hashes(hrecs, TYPEKEY(type), step) == -1) {
1642
0
                hts_log_error("Unable to rebuild hashes");
1643
0
                return -1;
1644
0
            }
1645
1646
0
            step = step->next;
1647
0
        } while (step != head);
1648
0
    }
1649
1650
0
    return 0;
1651
0
}
1652
1653
0
int sam_hdr_remove_except(sam_hdr_t *bh, const char *type, const char *ID_key, const char *ID_value) {
1654
0
    sam_hrecs_t *hrecs;
1655
0
    if (!bh || !type)
1656
0
        return -1;
1657
1658
0
    if (!(hrecs = bh->hrecs)) {
1659
0
        if (sam_hdr_fill_hrecs(bh) != 0)
1660
0
            return -1;
1661
0
        hrecs = bh->hrecs;
1662
0
    }
1663
1664
0
    sam_hrec_type_t *step;
1665
0
    int ret = 1, remove_all = (ID_key == NULL);
1666
1667
0
    if (!strncmp(type, "PG", 2) || !strncmp(type, "CO", 2)) {
1668
0
        hts_log_warning("Removing PG or CO lines is not supported!");
1669
0
        return -1;
1670
0
    }
1671
1672
0
    sam_hrec_type_t *type_found = sam_hrecs_find_type_id(hrecs, type, ID_key, ID_value);
1673
0
    if (!type_found) { // remove all line of this type
1674
0
        khint_t k = kh_get(sam_hrecs_t, hrecs->h, TYPEKEY(type));
1675
0
        if (k == kh_end(hrecs->h))
1676
0
            return 0;
1677
0
        type_found =  kh_val(hrecs->h, k);
1678
0
        if (!type_found)
1679
0
            return 0;
1680
0
        remove_all = 1;
1681
0
    }
1682
1683
0
    step = type_found->next;
1684
0
    while (step != type_found) {
1685
0
        sam_hrec_type_t *to_remove = step;
1686
0
        step = step->next;
1687
0
        ret &= sam_hrecs_remove_line(hrecs, type, to_remove, 0);
1688
0
    }
1689
1690
0
    if (remove_all)
1691
0
        ret &= sam_hrecs_remove_line(hrecs, type, type_found, 0);
1692
1693
    /* if RG or SQ, delete then rebuild the hashes (as it is faster
1694
       to rebuild than delete one by one).
1695
    */
1696
1697
0
    if ((strncmp(type, "SQ", 2) == 0) || (strncmp(type, "RG", 2) == 0)) {
1698
0
        if (rebuild_hash(hrecs, type)) {
1699
0
            return -1;
1700
0
        }
1701
0
    }
1702
1703
0
    if (!ret && hrecs->dirty)
1704
0
        redact_header_text(bh);
1705
1706
0
    return 0;
1707
0
}
1708
1709
0
int sam_hdr_remove_lines(sam_hdr_t *bh, const char *type, const char *id, void *vrh) {
1710
0
    sam_hrecs_t *hrecs;
1711
0
    rmhash_t *rh = (rmhash_t *)vrh;
1712
1713
0
    if (!bh || !type)
1714
0
        return -1;
1715
0
    if (!rh) // remove all lines
1716
0
        return sam_hdr_remove_except(bh, type, NULL, NULL);
1717
0
    if (!id)
1718
0
        return -1;
1719
1720
0
    if (!(hrecs = bh->hrecs)) {
1721
0
        if (sam_hdr_fill_hrecs(bh) != 0)
1722
0
            return -1;
1723
0
        hrecs = bh->hrecs;
1724
0
    }
1725
1726
0
    khint_t k = kh_get(sam_hrecs_t, hrecs->h, TYPEKEY(type));
1727
0
    if (k == kh_end(hrecs->h)) // nothing to remove from
1728
0
        return 0;
1729
1730
0
    sam_hrec_type_t *head = kh_val(hrecs->h, k);
1731
0
    if (!head) {
1732
0
        hts_log_error("Header inconsistency");
1733
0
        return -1;
1734
0
    }
1735
1736
0
    int ret = 0;
1737
0
    sam_hrec_type_t *step = head->next;
1738
0
    while (step != head) {
1739
0
        sam_hrec_tag_t *tag = sam_hrecs_find_key(step, id, NULL);
1740
0
        if (tag && tag->str && tag->len >= 3) {
1741
0
           k = kh_get(rm, rh, tag->str+3);
1742
0
           if (k == kh_end(rh)) { // value is not in the hash table, so remove
1743
0
               sam_hrec_type_t *to_remove = step;
1744
0
               step = step->next;
1745
0
               ret |= sam_hrecs_remove_line(hrecs, type, to_remove, 0);
1746
0
           } else {
1747
0
               step = step->next;
1748
0
           }
1749
0
        } else { // tag is not on the line, so skip to next line
1750
0
            step = step->next;
1751
0
        }
1752
0
    }
1753
1754
    // process the first line
1755
0
    sam_hrec_tag_t * tag = sam_hrecs_find_key(head, id, NULL);
1756
0
    if (tag && tag->str && tag->len >= 3) {
1757
0
       k = kh_get(rm, rh, tag->str+3);
1758
0
       if (k == kh_end(rh)) { // value is not in the hash table, so remove
1759
0
           sam_hrec_type_t *to_remove = head;
1760
0
           head = head->next;
1761
0
           ret |= sam_hrecs_remove_line(hrecs, type, to_remove, 0);
1762
0
       }
1763
0
    }
1764
1765
    /* if RG or SQ, delete then rebuild the hashes (as it is faster
1766
       to rebuild than delete one by one).
1767
    */
1768
1769
0
    if ((strncmp(type, "SQ", 2) == 0) || (strncmp(type, "RG", 2) == 0)) {
1770
0
        if (rebuild_hash(hrecs, type)) {
1771
0
            return -1;
1772
0
        }
1773
0
    }
1774
1775
0
    if (!ret && hrecs->dirty)
1776
0
        redact_header_text(bh);
1777
1778
0
    return ret;
1779
0
}
1780
1781
26.9k
int sam_hdr_count_lines(sam_hdr_t *bh, const char *type) {
1782
26.9k
    int count;
1783
26.9k
    sam_hrec_type_t *first_ty, *itr_ty;
1784
1785
26.9k
    if (!bh || !type)
1786
0
        return -1;
1787
1788
26.9k
    if (!bh->hrecs) {
1789
26.9k
        if (sam_hdr_fill_hrecs(bh) != 0)
1790
2.81k
            return -1;
1791
26.9k
    }
1792
1793
    // Deal with types that have counts
1794
24.1k
    switch (type[0]) {
1795
24.1k
    case 'S':
1796
24.1k
        if (type[1] == 'Q')
1797
24.1k
            return bh->hrecs->nref;
1798
0
        break;
1799
0
    case 'R':
1800
0
        if (type[1] == 'G')
1801
0
            return bh->hrecs->nrg;
1802
0
        break;
1803
0
    case 'P':
1804
0
        if (type[1] == 'G')
1805
0
            return bh->hrecs->npg;
1806
0
        break;
1807
0
    default:
1808
0
        break;
1809
24.1k
    }
1810
1811
0
    first_ty = sam_hrecs_find_type_id(bh->hrecs, type, NULL, NULL);
1812
0
    if (!first_ty)
1813
0
        return 0;
1814
1815
0
    count = 1;
1816
0
    for (itr_ty = first_ty->next;
1817
0
         itr_ty && itr_ty != first_ty; itr_ty = itr_ty->next) {
1818
0
        count++;
1819
0
    }
1820
1821
0
    return count;
1822
0
}
1823
1824
int sam_hdr_line_index(sam_hdr_t *bh,
1825
                       const char *type,
1826
0
                       const char *key) {
1827
0
    sam_hrecs_t *hrecs;
1828
0
    if (!bh || !type || !key)
1829
0
        return -2;
1830
1831
0
    if (!(hrecs = bh->hrecs)) {
1832
0
        if (sam_hdr_fill_hrecs(bh) != 0)
1833
0
            return -2;
1834
0
        hrecs = bh->hrecs;
1835
0
    }
1836
1837
0
    khint_t k;
1838
0
    int idx = -1;
1839
0
    switch (type[0]) {
1840
0
    case 'S':
1841
0
        if (type[1] == 'Q') {
1842
0
            k = kh_get(m_s2i, hrecs->ref_hash, key);
1843
0
            if (k != kh_end(hrecs->ref_hash))
1844
0
                idx = kh_val(hrecs->ref_hash, k);
1845
0
        } else {
1846
0
            hts_log_warning("Type '%s' not supported. Only @SQ, @RG and @PG lines are indexed", type);
1847
0
        }
1848
0
        break;
1849
0
    case 'R':
1850
0
        if (type[1] == 'G') {
1851
0
            k = kh_get(m_s2i, hrecs->rg_hash, key);
1852
0
            if (k != kh_end(hrecs->rg_hash))
1853
0
                idx = kh_val(hrecs->rg_hash, k);
1854
0
        } else {
1855
0
            hts_log_warning("Type '%s' not supported. Only @SQ, @RG and @PG lines are indexed", type);
1856
0
        }
1857
0
        break;
1858
0
    case 'P':
1859
0
        if (type[1] == 'G') {
1860
0
            k = kh_get(m_s2i, hrecs->pg_hash, key);
1861
0
            if (k != kh_end(hrecs->pg_hash))
1862
0
                idx = kh_val(hrecs->pg_hash, k);
1863
0
        } else {
1864
0
            hts_log_warning("Type '%s' not supported. Only @SQ, @RG and @PG lines are indexed", type);
1865
0
        }
1866
0
        break;
1867
0
    default:
1868
0
        hts_log_warning("Type '%s' not supported. Only @SQ, @RG and @PG lines are indexed", type);
1869
0
    }
1870
1871
0
    return idx;
1872
0
}
1873
1874
const char *sam_hdr_line_name(sam_hdr_t *bh,
1875
                              const char *type,
1876
0
                              int pos) {
1877
0
    sam_hrecs_t *hrecs;
1878
0
    if (!bh || !type || pos < 0)
1879
0
        return NULL;
1880
1881
0
    if (!(hrecs = bh->hrecs)) {
1882
0
        if (sam_hdr_fill_hrecs(bh) != 0)
1883
0
            return NULL;
1884
0
        hrecs = bh->hrecs;
1885
0
    }
1886
1887
0
    switch (type[0]) {
1888
0
    case 'S':
1889
0
        if (type[1] == 'Q') {
1890
0
            if (pos < hrecs->nref)
1891
0
                return hrecs->ref[pos].name;
1892
0
        } else {
1893
0
            hts_log_warning("Type '%s' not supported. Only @SQ, @RG and @PG lines are indexed", type);
1894
0
        }
1895
0
        break;
1896
0
    case 'R':
1897
0
        if (type[1] == 'G') {
1898
0
            if (pos < hrecs->nrg)
1899
0
                return hrecs->rg[pos].name;
1900
0
        } else {
1901
0
            hts_log_warning("Type '%s' not supported. Only @SQ, @RG and @PG lines are indexed", type);
1902
0
        }
1903
0
        break;
1904
0
    case 'P':
1905
0
        if (type[1] == 'G') {
1906
0
            if (pos < hrecs->npg)
1907
0
                return hrecs->pg[pos].name;
1908
0
        } else {
1909
0
            hts_log_warning("Type '%s' not supported. Only @SQ, @RG and @PG lines are indexed", type);
1910
0
        }
1911
0
        break;
1912
0
    default:
1913
0
        hts_log_warning("Type '%s' not supported. Only @SQ, @RG and @PG lines are indexed", type);
1914
0
    }
1915
1916
0
    return NULL;
1917
0
}
1918
1919
/* ==== Key:val level methods ==== */
1920
1921
int sam_hdr_find_tag_id(sam_hdr_t *bh,
1922
                     const char *type,
1923
                     const char *ID_key,
1924
                     const char *ID_value,
1925
                     const char *key,
1926
0
                     kstring_t *ks) {
1927
0
    sam_hrecs_t *hrecs;
1928
0
    if (!bh || !type || !key)
1929
0
        return -2;
1930
1931
0
    if (!(hrecs = bh->hrecs)) {
1932
0
        if (sam_hdr_fill_hrecs(bh) != 0)
1933
0
            return -2;
1934
0
        hrecs = bh->hrecs;
1935
0
    }
1936
1937
0
    sam_hrec_type_t *ty = sam_hrecs_find_type_id(hrecs, type, ID_key, ID_value);
1938
0
    if (!ty)
1939
0
        return -1;
1940
1941
0
    sam_hrec_tag_t *tag = sam_hrecs_find_key(ty, key, NULL);
1942
0
    if (!tag || !tag->str || tag->len < 4)
1943
0
        return -1;
1944
1945
0
    ks->l = 0;
1946
0
    if (kputsn(tag->str+3, tag->len-3, ks) == EOF) {
1947
0
        return -2;
1948
0
    }
1949
1950
0
    return 0;
1951
0
}
1952
1953
int sam_hdr_find_tag_pos(sam_hdr_t *bh,
1954
                     const char *type,
1955
                     int pos,
1956
                     const char *key,
1957
0
                     kstring_t *ks) {
1958
0
    sam_hrecs_t *hrecs;
1959
0
    if (!bh || !type || !key)
1960
0
        return -2;
1961
1962
0
    if (!(hrecs = bh->hrecs)) {
1963
0
        if (sam_hdr_fill_hrecs(bh) != 0)
1964
0
            return -2;
1965
0
        hrecs = bh->hrecs;
1966
0
    }
1967
1968
0
    sam_hrec_type_t *ty = sam_hrecs_find_type_pos(hrecs, type, pos);
1969
0
    if (!ty)
1970
0
        return -1;
1971
1972
0
    sam_hrec_tag_t *tag = sam_hrecs_find_key(ty, key, NULL);
1973
0
    if (!tag || !tag->str || tag->len < 4)
1974
0
        return -1;
1975
1976
0
    ks->l = 0;
1977
0
    if (kputsn(tag->str+3, tag->len-3, ks) == EOF) {
1978
0
        return -2;
1979
0
    }
1980
1981
0
    return 0;
1982
0
}
1983
1984
int sam_hdr_remove_tag_id(sam_hdr_t *bh,
1985
        const char *type,
1986
        const char *ID_key,
1987
        const char *ID_value,
1988
0
        const char *key) {
1989
0
    sam_hrecs_t *hrecs;
1990
0
    if (!bh || !type || !key)
1991
0
        return -1;
1992
1993
0
    if (!(hrecs = bh->hrecs)) {
1994
0
        if (sam_hdr_fill_hrecs(bh) != 0)
1995
0
            return -1;
1996
0
        hrecs = bh->hrecs;
1997
0
    }
1998
1999
0
    sam_hrec_type_t *ty = sam_hrecs_find_type_id(hrecs, type, ID_key, ID_value);
2000
0
    if (!ty)
2001
0
        return -1;
2002
2003
0
    int ret = sam_hrecs_remove_key(hrecs, ty, key);
2004
0
    if (!ret && hrecs->dirty)
2005
0
        redact_header_text(bh);
2006
2007
0
    return ret;
2008
0
}
2009
2010
/*
2011
 * Reconstructs a kstring from the header hash table.
2012
 * Returns 0 on success
2013
 *        -1 on failure
2014
 */
2015
29.8k
int sam_hrecs_rebuild_text(const sam_hrecs_t *hrecs, kstring_t *ks) {
2016
29.8k
    ks->l = 0;
2017
2018
29.8k
    if (!hrecs->h || !hrecs->h->size) {
2019
18.9k
        return kputsn("", 0, ks) >= 0 ? 0 : -1;
2020
18.9k
    }
2021
10.8k
    if (sam_hrecs_rebuild_lines(hrecs, ks) != 0)
2022
0
        return -1;
2023
2024
10.8k
    return 0;
2025
10.8k
}
2026
2027
/*
2028
 * Looks up a reference sequence by name and returns the numerical ID.
2029
 * Returns -1 if unknown reference; -2 if header could not be parsed.
2030
 */
2031
456k
int sam_hdr_name2tid(sam_hdr_t *bh, const char *ref) {
2032
456k
    sam_hrecs_t *hrecs;
2033
456k
    khint_t k;
2034
2035
456k
    if (!bh)
2036
0
        return -1;
2037
2038
456k
    if (!(hrecs = bh->hrecs)) {
2039
54
        if (sam_hdr_fill_hrecs(bh) != 0)
2040
54
            return -2;
2041
0
        hrecs = bh->hrecs;
2042
0
    }
2043
2044
456k
    if (!hrecs->ref_hash)
2045
0
        return -1;
2046
2047
456k
    k = kh_get(m_s2i, hrecs->ref_hash, ref);
2048
456k
    return k == kh_end(hrecs->ref_hash) ? -1 : kh_val(hrecs->ref_hash, k);
2049
456k
}
2050
2051
0
const char *sam_hdr_tid2name(const sam_hdr_t *h, int tid) {
2052
0
    sam_hrecs_t *hrecs;
2053
2054
0
    if (!h || tid < 0)
2055
0
        return NULL;
2056
2057
0
    if ((hrecs = h->hrecs) != NULL && tid < hrecs->nref) {
2058
0
        return hrecs->ref[tid].name;
2059
0
    } else {
2060
0
        if (tid < h->n_targets)
2061
0
            return h->target_name[tid];
2062
0
    }
2063
2064
0
    return NULL;
2065
0
}
2066
2067
0
hts_pos_t sam_hdr_tid2len(const sam_hdr_t *h, int tid) {
2068
0
    sam_hrecs_t *hrecs;
2069
2070
0
    if (!h || tid < 0)
2071
0
        return 0;
2072
2073
0
    if ((hrecs = h->hrecs) != NULL && tid < hrecs->nref) {
2074
0
        return hrecs->ref[tid].len;
2075
0
    } else {
2076
0
        if (tid < h->n_targets) {
2077
0
            if (h->target_len[tid] < UINT32_MAX || !h->sdict) {
2078
0
                return h->target_len[tid];
2079
0
            } else {
2080
0
                khash_t(s2i) *long_refs = (khash_t(s2i) *) h->sdict;
2081
0
                khint_t k = kh_get(s2i, long_refs, h->target_name[tid]);
2082
0
                if (k < kh_end(long_refs)) {
2083
0
                    return kh_val(long_refs, k);
2084
0
                } else {
2085
0
                    return UINT32_MAX;
2086
0
                }
2087
0
            }
2088
0
        }
2089
0
    }
2090
2091
0
    return 0;
2092
0
}
2093
2094
/*
2095
 * Fixes any PP links in @PG headers.
2096
 * If the entries are in order then this doesn't need doing, but in case
2097
 * our header is out of order this goes through the hrecs->pg[] array
2098
 * setting the prev_id field.
2099
 *
2100
 * Note we can have multiple complete chains. This code should identify the
2101
 * tails of these chains as these are the entries we have to link to in
2102
 * subsequent PP records.
2103
 *
2104
 * Returns 0 on success
2105
 *        -1 on failure (indicating broken PG/PP records)
2106
 */
2107
0
static int sam_hdr_link_pg(sam_hdr_t *bh) {
2108
0
    sam_hrecs_t *hrecs;
2109
0
    int i, j, ret = 0, *new_pg_end;
2110
2111
0
    if (!bh)
2112
0
        return -1;
2113
2114
0
    if (!(hrecs = bh->hrecs)) {
2115
0
        if (sam_hdr_fill_hrecs(bh) != 0)
2116
0
            return -1;
2117
0
        hrecs = bh->hrecs;
2118
0
    }
2119
2120
0
    if (!hrecs->pgs_changed || !hrecs->npg)
2121
0
        return 0;
2122
2123
0
    hrecs->npg_end_alloc = hrecs->npg;
2124
0
    new_pg_end = realloc(hrecs->pg_end, hrecs->npg * sizeof(*new_pg_end));
2125
0
    if (!new_pg_end)
2126
0
        return -1;
2127
0
    hrecs->pg_end = new_pg_end;
2128
0
    int *chain_size = calloc(hrecs->npg, sizeof(int));
2129
0
    if (!chain_size)
2130
0
        return -1;
2131
2132
0
    for (i = 0; i < hrecs->npg; i++)
2133
0
        hrecs->pg_end[i] = i;
2134
2135
0
    for (i = 0; i < hrecs->npg; i++) {
2136
0
        khint_t k;
2137
0
        sam_hrec_tag_t *tag;
2138
2139
0
        assert(hrecs->pg[i].ty != NULL);
2140
0
        for (tag = hrecs->pg[i].ty->tag; tag; tag = tag->next) {
2141
0
            if (tag->str[0] == 'P' && tag->str[1] == 'P')
2142
0
                break;
2143
0
        }
2144
0
        if (!tag) {
2145
            /* Chain start points */
2146
0
            continue;
2147
0
        }
2148
2149
0
        k = kh_get(m_s2i, hrecs->pg_hash, tag->str+3);
2150
2151
0
        if (k == kh_end(hrecs->pg_hash)) {
2152
0
            hts_log_warning("PG line with ID:%s has a PP link to missing program '%s'",
2153
0
                    hrecs->pg[i].name, tag->str+3);
2154
0
            continue;
2155
0
        }
2156
2157
0
        int pp_idx = kh_val(hrecs->pg_hash, k);
2158
0
        if (pp_idx == i) {
2159
0
            hts_log_warning("PG line with ID:%s has a PP link to itself",
2160
0
                            hrecs->pg[i].name);
2161
0
            continue;
2162
0
        }
2163
2164
0
        hrecs->pg[i].prev_id = hrecs->pg[pp_idx].id;
2165
0
        hrecs->pg_end[pp_idx] = -1;
2166
0
        chain_size[i] = chain_size[pp_idx]+1;
2167
0
    }
2168
2169
0
    int last_end = -1;
2170
0
    for (i = j = 0; i < hrecs->npg; i++) {
2171
0
        if (hrecs->pg_end[i] != -1) {
2172
0
            last_end = hrecs->pg_end[i];
2173
0
            if (chain_size[i] > 0)
2174
0
                hrecs->pg_end[j++] = hrecs->pg_end[i];
2175
0
        }
2176
0
    }
2177
    /* Only leafs? Choose the last one! */
2178
0
    if (!j && hrecs->npg_end > 0 && last_end >= 0) {
2179
0
        hrecs->pg_end[0] = last_end;
2180
0
        j = 1;
2181
0
    }
2182
2183
0
    hrecs->npg_end = j;
2184
0
    hrecs->pgs_changed = 0;
2185
2186
    /* mark as dirty or empty for rebuild */
2187
0
    hrecs->dirty = 1;
2188
0
    redact_header_text(bh);
2189
0
    free(chain_size);
2190
2191
0
    return ret;
2192
0
}
2193
2194
/*
2195
 * Returns a unique ID from a base name.
2196
 *
2197
 * The value returned is valid until the next call to
2198
 * this function.
2199
 */
2200
0
const char *sam_hdr_pg_id(sam_hdr_t *bh, const char *name) {
2201
0
    sam_hrecs_t *hrecs;
2202
0
    size_t name_len;
2203
0
    const size_t name_extra = 17;
2204
0
    if (!bh || !name)
2205
0
        return NULL;
2206
2207
0
    if (!(hrecs = bh->hrecs)) {
2208
0
        if (sam_hdr_fill_hrecs(bh) != 0)
2209
0
            return NULL;
2210
0
        hrecs = bh->hrecs;
2211
0
    }
2212
2213
0
    khint_t k = kh_get(m_s2i, hrecs->pg_hash, name);
2214
0
    if (k == kh_end(hrecs->pg_hash))
2215
0
        return name;
2216
2217
0
    name_len = strlen(name);
2218
0
    if (name_len > 1000) name_len = 1000;
2219
0
    if (hrecs->ID_buf_sz < name_len + name_extra) {
2220
0
        char *new_ID_buf = realloc(hrecs->ID_buf, name_len + name_extra);
2221
0
        if (new_ID_buf == NULL)
2222
0
            return NULL;
2223
0
        hrecs->ID_buf = new_ID_buf;
2224
0
        hrecs->ID_buf_sz = name_len + name_extra;
2225
0
    }
2226
2227
0
    do {
2228
0
        snprintf(hrecs->ID_buf, hrecs->ID_buf_sz, "%.1000s.%d", name, hrecs->ID_cnt++);
2229
0
        k = kh_get(m_s2i, hrecs->pg_hash, hrecs->ID_buf);
2230
0
    } while (k != kh_end(hrecs->pg_hash));
2231
2232
0
    return hrecs->ID_buf;
2233
0
}
2234
2235
/*
2236
 * Add an @PG line.
2237
 *
2238
 * If we wish complete control over this use sam_hdr_add_line() directly. This
2239
 * function uses that, but attempts to do a lot of tedious house work for
2240
 * you too.
2241
 *
2242
 * - It will generate a suitable ID if the supplied one clashes.
2243
 * - It will generate multiple @PG records if we have multiple PG chains.
2244
 *
2245
 * Call it as per sam_hdr_add_line() with a series of key,value pairs ending
2246
 * in NULL.
2247
 *
2248
 * Returns 0 on success
2249
 *        -1 on failure
2250
 */
2251
0
int sam_hdr_add_pg(sam_hdr_t *bh, const char *name, ...) {
2252
0
    sam_hrecs_t *hrecs;
2253
0
    const char *specified_id = NULL, *specified_pn = NULL, *specified_pp = NULL;
2254
0
    const char *key, *val;
2255
0
    if (!bh)
2256
0
        return -1;
2257
2258
0
    if (!(hrecs = bh->hrecs)) {
2259
0
        if (sam_hdr_fill_hrecs(bh) != 0)
2260
0
            return -1;
2261
0
        hrecs = bh->hrecs;
2262
0
    }
2263
2264
0
    bh->hrecs->pgs_changed = 1;
2265
0
    if (sam_hdr_link_pg(bh) < 0) {
2266
0
        hts_log_error("Error linking @PG lines");
2267
0
        return -1;
2268
0
    }
2269
2270
0
    va_list args;
2271
    // Check for ID / PN / PP tags in varargs list
2272
0
    va_start(args, name);
2273
0
    while ((key = va_arg(args, const char *)) != NULL) {
2274
0
        val = va_arg(args, const char *);
2275
0
        if (!val) break;
2276
0
        if (strcmp(key, "PN") == 0 && *val != '\0')
2277
0
            specified_pn = val;
2278
0
        else if (strcmp(key, "PP") == 0 && *val != '\0')
2279
0
            specified_pp = val;
2280
0
        else if (strcmp(key, "ID") == 0 && *val != '\0')
2281
0
            specified_id = val;
2282
0
    }
2283
0
    va_end(args);
2284
2285
0
    if (specified_id && hrecs->pg_hash) {
2286
0
        khint_t k = kh_get(m_s2i, hrecs->pg_hash, specified_id);
2287
0
        if (k != kh_end(hrecs->pg_hash)) {
2288
0
            hts_log_error("Header @PG ID:%s already present", specified_id);
2289
0
            return -1;
2290
0
        }
2291
0
    }
2292
2293
0
    if (specified_pp && hrecs->pg_hash) {
2294
0
        khint_t k = kh_get(m_s2i, hrecs->pg_hash, specified_pp);
2295
0
        if (k == kh_end(hrecs->pg_hash)) {
2296
0
            hts_log_error("Header @PG ID:%s referred to by PP tag not present",
2297
0
                          specified_pp);
2298
0
            return -1;
2299
0
        }
2300
0
    }
2301
2302
0
    if (!specified_pp && hrecs->npg_end) {
2303
        /* Copy ends array to avoid us looping while modifying it */
2304
0
        int *end = malloc(hrecs->npg_end * sizeof(int));
2305
0
        int i, nends = hrecs->npg_end;
2306
2307
0
        if (!end)
2308
0
            return -1;
2309
2310
0
        memcpy(end, hrecs->pg_end, nends * sizeof(*end));
2311
2312
0
        for (i = 0; i < nends; i++) {
2313
0
            const char *id = !specified_id ? sam_hdr_pg_id(bh, name) : "";
2314
0
            if (!id) {
2315
0
                free(end);
2316
0
                return -1;
2317
0
            }
2318
0
            assert(end[i] >= 0 && end[i] < hrecs->npg);
2319
0
            va_start(args, name);
2320
0
            if (-1 == sam_hrecs_vadd(hrecs, "PG", args,
2321
0
                                     "ID", id,
2322
0
                                     "PN", !specified_pn ? name : "",
2323
0
                                     "PP", hrecs->pg[end[i]].name,
2324
0
                                     NULL)) {
2325
0
                free(end);
2326
0
                return  -1;
2327
0
            }
2328
0
            va_end(args);
2329
0
        }
2330
2331
0
        free(end);
2332
0
    } else {
2333
0
        const char *id = !specified_id ? sam_hdr_pg_id(bh, name) : "";
2334
0
        if (!id)
2335
0
            return -1;
2336
0
        va_start(args, name);
2337
0
        if (-1 == sam_hrecs_vadd(hrecs, "PG", args,
2338
0
                                 "ID", id,
2339
0
                                 "PN", !specified_pn ? name : "",
2340
0
                                 NULL))
2341
0
            return -1;
2342
0
        va_end(args);
2343
0
    }
2344
2345
0
    hrecs->dirty = 1;
2346
0
    redact_header_text(bh);
2347
2348
0
    return 0;
2349
0
}
2350
2351
/*! Increments a reference count on bh.
2352
 *
2353
 * This permits multiple files to share the same header, all calling
2354
 * sam_hdr_destroy when done, without causing errors for other open files.
2355
 */
2356
0
void sam_hdr_incr_ref(sam_hdr_t *bh) {
2357
0
    if (!bh)
2358
0
        return;
2359
0
    bh->ref_count++;
2360
0
}
2361
2362
/* ==== Internal methods ==== */
2363
2364
/*
2365
 * Creates an empty SAM header.  Allocates space for the SAM header
2366
 * structures (hash tables) ready to be populated.
2367
 *
2368
 * Returns a sam_hrecs_t struct on success (free with sam_hrecs_free())
2369
 *         NULL on failure
2370
 */
2371
43.3k
sam_hrecs_t *sam_hrecs_new(void) {
2372
43.3k
    sam_hrecs_t *hrecs = calloc(1, sizeof(*hrecs));
2373
2374
43.3k
    if (!hrecs)
2375
0
        return NULL;
2376
2377
43.3k
    hrecs->h = kh_init(sam_hrecs_t);
2378
43.3k
    if (!hrecs->h)
2379
0
        goto err;
2380
2381
43.3k
    hrecs->ID_cnt = 1;
2382
2383
43.3k
    hrecs->nref = 0;
2384
43.3k
    hrecs->ref_sz = 0;
2385
43.3k
    hrecs->ref  = NULL;
2386
43.3k
    if (!(hrecs->ref_hash = kh_init(m_s2i)))
2387
0
        goto err;
2388
43.3k
    hrecs->refs_changed = -1;
2389
2390
43.3k
    hrecs->nrg = 0;
2391
43.3k
    hrecs->rg_sz = 0;
2392
43.3k
    hrecs->rg  = NULL;
2393
43.3k
    if (!(hrecs->rg_hash = kh_init(m_s2i)))
2394
0
        goto err;
2395
2396
43.3k
    hrecs->npg = 0;
2397
43.3k
    hrecs->pg_sz = 0;
2398
43.3k
    hrecs->pg  = NULL;
2399
43.3k
    hrecs->npg_end = hrecs->npg_end_alloc = 0;
2400
43.3k
    hrecs->pg_end = NULL;
2401
43.3k
    if (!(hrecs->pg_hash = kh_init(m_s2i)))
2402
0
        goto err;
2403
2404
43.3k
    if (!(hrecs->tag_pool = pool_create(sizeof(sam_hrec_tag_t))))
2405
0
        goto err;
2406
2407
43.3k
    if (!(hrecs->type_pool = pool_create(sizeof(sam_hrec_type_t))))
2408
0
        goto err;
2409
2410
43.3k
    if (!(hrecs->str_pool = string_pool_create(65536)))
2411
0
        goto err;
2412
2413
43.3k
    if (sam_hrecs_init_type_order(hrecs, NULL))
2414
0
        goto err;
2415
2416
43.3k
    return hrecs;
2417
2418
0
err:
2419
0
    if (hrecs->h)
2420
0
        kh_destroy(sam_hrecs_t, hrecs->h);
2421
2422
0
    if (hrecs->tag_pool)
2423
0
        pool_destroy(hrecs->tag_pool);
2424
2425
0
    if (hrecs->type_pool)
2426
0
        pool_destroy(hrecs->type_pool);
2427
2428
0
    if (hrecs->str_pool)
2429
0
        string_pool_destroy(hrecs->str_pool);
2430
2431
0
    free(hrecs);
2432
2433
0
    return NULL;
2434
43.3k
}
2435
#if 0
2436
/*
2437
 * Produces a duplicate copy of source and returns it.
2438
 * Returns NULL on failure
2439
 */
2440
sam_hrecs_t *sam_hrecs_dup(sam_hrecs_t *source) {
2441
        return NULL;
2442
}
2443
#endif
2444
/*! Deallocates all storage used by a sam_hrecs_t struct.
2445
 *
2446
 * This also decrements the header reference count. If after decrementing
2447
 * it is still non-zero then the header is assumed to be in use by another
2448
 * caller and the free is not done.
2449
 *
2450
 */
2451
43.3k
void sam_hrecs_free(sam_hrecs_t *hrecs) {
2452
43.3k
    if (!hrecs)
2453
0
        return;
2454
2455
43.3k
    if (hrecs->h)
2456
43.3k
        kh_destroy(sam_hrecs_t, hrecs->h);
2457
2458
43.3k
    if (hrecs->ref_hash)
2459
43.3k
        kh_destroy(m_s2i, hrecs->ref_hash);
2460
2461
43.3k
    if (hrecs->ref)
2462
10.8k
        free(hrecs->ref);
2463
2464
43.3k
    if (hrecs->rg_hash)
2465
43.3k
        kh_destroy(m_s2i, hrecs->rg_hash);
2466
2467
43.3k
    if (hrecs->rg)
2468
408
        free(hrecs->rg);
2469
2470
43.3k
    if (hrecs->pg_hash)
2471
43.3k
        kh_destroy(m_s2i, hrecs->pg_hash);
2472
2473
43.3k
    if (hrecs->pg)
2474
1.11k
        free(hrecs->pg);
2475
2476
43.3k
    if (hrecs->pg_end)
2477
1.01k
        free(hrecs->pg_end);
2478
2479
43.3k
    if (hrecs->type_pool)
2480
43.3k
        pool_destroy(hrecs->type_pool);
2481
2482
43.3k
    if (hrecs->tag_pool)
2483
43.3k
        pool_destroy(hrecs->tag_pool);
2484
2485
43.3k
    if (hrecs->str_pool)
2486
43.3k
        string_pool_destroy(hrecs->str_pool);
2487
2488
43.3k
    if (hrecs->type_order)
2489
43.3k
        free(hrecs->type_order);
2490
2491
43.3k
    if (hrecs->ID_buf)
2492
0
        free(hrecs->ID_buf);
2493
2494
43.3k
    free(hrecs);
2495
43.3k
}
2496
2497
/*
2498
 * Internal method already used by the CRAM code
2499
 * Returns the first header item matching 'type'. If ID is non-NULL it checks
2500
 * for the tag ID: and compares against the specified ID.
2501
 *
2502
 * Returns NULL if no type/ID is found
2503
 */
2504
sam_hrec_type_t *sam_hrecs_find_type_id(sam_hrecs_t *hrecs, const char *type,
2505
10.9k
                                     const char *ID_key, const char *ID_value) {
2506
10.9k
    if (!hrecs || !type)
2507
0
        return NULL;
2508
10.9k
    sam_hrec_type_t *t1, *t2;
2509
10.9k
    khint_t k;
2510
2511
    /* Special case for types we have prebuilt hashes on */
2512
10.9k
    if (ID_key) {
2513
10.9k
        if (!ID_value)
2514
0
            return NULL;
2515
2516
10.9k
        if (type[0]   == 'S' && type[1]   == 'Q' &&
2517
10.9k
            ID_key[0] == 'S' && ID_key[1] == 'N') {
2518
10.9k
            k = kh_get(m_s2i, hrecs->ref_hash, ID_value);
2519
10.9k
            return k != kh_end(hrecs->ref_hash)
2520
10.9k
                ? hrecs->ref[kh_val(hrecs->ref_hash, k)].ty
2521
10.9k
                : NULL;
2522
10.9k
        }
2523
2524
0
        if (type[0]   == 'R' && type[1]   == 'G' &&
2525
0
            ID_key[0] == 'I' && ID_key[1] == 'D') {
2526
0
            k = kh_get(m_s2i, hrecs->rg_hash, ID_value);
2527
0
            return k != kh_end(hrecs->rg_hash)
2528
0
                ? hrecs->rg[kh_val(hrecs->rg_hash, k)].ty
2529
0
                : NULL;
2530
0
        }
2531
2532
0
        if (type[0]   == 'P' && type[1]   == 'G' &&
2533
0
            ID_key[0] == 'I' && ID_key[1] == 'D') {
2534
0
            k = kh_get(m_s2i, hrecs->pg_hash, ID_value);
2535
0
            return k != kh_end(hrecs->pg_hash)
2536
0
                ? hrecs->pg[kh_val(hrecs->pg_hash, k)].ty
2537
0
                : NULL;
2538
0
        }
2539
0
    }
2540
2541
0
    k = kh_get(sam_hrecs_t, hrecs->h, TYPEKEY(type));
2542
0
    if (k == kh_end(hrecs->h))
2543
0
        return NULL;
2544
2545
0
    if (!ID_key)
2546
0
        return kh_val(hrecs->h, k);
2547
2548
0
    t1 = t2 = kh_val(hrecs->h, k);
2549
0
    do {
2550
0
        sam_hrec_tag_t *tag;
2551
0
        for (tag = t1->tag; tag; tag = tag->next) {
2552
0
            if (tag->str[0] == ID_key[0] && tag->str[1] == ID_key[1]) {
2553
0
                const char *cp1 = tag->str+3;
2554
0
                const char *cp2 = ID_value;
2555
0
                while (*cp1 && *cp1 == *cp2)
2556
0
                    cp1++, cp2++;
2557
0
                if (*cp2 || *cp1)
2558
0
                    continue;
2559
0
                return t1;
2560
0
            }
2561
0
        }
2562
0
        t1 = t1->next;
2563
0
    } while (t1 != t2);
2564
2565
0
    return NULL;
2566
0
}
2567
2568
/*
2569
 * Adds or updates tag key,value pairs in a header line.
2570
 * Eg for adding M5 tags to @SQ lines or updating sort order for the
2571
 * @HD line.
2572
 *
2573
 * va_list contains multiple key,value pairs ending in NULL.
2574
 *
2575
 * Returns 0 on success
2576
 *        -1 on failure
2577
 */
2578
701
int sam_hrecs_vupdate(sam_hrecs_t *hrecs, sam_hrec_type_t *type, va_list ap) {
2579
701
    if (!hrecs)
2580
0
        return -1;
2581
2582
1.40k
    for (;;) {
2583
1.40k
        char *k, *v, *str;
2584
1.40k
        sam_hrec_tag_t *tag, *prev = NULL;
2585
2586
1.40k
        if (!(k = (char *)va_arg(ap, char *)))
2587
701
            break;
2588
701
        if (!(v = va_arg(ap, char *)))
2589
0
            v = "";
2590
2591
701
        tag = sam_hrecs_find_key(type, k, &prev);
2592
701
        if (!tag) {
2593
0
            if (!(tag = pool_alloc(hrecs->tag_pool)))
2594
0
                return -1;
2595
0
            if (prev)
2596
0
                prev->next = tag;
2597
0
            else
2598
0
                type->tag = tag;
2599
2600
0
            tag->next = NULL;
2601
0
        }
2602
2603
701
        tag->len = 3 + strlen(v);
2604
701
        str = string_alloc(hrecs->str_pool, tag->len+1);
2605
701
        if (!str)
2606
0
            return -1;
2607
2608
701
        if (snprintf(str, tag->len+1, "%2.2s:%s", k, v) < 0)
2609
0
            return -1;
2610
2611
701
        tag->str = str;
2612
701
    }
2613
2614
701
    hrecs->dirty = 1; //mark text as dirty and force a rebuild
2615
2616
701
    return 0;
2617
701
}
2618
2619
/*
2620
 * Adds or updates tag key,value pairs in a header line.
2621
 * Eg for adding M5 tags to @SQ lines or updating sort order for the
2622
 * @HD line.
2623
 *
2624
 * Specify multiple key,value pairs ending in NULL.
2625
 *
2626
 * Returns 0 on success
2627
 *        -1 on failure
2628
 */
2629
701
static int sam_hrecs_update(sam_hrecs_t *hrecs, sam_hrec_type_t *type, ...) {
2630
701
    va_list args;
2631
701
    int res;
2632
701
    va_start(args, type);
2633
701
    res = sam_hrecs_vupdate(hrecs, type, args);
2634
701
    va_end(args);
2635
701
    return res;
2636
701
}
2637
2638
/*
2639
 * Looks for a specific key in a single sam header line identified by *type.
2640
 * If prev is non-NULL it also fills this out with the previous tag, to
2641
 * permit use in key removal. *prev is set to NULL when the tag is the first
2642
 * key in the list. When a tag isn't found, prev (if non NULL) will be the last
2643
 * tag in the existing list.
2644
 *
2645
 * Returns the tag pointer on success
2646
 *         NULL on failure
2647
 */
2648
sam_hrec_tag_t *sam_hrecs_find_key(sam_hrec_type_t *type,
2649
                                   const char *key,
2650
73.2k
                                   sam_hrec_tag_t **prev) {
2651
73.2k
    sam_hrec_tag_t *tag, *p = NULL;
2652
73.2k
    if (!type)
2653
0
        return NULL;
2654
2655
134k
    for (tag = type->tag; tag; p = tag, tag = tag->next) {
2656
122k
        if (tag->str[0] == key[0] && tag->str[1] == key[1]) {
2657
60.9k
            if (prev)
2658
701
                *prev = p;
2659
60.9k
            return tag;
2660
60.9k
        }
2661
122k
    }
2662
2663
12.3k
    if (prev)
2664
0
        *prev = p;
2665
2666
12.3k
    return NULL;
2667
73.2k
}
2668
2669
int sam_hrecs_remove_key(sam_hrecs_t *hrecs,
2670
                         sam_hrec_type_t *type,
2671
0
                         const char *key) {
2672
0
    sam_hrec_tag_t *tag, *prev;
2673
0
    if (!hrecs)
2674
0
        return -1;
2675
0
    tag = sam_hrecs_find_key(type, key, &prev);
2676
0
    if (!tag)
2677
0
        return 0; // Not there anyway
2678
2679
0
    if (type->type == TYPEKEY("SQ") && tag->str[0] == 'A' && tag->str[1] == 'N') {
2680
0
        assert(tag->len >= 3);
2681
0
        sam_hrec_tag_t *sn_tag = sam_hrecs_find_key(type, "SN", NULL);
2682
0
        if (sn_tag) {
2683
0
            assert(sn_tag->len >= 3);
2684
0
            khint_t k = kh_get(m_s2i, hrecs->ref_hash, sn_tag->str + 3);
2685
0
            if (k != kh_end(hrecs->ref_hash))
2686
0
                sam_hrecs_remove_ref_altnames(hrecs, kh_val(hrecs->ref_hash, k), tag->str + 3);
2687
0
        }
2688
0
    }
2689
2690
0
    if (!prev) { //first tag
2691
0
        type->tag = tag->next;
2692
0
    } else {
2693
0
        prev->next = tag->next;
2694
0
    }
2695
0
    pool_free(hrecs->tag_pool, tag);
2696
0
    hrecs->dirty = 1; //mark text as dirty and force a rebuild
2697
2698
0
    return 1;
2699
0
}
2700
2701
/*
2702
 * Looks up a read-group by name and returns a pointer to the start of the
2703
 * associated tag list.
2704
 *
2705
 * Returns NULL on failure
2706
 */
2707
436
sam_hrec_rg_t *sam_hrecs_find_rg(sam_hrecs_t *hrecs, const char *rg) {
2708
436
    khint_t k = kh_get(m_s2i, hrecs->rg_hash, rg);
2709
436
    return k == kh_end(hrecs->rg_hash)
2710
436
        ? NULL
2711
436
        : &hrecs->rg[kh_val(hrecs->rg_hash, k)];
2712
436
}
2713
2714
#if DEBUG_HEADER
2715
void sam_hrecs_dump(sam_hrecs_t *hrecs) {
2716
    khint_t k;
2717
    int i;
2718
2719
    printf("===DUMP===\n");
2720
    for (k = kh_begin(hrecs->h); k != kh_end(hrecs->h); k++) {
2721
        sam_hrec_type_t *t1, *t2;
2722
        char c[2];
2723
        int idx = 0;
2724
2725
        if (!kh_exist(hrecs->h, k))
2726
            continue;
2727
2728
        t1 = t2 = kh_val(hrecs->h, k);
2729
        c[0] = kh_key(hrecs->h, k)>>8;
2730
        c[1] = kh_key(hrecs->h, k)&0xff;
2731
        printf("Type %.2s\n", c);
2732
2733
        do {
2734
            sam_hrec_tag_t *tag;
2735
            printf(">>>%d ", idx++);
2736
            for (tag = t1->tag; tag; tag=tag->next) {
2737
                if (strncmp(c, "CO", 2))
2738
                    printf("\"%.2s\":\"%.*s\"\t", tag->str, tag->len-3, tag->str+3);
2739
                else
2740
                    printf("%s", tag->str);
2741
            }
2742
            putchar('\n');
2743
            t1 = t1->next;
2744
        } while (t1 != t2);
2745
    }
2746
2747
    /* Dump out PG chains */
2748
    printf("\n@PG chains:\n");
2749
    for (i = 0; i < hrecs->npg_end; i++) {
2750
        int j;
2751
        printf("  %d:", i);
2752
        for (j = hrecs->pg_end[i]; j != -1; j = hrecs->pg[j].prev_id) {
2753
            printf("%s%d(%.*s)",
2754
                   j == hrecs->pg_end[i] ? " " : "->",
2755
                   j, hrecs->pg[j].name_len, hrecs->pg[j].name);
2756
        }
2757
        printf("\n");
2758
    }
2759
2760
    puts("===END DUMP===");
2761
}
2762
#endif
2763
2764
/*
2765
 * Returns the sort order:
2766
 */
2767
0
enum sam_sort_order sam_hrecs_sort_order(sam_hrecs_t *hrecs) {
2768
0
    khint_t k;
2769
0
    enum sam_sort_order so;
2770
2771
0
    so = ORDER_UNKNOWN;
2772
0
    k = kh_get(sam_hrecs_t, hrecs->h, TYPEKEY("HD"));
2773
0
    if (k != kh_end(hrecs->h)) {
2774
0
        sam_hrec_type_t *ty = kh_val(hrecs->h, k);
2775
0
        sam_hrec_tag_t *tag;
2776
0
        for (tag = ty->tag; tag; tag = tag->next) {
2777
0
            if (tag->str[0] == 'S' && tag->str[1] == 'O') {
2778
0
                if (strcmp(tag->str+3, "unsorted") == 0)
2779
0
                    so = ORDER_UNSORTED;
2780
0
                else if (strcmp(tag->str+3, "queryname") == 0)
2781
0
                    so = ORDER_NAME;
2782
0
                else if (strcmp(tag->str+3, "coordinate") == 0)
2783
0
                    so = ORDER_COORD;
2784
0
                else if (strcmp(tag->str+3, "unknown") != 0)
2785
0
                    hts_log_error("Unknown sort order field: %s", tag->str+3);
2786
0
            }
2787
0
        }
2788
0
    }
2789
2790
0
    return so;
2791
0
}
2792
2793
0
enum sam_group_order sam_hrecs_group_order(sam_hrecs_t *hrecs) {
2794
0
    khint_t k;
2795
0
    enum sam_group_order go;
2796
2797
0
    go = ORDER_NONE;
2798
0
    k = kh_get(sam_hrecs_t, hrecs->h, TYPEKEY("HD"));
2799
0
    if (k != kh_end(hrecs->h)) {
2800
0
        sam_hrec_type_t *ty = kh_val(hrecs->h, k);
2801
0
        sam_hrec_tag_t *tag;
2802
0
        for (tag = ty->tag; tag; tag = tag->next) {
2803
0
            if (tag->str[0] == 'G' && tag->str[1] == 'O') {
2804
0
                if (strcmp(tag->str+3, "query") == 0)
2805
0
                    go = ORDER_QUERY;
2806
0
                else if (strcmp(tag->str+3, "reference") == 0)
2807
0
                    go = ORDER_REFERENCE;
2808
0
            }
2809
0
        }
2810
0
    }
2811
2812
0
    return go;
2813
0
}