Coverage Report

Created: 2023-01-24 06:07

/src/htslib/header.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
Copyright (c) 2018-2020 Genome Research Ltd.
3
Authors: James Bonfield <jkb@sanger.ac.uk>, Valeriu Ohan <vo2@sanger.ac.uk>
4
5
Redistribution and use in source and binary forms, with or without
6
modification, are permitted provided that the following conditions are met:
7
8
   1. Redistributions of source code must retain the above copyright notice,
9
this list of conditions and the following disclaimer.
10
11
   2. Redistributions in binary form must reproduce the above copyright notice,
12
this list of conditions and the following disclaimer in the documentation
13
and/or other materials provided with the distribution.
14
15
   3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
16
Institute nor the names of its contributors may be used to endorse or promote
17
products derived from this software without specific prior written permission.
18
19
THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
20
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
23
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
*/
30
31
#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
32
#include <config.h>
33
34
#include <string.h>
35
#include <assert.h>
36
#include <errno.h>
37
#include "textutils_internal.h"
38
#include "header.h"
39
40
// Hash table for removing multiple lines from the header
41
KHASH_SET_INIT_STR(rm)
42
// Used for long refs in SAM files
43
KHASH_DECLARE(s2i, kh_cstr_t, int64_t)
44
45
typedef khash_t(rm) rmhash_t;
46
47
static int sam_hdr_link_pg(sam_hdr_t *bh);
48
49
static int sam_hrecs_vupdate(sam_hrecs_t *hrecs, sam_hrec_type_t *type, va_list ap);
50
static int sam_hrecs_update(sam_hrecs_t *hrecs, sam_hrec_type_t *type, ...);
51
52
53
660
#define MAX_ERROR_QUOTE 320 // Prevent over-long error messages
54
391
static void sam_hrecs_error(const char *msg, const char *line, size_t len, size_t lno) {
55
391
    int j;
56
57
391
    if (len > MAX_ERROR_QUOTE)
58
269
        len = MAX_ERROR_QUOTE;
59
44.2k
    for (j = 0; j < len && line[j] != '\n'; j++)
60
43.8k
        ;
61
391
    hts_log_error("%s at line %zd: \"%.*s\"", msg, lno, j, line);
62
391
}
63
64
/* ==== Static methods ==== */
65
66
3.57k
static int sam_hrecs_init_type_order(sam_hrecs_t *hrecs, char *type_list) {
67
3.57k
    if (!hrecs)
68
0
        return -1;
69
70
3.57k
    if (!type_list) {
71
3.57k
        hrecs->type_count = 5;
72
3.57k
        hrecs->type_order = calloc(hrecs->type_count, 3);
73
3.57k
        if (!hrecs->type_order)
74
0
            return -1;
75
3.57k
        memcpy(hrecs->type_order[0], "HD", 2);
76
3.57k
        memcpy(hrecs->type_order[1], "SQ", 2);
77
3.57k
        memcpy(hrecs->type_order[2], "RG", 2);
78
3.57k
        memcpy(hrecs->type_order[3], "PG", 2);
79
3.57k
        memcpy(hrecs->type_order[4], "CO", 2);
80
3.57k
    }
81
82
3.57k
    return 0;
83
3.57k
}
84
85
5.92k
static int sam_hrecs_add_ref_altnames(sam_hrecs_t *hrecs, int nref, const char *list) {
86
5.92k
    const char *token;
87
5.92k
    ks_tokaux_t aux;
88
89
5.92k
    if (!list)
90
5.75k
        return 0;
91
92
364k
    for (token = kstrtok(list, ",", &aux); token; token = kstrtok(NULL, NULL, &aux)) {
93
364k
        if (aux.p == token)
94
351k
            continue;
95
96
13.0k
        char *name = string_ndup(hrecs->str_pool, token, aux.p - token);
97
13.0k
        if (!name)
98
0
            return -1;
99
13.0k
        int r;
100
13.0k
        khint_t k = kh_put(m_s2i, hrecs->ref_hash, name, &r);
101
13.0k
        if (r < 0) return -1;
102
103
13.0k
        if (r > 0)
104
2.70k
            kh_val(hrecs->ref_hash, k) = nref;
105
10.3k
        else if (kh_val(hrecs->ref_hash, k) != nref)
106
419
            hts_log_warning("Duplicate entry AN:\"%s\" in sam header", name);
107
13.0k
    }
108
109
169
    return 0;
110
169
}
111
112
0
static void sam_hrecs_remove_ref_altnames(sam_hrecs_t *hrecs, int expected, const char *list) {
113
0
    const char *token, *sn;
114
0
    ks_tokaux_t aux;
115
0
    kstring_t str = KS_INITIALIZE;
116
117
0
    if (expected < 0 || expected >= hrecs->nref)
118
0
        return;
119
0
    sn = hrecs->ref[expected].name;
120
121
0
    for (token = kstrtok(list, ",", &aux); token; token = kstrtok(NULL, NULL, &aux)) {
122
0
        kputsn(token, aux.p - token, ks_clear(&str));
123
0
        khint_t k = kh_get(m_s2i, hrecs->ref_hash, str.s);
124
0
        if (k != kh_end(hrecs->ref_hash)
125
0
            && kh_val(hrecs->ref_hash, k) == expected
126
0
            && strcmp(sn, str.s) != 0)
127
0
            kh_del(m_s2i, hrecs->ref_hash, k);
128
0
    }
129
130
0
    free(str.s);
131
0
}
132
133
/* Updates the hash tables in the sam_hrecs_t structure.
134
 *
135
 * Returns 0 on success;
136
 *        -1 on failure
137
 */
138
static int sam_hrecs_update_hashes(sam_hrecs_t *hrecs,
139
                                   khint32_t type,
140
5.26M
                                   sam_hrec_type_t *h_type) {
141
    /* Add to reference hash? */
142
5.26M
    if (type == TYPEKEY("SQ")) {
143
5.95k
        sam_hrec_tag_t *tag = h_type->tag;
144
5.95k
        int nref = hrecs->nref;
145
5.95k
        const char *name = NULL;
146
5.95k
        const char *altnames = NULL;
147
5.95k
        hts_pos_t len = -1;
148
5.95k
        int r;
149
5.95k
        khint_t k;
150
151
24.2k
        while (tag) {
152
18.2k
            if (tag->str[0] == 'S' && tag->str[1] == 'N') {
153
7.61k
                assert(tag->len >= 3);
154
7.61k
                name = tag->str+3;
155
10.6k
            } else if (tag->str[0] == 'L' && tag->str[1] == 'N') {
156
7.24k
                assert(tag->len >= 3);
157
7.24k
                len = strtoll(tag->str+3, NULL, 10);
158
7.24k
            } else if (tag->str[0] == 'A' && tag->str[1] == 'N') {
159
331
                assert(tag->len >= 3);
160
331
                altnames = tag->str+3;
161
331
            }
162
18.2k
            tag = tag->next;
163
18.2k
        }
164
165
5.95k
        if (!name) {
166
7
            hts_log_error("Header includes @SQ line with no SN: tag");
167
7
            return -1; // SN should be present, according to spec.
168
7
        }
169
170
5.94k
        if (len == -1) {
171
14
            hts_log_error("Header includes @SQ line \"%s\" with no LN: tag",
172
14
                          name);
173
14
            return -1; // LN should be present, according to spec.
174
14
        }
175
176
        // Seen already?
177
5.93k
        k = kh_get(m_s2i, hrecs->ref_hash, name);
178
5.93k
        if (k < kh_end(hrecs->ref_hash)) {
179
4.51k
            nref = kh_val(hrecs->ref_hash, k);
180
4.51k
            int ref_changed_flag = 0;
181
182
            // Check for hash entry added by sam_hrecs_refs_from_targets_array()
183
4.51k
            if (hrecs->ref[nref].ty == NULL) {
184
                // Attach header line to existing stub entry.
185
4.49k
                hrecs->ref[nref].ty = h_type;
186
                // Check lengths match; correct if not.
187
4.49k
                if (len != hrecs->ref[nref].len) {
188
334
                    char tmp[32];
189
334
                    snprintf(tmp, sizeof(tmp), "%" PRIhts_pos,
190
334
                             hrecs->ref[nref].len);
191
334
                    if (sam_hrecs_update(hrecs, h_type, "LN", tmp, NULL) < 0)
192
0
                        return -1;
193
334
                    ref_changed_flag = 1;
194
334
                }
195
4.49k
                if (sam_hrecs_add_ref_altnames(hrecs, nref, altnames) < 0)
196
0
                    return -1;
197
198
4.49k
                if (ref_changed_flag && (hrecs->refs_changed < 0 || hrecs->refs_changed > nref))
199
130
                    hrecs->refs_changed = nref;
200
4.49k
                return 0;
201
4.49k
            }
202
203
            // Check to see if an existing entry is being updated
204
15
            if (hrecs->ref[nref].ty == h_type) {
205
0
                if (hrecs->ref[nref].len != len) {
206
0
                    hrecs->ref[nref].len = len;
207
0
                    ref_changed_flag = 1;
208
0
                }
209
0
                if (!hrecs->ref[nref].name || strcmp(hrecs->ref[nref].name, name)) {
210
0
                    hrecs->ref[nref].name = name;
211
0
                    ref_changed_flag = 1;
212
0
                }
213
0
                if (sam_hrecs_add_ref_altnames(hrecs, nref, altnames) < 0)
214
0
                    return -1;
215
216
0
                if (ref_changed_flag && (hrecs->refs_changed < 0 || hrecs->refs_changed > nref))
217
0
                    hrecs->refs_changed = nref;
218
0
                return 0;
219
0
            }
220
221
            // If here, the name is a duplicate.
222
            // Check to see if it matches the SN: tag from the earlier record.
223
15
            if (strcmp(hrecs->ref[nref].name, name) == 0) {
224
8
                hts_log_error("Duplicate entry \"%s\" in sam header",
225
8
                                name);
226
8
                return -1;
227
8
            }
228
229
            // Clash with an already-seen altname
230
            // As SN: should be preferred to AN: add this as a new
231
            // record and update the hash entry to point to it.
232
7
            hts_log_warning("Ref name SN:\"%s\" is a duplicate of an existing AN key", name);
233
7
            nref = hrecs->nref;
234
7
        }
235
236
1.43k
        if (nref == hrecs->ref_sz) {
237
184
            size_t new_sz = hrecs->ref_sz >= 4 ? hrecs->ref_sz + (hrecs->ref_sz / 4) : 32;
238
184
            sam_hrec_sq_t *new_ref = realloc(hrecs->ref, sizeof(*hrecs->ref) * new_sz);
239
184
            if (!new_ref)
240
0
                return -1;
241
184
            hrecs->ref = new_ref;
242
184
            hrecs->ref_sz = new_sz;
243
184
        }
244
245
1.43k
        hrecs->ref[nref].name = name;
246
1.43k
        hrecs->ref[nref].len  = len;
247
1.43k
        hrecs->ref[nref].ty = h_type;
248
249
1.43k
        k = kh_put(m_s2i, hrecs->ref_hash, hrecs->ref[nref].name, &r);
250
1.43k
        if (-1 == r) return -1;
251
1.43k
        kh_val(hrecs->ref_hash, k) = nref;
252
253
1.43k
        if (sam_hrecs_add_ref_altnames(hrecs, nref, altnames) < 0)
254
0
            return -1;
255
256
1.43k
        if (hrecs->refs_changed < 0 || hrecs->refs_changed > hrecs->nref)
257
142
            hrecs->refs_changed = hrecs->nref;
258
1.43k
        hrecs->nref++;
259
1.43k
    }
260
261
    /* Add to read-group hash? */
262
5.26M
    if (type == TYPEKEY("RG")) {
263
17.8k
        sam_hrec_tag_t *tag = sam_hrecs_find_key(h_type, "ID", NULL);
264
17.8k
        int nrg = hrecs->nrg, r;
265
17.8k
        khint_t k;
266
267
17.8k
        if (!tag) {
268
3
            hts_log_error("Header includes @RG line with no ID: tag");
269
3
            return -1;  // ID should be present, according to spec.
270
3
        }
271
17.8k
        assert(tag->str && tag->len >= 3);
272
273
        // Seen already?
274
17.8k
        k = kh_get(m_s2i, hrecs->rg_hash, tag->str + 3);
275
17.8k
        if (k < kh_end(hrecs->rg_hash)) {
276
16.3k
            nrg = kh_val(hrecs->rg_hash, k);
277
16.3k
            assert(hrecs->rg[nrg].ty != NULL);
278
16.3k
            if (hrecs->rg[nrg].ty != h_type) {
279
16.3k
                hts_log_warning("Duplicate entry \"%s\" in sam header",
280
16.3k
                                tag->str + 3);
281
16.3k
            } else {
282
0
                hrecs->rg[nrg].name = tag->str + 3;
283
0
                hrecs->rg[nrg].name_len = tag->len - 3;
284
0
            }
285
16.3k
            return 0;
286
16.3k
        }
287
288
1.49k
        if (nrg == hrecs->rg_sz) {
289
671
            size_t new_sz = hrecs->rg_sz >= 4 ? hrecs->rg_sz + hrecs->rg_sz / 4 : 4;
290
671
            sam_hrec_rg_t *new_rg = realloc(hrecs->rg, sizeof(*hrecs->rg) * new_sz);
291
671
            if (!new_rg)
292
0
                return -1;
293
671
            hrecs->rg = new_rg;
294
671
            hrecs->rg_sz = new_sz;
295
671
        }
296
297
1.49k
        hrecs->rg[nrg].name = tag->str + 3;
298
1.49k
        hrecs->rg[nrg].name_len = tag->len - 3;
299
1.49k
        hrecs->rg[nrg].ty   = h_type;
300
1.49k
        hrecs->rg[nrg].id   = nrg;
301
302
1.49k
        k = kh_put(m_s2i, hrecs->rg_hash, hrecs->rg[nrg].name, &r);
303
1.49k
        if (-1 == r) return -1;
304
1.49k
        kh_val(hrecs->rg_hash, k) = nrg;
305
306
1.49k
        hrecs->nrg++;
307
1.49k
    }
308
309
    /* Add to program hash? */
310
5.24M
    if (type == TYPEKEY("PG")) {
311
5.21M
        sam_hrec_tag_t *tag;
312
5.21M
        sam_hrec_pg_t *new_pg;
313
5.21M
        int npg = hrecs->npg;
314
315
5.21M
        if (npg == hrecs->pg_sz) {
316
3.27k
            size_t new_sz = hrecs->pg_sz >= 4 ? hrecs->pg_sz + hrecs->pg_sz / 4 : 4;
317
3.27k
            new_pg = realloc(hrecs->pg, sizeof(*hrecs->pg) * new_sz);
318
3.27k
            if (!new_pg)
319
0
                return -1;
320
3.27k
            hrecs->pg = new_pg;
321
3.27k
            hrecs->pg_sz = new_sz;
322
3.27k
        }
323
324
5.21M
        tag = h_type->tag;
325
5.21M
        hrecs->pg[npg].name = NULL;
326
5.21M
        hrecs->pg[npg].name_len = 0;
327
5.21M
        hrecs->pg[npg].ty  = h_type;
328
5.21M
        hrecs->pg[npg].id   = npg;
329
5.21M
        hrecs->pg[npg].prev_id = -1;
330
331
10.5M
        while (tag) {
332
5.34M
            if (tag->str[0] == 'I' && tag->str[1] == 'D') {
333
                /* Avoid duplicate ID tags coming from other applications */
334
5.26M
                if (!hrecs->pg[npg].name) {
335
5.21M
                    assert(tag->len >= 3);
336
5.21M
                    hrecs->pg[npg].name = tag->str + 3;
337
5.21M
                    hrecs->pg[npg].name_len = tag->len - 3;
338
5.21M
                } else {
339
48.2k
                    hts_log_warning("PG line with multiple ID tags. The first encountered was preferred - ID:%s", hrecs->pg[npg].name);
340
48.2k
                }
341
5.26M
            } else if (tag->str[0] == 'P' && tag->str[1] == 'P') {
342
                // Resolve later if needed
343
46.6k
                khint_t k;
344
46.6k
                k = kh_get(m_s2i, hrecs->pg_hash, tag->str+3);
345
346
46.6k
                if (k != kh_end(hrecs->pg_hash)) {
347
15.8k
                    int p_id = kh_val(hrecs->pg_hash, k);
348
15.8k
                    hrecs->pg[npg].prev_id = hrecs->pg[p_id].id;
349
350
                    /* Unmark previous entry as a PG termination */
351
15.8k
                    if (hrecs->npg_end > 0 &&
352
15.8k
                        hrecs->pg_end[hrecs->npg_end-1] == p_id) {
353
4.37k
                        hrecs->npg_end--;
354
11.4k
                    } else {
355
11.4k
                        int i;
356
24.6M
                        for (i = 0; i < hrecs->npg_end; i++) {
357
24.6M
                            if (hrecs->pg_end[i] == p_id) {
358
5.18k
                                memmove(&hrecs->pg_end[i], &hrecs->pg_end[i+1],
359
5.18k
                                        (hrecs->npg_end-i-1)*sizeof(*hrecs->pg_end));
360
5.18k
                                hrecs->npg_end--;
361
5.18k
                            }
362
24.6M
                        }
363
11.4k
                    }
364
30.8k
                } else {
365
30.8k
                    hrecs->pg[npg].prev_id = -1;
366
30.8k
                }
367
46.6k
            }
368
5.34M
            tag = tag->next;
369
5.34M
        }
370
371
5.21M
        if (hrecs->pg[npg].name) {
372
5.21M
            khint_t k;
373
5.21M
            int r;
374
5.21M
            k = kh_put(m_s2i, hrecs->pg_hash, hrecs->pg[npg].name, &r);
375
5.21M
            if (-1 == r) return -1;
376
5.21M
            kh_val(hrecs->pg_hash, k) = npg;
377
5.21M
        } else {
378
45
            return -1; // ID should be present, according to spec.
379
45
        }
380
381
        /* Add to npg_end[] array. Remove later if we find a PP line */
382
5.21M
        if (hrecs->npg_end >= hrecs->npg_end_alloc) {
383
1.17k
            int *new_pg_end;
384
1.17k
            int  new_alloc = hrecs->npg_end_alloc ? hrecs->npg_end_alloc*2 : 4;
385
386
1.17k
            new_pg_end = realloc(hrecs->pg_end, new_alloc * sizeof(int));
387
1.17k
            if (!new_pg_end)
388
0
                return -1;
389
1.17k
            hrecs->npg_end_alloc = new_alloc;
390
1.17k
            hrecs->pg_end = new_pg_end;
391
1.17k
        }
392
5.21M
        hrecs->pg_end[hrecs->npg_end++] = npg;
393
394
5.21M
        hrecs->npg++;
395
5.21M
    }
396
397
5.24M
    return 0;
398
5.24M
}
399
400
0
static int sam_hrecs_remove_hash_entry(sam_hrecs_t *hrecs, khint32_t type, sam_hrec_type_t *h_type) {
401
0
    if (!hrecs || !h_type)
402
0
        return -1;
403
404
0
    sam_hrec_tag_t *tag;
405
0
    const char *key = NULL;
406
0
    khint_t k;
407
408
    /* Remove name and any alternative names from reference hash */
409
0
    if (type == TYPEKEY("SQ")) {
410
0
        const char *altnames = NULL;
411
412
0
        tag = h_type->tag;
413
414
0
        while (tag) {
415
0
            if (tag->str[0] == 'S' && tag->str[1] == 'N') {
416
0
                assert(tag->len >= 3);
417
0
                key = tag->str + 3;
418
0
            } else if (tag->str[0] == 'A' && tag->str[1] == 'N') {
419
0
                assert(tag->len >= 3);
420
0
                altnames = tag->str + 3;
421
0
            }
422
0
            tag = tag->next;
423
0
        }
424
425
0
        if (key) {
426
0
            k = kh_get(m_s2i, hrecs->ref_hash, key);
427
0
            if (k != kh_end(hrecs->ref_hash)) {
428
0
                int idx = kh_val(hrecs->ref_hash, k);
429
0
                if (idx + 1 < hrecs->nref)
430
0
                    memmove(&hrecs->ref[idx], &hrecs->ref[idx+1],
431
0
                            sizeof(sam_hrec_sq_t)*(hrecs->nref - idx - 1));
432
0
                if (altnames)
433
0
                    sam_hrecs_remove_ref_altnames(hrecs, idx, altnames);
434
0
                kh_del(m_s2i, hrecs->ref_hash, k);
435
0
                hrecs->nref--;
436
0
                if (hrecs->refs_changed < 0 || hrecs->refs_changed > idx)
437
0
                    hrecs->refs_changed = idx;
438
0
                for (k = 0; k < kh_end(hrecs->ref_hash); k++) {
439
0
                    if (kh_exist(hrecs->ref_hash, k)
440
0
                        && kh_value(hrecs->ref_hash, k) > idx) {
441
0
                        kh_value(hrecs->ref_hash, k)--;
442
0
                    }
443
0
                }
444
0
            }
445
0
        }
446
0
    }
447
448
    /* Remove from read-group hash */
449
0
    if (type == TYPEKEY("RG")) {
450
0
        tag = h_type->tag;
451
452
0
        while (tag) {
453
0
            if (tag->str[0] == 'I' && tag->str[1] == 'D') {
454
0
                assert(tag->len >= 3);
455
0
                key = tag->str + 3;
456
0
                k = kh_get(m_s2i, hrecs->rg_hash, key);
457
0
                if (k != kh_end(hrecs->rg_hash)) {
458
0
                    int idx = kh_val(hrecs->rg_hash, k);
459
0
                    if (idx + 1 < hrecs->nrg)
460
0
                        memmove(&hrecs->rg[idx], &hrecs->rg[idx+1], sizeof(sam_hrec_rg_t)*(hrecs->nrg - idx - 1));
461
0
                    kh_del(m_s2i, hrecs->rg_hash, k);
462
0
                    hrecs->nrg--;
463
0
                    for (k = 0; k < kh_end(hrecs->rg_hash); k++) {
464
0
                        if (kh_exist(hrecs->rg_hash, k)
465
0
                            && kh_value(hrecs->rg_hash, k) > idx) {
466
0
                            kh_value(hrecs->rg_hash, k)--;
467
0
                        }
468
0
                    }
469
0
                }
470
0
                break;
471
0
            }
472
0
            tag = tag->next;
473
0
        }
474
0
    }
475
476
0
    return 0;
477
0
}
478
479
/** Add a header record to the global line ordering
480
 *
481
 * If @p after is not NULL, the new record will be inserted after this one,
482
 * otherwise it will go at the end.
483
 *
484
 * An exception is an HD record, which will always be put first unless
485
 * one is already present.
486
 */
487
static void sam_hrecs_global_list_add(sam_hrecs_t *hrecs,
488
                                      sam_hrec_type_t *h_type,
489
5.26M
                                      sam_hrec_type_t *after) {
490
5.26M
    const khint32_t hd_type = TYPEKEY("HD");
491
5.26M
    int update_first_line = 0;
492
493
    // First line seen
494
5.26M
    if (!hrecs->first_line) {
495
1.85k
        hrecs->first_line = h_type->global_next = h_type->global_prev = h_type;
496
1.85k
        return;
497
1.85k
    }
498
499
    // @HD goes at the top (unless there's one already)
500
5.26M
    if (h_type->type == hd_type && hrecs->first_line->type != hd_type) {
501
53
        after = hrecs->first_line->global_prev;
502
53
        update_first_line = 1;
503
53
    }
504
505
    // If no instructions given, put it at the end
506
5.26M
    if (!after)
507
5.26M
        after = hrecs->first_line->global_prev;
508
509
5.26M
    h_type->global_prev = after;
510
5.26M
    h_type->global_next = after->global_next;
511
5.26M
    h_type->global_prev->global_next = h_type;
512
5.26M
    h_type->global_next->global_prev = h_type;
513
514
5.26M
    if (update_first_line)
515
53
        hrecs->first_line = h_type;
516
5.26M
}
517
518
/*! Add header record with a va_list interface.
519
 *
520
 * Adds a single record to a SAM header.
521
 *
522
 * This takes a header record type, a va_list argument and one or more
523
 * key,value pairs, ending with the NULL key.
524
 *
525
 * Eg. sam_hrecs_vadd(h, "SQ", args, "ID", "foo", "LN", "100", NULL).
526
 *
527
 * The purpose of the additional va_list parameter is to permit other
528
 * varargs functions to call this while including their own additional
529
 * parameters; an example is in sam_hdr_add_pg().
530
 *
531
 * Note: this function invokes va_arg at least once, making the value
532
 * of ap indeterminate after the return. The caller should call
533
 * va_start/va_end before/after calling this function or use va_copy.
534
 *
535
 * @return
536
 * Returns >= 0 on success;
537
 *        -1 on failure
538
 */
539
2.53k
static int sam_hrecs_vadd(sam_hrecs_t *hrecs, const char *type, va_list ap, ...) {
540
2.53k
    va_list args;
541
2.53k
    sam_hrec_type_t *h_type;
542
2.53k
    sam_hrec_tag_t *h_tag, *last=NULL;
543
2.53k
    int new;
544
2.53k
    khint32_t type_i = TYPEKEY(type), k;
545
546
2.53k
    if (!strncmp(type, "HD", 2) && (h_type = sam_hrecs_find_type_id(hrecs, "HD", NULL, NULL)))
547
0
        return sam_hrecs_vupdate(hrecs, h_type, ap);
548
549
2.53k
    if (!(h_type = pool_alloc(hrecs->type_pool)))
550
0
        return -1;
551
2.53k
    k = kh_put(sam_hrecs_t, hrecs->h, type_i, &new);
552
2.53k
    if (new < 0)
553
0
        return -1;
554
555
2.53k
    h_type->type = type_i;
556
557
    // Form the ring, either with self or other lines of this type
558
2.53k
    if (!new) {
559
2.27k
        sam_hrec_type_t *t = kh_val(hrecs->h, k), *p;
560
2.27k
        p = t->prev;
561
562
2.27k
        assert(p->next == t);
563
2.27k
        p->next = h_type;
564
2.27k
        h_type->prev = p;
565
566
2.27k
        t->prev = h_type;
567
2.27k
        h_type->next = t;
568
2.27k
    } else {
569
261
        kh_val(hrecs->h, k) = h_type;
570
261
        h_type->prev = h_type->next = h_type;
571
261
    }
572
2.53k
    h_type->tag = NULL;
573
574
    // Add to global line ordering after any existing line of the same type,
575
    // or at the end if no line of this type exists yet.
576
2.53k
    sam_hrecs_global_list_add(hrecs, h_type, !new ? h_type->prev : NULL);
577
578
    // Check linked-list invariants
579
2.53k
    assert(h_type->prev->next == h_type);
580
2.53k
    assert(h_type->next->prev == h_type);
581
2.53k
    assert(h_type->global_prev->global_next == h_type);
582
2.53k
    assert(h_type->global_next->global_prev == h_type);
583
584
    // Any ... varargs
585
2.53k
    va_start(args, ap);
586
2.53k
    for (;;) {
587
2.53k
        char *key, *val = NULL, *str;
588
589
2.53k
        if (!(key = (char *)va_arg(args, char *)))
590
2.53k
            break;
591
0
        if (strncmp(type, "CO", 2) && !(val = (char *)va_arg(args, char *)))
592
0
            break;
593
0
        if (*val == '\0')
594
0
            continue;
595
596
0
        if (!(h_tag = pool_alloc(hrecs->tag_pool)))
597
0
            return -1;
598
599
0
        if (strncmp(type, "CO", 2)) {
600
0
            h_tag->len = 3 + strlen(val);
601
0
            str = string_alloc(hrecs->str_pool, h_tag->len+1);
602
0
            if (!str || snprintf(str, h_tag->len+1, "%2.2s:%s", key, val) < 0)
603
0
                return -1;
604
0
            h_tag->str = str;
605
0
        } else {
606
0
            h_tag->len = strlen(key);
607
0
            h_tag->str = string_ndup(hrecs->str_pool, key, h_tag->len);
608
0
            if (!h_tag->str)
609
0
                return -1;
610
0
        }
611
612
0
        h_tag->next = NULL;
613
0
        if (last)
614
0
            last->next = h_tag;
615
0
        else
616
0
            h_type->tag = h_tag;
617
618
0
        last = h_tag;
619
0
    }
620
2.53k
    va_end(args);
621
622
    // Plus the specified va_list params
623
7.59k
    for (;;) {
624
7.59k
        char *key, *val = NULL, *str;
625
626
7.59k
        if (!(key = (char *)va_arg(ap, char *)))
627
2.53k
            break;
628
5.06k
        if (strncmp(type, "CO", 2) && !(val = (char *)va_arg(ap, char *)))
629
0
            break;
630
631
5.06k
        if (!(h_tag = pool_alloc(hrecs->tag_pool)))
632
0
            return -1;
633
634
5.06k
        if (strncmp(type, "CO", 2)) {
635
5.06k
            h_tag->len = 3 + strlen(val);
636
5.06k
            str = string_alloc(hrecs->str_pool, h_tag->len+1);
637
5.06k
            if (!str || snprintf(str, h_tag->len+1, "%2.2s:%s", key, val) < 0)
638
0
                return -1;
639
5.06k
            h_tag->str = str;
640
5.06k
        } else {
641
0
            h_tag->len = strlen(key);
642
0
            h_tag->str = string_ndup(hrecs->str_pool, key, h_tag->len);
643
0
            if (!h_tag->str)
644
0
                return -1;
645
0
        }
646
647
5.06k
        h_tag->next = NULL;
648
5.06k
        if (last)
649
2.53k
            last->next = h_tag;
650
2.53k
        else
651
2.53k
            h_type->tag = h_tag;
652
653
5.06k
        last = h_tag;
654
5.06k
    }
655
656
2.53k
    if (-1 == sam_hrecs_update_hashes(hrecs, TYPEKEY(type), h_type))
657
0
        return -1;
658
659
2.53k
    if (!strncmp(type, "PG", 2))
660
0
        hrecs->pgs_changed = 1;
661
662
2.53k
    hrecs->dirty = 1;
663
664
2.53k
    return 0;
665
2.53k
}
666
667
// As sam_hrecs_vadd(), but without the extra va_list parameter
668
2.53k
static int sam_hrecs_add(sam_hrecs_t *hrecs, const char *type, ...) {
669
2.53k
    va_list args;
670
2.53k
    int res;
671
2.53k
    va_start(args, type);
672
2.53k
    res = sam_hrecs_vadd(hrecs, type, args, NULL);
673
2.53k
    va_end(args);
674
2.53k
    return res;
675
2.53k
}
676
677
/*
678
 * Function for deallocating a list of tags
679
 */
680
681
0
static void sam_hrecs_free_tags(sam_hrecs_t *hrecs, sam_hrec_tag_t *tag) {
682
0
    if (!hrecs || !tag)
683
0
        return;
684
0
    if (tag->next)
685
0
        sam_hrecs_free_tags(hrecs, tag->next);
686
687
0
    pool_free(hrecs->tag_pool, tag);
688
0
}
689
690
0
static int sam_hrecs_remove_line(sam_hrecs_t *hrecs, const char *type_name, sam_hrec_type_t *type_found) {
691
0
    if (!hrecs || !type_name || !type_found)
692
0
        return -1;
693
694
0
    khint32_t itype = TYPEKEY(type_name);
695
0
    khint_t k = kh_get(sam_hrecs_t, hrecs->h, itype);
696
0
    if (k == kh_end(hrecs->h))
697
0
        return -1;
698
699
    // Remove from global list (remembering it could be the only line)
700
0
    if (hrecs->first_line == type_found) {
701
0
        hrecs->first_line = (type_found->global_next != type_found
702
0
                             ? type_found->global_next : NULL);
703
0
    }
704
0
    type_found->global_next->global_prev = type_found->global_prev;
705
0
    type_found->global_prev->global_next = type_found->global_next;
706
707
    /* single element in the list */
708
0
    if (type_found->prev == type_found || type_found->next == type_found) {
709
0
        kh_del(sam_hrecs_t, hrecs->h, k);
710
0
    } else {
711
0
        type_found->prev->next = type_found->next;
712
0
        type_found->next->prev = type_found->prev;
713
0
        if (kh_val(hrecs->h, k) == type_found) { //first element
714
0
            kh_val(hrecs->h, k) = type_found->next;
715
0
        }
716
0
    }
717
718
0
    if (!strncmp(type_name, "SQ", 2) || !strncmp(type_name, "RG", 2))
719
0
        sam_hrecs_remove_hash_entry(hrecs, itype, type_found);
720
721
0
    sam_hrecs_free_tags(hrecs, type_found->tag);
722
0
    pool_free(hrecs->type_pool, type_found);
723
724
0
    hrecs->dirty = 1;
725
726
0
    return 0;
727
0
}
728
729
// Paste together a line from the parsed data structures
730
3.05M
static int build_header_line(const sam_hrec_type_t *ty, kstring_t *ks) {
731
3.05M
    sam_hrec_tag_t *tag;
732
3.05M
    int r = 0;
733
3.05M
    char c[2]= { ty->type >> 8, ty->type & 0xff };
734
735
3.05M
    r |= (kputc_('@', ks) == EOF);
736
3.05M
    r |= (kputsn(c, 2, ks) == EOF);
737
6.18M
    for (tag = ty->tag; tag; tag = tag->next) {
738
3.13M
        r |= (kputc_('\t', ks) == EOF);
739
3.13M
        r |= (kputsn(tag->str, tag->len, ks) == EOF);
740
3.13M
    }
741
742
3.05M
    return r;
743
3.05M
}
744
745
1.38k
static int sam_hrecs_rebuild_lines(const sam_hrecs_t *hrecs, kstring_t *ks) {
746
1.38k
    const sam_hrec_type_t *t1, *t2;
747
748
1.38k
    if (!hrecs->first_line)
749
0
        return kputsn("", 0, ks) >= 0 ? 0 : -1;
750
751
1.38k
    t1 = t2 = hrecs->first_line;
752
3.05M
    do {
753
3.05M
        if (build_header_line(t1, ks) != 0)
754
0
            return -1;
755
3.05M
        if (kputc('\n', ks) < 0)
756
0
            return -1;
757
758
3.05M
        t1 = t1->global_next;
759
3.05M
    } while (t1 != t2);
760
761
1.38k
    return 0;
762
1.38k
}
763
764
1.93k
static int sam_hrecs_parse_lines(sam_hrecs_t *hrecs, const char *hdr, size_t len) {
765
1.93k
    size_t i, lno;
766
767
1.93k
    if (!hrecs || len > SSIZE_MAX)
768
0
        return -1;
769
770
1.93k
    if (!len)
771
0
        len = strlen(hdr);
772
773
1.93k
    if (len < 3) {
774
62
        if (len == 0 || *hdr == '\0') return 0;
775
0
        sam_hrecs_error("Header line too short", hdr, len, 1);
776
0
        return -1;
777
62
    }
778
779
5.26M
    for (i = 0, lno = 1; i < len - 3 && hdr[i] != '\0'; i++, lno++) {
780
5.26M
        khint32_t type;
781
5.26M
        khint_t k;
782
783
5.26M
        int l_start = i, new;
784
5.26M
        sam_hrec_type_t *h_type;
785
5.26M
        sam_hrec_tag_t *h_tag, *last;
786
787
5.26M
        if (hdr[i] != '@') {
788
78
            sam_hrecs_error("Header line does not start with '@'",
789
78
                          &hdr[l_start], len - l_start, lno);
790
78
            return -1;
791
78
        }
792
793
5.26M
        if (!isalpha_c(hdr[i+1]) || !isalpha_c(hdr[i+2])) {
794
20
            sam_hrecs_error("Header line does not have a two character key",
795
20
                          &hdr[l_start], len - l_start, lno);
796
20
            return -1;
797
20
        }
798
5.26M
        type = TYPEKEY(&hdr[i+1]);
799
800
5.26M
        i += 3;
801
5.26M
        if (i == len || hdr[i] == '\n')
802
1
            continue;
803
804
        // Add the header line type
805
5.26M
        if (!(h_type = pool_alloc(hrecs->type_pool)))
806
0
            return -1;
807
5.26M
        k = kh_put(sam_hrecs_t, hrecs->h, type, &new);
808
5.26M
        if (new < 0)
809
0
            return -1;
810
811
5.26M
        h_type->type = type;
812
813
        // Add to end of global list
814
5.26M
        sam_hrecs_global_list_add(hrecs, h_type, NULL);
815
816
        // Form the ring, either with self or other lines of this type
817
5.26M
        if (!new) {
818
5.26M
            sam_hrec_type_t *t = kh_val(hrecs->h, k), *p;
819
5.26M
            p = t->prev;
820
821
5.26M
            assert(p->next == t);
822
5.26M
            p->next = h_type;
823
5.26M
            h_type->prev = p;
824
825
5.26M
            t->prev = h_type;
826
5.26M
            h_type->next = t;
827
5.26M
        } else {
828
2.63k
            kh_val(hrecs->h, k) = h_type;
829
2.63k
            h_type->prev = h_type->next = h_type;
830
2.63k
        }
831
832
        // Parse the tags on this line
833
5.26M
        last = NULL;
834
5.26M
        if (type == TYPEKEY("CO")) {
835
1.58k
            size_t j;
836
837
1.58k
            if (i == len || hdr[i] != '\t') {
838
6
                sam_hrecs_error("Missing tab",
839
6
                              &hdr[l_start], len - l_start, lno);
840
6
                return -1;
841
6
            }
842
843
1.11M
            for (j = ++i; j < len && hdr[j] != '\0' && hdr[j] != '\n'; j++)
844
1.11M
                ;
845
846
1.57k
            if (!(h_type->tag = h_tag = pool_alloc(hrecs->tag_pool)))
847
0
                return -1;
848
1.57k
            h_tag->str = string_ndup(hrecs->str_pool, &hdr[i], j-i);
849
1.57k
            h_tag->len = j-i;
850
1.57k
            h_tag->next = NULL;
851
1.57k
            if (!h_tag->str)
852
0
                return -1;
853
854
1.57k
            i = j;
855
856
5.26M
        } else {
857
5.41M
            do {
858
5.41M
                size_t j;
859
860
5.41M
                if (i == len || hdr[i] != '\t') {
861
16
                    sam_hrecs_error("Missing tab",
862
16
                                  &hdr[l_start], len - l_start, lno);
863
16
                    return -1;
864
16
                }
865
866
200M
                for (j = ++i; j < len && hdr[j] != '\0' && hdr[j] != '\n' && hdr[j] != '\t'; j++)
867
195M
                    ;
868
869
5.41M
                if (j - i < 3 || hdr[i + 2] != ':') {
870
271
                    sam_hrecs_error("Malformed key:value pair",
871
271
                                   &hdr[l_start], len - l_start, lno);
872
271
                    return -1;
873
271
                }
874
875
5.41M
                if (!(h_tag = pool_alloc(hrecs->tag_pool)))
876
0
                    return -1;
877
5.41M
                h_tag->str = string_ndup(hrecs->str_pool, &hdr[i], j-i);
878
5.41M
                h_tag->len = j-i;
879
5.41M
                h_tag->next = NULL;
880
5.41M
                if (!h_tag->str)
881
0
                    return -1;
882
883
5.41M
                if (last)
884
149k
                    last->next = h_tag;
885
5.26M
                else
886
5.26M
                    h_type->tag = h_tag;
887
888
5.41M
                last = h_tag;
889
5.41M
                i = j;
890
5.41M
            } while (i < len && hdr[i] != '\0' && hdr[i] != '\n');
891
5.26M
        }
892
893
        /* Update RG/SQ hashes */
894
5.26M
        if (-1 == sam_hrecs_update_hashes(hrecs, type, h_type))
895
77
            return -1;
896
5.26M
    }
897
898
1.40k
    return 0;
899
1.87k
}
900
901
/*! Update sam_hdr_t target_name and target_len arrays
902
 *
903
 *  @return 0 on success; -1 on failure
904
 */
905
int sam_hdr_update_target_arrays(sam_hdr_t *bh, const sam_hrecs_t *hrecs,
906
1.01k
                                 int refs_changed) {
907
1.01k
    if (!bh || !hrecs)
908
0
        return -1;
909
910
1.01k
    if (refs_changed < 0)
911
0
        return 0;
912
913
    // Grow arrays if necessary
914
1.01k
    if (bh->n_targets < hrecs->nref) {
915
233
        char **new_names = realloc(bh->target_name,
916
233
                                   hrecs->nref * sizeof(*new_names));
917
233
        if (!new_names)
918
0
            return -1;
919
233
        bh->target_name = new_names;
920
233
        uint32_t *new_lens = realloc(bh->target_len,
921
233
                                     hrecs->nref * sizeof(*new_lens));
922
233
        if (!new_lens)
923
0
            return -1;
924
233
        bh->target_len = new_lens;
925
233
    }
926
927
    // Update names and lengths where changed
928
    // hrecs->refs_changed is the first ref that has been updated, so ones
929
    // before that can be skipped.
930
1.01k
    int i;
931
1.01k
    khint_t k;
932
1.01k
    khash_t(s2i) *long_refs = (khash_t(s2i) *) bh->sdict;
933
5.56k
    for (i = refs_changed; i < hrecs->nref; i++) {
934
4.55k
        if (i >= bh->n_targets
935
4.55k
            || strcmp(bh->target_name[i], hrecs->ref[i].name) != 0) {
936
2.62k
            if (i < bh->n_targets)
937
0
                free(bh->target_name[i]);
938
2.62k
            bh->target_name[i] = strdup(hrecs->ref[i].name);
939
2.62k
            if (!bh->target_name[i])
940
0
                return -1;
941
2.62k
        }
942
4.55k
        if (hrecs->ref[i].len < UINT32_MAX) {
943
3.37k
            bh->target_len[i] = hrecs->ref[i].len;
944
945
3.37k
            if (!long_refs)
946
2.07k
                continue;
947
948
            // Check if we have an old length, if so remove it.
949
1.29k
            k = kh_get(s2i, long_refs, bh->target_name[i]);
950
1.29k
            if (k < kh_end(long_refs))
951
1.29k
                kh_del(s2i, long_refs, k);
952
1.29k
        } else {
953
1.18k
            bh->target_len[i] = UINT32_MAX;
954
1.18k
            if (bh->hrecs != hrecs) {
955
                // Called from sam_hdr_dup; need to add sdict entries
956
290
                if (!long_refs) {
957
61
                    if (!(bh->sdict = long_refs = kh_init(s2i)))
958
0
                        return -1;
959
61
                }
960
961
                // Add / update length
962
290
                int absent;
963
290
                k = kh_put(s2i, long_refs, bh->target_name[i], &absent);
964
290
                if (absent < 0)
965
0
                    return -1;
966
290
                kh_val(long_refs, k) = hrecs->ref[i].len;
967
290
            }
968
1.18k
        }
969
4.55k
    }
970
971
    // Free up any names that have been removed
972
1.01k
    for (; i < bh->n_targets; i++) {
973
0
        if (long_refs) {
974
0
            k = kh_get(s2i, long_refs, bh->target_name[i]);
975
0
            if (k < kh_end(long_refs))
976
0
                kh_del(s2i, long_refs, k);
977
0
        }
978
0
        free(bh->target_name[i]);
979
0
    }
980
981
1.01k
    bh->n_targets = hrecs->nref;
982
1.01k
    return 0;
983
1.01k
}
984
985
218
static int rebuild_target_arrays(sam_hdr_t *bh) {
986
218
    if (!bh || !bh->hrecs)
987
0
        return -1;
988
989
218
    sam_hrecs_t *hrecs = bh->hrecs;
990
218
    if (hrecs->refs_changed < 0)
991
0
        return 0;
992
993
218
    if (sam_hdr_update_target_arrays(bh, hrecs, hrecs->refs_changed) != 0)
994
0
        return -1;
995
996
218
    hrecs->refs_changed = -1;
997
218
    return 0;
998
218
}
999
1000
/// Populate hrecs refs array from header target_name, target_len arrays
1001
/**
1002
 * @return 0 on success; -1 on failure
1003
 *
1004
 * Pre-fills the refs hash from the target arrays.  For BAM files this
1005
 * will ensure that they are in the correct order as the target arrays
1006
 * are the canonical source for converting target ids to names and lengths.
1007
 *
1008
 * The added entries do not link to a header line. sam_hrecs_update_hashes()
1009
 * will add the links later for lines found in the text header.
1010
 *
1011
 * This should be called before the text header is parsed.
1012
 */
1013
static int sam_hrecs_refs_from_targets_array(sam_hrecs_t *hrecs,
1014
1.08k
                                             const sam_hdr_t *bh) {
1015
1.08k
    int32_t tid = 0;
1016
1017
1.08k
    if (!hrecs || !bh)
1018
0
        return -1;
1019
1020
    // This should always be called before parsing the text header
1021
    // so the ref array should start off empty, and we don't have to try
1022
    // to reconcile any existing data.
1023
1.08k
    if (hrecs->nref > 0) {
1024
0
        hts_log_error("Called with non-empty ref array");
1025
0
        return -1;
1026
0
    }
1027
1028
1.08k
    if (hrecs->ref_sz < bh->n_targets) {
1029
1.08k
        sam_hrec_sq_t *new_ref = realloc(hrecs->ref,
1030
1.08k
                                         bh->n_targets * sizeof(*new_ref));
1031
1.08k
        if (!new_ref)
1032
0
            return -1;
1033
1034
1.08k
        hrecs->ref = new_ref;
1035
1.08k
        hrecs->ref_sz = bh->n_targets;
1036
1.08k
    }
1037
1038
9.33k
    for (tid = 0; tid < bh->n_targets; tid++) {
1039
8.25k
        khint_t k;
1040
8.25k
        int r;
1041
8.25k
        hrecs->ref[tid].name = string_dup(hrecs->str_pool, bh->target_name[tid]);
1042
8.25k
        if (!hrecs->ref[tid].name) goto fail;
1043
8.25k
        if (bh->target_len[tid] < UINT32_MAX || !bh->sdict) {
1044
5.15k
            hrecs->ref[tid].len  = bh->target_len[tid];
1045
5.15k
        } else {
1046
3.09k
            khash_t(s2i) *long_refs = (khash_t(s2i) *) bh->sdict;
1047
3.09k
            k = kh_get(s2i, long_refs, hrecs->ref[tid].name);
1048
3.09k
            if (k < kh_end(long_refs)) {
1049
3.07k
                hrecs->ref[tid].len = kh_val(long_refs, k);
1050
3.07k
            } else {
1051
21
                hrecs->ref[tid].len = UINT32_MAX;
1052
21
            }
1053
3.09k
        }
1054
8.25k
        hrecs->ref[tid].ty   = NULL;
1055
8.25k
        k = kh_put(m_s2i, hrecs->ref_hash, hrecs->ref[tid].name, &r);
1056
8.25k
        if (r < 0) goto fail;
1057
8.25k
        if (r == 0) {
1058
0
            hts_log_error("Duplicate entry \"%s\" in target list",
1059
0
                            hrecs->ref[tid].name);
1060
0
            return -1;
1061
8.25k
        } else {
1062
8.25k
            kh_val(hrecs->ref_hash, k) = tid;
1063
8.25k
        }
1064
8.25k
    }
1065
1.08k
    hrecs->nref = bh->n_targets;
1066
1.08k
    return 0;
1067
1068
0
 fail: {
1069
0
        int32_t i;
1070
0
        hts_log_error("%s", strerror(errno));
1071
0
        for (i = 0; i < tid; i++) {
1072
0
            khint_t k;
1073
0
            if (!hrecs->ref[i].name) continue;
1074
0
            k = kh_get(m_s2i, hrecs->ref_hash, hrecs->ref[tid].name);
1075
0
            if (k < kh_end(hrecs->ref_hash)) kh_del(m_s2i, hrecs->ref_hash, k);
1076
0
        }
1077
0
        hrecs->nref = 0;
1078
0
        return -1;
1079
1.08k
    }
1080
1.08k
}
1081
1082
/*
1083
 * Add SQ header records for any references in the hrecs->ref array that
1084
 * were added by sam_hrecs_refs_from_targets_array() but have not
1085
 * been linked to an @SQ line by sam_hrecs_update_hashes() yet.
1086
 *
1087
 * This may be needed either because:
1088
 *
1089
 *   - A bam file was read that had entries in its refs list with no
1090
 *     corresponding @SQ line.
1091
 *
1092
 *   - A program constructed a sam_hdr_t which has target_name and target_len
1093
 *     array entries with no corresponding @SQ line in text.
1094
 */
1095
3.10k
static int add_stub_ref_sq_lines(sam_hrecs_t *hrecs) {
1096
3.10k
    int tid;
1097
3.10k
    char len[32];
1098
1099
7.31k
    for (tid = 0; tid < hrecs->nref; tid++) {
1100
4.20k
        if (hrecs->ref[tid].ty == NULL) {
1101
2.53k
            snprintf(len, sizeof(len), "%"PRIhts_pos, hrecs->ref[tid].len);
1102
2.53k
            if (sam_hrecs_add(hrecs, "SQ",
1103
2.53k
                              "SN", hrecs->ref[tid].name,
1104
2.53k
                              "LN", len, NULL) != 0)
1105
0
                return -1;
1106
1107
            // Check that the stub has actually been filled
1108
2.53k
            if(hrecs->ref[tid].ty == NULL) {
1109
0
                hts_log_error("Reference stub with tid=%d, name=\"%s\", len=%"PRIhts_pos" could not be filled",
1110
0
                        tid, hrecs->ref[tid].name, hrecs->ref[tid].len);
1111
0
                return -1;
1112
0
            }
1113
2.53k
        }
1114
4.20k
    }
1115
3.10k
    return 0;
1116
3.10k
}
1117
1118
3.57k
int sam_hdr_fill_hrecs(sam_hdr_t *bh) {
1119
3.57k
    sam_hrecs_t *hrecs = sam_hrecs_new();
1120
1121
3.57k
    if (!hrecs)
1122
0
        return -1;
1123
1124
3.57k
    if (bh->target_name && bh->target_len && bh->n_targets > 0) {
1125
1.08k
        if (sam_hrecs_refs_from_targets_array(hrecs, bh) != 0) {
1126
0
            sam_hrecs_free(hrecs);
1127
0
            return -1;
1128
0
        }
1129
1.08k
    }
1130
1131
    // Parse existing header text
1132
3.57k
    if (bh->text && bh->l_text > 0) {
1133
1.73k
        if (sam_hrecs_parse_lines(hrecs, bh->text, bh->l_text) != 0) {
1134
468
            sam_hrecs_free(hrecs);
1135
468
            return -1;
1136
468
        }
1137
1.73k
    }
1138
1139
3.10k
    if (add_stub_ref_sq_lines(hrecs) < 0) {
1140
0
        sam_hrecs_free(hrecs);
1141
0
        return -1;
1142
0
    }
1143
1144
3.10k
    bh->hrecs = hrecs;
1145
1146
3.10k
    if (hrecs->refs_changed >= 0 && rebuild_target_arrays(bh) != 0)
1147
0
        return -1;
1148
1149
3.10k
    return 0;
1150
3.10k
}
1151
1152
/** Remove outdated header text
1153
1154
    @param bh     BAM header
1155
1156
    This is called when API functions have changed the header so that the
1157
    text version is no longer valid.
1158
 */
1159
206
static void redact_header_text(sam_hdr_t *bh) {
1160
206
    assert(bh->hrecs && bh->hrecs->dirty);
1161
206
    bh->l_text = 0;
1162
206
    free(bh->text);
1163
206
    bh->text = NULL;
1164
206
}
1165
1166
/** Find nth header record of a given type
1167
1168
    @param type   Header type (SQ, RG etc.)
1169
    @param idx    0-based index
1170
1171
    @return sam_hrec_type_t pointer to the record on success
1172
            NULL if no record exists with the given type and index
1173
 */
1174
1175
static sam_hrec_type_t *sam_hrecs_find_type_pos(sam_hrecs_t *hrecs,
1176
0
                                                const char *type, int idx) {
1177
0
    sam_hrec_type_t *first, *itr;
1178
1179
0
    if (idx < 0)
1180
0
        return NULL;
1181
1182
0
    if (type[0] == 'S' && type[1] == 'Q')
1183
0
        return idx < hrecs->nref ? hrecs->ref[idx].ty : NULL;
1184
1185
0
    if (type[0] == 'R' && type[1] == 'G')
1186
0
        return idx < hrecs->nrg ? hrecs->rg[idx].ty : NULL;
1187
1188
0
    if (type[0] == 'P' && type[1] == 'G')
1189
0
        return idx < hrecs->npg ? hrecs->pg[idx].ty : NULL;
1190
1191
0
    first = itr = sam_hrecs_find_type_id(hrecs, type, NULL, NULL);
1192
0
    if (!first)
1193
0
        return NULL;
1194
1195
0
    while (idx > 0) {
1196
0
        itr = itr->next;
1197
0
        if (itr == first)
1198
0
            break;
1199
0
        --idx;
1200
0
    }
1201
1202
0
    return idx == 0 ? itr : NULL;
1203
0
}
1204
1205
/* ==== Public methods ==== */
1206
1207
0
size_t sam_hdr_length(sam_hdr_t *bh) {
1208
0
    if (!bh || -1 == sam_hdr_rebuild(bh))
1209
0
        return SIZE_MAX;
1210
1211
0
    return bh->l_text;
1212
0
}
1213
1214
0
const char *sam_hdr_str(sam_hdr_t *bh) {
1215
0
    if (!bh || -1 == sam_hdr_rebuild(bh))
1216
0
        return NULL;
1217
1218
0
    return bh->text;
1219
0
}
1220
1221
0
int sam_hdr_nref(const sam_hdr_t *bh) {
1222
0
    if (!bh)
1223
0
        return -1;
1224
1225
0
    return bh->hrecs ? bh->hrecs->nref : bh->n_targets;
1226
0
}
1227
1228
/*
1229
 * Reconstructs the text representation from the header hash table.
1230
 * Returns 0 on success
1231
 *        -1 on failure
1232
 */
1233
0
int sam_hdr_rebuild(sam_hdr_t *bh) {
1234
0
    sam_hrecs_t *hrecs;
1235
0
    if (!bh)
1236
0
        return -1;
1237
1238
0
    if (!(hrecs = bh->hrecs))
1239
0
        return bh->text ? 0 : -1;
1240
1241
0
    if (hrecs->refs_changed >= 0) {
1242
0
        if (rebuild_target_arrays(bh) < 0) {
1243
0
            hts_log_error("Header target array rebuild has failed");
1244
0
            return -1;
1245
0
        }
1246
0
    }
1247
1248
    /* If header text wasn't changed or header is empty, don't rebuild it. */
1249
0
    if (!hrecs->dirty)
1250
0
        return 0;
1251
1252
0
    if (hrecs->pgs_changed && sam_hdr_link_pg(bh) < 0) {
1253
0
        hts_log_error("Linking @PG lines has failed");
1254
0
        return -1;
1255
0
    }
1256
1257
0
    kstring_t ks = KS_INITIALIZE;
1258
0
    if (sam_hrecs_rebuild_text(hrecs, &ks) != 0) {
1259
0
        ks_free(&ks);
1260
0
        hts_log_error("Header text rebuild has failed");
1261
0
        return -1;
1262
0
    }
1263
1264
0
    hrecs->dirty = 0;
1265
1266
    /* Sync */
1267
0
    free(bh->text);
1268
0
    bh->l_text = ks_len(&ks);
1269
0
    bh->text = ks_release(&ks);
1270
1271
0
    return 0;
1272
0
}
1273
1274
/*
1275
 * Appends a formatted line to an existing SAM header.
1276
 * Line is a full SAM header record, eg "@SQ\tSN:foo\tLN:100", with
1277
 * optional new-line. If it contains more than 1 line then multiple lines
1278
 * will be added in order.
1279
 *
1280
 * Input text is of maximum length len or as terminated earlier by a NUL.
1281
 * len may be 0 if unknown, in which case lines must be NUL-terminated.
1282
 *
1283
 * Returns 0 on success
1284
 *        -1 on failure
1285
 */
1286
793
int sam_hdr_add_lines(sam_hdr_t *bh, const char *lines, size_t len) {
1287
793
    sam_hrecs_t *hrecs;
1288
1289
793
    if (!bh || !lines)
1290
0
        return -1;
1291
1292
793
    if (len == 0 && *lines == '\0')
1293
587
        return 0;
1294
1295
206
    if (!(hrecs = bh->hrecs)) {
1296
206
        if (sam_hdr_fill_hrecs(bh) != 0)
1297
0
            return -1;
1298
206
        hrecs = bh->hrecs;
1299
206
    }
1300
1301
206
    if (sam_hrecs_parse_lines(hrecs, lines, len) != 0)
1302
0
        return -1;
1303
1304
206
    if (hrecs->refs_changed >= 0 && rebuild_target_arrays(bh) != 0)
1305
0
        return -1;
1306
1307
206
    hrecs->dirty = 1;
1308
206
    redact_header_text(bh);
1309
1310
206
    return 0;
1311
206
}
1312
1313
/*
1314
 * Adds a single line to a SAM header.
1315
 * Specify type and one or more key,value pairs, ending with the NULL key.
1316
 * Eg. sam_hdr_add_line(h, "SQ", "ID", "foo", "LN", "100", NULL).
1317
 *
1318
 * Returns 0 on success
1319
 *        -1 on failure
1320
 */
1321
0
int sam_hdr_add_line(sam_hdr_t *bh, const char *type, ...) {
1322
0
    va_list args;
1323
0
    sam_hrecs_t *hrecs;
1324
1325
0
    if (!bh || !type)
1326
0
        return -1;
1327
1328
0
    if (!(hrecs = bh->hrecs)) {
1329
0
        if (sam_hdr_fill_hrecs(bh) != 0)
1330
0
            return -1;
1331
0
        hrecs = bh->hrecs;
1332
0
    }
1333
1334
0
    va_start(args, type);
1335
0
    int ret = sam_hrecs_vadd(hrecs, type, args, NULL);
1336
0
    va_end(args);
1337
1338
0
    if (ret == 0) {
1339
0
        if (hrecs->refs_changed >= 0 && rebuild_target_arrays(bh) != 0)
1340
0
            return -1;
1341
1342
0
        if (hrecs->dirty)
1343
0
            redact_header_text(bh);
1344
0
    }
1345
1346
0
    return ret;
1347
0
}
1348
1349
/*
1350
 * Returns a complete line of formatted text for a specific head type/ID
1351
 * combination. If ID_key is NULL then it returns the first line of the specified
1352
 * type.
1353
 */
1354
int sam_hdr_find_line_id(sam_hdr_t *bh, const char *type,
1355
0
                      const char *ID_key, const char *ID_val, kstring_t *ks) {
1356
0
    sam_hrecs_t *hrecs;
1357
0
    if (!bh || !type)
1358
0
        return -2;
1359
1360
0
    if (!(hrecs = bh->hrecs)) {
1361
0
        if (sam_hdr_fill_hrecs(bh) != 0)
1362
0
            return -2;
1363
0
        hrecs = bh->hrecs;
1364
0
    }
1365
1366
0
    sam_hrec_type_t *ty = sam_hrecs_find_type_id(hrecs, type, ID_key, ID_val);
1367
0
    if (!ty)
1368
0
        return -1;
1369
1370
0
    ks->l = 0;
1371
0
    if (build_header_line(ty, ks) < 0) {
1372
0
        return -2;
1373
0
    }
1374
1375
0
    return 0;
1376
0
}
1377
1378
int sam_hdr_find_line_pos(sam_hdr_t *bh, const char *type,
1379
0
                          int pos, kstring_t *ks) {
1380
0
    sam_hrecs_t *hrecs;
1381
0
    if (!bh || !type)
1382
0
        return -2;
1383
1384
0
    if (!(hrecs = bh->hrecs)) {
1385
0
        if (sam_hdr_fill_hrecs(bh) != 0)
1386
0
            return -2;
1387
0
        hrecs = bh->hrecs;
1388
0
    }
1389
1390
0
    sam_hrec_type_t *ty = sam_hrecs_find_type_pos(hrecs, type, pos);
1391
0
    if (!ty)
1392
0
        return -1;
1393
1394
0
    ks->l = 0;
1395
0
    if (build_header_line(ty, ks) < 0) {
1396
0
        return -2;
1397
0
    }
1398
1399
0
    return 0;
1400
0
}
1401
1402
/*
1403
 * Remove a line from the header by specifying a tag:value that uniquely
1404
 * identifies a line, i.e. the @SQ line containing "SN:ref1".
1405
 * @SQ line is uniquely identified by SN tag.
1406
 * @RG line is uniquely identified by ID tag.
1407
 * @PG line is uniquely identified by ID tag.
1408
 *
1409
 * Returns 0 on success and -1 on error
1410
 */
1411
1412
0
int sam_hdr_remove_line_id(sam_hdr_t *bh, const char *type, const char *ID_key, const char *ID_value) {
1413
0
    sam_hrecs_t *hrecs;
1414
0
    if (!bh || !type)
1415
0
        return -1;
1416
1417
0
    if (!(hrecs = bh->hrecs)) {
1418
0
        if (sam_hdr_fill_hrecs(bh) != 0)
1419
0
            return -1;
1420
0
        hrecs = bh->hrecs;
1421
0
    }
1422
1423
0
    if (!strncmp(type, "PG", 2)) {
1424
0
        hts_log_warning("Removing PG lines is not supported!");
1425
0
        return -1;
1426
0
    }
1427
1428
0
    sam_hrec_type_t *type_found = sam_hrecs_find_type_id(hrecs, type, ID_key, ID_value);
1429
0
    if (!type_found)
1430
0
        return 0;
1431
1432
0
    int ret = sam_hrecs_remove_line(hrecs, type, type_found);
1433
0
    if (ret == 0) {
1434
0
        if (hrecs->refs_changed >= 0 && rebuild_target_arrays(bh) != 0)
1435
0
            return -1;
1436
1437
0
        if (hrecs->dirty)
1438
0
            redact_header_text(bh);
1439
0
    }
1440
1441
0
    return ret;
1442
0
}
1443
1444
/*
1445
 * Remove a line from the header by specifying the position in the type
1446
 * group, i.e. 3rd @SQ line.
1447
 *
1448
 * Returns 0 on success and -1 on error
1449
 */
1450
1451
0
int sam_hdr_remove_line_pos(sam_hdr_t *bh, const char *type, int position) {
1452
0
    sam_hrecs_t *hrecs;
1453
0
    if (!bh || !type || position <= 0)
1454
0
        return -1;
1455
1456
0
    if (!(hrecs = bh->hrecs)) {
1457
0
        if (sam_hdr_fill_hrecs(bh) != 0)
1458
0
            return -1;
1459
0
        hrecs = bh->hrecs;
1460
0
    }
1461
1462
0
    if (!strncmp(type, "PG", 2)) {
1463
0
        hts_log_warning("Removing PG lines is not supported!");
1464
0
        return -1;
1465
0
    }
1466
1467
0
    sam_hrec_type_t *type_found = sam_hrecs_find_type_pos(hrecs, type,
1468
0
                                                          position);
1469
0
    if (!type_found)
1470
0
        return -1;
1471
1472
0
    int ret = sam_hrecs_remove_line(hrecs, type, type_found);
1473
0
    if (ret == 0) {
1474
0
        if (hrecs->refs_changed >= 0 && rebuild_target_arrays(bh) != 0)
1475
0
            return -1;
1476
1477
0
        if (hrecs->dirty)
1478
0
            redact_header_text(bh);
1479
0
    }
1480
1481
0
    return ret;
1482
0
}
1483
1484
/*
1485
 * Check if sam_hdr_update_line() is being used to change the name of
1486
 * a record, and if the new name is going to clash with an existing one.
1487
 *
1488
 * If ap includes repeated keys, we go with the last one as sam_hrecs_vupdate()
1489
 * will go through them all and leave the final one in place.
1490
 *
1491
 * Returns 0 if the name does not change
1492
 *         1 if the name changes but does not clash
1493
 *        -1 if the name changes and the new one is already in use
1494
 */
1495
static int check_for_name_update(sam_hrecs_t *hrecs, sam_hrec_type_t *rec,
1496
                                 va_list ap, const char **old_name,
1497
                                 const char **new_name,
1498
                                 char id_tag_out[3],
1499
0
                                 khash_t(m_s2i) **hash_out) {
1500
0
    char *key, *val;
1501
0
    const char *id_tag;
1502
0
    sam_hrec_tag_t *tag, *prev;
1503
0
    khash_t(m_s2i) *hash;
1504
0
    khint_t k;
1505
0
    int ret = 0;
1506
1507
0
    if        (rec->type == TYPEKEY("SQ")) {
1508
0
        id_tag = "SN"; hash = hrecs->ref_hash;
1509
0
    } else if (rec->type == TYPEKEY("RG")) {
1510
0
        id_tag = "ID"; hash = hrecs->rg_hash;
1511
0
    } else if (rec->type == TYPEKEY("PG")) {
1512
0
        id_tag = "ID"; hash = hrecs->pg_hash;
1513
0
    } else {
1514
0
        return 0;
1515
0
    }
1516
1517
0
    memcpy(id_tag_out, id_tag, 3);
1518
0
    *hash_out = hash;
1519
1520
0
    tag = sam_hrecs_find_key(rec, id_tag, &prev);
1521
0
    if (!tag)
1522
0
        return 0;
1523
0
    assert(tag->len >= 3);
1524
0
    *old_name = tag->str + 3;
1525
1526
0
    while ((key = va_arg(ap, char *)) != NULL) {
1527
0
        val = va_arg(ap, char *);
1528
0
        if (!val) val = "";
1529
0
        if (strcmp(key, id_tag) != 0) continue;
1530
0
        if (strcmp(val, tag->str + 3) == 0) { ret = 0; continue; }
1531
0
        k = kh_get(m_s2i, hash, val);
1532
0
        ret = k < kh_end(hash) ? -1 : 1;
1533
0
        *new_name = val;
1534
0
    }
1535
0
    return ret;
1536
0
}
1537
1538
int sam_hdr_update_line(sam_hdr_t *bh, const char *type,
1539
0
        const char *ID_key, const char *ID_value, ...) {
1540
0
    sam_hrecs_t *hrecs;
1541
0
    if (!bh)
1542
0
        return -1;
1543
1544
0
    if (!(hrecs = bh->hrecs)) {
1545
0
        if (sam_hdr_fill_hrecs(bh) != 0)
1546
0
            return -1;
1547
0
        hrecs = bh->hrecs;
1548
0
    }
1549
1550
0
    int ret, rename;
1551
0
    sam_hrec_type_t *ty = sam_hrecs_find_type_id(hrecs, type, ID_key, ID_value);
1552
0
    if (!ty)
1553
0
        return -1;
1554
1555
0
    va_list args;
1556
0
    const char *old_name = "?", *new_name = "?";
1557
0
    char id_tag[3];
1558
0
    khash_t(m_s2i) *hash = NULL;
1559
0
    va_start(args, ID_value);
1560
0
    rename = check_for_name_update(hrecs, ty, args,
1561
0
                                   &old_name, &new_name, id_tag, &hash);
1562
0
    va_end(args);
1563
0
    if (rename < 0) {
1564
0
        hts_log_error("Cannot rename @%s \"%s\" to \"%s\" : already exists",
1565
0
                      type, old_name, new_name);
1566
0
        return -1;
1567
0
    }
1568
0
    if (rename > 0 && TYPEKEY(type) == TYPEKEY("PG")) {
1569
        // This is just too complicated
1570
0
        hts_log_error("Renaming @PG records is not supported");
1571
0
        return -1;
1572
0
    }
1573
0
    va_start(args, ID_value);
1574
0
    ret = sam_hrecs_vupdate(hrecs, ty, args);
1575
0
    va_end(args);
1576
1577
0
    if (ret)
1578
0
        return ret;
1579
1580
    // TODO Account for @SQ-AN altnames
1581
1582
0
    if (rename) {
1583
        // Adjust the hash table to point to the new name
1584
        // sam_hrecs_update_hashes() should sort out everything else
1585
0
        khint_t k = kh_get(m_s2i, hash, old_name);
1586
0
        sam_hrec_tag_t *new_tag = sam_hrecs_find_key(ty, id_tag, NULL);
1587
0
        int r, pos;
1588
0
        assert(k < kh_end(hash));        // Or we wouldn't have found it earlier
1589
0
        assert(new_tag && new_tag->str); // id_tag should exist
1590
0
        assert(new_tag->len > 3);
1591
0
        pos = kh_val(hash, k);
1592
0
        kh_del(m_s2i, hash, k);
1593
0
        k = kh_put(m_s2i, hash, new_tag->str + 3, &r);
1594
0
        if (r < 1) {
1595
0
            hts_log_error("Failed to rename item in hash table");
1596
0
            return -1;
1597
0
        }
1598
0
        kh_val(hash, k) = pos;
1599
0
    }
1600
1601
0
    ret = sam_hrecs_update_hashes(hrecs, TYPEKEY(type), ty);
1602
1603
0
    if (!ret && hrecs->refs_changed >= 0)
1604
0
        ret = rebuild_target_arrays(bh);
1605
1606
0
    if (!ret && hrecs->dirty)
1607
0
        redact_header_text(bh);
1608
1609
0
    return ret;
1610
0
}
1611
1612
0
int sam_hdr_remove_except(sam_hdr_t *bh, const char *type, const char *ID_key, const char *ID_value) {
1613
0
    sam_hrecs_t *hrecs;
1614
0
    if (!bh || !type)
1615
0
        return -1;
1616
1617
0
    if (!(hrecs = bh->hrecs)) {
1618
0
        if (sam_hdr_fill_hrecs(bh) != 0)
1619
0
            return -1;
1620
0
        hrecs = bh->hrecs;
1621
0
    }
1622
1623
0
    sam_hrec_type_t *step;
1624
0
    int ret = 1, remove_all = (ID_key == NULL);
1625
1626
0
    if (!strncmp(type, "PG", 2) || !strncmp(type, "CO", 2)) {
1627
0
        hts_log_warning("Removing PG or CO lines is not supported!");
1628
0
        return -1;
1629
0
    }
1630
1631
0
    sam_hrec_type_t *type_found = sam_hrecs_find_type_id(hrecs, type, ID_key, ID_value);
1632
0
    if (!type_found) { // remove all line of this type
1633
0
        khint_t k = kh_get(sam_hrecs_t, hrecs->h, TYPEKEY(type));
1634
0
        if (k == kh_end(hrecs->h))
1635
0
            return 0;
1636
0
        type_found =  kh_val(hrecs->h, k);
1637
0
        if (!type_found)
1638
0
            return 0;
1639
0
        remove_all = 1;
1640
0
    }
1641
1642
0
    step = type_found->next;
1643
0
    while (step != type_found) {
1644
0
        sam_hrec_type_t *to_remove = step;
1645
0
        step = step->next;
1646
0
        ret &= sam_hrecs_remove_line(hrecs, type, to_remove);
1647
0
    }
1648
1649
0
    if (remove_all)
1650
0
        ret &= sam_hrecs_remove_line(hrecs, type, type_found);
1651
1652
0
    if (!ret && hrecs->dirty)
1653
0
        redact_header_text(bh);
1654
1655
0
    return 0;
1656
0
}
1657
1658
0
int sam_hdr_remove_lines(sam_hdr_t *bh, const char *type, const char *id, void *vrh) {
1659
0
    sam_hrecs_t *hrecs;
1660
0
    rmhash_t *rh = (rmhash_t *)vrh;
1661
1662
0
    if (!bh || !type)
1663
0
        return -1;
1664
0
    if (!rh) // remove all lines
1665
0
        return sam_hdr_remove_except(bh, type, NULL, NULL);
1666
0
    if (!id)
1667
0
        return -1;
1668
1669
0
    if (!(hrecs = bh->hrecs)) {
1670
0
        if (sam_hdr_fill_hrecs(bh) != 0)
1671
0
            return -1;
1672
0
        hrecs = bh->hrecs;
1673
0
    }
1674
1675
0
    khint_t k = kh_get(sam_hrecs_t, hrecs->h, TYPEKEY(type));
1676
0
    if (k == kh_end(hrecs->h)) // nothing to remove from
1677
0
        return 0;
1678
1679
0
    sam_hrec_type_t *head = kh_val(hrecs->h, k);
1680
0
    if (!head) {
1681
0
        hts_log_error("Header inconsistency");
1682
0
        return -1;
1683
0
    }
1684
1685
0
    int ret = 0;
1686
0
    sam_hrec_type_t *step = head->next;
1687
0
    while (step != head) {
1688
0
        sam_hrec_tag_t *tag = sam_hrecs_find_key(step, id, NULL);
1689
0
        if (tag && tag->str && tag->len >= 3) {
1690
0
           k = kh_get(rm, rh, tag->str+3);
1691
0
           if (k == kh_end(rh)) { // value is not in the hash table, so remove
1692
0
               sam_hrec_type_t *to_remove = step;
1693
0
               step = step->next;
1694
0
               ret |= sam_hrecs_remove_line(hrecs, type, to_remove);
1695
0
           } else {
1696
0
               step = step->next;
1697
0
           }
1698
0
        } else { // tag is not on the line, so skip to next line
1699
0
            step = step->next;
1700
0
        }
1701
0
    }
1702
1703
    // process the first line
1704
0
    sam_hrec_tag_t * tag = sam_hrecs_find_key(head, id, NULL);
1705
0
    if (tag && tag->str && tag->len >= 3) {
1706
0
       k = kh_get(rm, rh, tag->str+3);
1707
0
       if (k == kh_end(rh)) { // value is not in the hash table, so remove
1708
0
           sam_hrec_type_t *to_remove = head;
1709
0
           head = head->next;
1710
0
           ret |= sam_hrecs_remove_line(hrecs, type, to_remove);
1711
0
       }
1712
0
    }
1713
1714
0
    if (!ret && hrecs->dirty)
1715
0
        redact_header_text(bh);
1716
1717
0
    return ret;
1718
0
}
1719
1720
2.68k
int sam_hdr_count_lines(sam_hdr_t *bh, const char *type) {
1721
2.68k
    int count;
1722
2.68k
    sam_hrec_type_t *first_ty, *itr_ty;
1723
1724
2.68k
    if (!bh || !type)
1725
0
        return -1;
1726
1727
2.68k
    if (!bh->hrecs) {
1728
2.68k
        if (sam_hdr_fill_hrecs(bh) != 0)
1729
376
            return -1;
1730
2.68k
    }
1731
1732
    // Deal with types that have counts
1733
2.31k
    switch (type[0]) {
1734
2.31k
    case 'S':
1735
2.31k
        if (type[1] == 'Q')
1736
2.31k
            return bh->hrecs->nref;
1737
0
        break;
1738
0
    case 'R':
1739
0
        if (type[1] == 'G')
1740
0
            return bh->hrecs->nrg;
1741
0
        break;
1742
0
    case 'P':
1743
0
        if (type[1] == 'G')
1744
0
            return bh->hrecs->npg;
1745
0
        break;
1746
0
    default:
1747
0
        break;
1748
2.31k
    }
1749
1750
0
    first_ty = sam_hrecs_find_type_id(bh->hrecs, type, NULL, NULL);
1751
0
    if (!first_ty)
1752
0
        return 0;
1753
1754
0
    count = 1;
1755
0
    for (itr_ty = first_ty->next;
1756
0
         itr_ty && itr_ty != first_ty; itr_ty = itr_ty->next) {
1757
0
        count++;
1758
0
    }
1759
1760
0
    return count;
1761
0
}
1762
1763
int sam_hdr_line_index(sam_hdr_t *bh,
1764
                       const char *type,
1765
0
                       const char *key) {
1766
0
    sam_hrecs_t *hrecs;
1767
0
    if (!bh || !type || !key)
1768
0
        return -2;
1769
1770
0
    if (!(hrecs = bh->hrecs)) {
1771
0
        if (sam_hdr_fill_hrecs(bh) != 0)
1772
0
            return -2;
1773
0
        hrecs = bh->hrecs;
1774
0
    }
1775
1776
0
    khint_t k;
1777
0
    int idx = -1;
1778
0
    switch (type[0]) {
1779
0
    case 'S':
1780
0
        if (type[1] == 'Q') {
1781
0
            k = kh_get(m_s2i, hrecs->ref_hash, key);
1782
0
            if (k != kh_end(hrecs->ref_hash))
1783
0
                idx = kh_val(hrecs->ref_hash, k);
1784
0
        } else {
1785
0
            hts_log_warning("Type '%s' not supported. Only @SQ, @RG and @PG lines are indexed", type);
1786
0
        }
1787
0
        break;
1788
0
    case 'R':
1789
0
        if (type[1] == 'G') {
1790
0
            k = kh_get(m_s2i, hrecs->rg_hash, key);
1791
0
            if (k != kh_end(hrecs->rg_hash))
1792
0
                idx = kh_val(hrecs->rg_hash, k);
1793
0
        } else {
1794
0
            hts_log_warning("Type '%s' not supported. Only @SQ, @RG and @PG lines are indexed", type);
1795
0
        }
1796
0
        break;
1797
0
    case 'P':
1798
0
        if (type[1] == 'G') {
1799
0
            k = kh_get(m_s2i, hrecs->pg_hash, key);
1800
0
            if (k != kh_end(hrecs->pg_hash))
1801
0
                idx = kh_val(hrecs->pg_hash, k);
1802
0
        } else {
1803
0
            hts_log_warning("Type '%s' not supported. Only @SQ, @RG and @PG lines are indexed", type);
1804
0
        }
1805
0
        break;
1806
0
    default:
1807
0
        hts_log_warning("Type '%s' not supported. Only @SQ, @RG and @PG lines are indexed", type);
1808
0
    }
1809
1810
0
    return idx;
1811
0
}
1812
1813
const char *sam_hdr_line_name(sam_hdr_t *bh,
1814
                              const char *type,
1815
0
                              int pos) {
1816
0
    sam_hrecs_t *hrecs;
1817
0
    if (!bh || !type || pos < 0)
1818
0
        return NULL;
1819
1820
0
    if (!(hrecs = bh->hrecs)) {
1821
0
        if (sam_hdr_fill_hrecs(bh) != 0)
1822
0
            return NULL;
1823
0
        hrecs = bh->hrecs;
1824
0
    }
1825
1826
0
    switch (type[0]) {
1827
0
    case 'S':
1828
0
        if (type[1] == 'Q') {
1829
0
            if (pos < hrecs->nref)
1830
0
                return hrecs->ref[pos].name;
1831
0
        } else {
1832
0
            hts_log_warning("Type '%s' not supported. Only @SQ, @RG and @PG lines are indexed", type);
1833
0
        }
1834
0
        break;
1835
0
    case 'R':
1836
0
        if (type[1] == 'G') {
1837
0
            if (pos < hrecs->nrg)
1838
0
                return hrecs->rg[pos].name;
1839
0
        } else {
1840
0
            hts_log_warning("Type '%s' not supported. Only @SQ, @RG and @PG lines are indexed", type);
1841
0
        }
1842
0
        break;
1843
0
    case 'P':
1844
0
        if (type[1] == 'G') {
1845
0
            if (pos < hrecs->npg)
1846
0
                return hrecs->pg[pos].name;
1847
0
        } else {
1848
0
            hts_log_warning("Type '%s' not supported. Only @SQ, @RG and @PG lines are indexed", type);
1849
0
        }
1850
0
        break;
1851
0
    default:
1852
0
        hts_log_warning("Type '%s' not supported. Only @SQ, @RG and @PG lines are indexed", type);
1853
0
    }
1854
1855
0
    return NULL;
1856
0
}
1857
1858
/* ==== Key:val level methods ==== */
1859
1860
int sam_hdr_find_tag_id(sam_hdr_t *bh,
1861
                     const char *type,
1862
                     const char *ID_key,
1863
                     const char *ID_value,
1864
                     const char *key,
1865
0
                     kstring_t *ks) {
1866
0
    sam_hrecs_t *hrecs;
1867
0
    if (!bh || !type || !key)
1868
0
        return -2;
1869
1870
0
    if (!(hrecs = bh->hrecs)) {
1871
0
        if (sam_hdr_fill_hrecs(bh) != 0)
1872
0
            return -2;
1873
0
        hrecs = bh->hrecs;
1874
0
    }
1875
1876
0
    sam_hrec_type_t *ty = sam_hrecs_find_type_id(hrecs, type, ID_key, ID_value);
1877
0
    if (!ty)
1878
0
        return -1;
1879
1880
0
    sam_hrec_tag_t *tag = sam_hrecs_find_key(ty, key, NULL);
1881
0
    if (!tag || !tag->str || tag->len < 4)
1882
0
        return -1;
1883
1884
0
    ks->l = 0;
1885
0
    if (kputsn(tag->str+3, tag->len-3, ks) == EOF) {
1886
0
        return -2;
1887
0
    }
1888
1889
0
    return 0;
1890
0
}
1891
1892
int sam_hdr_find_tag_pos(sam_hdr_t *bh,
1893
                     const char *type,
1894
                     int pos,
1895
                     const char *key,
1896
0
                     kstring_t *ks) {
1897
0
    sam_hrecs_t *hrecs;
1898
0
    if (!bh || !type || !key)
1899
0
        return -2;
1900
1901
0
    if (!(hrecs = bh->hrecs)) {
1902
0
        if (sam_hdr_fill_hrecs(bh) != 0)
1903
0
            return -2;
1904
0
        hrecs = bh->hrecs;
1905
0
    }
1906
1907
0
    sam_hrec_type_t *ty = sam_hrecs_find_type_pos(hrecs, type, pos);
1908
0
    if (!ty)
1909
0
        return -1;
1910
1911
0
    sam_hrec_tag_t *tag = sam_hrecs_find_key(ty, key, NULL);
1912
0
    if (!tag || !tag->str || tag->len < 4)
1913
0
        return -1;
1914
1915
0
    ks->l = 0;
1916
0
    if (kputsn(tag->str+3, tag->len-3, ks) == EOF) {
1917
0
        return -2;
1918
0
    }
1919
1920
0
    return 0;
1921
0
}
1922
1923
int sam_hdr_remove_tag_id(sam_hdr_t *bh,
1924
        const char *type,
1925
        const char *ID_key,
1926
        const char *ID_value,
1927
0
        const char *key) {
1928
0
    sam_hrecs_t *hrecs;
1929
0
    if (!bh || !type || !key)
1930
0
        return -1;
1931
1932
0
    if (!(hrecs = bh->hrecs)) {
1933
0
        if (sam_hdr_fill_hrecs(bh) != 0)
1934
0
            return -1;
1935
0
        hrecs = bh->hrecs;
1936
0
    }
1937
1938
0
    sam_hrec_type_t *ty = sam_hrecs_find_type_id(hrecs, type, ID_key, ID_value);
1939
0
    if (!ty)
1940
0
        return -1;
1941
1942
0
    int ret = sam_hrecs_remove_key(hrecs, ty, key);
1943
0
    if (!ret && hrecs->dirty)
1944
0
        redact_header_text(bh);
1945
1946
0
    return ret;
1947
0
}
1948
1949
/*
1950
 * Reconstructs a kstring from the header hash table.
1951
 * Returns 0 on success
1952
 *        -1 on failure
1953
 */
1954
3.10k
int sam_hrecs_rebuild_text(const sam_hrecs_t *hrecs, kstring_t *ks) {
1955
3.10k
    ks->l = 0;
1956
1957
3.10k
    if (!hrecs->h || !hrecs->h->size) {
1958
1.71k
        return kputsn("", 0, ks) >= 0 ? 0 : -1;
1959
1.71k
    }
1960
1.38k
    if (sam_hrecs_rebuild_lines(hrecs, ks) != 0)
1961
0
        return -1;
1962
1963
1.38k
    return 0;
1964
1.38k
}
1965
1966
/*
1967
 * Looks up a reference sequence by name and returns the numerical ID.
1968
 * Returns -1 if unknown reference; -2 if header could not be parsed.
1969
 */
1970
20.8k
int sam_hdr_name2tid(sam_hdr_t *bh, const char *ref) {
1971
20.8k
    sam_hrecs_t *hrecs;
1972
20.8k
    khint_t k;
1973
1974
20.8k
    if (!bh)
1975
0
        return -1;
1976
1977
20.8k
    if (!(hrecs = bh->hrecs)) {
1978
92
        if (sam_hdr_fill_hrecs(bh) != 0)
1979
92
            return -2;
1980
0
        hrecs = bh->hrecs;
1981
0
    }
1982
1983
20.7k
    if (!hrecs->ref_hash)
1984
0
        return -1;
1985
1986
20.7k
    k = kh_get(m_s2i, hrecs->ref_hash, ref);
1987
20.7k
    return k == kh_end(hrecs->ref_hash) ? -1 : kh_val(hrecs->ref_hash, k);
1988
20.7k
}
1989
1990
0
const char *sam_hdr_tid2name(const sam_hdr_t *h, int tid) {
1991
0
    sam_hrecs_t *hrecs;
1992
1993
0
    if (!h || tid < 0)
1994
0
        return NULL;
1995
1996
0
    if ((hrecs = h->hrecs) != NULL && tid < hrecs->nref) {
1997
0
        return hrecs->ref[tid].name;
1998
0
    } else {
1999
0
        if (tid < h->n_targets)
2000
0
            return h->target_name[tid];
2001
0
    }
2002
2003
0
    return NULL;
2004
0
}
2005
2006
0
hts_pos_t sam_hdr_tid2len(const sam_hdr_t *h, int tid) {
2007
0
    sam_hrecs_t *hrecs;
2008
2009
0
    if (!h || tid < 0)
2010
0
        return 0;
2011
2012
0
    if ((hrecs = h->hrecs) != NULL && tid < hrecs->nref) {
2013
0
        return hrecs->ref[tid].len;
2014
0
    } else {
2015
0
        if (tid < h->n_targets) {
2016
0
            if (h->target_len[tid] < UINT32_MAX || !h->sdict) {
2017
0
                return h->target_len[tid];
2018
0
            } else {
2019
0
                khash_t(s2i) *long_refs = (khash_t(s2i) *) h->sdict;
2020
0
                khint_t k = kh_get(s2i, long_refs, h->target_name[tid]);
2021
0
                if (k < kh_end(long_refs)) {
2022
0
                    return kh_val(long_refs, k);
2023
0
                } else {
2024
0
                    return UINT32_MAX;
2025
0
                }
2026
0
            }
2027
0
        }
2028
0
    }
2029
2030
0
    return 0;
2031
0
}
2032
2033
/*
2034
 * Fixes any PP links in @PG headers.
2035
 * If the entries are in order then this doesn't need doing, but in case
2036
 * our header is out of order this goes through the hrecs->pg[] array
2037
 * setting the prev_id field.
2038
 *
2039
 * Note we can have multiple complete chains. This code should identify the
2040
 * tails of these chains as these are the entries we have to link to in
2041
 * subsequent PP records.
2042
 *
2043
 * Returns 0 on success
2044
 *        -1 on failure (indicating broken PG/PP records)
2045
 */
2046
0
static int sam_hdr_link_pg(sam_hdr_t *bh) {
2047
0
    sam_hrecs_t *hrecs;
2048
0
    int i, j, ret = 0, *new_pg_end;
2049
2050
0
    if (!bh)
2051
0
        return -1;
2052
2053
0
    if (!(hrecs = bh->hrecs)) {
2054
0
        if (sam_hdr_fill_hrecs(bh) != 0)
2055
0
            return -1;
2056
0
        hrecs = bh->hrecs;
2057
0
    }
2058
2059
0
    if (!hrecs->pgs_changed || !hrecs->npg)
2060
0
        return 0;
2061
2062
0
    hrecs->npg_end_alloc = hrecs->npg;
2063
0
    new_pg_end = realloc(hrecs->pg_end, hrecs->npg * sizeof(*new_pg_end));
2064
0
    if (!new_pg_end)
2065
0
        return -1;
2066
0
    hrecs->pg_end = new_pg_end;
2067
0
    int *chain_size = calloc(hrecs->npg, sizeof(int));
2068
0
    if (!chain_size)
2069
0
        return -1;
2070
2071
0
    for (i = 0; i < hrecs->npg; i++)
2072
0
        hrecs->pg_end[i] = i;
2073
2074
0
    for (i = 0; i < hrecs->npg; i++) {
2075
0
        khint_t k;
2076
0
        sam_hrec_tag_t *tag;
2077
2078
0
        assert(hrecs->pg[i].ty != NULL);
2079
0
        for (tag = hrecs->pg[i].ty->tag; tag; tag = tag->next) {
2080
0
            if (tag->str[0] == 'P' && tag->str[1] == 'P')
2081
0
                break;
2082
0
        }
2083
0
        if (!tag) {
2084
            /* Chain start points */
2085
0
            continue;
2086
0
        }
2087
2088
0
        k = kh_get(m_s2i, hrecs->pg_hash, tag->str+3);
2089
2090
0
        if (k == kh_end(hrecs->pg_hash)) {
2091
0
            hts_log_warning("PG line with PN:%s has a PP link to missing program '%s'",
2092
0
                    hrecs->pg[i].name, tag->str+3);
2093
0
            continue;
2094
0
        }
2095
2096
0
        hrecs->pg[i].prev_id = hrecs->pg[kh_val(hrecs->pg_hash, k)].id;
2097
0
        hrecs->pg_end[kh_val(hrecs->pg_hash, k)] = -1;
2098
0
        chain_size[i] = chain_size[kh_val(hrecs->pg_hash, k)]+1;
2099
0
    }
2100
2101
0
    for (i = j = 0; i < hrecs->npg; i++) {
2102
0
        if (hrecs->pg_end[i] != -1 && chain_size[i] > 0)
2103
0
            hrecs->pg_end[j++] = hrecs->pg_end[i];
2104
0
    }
2105
    /* Only leafs? Choose the last one! */
2106
0
    if (!j && hrecs->npg_end > 0) {
2107
0
        hrecs->pg_end[0] = hrecs->pg_end[hrecs->npg_end-1];
2108
0
        j = 1;
2109
0
    }
2110
2111
0
    hrecs->npg_end = j;
2112
0
    hrecs->pgs_changed = 0;
2113
2114
    /* mark as dirty or empty for rebuild */
2115
0
    hrecs->dirty = 1;
2116
0
    redact_header_text(bh);
2117
0
    free(chain_size);
2118
2119
0
    return ret;
2120
0
}
2121
2122
/*
2123
 * Returns a unique ID from a base name.
2124
 *
2125
 * The value returned is valid until the next call to
2126
 * this function.
2127
 */
2128
0
const char *sam_hdr_pg_id(sam_hdr_t *bh, const char *name) {
2129
0
    sam_hrecs_t *hrecs;
2130
0
    size_t name_len;
2131
0
    const size_t name_extra = 17;
2132
0
    if (!bh || !name)
2133
0
        return NULL;
2134
2135
0
    if (!(hrecs = bh->hrecs)) {
2136
0
        if (sam_hdr_fill_hrecs(bh) != 0)
2137
0
            return NULL;
2138
0
        hrecs = bh->hrecs;
2139
0
    }
2140
2141
0
    khint_t k = kh_get(m_s2i, hrecs->pg_hash, name);
2142
0
    if (k == kh_end(hrecs->pg_hash))
2143
0
        return name;
2144
2145
0
    name_len = strlen(name);
2146
0
    if (name_len > 1000) name_len = 1000;
2147
0
    if (hrecs->ID_buf_sz < name_len + name_extra) {
2148
0
        char *new_ID_buf = realloc(hrecs->ID_buf, name_len + name_extra);
2149
0
        if (new_ID_buf == NULL)
2150
0
            return NULL;
2151
0
        hrecs->ID_buf = new_ID_buf;
2152
0
        hrecs->ID_buf_sz = name_len + name_extra;
2153
0
    }
2154
2155
0
    do {
2156
0
        snprintf(hrecs->ID_buf, hrecs->ID_buf_sz, "%.1000s.%d", name, hrecs->ID_cnt++);
2157
0
        k = kh_get(m_s2i, hrecs->pg_hash, hrecs->ID_buf);
2158
0
    } while (k != kh_end(hrecs->pg_hash));
2159
2160
0
    return hrecs->ID_buf;
2161
0
}
2162
2163
/*
2164
 * Add an @PG line.
2165
 *
2166
 * If we wish complete control over this use sam_hdr_add_line() directly. This
2167
 * function uses that, but attempts to do a lot of tedious house work for
2168
 * you too.
2169
 *
2170
 * - It will generate a suitable ID if the supplied one clashes.
2171
 * - It will generate multiple @PG records if we have multiple PG chains.
2172
 *
2173
 * Call it as per sam_hdr_add_line() with a series of key,value pairs ending
2174
 * in NULL.
2175
 *
2176
 * Returns 0 on success
2177
 *        -1 on failure
2178
 */
2179
0
int sam_hdr_add_pg(sam_hdr_t *bh, const char *name, ...) {
2180
0
    sam_hrecs_t *hrecs;
2181
0
    const char *specified_id = NULL, *specified_pn = NULL, *specified_pp = NULL;
2182
0
    const char *key, *val;
2183
0
    if (!bh)
2184
0
        return -1;
2185
2186
0
    if (!(hrecs = bh->hrecs)) {
2187
0
        if (sam_hdr_fill_hrecs(bh) != 0)
2188
0
            return -1;
2189
0
        hrecs = bh->hrecs;
2190
0
    }
2191
2192
0
    bh->hrecs->pgs_changed = 1;
2193
0
    if (sam_hdr_link_pg(bh) < 0) {
2194
0
        hts_log_error("Error linking @PG lines");
2195
0
        return -1;
2196
0
    }
2197
2198
0
    va_list args;
2199
    // Check for ID / PN / PP tags in varargs list
2200
0
    va_start(args, name);
2201
0
    while ((key = va_arg(args, const char *)) != NULL) {
2202
0
        val = va_arg(args, const char *);
2203
0
        if (!val) break;
2204
0
        if (strcmp(key, "PN") == 0 && *val != '\0')
2205
0
            specified_pn = val;
2206
0
        else if (strcmp(key, "PP") == 0 && *val != '\0')
2207
0
            specified_pp = val;
2208
0
        else if (strcmp(key, "ID") == 0 && *val != '\0')
2209
0
            specified_id = val;
2210
0
    }
2211
0
    va_end(args);
2212
2213
0
    if (specified_id && hrecs->pg_hash) {
2214
0
        khint_t k = kh_get(m_s2i, hrecs->pg_hash, specified_id);
2215
0
        if (k != kh_end(hrecs->pg_hash)) {
2216
0
            hts_log_error("Header @PG ID:%s already present", specified_id);
2217
0
            return -1;
2218
0
        }
2219
0
    }
2220
2221
0
    if (specified_pp && hrecs->pg_hash) {
2222
0
        khint_t k = kh_get(m_s2i, hrecs->pg_hash, specified_pp);
2223
0
        if (k == kh_end(hrecs->pg_hash)) {
2224
0
            hts_log_error("Header @PG ID:%s referred to by PP tag not present",
2225
0
                          specified_pp);
2226
0
            return -1;
2227
0
        }
2228
0
    }
2229
2230
0
    if (!specified_pp && hrecs->npg_end) {
2231
        /* Copy ends array to avoid us looping while modifying it */
2232
0
        int *end = malloc(hrecs->npg_end * sizeof(int));
2233
0
        int i, nends = hrecs->npg_end;
2234
2235
0
        if (!end)
2236
0
            return -1;
2237
2238
0
        memcpy(end, hrecs->pg_end, nends * sizeof(*end));
2239
2240
0
        for (i = 0; i < nends; i++) {
2241
0
            const char *id = !specified_id ? sam_hdr_pg_id(bh, name) : "";
2242
0
            if (!id) {
2243
0
                free(end);
2244
0
                return -1;
2245
0
            }
2246
0
            va_start(args, name);
2247
0
            if (-1 == sam_hrecs_vadd(hrecs, "PG", args,
2248
0
                                     "ID", id,
2249
0
                                     "PN", !specified_pn ? name : "",
2250
0
                                     "PP", hrecs->pg[end[i]].name,
2251
0
                                     NULL)) {
2252
0
                free(end);
2253
0
                return  -1;
2254
0
            }
2255
0
            va_end(args);
2256
0
        }
2257
2258
0
        free(end);
2259
0
    } else {
2260
0
        const char *id = !specified_id ? sam_hdr_pg_id(bh, name) : "";
2261
0
        if (!id)
2262
0
            return -1;
2263
0
        va_start(args, name);
2264
0
        if (-1 == sam_hrecs_vadd(hrecs, "PG", args,
2265
0
                                 "ID", id,
2266
0
                                 "PN", !specified_pn ? name : "",
2267
0
                                 NULL))
2268
0
            return -1;
2269
0
        va_end(args);
2270
0
    }
2271
2272
0
    hrecs->dirty = 1;
2273
0
    redact_header_text(bh);
2274
2275
0
    return 0;
2276
0
}
2277
2278
/*! Increments a reference count on bh.
2279
 *
2280
 * This permits multiple files to share the same header, all calling
2281
 * sam_hdr_destroy when done, without causing errors for other open files.
2282
 */
2283
0
void sam_hdr_incr_ref(sam_hdr_t *bh) {
2284
0
    if (!bh)
2285
0
        return;
2286
0
    bh->ref_count++;
2287
0
}
2288
2289
/* ==== Internal methods ==== */
2290
2291
/*
2292
 * Creates an empty SAM header.  Allocates space for the SAM header
2293
 * structures (hash tables) ready to be populated.
2294
 *
2295
 * Returns a sam_hrecs_t struct on success (free with sam_hrecs_free())
2296
 *         NULL on failure
2297
 */
2298
3.57k
sam_hrecs_t *sam_hrecs_new() {
2299
3.57k
    sam_hrecs_t *hrecs = calloc(1, sizeof(*hrecs));
2300
2301
3.57k
    if (!hrecs)
2302
0
        return NULL;
2303
2304
3.57k
    hrecs->h = kh_init(sam_hrecs_t);
2305
3.57k
    if (!hrecs->h)
2306
0
        goto err;
2307
2308
3.57k
    hrecs->ID_cnt = 1;
2309
2310
3.57k
    hrecs->nref = 0;
2311
3.57k
    hrecs->ref_sz = 0;
2312
3.57k
    hrecs->ref  = NULL;
2313
3.57k
    if (!(hrecs->ref_hash = kh_init(m_s2i)))
2314
0
        goto err;
2315
3.57k
    hrecs->refs_changed = -1;
2316
2317
3.57k
    hrecs->nrg = 0;
2318
3.57k
    hrecs->rg_sz = 0;
2319
3.57k
    hrecs->rg  = NULL;
2320
3.57k
    if (!(hrecs->rg_hash = kh_init(m_s2i)))
2321
0
        goto err;
2322
2323
3.57k
    hrecs->npg = 0;
2324
3.57k
    hrecs->pg_sz = 0;
2325
3.57k
    hrecs->pg  = NULL;
2326
3.57k
    hrecs->npg_end = hrecs->npg_end_alloc = 0;
2327
3.57k
    hrecs->pg_end = NULL;
2328
3.57k
    if (!(hrecs->pg_hash = kh_init(m_s2i)))
2329
0
        goto err;
2330
2331
3.57k
    if (!(hrecs->tag_pool = pool_create(sizeof(sam_hrec_tag_t))))
2332
0
        goto err;
2333
2334
3.57k
    if (!(hrecs->type_pool = pool_create(sizeof(sam_hrec_type_t))))
2335
0
        goto err;
2336
2337
3.57k
    if (!(hrecs->str_pool = string_pool_create(65536)))
2338
0
        goto err;
2339
2340
3.57k
    if (sam_hrecs_init_type_order(hrecs, NULL))
2341
0
        goto err;
2342
2343
3.57k
    return hrecs;
2344
2345
0
err:
2346
0
    if (hrecs->h)
2347
0
        kh_destroy(sam_hrecs_t, hrecs->h);
2348
2349
0
    if (hrecs->tag_pool)
2350
0
        pool_destroy(hrecs->tag_pool);
2351
2352
0
    if (hrecs->type_pool)
2353
0
        pool_destroy(hrecs->type_pool);
2354
2355
0
    if (hrecs->str_pool)
2356
0
        string_pool_destroy(hrecs->str_pool);
2357
2358
0
    free(hrecs);
2359
2360
0
    return NULL;
2361
3.57k
}
2362
#if 0
2363
/*
2364
 * Produces a duplicate copy of source and returns it.
2365
 * Returns NULL on failure
2366
 */
2367
sam_hrecs_t *sam_hrecs_dup(sam_hrecs_t *source) {
2368
        return NULL;
2369
}
2370
#endif
2371
/*! Deallocates all storage used by a sam_hrecs_t struct.
2372
 *
2373
 * This also decrements the header reference count. If after decrementing
2374
 * it is still non-zero then the header is assumed to be in use by another
2375
 * caller and the free is not done.
2376
 *
2377
 */
2378
3.57k
void sam_hrecs_free(sam_hrecs_t *hrecs) {
2379
3.57k
    if (!hrecs)
2380
0
        return;
2381
2382
3.57k
    if (hrecs->h)
2383
3.57k
        kh_destroy(sam_hrecs_t, hrecs->h);
2384
2385
3.57k
    if (hrecs->ref_hash)
2386
3.57k
        kh_destroy(m_s2i, hrecs->ref_hash);
2387
2388
3.57k
    if (hrecs->ref)
2389
1.20k
        free(hrecs->ref);
2390
2391
3.57k
    if (hrecs->rg_hash)
2392
3.57k
        kh_destroy(m_s2i, hrecs->rg_hash);
2393
2394
3.57k
    if (hrecs->rg)
2395
136
        free(hrecs->rg);
2396
2397
3.57k
    if (hrecs->pg_hash)
2398
3.57k
        kh_destroy(m_s2i, hrecs->pg_hash);
2399
2400
3.57k
    if (hrecs->pg)
2401
266
        free(hrecs->pg);
2402
2403
3.57k
    if (hrecs->pg_end)
2404
252
        free(hrecs->pg_end);
2405
2406
3.57k
    if (hrecs->type_pool)
2407
3.57k
        pool_destroy(hrecs->type_pool);
2408
2409
3.57k
    if (hrecs->tag_pool)
2410
3.57k
        pool_destroy(hrecs->tag_pool);
2411
2412
3.57k
    if (hrecs->str_pool)
2413
3.57k
        string_pool_destroy(hrecs->str_pool);
2414
2415
3.57k
    if (hrecs->type_order)
2416
3.57k
        free(hrecs->type_order);
2417
2418
3.57k
    if (hrecs->ID_buf)
2419
0
        free(hrecs->ID_buf);
2420
2421
3.57k
    free(hrecs);
2422
3.57k
}
2423
2424
/*
2425
 * Internal method already used by the CRAM code
2426
 * Returns the first header item matching 'type'. If ID is non-NULL it checks
2427
 * for the tag ID: and compares against the specified ID.
2428
 *
2429
 * Returns NULL if no type/ID is found
2430
 */
2431
sam_hrec_type_t *sam_hrecs_find_type_id(sam_hrecs_t *hrecs, const char *type,
2432
1.30k
                                     const char *ID_key, const char *ID_value) {
2433
1.30k
    if (!hrecs || !type)
2434
0
        return NULL;
2435
1.30k
    sam_hrec_type_t *t1, *t2;
2436
1.30k
    khint_t k;
2437
2438
    /* Special case for types we have prebuilt hashes on */
2439
1.30k
    if (ID_key) {
2440
1.30k
        if (!ID_value)
2441
0
            return NULL;
2442
2443
1.30k
        if (type[0]   == 'S' && type[1]   == 'Q' &&
2444
1.30k
            ID_key[0] == 'S' && ID_key[1] == 'N') {
2445
1.30k
            k = kh_get(m_s2i, hrecs->ref_hash, ID_value);
2446
1.30k
            return k != kh_end(hrecs->ref_hash)
2447
1.30k
                ? hrecs->ref[kh_val(hrecs->ref_hash, k)].ty
2448
1.30k
                : NULL;
2449
1.30k
        }
2450
2451
0
        if (type[0]   == 'R' && type[1]   == 'G' &&
2452
0
            ID_key[0] == 'I' && ID_key[1] == 'D') {
2453
0
            k = kh_get(m_s2i, hrecs->rg_hash, ID_value);
2454
0
            return k != kh_end(hrecs->rg_hash)
2455
0
                ? hrecs->rg[kh_val(hrecs->rg_hash, k)].ty
2456
0
                : NULL;
2457
0
        }
2458
2459
0
        if (type[0]   == 'P' && type[1]   == 'G' &&
2460
0
            ID_key[0] == 'I' && ID_key[1] == 'D') {
2461
0
            k = kh_get(m_s2i, hrecs->pg_hash, ID_value);
2462
0
            return k != kh_end(hrecs->pg_hash)
2463
0
                ? hrecs->pg[kh_val(hrecs->pg_hash, k)].ty
2464
0
                : NULL;
2465
0
        }
2466
0
    }
2467
2468
0
    k = kh_get(sam_hrecs_t, hrecs->h, TYPEKEY(type));
2469
0
    if (k == kh_end(hrecs->h))
2470
0
        return NULL;
2471
2472
0
    if (!ID_key)
2473
0
        return kh_val(hrecs->h, k);
2474
2475
0
    t1 = t2 = kh_val(hrecs->h, k);
2476
0
    do {
2477
0
        sam_hrec_tag_t *tag;
2478
0
        for (tag = t1->tag; tag; tag = tag->next) {
2479
0
            if (tag->str[0] == ID_key[0] && tag->str[1] == ID_key[1]) {
2480
0
                const char *cp1 = tag->str+3;
2481
0
                const char *cp2 = ID_value;
2482
0
                while (*cp1 && *cp1 == *cp2)
2483
0
                    cp1++, cp2++;
2484
0
                if (*cp2 || *cp1)
2485
0
                    continue;
2486
0
                return t1;
2487
0
            }
2488
0
        }
2489
0
        t1 = t1->next;
2490
0
    } while (t1 != t2);
2491
2492
0
    return NULL;
2493
0
}
2494
2495
/*
2496
 * Adds or updates tag key,value pairs in a header line.
2497
 * Eg for adding M5 tags to @SQ lines or updating sort order for the
2498
 * @HD line.
2499
 *
2500
 * va_list contains multiple key,value pairs ending in NULL.
2501
 *
2502
 * Returns 0 on success
2503
 *        -1 on failure
2504
 */
2505
334
int sam_hrecs_vupdate(sam_hrecs_t *hrecs, sam_hrec_type_t *type, va_list ap) {
2506
334
    if (!hrecs)
2507
0
        return -1;
2508
2509
668
    for (;;) {
2510
668
        char *k, *v, *str;
2511
668
        sam_hrec_tag_t *tag, *prev = NULL;
2512
2513
668
        if (!(k = (char *)va_arg(ap, char *)))
2514
334
            break;
2515
334
        if (!(v = va_arg(ap, char *)))
2516
0
            v = "";
2517
2518
334
        tag = sam_hrecs_find_key(type, k, &prev);
2519
334
        if (!tag) {
2520
0
            if (!(tag = pool_alloc(hrecs->tag_pool)))
2521
0
                return -1;
2522
0
            if (prev)
2523
0
                prev->next = tag;
2524
0
            else
2525
0
                type->tag = tag;
2526
2527
0
            tag->next = NULL;
2528
0
        }
2529
2530
334
        tag->len = 3 + strlen(v);
2531
334
        str = string_alloc(hrecs->str_pool, tag->len+1);
2532
334
        if (!str)
2533
0
            return -1;
2534
2535
334
        if (snprintf(str, tag->len+1, "%2.2s:%s", k, v) < 0)
2536
0
            return -1;
2537
2538
334
        tag->str = str;
2539
334
    }
2540
2541
334
    hrecs->dirty = 1; //mark text as dirty and force a rebuild
2542
2543
334
    return 0;
2544
334
}
2545
2546
/*
2547
 * Adds or updates tag key,value pairs in a header line.
2548
 * Eg for adding M5 tags to @SQ lines or updating sort order for the
2549
 * @HD line.
2550
 *
2551
 * Specify multiple key,value pairs ending in NULL.
2552
 *
2553
 * Returns 0 on success
2554
 *        -1 on failure
2555
 */
2556
334
static int sam_hrecs_update(sam_hrecs_t *hrecs, sam_hrec_type_t *type, ...) {
2557
334
    va_list args;
2558
334
    int res;
2559
334
    va_start(args, type);
2560
334
    res = sam_hrecs_vupdate(hrecs, type, args);
2561
334
    va_end(args);
2562
334
    return res;
2563
334
}
2564
2565
/*
2566
 * Looks for a specific key in a single sam header line identified by *type.
2567
 * If prev is non-NULL it also fills this out with the previous tag, to
2568
 * permit use in key removal. *prev is set to NULL when the tag is the first
2569
 * key in the list. When a tag isn't found, prev (if non NULL) will be the last
2570
 * tag in the existing list.
2571
 *
2572
 * Returns the tag pointer on success
2573
 *         NULL on failure
2574
 */
2575
sam_hrec_tag_t *sam_hrecs_find_key(sam_hrec_type_t *type,
2576
                                   const char *key,
2577
19.4k
                                   sam_hrec_tag_t **prev) {
2578
19.4k
    sam_hrec_tag_t *tag, *p = NULL;
2579
19.4k
    if (!type)
2580
0
        return NULL;
2581
2582
24.5k
    for (tag = type->tag; tag; p = tag, tag = tag->next) {
2583
23.2k
        if (tag->str[0] == key[0] && tag->str[1] == key[1]) {
2584
18.1k
            if (prev)
2585
334
                *prev = p;
2586
18.1k
            return tag;
2587
18.1k
        }
2588
23.2k
    }
2589
2590
1.30k
    if (prev)
2591
0
        *prev = p;
2592
2593
1.30k
    return NULL;
2594
19.4k
}
2595
2596
int sam_hrecs_remove_key(sam_hrecs_t *hrecs,
2597
                         sam_hrec_type_t *type,
2598
0
                         const char *key) {
2599
0
    sam_hrec_tag_t *tag, *prev;
2600
0
    if (!hrecs)
2601
0
        return -1;
2602
0
    tag = sam_hrecs_find_key(type, key, &prev);
2603
0
    if (!tag)
2604
0
        return 0; // Not there anyway
2605
2606
0
    if (type->type == TYPEKEY("SQ") && tag->str[0] == 'A' && tag->str[1] == 'N') {
2607
0
        assert(tag->len >= 3);
2608
0
        sam_hrec_tag_t *sn_tag = sam_hrecs_find_key(type, "SN", NULL);
2609
0
        if (sn_tag) {
2610
0
            assert(sn_tag->len >= 3);
2611
0
            khint_t k = kh_get(m_s2i, hrecs->ref_hash, sn_tag->str + 3);
2612
0
            if (k != kh_end(hrecs->ref_hash))
2613
0
                sam_hrecs_remove_ref_altnames(hrecs, kh_val(hrecs->ref_hash, k), tag->str + 3);
2614
0
        }
2615
0
    }
2616
2617
0
    if (!prev) { //first tag
2618
0
        type->tag = tag->next;
2619
0
    } else {
2620
0
        prev->next = tag->next;
2621
0
    }
2622
0
    pool_free(hrecs->tag_pool, tag);
2623
0
    hrecs->dirty = 1; //mark text as dirty and force a rebuild
2624
2625
0
    return 1;
2626
0
}
2627
2628
/*
2629
 * Looks up a read-group by name and returns a pointer to the start of the
2630
 * associated tag list.
2631
 *
2632
 * Returns NULL on failure
2633
 */
2634
0
sam_hrec_rg_t *sam_hrecs_find_rg(sam_hrecs_t *hrecs, const char *rg) {
2635
0
    khint_t k = kh_get(m_s2i, hrecs->rg_hash, rg);
2636
0
    return k == kh_end(hrecs->rg_hash)
2637
0
        ? NULL
2638
0
        : &hrecs->rg[kh_val(hrecs->rg_hash, k)];
2639
0
}
2640
2641
#if DEBUG_HEADER
2642
void sam_hrecs_dump(sam_hrecs_t *hrecs) {
2643
    khint_t k;
2644
    int i;
2645
2646
    printf("===DUMP===\n");
2647
    for (k = kh_begin(hrecs->h); k != kh_end(hrecs->h); k++) {
2648
        sam_hrec_type_t *t1, *t2;
2649
        char c[2];
2650
        int idx = 0;
2651
2652
        if (!kh_exist(hrecs->h, k))
2653
            continue;
2654
2655
        t1 = t2 = kh_val(hrecs->h, k);
2656
        c[0] = kh_key(hrecs->h, k)>>8;
2657
        c[1] = kh_key(hrecs->h, k)&0xff;
2658
        printf("Type %.2s\n", c);
2659
2660
        do {
2661
            sam_hrec_tag_t *tag;
2662
            printf(">>>%d ", idx++);
2663
            for (tag = t1->tag; tag; tag=tag->next) {
2664
                if (strncmp(c, "CO", 2))
2665
                    printf("\"%.2s\":\"%.*s\"\t", tag->str, tag->len-3, tag->str+3);
2666
                else
2667
                    printf("%s", tag->str);
2668
            }
2669
            putchar('\n');
2670
            t1 = t1->next;
2671
        } while (t1 != t2);
2672
    }
2673
2674
    /* Dump out PG chains */
2675
    printf("\n@PG chains:\n");
2676
    for (i = 0; i < hrecs->npg_end; i++) {
2677
        int j;
2678
        printf("  %d:", i);
2679
        for (j = hrecs->pg_end[i]; j != -1; j = hrecs->pg[j].prev_id) {
2680
            printf("%s%d(%.*s)",
2681
                   j == hrecs->pg_end[i] ? " " : "->",
2682
                   j, hrecs->pg[j].name_len, hrecs->pg[j].name);
2683
        }
2684
        printf("\n");
2685
    }
2686
2687
    puts("===END DUMP===");
2688
}
2689
#endif
2690
2691
/*
2692
 * Returns the sort order:
2693
 */
2694
0
enum sam_sort_order sam_hrecs_sort_order(sam_hrecs_t *hrecs) {
2695
0
    khint_t k;
2696
0
    enum sam_sort_order so;
2697
2698
0
    so = ORDER_UNKNOWN;
2699
0
    k = kh_get(sam_hrecs_t, hrecs->h, TYPEKEY("HD"));
2700
0
    if (k != kh_end(hrecs->h)) {
2701
0
        sam_hrec_type_t *ty = kh_val(hrecs->h, k);
2702
0
        sam_hrec_tag_t *tag;
2703
0
        for (tag = ty->tag; tag; tag = tag->next) {
2704
0
            if (tag->str[0] == 'S' && tag->str[1] == 'O') {
2705
0
                if (strcmp(tag->str+3, "unsorted") == 0)
2706
0
                    so = ORDER_UNSORTED;
2707
0
                else if (strcmp(tag->str+3, "queryname") == 0)
2708
0
                    so = ORDER_NAME;
2709
0
                else if (strcmp(tag->str+3, "coordinate") == 0)
2710
0
                    so = ORDER_COORD;
2711
0
                else if (strcmp(tag->str+3, "unknown") != 0)
2712
0
                    hts_log_error("Unknown sort order field: %s", tag->str+3);
2713
0
            }
2714
0
        }
2715
0
    }
2716
2717
0
    return so;
2718
0
}
2719
2720
0
enum sam_group_order sam_hrecs_group_order(sam_hrecs_t *hrecs) {
2721
0
    khint_t k;
2722
0
    enum sam_group_order go;
2723
2724
0
    go = ORDER_NONE;
2725
0
    k = kh_get(sam_hrecs_t, hrecs->h, TYPEKEY("HD"));
2726
0
    if (k != kh_end(hrecs->h)) {
2727
0
        sam_hrec_type_t *ty = kh_val(hrecs->h, k);
2728
0
        sam_hrec_tag_t *tag;
2729
0
        for (tag = ty->tag; tag; tag = tag->next) {
2730
0
            if (tag->str[0] == 'G' && tag->str[1] == 'O') {
2731
0
                if (strcmp(tag->str+3, "query") == 0)
2732
0
                    go = ORDER_QUERY;
2733
0
                else if (strcmp(tag->str+3, "reference") == 0)
2734
0
                    go = ORDER_REFERENCE;
2735
0
            }
2736
0
        }
2737
0
    }
2738
2739
0
    return go;
2740
0
}