Coverage Report

Created: 2023-01-17 06:24

/src/htslib/header.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
Copyright (c) 2018-2020 Genome Research Ltd.
3
Authors: James Bonfield <jkb@sanger.ac.uk>, Valeriu Ohan <vo2@sanger.ac.uk>
4
5
Redistribution and use in source and binary forms, with or without
6
modification, are permitted provided that the following conditions are met:
7
8
   1. Redistributions of source code must retain the above copyright notice,
9
this list of conditions and the following disclaimer.
10
11
   2. Redistributions in binary form must reproduce the above copyright notice,
12
this list of conditions and the following disclaimer in the documentation
13
and/or other materials provided with the distribution.
14
15
   3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
16
Institute nor the names of its contributors may be used to endorse or promote
17
products derived from this software without specific prior written permission.
18
19
THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
20
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
23
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
*/
30
31
#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
32
#include <config.h>
33
34
#include <string.h>
35
#include <assert.h>
36
#include <errno.h>
37
#include "textutils_internal.h"
38
#include "header.h"
39
40
// Hash table for removing multiple lines from the header
41
KHASH_SET_INIT_STR(rm)
42
// Used for long refs in SAM files
43
KHASH_DECLARE(s2i, kh_cstr_t, int64_t)
44
45
typedef khash_t(rm) rmhash_t;
46
47
static int sam_hdr_link_pg(sam_hdr_t *bh);
48
49
static int sam_hrecs_vupdate(sam_hrecs_t *hrecs, sam_hrec_type_t *type, va_list ap);
50
static int sam_hrecs_update(sam_hrecs_t *hrecs, sam_hrec_type_t *type, ...);
51
52
53
221
#define MAX_ERROR_QUOTE 320 // Prevent over-long error messages
54
135
static void sam_hrecs_error(const char *msg, const char *line, size_t len, size_t lno) {
55
135
    int j;
56
57
135
    if (len > MAX_ERROR_QUOTE)
58
86
        len = MAX_ERROR_QUOTE;
59
16.0k
    for (j = 0; j < len && line[j] != '\n'; j++)
60
15.9k
        ;
61
135
    hts_log_error("%s at line %zd: \"%.*s\"", msg, lno, j, line);
62
135
}
63
64
/* ==== Static methods ==== */
65
66
1.16k
static int sam_hrecs_init_type_order(sam_hrecs_t *hrecs, char *type_list) {
67
1.16k
    if (!hrecs)
68
0
        return -1;
69
70
1.16k
    if (!type_list) {
71
1.16k
        hrecs->type_count = 5;
72
1.16k
        hrecs->type_order = calloc(hrecs->type_count, 3);
73
1.16k
        if (!hrecs->type_order)
74
0
            return -1;
75
1.16k
        memcpy(hrecs->type_order[0], "HD", 2);
76
1.16k
        memcpy(hrecs->type_order[1], "SQ", 2);
77
1.16k
        memcpy(hrecs->type_order[2], "RG", 2);
78
1.16k
        memcpy(hrecs->type_order[3], "PG", 2);
79
1.16k
        memcpy(hrecs->type_order[4], "CO", 2);
80
1.16k
    }
81
82
1.16k
    return 0;
83
1.16k
}
84
85
1.67k
static int sam_hrecs_add_ref_altnames(sam_hrecs_t *hrecs, int nref, const char *list) {
86
1.67k
    const char *token;
87
1.67k
    ks_tokaux_t aux;
88
89
1.67k
    if (!list)
90
1.61k
        return 0;
91
92
282k
    for (token = kstrtok(list, ",", &aux); token; token = kstrtok(NULL, NULL, &aux)) {
93
282k
        if (aux.p == token)
94
275k
            continue;
95
96
6.93k
        char *name = string_ndup(hrecs->str_pool, token, aux.p - token);
97
6.93k
        if (!name)
98
0
            return -1;
99
6.93k
        int r;
100
6.93k
        khint_t k = kh_put(m_s2i, hrecs->ref_hash, name, &r);
101
6.93k
        if (r < 0) return -1;
102
103
6.93k
        if (r > 0)
104
803
            kh_val(hrecs->ref_hash, k) = nref;
105
6.13k
        else if (kh_val(hrecs->ref_hash, k) != nref)
106
94
            hts_log_warning("Duplicate entry AN:\"%s\" in sam header", name);
107
6.93k
    }
108
109
56
    return 0;
110
56
}
111
112
0
static void sam_hrecs_remove_ref_altnames(sam_hrecs_t *hrecs, int expected, const char *list) {
113
0
    const char *token, *sn;
114
0
    ks_tokaux_t aux;
115
0
    kstring_t str = KS_INITIALIZE;
116
117
0
    if (expected < 0 || expected >= hrecs->nref)
118
0
        return;
119
0
    sn = hrecs->ref[expected].name;
120
121
0
    for (token = kstrtok(list, ",", &aux); token; token = kstrtok(NULL, NULL, &aux)) {
122
0
        kputsn(token, aux.p - token, ks_clear(&str));
123
0
        khint_t k = kh_get(m_s2i, hrecs->ref_hash, str.s);
124
0
        if (k != kh_end(hrecs->ref_hash)
125
0
            && kh_val(hrecs->ref_hash, k) == expected
126
0
            && strcmp(sn, str.s) != 0)
127
0
            kh_del(m_s2i, hrecs->ref_hash, k);
128
0
    }
129
130
0
    free(str.s);
131
0
}
132
133
/* Updates the hash tables in the sam_hrecs_t structure.
134
 *
135
 * Returns 0 on success;
136
 *        -1 on failure
137
 */
138
static int sam_hrecs_update_hashes(sam_hrecs_t *hrecs,
139
                                   khint32_t type,
140
2.73M
                                   sam_hrec_type_t *h_type) {
141
    /* Add to reference hash? */
142
2.73M
    if (type == TYPEKEY("SQ")) {
143
1.68k
        sam_hrec_tag_t *tag = h_type->tag;
144
1.68k
        int nref = hrecs->nref;
145
1.68k
        const char *name = NULL;
146
1.68k
        const char *altnames = NULL;
147
1.68k
        hts_pos_t len = -1;
148
1.68k
        int r;
149
1.68k
        khint_t k;
150
151
10.9k
        while (tag) {
152
9.22k
            if (tag->str[0] == 'S' && tag->str[1] == 'N') {
153
3.47k
                assert(tag->len >= 3);
154
3.47k
                name = tag->str+3;
155
5.74k
            } else if (tag->str[0] == 'L' && tag->str[1] == 'N') {
156
3.22k
                assert(tag->len >= 3);
157
3.22k
                len = strtoll(tag->str+3, NULL, 10);
158
3.22k
            } else if (tag->str[0] == 'A' && tag->str[1] == 'N') {
159
1.57k
                assert(tag->len >= 3);
160
1.57k
                altnames = tag->str+3;
161
1.57k
            }
162
9.22k
            tag = tag->next;
163
9.22k
        }
164
165
1.68k
        if (!name) {
166
2
            hts_log_error("Header includes @SQ line with no SN: tag");
167
2
            return -1; // SN should be present, according to spec.
168
2
        }
169
170
1.68k
        if (len == -1) {
171
1
            hts_log_error("Header includes @SQ line \"%s\" with no LN: tag",
172
1
                          name);
173
1
            return -1; // LN should be present, according to spec.
174
1
        }
175
176
        // Seen already?
177
1.68k
        k = kh_get(m_s2i, hrecs->ref_hash, name);
178
1.68k
        if (k < kh_end(hrecs->ref_hash)) {
179
1.38k
            nref = kh_val(hrecs->ref_hash, k);
180
1.38k
            int ref_changed_flag = 0;
181
182
            // Check for hash entry added by sam_hrecs_refs_from_targets_array()
183
1.38k
            if (hrecs->ref[nref].ty == NULL) {
184
                // Attach header line to existing stub entry.
185
1.37k
                hrecs->ref[nref].ty = h_type;
186
                // Check lengths match; correct if not.
187
1.37k
                if (len != hrecs->ref[nref].len) {
188
106
                    char tmp[32];
189
106
                    snprintf(tmp, sizeof(tmp), "%" PRIhts_pos,
190
106
                             hrecs->ref[nref].len);
191
106
                    if (sam_hrecs_update(hrecs, h_type, "LN", tmp, NULL) < 0)
192
0
                        return -1;
193
106
                    ref_changed_flag = 1;
194
106
                }
195
1.37k
                if (sam_hrecs_add_ref_altnames(hrecs, nref, altnames) < 0)
196
0
                    return -1;
197
198
1.37k
                if (ref_changed_flag && (hrecs->refs_changed < 0 || hrecs->refs_changed > nref))
199
39
                    hrecs->refs_changed = nref;
200
1.37k
                return 0;
201
1.37k
            }
202
203
            // Check to see if an existing entry is being updated
204
13
            if (hrecs->ref[nref].ty == h_type) {
205
0
                if (hrecs->ref[nref].len != len) {
206
0
                    hrecs->ref[nref].len = len;
207
0
                    ref_changed_flag = 1;
208
0
                }
209
0
                if (!hrecs->ref[nref].name || strcmp(hrecs->ref[nref].name, name)) {
210
0
                    hrecs->ref[nref].name = name;
211
0
                    ref_changed_flag = 1;
212
0
                }
213
0
                if (sam_hrecs_add_ref_altnames(hrecs, nref, altnames) < 0)
214
0
                    return -1;
215
216
0
                if (ref_changed_flag && (hrecs->refs_changed < 0 || hrecs->refs_changed > nref))
217
0
                    hrecs->refs_changed = nref;
218
0
                return 0;
219
0
            }
220
221
            // If here, the name is a duplicate.
222
            // Check to see if it matches the SN: tag from the earlier record.
223
13
            if (strcmp(hrecs->ref[nref].name, name) == 0) {
224
9
                hts_log_error("Duplicate entry \"%s\" in sam header",
225
9
                                name);
226
9
                return -1;
227
9
            }
228
229
            // Clash with an already-seen altname
230
            // As SN: should be preferred to AN: add this as a new
231
            // record and update the hash entry to point to it.
232
4
            hts_log_warning("Ref name SN:\"%s\" is a duplicate of an existing AN key", name);
233
4
            nref = hrecs->nref;
234
4
        }
235
236
301
        if (nref == hrecs->ref_sz) {
237
65
            size_t new_sz = hrecs->ref_sz >= 4 ? hrecs->ref_sz + (hrecs->ref_sz / 4) : 32;
238
65
            sam_hrec_sq_t *new_ref = realloc(hrecs->ref, sizeof(*hrecs->ref) * new_sz);
239
65
            if (!new_ref)
240
0
                return -1;
241
65
            hrecs->ref = new_ref;
242
65
            hrecs->ref_sz = new_sz;
243
65
        }
244
245
301
        hrecs->ref[nref].name = name;
246
301
        hrecs->ref[nref].len  = len;
247
301
        hrecs->ref[nref].ty = h_type;
248
249
301
        k = kh_put(m_s2i, hrecs->ref_hash, hrecs->ref[nref].name, &r);
250
301
        if (-1 == r) return -1;
251
301
        kh_val(hrecs->ref_hash, k) = nref;
252
253
301
        if (sam_hrecs_add_ref_altnames(hrecs, nref, altnames) < 0)
254
0
            return -1;
255
256
301
        if (hrecs->refs_changed < 0 || hrecs->refs_changed > hrecs->nref)
257
28
            hrecs->refs_changed = hrecs->nref;
258
301
        hrecs->nref++;
259
301
    }
260
261
    /* Add to read-group hash? */
262
2.73M
    if (type == TYPEKEY("RG")) {
263
14.9k
        sam_hrec_tag_t *tag = sam_hrecs_find_key(h_type, "ID", NULL);
264
14.9k
        int nrg = hrecs->nrg, r;
265
14.9k
        khint_t k;
266
267
14.9k
        if (!tag) {
268
0
            hts_log_error("Header includes @RG line with no ID: tag");
269
0
            return -1;  // ID should be present, according to spec.
270
0
        }
271
14.9k
        assert(tag->str && tag->len >= 3);
272
273
        // Seen already?
274
14.9k
        k = kh_get(m_s2i, hrecs->rg_hash, tag->str + 3);
275
14.9k
        if (k < kh_end(hrecs->rg_hash)) {
276
14.1k
            nrg = kh_val(hrecs->rg_hash, k);
277
14.1k
            assert(hrecs->rg[nrg].ty != NULL);
278
14.1k
            if (hrecs->rg[nrg].ty != h_type) {
279
14.1k
                hts_log_warning("Duplicate entry \"%s\" in sam header",
280
14.1k
                                tag->str + 3);
281
14.1k
            } else {
282
0
                hrecs->rg[nrg].name = tag->str + 3;
283
0
                hrecs->rg[nrg].name_len = tag->len - 3;
284
0
            }
285
14.1k
            return 0;
286
14.1k
        }
287
288
741
        if (nrg == hrecs->rg_sz) {
289
342
            size_t new_sz = hrecs->rg_sz >= 4 ? hrecs->rg_sz + hrecs->rg_sz / 4 : 4;
290
342
            sam_hrec_rg_t *new_rg = realloc(hrecs->rg, sizeof(*hrecs->rg) * new_sz);
291
342
            if (!new_rg)
292
0
                return -1;
293
342
            hrecs->rg = new_rg;
294
342
            hrecs->rg_sz = new_sz;
295
342
        }
296
297
741
        hrecs->rg[nrg].name = tag->str + 3;
298
741
        hrecs->rg[nrg].name_len = tag->len - 3;
299
741
        hrecs->rg[nrg].ty   = h_type;
300
741
        hrecs->rg[nrg].id   = nrg;
301
302
741
        k = kh_put(m_s2i, hrecs->rg_hash, hrecs->rg[nrg].name, &r);
303
741
        if (-1 == r) return -1;
304
741
        kh_val(hrecs->rg_hash, k) = nrg;
305
306
741
        hrecs->nrg++;
307
741
    }
308
309
    /* Add to program hash? */
310
2.72M
    if (type == TYPEKEY("PG")) {
311
2.70M
        sam_hrec_tag_t *tag;
312
2.70M
        sam_hrec_pg_t *new_pg;
313
2.70M
        int npg = hrecs->npg;
314
315
2.70M
        if (npg == hrecs->pg_sz) {
316
1.08k
            size_t new_sz = hrecs->pg_sz >= 4 ? hrecs->pg_sz + hrecs->pg_sz / 4 : 4;
317
1.08k
            new_pg = realloc(hrecs->pg, sizeof(*hrecs->pg) * new_sz);
318
1.08k
            if (!new_pg)
319
0
                return -1;
320
1.08k
            hrecs->pg = new_pg;
321
1.08k
            hrecs->pg_sz = new_sz;
322
1.08k
        }
323
324
2.70M
        tag = h_type->tag;
325
2.70M
        hrecs->pg[npg].name = NULL;
326
2.70M
        hrecs->pg[npg].name_len = 0;
327
2.70M
        hrecs->pg[npg].ty  = h_type;
328
2.70M
        hrecs->pg[npg].id   = npg;
329
2.70M
        hrecs->pg[npg].prev_id = -1;
330
331
5.50M
        while (tag) {
332
2.79M
            if (tag->str[0] == 'I' && tag->str[1] == 'D') {
333
                /* Avoid duplicate ID tags coming from other applications */
334
2.73M
                if (!hrecs->pg[npg].name) {
335
2.70M
                    assert(tag->len >= 3);
336
2.70M
                    hrecs->pg[npg].name = tag->str + 3;
337
2.70M
                    hrecs->pg[npg].name_len = tag->len - 3;
338
2.70M
                } else {
339
28.7k
                    hts_log_warning("PG line with multiple ID tags. The first encountered was preferred - ID:%s", hrecs->pg[npg].name);
340
28.7k
                }
341
2.73M
            } else if (tag->str[0] == 'P' && tag->str[1] == 'P') {
342
                // Resolve later if needed
343
38.4k
                khint_t k;
344
38.4k
                k = kh_get(m_s2i, hrecs->pg_hash, tag->str+3);
345
346
38.4k
                if (k != kh_end(hrecs->pg_hash)) {
347
14.3k
                    int p_id = kh_val(hrecs->pg_hash, k);
348
14.3k
                    hrecs->pg[npg].prev_id = hrecs->pg[p_id].id;
349
350
                    /* Unmark previous entry as a PG termination */
351
14.3k
                    if (hrecs->npg_end > 0 &&
352
14.3k
                        hrecs->pg_end[hrecs->npg_end-1] == p_id) {
353
3.42k
                        hrecs->npg_end--;
354
10.9k
                    } else {
355
10.9k
                        int i;
356
24.5M
                        for (i = 0; i < hrecs->npg_end; i++) {
357
24.5M
                            if (hrecs->pg_end[i] == p_id) {
358
4.28k
                                memmove(&hrecs->pg_end[i], &hrecs->pg_end[i+1],
359
4.28k
                                        (hrecs->npg_end-i-1)*sizeof(*hrecs->pg_end));
360
4.28k
                                hrecs->npg_end--;
361
4.28k
                            }
362
24.5M
                        }
363
10.9k
                    }
364
24.0k
                } else {
365
24.0k
                    hrecs->pg[npg].prev_id = -1;
366
24.0k
                }
367
38.4k
            }
368
2.79M
            tag = tag->next;
369
2.79M
        }
370
371
2.70M
        if (hrecs->pg[npg].name) {
372
2.70M
            khint_t k;
373
2.70M
            int r;
374
2.70M
            k = kh_put(m_s2i, hrecs->pg_hash, hrecs->pg[npg].name, &r);
375
2.70M
            if (-1 == r) return -1;
376
2.70M
            kh_val(hrecs->pg_hash, k) = npg;
377
2.70M
        } else {
378
8
            return -1; // ID should be present, according to spec.
379
8
        }
380
381
        /* Add to npg_end[] array. Remove later if we find a PP line */
382
2.70M
        if (hrecs->npg_end >= hrecs->npg_end_alloc) {
383
373
            int *new_pg_end;
384
373
            int  new_alloc = hrecs->npg_end_alloc ? hrecs->npg_end_alloc*2 : 4;
385
386
373
            new_pg_end = realloc(hrecs->pg_end, new_alloc * sizeof(int));
387
373
            if (!new_pg_end)
388
0
                return -1;
389
373
            hrecs->npg_end_alloc = new_alloc;
390
373
            hrecs->pg_end = new_pg_end;
391
373
        }
392
2.70M
        hrecs->pg_end[hrecs->npg_end++] = npg;
393
394
2.70M
        hrecs->npg++;
395
2.70M
    }
396
397
2.72M
    return 0;
398
2.72M
}
399
400
0
static int sam_hrecs_remove_hash_entry(sam_hrecs_t *hrecs, khint32_t type, sam_hrec_type_t *h_type) {
401
0
    if (!hrecs || !h_type)
402
0
        return -1;
403
404
0
    sam_hrec_tag_t *tag;
405
0
    const char *key = NULL;
406
0
    khint_t k;
407
408
    /* Remove name and any alternative names from reference hash */
409
0
    if (type == TYPEKEY("SQ")) {
410
0
        const char *altnames = NULL;
411
412
0
        tag = h_type->tag;
413
414
0
        while (tag) {
415
0
            if (tag->str[0] == 'S' && tag->str[1] == 'N') {
416
0
                assert(tag->len >= 3);
417
0
                key = tag->str + 3;
418
0
            } else if (tag->str[0] == 'A' && tag->str[1] == 'N') {
419
0
                assert(tag->len >= 3);
420
0
                altnames = tag->str + 3;
421
0
            }
422
0
            tag = tag->next;
423
0
        }
424
425
0
        if (key) {
426
0
            k = kh_get(m_s2i, hrecs->ref_hash, key);
427
0
            if (k != kh_end(hrecs->ref_hash)) {
428
0
                int idx = kh_val(hrecs->ref_hash, k);
429
0
                if (idx + 1 < hrecs->nref)
430
0
                    memmove(&hrecs->ref[idx], &hrecs->ref[idx+1],
431
0
                            sizeof(sam_hrec_sq_t)*(hrecs->nref - idx - 1));
432
0
                if (altnames)
433
0
                    sam_hrecs_remove_ref_altnames(hrecs, idx, altnames);
434
0
                kh_del(m_s2i, hrecs->ref_hash, k);
435
0
                hrecs->nref--;
436
0
                if (hrecs->refs_changed < 0 || hrecs->refs_changed > idx)
437
0
                    hrecs->refs_changed = idx;
438
0
                for (k = 0; k < kh_end(hrecs->ref_hash); k++) {
439
0
                    if (kh_exist(hrecs->ref_hash, k)
440
0
                        && kh_value(hrecs->ref_hash, k) > idx) {
441
0
                        kh_value(hrecs->ref_hash, k)--;
442
0
                    }
443
0
                }
444
0
            }
445
0
        }
446
0
    }
447
448
    /* Remove from read-group hash */
449
0
    if (type == TYPEKEY("RG")) {
450
0
        tag = h_type->tag;
451
452
0
        while (tag) {
453
0
            if (tag->str[0] == 'I' && tag->str[1] == 'D') {
454
0
                assert(tag->len >= 3);
455
0
                key = tag->str + 3;
456
0
                k = kh_get(m_s2i, hrecs->rg_hash, key);
457
0
                if (k != kh_end(hrecs->rg_hash)) {
458
0
                    int idx = kh_val(hrecs->rg_hash, k);
459
0
                    if (idx + 1 < hrecs->nrg)
460
0
                        memmove(&hrecs->rg[idx], &hrecs->rg[idx+1], sizeof(sam_hrec_rg_t)*(hrecs->nrg - idx - 1));
461
0
                    kh_del(m_s2i, hrecs->rg_hash, k);
462
0
                    hrecs->nrg--;
463
0
                    for (k = 0; k < kh_end(hrecs->rg_hash); k++) {
464
0
                        if (kh_exist(hrecs->rg_hash, k)
465
0
                            && kh_value(hrecs->rg_hash, k) > idx) {
466
0
                            kh_value(hrecs->rg_hash, k)--;
467
0
                        }
468
0
                    }
469
0
                }
470
0
                break;
471
0
            }
472
0
            tag = tag->next;
473
0
        }
474
0
    }
475
476
0
    return 0;
477
0
}
478
479
/** Add a header record to the global line ordering
480
 *
481
 * If @p after is not NULL, the new record will be inserted after this one,
482
 * otherwise it will go at the end.
483
 *
484
 * An exception is an HD record, which will always be put first unless
485
 * one is already present.
486
 */
487
static void sam_hrecs_global_list_add(sam_hrecs_t *hrecs,
488
                                      sam_hrec_type_t *h_type,
489
2.73M
                                      sam_hrec_type_t *after) {
490
2.73M
    const khint32_t hd_type = TYPEKEY("HD");
491
2.73M
    int update_first_line = 0;
492
493
    // First line seen
494
2.73M
    if (!hrecs->first_line) {
495
567
        hrecs->first_line = h_type->global_next = h_type->global_prev = h_type;
496
567
        return;
497
567
    }
498
499
    // @HD goes at the top (unless there's one already)
500
2.73M
    if (h_type->type == hd_type && hrecs->first_line->type != hd_type) {
501
16
        after = hrecs->first_line->global_prev;
502
16
        update_first_line = 1;
503
16
    }
504
505
    // If no instructions given, put it at the end
506
2.73M
    if (!after)
507
2.73M
        after = hrecs->first_line->global_prev;
508
509
2.73M
    h_type->global_prev = after;
510
2.73M
    h_type->global_next = after->global_next;
511
2.73M
    h_type->global_prev->global_next = h_type;
512
2.73M
    h_type->global_next->global_prev = h_type;
513
514
2.73M
    if (update_first_line)
515
16
        hrecs->first_line = h_type;
516
2.73M
}
517
518
/*! Add header record with a va_list interface.
519
 *
520
 * Adds a single record to a SAM header.
521
 *
522
 * This takes a header record type, a va_list argument and one or more
523
 * key,value pairs, ending with the NULL key.
524
 *
525
 * Eg. sam_hrecs_vadd(h, "SQ", args, "ID", "foo", "LN", "100", NULL).
526
 *
527
 * The purpose of the additional va_list parameter is to permit other
528
 * varargs functions to call this while including their own additional
529
 * parameters; an example is in sam_hdr_add_pg().
530
 *
531
 * Note: this function invokes va_arg at least once, making the value
532
 * of ap indeterminate after the return. The caller should call
533
 * va_start/va_end before/after calling this function or use va_copy.
534
 *
535
 * @return
536
 * Returns >= 0 on success;
537
 *        -1 on failure
538
 */
539
920
static int sam_hrecs_vadd(sam_hrecs_t *hrecs, const char *type, va_list ap, ...) {
540
920
    va_list args;
541
920
    sam_hrec_type_t *h_type;
542
920
    sam_hrec_tag_t *h_tag, *last=NULL;
543
920
    int new;
544
920
    khint32_t type_i = TYPEKEY(type), k;
545
546
920
    if (!strncmp(type, "HD", 2) && (h_type = sam_hrecs_find_type_id(hrecs, "HD", NULL, NULL)))
547
0
        return sam_hrecs_vupdate(hrecs, h_type, ap);
548
549
920
    if (!(h_type = pool_alloc(hrecs->type_pool)))
550
0
        return -1;
551
920
    k = kh_put(sam_hrecs_t, hrecs->h, type_i, &new);
552
920
    if (new < 0)
553
0
        return -1;
554
555
920
    h_type->type = type_i;
556
557
    // Form the ring, either with self or other lines of this type
558
920
    if (!new) {
559
840
        sam_hrec_type_t *t = kh_val(hrecs->h, k), *p;
560
840
        p = t->prev;
561
562
840
        assert(p->next == t);
563
840
        p->next = h_type;
564
840
        h_type->prev = p;
565
566
840
        t->prev = h_type;
567
840
        h_type->next = t;
568
840
    } else {
569
80
        kh_val(hrecs->h, k) = h_type;
570
80
        h_type->prev = h_type->next = h_type;
571
80
    }
572
920
    h_type->tag = NULL;
573
574
    // Add to global line ordering after any existing line of the same type,
575
    // or at the end if no line of this type exists yet.
576
920
    sam_hrecs_global_list_add(hrecs, h_type, !new ? h_type->prev : NULL);
577
578
    // Check linked-list invariants
579
920
    assert(h_type->prev->next == h_type);
580
920
    assert(h_type->next->prev == h_type);
581
920
    assert(h_type->global_prev->global_next == h_type);
582
920
    assert(h_type->global_next->global_prev == h_type);
583
584
    // Any ... varargs
585
920
    va_start(args, ap);
586
920
    for (;;) {
587
920
        char *key, *val = NULL, *str;
588
589
920
        if (!(key = (char *)va_arg(args, char *)))
590
920
            break;
591
0
        if (strncmp(type, "CO", 2) && !(val = (char *)va_arg(args, char *)))
592
0
            break;
593
0
        if (*val == '\0')
594
0
            continue;
595
596
0
        if (!(h_tag = pool_alloc(hrecs->tag_pool)))
597
0
            return -1;
598
599
0
        if (strncmp(type, "CO", 2)) {
600
0
            h_tag->len = 3 + strlen(val);
601
0
            str = string_alloc(hrecs->str_pool, h_tag->len+1);
602
0
            if (!str || snprintf(str, h_tag->len+1, "%2.2s:%s", key, val) < 0)
603
0
                return -1;
604
0
            h_tag->str = str;
605
0
        } else {
606
0
            h_tag->len = strlen(key);
607
0
            h_tag->str = string_ndup(hrecs->str_pool, key, h_tag->len);
608
0
            if (!h_tag->str)
609
0
                return -1;
610
0
        }
611
612
0
        h_tag->next = NULL;
613
0
        if (last)
614
0
            last->next = h_tag;
615
0
        else
616
0
            h_type->tag = h_tag;
617
618
0
        last = h_tag;
619
0
    }
620
920
    va_end(args);
621
622
    // Plus the specified va_list params
623
2.76k
    for (;;) {
624
2.76k
        char *key, *val = NULL, *str;
625
626
2.76k
        if (!(key = (char *)va_arg(ap, char *)))
627
920
            break;
628
1.84k
        if (strncmp(type, "CO", 2) && !(val = (char *)va_arg(ap, char *)))
629
0
            break;
630
631
1.84k
        if (!(h_tag = pool_alloc(hrecs->tag_pool)))
632
0
            return -1;
633
634
1.84k
        if (strncmp(type, "CO", 2)) {
635
1.84k
            h_tag->len = 3 + strlen(val);
636
1.84k
            str = string_alloc(hrecs->str_pool, h_tag->len+1);
637
1.84k
            if (!str || snprintf(str, h_tag->len+1, "%2.2s:%s", key, val) < 0)
638
0
                return -1;
639
1.84k
            h_tag->str = str;
640
1.84k
        } else {
641
0
            h_tag->len = strlen(key);
642
0
            h_tag->str = string_ndup(hrecs->str_pool, key, h_tag->len);
643
0
            if (!h_tag->str)
644
0
                return -1;
645
0
        }
646
647
1.84k
        h_tag->next = NULL;
648
1.84k
        if (last)
649
920
            last->next = h_tag;
650
920
        else
651
920
            h_type->tag = h_tag;
652
653
1.84k
        last = h_tag;
654
1.84k
    }
655
656
920
    if (-1 == sam_hrecs_update_hashes(hrecs, TYPEKEY(type), h_type))
657
0
        return -1;
658
659
920
    if (!strncmp(type, "PG", 2))
660
0
        hrecs->pgs_changed = 1;
661
662
920
    hrecs->dirty = 1;
663
664
920
    return 0;
665
920
}
666
667
// As sam_hrecs_vadd(), but without the extra va_list parameter
668
920
static int sam_hrecs_add(sam_hrecs_t *hrecs, const char *type, ...) {
669
920
    va_list args;
670
920
    int res;
671
920
    va_start(args, type);
672
920
    res = sam_hrecs_vadd(hrecs, type, args, NULL);
673
920
    va_end(args);
674
920
    return res;
675
920
}
676
677
/*
678
 * Function for deallocating a list of tags
679
 */
680
681
0
static void sam_hrecs_free_tags(sam_hrecs_t *hrecs, sam_hrec_tag_t *tag) {
682
0
    if (!hrecs || !tag)
683
0
        return;
684
0
    if (tag->next)
685
0
        sam_hrecs_free_tags(hrecs, tag->next);
686
687
0
    pool_free(hrecs->tag_pool, tag);
688
0
}
689
690
0
static int sam_hrecs_remove_line(sam_hrecs_t *hrecs, const char *type_name, sam_hrec_type_t *type_found) {
691
0
    if (!hrecs || !type_name || !type_found)
692
0
        return -1;
693
694
0
    khint32_t itype = TYPEKEY(type_name);
695
0
    khint_t k = kh_get(sam_hrecs_t, hrecs->h, itype);
696
0
    if (k == kh_end(hrecs->h))
697
0
        return -1;
698
699
    // Remove from global list (remembering it could be the only line)
700
0
    if (hrecs->first_line == type_found) {
701
0
        hrecs->first_line = (type_found->global_next != type_found
702
0
                             ? type_found->global_next : NULL);
703
0
    }
704
0
    type_found->global_next->global_prev = type_found->global_prev;
705
0
    type_found->global_prev->global_next = type_found->global_next;
706
707
    /* single element in the list */
708
0
    if (type_found->prev == type_found || type_found->next == type_found) {
709
0
        kh_del(sam_hrecs_t, hrecs->h, k);
710
0
    } else {
711
0
        type_found->prev->next = type_found->next;
712
0
        type_found->next->prev = type_found->prev;
713
0
        if (kh_val(hrecs->h, k) == type_found) { //first element
714
0
            kh_val(hrecs->h, k) = type_found->next;
715
0
        }
716
0
    }
717
718
0
    if (!strncmp(type_name, "SQ", 2) || !strncmp(type_name, "RG", 2))
719
0
        sam_hrecs_remove_hash_entry(hrecs, itype, type_found);
720
721
0
    sam_hrecs_free_tags(hrecs, type_found->tag);
722
0
    pool_free(hrecs->type_pool, type_found);
723
724
0
    hrecs->dirty = 1;
725
726
0
    return 0;
727
0
}
728
729
// Paste together a line from the parsed data structures
730
1.53M
static int build_header_line(const sam_hrec_type_t *ty, kstring_t *ks) {
731
1.53M
    sam_hrec_tag_t *tag;
732
1.53M
    int r = 0;
733
1.53M
    char c[2]= { ty->type >> 8, ty->type & 0xff };
734
735
1.53M
    r |= (kputc_('@', ks) == EOF);
736
1.53M
    r |= (kputsn(c, 2, ks) == EOF);
737
3.14M
    for (tag = ty->tag; tag; tag = tag->next) {
738
1.60M
        r |= (kputc_('\t', ks) == EOF);
739
1.60M
        r |= (kputsn(tag->str, tag->len, ks) == EOF);
740
1.60M
    }
741
742
1.53M
    return r;
743
1.53M
}
744
745
412
static int sam_hrecs_rebuild_lines(const sam_hrecs_t *hrecs, kstring_t *ks) {
746
412
    const sam_hrec_type_t *t1, *t2;
747
748
412
    if (!hrecs->first_line)
749
0
        return kputsn("", 0, ks) >= 0 ? 0 : -1;
750
751
412
    t1 = t2 = hrecs->first_line;
752
1.53M
    do {
753
1.53M
        if (build_header_line(t1, ks) != 0)
754
0
            return -1;
755
1.53M
        if (kputc('\n', ks) < 0)
756
0
            return -1;
757
758
1.53M
        t1 = t1->global_next;
759
1.53M
    } while (t1 != t2);
760
761
412
    return 0;
762
412
}
763
764
596
static int sam_hrecs_parse_lines(sam_hrecs_t *hrecs, const char *hdr, size_t len) {
765
596
    size_t i, lno;
766
767
596
    if (!hrecs || len > SSIZE_MAX)
768
0
        return -1;
769
770
596
    if (!len)
771
0
        len = strlen(hdr);
772
773
596
    if (len < 3) {
774
23
        if (len == 0 || *hdr == '\0') return 0;
775
0
        sam_hrecs_error("Header line too short", hdr, len, 1);
776
0
        return -1;
777
23
    }
778
779
2.73M
    for (i = 0, lno = 1; i < len - 3 && hdr[i] != '\0'; i++, lno++) {
780
2.73M
        khint32_t type;
781
2.73M
        khint_t k;
782
783
2.73M
        int l_start = i, new;
784
2.73M
        sam_hrec_type_t *h_type;
785
2.73M
        sam_hrec_tag_t *h_tag, *last;
786
787
2.73M
        if (hdr[i] != '@') {
788
30
            sam_hrecs_error("Header line does not start with '@'",
789
30
                          &hdr[l_start], len - l_start, lno);
790
30
            return -1;
791
30
        }
792
793
2.73M
        if (!isalpha_c(hdr[i+1]) || !isalpha_c(hdr[i+2])) {
794
15
            sam_hrecs_error("Header line does not have a two character key",
795
15
                          &hdr[l_start], len - l_start, lno);
796
15
            return -1;
797
15
        }
798
2.73M
        type = TYPEKEY(&hdr[i+1]);
799
800
2.73M
        i += 3;
801
2.73M
        if (i == len || hdr[i] == '\n')
802
0
            continue;
803
804
        // Add the header line type
805
2.73M
        if (!(h_type = pool_alloc(hrecs->type_pool)))
806
0
            return -1;
807
2.73M
        k = kh_put(sam_hrecs_t, hrecs->h, type, &new);
808
2.73M
        if (new < 0)
809
0
            return -1;
810
811
2.73M
        h_type->type = type;
812
813
        // Add to end of global list
814
2.73M
        sam_hrecs_global_list_add(hrecs, h_type, NULL);
815
816
        // Form the ring, either with self or other lines of this type
817
2.73M
        if (!new) {
818
2.73M
            sam_hrec_type_t *t = kh_val(hrecs->h, k), *p;
819
2.73M
            p = t->prev;
820
821
2.73M
            assert(p->next == t);
822
2.73M
            p->next = h_type;
823
2.73M
            h_type->prev = p;
824
825
2.73M
            t->prev = h_type;
826
2.73M
            h_type->next = t;
827
2.73M
        } else {
828
781
            kh_val(hrecs->h, k) = h_type;
829
781
            h_type->prev = h_type->next = h_type;
830
781
        }
831
832
        // Parse the tags on this line
833
2.73M
        last = NULL;
834
2.73M
        if (type == TYPEKEY("CO")) {
835
705
            size_t j;
836
837
705
            if (i == len || hdr[i] != '\t') {
838
2
                sam_hrecs_error("Missing tab",
839
2
                              &hdr[l_start], len - l_start, lno);
840
2
                return -1;
841
2
            }
842
843
349k
            for (j = ++i; j < len && hdr[j] != '\0' && hdr[j] != '\n'; j++)
844
348k
                ;
845
846
703
            if (!(h_type->tag = h_tag = pool_alloc(hrecs->tag_pool)))
847
0
                return -1;
848
703
            h_tag->str = string_ndup(hrecs->str_pool, &hdr[i], j-i);
849
703
            h_tag->len = j-i;
850
703
            h_tag->next = NULL;
851
703
            if (!h_tag->str)
852
0
                return -1;
853
854
703
            i = j;
855
856
2.73M
        } else {
857
2.84M
            do {
858
2.84M
                size_t j;
859
860
2.84M
                if (i == len || hdr[i] != '\t') {
861
11
                    sam_hrecs_error("Missing tab",
862
11
                                  &hdr[l_start], len - l_start, lno);
863
11
                    return -1;
864
11
                }
865
866
108M
                for (j = ++i; j < len && hdr[j] != '\0' && hdr[j] != '\n' && hdr[j] != '\t'; j++)
867
105M
                    ;
868
869
2.84M
                if (j - i < 3 || hdr[i + 2] != ':') {
870
77
                    sam_hrecs_error("Malformed key:value pair",
871
77
                                   &hdr[l_start], len - l_start, lno);
872
77
                    return -1;
873
77
                }
874
875
2.84M
                if (!(h_tag = pool_alloc(hrecs->tag_pool)))
876
0
                    return -1;
877
2.84M
                h_tag->str = string_ndup(hrecs->str_pool, &hdr[i], j-i);
878
2.84M
                h_tag->len = j-i;
879
2.84M
                h_tag->next = NULL;
880
2.84M
                if (!h_tag->str)
881
0
                    return -1;
882
883
2.84M
                if (last)
884
106k
                    last->next = h_tag;
885
2.73M
                else
886
2.73M
                    h_type->tag = h_tag;
887
888
2.84M
                last = h_tag;
889
2.84M
                i = j;
890
2.84M
            } while (i < len && hdr[i] != '\0' && hdr[i] != '\n');
891
2.73M
        }
892
893
        /* Update RG/SQ hashes */
894
2.73M
        if (-1 == sam_hrecs_update_hashes(hrecs, type, h_type))
895
20
            return -1;
896
2.73M
    }
897
898
418
    return 0;
899
573
}
900
901
/*! Update sam_hdr_t target_name and target_len arrays
902
 *
903
 *  @return 0 on success; -1 on failure
904
 */
905
int sam_hdr_update_target_arrays(sam_hdr_t *bh, const sam_hrecs_t *hrecs,
906
289
                                 int refs_changed) {
907
289
    if (!bh || !hrecs)
908
0
        return -1;
909
910
289
    if (refs_changed < 0)
911
0
        return 0;
912
913
    // Grow arrays if necessary
914
289
    if (bh->n_targets < hrecs->nref) {
915
38
        char **new_names = realloc(bh->target_name,
916
38
                                   hrecs->nref * sizeof(*new_names));
917
38
        if (!new_names)
918
0
            return -1;
919
38
        bh->target_name = new_names;
920
38
        uint32_t *new_lens = realloc(bh->target_len,
921
38
                                     hrecs->nref * sizeof(*new_lens));
922
38
        if (!new_lens)
923
0
            return -1;
924
38
        bh->target_len = new_lens;
925
38
    }
926
927
    // Update names and lengths where changed
928
    // hrecs->refs_changed is the first ref that has been updated, so ones
929
    // before that can be skipped.
930
289
    int i;
931
289
    khint_t k;
932
289
    khash_t(s2i) *long_refs = (khash_t(s2i) *) bh->sdict;
933
1.43k
    for (i = refs_changed; i < hrecs->nref; i++) {
934
1.14k
        if (i >= bh->n_targets
935
1.14k
            || strcmp(bh->target_name[i], hrecs->ref[i].name) != 0) {
936
436
            if (i < bh->n_targets)
937
0
                free(bh->target_name[i]);
938
436
            bh->target_name[i] = strdup(hrecs->ref[i].name);
939
436
            if (!bh->target_name[i])
940
0
                return -1;
941
436
        }
942
1.14k
        if (hrecs->ref[i].len < UINT32_MAX) {
943
756
            bh->target_len[i] = hrecs->ref[i].len;
944
945
756
            if (!long_refs)
946
342
                continue;
947
948
            // Check if we have an old length, if so remove it.
949
414
            k = kh_get(s2i, long_refs, bh->target_name[i]);
950
414
            if (k < kh_end(long_refs))
951
414
                kh_del(s2i, long_refs, k);
952
414
        } else {
953
392
            bh->target_len[i] = UINT32_MAX;
954
392
            if (bh->hrecs != hrecs) {
955
                // Called from sam_hdr_dup; need to add sdict entries
956
42
                if (!long_refs) {
957
11
                    if (!(bh->sdict = long_refs = kh_init(s2i)))
958
0
                        return -1;
959
11
                }
960
961
                // Add / update length
962
42
                int absent;
963
42
                k = kh_put(s2i, long_refs, bh->target_name[i], &absent);
964
42
                if (absent < 0)
965
0
                    return -1;
966
42
                kh_val(long_refs, k) = hrecs->ref[i].len;
967
42
            }
968
392
        }
969
1.14k
    }
970
971
    // Free up any names that have been removed
972
289
    for (; i < bh->n_targets; i++) {
973
0
        if (long_refs) {
974
0
            k = kh_get(s2i, long_refs, bh->target_name[i]);
975
0
            if (k < kh_end(long_refs))
976
0
                kh_del(s2i, long_refs, k);
977
0
        }
978
0
        free(bh->target_name[i]);
979
0
    }
980
981
289
    bh->n_targets = hrecs->nref;
982
289
    return 0;
983
289
}
984
985
44
static int rebuild_target_arrays(sam_hdr_t *bh) {
986
44
    if (!bh || !bh->hrecs)
987
0
        return -1;
988
989
44
    sam_hrecs_t *hrecs = bh->hrecs;
990
44
    if (hrecs->refs_changed < 0)
991
0
        return 0;
992
993
44
    if (sam_hdr_update_target_arrays(bh, hrecs, hrecs->refs_changed) != 0)
994
0
        return -1;
995
996
44
    hrecs->refs_changed = -1;
997
44
    return 0;
998
44
}
999
1000
/// Populate hrecs refs array from header target_name, target_len arrays
1001
/**
1002
 * @return 0 on success; -1 on failure
1003
 *
1004
 * Pre-fills the refs hash from the target arrays.  For BAM files this
1005
 * will ensure that they are in the correct order as the target arrays
1006
 * are the canonical source for converting target ids to names and lengths.
1007
 *
1008
 * The added entries do not link to a header line. sam_hrecs_update_hashes()
1009
 * will add the links later for lines found in the text header.
1010
 *
1011
 * This should be called before the text header is parsed.
1012
 */
1013
static int sam_hrecs_refs_from_targets_array(sam_hrecs_t *hrecs,
1014
325
                                             const sam_hdr_t *bh) {
1015
325
    int32_t tid = 0;
1016
1017
325
    if (!hrecs || !bh)
1018
0
        return -1;
1019
1020
    // This should always be called before parsing the text header
1021
    // so the ref array should start off empty, and we don't have to try
1022
    // to reconcile any existing data.
1023
325
    if (hrecs->nref > 0) {
1024
0
        hts_log_error("Called with non-empty ref array");
1025
0
        return -1;
1026
0
    }
1027
1028
325
    if (hrecs->ref_sz < bh->n_targets) {
1029
325
        sam_hrec_sq_t *new_ref = realloc(hrecs->ref,
1030
325
                                         bh->n_targets * sizeof(*new_ref));
1031
325
        if (!new_ref)
1032
0
            return -1;
1033
1034
325
        hrecs->ref = new_ref;
1035
325
        hrecs->ref_sz = bh->n_targets;
1036
325
    }
1037
1038
2.35k
    for (tid = 0; tid < bh->n_targets; tid++) {
1039
2.03k
        khint_t k;
1040
2.03k
        int r;
1041
2.03k
        hrecs->ref[tid].name = string_dup(hrecs->str_pool, bh->target_name[tid]);
1042
2.03k
        if (!hrecs->ref[tid].name) goto fail;
1043
2.03k
        if (bh->target_len[tid] < UINT32_MAX || !bh->sdict) {
1044
1.33k
            hrecs->ref[tid].len  = bh->target_len[tid];
1045
1.33k
        } else {
1046
702
            khash_t(s2i) *long_refs = (khash_t(s2i) *) bh->sdict;
1047
702
            k = kh_get(s2i, long_refs, hrecs->ref[tid].name);
1048
702
            if (k < kh_end(long_refs)) {
1049
695
                hrecs->ref[tid].len = kh_val(long_refs, k);
1050
695
            } else {
1051
7
                hrecs->ref[tid].len = UINT32_MAX;
1052
7
            }
1053
702
        }
1054
2.03k
        hrecs->ref[tid].ty   = NULL;
1055
2.03k
        k = kh_put(m_s2i, hrecs->ref_hash, hrecs->ref[tid].name, &r);
1056
2.03k
        if (r < 0) goto fail;
1057
2.03k
        if (r == 0) {
1058
0
            hts_log_error("Duplicate entry \"%s\" in target list",
1059
0
                            hrecs->ref[tid].name);
1060
0
            return -1;
1061
2.03k
        } else {
1062
2.03k
            kh_val(hrecs->ref_hash, k) = tid;
1063
2.03k
        }
1064
2.03k
    }
1065
325
    hrecs->nref = bh->n_targets;
1066
325
    return 0;
1067
1068
0
 fail: {
1069
0
        int32_t i;
1070
0
        hts_log_error("%s", strerror(errno));
1071
0
        for (i = 0; i < tid; i++) {
1072
0
            khint_t k;
1073
0
            if (!hrecs->ref[i].name) continue;
1074
0
            k = kh_get(m_s2i, hrecs->ref_hash, hrecs->ref[tid].name);
1075
0
            if (k < kh_end(hrecs->ref_hash)) kh_del(m_s2i, hrecs->ref_hash, k);
1076
0
        }
1077
0
        hrecs->nref = 0;
1078
0
        return -1;
1079
325
    }
1080
325
}
1081
1082
/*
1083
 * Add SQ header records for any references in the hrecs->ref array that
1084
 * were added by sam_hrecs_refs_from_targets_array() but have not
1085
 * been linked to an @SQ line by sam_hrecs_update_hashes() yet.
1086
 *
1087
 * This may be needed either because:
1088
 *
1089
 *   - A bam file was read that had entries in its refs list with no
1090
 *     corresponding @SQ line.
1091
 *
1092
 *   - A program constructed a sam_hdr_t which has target_name and target_len
1093
 *     array entries with no corresponding @SQ line in text.
1094
 */
1095
1.01k
static int add_stub_ref_sq_lines(sam_hrecs_t *hrecs) {
1096
1.01k
    int tid;
1097
1.01k
    char len[32];
1098
1099
2.24k
    for (tid = 0; tid < hrecs->nref; tid++) {
1100
1.23k
        if (hrecs->ref[tid].ty == NULL) {
1101
920
            snprintf(len, sizeof(len), "%"PRIhts_pos, hrecs->ref[tid].len);
1102
920
            if (sam_hrecs_add(hrecs, "SQ",
1103
920
                              "SN", hrecs->ref[tid].name,
1104
920
                              "LN", len, NULL) != 0)
1105
0
                return -1;
1106
1107
            // Check that the stub has actually been filled
1108
920
            if(hrecs->ref[tid].ty == NULL) {
1109
0
                hts_log_error("Reference stub with tid=%d, name=\"%s\", len=%"PRIhts_pos" could not be filled",
1110
0
                        tid, hrecs->ref[tid].name, hrecs->ref[tid].len);
1111
0
                return -1;
1112
0
            }
1113
920
        }
1114
1.23k
    }
1115
1.01k
    return 0;
1116
1.01k
}
1117
1118
1.16k
int sam_hdr_fill_hrecs(sam_hdr_t *bh) {
1119
1.16k
    sam_hrecs_t *hrecs = sam_hrecs_new();
1120
1121
1.16k
    if (!hrecs)
1122
0
        return -1;
1123
1124
1.16k
    if (bh->target_name && bh->target_len && bh->n_targets > 0) {
1125
325
        if (sam_hrecs_refs_from_targets_array(hrecs, bh) != 0) {
1126
0
            sam_hrecs_free(hrecs);
1127
0
            return -1;
1128
0
        }
1129
325
    }
1130
1131
    // Parse existing header text
1132
1.16k
    if (bh->text && bh->l_text > 0) {
1133
547
        if (sam_hrecs_parse_lines(hrecs, bh->text, bh->l_text) != 0) {
1134
154
            sam_hrecs_free(hrecs);
1135
154
            return -1;
1136
154
        }
1137
547
    }
1138
1139
1.01k
    if (add_stub_ref_sq_lines(hrecs) < 0) {
1140
0
        sam_hrecs_free(hrecs);
1141
0
        return -1;
1142
0
    }
1143
1144
1.01k
    bh->hrecs = hrecs;
1145
1146
1.01k
    if (hrecs->refs_changed >= 0 && rebuild_target_arrays(bh) != 0)
1147
0
        return -1;
1148
1149
1.01k
    return 0;
1150
1.01k
}
1151
1152
/** Remove outdated header text
1153
1154
    @param bh     BAM header
1155
1156
    This is called when API functions have changed the header so that the
1157
    text version is no longer valid.
1158
 */
1159
48
static void redact_header_text(sam_hdr_t *bh) {
1160
48
    assert(bh->hrecs && bh->hrecs->dirty);
1161
48
    bh->l_text = 0;
1162
48
    free(bh->text);
1163
48
    bh->text = NULL;
1164
48
}
1165
1166
/** Find nth header record of a given type
1167
1168
    @param type   Header type (SQ, RG etc.)
1169
    @param idx    0-based index
1170
1171
    @return sam_hrec_type_t pointer to the record on success
1172
            NULL if no record exists with the given type and index
1173
 */
1174
1175
static sam_hrec_type_t *sam_hrecs_find_type_pos(sam_hrecs_t *hrecs,
1176
0
                                                const char *type, int idx) {
1177
0
    sam_hrec_type_t *first, *itr;
1178
1179
0
    if (idx < 0)
1180
0
        return NULL;
1181
1182
0
    if (type[0] == 'S' && type[1] == 'Q')
1183
0
        return idx < hrecs->nref ? hrecs->ref[idx].ty : NULL;
1184
1185
0
    if (type[0] == 'R' && type[1] == 'G')
1186
0
        return idx < hrecs->nrg ? hrecs->rg[idx].ty : NULL;
1187
1188
0
    if (type[0] == 'P' && type[1] == 'G')
1189
0
        return idx < hrecs->npg ? hrecs->pg[idx].ty : NULL;
1190
1191
0
    first = itr = sam_hrecs_find_type_id(hrecs, type, NULL, NULL);
1192
0
    if (!first)
1193
0
        return NULL;
1194
1195
0
    while (idx > 0) {
1196
0
        itr = itr->next;
1197
0
        if (itr == first)
1198
0
            break;
1199
0
        --idx;
1200
0
    }
1201
1202
0
    return idx == 0 ? itr : NULL;
1203
0
}
1204
1205
/* ==== Public methods ==== */
1206
1207
0
size_t sam_hdr_length(sam_hdr_t *bh) {
1208
0
    if (!bh || -1 == sam_hdr_rebuild(bh))
1209
0
        return SIZE_MAX;
1210
1211
0
    return bh->l_text;
1212
0
}
1213
1214
0
const char *sam_hdr_str(sam_hdr_t *bh) {
1215
0
    if (!bh || -1 == sam_hdr_rebuild(bh))
1216
0
        return NULL;
1217
1218
0
    return bh->text;
1219
0
}
1220
1221
0
int sam_hdr_nref(const sam_hdr_t *bh) {
1222
0
    if (!bh)
1223
0
        return -1;
1224
1225
0
    return bh->hrecs ? bh->hrecs->nref : bh->n_targets;
1226
0
}
1227
1228
/*
1229
 * Reconstructs the text representation from the header hash table.
1230
 * Returns 0 on success
1231
 *        -1 on failure
1232
 */
1233
0
int sam_hdr_rebuild(sam_hdr_t *bh) {
1234
0
    sam_hrecs_t *hrecs;
1235
0
    if (!bh)
1236
0
        return -1;
1237
1238
0
    if (!(hrecs = bh->hrecs))
1239
0
        return bh->text ? 0 : -1;
1240
1241
0
    if (hrecs->refs_changed >= 0) {
1242
0
        if (rebuild_target_arrays(bh) < 0) {
1243
0
            hts_log_error("Header target array rebuild has failed");
1244
0
            return -1;
1245
0
        }
1246
0
    }
1247
1248
    /* If header text wasn't changed or header is empty, don't rebuild it. */
1249
0
    if (!hrecs->dirty)
1250
0
        return 0;
1251
1252
0
    if (hrecs->pgs_changed && sam_hdr_link_pg(bh) < 0) {
1253
0
        hts_log_error("Linking @PG lines has failed");
1254
0
        return -1;
1255
0
    }
1256
1257
0
    kstring_t ks = KS_INITIALIZE;
1258
0
    if (sam_hrecs_rebuild_text(hrecs, &ks) != 0) {
1259
0
        ks_free(&ks);
1260
0
        hts_log_error("Header text rebuild has failed");
1261
0
        return -1;
1262
0
    }
1263
1264
0
    hrecs->dirty = 0;
1265
1266
    /* Sync */
1267
0
    free(bh->text);
1268
0
    bh->l_text = ks_len(&ks);
1269
0
    bh->text = ks_release(&ks);
1270
1271
0
    return 0;
1272
0
}
1273
1274
/*
1275
 * Appends a formatted line to an existing SAM header.
1276
 * Line is a full SAM header record, eg "@SQ\tSN:foo\tLN:100", with
1277
 * optional new-line. If it contains more than 1 line then multiple lines
1278
 * will be added in order.
1279
 *
1280
 * Input text is of maximum length len or as terminated earlier by a NUL.
1281
 * len may be 0 if unknown, in which case lines must be NUL-terminated.
1282
 *
1283
 * Returns 0 on success
1284
 *        -1 on failure
1285
 */
1286
246
int sam_hdr_add_lines(sam_hdr_t *bh, const char *lines, size_t len) {
1287
246
    sam_hrecs_t *hrecs;
1288
1289
246
    if (!bh || !lines)
1290
0
        return -1;
1291
1292
246
    if (len == 0 && *lines == '\0')
1293
197
        return 0;
1294
1295
49
    if (!(hrecs = bh->hrecs)) {
1296
49
        if (sam_hdr_fill_hrecs(bh) != 0)
1297
0
            return -1;
1298
49
        hrecs = bh->hrecs;
1299
49
    }
1300
1301
49
    if (sam_hrecs_parse_lines(hrecs, lines, len) != 0)
1302
1
        return -1;
1303
1304
48
    if (hrecs->refs_changed >= 0 && rebuild_target_arrays(bh) != 0)
1305
0
        return -1;
1306
1307
48
    hrecs->dirty = 1;
1308
48
    redact_header_text(bh);
1309
1310
48
    return 0;
1311
48
}
1312
1313
/*
1314
 * Adds a single line to a SAM header.
1315
 * Specify type and one or more key,value pairs, ending with the NULL key.
1316
 * Eg. sam_hdr_add_line(h, "SQ", "ID", "foo", "LN", "100", NULL).
1317
 *
1318
 * Returns 0 on success
1319
 *        -1 on failure
1320
 */
1321
0
int sam_hdr_add_line(sam_hdr_t *bh, const char *type, ...) {
1322
0
    va_list args;
1323
0
    sam_hrecs_t *hrecs;
1324
1325
0
    if (!bh || !type)
1326
0
        return -1;
1327
1328
0
    if (!(hrecs = bh->hrecs)) {
1329
0
        if (sam_hdr_fill_hrecs(bh) != 0)
1330
0
            return -1;
1331
0
        hrecs = bh->hrecs;
1332
0
    }
1333
1334
0
    va_start(args, type);
1335
0
    int ret = sam_hrecs_vadd(hrecs, type, args, NULL);
1336
0
    va_end(args);
1337
1338
0
    if (ret == 0) {
1339
0
        if (hrecs->refs_changed >= 0 && rebuild_target_arrays(bh) != 0)
1340
0
            return -1;
1341
1342
0
        if (hrecs->dirty)
1343
0
            redact_header_text(bh);
1344
0
    }
1345
1346
0
    return ret;
1347
0
}
1348
1349
/*
1350
 * Returns a complete line of formatted text for a specific head type/ID
1351
 * combination. If ID_key is NULL then it returns the first line of the specified
1352
 * type.
1353
 */
1354
int sam_hdr_find_line_id(sam_hdr_t *bh, const char *type,
1355
0
                      const char *ID_key, const char *ID_val, kstring_t *ks) {
1356
0
    sam_hrecs_t *hrecs;
1357
0
    if (!bh || !type)
1358
0
        return -2;
1359
1360
0
    if (!(hrecs = bh->hrecs)) {
1361
0
        if (sam_hdr_fill_hrecs(bh) != 0)
1362
0
            return -2;
1363
0
        hrecs = bh->hrecs;
1364
0
    }
1365
1366
0
    sam_hrec_type_t *ty = sam_hrecs_find_type_id(hrecs, type, ID_key, ID_val);
1367
0
    if (!ty)
1368
0
        return -1;
1369
1370
0
    ks->l = 0;
1371
0
    if (build_header_line(ty, ks) < 0) {
1372
0
        return -2;
1373
0
    }
1374
1375
0
    return 0;
1376
0
}
1377
1378
int sam_hdr_find_line_pos(sam_hdr_t *bh, const char *type,
1379
0
                          int pos, kstring_t *ks) {
1380
0
    sam_hrecs_t *hrecs;
1381
0
    if (!bh || !type)
1382
0
        return -2;
1383
1384
0
    if (!(hrecs = bh->hrecs)) {
1385
0
        if (sam_hdr_fill_hrecs(bh) != 0)
1386
0
            return -2;
1387
0
        hrecs = bh->hrecs;
1388
0
    }
1389
1390
0
    sam_hrec_type_t *ty = sam_hrecs_find_type_pos(hrecs, type, pos);
1391
0
    if (!ty)
1392
0
        return -1;
1393
1394
0
    ks->l = 0;
1395
0
    if (build_header_line(ty, ks) < 0) {
1396
0
        return -2;
1397
0
    }
1398
1399
0
    return 0;
1400
0
}
1401
1402
/*
1403
 * Remove a line from the header by specifying a tag:value that uniquely
1404
 * identifies a line, i.e. the @SQ line containing "SN:ref1".
1405
 * @SQ line is uniquely identified by SN tag.
1406
 * @RG line is uniquely identified by ID tag.
1407
 * @PG line is uniquely identified by ID tag.
1408
 *
1409
 * Returns 0 on success and -1 on error
1410
 */
1411
1412
0
int sam_hdr_remove_line_id(sam_hdr_t *bh, const char *type, const char *ID_key, const char *ID_value) {
1413
0
    sam_hrecs_t *hrecs;
1414
0
    if (!bh || !type)
1415
0
        return -1;
1416
1417
0
    if (!(hrecs = bh->hrecs)) {
1418
0
        if (sam_hdr_fill_hrecs(bh) != 0)
1419
0
            return -1;
1420
0
        hrecs = bh->hrecs;
1421
0
    }
1422
1423
0
    if (!strncmp(type, "PG", 2)) {
1424
0
        hts_log_warning("Removing PG lines is not supported!");
1425
0
        return -1;
1426
0
    }
1427
1428
0
    sam_hrec_type_t *type_found = sam_hrecs_find_type_id(hrecs, type, ID_key, ID_value);
1429
0
    if (!type_found)
1430
0
        return 0;
1431
1432
0
    int ret = sam_hrecs_remove_line(hrecs, type, type_found);
1433
0
    if (ret == 0) {
1434
0
        if (hrecs->refs_changed >= 0 && rebuild_target_arrays(bh) != 0)
1435
0
            return -1;
1436
1437
0
        if (hrecs->dirty)
1438
0
            redact_header_text(bh);
1439
0
    }
1440
1441
0
    return ret;
1442
0
}
1443
1444
/*
1445
 * Remove a line from the header by specifying the position in the type
1446
 * group, i.e. 3rd @SQ line.
1447
 *
1448
 * Returns 0 on success and -1 on error
1449
 */
1450
1451
0
int sam_hdr_remove_line_pos(sam_hdr_t *bh, const char *type, int position) {
1452
0
    sam_hrecs_t *hrecs;
1453
0
    if (!bh || !type || position <= 0)
1454
0
        return -1;
1455
1456
0
    if (!(hrecs = bh->hrecs)) {
1457
0
        if (sam_hdr_fill_hrecs(bh) != 0)
1458
0
            return -1;
1459
0
        hrecs = bh->hrecs;
1460
0
    }
1461
1462
0
    if (!strncmp(type, "PG", 2)) {
1463
0
        hts_log_warning("Removing PG lines is not supported!");
1464
0
        return -1;
1465
0
    }
1466
1467
0
    sam_hrec_type_t *type_found = sam_hrecs_find_type_pos(hrecs, type,
1468
0
                                                          position);
1469
0
    if (!type_found)
1470
0
        return -1;
1471
1472
0
    int ret = sam_hrecs_remove_line(hrecs, type, type_found);
1473
0
    if (ret == 0) {
1474
0
        if (hrecs->refs_changed >= 0 && rebuild_target_arrays(bh) != 0)
1475
0
            return -1;
1476
1477
0
        if (hrecs->dirty)
1478
0
            redact_header_text(bh);
1479
0
    }
1480
1481
0
    return ret;
1482
0
}
1483
1484
/*
1485
 * Check if sam_hdr_update_line() is being used to change the name of
1486
 * a record, and if the new name is going to clash with an existing one.
1487
 *
1488
 * If ap includes repeated keys, we go with the last one as sam_hrecs_vupdate()
1489
 * will go through them all and leave the final one in place.
1490
 *
1491
 * Returns 0 if the name does not change
1492
 *         1 if the name changes but does not clash
1493
 *        -1 if the name changes and the new one is already in use
1494
 */
1495
static int check_for_name_update(sam_hrecs_t *hrecs, sam_hrec_type_t *rec,
1496
                                 va_list ap, const char **old_name,
1497
                                 const char **new_name,
1498
                                 char id_tag_out[3],
1499
0
                                 khash_t(m_s2i) **hash_out) {
1500
0
    char *key, *val;
1501
0
    const char *id_tag;
1502
0
    sam_hrec_tag_t *tag, *prev;
1503
0
    khash_t(m_s2i) *hash;
1504
0
    khint_t k;
1505
0
    int ret = 0;
1506
1507
0
    if        (rec->type == TYPEKEY("SQ")) {
1508
0
        id_tag = "SN"; hash = hrecs->ref_hash;
1509
0
    } else if (rec->type == TYPEKEY("RG")) {
1510
0
        id_tag = "ID"; hash = hrecs->rg_hash;
1511
0
    } else if (rec->type == TYPEKEY("PG")) {
1512
0
        id_tag = "ID"; hash = hrecs->pg_hash;
1513
0
    } else {
1514
0
        return 0;
1515
0
    }
1516
1517
0
    memcpy(id_tag_out, id_tag, 3);
1518
0
    *hash_out = hash;
1519
1520
0
    tag = sam_hrecs_find_key(rec, id_tag, &prev);
1521
0
    if (!tag)
1522
0
        return 0;
1523
0
    assert(tag->len >= 3);
1524
0
    *old_name = tag->str + 3;
1525
1526
0
    while ((key = va_arg(ap, char *)) != NULL) {
1527
0
        val = va_arg(ap, char *);
1528
0
        if (!val) val = "";
1529
0
        if (strcmp(key, id_tag) != 0) continue;
1530
0
        if (strcmp(val, tag->str + 3) == 0) { ret = 0; continue; }
1531
0
        k = kh_get(m_s2i, hash, val);
1532
0
        ret = k < kh_end(hash) ? -1 : 1;
1533
0
        *new_name = val;
1534
0
    }
1535
0
    return ret;
1536
0
}
1537
1538
int sam_hdr_update_line(sam_hdr_t *bh, const char *type,
1539
0
        const char *ID_key, const char *ID_value, ...) {
1540
0
    sam_hrecs_t *hrecs;
1541
0
    if (!bh)
1542
0
        return -1;
1543
1544
0
    if (!(hrecs = bh->hrecs)) {
1545
0
        if (sam_hdr_fill_hrecs(bh) != 0)
1546
0
            return -1;
1547
0
        hrecs = bh->hrecs;
1548
0
    }
1549
1550
0
    int ret, rename;
1551
0
    sam_hrec_type_t *ty = sam_hrecs_find_type_id(hrecs, type, ID_key, ID_value);
1552
0
    if (!ty)
1553
0
        return -1;
1554
1555
0
    va_list args;
1556
0
    const char *old_name = "?", *new_name = "?";
1557
0
    char id_tag[3];
1558
0
    khash_t(m_s2i) *hash = NULL;
1559
0
    va_start(args, ID_value);
1560
0
    rename = check_for_name_update(hrecs, ty, args,
1561
0
                                   &old_name, &new_name, id_tag, &hash);
1562
0
    va_end(args);
1563
0
    if (rename < 0) {
1564
0
        hts_log_error("Cannot rename @%s \"%s\" to \"%s\" : already exists",
1565
0
                      type, old_name, new_name);
1566
0
        return -1;
1567
0
    }
1568
0
    if (rename > 0 && TYPEKEY(type) == TYPEKEY("PG")) {
1569
        // This is just too complicated
1570
0
        hts_log_error("Renaming @PG records is not supported");
1571
0
        return -1;
1572
0
    }
1573
0
    va_start(args, ID_value);
1574
0
    ret = sam_hrecs_vupdate(hrecs, ty, args);
1575
0
    va_end(args);
1576
1577
0
    if (ret)
1578
0
        return ret;
1579
1580
    // TODO Account for @SQ-AN altnames
1581
1582
0
    if (rename) {
1583
        // Adjust the hash table to point to the new name
1584
        // sam_hrecs_update_hashes() should sort out everything else
1585
0
        khint_t k = kh_get(m_s2i, hash, old_name);
1586
0
        sam_hrec_tag_t *new_tag = sam_hrecs_find_key(ty, id_tag, NULL);
1587
0
        int r, pos;
1588
0
        assert(k < kh_end(hash));        // Or we wouldn't have found it earlier
1589
0
        assert(new_tag && new_tag->str); // id_tag should exist
1590
0
        assert(new_tag->len > 3);
1591
0
        pos = kh_val(hash, k);
1592
0
        kh_del(m_s2i, hash, k);
1593
0
        k = kh_put(m_s2i, hash, new_tag->str + 3, &r);
1594
0
        if (r < 1) {
1595
0
            hts_log_error("Failed to rename item in hash table");
1596
0
            return -1;
1597
0
        }
1598
0
        kh_val(hash, k) = pos;
1599
0
    }
1600
1601
0
    ret = sam_hrecs_update_hashes(hrecs, TYPEKEY(type), ty);
1602
1603
0
    if (!ret && hrecs->refs_changed >= 0)
1604
0
        ret = rebuild_target_arrays(bh);
1605
1606
0
    if (!ret && hrecs->dirty)
1607
0
        redact_header_text(bh);
1608
1609
0
    return ret;
1610
0
}
1611
1612
0
int sam_hdr_remove_except(sam_hdr_t *bh, const char *type, const char *ID_key, const char *ID_value) {
1613
0
    sam_hrecs_t *hrecs;
1614
0
    if (!bh || !type)
1615
0
        return -1;
1616
1617
0
    if (!(hrecs = bh->hrecs)) {
1618
0
        if (sam_hdr_fill_hrecs(bh) != 0)
1619
0
            return -1;
1620
0
        hrecs = bh->hrecs;
1621
0
    }
1622
1623
0
    sam_hrec_type_t *step;
1624
0
    int ret = 1, remove_all = (ID_key == NULL);
1625
1626
0
    if (!strncmp(type, "PG", 2) || !strncmp(type, "CO", 2)) {
1627
0
        hts_log_warning("Removing PG or CO lines is not supported!");
1628
0
        return -1;
1629
0
    }
1630
1631
0
    sam_hrec_type_t *type_found = sam_hrecs_find_type_id(hrecs, type, ID_key, ID_value);
1632
0
    if (!type_found) { // remove all line of this type
1633
0
        khint_t k = kh_get(sam_hrecs_t, hrecs->h, TYPEKEY(type));
1634
0
        if (k == kh_end(hrecs->h))
1635
0
            return 0;
1636
0
        type_found =  kh_val(hrecs->h, k);
1637
0
        if (!type_found)
1638
0
            return 0;
1639
0
        remove_all = 1;
1640
0
    }
1641
1642
0
    step = type_found->next;
1643
0
    while (step != type_found) {
1644
0
        sam_hrec_type_t *to_remove = step;
1645
0
        step = step->next;
1646
0
        ret &= sam_hrecs_remove_line(hrecs, type, to_remove);
1647
0
    }
1648
1649
0
    if (remove_all)
1650
0
        ret &= sam_hrecs_remove_line(hrecs, type, type_found);
1651
1652
0
    if (!ret && hrecs->dirty)
1653
0
        redact_header_text(bh);
1654
1655
0
    return 0;
1656
0
}
1657
1658
0
int sam_hdr_remove_lines(sam_hdr_t *bh, const char *type, const char *id, void *vrh) {
1659
0
    sam_hrecs_t *hrecs;
1660
0
    rmhash_t *rh = (rmhash_t *)vrh;
1661
1662
0
    if (!bh || !type)
1663
0
        return -1;
1664
0
    if (!rh) // remove all lines
1665
0
        return sam_hdr_remove_except(bh, type, NULL, NULL);
1666
0
    if (!id)
1667
0
        return -1;
1668
1669
0
    if (!(hrecs = bh->hrecs)) {
1670
0
        if (sam_hdr_fill_hrecs(bh) != 0)
1671
0
            return -1;
1672
0
        hrecs = bh->hrecs;
1673
0
    }
1674
1675
0
    khint_t k = kh_get(sam_hrecs_t, hrecs->h, TYPEKEY(type));
1676
0
    if (k == kh_end(hrecs->h)) // nothing to remove from
1677
0
        return 0;
1678
1679
0
    sam_hrec_type_t *head = kh_val(hrecs->h, k);
1680
0
    if (!head) {
1681
0
        hts_log_error("Header inconsistency");
1682
0
        return -1;
1683
0
    }
1684
1685
0
    int ret = 0;
1686
0
    sam_hrec_type_t *step = head->next;
1687
0
    while (step != head) {
1688
0
        sam_hrec_tag_t *tag = sam_hrecs_find_key(step, id, NULL);
1689
0
        if (tag && tag->str && tag->len >= 3) {
1690
0
           k = kh_get(rm, rh, tag->str+3);
1691
0
           if (k == kh_end(rh)) { // value is not in the hash table, so remove
1692
0
               sam_hrec_type_t *to_remove = step;
1693
0
               step = step->next;
1694
0
               ret |= sam_hrecs_remove_line(hrecs, type, to_remove);
1695
0
           } else {
1696
0
               step = step->next;
1697
0
           }
1698
0
        } else { // tag is not on the line, so skip to next line
1699
0
            step = step->next;
1700
0
        }
1701
0
    }
1702
1703
    // process the first line
1704
0
    sam_hrec_tag_t * tag = sam_hrecs_find_key(head, id, NULL);
1705
0
    if (tag && tag->str && tag->len >= 3) {
1706
0
       k = kh_get(rm, rh, tag->str+3);
1707
0
       if (k == kh_end(rh)) { // value is not in the hash table, so remove
1708
0
           sam_hrec_type_t *to_remove = head;
1709
0
           head = head->next;
1710
0
           ret |= sam_hrecs_remove_line(hrecs, type, to_remove);
1711
0
       }
1712
0
    }
1713
1714
0
    if (!ret && hrecs->dirty)
1715
0
        redact_header_text(bh);
1716
1717
0
    return ret;
1718
0
}
1719
1720
885
int sam_hdr_count_lines(sam_hdr_t *bh, const char *type) {
1721
885
    int count;
1722
885
    sam_hrec_type_t *first_ty, *itr_ty;
1723
1724
885
    if (!bh || !type)
1725
0
        return -1;
1726
1727
885
    if (!bh->hrecs) {
1728
885
        if (sam_hdr_fill_hrecs(bh) != 0)
1729
119
            return -1;
1730
885
    }
1731
1732
    // Deal with types that have counts
1733
766
    switch (type[0]) {
1734
766
    case 'S':
1735
766
        if (type[1] == 'Q')
1736
766
            return bh->hrecs->nref;
1737
0
        break;
1738
0
    case 'R':
1739
0
        if (type[1] == 'G')
1740
0
            return bh->hrecs->nrg;
1741
0
        break;
1742
0
    case 'P':
1743
0
        if (type[1] == 'G')
1744
0
            return bh->hrecs->npg;
1745
0
        break;
1746
0
    default:
1747
0
        break;
1748
766
    }
1749
1750
0
    first_ty = sam_hrecs_find_type_id(bh->hrecs, type, NULL, NULL);
1751
0
    if (!first_ty)
1752
0
        return 0;
1753
1754
0
    count = 1;
1755
0
    for (itr_ty = first_ty->next;
1756
0
         itr_ty && itr_ty != first_ty; itr_ty = itr_ty->next) {
1757
0
        count++;
1758
0
    }
1759
1760
0
    return count;
1761
0
}
1762
1763
int sam_hdr_line_index(sam_hdr_t *bh,
1764
                       const char *type,
1765
0
                       const char *key) {
1766
0
    sam_hrecs_t *hrecs;
1767
0
    if (!bh || !type || !key)
1768
0
        return -2;
1769
1770
0
    if (!(hrecs = bh->hrecs)) {
1771
0
        if (sam_hdr_fill_hrecs(bh) != 0)
1772
0
            return -2;
1773
0
        hrecs = bh->hrecs;
1774
0
    }
1775
1776
0
    khint_t k;
1777
0
    int idx = -1;
1778
0
    switch (type[0]) {
1779
0
    case 'S':
1780
0
        if (type[1] == 'Q') {
1781
0
            k = kh_get(m_s2i, hrecs->ref_hash, key);
1782
0
            if (k != kh_end(hrecs->ref_hash))
1783
0
                idx = kh_val(hrecs->ref_hash, k);
1784
0
        } else {
1785
0
            hts_log_warning("Type '%s' not supported. Only @SQ, @RG and @PG lines are indexed", type);
1786
0
        }
1787
0
        break;
1788
0
    case 'R':
1789
0
        if (type[1] == 'G') {
1790
0
            k = kh_get(m_s2i, hrecs->rg_hash, key);
1791
0
            if (k != kh_end(hrecs->rg_hash))
1792
0
                idx = kh_val(hrecs->rg_hash, k);
1793
0
        } else {
1794
0
            hts_log_warning("Type '%s' not supported. Only @SQ, @RG and @PG lines are indexed", type);
1795
0
        }
1796
0
        break;
1797
0
    case 'P':
1798
0
        if (type[1] == 'G') {
1799
0
            k = kh_get(m_s2i, hrecs->pg_hash, key);
1800
0
            if (k != kh_end(hrecs->pg_hash))
1801
0
                idx = kh_val(hrecs->pg_hash, k);
1802
0
        } else {
1803
0
            hts_log_warning("Type '%s' not supported. Only @SQ, @RG and @PG lines are indexed", type);
1804
0
        }
1805
0
        break;
1806
0
    default:
1807
0
        hts_log_warning("Type '%s' not supported. Only @SQ, @RG and @PG lines are indexed", type);
1808
0
    }
1809
1810
0
    return idx;
1811
0
}
1812
1813
const char *sam_hdr_line_name(sam_hdr_t *bh,
1814
                              const char *type,
1815
0
                              int pos) {
1816
0
    sam_hrecs_t *hrecs;
1817
0
    if (!bh || !type || pos < 0)
1818
0
        return NULL;
1819
1820
0
    if (!(hrecs = bh->hrecs)) {
1821
0
        if (sam_hdr_fill_hrecs(bh) != 0)
1822
0
            return NULL;
1823
0
        hrecs = bh->hrecs;
1824
0
    }
1825
1826
0
    switch (type[0]) {
1827
0
    case 'S':
1828
0
        if (type[1] == 'Q') {
1829
0
            if (pos < hrecs->nref)
1830
0
                return hrecs->ref[pos].name;
1831
0
        } else {
1832
0
            hts_log_warning("Type '%s' not supported. Only @SQ, @RG and @PG lines are indexed", type);
1833
0
        }
1834
0
        break;
1835
0
    case 'R':
1836
0
        if (type[1] == 'G') {
1837
0
            if (pos < hrecs->nrg)
1838
0
                return hrecs->rg[pos].name;
1839
0
        } else {
1840
0
            hts_log_warning("Type '%s' not supported. Only @SQ, @RG and @PG lines are indexed", type);
1841
0
        }
1842
0
        break;
1843
0
    case 'P':
1844
0
        if (type[1] == 'G') {
1845
0
            if (pos < hrecs->npg)
1846
0
                return hrecs->pg[pos].name;
1847
0
        } else {
1848
0
            hts_log_warning("Type '%s' not supported. Only @SQ, @RG and @PG lines are indexed", type);
1849
0
        }
1850
0
        break;
1851
0
    default:
1852
0
        hts_log_warning("Type '%s' not supported. Only @SQ, @RG and @PG lines are indexed", type);
1853
0
    }
1854
1855
0
    return NULL;
1856
0
}
1857
1858
/* ==== Key:val level methods ==== */
1859
1860
int sam_hdr_find_tag_id(sam_hdr_t *bh,
1861
                     const char *type,
1862
                     const char *ID_key,
1863
                     const char *ID_value,
1864
                     const char *key,
1865
0
                     kstring_t *ks) {
1866
0
    sam_hrecs_t *hrecs;
1867
0
    if (!bh || !type || !key)
1868
0
        return -2;
1869
1870
0
    if (!(hrecs = bh->hrecs)) {
1871
0
        if (sam_hdr_fill_hrecs(bh) != 0)
1872
0
            return -2;
1873
0
        hrecs = bh->hrecs;
1874
0
    }
1875
1876
0
    sam_hrec_type_t *ty = sam_hrecs_find_type_id(hrecs, type, ID_key, ID_value);
1877
0
    if (!ty)
1878
0
        return -1;
1879
1880
0
    sam_hrec_tag_t *tag = sam_hrecs_find_key(ty, key, NULL);
1881
0
    if (!tag || !tag->str || tag->len < 4)
1882
0
        return -1;
1883
1884
0
    ks->l = 0;
1885
0
    if (kputsn(tag->str+3, tag->len-3, ks) == EOF) {
1886
0
        return -2;
1887
0
    }
1888
1889
0
    return 0;
1890
0
}
1891
1892
int sam_hdr_find_tag_pos(sam_hdr_t *bh,
1893
                     const char *type,
1894
                     int pos,
1895
                     const char *key,
1896
0
                     kstring_t *ks) {
1897
0
    sam_hrecs_t *hrecs;
1898
0
    if (!bh || !type || !key)
1899
0
        return -2;
1900
1901
0
    if (!(hrecs = bh->hrecs)) {
1902
0
        if (sam_hdr_fill_hrecs(bh) != 0)
1903
0
            return -2;
1904
0
        hrecs = bh->hrecs;
1905
0
    }
1906
1907
0
    sam_hrec_type_t *ty = sam_hrecs_find_type_pos(hrecs, type, pos);
1908
0
    if (!ty)
1909
0
        return -1;
1910
1911
0
    sam_hrec_tag_t *tag = sam_hrecs_find_key(ty, key, NULL);
1912
0
    if (!tag || !tag->str || tag->len < 4)
1913
0
        return -1;
1914
1915
0
    ks->l = 0;
1916
0
    if (kputsn(tag->str+3, tag->len-3, ks) == EOF) {
1917
0
        return -2;
1918
0
    }
1919
1920
0
    return 0;
1921
0
}
1922
1923
int sam_hdr_remove_tag_id(sam_hdr_t *bh,
1924
        const char *type,
1925
        const char *ID_key,
1926
        const char *ID_value,
1927
0
        const char *key) {
1928
0
    sam_hrecs_t *hrecs;
1929
0
    if (!bh || !type || !key)
1930
0
        return -1;
1931
1932
0
    if (!(hrecs = bh->hrecs)) {
1933
0
        if (sam_hdr_fill_hrecs(bh) != 0)
1934
0
            return -1;
1935
0
        hrecs = bh->hrecs;
1936
0
    }
1937
1938
0
    sam_hrec_type_t *ty = sam_hrecs_find_type_id(hrecs, type, ID_key, ID_value);
1939
0
    if (!ty)
1940
0
        return -1;
1941
1942
0
    int ret = sam_hrecs_remove_key(hrecs, ty, key);
1943
0
    if (!ret && hrecs->dirty)
1944
0
        redact_header_text(bh);
1945
1946
0
    return ret;
1947
0
}
1948
1949
/*
1950
 * Reconstructs a kstring from the header hash table.
1951
 * Returns 0 on success
1952
 *        -1 on failure
1953
 */
1954
1.01k
int sam_hrecs_rebuild_text(const sam_hrecs_t *hrecs, kstring_t *ks) {
1955
1.01k
    ks->l = 0;
1956
1957
1.01k
    if (!hrecs->h || !hrecs->h->size) {
1958
599
        return kputsn("", 0, ks) >= 0 ? 0 : -1;
1959
599
    }
1960
412
    if (sam_hrecs_rebuild_lines(hrecs, ks) != 0)
1961
0
        return -1;
1962
1963
412
    return 0;
1964
412
}
1965
1966
/*
1967
 * Looks up a reference sequence by name and returns the numerical ID.
1968
 * Returns -1 if unknown reference; -2 if header could not be parsed.
1969
 */
1970
7.11k
int sam_hdr_name2tid(sam_hdr_t *bh, const char *ref) {
1971
7.11k
    sam_hrecs_t *hrecs;
1972
7.11k
    khint_t k;
1973
1974
7.11k
    if (!bh)
1975
0
        return -1;
1976
1977
7.11k
    if (!(hrecs = bh->hrecs)) {
1978
35
        if (sam_hdr_fill_hrecs(bh) != 0)
1979
35
            return -2;
1980
0
        hrecs = bh->hrecs;
1981
0
    }
1982
1983
7.07k
    if (!hrecs->ref_hash)
1984
0
        return -1;
1985
1986
7.07k
    k = kh_get(m_s2i, hrecs->ref_hash, ref);
1987
7.07k
    return k == kh_end(hrecs->ref_hash) ? -1 : kh_val(hrecs->ref_hash, k);
1988
7.07k
}
1989
1990
0
const char *sam_hdr_tid2name(const sam_hdr_t *h, int tid) {
1991
0
    sam_hrecs_t *hrecs;
1992
1993
0
    if (!h || tid < 0)
1994
0
        return NULL;
1995
1996
0
    if ((hrecs = h->hrecs) != NULL && tid < hrecs->nref) {
1997
0
        return hrecs->ref[tid].name;
1998
0
    } else {
1999
0
        if (tid < h->n_targets)
2000
0
            return h->target_name[tid];
2001
0
    }
2002
2003
0
    return NULL;
2004
0
}
2005
2006
0
hts_pos_t sam_hdr_tid2len(const sam_hdr_t *h, int tid) {
2007
0
    sam_hrecs_t *hrecs;
2008
2009
0
    if (!h || tid < 0)
2010
0
        return 0;
2011
2012
0
    if ((hrecs = h->hrecs) != NULL && tid < hrecs->nref) {
2013
0
        return hrecs->ref[tid].len;
2014
0
    } else {
2015
0
        if (tid < h->n_targets) {
2016
0
            if (h->target_len[tid] < UINT32_MAX || !h->sdict) {
2017
0
                return h->target_len[tid];
2018
0
            } else {
2019
0
                khash_t(s2i) *long_refs = (khash_t(s2i) *) h->sdict;
2020
0
                khint_t k = kh_get(s2i, long_refs, h->target_name[tid]);
2021
0
                if (k < kh_end(long_refs)) {
2022
0
                    return kh_val(long_refs, k);
2023
0
                } else {
2024
0
                    return UINT32_MAX;
2025
0
                }
2026
0
            }
2027
0
        }
2028
0
    }
2029
2030
0
    return 0;
2031
0
}
2032
2033
/*
2034
 * Fixes any PP links in @PG headers.
2035
 * If the entries are in order then this doesn't need doing, but in case
2036
 * our header is out of order this goes through the hrecs->pg[] array
2037
 * setting the prev_id field.
2038
 *
2039
 * Note we can have multiple complete chains. This code should identify the
2040
 * tails of these chains as these are the entries we have to link to in
2041
 * subsequent PP records.
2042
 *
2043
 * Returns 0 on success
2044
 *        -1 on failure (indicating broken PG/PP records)
2045
 */
2046
0
static int sam_hdr_link_pg(sam_hdr_t *bh) {
2047
0
    sam_hrecs_t *hrecs;
2048
0
    int i, j, ret = 0, *new_pg_end;
2049
2050
0
    if (!bh)
2051
0
        return -1;
2052
2053
0
    if (!(hrecs = bh->hrecs)) {
2054
0
        if (sam_hdr_fill_hrecs(bh) != 0)
2055
0
            return -1;
2056
0
        hrecs = bh->hrecs;
2057
0
    }
2058
2059
0
    if (!hrecs->pgs_changed || !hrecs->npg)
2060
0
        return 0;
2061
2062
0
    hrecs->npg_end_alloc = hrecs->npg;
2063
0
    new_pg_end = realloc(hrecs->pg_end, hrecs->npg * sizeof(*new_pg_end));
2064
0
    if (!new_pg_end)
2065
0
        return -1;
2066
0
    hrecs->pg_end = new_pg_end;
2067
0
    int *chain_size = calloc(hrecs->npg, sizeof(int));
2068
0
    if (!chain_size)
2069
0
        return -1;
2070
2071
0
    for (i = 0; i < hrecs->npg; i++)
2072
0
        hrecs->pg_end[i] = i;
2073
2074
0
    for (i = 0; i < hrecs->npg; i++) {
2075
0
        khint_t k;
2076
0
        sam_hrec_tag_t *tag;
2077
2078
0
        assert(hrecs->pg[i].ty != NULL);
2079
0
        for (tag = hrecs->pg[i].ty->tag; tag; tag = tag->next) {
2080
0
            if (tag->str[0] == 'P' && tag->str[1] == 'P')
2081
0
                break;
2082
0
        }
2083
0
        if (!tag) {
2084
            /* Chain start points */
2085
0
            continue;
2086
0
        }
2087
2088
0
        k = kh_get(m_s2i, hrecs->pg_hash, tag->str+3);
2089
2090
0
        if (k == kh_end(hrecs->pg_hash)) {
2091
0
            hts_log_warning("PG line with PN:%s has a PP link to missing program '%s'",
2092
0
                    hrecs->pg[i].name, tag->str+3);
2093
0
            continue;
2094
0
        }
2095
2096
0
        hrecs->pg[i].prev_id = hrecs->pg[kh_val(hrecs->pg_hash, k)].id;
2097
0
        hrecs->pg_end[kh_val(hrecs->pg_hash, k)] = -1;
2098
0
        chain_size[i] = chain_size[kh_val(hrecs->pg_hash, k)]+1;
2099
0
    }
2100
2101
0
    for (i = j = 0; i < hrecs->npg; i++) {
2102
0
        if (hrecs->pg_end[i] != -1 && chain_size[i] > 0)
2103
0
            hrecs->pg_end[j++] = hrecs->pg_end[i];
2104
0
    }
2105
    /* Only leafs? Choose the last one! */
2106
0
    if (!j && hrecs->npg_end > 0) {
2107
0
        hrecs->pg_end[0] = hrecs->pg_end[hrecs->npg_end-1];
2108
0
        j = 1;
2109
0
    }
2110
2111
0
    hrecs->npg_end = j;
2112
0
    hrecs->pgs_changed = 0;
2113
2114
    /* mark as dirty or empty for rebuild */
2115
0
    hrecs->dirty = 1;
2116
0
    redact_header_text(bh);
2117
0
    free(chain_size);
2118
2119
0
    return ret;
2120
0
}
2121
2122
/*
2123
 * Returns a unique ID from a base name.
2124
 *
2125
 * The value returned is valid until the next call to
2126
 * this function.
2127
 */
2128
0
const char *sam_hdr_pg_id(sam_hdr_t *bh, const char *name) {
2129
0
    sam_hrecs_t *hrecs;
2130
0
    size_t name_len;
2131
0
    const size_t name_extra = 17;
2132
0
    if (!bh || !name)
2133
0
        return NULL;
2134
2135
0
    if (!(hrecs = bh->hrecs)) {
2136
0
        if (sam_hdr_fill_hrecs(bh) != 0)
2137
0
            return NULL;
2138
0
        hrecs = bh->hrecs;
2139
0
    }
2140
2141
0
    khint_t k = kh_get(m_s2i, hrecs->pg_hash, name);
2142
0
    if (k == kh_end(hrecs->pg_hash))
2143
0
        return name;
2144
2145
0
    name_len = strlen(name);
2146
0
    if (name_len > 1000) name_len = 1000;
2147
0
    if (hrecs->ID_buf_sz < name_len + name_extra) {
2148
0
        char *new_ID_buf = realloc(hrecs->ID_buf, name_len + name_extra);
2149
0
        if (new_ID_buf == NULL)
2150
0
            return NULL;
2151
0
        hrecs->ID_buf = new_ID_buf;
2152
0
        hrecs->ID_buf_sz = name_len + name_extra;
2153
0
    }
2154
2155
0
    do {
2156
0
        snprintf(hrecs->ID_buf, hrecs->ID_buf_sz, "%.1000s.%d", name, hrecs->ID_cnt++);
2157
0
        k = kh_get(m_s2i, hrecs->pg_hash, hrecs->ID_buf);
2158
0
    } while (k != kh_end(hrecs->pg_hash));
2159
2160
0
    return hrecs->ID_buf;
2161
0
}
2162
2163
/*
2164
 * Add an @PG line.
2165
 *
2166
 * If we wish complete control over this use sam_hdr_add_line() directly. This
2167
 * function uses that, but attempts to do a lot of tedious house work for
2168
 * you too.
2169
 *
2170
 * - It will generate a suitable ID if the supplied one clashes.
2171
 * - It will generate multiple @PG records if we have multiple PG chains.
2172
 *
2173
 * Call it as per sam_hdr_add_line() with a series of key,value pairs ending
2174
 * in NULL.
2175
 *
2176
 * Returns 0 on success
2177
 *        -1 on failure
2178
 */
2179
0
int sam_hdr_add_pg(sam_hdr_t *bh, const char *name, ...) {
2180
0
    sam_hrecs_t *hrecs;
2181
0
    const char *specified_id = NULL, *specified_pn = NULL, *specified_pp = NULL;
2182
0
    const char *key, *val;
2183
0
    if (!bh)
2184
0
        return -1;
2185
2186
0
    if (!(hrecs = bh->hrecs)) {
2187
0
        if (sam_hdr_fill_hrecs(bh) != 0)
2188
0
            return -1;
2189
0
        hrecs = bh->hrecs;
2190
0
    }
2191
2192
0
    bh->hrecs->pgs_changed = 1;
2193
0
    if (sam_hdr_link_pg(bh) < 0) {
2194
0
        hts_log_error("Error linking @PG lines");
2195
0
        return -1;
2196
0
    }
2197
2198
0
    va_list args;
2199
    // Check for ID / PN / PP tags in varargs list
2200
0
    va_start(args, name);
2201
0
    while ((key = va_arg(args, const char *)) != NULL) {
2202
0
        val = va_arg(args, const char *);
2203
0
        if (!val) break;
2204
0
        if (strcmp(key, "PN") == 0 && *val != '\0')
2205
0
            specified_pn = val;
2206
0
        else if (strcmp(key, "PP") == 0 && *val != '\0')
2207
0
            specified_pp = val;
2208
0
        else if (strcmp(key, "ID") == 0 && *val != '\0')
2209
0
            specified_id = val;
2210
0
    }
2211
0
    va_end(args);
2212
2213
0
    if (specified_id && hrecs->pg_hash) {
2214
0
        khint_t k = kh_get(m_s2i, hrecs->pg_hash, specified_id);
2215
0
        if (k != kh_end(hrecs->pg_hash)) {
2216
0
            hts_log_error("Header @PG ID:%s already present", specified_id);
2217
0
            return -1;
2218
0
        }
2219
0
    }
2220
2221
0
    if (specified_pp && hrecs->pg_hash) {
2222
0
        khint_t k = kh_get(m_s2i, hrecs->pg_hash, specified_pp);
2223
0
        if (k == kh_end(hrecs->pg_hash)) {
2224
0
            hts_log_error("Header @PG ID:%s referred to by PP tag not present",
2225
0
                          specified_pp);
2226
0
            return -1;
2227
0
        }
2228
0
    }
2229
2230
0
    if (!specified_pp && hrecs->npg_end) {
2231
        /* Copy ends array to avoid us looping while modifying it */
2232
0
        int *end = malloc(hrecs->npg_end * sizeof(int));
2233
0
        int i, nends = hrecs->npg_end;
2234
2235
0
        if (!end)
2236
0
            return -1;
2237
2238
0
        memcpy(end, hrecs->pg_end, nends * sizeof(*end));
2239
2240
0
        for (i = 0; i < nends; i++) {
2241
0
            const char *id = !specified_id ? sam_hdr_pg_id(bh, name) : "";
2242
0
            if (!id) {
2243
0
                free(end);
2244
0
                return -1;
2245
0
            }
2246
0
            va_start(args, name);
2247
0
            if (-1 == sam_hrecs_vadd(hrecs, "PG", args,
2248
0
                                     "ID", id,
2249
0
                                     "PN", !specified_pn ? name : "",
2250
0
                                     "PP", hrecs->pg[end[i]].name,
2251
0
                                     NULL)) {
2252
0
                free(end);
2253
0
                return  -1;
2254
0
            }
2255
0
            va_end(args);
2256
0
        }
2257
2258
0
        free(end);
2259
0
    } else {
2260
0
        const char *id = !specified_id ? sam_hdr_pg_id(bh, name) : "";
2261
0
        if (!id)
2262
0
            return -1;
2263
0
        va_start(args, name);
2264
0
        if (-1 == sam_hrecs_vadd(hrecs, "PG", args,
2265
0
                                 "ID", id,
2266
0
                                 "PN", !specified_pn ? name : "",
2267
0
                                 NULL))
2268
0
            return -1;
2269
0
        va_end(args);
2270
0
    }
2271
2272
0
    hrecs->dirty = 1;
2273
0
    redact_header_text(bh);
2274
2275
0
    return 0;
2276
0
}
2277
2278
/*! Increments a reference count on bh.
2279
 *
2280
 * This permits multiple files to share the same header, all calling
2281
 * sam_hdr_destroy when done, without causing errors for other open files.
2282
 */
2283
0
void sam_hdr_incr_ref(sam_hdr_t *bh) {
2284
0
    if (!bh)
2285
0
        return;
2286
0
    bh->ref_count++;
2287
0
}
2288
2289
/* ==== Internal methods ==== */
2290
2291
/*
2292
 * Creates an empty SAM header.  Allocates space for the SAM header
2293
 * structures (hash tables) ready to be populated.
2294
 *
2295
 * Returns a sam_hrecs_t struct on success (free with sam_hrecs_free())
2296
 *         NULL on failure
2297
 */
2298
1.16k
sam_hrecs_t *sam_hrecs_new() {
2299
1.16k
    sam_hrecs_t *hrecs = calloc(1, sizeof(*hrecs));
2300
2301
1.16k
    if (!hrecs)
2302
0
        return NULL;
2303
2304
1.16k
    hrecs->h = kh_init(sam_hrecs_t);
2305
1.16k
    if (!hrecs->h)
2306
0
        goto err;
2307
2308
1.16k
    hrecs->ID_cnt = 1;
2309
2310
1.16k
    hrecs->nref = 0;
2311
1.16k
    hrecs->ref_sz = 0;
2312
1.16k
    hrecs->ref  = NULL;
2313
1.16k
    if (!(hrecs->ref_hash = kh_init(m_s2i)))
2314
0
        goto err;
2315
1.16k
    hrecs->refs_changed = -1;
2316
2317
1.16k
    hrecs->nrg = 0;
2318
1.16k
    hrecs->rg_sz = 0;
2319
1.16k
    hrecs->rg  = NULL;
2320
1.16k
    if (!(hrecs->rg_hash = kh_init(m_s2i)))
2321
0
        goto err;
2322
2323
1.16k
    hrecs->npg = 0;
2324
1.16k
    hrecs->pg_sz = 0;
2325
1.16k
    hrecs->pg  = NULL;
2326
1.16k
    hrecs->npg_end = hrecs->npg_end_alloc = 0;
2327
1.16k
    hrecs->pg_end = NULL;
2328
1.16k
    if (!(hrecs->pg_hash = kh_init(m_s2i)))
2329
0
        goto err;
2330
2331
1.16k
    if (!(hrecs->tag_pool = pool_create(sizeof(sam_hrec_tag_t))))
2332
0
        goto err;
2333
2334
1.16k
    if (!(hrecs->type_pool = pool_create(sizeof(sam_hrec_type_t))))
2335
0
        goto err;
2336
2337
1.16k
    if (!(hrecs->str_pool = string_pool_create(65536)))
2338
0
        goto err;
2339
2340
1.16k
    if (sam_hrecs_init_type_order(hrecs, NULL))
2341
0
        goto err;
2342
2343
1.16k
    return hrecs;
2344
2345
0
err:
2346
0
    if (hrecs->h)
2347
0
        kh_destroy(sam_hrecs_t, hrecs->h);
2348
2349
0
    if (hrecs->tag_pool)
2350
0
        pool_destroy(hrecs->tag_pool);
2351
2352
0
    if (hrecs->type_pool)
2353
0
        pool_destroy(hrecs->type_pool);
2354
2355
0
    if (hrecs->str_pool)
2356
0
        string_pool_destroy(hrecs->str_pool);
2357
2358
0
    free(hrecs);
2359
2360
0
    return NULL;
2361
1.16k
}
2362
#if 0
2363
/*
2364
 * Produces a duplicate copy of source and returns it.
2365
 * Returns NULL on failure
2366
 */
2367
sam_hrecs_t *sam_hrecs_dup(sam_hrecs_t *source) {
2368
        return NULL;
2369
}
2370
#endif
2371
/*! Deallocates all storage used by a sam_hrecs_t struct.
2372
 *
2373
 * This also decrements the header reference count. If after decrementing
2374
 * it is still non-zero then the header is assumed to be in use by another
2375
 * caller and the free is not done.
2376
 *
2377
 */
2378
1.16k
void sam_hrecs_free(sam_hrecs_t *hrecs) {
2379
1.16k
    if (!hrecs)
2380
0
        return;
2381
2382
1.16k
    if (hrecs->h)
2383
1.16k
        kh_destroy(sam_hrecs_t, hrecs->h);
2384
2385
1.16k
    if (hrecs->ref_hash)
2386
1.16k
        kh_destroy(m_s2i, hrecs->ref_hash);
2387
2388
1.16k
    if (hrecs->ref)
2389
344
        free(hrecs->ref);
2390
2391
1.16k
    if (hrecs->rg_hash)
2392
1.16k
        kh_destroy(m_s2i, hrecs->rg_hash);
2393
2394
1.16k
    if (hrecs->rg)
2395
57
        free(hrecs->rg);
2396
2397
1.16k
    if (hrecs->pg_hash)
2398
1.16k
        kh_destroy(m_s2i, hrecs->pg_hash);
2399
2400
1.16k
    if (hrecs->pg)
2401
70
        free(hrecs->pg);
2402
2403
1.16k
    if (hrecs->pg_end)
2404
70
        free(hrecs->pg_end);
2405
2406
1.16k
    if (hrecs->type_pool)
2407
1.16k
        pool_destroy(hrecs->type_pool);
2408
2409
1.16k
    if (hrecs->tag_pool)
2410
1.16k
        pool_destroy(hrecs->tag_pool);
2411
2412
1.16k
    if (hrecs->str_pool)
2413
1.16k
        string_pool_destroy(hrecs->str_pool);
2414
2415
1.16k
    if (hrecs->type_order)
2416
1.16k
        free(hrecs->type_order);
2417
2418
1.16k
    if (hrecs->ID_buf)
2419
0
        free(hrecs->ID_buf);
2420
2421
1.16k
    free(hrecs);
2422
1.16k
}
2423
2424
/*
2425
 * Internal method already used by the CRAM code
2426
 * Returns the first header item matching 'type'. If ID is non-NULL it checks
2427
 * for the tag ID: and compares against the specified ID.
2428
 *
2429
 * Returns NULL if no type/ID is found
2430
 */
2431
sam_hrec_type_t *sam_hrecs_find_type_id(sam_hrecs_t *hrecs, const char *type,
2432
216
                                     const char *ID_key, const char *ID_value) {
2433
216
    if (!hrecs || !type)
2434
0
        return NULL;
2435
216
    sam_hrec_type_t *t1, *t2;
2436
216
    khint_t k;
2437
2438
    /* Special case for types we have prebuilt hashes on */
2439
216
    if (ID_key) {
2440
216
        if (!ID_value)
2441
0
            return NULL;
2442
2443
216
        if (type[0]   == 'S' && type[1]   == 'Q' &&
2444
216
            ID_key[0] == 'S' && ID_key[1] == 'N') {
2445
216
            k = kh_get(m_s2i, hrecs->ref_hash, ID_value);
2446
216
            return k != kh_end(hrecs->ref_hash)
2447
216
                ? hrecs->ref[kh_val(hrecs->ref_hash, k)].ty
2448
216
                : NULL;
2449
216
        }
2450
2451
0
        if (type[0]   == 'R' && type[1]   == 'G' &&
2452
0
            ID_key[0] == 'I' && ID_key[1] == 'D') {
2453
0
            k = kh_get(m_s2i, hrecs->rg_hash, ID_value);
2454
0
            return k != kh_end(hrecs->rg_hash)
2455
0
                ? hrecs->rg[kh_val(hrecs->rg_hash, k)].ty
2456
0
                : NULL;
2457
0
        }
2458
2459
0
        if (type[0]   == 'P' && type[1]   == 'G' &&
2460
0
            ID_key[0] == 'I' && ID_key[1] == 'D') {
2461
0
            k = kh_get(m_s2i, hrecs->pg_hash, ID_value);
2462
0
            return k != kh_end(hrecs->pg_hash)
2463
0
                ? hrecs->pg[kh_val(hrecs->pg_hash, k)].ty
2464
0
                : NULL;
2465
0
        }
2466
0
    }
2467
2468
0
    k = kh_get(sam_hrecs_t, hrecs->h, TYPEKEY(type));
2469
0
    if (k == kh_end(hrecs->h))
2470
0
        return NULL;
2471
2472
0
    if (!ID_key)
2473
0
        return kh_val(hrecs->h, k);
2474
2475
0
    t1 = t2 = kh_val(hrecs->h, k);
2476
0
    do {
2477
0
        sam_hrec_tag_t *tag;
2478
0
        for (tag = t1->tag; tag; tag = tag->next) {
2479
0
            if (tag->str[0] == ID_key[0] && tag->str[1] == ID_key[1]) {
2480
0
                const char *cp1 = tag->str+3;
2481
0
                const char *cp2 = ID_value;
2482
0
                while (*cp1 && *cp1 == *cp2)
2483
0
                    cp1++, cp2++;
2484
0
                if (*cp2 || *cp1)
2485
0
                    continue;
2486
0
                return t1;
2487
0
            }
2488
0
        }
2489
0
        t1 = t1->next;
2490
0
    } while (t1 != t2);
2491
2492
0
    return NULL;
2493
0
}
2494
2495
/*
2496
 * Adds or updates tag key,value pairs in a header line.
2497
 * Eg for adding M5 tags to @SQ lines or updating sort order for the
2498
 * @HD line.
2499
 *
2500
 * va_list contains multiple key,value pairs ending in NULL.
2501
 *
2502
 * Returns 0 on success
2503
 *        -1 on failure
2504
 */
2505
106
int sam_hrecs_vupdate(sam_hrecs_t *hrecs, sam_hrec_type_t *type, va_list ap) {
2506
106
    if (!hrecs)
2507
0
        return -1;
2508
2509
212
    for (;;) {
2510
212
        char *k, *v, *str;
2511
212
        sam_hrec_tag_t *tag, *prev = NULL;
2512
2513
212
        if (!(k = (char *)va_arg(ap, char *)))
2514
106
            break;
2515
106
        if (!(v = va_arg(ap, char *)))
2516
0
            v = "";
2517
2518
106
        tag = sam_hrecs_find_key(type, k, &prev);
2519
106
        if (!tag) {
2520
0
            if (!(tag = pool_alloc(hrecs->tag_pool)))
2521
0
                return -1;
2522
0
            if (prev)
2523
0
                prev->next = tag;
2524
0
            else
2525
0
                type->tag = tag;
2526
2527
0
            tag->next = NULL;
2528
0
        }
2529
2530
106
        tag->len = 3 + strlen(v);
2531
106
        str = string_alloc(hrecs->str_pool, tag->len+1);
2532
106
        if (!str)
2533
0
            return -1;
2534
2535
106
        if (snprintf(str, tag->len+1, "%2.2s:%s", k, v) < 0)
2536
0
            return -1;
2537
2538
106
        tag->str = str;
2539
106
    }
2540
2541
106
    hrecs->dirty = 1; //mark text as dirty and force a rebuild
2542
2543
106
    return 0;
2544
106
}
2545
2546
/*
2547
 * Adds or updates tag key,value pairs in a header line.
2548
 * Eg for adding M5 tags to @SQ lines or updating sort order for the
2549
 * @HD line.
2550
 *
2551
 * Specify multiple key,value pairs ending in NULL.
2552
 *
2553
 * Returns 0 on success
2554
 *        -1 on failure
2555
 */
2556
106
static int sam_hrecs_update(sam_hrecs_t *hrecs, sam_hrec_type_t *type, ...) {
2557
106
    va_list args;
2558
106
    int res;
2559
106
    va_start(args, type);
2560
106
    res = sam_hrecs_vupdate(hrecs, type, args);
2561
106
    va_end(args);
2562
106
    return res;
2563
106
}
2564
2565
/*
2566
 * Looks for a specific key in a single sam header line identified by *type.
2567
 * If prev is non-NULL it also fills this out with the previous tag, to
2568
 * permit use in key removal. *prev is set to NULL when the tag is the first
2569
 * key in the list. When a tag isn't found, prev (if non NULL) will be the last
2570
 * tag in the existing list.
2571
 *
2572
 * Returns the tag pointer on success
2573
 *         NULL on failure
2574
 */
2575
sam_hrec_tag_t *sam_hrecs_find_key(sam_hrec_type_t *type,
2576
                                   const char *key,
2577
15.2k
                                   sam_hrec_tag_t **prev) {
2578
15.2k
    sam_hrec_tag_t *tag, *p = NULL;
2579
15.2k
    if (!type)
2580
0
        return NULL;
2581
2582
16.3k
    for (tag = type->tag; tag; p = tag, tag = tag->next) {
2583
16.1k
        if (tag->str[0] == key[0] && tag->str[1] == key[1]) {
2584
15.0k
            if (prev)
2585
106
                *prev = p;
2586
15.0k
            return tag;
2587
15.0k
        }
2588
16.1k
    }
2589
2590
216
    if (prev)
2591
0
        *prev = p;
2592
2593
216
    return NULL;
2594
15.2k
}
2595
2596
int sam_hrecs_remove_key(sam_hrecs_t *hrecs,
2597
                         sam_hrec_type_t *type,
2598
0
                         const char *key) {
2599
0
    sam_hrec_tag_t *tag, *prev;
2600
0
    if (!hrecs)
2601
0
        return -1;
2602
0
    tag = sam_hrecs_find_key(type, key, &prev);
2603
0
    if (!tag)
2604
0
        return 0; // Not there anyway
2605
2606
0
    if (type->type == TYPEKEY("SQ") && tag->str[0] == 'A' && tag->str[1] == 'N') {
2607
0
        assert(tag->len >= 3);
2608
0
        sam_hrec_tag_t *sn_tag = sam_hrecs_find_key(type, "SN", NULL);
2609
0
        if (sn_tag) {
2610
0
            assert(sn_tag->len >= 3);
2611
0
            khint_t k = kh_get(m_s2i, hrecs->ref_hash, sn_tag->str + 3);
2612
0
            if (k != kh_end(hrecs->ref_hash))
2613
0
                sam_hrecs_remove_ref_altnames(hrecs, kh_val(hrecs->ref_hash, k), tag->str + 3);
2614
0
        }
2615
0
    }
2616
2617
0
    if (!prev) { //first tag
2618
0
        type->tag = tag->next;
2619
0
    } else {
2620
0
        prev->next = tag->next;
2621
0
    }
2622
0
    pool_free(hrecs->tag_pool, tag);
2623
0
    hrecs->dirty = 1; //mark text as dirty and force a rebuild
2624
2625
0
    return 1;
2626
0
}
2627
2628
/*
2629
 * Looks up a read-group by name and returns a pointer to the start of the
2630
 * associated tag list.
2631
 *
2632
 * Returns NULL on failure
2633
 */
2634
0
sam_hrec_rg_t *sam_hrecs_find_rg(sam_hrecs_t *hrecs, const char *rg) {
2635
0
    khint_t k = kh_get(m_s2i, hrecs->rg_hash, rg);
2636
0
    return k == kh_end(hrecs->rg_hash)
2637
0
        ? NULL
2638
0
        : &hrecs->rg[kh_val(hrecs->rg_hash, k)];
2639
0
}
2640
2641
#if DEBUG_HEADER
2642
void sam_hrecs_dump(sam_hrecs_t *hrecs) {
2643
    khint_t k;
2644
    int i;
2645
2646
    printf("===DUMP===\n");
2647
    for (k = kh_begin(hrecs->h); k != kh_end(hrecs->h); k++) {
2648
        sam_hrec_type_t *t1, *t2;
2649
        char c[2];
2650
        int idx = 0;
2651
2652
        if (!kh_exist(hrecs->h, k))
2653
            continue;
2654
2655
        t1 = t2 = kh_val(hrecs->h, k);
2656
        c[0] = kh_key(hrecs->h, k)>>8;
2657
        c[1] = kh_key(hrecs->h, k)&0xff;
2658
        printf("Type %.2s\n", c);
2659
2660
        do {
2661
            sam_hrec_tag_t *tag;
2662
            printf(">>>%d ", idx++);
2663
            for (tag = t1->tag; tag; tag=tag->next) {
2664
                if (strncmp(c, "CO", 2))
2665
                    printf("\"%.2s\":\"%.*s\"\t", tag->str, tag->len-3, tag->str+3);
2666
                else
2667
                    printf("%s", tag->str);
2668
            }
2669
            putchar('\n');
2670
            t1 = t1->next;
2671
        } while (t1 != t2);
2672
    }
2673
2674
    /* Dump out PG chains */
2675
    printf("\n@PG chains:\n");
2676
    for (i = 0; i < hrecs->npg_end; i++) {
2677
        int j;
2678
        printf("  %d:", i);
2679
        for (j = hrecs->pg_end[i]; j != -1; j = hrecs->pg[j].prev_id) {
2680
            printf("%s%d(%.*s)",
2681
                   j == hrecs->pg_end[i] ? " " : "->",
2682
                   j, hrecs->pg[j].name_len, hrecs->pg[j].name);
2683
        }
2684
        printf("\n");
2685
    }
2686
2687
    puts("===END DUMP===");
2688
}
2689
#endif
2690
2691
/*
2692
 * Returns the sort order:
2693
 */
2694
0
enum sam_sort_order sam_hrecs_sort_order(sam_hrecs_t *hrecs) {
2695
0
    khint_t k;
2696
0
    enum sam_sort_order so;
2697
2698
0
    so = ORDER_UNKNOWN;
2699
0
    k = kh_get(sam_hrecs_t, hrecs->h, TYPEKEY("HD"));
2700
0
    if (k != kh_end(hrecs->h)) {
2701
0
        sam_hrec_type_t *ty = kh_val(hrecs->h, k);
2702
0
        sam_hrec_tag_t *tag;
2703
0
        for (tag = ty->tag; tag; tag = tag->next) {
2704
0
            if (tag->str[0] == 'S' && tag->str[1] == 'O') {
2705
0
                if (strcmp(tag->str+3, "unsorted") == 0)
2706
0
                    so = ORDER_UNSORTED;
2707
0
                else if (strcmp(tag->str+3, "queryname") == 0)
2708
0
                    so = ORDER_NAME;
2709
0
                else if (strcmp(tag->str+3, "coordinate") == 0)
2710
0
                    so = ORDER_COORD;
2711
0
                else if (strcmp(tag->str+3, "unknown") != 0)
2712
0
                    hts_log_error("Unknown sort order field: %s", tag->str+3);
2713
0
            }
2714
0
        }
2715
0
    }
2716
2717
0
    return so;
2718
0
}
2719
2720
0
enum sam_group_order sam_hrecs_group_order(sam_hrecs_t *hrecs) {
2721
0
    khint_t k;
2722
0
    enum sam_group_order go;
2723
2724
0
    go = ORDER_NONE;
2725
0
    k = kh_get(sam_hrecs_t, hrecs->h, TYPEKEY("HD"));
2726
0
    if (k != kh_end(hrecs->h)) {
2727
0
        sam_hrec_type_t *ty = kh_val(hrecs->h, k);
2728
0
        sam_hrec_tag_t *tag;
2729
0
        for (tag = ty->tag; tag; tag = tag->next) {
2730
0
            if (tag->str[0] == 'G' && tag->str[1] == 'O') {
2731
0
                if (strcmp(tag->str+3, "query") == 0)
2732
0
                    go = ORDER_QUERY;
2733
0
                else if (strcmp(tag->str+3, "reference") == 0)
2734
0
                    go = ORDER_REFERENCE;
2735
0
            }
2736
0
        }
2737
0
    }
2738
2739
0
    return go;
2740
0
}