Coverage Report

Created: 2025-07-18 07:26

/src/htslib/sam_mods.c
Line
Count
Source (jump to first uncovered line)
1
/*  sam_mods.c -- Base modification handling in SAM and BAM.
2
3
    Copyright (C) 2020-2024 Genome Research Ltd.
4
5
    Author: James Bonfield <jkb@sanger.ac.uk>
6
7
Permission is hereby granted, free of charge, to any person obtaining a copy
8
of this software and associated documentation files (the "Software"), to deal
9
in the Software without restriction, including without limitation the rights
10
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11
copies of the Software, and to permit persons to whom the Software is
12
furnished to do so, subject to the following conditions:
13
14
The above copyright notice and this permission notice shall be included in
15
all copies or substantial portions of the Software.
16
17
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
23
DEALINGS IN THE SOFTWARE.  */
24
25
#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
26
#include <config.h>
27
#include <assert.h>
28
29
#include "htslib/sam.h"
30
#include "textutils_internal.h"
31
32
// ---------------------------
33
// Base Modification retrieval
34
//
35
// These operate by recording state in an opaque type, allocated and freed
36
// via the functions below.
37
//
38
// Initially we call bam_parse_basemod to process the tags and record the
39
// modifications in the state structure, and then functions such as
40
// bam_next_basemod can iterate over this cached state.
41
42
/* Overview of API.
43
44
We start by allocating an hts_base_mod_state and parsing the MM, ML and MN
45
tags into it.  This has optional flags controlling how we report base
46
modifications in "explicit" coordinates.  See below
47
48
    hts_base_mod_state *m = hts_base_mod_state_alloc();
49
    bam_parse_basemod2(b, m, HTS_MOD_REPORT_UNCHECKED);
50
    // Or: bam_parse_basemod(b, m), which is equiv to flags==0
51
    //... do something ...
52
    hts_base_mod_state_free(m);
53
54
In the default implicit MM coordinate system, any location not
55
reported is implicitly assumed to contain no modification.  We only
56
report the places we think are likely modified.
57
58
Some tools however only look for base modifications in particular
59
contexts, eg CpG islands.  Here we need to distinguish between
60
not-looked-for and looked-for-but-didn't-find.  These calls have an
61
explicit coordinate system, where we only know information about the
62
coordinates explicitly listed and everything else is considered to be
63
unverified.
64
65
By default we don't get reports on the other coordinates in an
66
explicit MM tag, but the HTS_MOD_REPORT_UNCHECKED flag will report
67
them (with quality HTS_MOD_UNCHECKED) meaning we can do consensus
68
modification analysis with accurate counting when dealing with a
69
mixture of explicit and implicit records.
70
71
72
We have different ways of processing the base modifications.  We can
73
iterate either mod-by-mod or position-by-position, or we can simply
74
query a specific coordinate as may be done when processing a pileup.
75
76
To check for base modifications as a specific location within a
77
sequence we can use bam_mods_at_qpos.  This provides complete random
78
access within the MM string.  However currently this is inefficiently
79
implemented so should only be used for occasional analysis or as a way
80
to start iterating at a specific location.  It modifies the state
81
position, so after the first use we can then switch to
82
bam_mods_at_next_pos to iterate position by position from then on.
83
84
    hts_base_mod mods[10];
85
    int n = bam_mods_at_qpos(b, pos, m, mods, 10);
86
87
For base by base, we have bam_mods_at_next_pos.  This strictly starts
88
at the first base and reports entries one at a time.  It's more
89
efficient than a loop repeatedly calling ...at-pos.
90
91
    hts_base_mod mods[10];
92
    int n = bam_mods_at_next_pos(b, m, mods, 10);
93
    for (int i = 0; i < n; i++) {
94
        // report mod i of n
95
    }
96
97
Iterating over modifications instead of coordinates is simpler and
98
more efficient as it skips reporting of unmodified bases.  This is
99
done with bam_next_basemod.
100
101
    hts_base_mod mods[10];
102
    while ((n=bam_next_basemod(b, m, mods, 10, &pos)) > 0) {
103
        for (j = 0; j < n; j++) {
104
            // Report 'n'th mod at sequence position 'pos'
105
        }
106
    }
107
108
There are also functions that query meta-data about the MM line rather
109
than per-site information.
110
111
bam_mods_recorded returns an array of ints holding the +ve code ('m')
112
or -ve CHEBI numeric values.
113
114
    int ntypes, *types = bam_mods_recorded(m, &ntype);
115
116
We can then query a specific modification type to get further
117
information on the strand it is operating on, whether it has implicit
118
or explicit coordinates, and what it's corresponding canonical base it
119
is (The "C" in "C+m").  bam_mods_query_type does this by code name,
120
while bam_mods_queryi does this by numeric i^{th} type (from 0 to ntype-1).
121
122
    bam_mods_query_type(m, 'c', &strand, &implicit, &canonical);
123
    bam_mods_queryi(m, 2, &strand, &implicit, &canonical);
124
125
*/
126
127
/*
128
 * Base modification are stored in MM/Mm tags as <mod_list> defined as
129
 *
130
 * <mod_list>        ::= <mod_chain><mod_list> | ""
131
 * <mod_chain>       ::= <canonical_base><strand><mod-list><delta-list>
132
 *
133
 * <canonical_base>  ::= "A" | "C" | "G" | "T" | "N".
134
 *
135
 * <strand>          ::= "+" | "-".
136
 *
137
 * <mod-list>        ::= <simple-mod-list> | <ChEBI-code>
138
 * <simple-mod-list> ::= <simple-mod><simple-mod-list> | <simple-mod>
139
 * <ChEBI-code>      ::= <integer>
140
 * <simple-mod>      ::= <letter>
141
 *
142
 * <delta-list>      ::= "," <integer> <delta-list> | ";"
143
 *
144
 * We do not allocate additional memory other than the fixed size
145
 * state, thus we track up to 256 pointers to different locations
146
 * within the MM and ML tags.  Each pointer is for a distinct
147
 * modification code (simple or ChEBI), meaning some may point to the
148
 * same delta-list when multiple codes are combined together
149
 * (e.g. "C+mh,1,5,18,3;").  This is the MM[] array.
150
 *
151
 * Each numeric in the delta-list is tracked in MMcount[], counted
152
 * down until it hits zero in which case the next delta is fetched.
153
 *
154
 * ML array similarly holds the locations in the quality (ML) tag per
155
 * type, but these are interleaved so C+mhfc,10,15 will have 4 types
156
 * all pointing to the same delta position, but in ML we store
157
 * Q(m0)Q(h0)Q(f0)Q(c0) followed by Q(m1)Q(h1)Q(f1)Q(c1).  This ML
158
 * also has MLstride indicating how many positions along ML to jump
159
 * each time we consume a base. (4 in our above example, but usually 1
160
 * for the simple case).
161
 *
162
 * One complexity of the base modification system is that mods are
163
 * always stored in the original DNA orientation.  This is so that
164
 * tools that may reverse-complement a sequence (eg "samtools fastq -T
165
 * MM,ML") can pass through these modification tags irrespective of
166
 * whether they have any knowledge of their internal workings.
167
 *
168
 * Because we don't wish to allocate extra memory, we cannot simply
169
 * reverse the MM and ML tags.  Sadly this means we have to manage the
170
 * reverse complementing ourselves on-the-fly.
171
 * For reversed reads we start at the right end of MM and no longer
172
 * stop at the semicolon.  Instead we use MMend[] array to mark the
173
 * termination point.
174
 */
175
0
#define MAX_BASE_MOD 256
176
struct hts_base_mod_state {
177
    int type[MAX_BASE_MOD];     // char or minus-CHEBI
178
    int canonical[MAX_BASE_MOD];// canonical base, as seqi (1,2,4,8,15)
179
    char strand[MAX_BASE_MOD];  // strand of modification; + or -
180
    int MMcount[MAX_BASE_MOD];  // no. canonical bases left until next mod
181
    char *MM[MAX_BASE_MOD];     // next pos delta (string)
182
    char *MMend[MAX_BASE_MOD];  // end of pos-delta string
183
    uint8_t *ML[MAX_BASE_MOD];  // next qual
184
    int MLstride[MAX_BASE_MOD]; // bytes between quals for this type
185
    int implicit[MAX_BASE_MOD]; // treat unlisted positions as non-modified?
186
    int seq_pos;                // current position along sequence
187
    int nmods;                  // used array size (0 to MAX_BASE_MOD-1).
188
    uint32_t flags;             // Bit-field: see HTS_MOD_REPORT_UNCHECKED
189
};
190
191
0
hts_base_mod_state *hts_base_mod_state_alloc(void) {
192
0
    return calloc(1, sizeof(hts_base_mod_state));
193
0
}
194
195
0
void hts_base_mod_state_free(hts_base_mod_state *state) {
196
0
    free(state);
197
0
}
198
199
/*
200
 * Count frequency of A, C, G, T and N canonical bases in the sequence
201
 */
202
0
static void seq_freq(const bam1_t *b, int freq[16]) {
203
0
    int i;
204
205
0
    memset(freq, 0, 16*sizeof(*freq));
206
0
    uint8_t *seq = bam_get_seq(b);
207
0
    for (i = 0; i < b->core.l_qseq; i++)
208
0
        freq[bam_seqi(seq, i)]++;
209
0
    freq[15] = b->core.l_qseq; // all bases count as N for base mods
210
0
}
211
212
//0123456789ABCDEF
213
//=ACMGRSVTWYHKDBN  aka seq_nt16_str[]
214
//=TGKCYSBAWRDMHVN  comp1ement of seq_nt16_str
215
//084C2A6E195D3B7F
216
static int seqi_rc[] = { 0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15 };
217
218
/*
219
 * Parse the MM and ML tags to populate the base mod state.
220
 * This structure will have been previously allocated via
221
 * hts_base_mod_state_alloc, but it does not need to be repeatedly
222
 * freed and allocated for each new bam record. (Although obviously
223
 * it requires a new call to this function.)
224
 *
225
 * Flags are copied into the state and used to control reporting functions.
226
 * Currently the only flag is HTS_MOD_REPORT_UNCHECKED, to control whether
227
 * explicit "C+m?" mods report quality HTS_MOD_UNCHECKED for the bases
228
 * outside the explicitly reported region.
229
 */
230
int bam_parse_basemod2(const bam1_t *b, hts_base_mod_state *state,
231
0
                       uint32_t flags) {
232
    // Reset position, else upcoming calls may fail on
233
    // seq pos - length comparison
234
0
    state->seq_pos = 0;
235
0
    state->nmods = 0;
236
0
    state->flags = flags;
237
238
    // Read MM and ML tags
239
0
    uint8_t *mm = bam_aux_get(b, "MM");
240
0
    if (!mm) mm = bam_aux_get(b, "Mm");
241
0
    if (!mm)
242
0
        return 0;
243
0
    if (mm[0] != 'Z') {
244
0
        hts_log_error("%s: MM tag is not of type Z", bam_get_qname(b));
245
0
        return -1;
246
0
    }
247
248
0
    uint8_t *mi = bam_aux_get(b, "MN");
249
0
    if (mi && bam_aux2i(mi) != b->core.l_qseq && b->core.l_qseq) {
250
        // bam_aux2i with set errno = EINVAL and return 0 if the tag
251
        // isn't integer, but 0 will be a seq-length mismatch anyway so
252
        // triggers an error here too.
253
0
        hts_log_error("%s: MM/MN data length is incompatible with"
254
0
                      " SEQ length", bam_get_qname(b));
255
0
        return -1;
256
0
    }
257
258
0
    uint8_t *ml = bam_aux_get(b, "ML");
259
0
    if (!ml) ml = bam_aux_get(b, "Ml");
260
0
    if (ml && (ml[0] != 'B' || ml[1] != 'C')) {
261
0
        hts_log_error("%s: ML tag is not of type B,C", bam_get_qname(b));
262
0
        return -1;
263
0
    }
264
0
    uint8_t *ml_end = ml ? ml+6 + le_to_u32(ml+2) : NULL;
265
0
    if (ml) ml += 6;
266
267
    // Aggregate freqs of ACGTN if reversed, to get final-delta (later)
268
0
    int freq[16];
269
0
    if (b->core.flag & BAM_FREVERSE)
270
0
        seq_freq(b, freq);
271
272
0
    char *cp = (char *)mm+1;
273
0
    int mod_num = 0;
274
0
    int implicit = 1;
275
0
    while (*cp) {
276
0
        for (; *cp; cp++) {
277
            // cp should be [ACGTNU][+-]([a-zA-Z]+|[0-9]+)[.?]?(,\d+)*;
278
0
            unsigned char btype = *cp++;
279
280
0
            if (btype != 'A' && btype != 'C' &&
281
0
                btype != 'G' && btype != 'T' &&
282
0
                btype != 'U' && btype != 'N')
283
0
                return -1;
284
0
            if (btype == 'U') btype = 'T';
285
286
0
            btype = seq_nt16_table[btype];
287
288
            // Strand
289
0
            if (*cp != '+' && *cp != '-')
290
0
                return -1; // malformed
291
0
            char strand = *cp++;
292
293
            // List of modification types
294
0
            char *ms = cp, *me; // mod code start and end
295
0
            char *cp_end = NULL;
296
0
            int chebi = 0;
297
0
            if (isdigit_c(*cp)) {
298
0
                chebi = strtol(cp, &cp_end, 10);
299
0
                cp = cp_end;
300
0
                ms = cp-1;
301
0
            } else {
302
0
                while (*cp && isalpha_c(*cp))
303
0
                    cp++;
304
0
                if (*cp == '\0')
305
0
                    return -1;
306
0
            }
307
308
0
            me = cp;
309
310
            // Optional explicit vs implicit marker
311
0
            implicit = 1;
312
0
            if (*cp == '.') {
313
                // default is implicit = 1;
314
0
                cp++;
315
0
            } else if (*cp == '?') {
316
0
                implicit = 0;
317
0
                cp++;
318
0
            } else if (*cp != ',' && *cp != ';') {
319
                // parse error
320
0
                return -1;
321
0
            }
322
323
0
            long delta;
324
0
            int n = 0; // nth symbol in a multi-mod string
325
0
            int stride = me-ms;
326
0
            int ndelta = 0;
327
328
0
            if (b->core.flag & BAM_FREVERSE) {
329
                // We process the sequence in left to right order,
330
                // but delta is successive count of bases to skip
331
                // counting right to left.  This also means the number
332
                // of bases to skip at left edge is unrecorded (as it's
333
                // the remainder).
334
                //
335
                // To output mods in left to right, we step through the
336
                // MM list in reverse and need to identify the left-end
337
                // "remainder" delta.
338
0
                int total_seq = 0;
339
0
                for (;;) {
340
0
                    cp += (*cp == ',');
341
0
                    if (*cp == 0 || *cp == ';')
342
0
                        break;
343
344
0
                    delta = strtol(cp, &cp_end, 10);
345
0
                    if (cp_end == cp) {
346
0
                        hts_log_error("%s: Hit end of MM tag. Missing "
347
0
                                      "semicolon?", bam_get_qname(b));
348
0
                        return -1;
349
0
                    }
350
351
0
                    cp = cp_end;
352
0
                    total_seq += delta+1;
353
0
                    ndelta++;
354
0
                }
355
0
                delta = freq[seqi_rc[btype]] - total_seq; // remainder
356
0
            } else {
357
0
                delta = *cp == ','
358
0
                    ? strtol(cp+1, &cp_end, 10)
359
0
                    : 0;
360
0
                if (!cp_end) {
361
                    // empty list
362
0
                    delta = INT_MAX;
363
0
                    cp_end = cp;
364
0
                }
365
0
            }
366
            // Now delta is first in list or computed remainder,
367
            // and cp_end is either start or end of the MM list.
368
0
            while (ms < me) {
369
0
                state->type     [mod_num] = chebi ? -chebi : *ms;
370
0
                state->strand   [mod_num] = (strand == '-');
371
0
                state->canonical[mod_num] = btype;
372
0
                state->MLstride [mod_num] = stride;
373
0
                state->implicit [mod_num] = implicit;
374
375
0
                if (delta < 0) {
376
0
                    hts_log_error("%s: MM tag refers to bases beyond sequence "
377
0
                                  "length", bam_get_qname(b));
378
0
                    return -1;
379
0
                }
380
0
                state->MMcount  [mod_num] = delta;
381
0
                if (b->core.flag & BAM_FREVERSE) {
382
0
                    state->MM   [mod_num] = me+1;
383
0
                    state->MMend[mod_num] = cp_end;
384
0
                    state->ML   [mod_num] = ml ? ml+n +(ndelta-1)*stride: NULL;
385
0
                } else {
386
0
                    state->MM   [mod_num] = cp_end;
387
0
                    state->MMend[mod_num] = NULL;
388
0
                    state->ML   [mod_num] = ml ? ml+n : NULL;
389
0
                }
390
391
0
                if (++mod_num >= MAX_BASE_MOD) {
392
0
                    hts_log_error("%s: Too many base modification types",
393
0
                                  bam_get_qname(b));
394
0
                    return -1;
395
0
                }
396
0
                ms++; n++;
397
0
            }
398
399
            // Skip modification deltas
400
0
            if (ml) {
401
0
                if (b->core.flag & BAM_FREVERSE) {
402
0
                    ml += ndelta*stride;
403
0
                } else {
404
0
                    while (*cp && *cp != ';') {
405
0
                        if (*cp == ',')
406
0
                            ml+=stride;
407
0
                        cp++;
408
0
                    }
409
0
                }
410
0
                if (ml > ml_end) {
411
0
                    hts_log_error("%s: Insufficient number of entries in ML "
412
0
                                  "tag", bam_get_qname(b));
413
0
                    return -1;
414
0
                }
415
0
            } else {
416
                // cp_end already known if FREVERSE
417
0
                if (cp_end && (b->core.flag & BAM_FREVERSE))
418
0
                    cp = cp_end;
419
0
                else
420
0
                    while (*cp && *cp != ';')
421
0
                        cp++;
422
0
            }
423
0
            if (!*cp) {
424
0
                hts_log_error("%s: Hit end of MM tag. Missing semicolon?",
425
0
                              bam_get_qname(b));
426
0
                return -1;
427
0
            }
428
0
        }
429
0
    }
430
0
    if (ml && ml != ml_end) {
431
0
        hts_log_error("%s: Too many entries in ML tag", bam_get_qname(b));
432
0
        return -1;
433
0
    }
434
435
0
    state->nmods = mod_num;
436
437
0
    return 0;
438
0
}
439
440
0
int bam_parse_basemod(const bam1_t *b, hts_base_mod_state *state) {
441
0
    return bam_parse_basemod2(b, state, 0);
442
0
}
443
444
/*
445
 * Fills out mods[] with the base modifications found.
446
 * Returns the number found (0 if none), which may be more than
447
 * the size of n_mods if more were found than reported.
448
 * Returns <= -1 on error.
449
 *
450
 * This always marches left to right along sequence, irrespective of
451
 * reverse flag or modification strand.
452
 */
453
int bam_mods_at_next_pos(const bam1_t *b, hts_base_mod_state *state,
454
0
                         hts_base_mod *mods, int n_mods) {
455
0
    if (b->core.flag & BAM_FREVERSE) {
456
0
        if (state->seq_pos < 0)
457
0
            return -1;
458
0
    } else {
459
0
        if (state->seq_pos >= b->core.l_qseq)
460
0
            return -1;
461
0
    }
462
463
0
    int i, j, n = 0;
464
0
    unsigned char base = bam_seqi(bam_get_seq(b), state->seq_pos);
465
0
    state->seq_pos++;
466
0
    if (b->core.flag & BAM_FREVERSE)
467
0
        base = seqi_rc[base];
468
469
0
    for (i = 0; i < state->nmods; i++) {
470
0
        int unchecked = 0;
471
0
        if (state->canonical[i] != base && state->canonical[i] != 15/*N*/)
472
0
            continue;
473
474
0
        if (state->MMcount[i]-- > 0) {
475
0
            if (!state->implicit[i] &&
476
0
                (state->flags & HTS_MOD_REPORT_UNCHECKED))
477
0
                unchecked = 1;
478
0
            else
479
0
                continue;
480
0
        }
481
482
0
        char *MMptr = state->MM[i];
483
0
        if (n < n_mods) {
484
0
            mods[n].modified_base = state->type[i];
485
0
            mods[n].canonical_base = seq_nt16_str[state->canonical[i]];
486
0
            mods[n].strand = state->strand[i];
487
0
            mods[n].qual = unchecked
488
0
                ? HTS_MOD_UNCHECKED
489
0
                : (state->ML[i] ? *state->ML[i] : HTS_MOD_UNKNOWN);
490
0
        }
491
0
        n++;
492
493
0
        if (unchecked)
494
0
            continue;
495
496
0
        if (state->ML[i])
497
0
            state->ML[i] += (b->core.flag & BAM_FREVERSE)
498
0
                ? -state->MLstride[i]
499
0
                : +state->MLstride[i];
500
501
0
        if (b->core.flag & BAM_FREVERSE) {
502
            // process MM list backwards
503
0
            char *cp;
504
0
            if (state->MMend[i]-1 < state->MM[i]) {
505
                // Should be impossible to hit if coding is correct
506
0
                hts_log_error("Assert failed while processing base modification states");
507
0
                return -1;
508
0
            }
509
0
            for (cp = state->MMend[i]-1; cp != state->MM[i]; cp--)
510
0
                if (*cp == ',')
511
0
                    break;
512
0
            state->MMend[i] = cp;
513
0
            if (cp != state->MM[i])
514
0
                state->MMcount[i] = strtol(cp+1, NULL, 10);
515
0
            else
516
0
                state->MMcount[i] = INT_MAX;
517
0
        } else {
518
0
            if (*state->MM[i] == ',')
519
0
                state->MMcount[i] = strtol(state->MM[i]+1, &state->MM[i], 10);
520
0
            else
521
0
                state->MMcount[i] = INT_MAX;
522
0
        }
523
524
        // Multiple mods at the same coords.
525
0
        for (j=i+1; j < state->nmods && state->MM[j] == MMptr; j++) {
526
0
            if (n < n_mods) {
527
0
                mods[n].modified_base = state->type[j];
528
0
                mods[n].canonical_base = seq_nt16_str[state->canonical[j]];
529
0
                mods[n].strand = state->strand[j];
530
0
                mods[n].qual = state->ML[j] ? *state->ML[j] : -1;
531
0
            }
532
0
            n++;
533
0
            state->MMcount[j] = state->MMcount[i];
534
0
            state->MM[j]      = state->MM[i];
535
0
            if (state->ML[j])
536
0
                state->ML[j] += (b->core.flag & BAM_FREVERSE)
537
0
                    ? -state->MLstride[j]
538
0
                    : +state->MLstride[j];
539
0
        }
540
0
        i = j-1;
541
0
    }
542
543
0
    return n;
544
0
}
545
546
/*
547
 * Return data at the next modified location.
548
 *
549
 * bam_mods_at_next_pos does quite a bit of work, so we don't want to
550
 * repeatedly call it for every location until we find a mod.  Instead
551
 * we check how many base types we can consume before the next mod,
552
 * and scan through the sequence looking for them.  Once we're at that
553
 * site, we defer back to bam_mods_at_next_pos for the return values.
554
 */
555
int bam_next_basemod(const bam1_t *b, hts_base_mod_state *state,
556
0
                     hts_base_mod *mods, int n_mods, int *pos) {
557
    // Look through state->MMcount arrays to see when the next lowest is
558
    // per base type;
559
0
    int next[16], freq[16] = {0}, i;
560
0
    memset(next, 0x7f, 16*sizeof(*next));
561
0
    const int unchecked = state->flags & HTS_MOD_REPORT_UNCHECKED;
562
0
    if (b->core.flag & BAM_FREVERSE) {
563
0
        for (i = 0; i < state->nmods; i++) {
564
0
            if (unchecked && !state->implicit[i])
565
0
                next[seqi_rc[state->canonical[i]]] = 1;
566
0
            else if (next[seqi_rc[state->canonical[i]]] > state->MMcount[i])
567
0
                next[seqi_rc[state->canonical[i]]] = state->MMcount[i];
568
0
        }
569
0
    } else {
570
0
        for (i = 0; i < state->nmods; i++) {
571
0
            if (unchecked && !state->implicit[i])
572
0
                next[state->canonical[i]] = 0;
573
0
            else if (next[state->canonical[i]] > state->MMcount[i])
574
0
                next[state->canonical[i]] = state->MMcount[i];
575
0
       }
576
0
    }
577
578
    // Now step through the sequence counting off base types.
579
0
    for (i = state->seq_pos; i < b->core.l_qseq; i++) {
580
0
        unsigned char bc = bam_seqi(bam_get_seq(b), i);
581
0
        if (next[bc] <= freq[bc] || next[15] <= freq[15])
582
0
            break;
583
0
        freq[bc]++;
584
0
        if (bc != 15) // N
585
0
            freq[15]++;
586
0
    }
587
0
    *pos = state->seq_pos = i;
588
589
0
    if (b->core.flag & BAM_FREVERSE) {
590
0
        for (i = 0; i < state->nmods; i++)
591
0
            state->MMcount[i] -= freq[seqi_rc[state->canonical[i]]];
592
0
    } else {
593
0
        for (i = 0; i < state->nmods; i++)
594
0
            state->MMcount[i] -= freq[state->canonical[i]];
595
0
    }
596
597
0
    if (b->core.l_qseq && state->seq_pos >= b->core.l_qseq &&
598
0
        !(b->core.flag & BAM_FREVERSE)) {
599
        // Spots +ve orientation run-overs.
600
        // The -ve orientation is spotted in bam_parse_basemod2
601
0
        int i;
602
0
        for (i = 0; i < state->nmods; i++) {
603
            // Check if any remaining items in MM after hitting the end
604
            // of the sequence.
605
0
            if (state->MMcount[i] < 0x7f000000 ||
606
0
                (*state->MM[i]!=0 && *state->MM[i]!=';')) {
607
0
                hts_log_warning("MM tag refers to bases beyond sequence length");
608
0
                return -1;
609
0
            }
610
0
        }
611
0
        return 0;
612
0
    }
613
614
0
    int r = bam_mods_at_next_pos(b, state, mods, n_mods);
615
0
    return r > 0 ? r : 0;
616
0
}
617
618
/*
619
 * As per bam_mods_at_next_pos, but at a specific qpos >= the previous qpos.
620
 * This can only march forwards along the read, but can do so by more than
621
 * one base-pair.
622
 *
623
 * This makes it useful for calling from pileup iterators where qpos may
624
 * start part way through a read for the first occurrence of that record.
625
 */
626
int bam_mods_at_qpos(const bam1_t *b, int qpos, hts_base_mod_state *state,
627
0
                    hts_base_mod *mods, int n_mods) {
628
    // FIXME: for now this is inefficient in implementation.
629
0
    int r = 0;
630
0
    while (state->seq_pos <= qpos)
631
0
        if ((r = bam_mods_at_next_pos(b, state, mods, n_mods)) < 0)
632
0
            break;
633
634
0
    return r;
635
0
}
636
637
/*
638
 * Returns the list of base modification codes provided for this
639
 * alignment record as an array of character codes (+ve) or ChEBI numbers
640
 * (negative).
641
 *
642
 * Returns the array, with *ntype filled out with the size.
643
 *         The array returned should not be freed.
644
 *         It is a valid pointer until the state is freed using
645
 *         hts_base_mod_free().
646
 */
647
0
int *bam_mods_recorded(hts_base_mod_state *state, int *ntype) {
648
0
    *ntype = state->nmods;
649
0
    return state->type;
650
0
}
651
652
/*
653
 * Returns data about a specific modification type for the alignment record.
654
 * Code is either positive (eg 'm') or negative for ChEBI numbers.
655
 *
656
 * Return 0 on success or -1 if not found.  The strand, implicit and canonical
657
 * fields are filled out if passed in as non-NULL pointers.
658
 */
659
int bam_mods_query_type(hts_base_mod_state *state, int code,
660
0
                        int *strand, int *implicit, char *canonical) {
661
    // Find code entry
662
0
    int i;
663
0
    for (i = 0; i < state->nmods; i++) {
664
0
        if (state->type[i] == code)
665
0
            break;
666
0
    }
667
0
    if (i == state->nmods)
668
0
        return -1;
669
670
    // Return data
671
0
    if (strand)    *strand    = state->strand[i];
672
0
    if (implicit)  *implicit  = state->implicit[i];
673
0
    if (canonical) *canonical = "?AC?G???T??????N"[state->canonical[i]];
674
675
0
    return 0;
676
0
}
677
678
/*
679
 * Returns data about the ith modification type for the alignment record.
680
 *
681
 * Return 0 on success or -1 if not found.  The strand, implicit and canonical
682
 * fields are filled out if passed in as non-NULL pointers.
683
 */
684
int bam_mods_queryi(hts_base_mod_state *state, int i,
685
0
                    int *strand, int *implicit, char *canonical) {
686
0
    if (i < 0 || i >= state->nmods)
687
0
        return -1;
688
689
    // Return data
690
0
    if (strand)    *strand    = state->strand[i];
691
0
    if (implicit)  *implicit  = state->implicit[i];
692
0
    if (canonical) *canonical = "?AC?G???T??????N"[state->canonical[i]];
693
694
0
    return 0;
695
0
}