Line | Count | Source (jump to first uncovered line) |
1 | | /* sam_mods.c -- Base modification handling in SAM and BAM. |
2 | | |
3 | | Copyright (C) 2020-2024 Genome Research Ltd. |
4 | | |
5 | | Author: James Bonfield <jkb@sanger.ac.uk> |
6 | | |
7 | | Permission is hereby granted, free of charge, to any person obtaining a copy |
8 | | of this software and associated documentation files (the "Software"), to deal |
9 | | in the Software without restriction, including without limitation the rights |
10 | | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
11 | | copies of the Software, and to permit persons to whom the Software is |
12 | | furnished to do so, subject to the following conditions: |
13 | | |
14 | | The above copyright notice and this permission notice shall be included in |
15 | | all copies or substantial portions of the Software. |
16 | | |
17 | | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
18 | | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
19 | | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
20 | | THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
21 | | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
22 | | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER |
23 | | DEALINGS IN THE SOFTWARE. */ |
24 | | |
25 | | #define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h |
26 | | #include <config.h> |
27 | | #include <assert.h> |
28 | | |
29 | | #include "htslib/sam.h" |
30 | | #include "textutils_internal.h" |
31 | | |
32 | | // --------------------------- |
33 | | // Base Modification retrieval |
34 | | // |
35 | | // These operate by recording state in an opaque type, allocated and freed |
36 | | // via the functions below. |
37 | | // |
38 | | // Initially we call bam_parse_basemod to process the tags and record the |
39 | | // modifications in the state structure, and then functions such as |
40 | | // bam_next_basemod can iterate over this cached state. |
41 | | |
42 | | /* Overview of API. |
43 | | |
44 | | We start by allocating an hts_base_mod_state and parsing the MM, ML and MN |
45 | | tags into it. This has optional flags controlling how we report base |
46 | | modifications in "explicit" coordinates. See below |
47 | | |
48 | | hts_base_mod_state *m = hts_base_mod_state_alloc(); |
49 | | bam_parse_basemod2(b, m, HTS_MOD_REPORT_UNCHECKED); |
50 | | // Or: bam_parse_basemod(b, m), which is equiv to flags==0 |
51 | | //... do something ... |
52 | | hts_base_mod_state_free(m); |
53 | | |
54 | | In the default implicit MM coordinate system, any location not |
55 | | reported is implicitly assumed to contain no modification. We only |
56 | | report the places we think are likely modified. |
57 | | |
58 | | Some tools however only look for base modifications in particular |
59 | | contexts, eg CpG islands. Here we need to distinguish between |
60 | | not-looked-for and looked-for-but-didn't-find. These calls have an |
61 | | explicit coordinate system, where we only know information about the |
62 | | coordinates explicitly listed and everything else is considered to be |
63 | | unverified. |
64 | | |
65 | | By default we don't get reports on the other coordinates in an |
66 | | explicit MM tag, but the HTS_MOD_REPORT_UNCHECKED flag will report |
67 | | them (with quality HTS_MOD_UNCHECKED) meaning we can do consensus |
68 | | modification analysis with accurate counting when dealing with a |
69 | | mixture of explicit and implicit records. |
70 | | |
71 | | |
72 | | We have different ways of processing the base modifications. We can |
73 | | iterate either mod-by-mod or position-by-position, or we can simply |
74 | | query a specific coordinate as may be done when processing a pileup. |
75 | | |
76 | | To check for base modifications as a specific location within a |
77 | | sequence we can use bam_mods_at_qpos. This provides complete random |
78 | | access within the MM string. However currently this is inefficiently |
79 | | implemented so should only be used for occasional analysis or as a way |
80 | | to start iterating at a specific location. It modifies the state |
81 | | position, so after the first use we can then switch to |
82 | | bam_mods_at_next_pos to iterate position by position from then on. |
83 | | |
84 | | hts_base_mod mods[10]; |
85 | | int n = bam_mods_at_qpos(b, pos, m, mods, 10); |
86 | | |
87 | | For base by base, we have bam_mods_at_next_pos. This strictly starts |
88 | | at the first base and reports entries one at a time. It's more |
89 | | efficient than a loop repeatedly calling ...at-pos. |
90 | | |
91 | | hts_base_mod mods[10]; |
92 | | int n = bam_mods_at_next_pos(b, m, mods, 10); |
93 | | for (int i = 0; i < n; i++) { |
94 | | // report mod i of n |
95 | | } |
96 | | |
97 | | Iterating over modifications instead of coordinates is simpler and |
98 | | more efficient as it skips reporting of unmodified bases. This is |
99 | | done with bam_next_basemod. |
100 | | |
101 | | hts_base_mod mods[10]; |
102 | | while ((n=bam_next_basemod(b, m, mods, 10, &pos)) > 0) { |
103 | | for (j = 0; j < n; j++) { |
104 | | // Report 'n'th mod at sequence position 'pos' |
105 | | } |
106 | | } |
107 | | |
108 | | There are also functions that query meta-data about the MM line rather |
109 | | than per-site information. |
110 | | |
111 | | bam_mods_recorded returns an array of ints holding the +ve code ('m') |
112 | | or -ve CHEBI numeric values. |
113 | | |
114 | | int ntypes, *types = bam_mods_recorded(m, &ntype); |
115 | | |
116 | | We can then query a specific modification type to get further |
117 | | information on the strand it is operating on, whether it has implicit |
118 | | or explicit coordinates, and what it's corresponding canonical base it |
119 | | is (The "C" in "C+m"). bam_mods_query_type does this by code name, |
120 | | while bam_mods_queryi does this by numeric i^{th} type (from 0 to ntype-1). |
121 | | |
122 | | bam_mods_query_type(m, 'c', &strand, &implicit, &canonical); |
123 | | bam_mods_queryi(m, 2, &strand, &implicit, &canonical); |
124 | | |
125 | | */ |
126 | | |
127 | | /* |
128 | | * Base modification are stored in MM/Mm tags as <mod_list> defined as |
129 | | * |
130 | | * <mod_list> ::= <mod_chain><mod_list> | "" |
131 | | * <mod_chain> ::= <canonical_base><strand><mod-list><delta-list> |
132 | | * |
133 | | * <canonical_base> ::= "A" | "C" | "G" | "T" | "N". |
134 | | * |
135 | | * <strand> ::= "+" | "-". |
136 | | * |
137 | | * <mod-list> ::= <simple-mod-list> | <ChEBI-code> |
138 | | * <simple-mod-list> ::= <simple-mod><simple-mod-list> | <simple-mod> |
139 | | * <ChEBI-code> ::= <integer> |
140 | | * <simple-mod> ::= <letter> |
141 | | * |
142 | | * <delta-list> ::= "," <integer> <delta-list> | ";" |
143 | | * |
144 | | * We do not allocate additional memory other than the fixed size |
145 | | * state, thus we track up to 256 pointers to different locations |
146 | | * within the MM and ML tags. Each pointer is for a distinct |
147 | | * modification code (simple or ChEBI), meaning some may point to the |
148 | | * same delta-list when multiple codes are combined together |
149 | | * (e.g. "C+mh,1,5,18,3;"). This is the MM[] array. |
150 | | * |
151 | | * Each numeric in the delta-list is tracked in MMcount[], counted |
152 | | * down until it hits zero in which case the next delta is fetched. |
153 | | * |
154 | | * ML array similarly holds the locations in the quality (ML) tag per |
155 | | * type, but these are interleaved so C+mhfc,10,15 will have 4 types |
156 | | * all pointing to the same delta position, but in ML we store |
157 | | * Q(m0)Q(h0)Q(f0)Q(c0) followed by Q(m1)Q(h1)Q(f1)Q(c1). This ML |
158 | | * also has MLstride indicating how many positions along ML to jump |
159 | | * each time we consume a base. (4 in our above example, but usually 1 |
160 | | * for the simple case). |
161 | | * |
162 | | * One complexity of the base modification system is that mods are |
163 | | * always stored in the original DNA orientation. This is so that |
164 | | * tools that may reverse-complement a sequence (eg "samtools fastq -T |
165 | | * MM,ML") can pass through these modification tags irrespective of |
166 | | * whether they have any knowledge of their internal workings. |
167 | | * |
168 | | * Because we don't wish to allocate extra memory, we cannot simply |
169 | | * reverse the MM and ML tags. Sadly this means we have to manage the |
170 | | * reverse complementing ourselves on-the-fly. |
171 | | * For reversed reads we start at the right end of MM and no longer |
172 | | * stop at the semicolon. Instead we use MMend[] array to mark the |
173 | | * termination point. |
174 | | */ |
175 | 0 | #define MAX_BASE_MOD 256 |
176 | | struct hts_base_mod_state { |
177 | | int type[MAX_BASE_MOD]; // char or minus-CHEBI |
178 | | int canonical[MAX_BASE_MOD];// canonical base, as seqi (1,2,4,8,15) |
179 | | char strand[MAX_BASE_MOD]; // strand of modification; + or - |
180 | | int MMcount[MAX_BASE_MOD]; // no. canonical bases left until next mod |
181 | | char *MM[MAX_BASE_MOD]; // next pos delta (string) |
182 | | char *MMend[MAX_BASE_MOD]; // end of pos-delta string |
183 | | uint8_t *ML[MAX_BASE_MOD]; // next qual |
184 | | int MLstride[MAX_BASE_MOD]; // bytes between quals for this type |
185 | | int implicit[MAX_BASE_MOD]; // treat unlisted positions as non-modified? |
186 | | int seq_pos; // current position along sequence |
187 | | int nmods; // used array size (0 to MAX_BASE_MOD-1). |
188 | | uint32_t flags; // Bit-field: see HTS_MOD_REPORT_UNCHECKED |
189 | | }; |
190 | | |
191 | 0 | hts_base_mod_state *hts_base_mod_state_alloc(void) { |
192 | 0 | return calloc(1, sizeof(hts_base_mod_state)); |
193 | 0 | } |
194 | | |
195 | 0 | void hts_base_mod_state_free(hts_base_mod_state *state) { |
196 | 0 | free(state); |
197 | 0 | } |
198 | | |
199 | | /* |
200 | | * Count frequency of A, C, G, T and N canonical bases in the sequence |
201 | | */ |
202 | 0 | static void seq_freq(const bam1_t *b, int freq[16]) { |
203 | 0 | int i; |
204 | |
|
205 | 0 | memset(freq, 0, 16*sizeof(*freq)); |
206 | 0 | uint8_t *seq = bam_get_seq(b); |
207 | 0 | for (i = 0; i < b->core.l_qseq; i++) |
208 | 0 | freq[bam_seqi(seq, i)]++; |
209 | 0 | freq[15] = b->core.l_qseq; // all bases count as N for base mods |
210 | 0 | } |
211 | | |
212 | | //0123456789ABCDEF |
213 | | //=ACMGRSVTWYHKDBN aka seq_nt16_str[] |
214 | | //=TGKCYSBAWRDMHVN comp1ement of seq_nt16_str |
215 | | //084C2A6E195D3B7F |
216 | | static int seqi_rc[] = { 0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15 }; |
217 | | |
218 | | /* |
219 | | * Parse the MM and ML tags to populate the base mod state. |
220 | | * This structure will have been previously allocated via |
221 | | * hts_base_mod_state_alloc, but it does not need to be repeatedly |
222 | | * freed and allocated for each new bam record. (Although obviously |
223 | | * it requires a new call to this function.) |
224 | | * |
225 | | * Flags are copied into the state and used to control reporting functions. |
226 | | * Currently the only flag is HTS_MOD_REPORT_UNCHECKED, to control whether |
227 | | * explicit "C+m?" mods report quality HTS_MOD_UNCHECKED for the bases |
228 | | * outside the explicitly reported region. |
229 | | */ |
230 | | int bam_parse_basemod2(const bam1_t *b, hts_base_mod_state *state, |
231 | 0 | uint32_t flags) { |
232 | | // Reset position, else upcoming calls may fail on |
233 | | // seq pos - length comparison |
234 | 0 | state->seq_pos = 0; |
235 | 0 | state->nmods = 0; |
236 | 0 | state->flags = flags; |
237 | | |
238 | | // Read MM and ML tags |
239 | 0 | uint8_t *mm = bam_aux_get(b, "MM"); |
240 | 0 | if (!mm) mm = bam_aux_get(b, "Mm"); |
241 | 0 | if (!mm) |
242 | 0 | return 0; |
243 | 0 | if (mm[0] != 'Z') { |
244 | 0 | hts_log_error("%s: MM tag is not of type Z", bam_get_qname(b)); |
245 | 0 | return -1; |
246 | 0 | } |
247 | | |
248 | 0 | uint8_t *mi = bam_aux_get(b, "MN"); |
249 | 0 | if (mi && bam_aux2i(mi) != b->core.l_qseq && b->core.l_qseq) { |
250 | | // bam_aux2i with set errno = EINVAL and return 0 if the tag |
251 | | // isn't integer, but 0 will be a seq-length mismatch anyway so |
252 | | // triggers an error here too. |
253 | 0 | hts_log_error("%s: MM/MN data length is incompatible with" |
254 | 0 | " SEQ length", bam_get_qname(b)); |
255 | 0 | return -1; |
256 | 0 | } |
257 | | |
258 | 0 | uint8_t *ml = bam_aux_get(b, "ML"); |
259 | 0 | if (!ml) ml = bam_aux_get(b, "Ml"); |
260 | 0 | if (ml && (ml[0] != 'B' || ml[1] != 'C')) { |
261 | 0 | hts_log_error("%s: ML tag is not of type B,C", bam_get_qname(b)); |
262 | 0 | return -1; |
263 | 0 | } |
264 | 0 | uint8_t *ml_end = ml ? ml+6 + le_to_u32(ml+2) : NULL; |
265 | 0 | if (ml) ml += 6; |
266 | | |
267 | | // Aggregate freqs of ACGTN if reversed, to get final-delta (later) |
268 | 0 | int freq[16]; |
269 | 0 | if (b->core.flag & BAM_FREVERSE) |
270 | 0 | seq_freq(b, freq); |
271 | |
|
272 | 0 | char *cp = (char *)mm+1; |
273 | 0 | int mod_num = 0; |
274 | 0 | int implicit = 1; |
275 | 0 | while (*cp) { |
276 | 0 | for (; *cp; cp++) { |
277 | | // cp should be [ACGTNU][+-]([a-zA-Z]+|[0-9]+)[.?]?(,\d+)*; |
278 | 0 | unsigned char btype = *cp++; |
279 | |
|
280 | 0 | if (btype != 'A' && btype != 'C' && |
281 | 0 | btype != 'G' && btype != 'T' && |
282 | 0 | btype != 'U' && btype != 'N') |
283 | 0 | return -1; |
284 | 0 | if (btype == 'U') btype = 'T'; |
285 | |
|
286 | 0 | btype = seq_nt16_table[btype]; |
287 | | |
288 | | // Strand |
289 | 0 | if (*cp != '+' && *cp != '-') |
290 | 0 | return -1; // malformed |
291 | 0 | char strand = *cp++; |
292 | | |
293 | | // List of modification types |
294 | 0 | char *ms = cp, *me; // mod code start and end |
295 | 0 | char *cp_end = NULL; |
296 | 0 | int chebi = 0; |
297 | 0 | if (isdigit_c(*cp)) { |
298 | 0 | chebi = strtol(cp, &cp_end, 10); |
299 | 0 | cp = cp_end; |
300 | 0 | ms = cp-1; |
301 | 0 | } else { |
302 | 0 | while (*cp && isalpha_c(*cp)) |
303 | 0 | cp++; |
304 | 0 | if (*cp == '\0') |
305 | 0 | return -1; |
306 | 0 | } |
307 | | |
308 | 0 | me = cp; |
309 | | |
310 | | // Optional explicit vs implicit marker |
311 | 0 | implicit = 1; |
312 | 0 | if (*cp == '.') { |
313 | | // default is implicit = 1; |
314 | 0 | cp++; |
315 | 0 | } else if (*cp == '?') { |
316 | 0 | implicit = 0; |
317 | 0 | cp++; |
318 | 0 | } else if (*cp != ',' && *cp != ';') { |
319 | | // parse error |
320 | 0 | return -1; |
321 | 0 | } |
322 | | |
323 | 0 | long delta; |
324 | 0 | int n = 0; // nth symbol in a multi-mod string |
325 | 0 | int stride = me-ms; |
326 | 0 | int ndelta = 0; |
327 | |
|
328 | 0 | if (b->core.flag & BAM_FREVERSE) { |
329 | | // We process the sequence in left to right order, |
330 | | // but delta is successive count of bases to skip |
331 | | // counting right to left. This also means the number |
332 | | // of bases to skip at left edge is unrecorded (as it's |
333 | | // the remainder). |
334 | | // |
335 | | // To output mods in left to right, we step through the |
336 | | // MM list in reverse and need to identify the left-end |
337 | | // "remainder" delta. |
338 | 0 | int total_seq = 0; |
339 | 0 | for (;;) { |
340 | 0 | cp += (*cp == ','); |
341 | 0 | if (*cp == 0 || *cp == ';') |
342 | 0 | break; |
343 | | |
344 | 0 | delta = strtol(cp, &cp_end, 10); |
345 | 0 | if (cp_end == cp) { |
346 | 0 | hts_log_error("%s: Hit end of MM tag. Missing " |
347 | 0 | "semicolon?", bam_get_qname(b)); |
348 | 0 | return -1; |
349 | 0 | } |
350 | | |
351 | 0 | cp = cp_end; |
352 | 0 | total_seq += delta+1; |
353 | 0 | ndelta++; |
354 | 0 | } |
355 | 0 | delta = freq[seqi_rc[btype]] - total_seq; // remainder |
356 | 0 | } else { |
357 | 0 | delta = *cp == ',' |
358 | 0 | ? strtol(cp+1, &cp_end, 10) |
359 | 0 | : 0; |
360 | 0 | if (!cp_end) { |
361 | | // empty list |
362 | 0 | delta = INT_MAX; |
363 | 0 | cp_end = cp; |
364 | 0 | } |
365 | 0 | } |
366 | | // Now delta is first in list or computed remainder, |
367 | | // and cp_end is either start or end of the MM list. |
368 | 0 | while (ms < me) { |
369 | 0 | state->type [mod_num] = chebi ? -chebi : *ms; |
370 | 0 | state->strand [mod_num] = (strand == '-'); |
371 | 0 | state->canonical[mod_num] = btype; |
372 | 0 | state->MLstride [mod_num] = stride; |
373 | 0 | state->implicit [mod_num] = implicit; |
374 | |
|
375 | 0 | if (delta < 0) { |
376 | 0 | hts_log_error("%s: MM tag refers to bases beyond sequence " |
377 | 0 | "length", bam_get_qname(b)); |
378 | 0 | return -1; |
379 | 0 | } |
380 | 0 | state->MMcount [mod_num] = delta; |
381 | 0 | if (b->core.flag & BAM_FREVERSE) { |
382 | 0 | state->MM [mod_num] = me+1; |
383 | 0 | state->MMend[mod_num] = cp_end; |
384 | 0 | state->ML [mod_num] = ml ? ml+n +(ndelta-1)*stride: NULL; |
385 | 0 | } else { |
386 | 0 | state->MM [mod_num] = cp_end; |
387 | 0 | state->MMend[mod_num] = NULL; |
388 | 0 | state->ML [mod_num] = ml ? ml+n : NULL; |
389 | 0 | } |
390 | |
|
391 | 0 | if (++mod_num >= MAX_BASE_MOD) { |
392 | 0 | hts_log_error("%s: Too many base modification types", |
393 | 0 | bam_get_qname(b)); |
394 | 0 | return -1; |
395 | 0 | } |
396 | 0 | ms++; n++; |
397 | 0 | } |
398 | | |
399 | | // Skip modification deltas |
400 | 0 | if (ml) { |
401 | 0 | if (b->core.flag & BAM_FREVERSE) { |
402 | 0 | ml += ndelta*stride; |
403 | 0 | } else { |
404 | 0 | while (*cp && *cp != ';') { |
405 | 0 | if (*cp == ',') |
406 | 0 | ml+=stride; |
407 | 0 | cp++; |
408 | 0 | } |
409 | 0 | } |
410 | 0 | if (ml > ml_end) { |
411 | 0 | hts_log_error("%s: Insufficient number of entries in ML " |
412 | 0 | "tag", bam_get_qname(b)); |
413 | 0 | return -1; |
414 | 0 | } |
415 | 0 | } else { |
416 | | // cp_end already known if FREVERSE |
417 | 0 | if (cp_end && (b->core.flag & BAM_FREVERSE)) |
418 | 0 | cp = cp_end; |
419 | 0 | else |
420 | 0 | while (*cp && *cp != ';') |
421 | 0 | cp++; |
422 | 0 | } |
423 | 0 | if (!*cp) { |
424 | 0 | hts_log_error("%s: Hit end of MM tag. Missing semicolon?", |
425 | 0 | bam_get_qname(b)); |
426 | 0 | return -1; |
427 | 0 | } |
428 | 0 | } |
429 | 0 | } |
430 | 0 | if (ml && ml != ml_end) { |
431 | 0 | hts_log_error("%s: Too many entries in ML tag", bam_get_qname(b)); |
432 | 0 | return -1; |
433 | 0 | } |
434 | | |
435 | 0 | state->nmods = mod_num; |
436 | |
|
437 | 0 | return 0; |
438 | 0 | } |
439 | | |
440 | 0 | int bam_parse_basemod(const bam1_t *b, hts_base_mod_state *state) { |
441 | 0 | return bam_parse_basemod2(b, state, 0); |
442 | 0 | } |
443 | | |
444 | | /* |
445 | | * Fills out mods[] with the base modifications found. |
446 | | * Returns the number found (0 if none), which may be more than |
447 | | * the size of n_mods if more were found than reported. |
448 | | * Returns <= -1 on error. |
449 | | * |
450 | | * This always marches left to right along sequence, irrespective of |
451 | | * reverse flag or modification strand. |
452 | | */ |
453 | | int bam_mods_at_next_pos(const bam1_t *b, hts_base_mod_state *state, |
454 | 0 | hts_base_mod *mods, int n_mods) { |
455 | 0 | if (b->core.flag & BAM_FREVERSE) { |
456 | 0 | if (state->seq_pos < 0) |
457 | 0 | return -1; |
458 | 0 | } else { |
459 | 0 | if (state->seq_pos >= b->core.l_qseq) |
460 | 0 | return -1; |
461 | 0 | } |
462 | | |
463 | 0 | int i, j, n = 0; |
464 | 0 | unsigned char base = bam_seqi(bam_get_seq(b), state->seq_pos); |
465 | 0 | state->seq_pos++; |
466 | 0 | if (b->core.flag & BAM_FREVERSE) |
467 | 0 | base = seqi_rc[base]; |
468 | |
|
469 | 0 | for (i = 0; i < state->nmods; i++) { |
470 | 0 | int unchecked = 0; |
471 | 0 | if (state->canonical[i] != base && state->canonical[i] != 15/*N*/) |
472 | 0 | continue; |
473 | | |
474 | 0 | if (state->MMcount[i]-- > 0) { |
475 | 0 | if (!state->implicit[i] && |
476 | 0 | (state->flags & HTS_MOD_REPORT_UNCHECKED)) |
477 | 0 | unchecked = 1; |
478 | 0 | else |
479 | 0 | continue; |
480 | 0 | } |
481 | | |
482 | 0 | char *MMptr = state->MM[i]; |
483 | 0 | if (n < n_mods) { |
484 | 0 | mods[n].modified_base = state->type[i]; |
485 | 0 | mods[n].canonical_base = seq_nt16_str[state->canonical[i]]; |
486 | 0 | mods[n].strand = state->strand[i]; |
487 | 0 | mods[n].qual = unchecked |
488 | 0 | ? HTS_MOD_UNCHECKED |
489 | 0 | : (state->ML[i] ? *state->ML[i] : HTS_MOD_UNKNOWN); |
490 | 0 | } |
491 | 0 | n++; |
492 | |
|
493 | 0 | if (unchecked) |
494 | 0 | continue; |
495 | | |
496 | 0 | if (state->ML[i]) |
497 | 0 | state->ML[i] += (b->core.flag & BAM_FREVERSE) |
498 | 0 | ? -state->MLstride[i] |
499 | 0 | : +state->MLstride[i]; |
500 | |
|
501 | 0 | if (b->core.flag & BAM_FREVERSE) { |
502 | | // process MM list backwards |
503 | 0 | char *cp; |
504 | 0 | if (state->MMend[i]-1 < state->MM[i]) { |
505 | | // Should be impossible to hit if coding is correct |
506 | 0 | hts_log_error("Assert failed while processing base modification states"); |
507 | 0 | return -1; |
508 | 0 | } |
509 | 0 | for (cp = state->MMend[i]-1; cp != state->MM[i]; cp--) |
510 | 0 | if (*cp == ',') |
511 | 0 | break; |
512 | 0 | state->MMend[i] = cp; |
513 | 0 | if (cp != state->MM[i]) |
514 | 0 | state->MMcount[i] = strtol(cp+1, NULL, 10); |
515 | 0 | else |
516 | 0 | state->MMcount[i] = INT_MAX; |
517 | 0 | } else { |
518 | 0 | if (*state->MM[i] == ',') |
519 | 0 | state->MMcount[i] = strtol(state->MM[i]+1, &state->MM[i], 10); |
520 | 0 | else |
521 | 0 | state->MMcount[i] = INT_MAX; |
522 | 0 | } |
523 | | |
524 | | // Multiple mods at the same coords. |
525 | 0 | for (j=i+1; j < state->nmods && state->MM[j] == MMptr; j++) { |
526 | 0 | if (n < n_mods) { |
527 | 0 | mods[n].modified_base = state->type[j]; |
528 | 0 | mods[n].canonical_base = seq_nt16_str[state->canonical[j]]; |
529 | 0 | mods[n].strand = state->strand[j]; |
530 | 0 | mods[n].qual = state->ML[j] ? *state->ML[j] : -1; |
531 | 0 | } |
532 | 0 | n++; |
533 | 0 | state->MMcount[j] = state->MMcount[i]; |
534 | 0 | state->MM[j] = state->MM[i]; |
535 | 0 | if (state->ML[j]) |
536 | 0 | state->ML[j] += (b->core.flag & BAM_FREVERSE) |
537 | 0 | ? -state->MLstride[j] |
538 | 0 | : +state->MLstride[j]; |
539 | 0 | } |
540 | 0 | i = j-1; |
541 | 0 | } |
542 | | |
543 | 0 | return n; |
544 | 0 | } |
545 | | |
546 | | /* |
547 | | * Return data at the next modified location. |
548 | | * |
549 | | * bam_mods_at_next_pos does quite a bit of work, so we don't want to |
550 | | * repeatedly call it for every location until we find a mod. Instead |
551 | | * we check how many base types we can consume before the next mod, |
552 | | * and scan through the sequence looking for them. Once we're at that |
553 | | * site, we defer back to bam_mods_at_next_pos for the return values. |
554 | | */ |
555 | | int bam_next_basemod(const bam1_t *b, hts_base_mod_state *state, |
556 | 0 | hts_base_mod *mods, int n_mods, int *pos) { |
557 | | // Look through state->MMcount arrays to see when the next lowest is |
558 | | // per base type; |
559 | 0 | int next[16], freq[16] = {0}, i; |
560 | 0 | memset(next, 0x7f, 16*sizeof(*next)); |
561 | 0 | const int unchecked = state->flags & HTS_MOD_REPORT_UNCHECKED; |
562 | 0 | if (b->core.flag & BAM_FREVERSE) { |
563 | 0 | for (i = 0; i < state->nmods; i++) { |
564 | 0 | if (unchecked && !state->implicit[i]) |
565 | 0 | next[seqi_rc[state->canonical[i]]] = 1; |
566 | 0 | else if (next[seqi_rc[state->canonical[i]]] > state->MMcount[i]) |
567 | 0 | next[seqi_rc[state->canonical[i]]] = state->MMcount[i]; |
568 | 0 | } |
569 | 0 | } else { |
570 | 0 | for (i = 0; i < state->nmods; i++) { |
571 | 0 | if (unchecked && !state->implicit[i]) |
572 | 0 | next[state->canonical[i]] = 0; |
573 | 0 | else if (next[state->canonical[i]] > state->MMcount[i]) |
574 | 0 | next[state->canonical[i]] = state->MMcount[i]; |
575 | 0 | } |
576 | 0 | } |
577 | | |
578 | | // Now step through the sequence counting off base types. |
579 | 0 | for (i = state->seq_pos; i < b->core.l_qseq; i++) { |
580 | 0 | unsigned char bc = bam_seqi(bam_get_seq(b), i); |
581 | 0 | if (next[bc] <= freq[bc] || next[15] <= freq[15]) |
582 | 0 | break; |
583 | 0 | freq[bc]++; |
584 | 0 | if (bc != 15) // N |
585 | 0 | freq[15]++; |
586 | 0 | } |
587 | 0 | *pos = state->seq_pos = i; |
588 | |
|
589 | 0 | if (b->core.flag & BAM_FREVERSE) { |
590 | 0 | for (i = 0; i < state->nmods; i++) |
591 | 0 | state->MMcount[i] -= freq[seqi_rc[state->canonical[i]]]; |
592 | 0 | } else { |
593 | 0 | for (i = 0; i < state->nmods; i++) |
594 | 0 | state->MMcount[i] -= freq[state->canonical[i]]; |
595 | 0 | } |
596 | |
|
597 | 0 | if (b->core.l_qseq && state->seq_pos >= b->core.l_qseq && |
598 | 0 | !(b->core.flag & BAM_FREVERSE)) { |
599 | | // Spots +ve orientation run-overs. |
600 | | // The -ve orientation is spotted in bam_parse_basemod2 |
601 | 0 | int i; |
602 | 0 | for (i = 0; i < state->nmods; i++) { |
603 | | // Check if any remaining items in MM after hitting the end |
604 | | // of the sequence. |
605 | 0 | if (state->MMcount[i] < 0x7f000000 || |
606 | 0 | (*state->MM[i]!=0 && *state->MM[i]!=';')) { |
607 | 0 | hts_log_warning("MM tag refers to bases beyond sequence length"); |
608 | 0 | return -1; |
609 | 0 | } |
610 | 0 | } |
611 | 0 | return 0; |
612 | 0 | } |
613 | | |
614 | 0 | int r = bam_mods_at_next_pos(b, state, mods, n_mods); |
615 | 0 | return r > 0 ? r : 0; |
616 | 0 | } |
617 | | |
618 | | /* |
619 | | * As per bam_mods_at_next_pos, but at a specific qpos >= the previous qpos. |
620 | | * This can only march forwards along the read, but can do so by more than |
621 | | * one base-pair. |
622 | | * |
623 | | * This makes it useful for calling from pileup iterators where qpos may |
624 | | * start part way through a read for the first occurrence of that record. |
625 | | */ |
626 | | int bam_mods_at_qpos(const bam1_t *b, int qpos, hts_base_mod_state *state, |
627 | 0 | hts_base_mod *mods, int n_mods) { |
628 | | // FIXME: for now this is inefficient in implementation. |
629 | 0 | int r = 0; |
630 | 0 | while (state->seq_pos <= qpos) |
631 | 0 | if ((r = bam_mods_at_next_pos(b, state, mods, n_mods)) < 0) |
632 | 0 | break; |
633 | |
|
634 | 0 | return r; |
635 | 0 | } |
636 | | |
637 | | /* |
638 | | * Returns the list of base modification codes provided for this |
639 | | * alignment record as an array of character codes (+ve) or ChEBI numbers |
640 | | * (negative). |
641 | | * |
642 | | * Returns the array, with *ntype filled out with the size. |
643 | | * The array returned should not be freed. |
644 | | * It is a valid pointer until the state is freed using |
645 | | * hts_base_mod_free(). |
646 | | */ |
647 | 0 | int *bam_mods_recorded(hts_base_mod_state *state, int *ntype) { |
648 | 0 | *ntype = state->nmods; |
649 | 0 | return state->type; |
650 | 0 | } |
651 | | |
652 | | /* |
653 | | * Returns data about a specific modification type for the alignment record. |
654 | | * Code is either positive (eg 'm') or negative for ChEBI numbers. |
655 | | * |
656 | | * Return 0 on success or -1 if not found. The strand, implicit and canonical |
657 | | * fields are filled out if passed in as non-NULL pointers. |
658 | | */ |
659 | | int bam_mods_query_type(hts_base_mod_state *state, int code, |
660 | 0 | int *strand, int *implicit, char *canonical) { |
661 | | // Find code entry |
662 | 0 | int i; |
663 | 0 | for (i = 0; i < state->nmods; i++) { |
664 | 0 | if (state->type[i] == code) |
665 | 0 | break; |
666 | 0 | } |
667 | 0 | if (i == state->nmods) |
668 | 0 | return -1; |
669 | | |
670 | | // Return data |
671 | 0 | if (strand) *strand = state->strand[i]; |
672 | 0 | if (implicit) *implicit = state->implicit[i]; |
673 | 0 | if (canonical) *canonical = "?AC?G???T??????N"[state->canonical[i]]; |
674 | |
|
675 | 0 | return 0; |
676 | 0 | } |
677 | | |
678 | | /* |
679 | | * Returns data about the ith modification type for the alignment record. |
680 | | * |
681 | | * Return 0 on success or -1 if not found. The strand, implicit and canonical |
682 | | * fields are filled out if passed in as non-NULL pointers. |
683 | | */ |
684 | | int bam_mods_queryi(hts_base_mod_state *state, int i, |
685 | 0 | int *strand, int *implicit, char *canonical) { |
686 | 0 | if (i < 0 || i >= state->nmods) |
687 | 0 | return -1; |
688 | | |
689 | | // Return data |
690 | 0 | if (strand) *strand = state->strand[i]; |
691 | 0 | if (implicit) *implicit = state->implicit[i]; |
692 | 0 | if (canonical) *canonical = "?AC?G???T??????N"[state->canonical[i]]; |
693 | |
|
694 | 0 | return 0; |
695 | 0 | } |