Line | Count | Source (jump to first uncovered line) |
1 | | /* region.c -- Functions to create and free region lists |
2 | | |
3 | | Copyright (C) 2019 Genome Research Ltd. |
4 | | |
5 | | Author: Valeriu Ohan <vo2@sanger.ac.uk> |
6 | | |
7 | | Permission is hereby granted, free of charge, to any person obtaining a copy |
8 | | of this software and associated documentation files (the "Software"), to deal |
9 | | in the Software without restriction, including without limitation the rights |
10 | | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
11 | | copies of the Software, and to permit persons to whom the Software is |
12 | | furnished to do so, subject to the following conditions: |
13 | | |
14 | | The above copyright notice and this permission notice shall be included in |
15 | | all copies or substantial portions of the Software. |
16 | | |
17 | | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
18 | | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
19 | | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
20 | | THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
21 | | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
22 | | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER |
23 | | DEALINGS IN THE SOFTWARE. */ |
24 | | |
25 | | #define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h |
26 | | #include <config.h> |
27 | | |
28 | | #include "htslib/hts.h" |
29 | | #include "htslib/khash.h" |
30 | | |
31 | | typedef struct reglist |
32 | | { |
33 | | uint32_t n, m; |
34 | | hts_pair_pos_t *a; |
35 | | int tid; |
36 | | } reglist_t; |
37 | | |
38 | | KHASH_MAP_INIT_INT(reg, reglist_t) |
39 | | typedef kh_reg_t reghash_t; |
40 | | |
41 | | static int compare_hts_pair_pos_t (const void *av, const void *bv) |
42 | 0 | { |
43 | 0 | hts_pair_pos_t *a = (hts_pair_pos_t *) av; |
44 | 0 | hts_pair_pos_t *b = (hts_pair_pos_t *) bv; |
45 | 0 | if (a->beg < b->beg) return -1; |
46 | 0 | if (a->beg > b->beg) return 1; |
47 | 0 | if (a->end < b->end) return -1; |
48 | 0 | if (a->end > b->end) return 1; |
49 | | |
50 | 0 | return 0; |
51 | 0 | } |
52 | | |
53 | | #if 0 |
54 | | /** |
55 | | * Good to have around for debugging |
56 | | */ |
57 | | static void reg_print(reghash_t *h) { |
58 | | reglist_t *p; |
59 | | khint_t k; |
60 | | uint32_t i; |
61 | | khint32_t key; |
62 | | |
63 | | if (!h) { |
64 | | fprintf(stderr, "Hash table is empty!\n"); |
65 | | return; |
66 | | } |
67 | | for (k = kh_begin(h); k < kh_end(h); k++) { |
68 | | if (kh_exist(h,k)) { |
69 | | key = kh_key(h,k); |
70 | | fprintf(stderr, "Region: key %u tid %d\n", key, p->tid); |
71 | | if ((p = &kh_val(h,k)) != NULL && p->n > 0) { |
72 | | for (i=0; i<p->n; i++) { |
73 | | fprintf(stderr, "\tinterval[%d]: %"PRIhts_pos"-%"PRIhts_pos"\n", i, |
74 | | p->a[i].beg, p->a[i].end); |
75 | | } |
76 | | } else { |
77 | | fprintf(stderr, "Region key %u has no intervals!\n", key); |
78 | | } |
79 | | } |
80 | | } |
81 | | } |
82 | | #endif |
83 | | |
84 | | /** |
85 | | * Sort and merge overlapping or adjacent intervals. |
86 | | */ |
87 | 0 | static int reg_compact(reghash_t *h) { |
88 | 0 | khint_t i; |
89 | 0 | uint32_t j, new_n; |
90 | 0 | reglist_t *p; |
91 | 0 | int count = 0; |
92 | |
|
93 | 0 | if (!h) |
94 | 0 | return 0; |
95 | | |
96 | 0 | for (i = kh_begin(h); i < kh_end(h); i++) { |
97 | 0 | if (!kh_exist(h,i) || !(p = &kh_val(h,i)) || !(p->n)) |
98 | 0 | continue; |
99 | | |
100 | 0 | qsort(p->a, p->n, sizeof(p->a[0]), compare_hts_pair_pos_t); |
101 | 0 | for (new_n = 0, j = 1; j < p->n; j++) { |
102 | 0 | if (p->a[new_n].end < p->a[j].beg) { |
103 | 0 | p->a[++new_n].beg = p->a[j].beg; |
104 | 0 | p->a[new_n].end = p->a[j].end; |
105 | 0 | } else { |
106 | 0 | if (p->a[new_n].end < p->a[j].end) |
107 | 0 | p->a[new_n].end = p->a[j].end; |
108 | 0 | } |
109 | 0 | } |
110 | 0 | ++new_n; |
111 | 0 | if (p->n > new_n) { |
112 | | // Shrink array to required size. |
113 | 0 | hts_pair_pos_t *new_a = realloc(p->a, new_n * sizeof(p->a[0])); |
114 | 0 | if (new_a) p->a = new_a; |
115 | 0 | } |
116 | 0 | p->n = new_n; |
117 | 0 | count++; |
118 | 0 | } |
119 | |
|
120 | 0 | return count; |
121 | 0 | } |
122 | | |
123 | 0 | static int reg_insert(reghash_t *h, int tid, hts_pos_t beg, hts_pos_t end) { |
124 | |
|
125 | 0 | khint_t k; |
126 | 0 | reglist_t *p; |
127 | |
|
128 | 0 | if (!h) |
129 | 0 | return -1; |
130 | | |
131 | | // Put reg in the hash table if not already there |
132 | 0 | k = kh_get(reg, h, tid); |
133 | 0 | if (k == kh_end(h)) { // absent from the hash table |
134 | 0 | int ret; |
135 | 0 | k = kh_put(reg, h, tid, &ret); |
136 | 0 | if (-1 == ret) { |
137 | 0 | return -1; |
138 | 0 | } |
139 | 0 | memset(&kh_val(h, k), 0, sizeof(reglist_t)); |
140 | 0 | kh_val(h, k).tid = tid; |
141 | 0 | } |
142 | 0 | p = &kh_val(h, k); |
143 | | |
144 | | // Add beg and end to the list |
145 | 0 | if (p->n == p->m) { |
146 | 0 | uint32_t new_m = p->m ? p->m<<1 : 4; |
147 | 0 | if (new_m == 0) return -1; |
148 | 0 | hts_pair_pos_t *new_a = realloc(p->a, new_m * sizeof(p->a[0])); |
149 | 0 | if (new_a == NULL) return -1; |
150 | 0 | p->m = new_m; |
151 | 0 | p->a = new_a; |
152 | 0 | } |
153 | 0 | p->a[p->n].beg = beg; |
154 | 0 | p->a[p->n++].end = end; |
155 | |
|
156 | 0 | return 0; |
157 | 0 | } |
158 | | |
159 | 0 | static void reg_destroy(reghash_t *h) { |
160 | |
|
161 | 0 | khint_t k; |
162 | |
|
163 | 0 | if (!h) |
164 | 0 | return; |
165 | | |
166 | 0 | for (k = 0; k < kh_end(h); ++k) { |
167 | 0 | if (kh_exist(h, k)) { |
168 | 0 | free(kh_val(h, k).a); |
169 | 0 | } |
170 | 0 | } |
171 | 0 | kh_destroy(reg, h); |
172 | 0 | } |
173 | | |
174 | | /** |
175 | | * Take a char array of reg:interval elements and produce a hts_reglis_t with r_count elements. |
176 | | */ |
177 | 0 | hts_reglist_t *hts_reglist_create(char **argv, int argc, int *r_count, void *hdr, hts_name2id_f getid) { |
178 | |
|
179 | 0 | if (!argv || argc < 1) |
180 | 0 | return NULL; |
181 | | |
182 | 0 | reghash_t *h = NULL; |
183 | 0 | reglist_t *p; |
184 | 0 | hts_reglist_t *h_reglist = NULL; |
185 | |
|
186 | 0 | khint_t k; |
187 | 0 | int i, l_count = 0, tid; |
188 | 0 | const char *q; |
189 | 0 | hts_pos_t beg, end; |
190 | | |
191 | | /* First, transform the char array into a hash table */ |
192 | 0 | h = kh_init(reg); |
193 | 0 | if (!h) { |
194 | 0 | hts_log_error("Error when creating the region hash table"); |
195 | 0 | return NULL; |
196 | 0 | } |
197 | | |
198 | 0 | for (i=0; i<argc; i++) { |
199 | 0 | if (!strcmp(argv[i], ".")) { |
200 | 0 | q = argv[i] + 1; |
201 | 0 | tid = HTS_IDX_START; beg = 0; end = INT64_MAX; |
202 | 0 | } else if (!strcmp(argv[i], "*")) { |
203 | 0 | q = argv[i] + 1; |
204 | 0 | tid = HTS_IDX_NOCOOR; beg = 0; end = INT64_MAX; |
205 | 0 | } else { |
206 | 0 | q = hts_parse_region(argv[i], &tid, &beg, &end, getid, hdr, |
207 | 0 | HTS_PARSE_THOUSANDS_SEP); |
208 | 0 | } |
209 | 0 | if (!q) { |
210 | 0 | if (tid < -1) { |
211 | 0 | hts_log_error("Failed to parse header"); |
212 | 0 | goto fail; |
213 | 0 | } else { |
214 | | // not parsable as a region |
215 | 0 | hts_log_warning("Region '%s' specifies an unknown reference name. Continue anyway", argv[i]); |
216 | 0 | continue; |
217 | 0 | } |
218 | 0 | } |
219 | | |
220 | 0 | if (reg_insert(h, tid, beg, end) != 0) { |
221 | 0 | hts_log_error("Error when inserting region='%s' in the bed hash table at address=%p", argv[i], (void *) h); |
222 | 0 | goto fail; |
223 | 0 | } |
224 | 0 | } |
225 | | |
226 | 0 | *r_count = reg_compact(h); |
227 | 0 | if (!*r_count) |
228 | 0 | goto fail; |
229 | | |
230 | | /* Transform the hash table into a list */ |
231 | 0 | h_reglist = (hts_reglist_t *)calloc(*r_count, sizeof(hts_reglist_t)); |
232 | 0 | if (!h_reglist) |
233 | 0 | goto fail; |
234 | | |
235 | 0 | for (k = kh_begin(h); k < kh_end(h) && l_count < *r_count; k++) { |
236 | 0 | if (!kh_exist(h,k) || !(p = &kh_val(h,k))) |
237 | 0 | continue; |
238 | | |
239 | 0 | h_reglist[l_count].tid = p->tid; |
240 | 0 | h_reglist[l_count].intervals = p->a; |
241 | 0 | h_reglist[l_count].count = p->n; |
242 | 0 | p->a = NULL; // As we stole it. |
243 | | |
244 | | // After reg_compact(), list is ordered and non-overlapping, so... |
245 | 0 | if (p->n > 0) { |
246 | 0 | h_reglist[l_count].min_beg = h_reglist[l_count].intervals[0].beg; |
247 | 0 | h_reglist[l_count].max_end = h_reglist[l_count].intervals[p->n - 1].end; |
248 | 0 | } else { |
249 | 0 | h_reglist[l_count].min_beg = 0; |
250 | 0 | h_reglist[l_count].max_end = 0; |
251 | 0 | } |
252 | |
|
253 | 0 | l_count++; |
254 | 0 | } |
255 | 0 | reg_destroy(h); |
256 | |
|
257 | 0 | return h_reglist; |
258 | | |
259 | 0 | fail: |
260 | 0 | reg_destroy(h); |
261 | 0 | if(h_reglist) hts_reglist_free(h_reglist, l_count); |
262 | |
|
263 | 0 | return NULL; |
264 | 0 | } |
265 | | |
266 | 0 | void hts_reglist_free(hts_reglist_t *reglist, int count) { |
267 | |
|
268 | 0 | int i; |
269 | 0 | if(reglist) { |
270 | 0 | for (i = 0; i < count; i++) { |
271 | 0 | if (reglist[i].intervals) |
272 | 0 | free(reglist[i].intervals); |
273 | 0 | } |
274 | 0 | free(reglist); |
275 | 0 | } |
276 | 0 | } |