/src/htslib/cram/cram_stats.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | Copyright (c) 2012-2014, 2016, 2018, 2020 Genome Research Ltd. |
3 | | Author: James Bonfield <jkb@sanger.ac.uk> |
4 | | |
5 | | Redistribution and use in source and binary forms, with or without |
6 | | modification, are permitted provided that the following conditions are met: |
7 | | |
8 | | 1. Redistributions of source code must retain the above copyright notice, |
9 | | this list of conditions and the following disclaimer. |
10 | | |
11 | | 2. Redistributions in binary form must reproduce the above copyright notice, |
12 | | this list of conditions and the following disclaimer in the documentation |
13 | | and/or other materials provided with the distribution. |
14 | | |
15 | | 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger |
16 | | Institute nor the names of its contributors may be used to endorse or promote |
17 | | products derived from this software without specific prior written permission. |
18 | | |
19 | | THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND |
20 | | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
21 | | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
22 | | DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE |
23 | | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
24 | | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR |
25 | | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER |
26 | | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, |
27 | | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
28 | | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
29 | | */ |
30 | | |
31 | | #define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h |
32 | | #include <config.h> |
33 | | |
34 | | #include <stdio.h> |
35 | | #include <errno.h> |
36 | | #include <assert.h> |
37 | | #include <stdlib.h> |
38 | | #include <string.h> |
39 | | #include <zlib.h> |
40 | | #include <sys/types.h> |
41 | | #include <sys/stat.h> |
42 | | #include <math.h> |
43 | | #include <inttypes.h> |
44 | | |
45 | | #include "cram.h" |
46 | | #include "os.h" |
47 | | |
48 | 0 | cram_stats *cram_stats_create(void) { |
49 | 0 | return calloc(1, sizeof(cram_stats)); |
50 | 0 | } |
51 | | |
52 | 0 | int cram_stats_add(cram_stats *st, int64_t val) { |
53 | 0 | st->nsamp++; |
54 | | |
55 | | //assert(val >= 0); |
56 | |
|
57 | 0 | if (val < MAX_STAT_VAL && val >= 0) { |
58 | 0 | st->freqs[val]++; |
59 | 0 | } else { |
60 | 0 | khint_t k; |
61 | 0 | int r; |
62 | |
|
63 | 0 | if (!st->h) { |
64 | 0 | st->h = kh_init(m_i2i); |
65 | 0 | if (!st->h) |
66 | 0 | return -1; |
67 | 0 | } |
68 | | |
69 | 0 | k = kh_put(m_i2i, st->h, val, &r); |
70 | 0 | if (r == 0) |
71 | 0 | kh_val(st->h, k)++; |
72 | 0 | else if (r != -1) |
73 | 0 | kh_val(st->h, k) = 1; |
74 | 0 | else |
75 | 0 | return -1; |
76 | 0 | } |
77 | 0 | return 0; |
78 | 0 | } |
79 | | |
80 | 0 | void cram_stats_del(cram_stats *st, int64_t val) { |
81 | 0 | st->nsamp--; |
82 | | |
83 | | //assert(val >= 0); |
84 | |
|
85 | 0 | if (val < MAX_STAT_VAL && val >= 0) { |
86 | 0 | st->freqs[val]--; |
87 | 0 | assert(st->freqs[val] >= 0); |
88 | 0 | } else if (st->h) { |
89 | 0 | khint_t k = kh_get(m_i2i, st->h, val); |
90 | |
|
91 | 0 | if (k != kh_end(st->h)) { |
92 | 0 | if (--kh_val(st->h, k) == 0) |
93 | 0 | kh_del(m_i2i, st->h, k); |
94 | 0 | } else { |
95 | 0 | hts_log_warning("Failed to remove val %"PRId64" from cram_stats", val); |
96 | 0 | st->nsamp++; |
97 | 0 | } |
98 | 0 | } else { |
99 | 0 | hts_log_warning("Failed to remove val %"PRId64" from cram_stats", val); |
100 | 0 | st->nsamp++; |
101 | 0 | } |
102 | 0 | } |
103 | | |
104 | | #if DEBUG_CRAM_STATS |
105 | | void cram_stats_dump(cram_stats *st) { |
106 | | int i; |
107 | | fprintf(stderr, "cram_stats:\n"); |
108 | | for (i = 0; i < MAX_STAT_VAL; i++) { |
109 | | if (!st->freqs[i]) |
110 | | continue; |
111 | | fprintf(stderr, "\t%d\t%d\n", i, st->freqs[i]); |
112 | | } |
113 | | if (st->h) { |
114 | | khint_t k; |
115 | | for (k = kh_begin(st->h); k != kh_end(st->h); k++) { |
116 | | if (!kh_exist(st->h, k)) |
117 | | continue; |
118 | | |
119 | | fprintf(stderr, "\t%d\t%d\n", kh_key(st->h, k), kh_val(st->h, k)); |
120 | | } |
121 | | } |
122 | | } |
123 | | #endif |
124 | | |
125 | | /* |
126 | | * Computes entropy from integer frequencies for various encoding methods and |
127 | | * picks the best encoding. |
128 | | * |
129 | | * FIXME: we could reuse some of the code here for the actual encoding |
130 | | * parameters too. Eg the best 'k' for SUBEXP or the code lengths for huffman. |
131 | | * |
132 | | * Returns the best codec to use. |
133 | | */ |
134 | 0 | enum cram_encoding cram_stats_encoding(cram_fd *fd, cram_stats *st) { |
135 | 0 | int nvals, i, ntot = 0, max_val = 0, min_val = INT_MAX; |
136 | 0 | int *vals = NULL, *freqs = NULL, vals_alloc = 0; |
137 | |
|
138 | | #if DEBUG_CRAM_STATS |
139 | | cram_stats_dump(st); |
140 | | #endif |
141 | | |
142 | | /* Count number of unique symbols */ |
143 | 0 | for (nvals = i = 0; i < MAX_STAT_VAL; i++) { |
144 | 0 | if (!st->freqs[i]) |
145 | 0 | continue; |
146 | 0 | if (nvals >= vals_alloc) { |
147 | 0 | vals_alloc = vals_alloc ? vals_alloc*2 : 1024; |
148 | 0 | int *vals_tmp = realloc(vals, vals_alloc * sizeof(int)); |
149 | 0 | int *freqs_tmp = realloc(freqs, vals_alloc * sizeof(int)); |
150 | 0 | if (!vals_tmp || !freqs_tmp) { |
151 | 0 | free(vals_tmp ? vals_tmp : vals); |
152 | 0 | free(freqs_tmp ? freqs_tmp : freqs); |
153 | 0 | return E_HUFFMAN; // Cannot do much else atm |
154 | 0 | } |
155 | 0 | vals = vals_tmp; |
156 | 0 | freqs = freqs_tmp; |
157 | 0 | } |
158 | 0 | vals[nvals] = i; |
159 | 0 | freqs[nvals] = st->freqs[i]; |
160 | 0 | ntot += freqs[nvals]; |
161 | 0 | if (max_val < i) max_val = i; |
162 | 0 | if (min_val > i) min_val = i; |
163 | 0 | nvals++; |
164 | 0 | } |
165 | 0 | if (st->h) { |
166 | 0 | khint_t k; |
167 | 0 | int i; |
168 | |
|
169 | 0 | for (k = kh_begin(st->h); k != kh_end(st->h); k++) { |
170 | 0 | if (!kh_exist(st->h, k)) |
171 | 0 | continue; |
172 | | |
173 | 0 | if (nvals >= vals_alloc) { |
174 | 0 | vals_alloc = vals_alloc ? vals_alloc*2 : 1024; |
175 | 0 | int *vals_tmp = realloc(vals, vals_alloc * sizeof(int)); |
176 | 0 | int *freqs_tmp = realloc(freqs, vals_alloc * sizeof(int)); |
177 | 0 | if (!vals_tmp || !freqs_tmp) { |
178 | 0 | free(vals_tmp ? vals_tmp : vals); |
179 | 0 | free(freqs_tmp ? freqs_tmp : freqs); |
180 | 0 | return E_HUFFMAN; // Cannot do much else atm |
181 | 0 | } |
182 | 0 | vals = vals_tmp; |
183 | 0 | freqs = freqs_tmp; |
184 | 0 | } |
185 | 0 | i = kh_key(st->h, k); |
186 | 0 | vals[nvals]=i; |
187 | 0 | freqs[nvals] = kh_val(st->h, k); |
188 | 0 | ntot += freqs[nvals]; |
189 | 0 | if (max_val < i) max_val = i; |
190 | 0 | if (min_val > i) min_val = i; |
191 | 0 | nvals++; |
192 | 0 | } |
193 | 0 | } |
194 | | |
195 | 0 | st->nvals = nvals; |
196 | 0 | st->min_val = min_val; |
197 | 0 | st->max_val = max_val; |
198 | 0 | assert(ntot == st->nsamp); |
199 | | |
200 | 0 | free(vals); |
201 | 0 | free(freqs); |
202 | | |
203 | | /* |
204 | | * Simple policy that everything is external unless it can be |
205 | | * encoded using zero bits as a unary item huffman table. |
206 | | */ |
207 | 0 | if (CRAM_MAJOR_VERS(fd->version) >= 4) { |
208 | | // Note, we're assuming integer data here as we don't have the |
209 | | // type passed in. Cram_encoder_init does know the type and |
210 | | // will convert to E_CONST_BYTE or E_EXTERNAL as appropriate. |
211 | 0 | if (nvals == 1) |
212 | 0 | return E_CONST_INT; |
213 | 0 | else if (nvals == 0 || min_val < 0) |
214 | 0 | return E_VARINT_SIGNED; |
215 | 0 | else |
216 | 0 | return E_VARINT_UNSIGNED; |
217 | 0 | } else { |
218 | 0 | return nvals <= 1 ? E_HUFFMAN : E_EXTERNAL; |
219 | 0 | } |
220 | 0 | } |
221 | | |
222 | 0 | void cram_stats_free(cram_stats *st) { |
223 | 0 | if (st->h) |
224 | 0 | kh_destroy(m_i2i, st->h); |
225 | 0 | free(st); |
226 | 0 | } |