/src/htslib/cram/cram_stats.c
Line | Count | Source |
1 | | /* |
2 | | Copyright (c) 2012-2014, 2016, 2018, 2020 Genome Research Ltd. |
3 | | Author: James Bonfield <jkb@sanger.ac.uk> |
4 | | |
5 | | Redistribution and use in source and binary forms, with or without |
6 | | modification, are permitted provided that the following conditions are met: |
7 | | |
8 | | 1. Redistributions of source code must retain the above copyright notice, |
9 | | this list of conditions and the following disclaimer. |
10 | | |
11 | | 2. Redistributions in binary form must reproduce the above copyright notice, |
12 | | this list of conditions and the following disclaimer in the documentation |
13 | | and/or other materials provided with the distribution. |
14 | | |
15 | | 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger |
16 | | Institute nor the names of its contributors may be used to endorse or promote |
17 | | products derived from this software without specific prior written permission. |
18 | | |
19 | | THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND |
20 | | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
21 | | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
22 | | DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE |
23 | | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
24 | | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR |
25 | | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER |
26 | | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, |
27 | | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
28 | | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
29 | | */ |
30 | | |
31 | | #define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h |
32 | | #include <config.h> |
33 | | |
34 | | #include <stdio.h> |
35 | | #include <errno.h> |
36 | | #include <assert.h> |
37 | | #include <stdlib.h> |
38 | | #include <string.h> |
39 | | #include <zlib.h> |
40 | | #include <sys/types.h> |
41 | | #include <sys/stat.h> |
42 | | #include <math.h> |
43 | | #include <inttypes.h> |
44 | | |
45 | | #include "cram.h" |
46 | | #include "os.h" |
47 | | #include "../htslib/hts_alloc.h" |
48 | | |
49 | 245k | cram_stats *cram_stats_create(void) { |
50 | 245k | return calloc(1, sizeof(cram_stats)); |
51 | 245k | } |
52 | | |
53 | 350M | int cram_stats_add(cram_stats *st, int64_t val) { |
54 | 350M | st->nsamp++; |
55 | | |
56 | | //assert(val >= 0); |
57 | | |
58 | 350M | if (val < MAX_STAT_VAL && val >= 0) { |
59 | 334M | st->freqs[val]++; |
60 | 334M | } else { |
61 | 16.5M | khint_t k; |
62 | 16.5M | int r; |
63 | | |
64 | 16.5M | if (!st->h) { |
65 | 9.70k | st->h = kh_init(m_i2i); |
66 | 9.70k | if (!st->h) |
67 | 0 | return -1; |
68 | 9.70k | } |
69 | | |
70 | 16.5M | k = kh_put(m_i2i, st->h, val, &r); |
71 | 16.5M | if (r == 0) |
72 | 16.5M | kh_val(st->h, k)++; |
73 | 12.1k | else if (r != -1) |
74 | 12.1k | kh_val(st->h, k) = 1; |
75 | 0 | else |
76 | 0 | return -1; |
77 | 16.5M | } |
78 | 350M | return 0; |
79 | 350M | } |
80 | | |
81 | 0 | void cram_stats_del(cram_stats *st, int64_t val) { |
82 | 0 | st->nsamp--; |
83 | | |
84 | | //assert(val >= 0); |
85 | |
|
86 | 0 | if (val < MAX_STAT_VAL && val >= 0) { |
87 | 0 | st->freqs[val]--; |
88 | 0 | assert(st->freqs[val] >= 0); |
89 | 0 | } else if (st->h) { |
90 | 0 | khint_t k = kh_get(m_i2i, st->h, val); |
91 | |
|
92 | 0 | if (k != kh_end(st->h)) { |
93 | 0 | if (--kh_val(st->h, k) == 0) |
94 | 0 | kh_del(m_i2i, st->h, k); |
95 | 0 | } else { |
96 | 0 | hts_log_warning("Failed to remove val %"PRId64" from cram_stats", val); |
97 | 0 | st->nsamp++; |
98 | 0 | } |
99 | 0 | } else { |
100 | 0 | hts_log_warning("Failed to remove val %"PRId64" from cram_stats", val); |
101 | 0 | st->nsamp++; |
102 | 0 | } |
103 | 0 | } |
104 | | |
105 | | #if DEBUG_CRAM_STATS |
106 | | void cram_stats_dump(cram_stats *st) { |
107 | | int i; |
108 | | fprintf(stderr, "cram_stats:\n"); |
109 | | for (i = 0; i < MAX_STAT_VAL; i++) { |
110 | | if (!st->freqs[i]) |
111 | | continue; |
112 | | fprintf(stderr, "\t%d\t%d\n", i, st->freqs[i]); |
113 | | } |
114 | | if (st->h) { |
115 | | khint_t k; |
116 | | for (k = kh_begin(st->h); k != kh_end(st->h); k++) { |
117 | | if (!kh_exist(st->h, k)) |
118 | | continue; |
119 | | |
120 | | fprintf(stderr, "\t%d\t%d\n", kh_key(st->h, k), kh_val(st->h, k)); |
121 | | } |
122 | | } |
123 | | } |
124 | | #endif |
125 | | |
126 | | /* |
127 | | * Computes entropy from integer frequencies for various encoding methods and |
128 | | * picks the best encoding. |
129 | | * |
130 | | * FIXME: we could reuse some of the code here for the actual encoding |
131 | | * parameters too. Eg the best 'k' for SUBEXP or the code lengths for huffman. |
132 | | * |
133 | | * Returns the best codec to use. |
134 | | */ |
135 | 67.9k | enum cram_encoding cram_stats_encoding(cram_fd *fd, cram_stats *st) { |
136 | 67.9k | int nvals, i, max_val = 0, min_val = INT_MAX; |
137 | 67.9k | int *vals = NULL, *freqs = NULL, vals_alloc = 0; |
138 | 67.9k | int ntot HTS_UNUSED = 0; |
139 | | |
140 | | #if DEBUG_CRAM_STATS |
141 | | cram_stats_dump(st); |
142 | | #endif |
143 | | |
144 | | /* Count number of unique symbols */ |
145 | 69.6M | for (nvals = i = 0; i < MAX_STAT_VAL; i++) { |
146 | 69.6M | if (!st->freqs[i]) |
147 | 69.5M | continue; |
148 | 49.2k | if (nvals >= vals_alloc) { |
149 | 29.3k | vals_alloc = vals_alloc ? vals_alloc*2 : 1024; |
150 | 29.3k | int *vals_tmp = hts_realloc_p(vals, sizeof(*vals), vals_alloc); |
151 | 29.3k | int *freqs_tmp = hts_realloc_p(freqs, sizeof(*freqs), vals_alloc); |
152 | 29.3k | if (!vals_tmp || !freqs_tmp) { |
153 | 0 | free(vals_tmp ? vals_tmp : vals); |
154 | 0 | free(freqs_tmp ? freqs_tmp : freqs); |
155 | 0 | return E_HUFFMAN; // Cannot do much else atm |
156 | 0 | } |
157 | 29.3k | vals = vals_tmp; |
158 | 29.3k | freqs = freqs_tmp; |
159 | 29.3k | } |
160 | 49.2k | vals[nvals] = i; |
161 | 49.2k | freqs[nvals] = st->freqs[i]; |
162 | 49.2k | ntot += freqs[nvals]; |
163 | 49.2k | if (max_val < i) max_val = i; |
164 | 49.2k | if (min_val > i) min_val = i; |
165 | 49.2k | nvals++; |
166 | 49.2k | } |
167 | 67.9k | if (st->h) { |
168 | 12.1k | khint_t k; |
169 | 12.1k | int i; |
170 | | |
171 | 64.8k | for (k = kh_begin(st->h); k != kh_end(st->h); k++) { |
172 | 52.7k | if (!kh_exist(st->h, k)) |
173 | 38.1k | continue; |
174 | | |
175 | 14.5k | if (nvals >= vals_alloc) { |
176 | 11.8k | vals_alloc = vals_alloc ? vals_alloc*2 : 1024; |
177 | 11.8k | int *vals_tmp = hts_realloc_p(vals, sizeof(*vals), vals_alloc); |
178 | 11.8k | int *freqs_tmp = hts_realloc_p(freqs, sizeof(*freqs), vals_alloc); |
179 | 11.8k | if (!vals_tmp || !freqs_tmp) { |
180 | 0 | free(vals_tmp ? vals_tmp : vals); |
181 | 0 | free(freqs_tmp ? freqs_tmp : freqs); |
182 | 0 | return E_HUFFMAN; // Cannot do much else atm |
183 | 0 | } |
184 | 11.8k | vals = vals_tmp; |
185 | 11.8k | freqs = freqs_tmp; |
186 | 11.8k | } |
187 | 14.5k | i = kh_key(st->h, k); |
188 | 14.5k | vals[nvals]=i; |
189 | 14.5k | freqs[nvals] = kh_val(st->h, k); |
190 | 14.5k | ntot += freqs[nvals]; |
191 | 14.5k | if (max_val < i) max_val = i; |
192 | 14.5k | if (min_val > i) min_val = i; |
193 | 14.5k | nvals++; |
194 | 14.5k | } |
195 | 12.1k | } |
196 | | |
197 | 67.9k | st->nvals = nvals; |
198 | 67.9k | st->min_val = min_val; |
199 | 67.9k | st->max_val = max_val; |
200 | 67.9k | assert(ntot == st->nsamp); |
201 | | |
202 | 67.9k | free(vals); |
203 | 67.9k | free(freqs); |
204 | | |
205 | | /* |
206 | | * Simple policy that everything is external unless it can be |
207 | | * encoded using zero bits as a unary item huffman table. |
208 | | */ |
209 | 67.9k | if (CRAM_MAJOR_VERS(fd->version) >= 4) { |
210 | | // Note, we're assuming integer data here as we don't have the |
211 | | // type passed in. Cram_encoder_init does know the type and |
212 | | // will convert to E_CONST_BYTE or E_EXTERNAL as appropriate. |
213 | 0 | if (nvals == 1) |
214 | 0 | return E_CONST_INT; |
215 | 0 | else if (nvals == 0 || min_val < 0) |
216 | 0 | return E_VARINT_SIGNED; |
217 | 0 | else |
218 | 0 | return E_VARINT_UNSIGNED; |
219 | 67.9k | } else { |
220 | 67.9k | return nvals <= 1 ? E_HUFFMAN : E_EXTERNAL; |
221 | 67.9k | } |
222 | 67.9k | } |
223 | | |
224 | 245k | void cram_stats_free(cram_stats *st) { |
225 | 245k | if (st->h) |
226 | 9.70k | kh_destroy(m_i2i, st->h); |
227 | 245k | free(st); |
228 | 245k | } |