Coverage Report

Created: 2026-05-30 06:09

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/htslib/cram/cram_stats.c
Line
Count
Source
1
/*
2
Copyright (c) 2012-2014, 2016, 2018, 2020 Genome Research Ltd.
3
Author: James Bonfield <jkb@sanger.ac.uk>
4
5
Redistribution and use in source and binary forms, with or without
6
modification, are permitted provided that the following conditions are met:
7
8
   1. Redistributions of source code must retain the above copyright notice,
9
this list of conditions and the following disclaimer.
10
11
   2. Redistributions in binary form must reproduce the above copyright notice,
12
this list of conditions and the following disclaimer in the documentation
13
and/or other materials provided with the distribution.
14
15
   3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
16
Institute nor the names of its contributors may be used to endorse or promote
17
products derived from this software without specific prior written permission.
18
19
THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
20
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
23
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
*/
30
31
#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
32
#include <config.h>
33
34
#include <stdio.h>
35
#include <errno.h>
36
#include <assert.h>
37
#include <stdlib.h>
38
#include <string.h>
39
#include <zlib.h>
40
#include <sys/types.h>
41
#include <sys/stat.h>
42
#include <math.h>
43
#include <inttypes.h>
44
45
#include "cram.h"
46
#include "os.h"
47
#include "../htslib/hts_alloc.h"
48
49
245k
cram_stats *cram_stats_create(void) {
50
245k
    return calloc(1, sizeof(cram_stats));
51
245k
}
52
53
350M
int cram_stats_add(cram_stats *st, int64_t val) {
54
350M
    st->nsamp++;
55
56
    //assert(val >= 0);
57
58
350M
    if (val < MAX_STAT_VAL && val >= 0) {
59
334M
        st->freqs[val]++;
60
334M
    } else {
61
16.5M
        khint_t k;
62
16.5M
        int r;
63
64
16.5M
        if (!st->h) {
65
9.70k
            st->h = kh_init(m_i2i);
66
9.70k
            if (!st->h)
67
0
                return -1;
68
9.70k
        }
69
70
16.5M
        k = kh_put(m_i2i, st->h, val, &r);
71
16.5M
        if (r == 0)
72
16.5M
            kh_val(st->h, k)++;
73
12.1k
        else if (r != -1)
74
12.1k
            kh_val(st->h, k) = 1;
75
0
        else
76
0
            return -1;
77
16.5M
    }
78
350M
    return 0;
79
350M
}
80
81
0
void cram_stats_del(cram_stats *st, int64_t val) {
82
0
    st->nsamp--;
83
84
    //assert(val >= 0);
85
86
0
    if (val < MAX_STAT_VAL && val >= 0) {
87
0
        st->freqs[val]--;
88
0
        assert(st->freqs[val] >= 0);
89
0
    } else if (st->h) {
90
0
        khint_t k = kh_get(m_i2i, st->h, val);
91
92
0
        if (k != kh_end(st->h)) {
93
0
            if (--kh_val(st->h, k) == 0)
94
0
                kh_del(m_i2i, st->h, k);
95
0
        } else {
96
0
            hts_log_warning("Failed to remove val %"PRId64" from cram_stats", val);
97
0
            st->nsamp++;
98
0
        }
99
0
    } else {
100
0
        hts_log_warning("Failed to remove val %"PRId64" from cram_stats", val);
101
0
        st->nsamp++;
102
0
    }
103
0
}
104
105
#if DEBUG_CRAM_STATS
106
void cram_stats_dump(cram_stats *st) {
107
    int i;
108
    fprintf(stderr, "cram_stats:\n");
109
    for (i = 0; i < MAX_STAT_VAL; i++) {
110
        if (!st->freqs[i])
111
            continue;
112
        fprintf(stderr, "\t%d\t%d\n", i, st->freqs[i]);
113
    }
114
    if (st->h) {
115
        khint_t k;
116
        for (k = kh_begin(st->h); k != kh_end(st->h); k++) {
117
            if (!kh_exist(st->h, k))
118
                continue;
119
120
            fprintf(stderr, "\t%d\t%d\n", kh_key(st->h, k), kh_val(st->h, k));
121
        }
122
    }
123
}
124
#endif
125
126
/*
127
 * Computes entropy from integer frequencies for various encoding methods and
128
 * picks the best encoding.
129
 *
130
 * FIXME: we could reuse some of the code here for the actual encoding
131
 * parameters too. Eg the best 'k' for SUBEXP or the code lengths for huffman.
132
 *
133
 * Returns the best codec to use.
134
 */
135
67.9k
enum cram_encoding cram_stats_encoding(cram_fd *fd, cram_stats *st) {
136
67.9k
    int nvals, i, max_val = 0, min_val = INT_MAX;
137
67.9k
    int *vals = NULL, *freqs = NULL, vals_alloc = 0;
138
67.9k
    int ntot HTS_UNUSED = 0;
139
140
#if DEBUG_CRAM_STATS
141
    cram_stats_dump(st);
142
#endif
143
144
    /* Count number of unique symbols */
145
69.6M
    for (nvals = i = 0; i < MAX_STAT_VAL; i++) {
146
69.6M
        if (!st->freqs[i])
147
69.5M
            continue;
148
49.2k
        if (nvals >= vals_alloc) {
149
29.3k
            vals_alloc = vals_alloc ? vals_alloc*2 : 1024;
150
29.3k
            int *vals_tmp  = hts_realloc_p(vals,  sizeof(*vals),  vals_alloc);
151
29.3k
            int *freqs_tmp = hts_realloc_p(freqs, sizeof(*freqs), vals_alloc);
152
29.3k
            if (!vals_tmp || !freqs_tmp) {
153
0
                free(vals_tmp  ? vals_tmp  : vals);
154
0
                free(freqs_tmp ? freqs_tmp : freqs);
155
0
                return E_HUFFMAN; // Cannot do much else atm
156
0
            }
157
29.3k
            vals = vals_tmp;
158
29.3k
            freqs = freqs_tmp;
159
29.3k
        }
160
49.2k
        vals[nvals] = i;
161
49.2k
        freqs[nvals] = st->freqs[i];
162
49.2k
        ntot += freqs[nvals];
163
49.2k
        if (max_val < i) max_val = i;
164
49.2k
        if (min_val > i) min_val = i;
165
49.2k
        nvals++;
166
49.2k
    }
167
67.9k
    if (st->h) {
168
12.1k
        khint_t k;
169
12.1k
        int i;
170
171
64.8k
        for (k = kh_begin(st->h); k != kh_end(st->h); k++) {
172
52.7k
            if (!kh_exist(st->h, k))
173
38.1k
                continue;
174
175
14.5k
            if (nvals >= vals_alloc) {
176
11.8k
                vals_alloc = vals_alloc ? vals_alloc*2 : 1024;
177
11.8k
                int *vals_tmp  = hts_realloc_p(vals,  sizeof(*vals),  vals_alloc);
178
11.8k
                int *freqs_tmp = hts_realloc_p(freqs, sizeof(*freqs), vals_alloc);
179
11.8k
                if (!vals_tmp || !freqs_tmp) {
180
0
                    free(vals_tmp  ? vals_tmp  : vals);
181
0
                    free(freqs_tmp ? freqs_tmp : freqs);
182
0
                    return E_HUFFMAN; // Cannot do much else atm
183
0
                }
184
11.8k
                vals = vals_tmp;
185
11.8k
                freqs = freqs_tmp;
186
11.8k
            }
187
14.5k
            i = kh_key(st->h, k);
188
14.5k
            vals[nvals]=i;
189
14.5k
            freqs[nvals] = kh_val(st->h, k);
190
14.5k
            ntot += freqs[nvals];
191
14.5k
            if (max_val < i) max_val = i;
192
14.5k
            if (min_val > i) min_val = i;
193
14.5k
            nvals++;
194
14.5k
        }
195
12.1k
    }
196
197
67.9k
    st->nvals = nvals;
198
67.9k
    st->min_val = min_val;
199
67.9k
    st->max_val = max_val;
200
67.9k
    assert(ntot == st->nsamp);
201
202
67.9k
    free(vals);
203
67.9k
    free(freqs);
204
205
    /*
206
     * Simple policy that everything is external unless it can be
207
     * encoded using zero bits as a unary item huffman table.
208
     */
209
67.9k
    if (CRAM_MAJOR_VERS(fd->version) >= 4) {
210
        // Note, we're assuming integer data here as we don't have the
211
        // type passed in.  Cram_encoder_init does know the type and
212
        // will convert to E_CONST_BYTE or E_EXTERNAL as appropriate.
213
0
        if (nvals == 1)
214
0
            return E_CONST_INT;
215
0
        else if (nvals == 0 || min_val < 0)
216
0
            return E_VARINT_SIGNED;
217
0
        else
218
0
            return E_VARINT_UNSIGNED;
219
67.9k
    } else {
220
67.9k
        return nvals <= 1 ? E_HUFFMAN : E_EXTERNAL;
221
67.9k
    }
222
67.9k
}
223
224
245k
void cram_stats_free(cram_stats *st) {
225
245k
    if (st->h)
226
9.70k
        kh_destroy(m_i2i, st->h);
227
245k
    free(st);
228
245k
}