Coverage Report

Created: 2025-11-09 07:19

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/htslib/cram/cram_stats.c
Line
Count
Source
1
/*
2
Copyright (c) 2012-2014, 2016, 2018, 2020 Genome Research Ltd.
3
Author: James Bonfield <jkb@sanger.ac.uk>
4
5
Redistribution and use in source and binary forms, with or without
6
modification, are permitted provided that the following conditions are met:
7
8
   1. Redistributions of source code must retain the above copyright notice,
9
this list of conditions and the following disclaimer.
10
11
   2. Redistributions in binary form must reproduce the above copyright notice,
12
this list of conditions and the following disclaimer in the documentation
13
and/or other materials provided with the distribution.
14
15
   3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
16
Institute nor the names of its contributors may be used to endorse or promote
17
products derived from this software without specific prior written permission.
18
19
THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
20
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
23
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
*/
30
31
#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
32
#include <config.h>
33
34
#include <stdio.h>
35
#include <errno.h>
36
#include <assert.h>
37
#include <stdlib.h>
38
#include <string.h>
39
#include <zlib.h>
40
#include <sys/types.h>
41
#include <sys/stat.h>
42
#include <math.h>
43
#include <inttypes.h>
44
45
#include "cram.h"
46
#include "os.h"
47
48
1.67M
cram_stats *cram_stats_create(void) {
49
1.67M
    return calloc(1, sizeof(cram_stats));
50
1.67M
}
51
52
526M
int cram_stats_add(cram_stats *st, int64_t val) {
53
526M
    st->nsamp++;
54
55
    //assert(val >= 0);
56
57
526M
    if (val < MAX_STAT_VAL && val >= 0) {
58
507M
        st->freqs[val]++;
59
507M
    } else {
60
19.4M
        khint_t k;
61
19.4M
        int r;
62
63
19.4M
        if (!st->h) {
64
149k
            st->h = kh_init(m_i2i);
65
149k
            if (!st->h)
66
0
                return -1;
67
149k
        }
68
69
19.4M
        k = kh_put(m_i2i, st->h, val, &r);
70
19.4M
        if (r == 0)
71
19.2M
            kh_val(st->h, k)++;
72
158k
        else if (r != -1)
73
158k
            kh_val(st->h, k) = 1;
74
0
        else
75
0
            return -1;
76
19.4M
    }
77
526M
    return 0;
78
526M
}
79
80
10
void cram_stats_del(cram_stats *st, int64_t val) {
81
10
    st->nsamp--;
82
83
    //assert(val >= 0);
84
85
10
    if (val < MAX_STAT_VAL && val >= 0) {
86
8
        st->freqs[val]--;
87
8
        assert(st->freqs[val] >= 0);
88
8
    } else if (st->h) {
89
2
        khint_t k = kh_get(m_i2i, st->h, val);
90
91
2
        if (k != kh_end(st->h)) {
92
2
            if (--kh_val(st->h, k) == 0)
93
1
                kh_del(m_i2i, st->h, k);
94
2
        } else {
95
0
            hts_log_warning("Failed to remove val %"PRId64" from cram_stats", val);
96
0
            st->nsamp++;
97
0
        }
98
2
    } else {
99
0
        hts_log_warning("Failed to remove val %"PRId64" from cram_stats", val);
100
0
        st->nsamp++;
101
0
    }
102
10
}
103
104
#if DEBUG_CRAM_STATS
105
void cram_stats_dump(cram_stats *st) {
106
    int i;
107
    fprintf(stderr, "cram_stats:\n");
108
    for (i = 0; i < MAX_STAT_VAL; i++) {
109
        if (!st->freqs[i])
110
            continue;
111
        fprintf(stderr, "\t%d\t%d\n", i, st->freqs[i]);
112
    }
113
    if (st->h) {
114
        khint_t k;
115
        for (k = kh_begin(st->h); k != kh_end(st->h); k++) {
116
            if (!kh_exist(st->h, k))
117
                continue;
118
119
            fprintf(stderr, "\t%d\t%d\n", kh_key(st->h, k), kh_val(st->h, k));
120
        }
121
    }
122
}
123
#endif
124
125
/*
126
 * Computes entropy from integer frequencies for various encoding methods and
127
 * picks the best encoding.
128
 *
129
 * FIXME: we could reuse some of the code here for the actual encoding
130
 * parameters too. Eg the best 'k' for SUBEXP or the code lengths for huffman.
131
 *
132
 * Returns the best codec to use.
133
 */
134
1.34M
enum cram_encoding cram_stats_encoding(cram_fd *fd, cram_stats *st) {
135
1.34M
    int nvals, i, max_val = 0, min_val = INT_MAX;
136
1.34M
    int *vals = NULL, *freqs = NULL, vals_alloc = 0;
137
1.34M
    int ntot HTS_UNUSED = 0;
138
139
#if DEBUG_CRAM_STATS
140
    cram_stats_dump(st);
141
#endif
142
143
    /* Count number of unique symbols */
144
1.37G
    for (nvals = i = 0; i < MAX_STAT_VAL; i++) {
145
1.37G
        if (!st->freqs[i])
146
1.37G
            continue;
147
815k
        if (nvals >= vals_alloc) {
148
737k
            vals_alloc = vals_alloc ? vals_alloc*2 : 1024;
149
737k
            int *vals_tmp  = realloc(vals,  vals_alloc * sizeof(int));
150
737k
            int *freqs_tmp = realloc(freqs, vals_alloc * sizeof(int));
151
737k
            if (!vals_tmp || !freqs_tmp) {
152
0
                free(vals_tmp  ? vals_tmp  : vals);
153
0
                free(freqs_tmp ? freqs_tmp : freqs);
154
0
                return E_HUFFMAN; // Cannot do much else atm
155
0
            }
156
737k
            vals = vals_tmp;
157
737k
            freqs = freqs_tmp;
158
737k
        }
159
815k
        vals[nvals] = i;
160
815k
        freqs[nvals] = st->freqs[i];
161
815k
        ntot += freqs[nvals];
162
815k
        if (max_val < i) max_val = i;
163
815k
        if (min_val > i) min_val = i;
164
815k
        nvals++;
165
815k
    }
166
1.34M
    if (st->h) {
167
175k
        khint_t k;
168
175k
        int i;
169
170
886k
        for (k = kh_begin(st->h); k != kh_end(st->h); k++) {
171
711k
            if (!kh_exist(st->h, k))
172
527k
                continue;
173
174
183k
            if (nvals >= vals_alloc) {
175
169k
                vals_alloc = vals_alloc ? vals_alloc*2 : 1024;
176
169k
                int *vals_tmp  = realloc(vals,  vals_alloc * sizeof(int));
177
169k
                int *freqs_tmp = realloc(freqs, vals_alloc * sizeof(int));
178
169k
                if (!vals_tmp || !freqs_tmp) {
179
0
                    free(vals_tmp  ? vals_tmp  : vals);
180
0
                    free(freqs_tmp ? freqs_tmp : freqs);
181
0
                    return E_HUFFMAN; // Cannot do much else atm
182
0
                }
183
169k
                vals = vals_tmp;
184
169k
                freqs = freqs_tmp;
185
169k
            }
186
183k
            i = kh_key(st->h, k);
187
183k
            vals[nvals]=i;
188
183k
            freqs[nvals] = kh_val(st->h, k);
189
183k
            ntot += freqs[nvals];
190
183k
            if (max_val < i) max_val = i;
191
183k
            if (min_val > i) min_val = i;
192
183k
            nvals++;
193
183k
        }
194
175k
    }
195
196
1.34M
    st->nvals = nvals;
197
1.34M
    st->min_val = min_val;
198
1.34M
    st->max_val = max_val;
199
1.34M
    assert(ntot == st->nsamp);
200
201
1.34M
    free(vals);
202
1.34M
    free(freqs);
203
204
    /*
205
     * Simple policy that everything is external unless it can be
206
     * encoded using zero bits as a unary item huffman table.
207
     */
208
1.34M
    if (CRAM_MAJOR_VERS(fd->version) >= 4) {
209
        // Note, we're assuming integer data here as we don't have the
210
        // type passed in.  Cram_encoder_init does know the type and
211
        // will convert to E_CONST_BYTE or E_EXTERNAL as appropriate.
212
0
        if (nvals == 1)
213
0
            return E_CONST_INT;
214
0
        else if (nvals == 0 || min_val < 0)
215
0
            return E_VARINT_SIGNED;
216
0
        else
217
0
            return E_VARINT_UNSIGNED;
218
1.34M
    } else {
219
1.34M
        return nvals <= 1 ? E_HUFFMAN : E_EXTERNAL;
220
1.34M
    }
221
1.34M
}
222
223
1.67M
void cram_stats_free(cram_stats *st) {
224
1.67M
    if (st->h)
225
149k
        kh_destroy(m_i2i, st->h);
226
1.67M
    free(st);
227
1.67M
}