Coverage Report

Created: 2026-02-11 06:20

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/htslib/htscodecs/htscodecs/pack.c
Line
Count
Source
1
/*
2
 * Copyright (c) 2019-2020, 2022 Genome Research Ltd.
3
 * Author(s): James Bonfield
4
 *
5
 * Redistribution and use in source and binary forms, with or without
6
 * modification, are permitted provided that the following conditions are met:
7
 *
8
 *    1. Redistributions of source code must retain the above copyright notice,
9
 *       this list of conditions and the following disclaimer.
10
 *
11
 *    2. Redistributions in binary form must reproduce the above
12
 *       copyright notice, this list of conditions and the following
13
 *       disclaimer in the documentation and/or other materials provided
14
 *       with the distribution.
15
 *
16
 *    3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
17
 *       Institute nor the names of its contributors may be used to endorse
18
 *       or promote products derived from this software without specific
19
 *       prior written permission.
20
 *
21
 * THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS
22
 * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
23
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
24
 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH
25
 * LTD OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32
 */
33
34
#include "config.h"
35
36
#include <stdint.h>
37
#include <string.h>
38
#include <stdlib.h>
39
#include <stdio.h>
40
41
#include "pack.h"
42
43
//-----------------------------------------------------------------------------
44
45
/*
46
 * Packs multiple symbols into a single byte if the total alphabet of symbols
47
 * used is <= 16.  Each new symbol takes up 1, 2, 4 or 8 bits, or 0 if the
48
 * alphabet used is 1 (constant).
49
 *
50
 * If successful, out_meta/out_meta_len are set to hold the mapping table
51
 * to be used during decompression.
52
 *
53
 * Returns the packed buffer on success with new length in out_len,
54
 *         NULL of failure
55
 */
56
uint8_t *hts_pack(uint8_t *data, int64_t len,
57
104k
                  uint8_t *out_meta, int *out_meta_len, uint64_t *out_len) {
58
104k
    int p[256] = {0}, n;
59
104k
    uint64_t i, j;
60
61
    // count syms
62
2.31G
    for (i = 0; i < len; i++)
63
2.31G
        p[data[i]]=1;
64
    
65
26.8M
    for (i = n = 0; i < 256; i++) {
66
26.7M
        if (p[i]) {
67
351k
            p[i] = n++; // p[i] is now the code number
68
351k
            out_meta[n] = i;
69
351k
        }
70
26.7M
    }
71
104k
    out_meta[0] = n; // 256 wraps to 0
72
104k
    j = n+1;
73
74
    // 1 value per byte
75
104k
    if (n > 16)
76
3.11k
        return NULL;
77
78
101k
    uint8_t *out = malloc(len+1);
79
101k
    if (!out)
80
0
        return NULL;
81
82
    // Work out how many values per byte to encode.
83
101k
    int val_per_byte;
84
101k
    if (n > 4)
85
11.0k
        val_per_byte = 2;
86
90.4k
    else if (n > 2)
87
19.4k
        val_per_byte = 4;
88
71.0k
    else if (n > 1)
89
23.3k
        val_per_byte = 8;
90
47.6k
    else
91
47.6k
        val_per_byte = 0; // infinite
92
93
101k
    *out_meta_len = j;
94
101k
    j = 0;
95
96
101k
    switch (val_per_byte) {
97
11.0k
    case 2:
98
193M
        for (i = 0; i < (len & ~1); i+=2)
99
193M
            out[j++] = (p[data[i]]<<0) | (p[data[i+1]]<<4);
100
11.0k
        switch (len-i) {
101
4.46k
        case 1: out[j++] = p[data[i]];
102
11.0k
        }
103
11.0k
        *out_len = j;
104
11.0k
        return out;
105
106
19.4k
    case 4: {
107
15.9M
        for (i = 0; i < (len & ~3); i+=4)
108
15.9M
            out[j++] = (p[data[i]]<<0) | (p[data[i+1]]<<2) | (p[data[i+2]]<<4) | (p[data[i+3]]<<6);
109
19.4k
        out[j] = 0;
110
19.4k
        int s = len-i, x = 0;
111
19.4k
        switch (s) {
112
5.08k
        case 3: out[j] |= p[data[i++]] << x; x+=2; // fall-through
113
9.14k
        case 2: out[j] |= p[data[i++]] << x; x+=2; // fall-through
114
12.3k
        case 1: out[j] |= p[data[i++]] << x; x+=2;
115
12.3k
            j++;
116
19.4k
        }
117
19.4k
        *out_len = j;
118
19.4k
        return out;
119
19.4k
    }
120
121
23.3k
    case 8: {
122
8.98M
        for (i = 0; i < (len & ~7); i+=8)
123
8.96M
            out[j++] = (p[data[i+0]]<<0) | (p[data[i+1]]<<1) | (p[data[i+2]]<<2) | (p[data[i+3]]<<3)
124
8.96M
                     | (p[data[i+4]]<<4) | (p[data[i+5]]<<5) | (p[data[i+6]]<<6) | (p[data[i+7]]<<7);
125
23.3k
        out[j] = 0;
126
23.3k
        int s = len-i, x = 0;
127
23.3k
        switch (s) {
128
1.84k
        case 7: out[j] |= p[data[i++]] << x++; // fall-through
129
3.99k
        case 6: out[j] |= p[data[i++]] << x++; // fall-through
130
5.44k
        case 5: out[j] |= p[data[i++]] << x++; // fall-through
131
11.4k
        case 4: out[j] |= p[data[i++]] << x++; // fall-through
132
13.0k
        case 3: out[j] |= p[data[i++]] << x++; // fall-through
133
18.4k
        case 2: out[j] |= p[data[i++]] << x++; // fall-through
134
19.5k
        case 1: out[j] |= p[data[i++]] << x++;
135
19.5k
            j++;
136
23.3k
        }
137
23.3k
        *out_len = j;
138
23.3k
        return out;
139
23.3k
    }
140
141
47.6k
    case 0:
142
47.6k
        *out_len = j;
143
47.6k
        return out;
144
101k
    }
145
146
0
    return NULL;
147
101k
}
148
149
150
/*
151
 * Unpacks the meta-data portions of the hts_pack algorithm.
152
 * This consists of the count of symbols and their values.
153
 *
154
 * The "map" array is filled out with the used symbols.
155
 * "nsym" is set to contain the number of symbols per byte;
156
 * 0, 1, 2, 4 or 8.
157
 *
158
 * Returns number of bytes of data[] consumed on success,
159
 *         zero on failure.
160
 */
161
uint8_t hts_unpack_meta(uint8_t *data, uint32_t data_len,
162
1.56k
                        uint64_t udata_len, uint8_t *map, int *nsym) {
163
1.56k
    if (data_len == 0)
164
3
        return 0;
165
166
    // Number of symbols used
167
1.56k
    unsigned int n = data[0];
168
1.56k
    if (n == 0)
169
239
        n = 256;
170
171
    // Symbols per byte
172
1.56k
    if (n <= 1)
173
167
        *nsym = 0;
174
1.39k
    else if (n <= 2)
175
189
        *nsym = 8;
176
1.20k
    else if (n <= 4)
177
4
        *nsym = 4;
178
1.20k
    else if (n <= 16)
179
113
        *nsym = 2;
180
1.08k
    else {
181
1.08k
        *nsym = 1; // no packing
182
1.08k
        return 1;
183
1.08k
    }
184
185
473
    if (data_len <= 1)
186
0
        return 0;
187
188
473
    int j = 1, c = 0;
189
1.65k
    do {
190
1.65k
        map[c++] = data[j++];
191
1.65k
    } while (c < n && j < data_len);
192
193
473
    return c < n ? 0 : j;
194
473
}
195
196
/*
197
 * Unpacks a packed data steam (given the unpacked meta-data).
198
 *
199
 * "map" is the pack map, mapping 0->n to the expanded symbols.
200
 * The "out" buffer must be preallocated by the caller to be the correct
201
 * size.  For error checking purposes, out_len is set to the size of
202
 * this buffer.
203
 *
204
 * Returns uncompressed data (out) on success,
205
 *         NULL on failure.
206
 */
207
1.52k
uint8_t *hts_unpack(uint8_t *data, int64_t len, uint8_t *out, uint64_t out_len, int nsym, uint8_t *p) {
208
    //uint8_t *out;
209
1.52k
    uint8_t c = 0;
210
1.52k
    int64_t i, j = 0, olen;
211
212
1.52k
    if (nsym == 1) {
213
        // raw data; FIXME: shortcut the need for malloc & memcpy here
214
1.06k
        memcpy(out, data, len);
215
1.06k
        return out;
216
1.06k
    }
217
218
462
    switch(nsym) {
219
189
    case 8: {
220
189
        union {
221
189
            uint64_t w;
222
189
            uint8_t c[8];
223
189
        } map[256];
224
189
        int x;
225
48.5k
        for (x = 0; x < 256; x++) {
226
48.3k
            map[x].c[0] = p[x>>0&1];
227
48.3k
            map[x].c[1] = p[x>>1&1];
228
48.3k
            map[x].c[2] = p[x>>2&1];
229
48.3k
            map[x].c[3] = p[x>>3&1];
230
48.3k
            map[x].c[4] = p[x>>4&1];
231
48.3k
            map[x].c[5] = p[x>>5&1];
232
48.3k
            map[x].c[6] = p[x>>6&1];
233
48.3k
            map[x].c[7] = p[x>>7&1];
234
48.3k
        }
235
189
        if ((out_len+7)/8 > len)
236
0
            return NULL;
237
189
        olen = out_len & ~7;
238
239
303
        for (i = 0; i < olen; i+=8)
240
114
            memcpy(&out[i], &map[data[j++]].w, 8);
241
242
189
        if (out_len != olen) {
243
12
            c = data[j++];
244
84
            while (i < out_len) {
245
72
                out[i++] = p[c & 1];
246
72
                c >>= 1;
247
72
            }
248
12
        }
249
189
        break;
250
189
    }
251
252
4
    case 4: {
253
4
        union {
254
4
            uint32_t w;
255
4
            uint8_t c[4];
256
4
        } map[256];
257
258
4
        int x, y, z, _, P=0;
259
20
        for (x = 0; x < 4; x++)
260
80
            for (y = 0; y < 4; y++)
261
320
                for (z = 0; z < 4; z++)
262
1.28k
                    for (_ = 0; _ < 4; _++, P++) {
263
1.02k
                        map[P].c[0] = p[_];
264
1.02k
                        map[P].c[1] = p[z];
265
1.02k
                        map[P].c[2] = p[y];
266
1.02k
                        map[P].c[3] = p[x];
267
1.02k
                    }
268
269
4
        if ((out_len+3)/4 > len)
270
0
            return NULL;
271
4
        olen = out_len & ~3;
272
273
4
        for (i = 0; i < olen-12; i+=16) {
274
0
            uint32_t w[] = {
275
0
                map[data[j+0]].w,
276
0
                map[data[j+1]].w,
277
0
                map[data[j+2]].w,
278
0
                map[data[j+3]].w
279
0
            };
280
0
            j += 4;
281
0
            memcpy(&out[i], &w, 16);
282
0
        }
283
284
10
        for (; i < olen; i+=4)
285
6
            memcpy(&out[i], &map[data[j++]].w, 4);
286
287
4
        if (out_len != olen) {
288
4
            c = data[j++];
289
16
            while (i < out_len) {
290
12
                out[i++] = p[c & 3];
291
12
                c >>= 2;
292
12
            }
293
4
        }
294
4
        break;
295
4
    }
296
297
110
    case 2: {
298
110
        union {
299
110
            uint16_t w;
300
110
            uint8_t c[2];
301
110
        } map[256];
302
303
110
        int x, y;
304
1.87k
        for (x = 0; x < 16; x++) {
305
29.9k
            for (y = 0; y < 16; y++) {
306
28.1k
                map[x*16+y].c[0] = p[y];
307
28.1k
                map[x*16+y].c[1] = p[x];
308
28.1k
            }
309
1.76k
        }
310
311
110
        if ((out_len+1)/2 > len)
312
5
            return NULL;
313
105
        olen = out_len & ~1;
314
315
825
        for (i = j = 0; i+2 < olen; i+=4) {
316
720
            uint16_t w[] = {
317
720
                map[data[j+0]].w,
318
720
                map[data[j+1]].w
319
720
            };
320
720
            memcpy(&out[i], &w, 4);
321
322
720
            j += 2;
323
720
        }
324
325
129
        for (; i < olen; i+=2)
326
24
            memcpy(&out[i], &map[data[j++]].w, 2);
327
328
105
        if (out_len != olen) {
329
24
            c = data[j++];
330
24
            out[i+0] = p[c&15];
331
24
        }
332
105
        break;
333
110
    }
334
335
159
    case 0:
336
159
        memset(out, p[0], out_len);
337
159
        break;
338
339
0
    default:
340
0
        return NULL;
341
462
    }
342
343
457
    return out;
344
462
}
345
346
347
0
uint8_t *hts_unpack_(uint8_t *data, int64_t len, uint8_t *out, uint64_t out_len, int nsym, uint8_t *p) {
348
    //uint8_t *out;
349
0
    uint8_t c = 0;
350
0
    int64_t i, j = 0, olen;
351
352
0
    if (nsym == 1) {
353
        // raw data; FIXME: shortcut the need for malloc & memcpy here
354
0
        memcpy(out, data, len);
355
0
        return out;
356
0
    }
357
358
0
    switch(nsym) {
359
0
    case 2: {
360
0
        uint16_t map[256], x, y;
361
0
        for (x = 0; x < 16; x++)
362
0
            for (y = 0; y < 16; y++)
363
0
                map[x*16+y] = p[x]*256+p[y];
364
365
0
        if ((out_len+1)/2 > len)
366
0
            return NULL;
367
0
        olen = out_len & ~1;
368
369
0
        uint16_t *o16 = (uint16_t *)out;
370
0
        for (i = 0; i+4 < olen/2; i+=4) {
371
0
            int k;
372
0
            for (k = 0; k < 4; k++)
373
0
                o16[i+k] = map[data[i+k]];
374
0
        }
375
0
        j = i; i *= 2;
376
377
0
        for (; i < olen; i+=2) {
378
0
            uint16_t w1 = map[data[j++]];
379
0
            *(uint16_t *)&out[i] = w1;
380
0
        }
381
382
0
        if (out_len != olen) {
383
0
            c = data[j++];
384
0
            out[i+0] = p[c&15];
385
0
        }
386
0
        break;
387
0
    }
388
389
0
    default:
390
0
        return NULL;
391
0
    }
392
393
0
    return out;
394
0
}