Coverage Report

Created: 2023-01-17 06:24

/src/htslib/htscodecs/htscodecs/pack.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * Copyright (c) 2019 Genome Research Ltd.
3
 * Author(s): James Bonfield
4
 *
5
 * Redistribution and use in source and binary forms, with or without
6
 * modification, are permitted provided that the following conditions are met:
7
 *
8
 *    1. Redistributions of source code must retain the above copyright notice,
9
 *       this list of conditions and the following disclaimer.
10
 *
11
 *    2. Redistributions in binary form must reproduce the above
12
 *       copyright notice, this list of conditions and the following
13
 *       disclaimer in the documentation and/or other materials provided
14
 *       with the distribution.
15
 *
16
 *    3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
17
 *       Institute nor the names of its contributors may be used to endorse
18
 *       or promote products derived from this software without specific
19
 *       prior written permission.
20
 *
21
 * THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS
22
 * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
23
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
24
 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH
25
 * LTD OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32
 */
33
34
#include "config.h"
35
36
#include <stdint.h>
37
#include <string.h>
38
#include <stdlib.h>
39
#include <stdio.h>
40
41
#include "pack.h"
42
43
//-----------------------------------------------------------------------------
44
45
/*
46
 * Packs multiple symbols into a single byte if the total alphabet of symbols
47
 * used is <= 16.  Each new symbol takes up 1, 2, 4 or 8 bits, or 0 if the
48
 * alphabet used is 1 (constant).
49
 *
50
 * If successful, out_meta/out_meta_len are set to hold the mapping table
51
 * to be used during decompression.
52
 *
53
 * Returns the packed buffer on success with new length in out_len,
54
 *         NULL of failure
55
 */
56
uint8_t *hts_pack(uint8_t *data, int64_t len,
57
0
                  uint8_t *out_meta, int *out_meta_len, uint64_t *out_len) {
58
0
    int p[256] = {0}, n;
59
0
    uint64_t i, j;
60
0
    uint8_t *out = malloc(len+1);
61
0
    if (!out)
62
0
        return NULL;
63
64
    // count syms
65
0
    for (i = 0; i < len; i++)
66
0
        p[data[i]]=1;
67
    
68
0
    for (i = n = 0; i < 256; i++) {
69
0
        if (p[i]) {
70
0
            p[i] = n++; // p[i] is now the code number
71
0
            out_meta[n] = i;
72
0
        }
73
0
    }
74
0
    out_meta[0] = n; // 256 wraps to 0
75
0
    j = n+1;
76
77
    // 1 value per byte
78
0
    if (n > 16) {
79
0
        *out_meta_len = 1;
80
        // FIXME shortcut this by returning data and checking later.
81
0
        memcpy(out, data, len);
82
0
        *out_len = len;
83
0
        return out;
84
0
    }
85
86
    // Work out how many values per byte to encode.
87
0
    int val_per_byte;
88
0
    if (n > 4)
89
0
        val_per_byte = 2;
90
0
    else if (n > 2)
91
0
        val_per_byte = 4;
92
0
    else if (n > 1)
93
0
        val_per_byte = 8;
94
0
    else
95
0
        val_per_byte = 0; // infinite
96
97
0
    *out_meta_len = j;
98
0
    j = 0;
99
100
0
    switch (val_per_byte) {
101
0
    case 2:
102
0
        for (i = 0; i < (len & ~1); i+=2)
103
0
            out[j++] = (p[data[i]]<<0) | (p[data[i+1]]<<4);
104
0
        switch (len-i) {
105
0
        case 1: out[j++] = p[data[i]];
106
0
        }
107
0
        *out_len = j;
108
0
        return out;
109
110
0
    case 4: {
111
0
        for (i = 0; i < (len & ~3); i+=4)
112
0
            out[j++] = (p[data[i]]<<0) | (p[data[i+1]]<<2) | (p[data[i+2]]<<4) | (p[data[i+3]]<<6);
113
0
        out[j] = 0;
114
0
        int s = len-i, x = 0;
115
0
        switch (s) {
116
0
        case 3: out[j] |= p[data[i++]] << x; x+=2;
117
0
        case 2: out[j] |= p[data[i++]] << x; x+=2;
118
0
        case 1: out[j] |= p[data[i++]] << x; x+=2;
119
0
            j++;
120
0
        }
121
0
        *out_len = j;
122
0
        return out;
123
0
    }
124
125
0
    case 8: {
126
0
        for (i = 0; i < (len & ~7); i+=8)
127
0
            out[j++] = (p[data[i+0]]<<0) | (p[data[i+1]]<<1) | (p[data[i+2]]<<2) | (p[data[i+3]]<<3)
128
0
                     | (p[data[i+4]]<<4) | (p[data[i+5]]<<5) | (p[data[i+6]]<<6) | (p[data[i+7]]<<7);
129
0
        out[j] = 0;
130
0
        int s = len-i, x = 0;
131
0
        switch (s) {
132
0
        case 7: out[j] |= p[data[i++]] << x++;
133
0
        case 6: out[j] |= p[data[i++]] << x++;
134
0
        case 5: out[j] |= p[data[i++]] << x++;
135
0
        case 4: out[j] |= p[data[i++]] << x++;
136
0
        case 3: out[j] |= p[data[i++]] << x++;
137
0
        case 2: out[j] |= p[data[i++]] << x++;
138
0
        case 1: out[j] |= p[data[i++]] << x++;
139
0
            j++;
140
0
        }
141
0
        *out_len = j;
142
0
        return out;
143
0
    }
144
145
0
    case 0:
146
0
        *out_len = j;
147
0
        return out;
148
0
    }
149
150
0
    return NULL;
151
0
}
152
153
154
/*
155
 * Unpacks the meta-data portions of the hts_pack algorithm.
156
 * This consists of the count of symbols and their values.
157
 *
158
 * The "map" array is filled out with the used symbols.
159
 * "nsym" is set to contain the number of symbols per byte;
160
 * 0, 1, 2, 4 or 8.
161
 *
162
 * Returns number of bytes of data[] consumed on success,
163
 *         zero on failure.
164
 */
165
uint8_t hts_unpack_meta(uint8_t *data, uint32_t data_len,
166
806
                        uint64_t udata_len, uint8_t *map, int *nsym) {
167
806
    if (data_len == 0)
168
2
        return 0;
169
170
    // Number of symbols used
171
804
    unsigned int n = data[0];
172
804
    if (n == 0)
173
96
        n = 256;
174
175
    // Symbols per byte
176
804
    if (n <= 1)
177
86
        *nsym = 0;
178
718
    else if (n <= 2)
179
66
        *nsym = 8;
180
652
    else if (n <= 4)
181
26
        *nsym = 4;
182
626
    else if (n <= 16)
183
76
        *nsym = 2;
184
550
    else {
185
550
        *nsym = 1; // no packing
186
550
        return 1;
187
550
    }
188
189
254
    if (data_len <= 1)
190
0
        return 0;
191
192
254
    int j = 1, c = 0;
193
1.46k
    do {
194
1.46k
        map[c++] = data[j++];
195
1.46k
    } while (c < n && j < data_len);
196
197
254
    return c < n ? 0 : j;
198
254
}
199
200
/*
201
 * Unpacks a packed data steam (given the unpacked meta-data).
202
 *
203
 * "map" is the pack map, mapping 0->n to the expanded symbols.
204
 * The "out" buffer must be preallocated by the caller to be the correct
205
 * size.  For error checking purposes, out_len is set to the size of
206
 * this buffer.
207
 *
208
 * Returns uncompressed data (out) on success,
209
 *         NULL on failure.
210
 */
211
789
uint8_t *hts_unpack(uint8_t *data, int64_t len, uint8_t *out, uint64_t out_len, int nsym, uint8_t *p) {
212
    //uint8_t *out;
213
789
    uint8_t c = 0;
214
789
    int64_t i, j = 0, olen;
215
216
789
    if (nsym == 1) {
217
        // raw data; FIXME: shortcut the need for malloc & memcpy here
218
540
        memcpy(out, data, len);
219
540
        return out;
220
540
    }
221
222
249
    switch(nsym) {
223
66
    case 8: {
224
66
        union {
225
66
            uint64_t w;
226
66
            uint8_t c[8];
227
66
        } map[256];
228
66
        int x;
229
16.9k
        for (x = 0; x < 256; x++) {
230
16.8k
            map[x].c[0] = p[x>>0&1];
231
16.8k
            map[x].c[1] = p[x>>1&1];
232
16.8k
            map[x].c[2] = p[x>>2&1];
233
16.8k
            map[x].c[3] = p[x>>3&1];
234
16.8k
            map[x].c[4] = p[x>>4&1];
235
16.8k
            map[x].c[5] = p[x>>5&1];
236
16.8k
            map[x].c[6] = p[x>>6&1];
237
16.8k
            map[x].c[7] = p[x>>7&1];
238
16.8k
        }
239
66
        if ((out_len+7)/8 > len)
240
1
            return NULL;
241
65
        olen = out_len & ~7;
242
243
65
        for (i = 0; i < olen; i+=8)
244
0
            memcpy(&out[i], &map[data[j++]].w, 8);
245
246
65
        if (out_len != olen) {
247
0
            c = data[j++];
248
0
            while (i < out_len) {
249
0
                out[i++] = p[c & 1];
250
0
                c >>= 1;
251
0
            }
252
0
        }
253
65
        break;
254
66
    }
255
256
25
    case 4: {
257
25
        union {
258
25
            uint32_t w;
259
25
            uint8_t c[4];
260
25
        } map[256];
261
262
25
        int x, y, z, _, P=0;
263
125
        for (x = 0; x < 4; x++)
264
500
            for (y = 0; y < 4; y++)
265
2.00k
                for (z = 0; z < 4; z++)
266
8.00k
                    for (_ = 0; _ < 4; _++, P++) {
267
6.40k
                        map[P].c[0] = p[_];
268
6.40k
                        map[P].c[1] = p[z];
269
6.40k
                        map[P].c[2] = p[y];
270
6.40k
                        map[P].c[3] = p[x];
271
6.40k
                    }
272
273
25
        if ((out_len+3)/4 > len)
274
0
            return NULL;
275
25
        olen = out_len & ~3;
276
277
25
        for (i = 0; i < olen-12; i+=16) {
278
0
            uint32_t w[] = {
279
0
                map[data[j+0]].w,
280
0
                map[data[j+1]].w,
281
0
                map[data[j+2]].w,
282
0
                map[data[j+3]].w
283
0
            };
284
0
            j += 4;
285
0
            memcpy(&out[i], &w, 16);
286
0
        }
287
288
29
        for (; i < olen; i+=4)
289
4
            memcpy(&out[i], &map[data[j++]].w, 4);
290
291
25
        if (out_len != olen) {
292
1
            c = data[j++];
293
4
            while (i < out_len) {
294
3
                out[i++] = p[c & 3];
295
3
                c >>= 2;
296
3
            }
297
1
        }
298
25
        break;
299
25
    }
300
301
74
    case 2: {
302
74
        union {
303
74
            uint16_t w;
304
74
            uint8_t c[2];
305
74
        } map[256];
306
307
74
        int x, y;
308
1.25k
        for (x = 0; x < 16; x++) {
309
20.1k
            for (y = 0; y < 16; y++) {
310
18.9k
                map[x*16+y].c[0] = p[y];
311
18.9k
                map[x*16+y].c[1] = p[x];
312
18.9k
            }
313
1.18k
        }
314
315
74
        if ((out_len+1)/2 > len)
316
0
            return NULL;
317
74
        olen = out_len & ~1;
318
319
77
        for (i = j = 0; i+2 < olen; i+=4) {
320
3
            uint16_t w[] = {
321
3
                map[data[j+0]].w,
322
3
                map[data[j+1]].w
323
3
            };
324
3
            memcpy(&out[i], &w, 4);
325
326
3
            j += 2;
327
3
        }
328
329
78
        for (; i < olen; i+=2)
330
4
            memcpy(&out[i], &map[data[j++]].w, 2);
331
332
74
        if (out_len != olen) {
333
1
            c = data[j++];
334
1
            out[i+0] = p[c&15];
335
1
        }
336
74
        break;
337
74
    }
338
339
84
    case 0:
340
84
        memset(out, p[0], out_len);
341
84
        break;
342
343
0
    default:
344
0
        return NULL;
345
249
    }
346
347
248
    return out;
348
249
}
349
350
351
0
uint8_t *hts_unpack_(uint8_t *data, int64_t len, uint8_t *out, uint64_t out_len, int nsym, uint8_t *p) {
352
    //uint8_t *out;
353
0
    uint8_t c = 0;
354
0
    int64_t i, j = 0, olen;
355
356
0
    if (nsym == 1) {
357
        // raw data; FIXME: shortcut the need for malloc & memcpy here
358
0
        memcpy(out, data, len);
359
0
        return out;
360
0
    }
361
362
0
    switch(nsym) {
363
0
    case 2: {
364
0
        uint16_t map[256], x, y;
365
0
        for (x = 0; x < 16; x++)
366
0
            for (y = 0; y < 16; y++)
367
0
                map[x*16+y] = p[x]*256+p[y];
368
369
0
        if ((out_len+1)/2 > len)
370
0
            return NULL;
371
0
        olen = out_len & ~1;
372
373
0
        uint16_t *o16 = (uint16_t *)out;
374
0
        for (i = 0; i+4 < olen/2; i+=4) {
375
0
            int k;
376
0
            for (k = 0; k < 4; k++)
377
0
                o16[i+k] = map[data[i+k]];
378
0
        }
379
0
        j = i; i *= 2;
380
381
0
        for (; i < olen; i+=2) {
382
0
            uint16_t w1 = map[data[j++]];
383
0
            *(uint16_t *)&out[i] = w1;
384
0
        }
385
386
0
        if (out_len != olen) {
387
0
            c = data[j++];
388
0
            out[i+0] = p[c&15];
389
0
        }
390
0
        break;
391
0
    }
392
393
0
    default:
394
0
        return NULL;
395
0
    }
396
397
0
    return out;
398
0
}