/src/htslib/htscodecs/htscodecs/pack.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright (c) 2019 Genome Research Ltd. |
3 | | * Author(s): James Bonfield |
4 | | * |
5 | | * Redistribution and use in source and binary forms, with or without |
6 | | * modification, are permitted provided that the following conditions are met: |
7 | | * |
8 | | * 1. Redistributions of source code must retain the above copyright notice, |
9 | | * this list of conditions and the following disclaimer. |
10 | | * |
11 | | * 2. Redistributions in binary form must reproduce the above |
12 | | * copyright notice, this list of conditions and the following |
13 | | * disclaimer in the documentation and/or other materials provided |
14 | | * with the distribution. |
15 | | * |
16 | | * 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger |
17 | | * Institute nor the names of its contributors may be used to endorse |
18 | | * or promote products derived from this software without specific |
19 | | * prior written permission. |
20 | | * |
21 | | * THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS |
22 | | * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED |
23 | | * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A |
24 | | * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH |
25 | | * LTD OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
26 | | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
27 | | * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
28 | | * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
29 | | * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
30 | | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
31 | | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
32 | | */ |
33 | | |
34 | | #include "config.h" |
35 | | |
36 | | #include <stdint.h> |
37 | | #include <string.h> |
38 | | #include <stdlib.h> |
39 | | #include <stdio.h> |
40 | | |
41 | | #include "pack.h" |
42 | | |
43 | | //----------------------------------------------------------------------------- |
44 | | |
45 | | /* |
46 | | * Packs multiple symbols into a single byte if the total alphabet of symbols |
47 | | * used is <= 16. Each new symbol takes up 1, 2, 4 or 8 bits, or 0 if the |
48 | | * alphabet used is 1 (constant). |
49 | | * |
50 | | * If successful, out_meta/out_meta_len are set to hold the mapping table |
51 | | * to be used during decompression. |
52 | | * |
53 | | * Returns the packed buffer on success with new length in out_len, |
54 | | * NULL of failure |
55 | | */ |
56 | | uint8_t *hts_pack(uint8_t *data, int64_t len, |
57 | 0 | uint8_t *out_meta, int *out_meta_len, uint64_t *out_len) { |
58 | 0 | int p[256] = {0}, n; |
59 | 0 | uint64_t i, j; |
60 | 0 | uint8_t *out = malloc(len+1); |
61 | 0 | if (!out) |
62 | 0 | return NULL; |
63 | | |
64 | | // count syms |
65 | 0 | for (i = 0; i < len; i++) |
66 | 0 | p[data[i]]=1; |
67 | | |
68 | 0 | for (i = n = 0; i < 256; i++) { |
69 | 0 | if (p[i]) { |
70 | 0 | p[i] = n++; // p[i] is now the code number |
71 | 0 | out_meta[n] = i; |
72 | 0 | } |
73 | 0 | } |
74 | 0 | out_meta[0] = n; // 256 wraps to 0 |
75 | 0 | j = n+1; |
76 | | |
77 | | // 1 value per byte |
78 | 0 | if (n > 16) { |
79 | 0 | *out_meta_len = 1; |
80 | | // FIXME shortcut this by returning data and checking later. |
81 | 0 | memcpy(out, data, len); |
82 | 0 | *out_len = len; |
83 | 0 | return out; |
84 | 0 | } |
85 | | |
86 | | // Work out how many values per byte to encode. |
87 | 0 | int val_per_byte; |
88 | 0 | if (n > 4) |
89 | 0 | val_per_byte = 2; |
90 | 0 | else if (n > 2) |
91 | 0 | val_per_byte = 4; |
92 | 0 | else if (n > 1) |
93 | 0 | val_per_byte = 8; |
94 | 0 | else |
95 | 0 | val_per_byte = 0; // infinite |
96 | |
|
97 | 0 | *out_meta_len = j; |
98 | 0 | j = 0; |
99 | |
|
100 | 0 | switch (val_per_byte) { |
101 | 0 | case 2: |
102 | 0 | for (i = 0; i < (len & ~1); i+=2) |
103 | 0 | out[j++] = (p[data[i]]<<0) | (p[data[i+1]]<<4); |
104 | 0 | switch (len-i) { |
105 | 0 | case 1: out[j++] = p[data[i]]; |
106 | 0 | } |
107 | 0 | *out_len = j; |
108 | 0 | return out; |
109 | | |
110 | 0 | case 4: { |
111 | 0 | for (i = 0; i < (len & ~3); i+=4) |
112 | 0 | out[j++] = (p[data[i]]<<0) | (p[data[i+1]]<<2) | (p[data[i+2]]<<4) | (p[data[i+3]]<<6); |
113 | 0 | out[j] = 0; |
114 | 0 | int s = len-i, x = 0; |
115 | 0 | switch (s) { |
116 | 0 | case 3: out[j] |= p[data[i++]] << x; x+=2; |
117 | 0 | case 2: out[j] |= p[data[i++]] << x; x+=2; |
118 | 0 | case 1: out[j] |= p[data[i++]] << x; x+=2; |
119 | 0 | j++; |
120 | 0 | } |
121 | 0 | *out_len = j; |
122 | 0 | return out; |
123 | 0 | } |
124 | | |
125 | 0 | case 8: { |
126 | 0 | for (i = 0; i < (len & ~7); i+=8) |
127 | 0 | out[j++] = (p[data[i+0]]<<0) | (p[data[i+1]]<<1) | (p[data[i+2]]<<2) | (p[data[i+3]]<<3) |
128 | 0 | | (p[data[i+4]]<<4) | (p[data[i+5]]<<5) | (p[data[i+6]]<<6) | (p[data[i+7]]<<7); |
129 | 0 | out[j] = 0; |
130 | 0 | int s = len-i, x = 0; |
131 | 0 | switch (s) { |
132 | 0 | case 7: out[j] |= p[data[i++]] << x++; |
133 | 0 | case 6: out[j] |= p[data[i++]] << x++; |
134 | 0 | case 5: out[j] |= p[data[i++]] << x++; |
135 | 0 | case 4: out[j] |= p[data[i++]] << x++; |
136 | 0 | case 3: out[j] |= p[data[i++]] << x++; |
137 | 0 | case 2: out[j] |= p[data[i++]] << x++; |
138 | 0 | case 1: out[j] |= p[data[i++]] << x++; |
139 | 0 | j++; |
140 | 0 | } |
141 | 0 | *out_len = j; |
142 | 0 | return out; |
143 | 0 | } |
144 | | |
145 | 0 | case 0: |
146 | 0 | *out_len = j; |
147 | 0 | return out; |
148 | 0 | } |
149 | | |
150 | 0 | return NULL; |
151 | 0 | } |
152 | | |
153 | | |
154 | | /* |
155 | | * Unpacks the meta-data portions of the hts_pack algorithm. |
156 | | * This consists of the count of symbols and their values. |
157 | | * |
158 | | * The "map" array is filled out with the used symbols. |
159 | | * "nsym" is set to contain the number of symbols per byte; |
160 | | * 0, 1, 2, 4 or 8. |
161 | | * |
162 | | * Returns number of bytes of data[] consumed on success, |
163 | | * zero on failure. |
164 | | */ |
165 | | uint8_t hts_unpack_meta(uint8_t *data, uint32_t data_len, |
166 | 806 | uint64_t udata_len, uint8_t *map, int *nsym) { |
167 | 806 | if (data_len == 0) |
168 | 2 | return 0; |
169 | | |
170 | | // Number of symbols used |
171 | 804 | unsigned int n = data[0]; |
172 | 804 | if (n == 0) |
173 | 96 | n = 256; |
174 | | |
175 | | // Symbols per byte |
176 | 804 | if (n <= 1) |
177 | 86 | *nsym = 0; |
178 | 718 | else if (n <= 2) |
179 | 66 | *nsym = 8; |
180 | 652 | else if (n <= 4) |
181 | 26 | *nsym = 4; |
182 | 626 | else if (n <= 16) |
183 | 76 | *nsym = 2; |
184 | 550 | else { |
185 | 550 | *nsym = 1; // no packing |
186 | 550 | return 1; |
187 | 550 | } |
188 | | |
189 | 254 | if (data_len <= 1) |
190 | 0 | return 0; |
191 | | |
192 | 254 | int j = 1, c = 0; |
193 | 1.46k | do { |
194 | 1.46k | map[c++] = data[j++]; |
195 | 1.46k | } while (c < n && j < data_len); |
196 | | |
197 | 254 | return c < n ? 0 : j; |
198 | 254 | } |
199 | | |
200 | | /* |
201 | | * Unpacks a packed data steam (given the unpacked meta-data). |
202 | | * |
203 | | * "map" is the pack map, mapping 0->n to the expanded symbols. |
204 | | * The "out" buffer must be preallocated by the caller to be the correct |
205 | | * size. For error checking purposes, out_len is set to the size of |
206 | | * this buffer. |
207 | | * |
208 | | * Returns uncompressed data (out) on success, |
209 | | * NULL on failure. |
210 | | */ |
211 | 789 | uint8_t *hts_unpack(uint8_t *data, int64_t len, uint8_t *out, uint64_t out_len, int nsym, uint8_t *p) { |
212 | | //uint8_t *out; |
213 | 789 | uint8_t c = 0; |
214 | 789 | int64_t i, j = 0, olen; |
215 | | |
216 | 789 | if (nsym == 1) { |
217 | | // raw data; FIXME: shortcut the need for malloc & memcpy here |
218 | 540 | memcpy(out, data, len); |
219 | 540 | return out; |
220 | 540 | } |
221 | | |
222 | 249 | switch(nsym) { |
223 | 66 | case 8: { |
224 | 66 | union { |
225 | 66 | uint64_t w; |
226 | 66 | uint8_t c[8]; |
227 | 66 | } map[256]; |
228 | 66 | int x; |
229 | 16.9k | for (x = 0; x < 256; x++) { |
230 | 16.8k | map[x].c[0] = p[x>>0&1]; |
231 | 16.8k | map[x].c[1] = p[x>>1&1]; |
232 | 16.8k | map[x].c[2] = p[x>>2&1]; |
233 | 16.8k | map[x].c[3] = p[x>>3&1]; |
234 | 16.8k | map[x].c[4] = p[x>>4&1]; |
235 | 16.8k | map[x].c[5] = p[x>>5&1]; |
236 | 16.8k | map[x].c[6] = p[x>>6&1]; |
237 | 16.8k | map[x].c[7] = p[x>>7&1]; |
238 | 16.8k | } |
239 | 66 | if ((out_len+7)/8 > len) |
240 | 1 | return NULL; |
241 | 65 | olen = out_len & ~7; |
242 | | |
243 | 65 | for (i = 0; i < olen; i+=8) |
244 | 0 | memcpy(&out[i], &map[data[j++]].w, 8); |
245 | | |
246 | 65 | if (out_len != olen) { |
247 | 0 | c = data[j++]; |
248 | 0 | while (i < out_len) { |
249 | 0 | out[i++] = p[c & 1]; |
250 | 0 | c >>= 1; |
251 | 0 | } |
252 | 0 | } |
253 | 65 | break; |
254 | 66 | } |
255 | | |
256 | 25 | case 4: { |
257 | 25 | union { |
258 | 25 | uint32_t w; |
259 | 25 | uint8_t c[4]; |
260 | 25 | } map[256]; |
261 | | |
262 | 25 | int x, y, z, _, P=0; |
263 | 125 | for (x = 0; x < 4; x++) |
264 | 500 | for (y = 0; y < 4; y++) |
265 | 2.00k | for (z = 0; z < 4; z++) |
266 | 8.00k | for (_ = 0; _ < 4; _++, P++) { |
267 | 6.40k | map[P].c[0] = p[_]; |
268 | 6.40k | map[P].c[1] = p[z]; |
269 | 6.40k | map[P].c[2] = p[y]; |
270 | 6.40k | map[P].c[3] = p[x]; |
271 | 6.40k | } |
272 | | |
273 | 25 | if ((out_len+3)/4 > len) |
274 | 0 | return NULL; |
275 | 25 | olen = out_len & ~3; |
276 | | |
277 | 25 | for (i = 0; i < olen-12; i+=16) { |
278 | 0 | uint32_t w[] = { |
279 | 0 | map[data[j+0]].w, |
280 | 0 | map[data[j+1]].w, |
281 | 0 | map[data[j+2]].w, |
282 | 0 | map[data[j+3]].w |
283 | 0 | }; |
284 | 0 | j += 4; |
285 | 0 | memcpy(&out[i], &w, 16); |
286 | 0 | } |
287 | | |
288 | 29 | for (; i < olen; i+=4) |
289 | 4 | memcpy(&out[i], &map[data[j++]].w, 4); |
290 | | |
291 | 25 | if (out_len != olen) { |
292 | 1 | c = data[j++]; |
293 | 4 | while (i < out_len) { |
294 | 3 | out[i++] = p[c & 3]; |
295 | 3 | c >>= 2; |
296 | 3 | } |
297 | 1 | } |
298 | 25 | break; |
299 | 25 | } |
300 | | |
301 | 74 | case 2: { |
302 | 74 | union { |
303 | 74 | uint16_t w; |
304 | 74 | uint8_t c[2]; |
305 | 74 | } map[256]; |
306 | | |
307 | 74 | int x, y; |
308 | 1.25k | for (x = 0; x < 16; x++) { |
309 | 20.1k | for (y = 0; y < 16; y++) { |
310 | 18.9k | map[x*16+y].c[0] = p[y]; |
311 | 18.9k | map[x*16+y].c[1] = p[x]; |
312 | 18.9k | } |
313 | 1.18k | } |
314 | | |
315 | 74 | if ((out_len+1)/2 > len) |
316 | 0 | return NULL; |
317 | 74 | olen = out_len & ~1; |
318 | | |
319 | 77 | for (i = j = 0; i+2 < olen; i+=4) { |
320 | 3 | uint16_t w[] = { |
321 | 3 | map[data[j+0]].w, |
322 | 3 | map[data[j+1]].w |
323 | 3 | }; |
324 | 3 | memcpy(&out[i], &w, 4); |
325 | | |
326 | 3 | j += 2; |
327 | 3 | } |
328 | | |
329 | 78 | for (; i < olen; i+=2) |
330 | 4 | memcpy(&out[i], &map[data[j++]].w, 2); |
331 | | |
332 | 74 | if (out_len != olen) { |
333 | 1 | c = data[j++]; |
334 | 1 | out[i+0] = p[c&15]; |
335 | 1 | } |
336 | 74 | break; |
337 | 74 | } |
338 | | |
339 | 84 | case 0: |
340 | 84 | memset(out, p[0], out_len); |
341 | 84 | break; |
342 | | |
343 | 0 | default: |
344 | 0 | return NULL; |
345 | 249 | } |
346 | | |
347 | 248 | return out; |
348 | 249 | } |
349 | | |
350 | | |
351 | 0 | uint8_t *hts_unpack_(uint8_t *data, int64_t len, uint8_t *out, uint64_t out_len, int nsym, uint8_t *p) { |
352 | | //uint8_t *out; |
353 | 0 | uint8_t c = 0; |
354 | 0 | int64_t i, j = 0, olen; |
355 | |
|
356 | 0 | if (nsym == 1) { |
357 | | // raw data; FIXME: shortcut the need for malloc & memcpy here |
358 | 0 | memcpy(out, data, len); |
359 | 0 | return out; |
360 | 0 | } |
361 | | |
362 | 0 | switch(nsym) { |
363 | 0 | case 2: { |
364 | 0 | uint16_t map[256], x, y; |
365 | 0 | for (x = 0; x < 16; x++) |
366 | 0 | for (y = 0; y < 16; y++) |
367 | 0 | map[x*16+y] = p[x]*256+p[y]; |
368 | |
|
369 | 0 | if ((out_len+1)/2 > len) |
370 | 0 | return NULL; |
371 | 0 | olen = out_len & ~1; |
372 | |
|
373 | 0 | uint16_t *o16 = (uint16_t *)out; |
374 | 0 | for (i = 0; i+4 < olen/2; i+=4) { |
375 | 0 | int k; |
376 | 0 | for (k = 0; k < 4; k++) |
377 | 0 | o16[i+k] = map[data[i+k]]; |
378 | 0 | } |
379 | 0 | j = i; i *= 2; |
380 | |
|
381 | 0 | for (; i < olen; i+=2) { |
382 | 0 | uint16_t w1 = map[data[j++]]; |
383 | 0 | *(uint16_t *)&out[i] = w1; |
384 | 0 | } |
385 | |
|
386 | 0 | if (out_len != olen) { |
387 | 0 | c = data[j++]; |
388 | 0 | out[i+0] = p[c&15]; |
389 | 0 | } |
390 | 0 | break; |
391 | 0 | } |
392 | | |
393 | 0 | default: |
394 | 0 | return NULL; |
395 | 0 | } |
396 | | |
397 | 0 | return out; |
398 | 0 | } |