Coverage Report

Created: 2023-01-17 06:24

/src/htslib/cram/cram_codecs.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
Copyright (c) 2012-2021 Genome Research Ltd.
3
Author: James Bonfield <jkb@sanger.ac.uk>
4
5
Redistribution and use in source and binary forms, with or without
6
modification, are permitted provided that the following conditions are met:
7
8
   1. Redistributions of source code must retain the above copyright notice,
9
this list of conditions and the following disclaimer.
10
11
   2. Redistributions in binary form must reproduce the above copyright notice,
12
this list of conditions and the following disclaimer in the documentation
13
and/or other materials provided with the distribution.
14
15
   3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
16
Institute nor the names of its contributors may be used to endorse or promote
17
products derived from this software without specific prior written permission.
18
19
THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
20
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
23
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
*/
30
31
/*
32
 * FIXME: add checking of cram_external_type to return NULL on unsupported
33
 * {codec,type} tuples.
34
 */
35
36
#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
37
#include <config.h>
38
39
#include <stdlib.h>
40
#include <string.h>
41
#include <assert.h>
42
#include <limits.h>
43
#include <stdint.h>
44
#include <errno.h>
45
#include <stddef.h>
46
47
#include "../htslib/hts_endian.h"
48
49
#if defined(HAVE_EXTERNAL_LIBHTSCODECS)
50
#include <htscodecs/varint.h>
51
#include <htscodecs/pack.h>
52
#include <htscodecs/rle.h>
53
#else
54
#include "../htscodecs/htscodecs/varint.h"
55
#include "../htscodecs/htscodecs/pack.h"
56
#include "../htscodecs/htscodecs/rle.h"
57
#endif
58
59
#include "cram.h"
60
61
/*
62
 * ---------------------------------------------------------------------------
63
 * Block bit-level I/O functions.
64
 * All defined static here to promote easy inlining by the compiler.
65
 */
66
67
#if 0
68
/* Get a single bit, MSB first */
69
static signed int get_bit_MSB(cram_block *block) {
70
    unsigned int val;
71
72
    if (block->byte > block->alloc)
73
        return -1;
74
75
    val = block->data[block->byte] >> block->bit;
76
    if (--block->bit == -1) {
77
        block->bit = 7;
78
        block->byte++;
79
        //printf("(%02X)", block->data[block->byte]);
80
    }
81
82
    //printf("-B%d-", val&1);
83
84
    return val & 1;
85
}
86
#endif
87
88
/*
89
 * Count number of successive 0 and 1 bits
90
 */
91
0
static int get_one_bits_MSB(cram_block *block) {
92
0
    int n = 0, b;
93
0
    if (block->byte >= block->uncomp_size)
94
0
        return -1;
95
0
    do {
96
0
        b = block->data[block->byte] >> block->bit;
97
0
        if (--block->bit == -1) {
98
0
            block->bit = 7;
99
0
            block->byte++;
100
0
            if (block->byte == block->uncomp_size && (b&1))
101
0
                return -1;
102
0
        }
103
0
        n++;
104
0
    } while (b&1);
105
106
0
    return n-1;
107
0
}
108
109
0
static int get_zero_bits_MSB(cram_block *block) {
110
0
    int n = 0, b;
111
0
    if (block->byte >= block->uncomp_size)
112
0
        return -1;
113
0
    do {
114
0
        b = block->data[block->byte] >> block->bit;
115
0
        if (--block->bit == -1) {
116
0
            block->bit = 7;
117
0
            block->byte++;
118
0
            if (block->byte == block->uncomp_size && !(b&1))
119
0
                return -1;
120
0
        }
121
0
        n++;
122
0
    } while (!(b&1));
123
124
0
    return n-1;
125
0
}
126
127
#if 0
128
/* Stores a single bit */
129
static void store_bit_MSB(cram_block *block, unsigned int bit) {
130
    if (block->byte >= block->alloc) {
131
        block->alloc = block->alloc ? block->alloc*2 : 1024;
132
        block->data = realloc(block->data, block->alloc);
133
    }
134
135
    if (bit)
136
        block->data[block->byte] |= (1 << block->bit);
137
138
    if (--block->bit == -1) {
139
        block->bit = 7;
140
        block->byte++;
141
        block->data[block->byte] = 0;
142
    }
143
}
144
#endif
145
146
#if 0
147
/* Rounds to the next whole byte boundary first */
148
static void store_bytes_MSB(cram_block *block, char *bytes, int len) {
149
    if (block->bit != 7) {
150
        block->bit = 7;
151
        block->byte++;
152
    }
153
154
    while (block->byte + len >= block->alloc) {
155
        block->alloc = block->alloc ? block->alloc*2 : 1024;
156
        block->data = realloc(block->data, block->alloc);
157
    }
158
159
    memcpy(&block->data[block->byte], bytes, len);
160
    block->byte += len;
161
}
162
#endif
163
164
/* Local optimised copy for inlining */
165
0
static inline int64_t get_bits_MSB(cram_block *block, int nbits) {
166
0
    uint64_t val = 0;
167
0
    int i;
168
169
#if 0
170
    // Fits within the current byte */
171
    if (nbits <= block->bit+1) {
172
        val = (block->data[block->byte]>>(block->bit-(nbits-1))) & ((1<<nbits)-1);
173
        if ((block->bit -= nbits) == -1) {
174
            block->bit = 7;
175
            block->byte++;
176
        }
177
        return val;
178
    }
179
180
    // partial first byte
181
    val = block->data[block->byte] & ((1<<(block->bit+1))-1);
182
    nbits -= block->bit+1;
183
    block->bit = 7;
184
    block->byte++;
185
186
    // whole middle bytes
187
    while (nbits >= 8) {
188
        val = (val << 8) | block->data[block->byte++];
189
        nbits -= 8;
190
    }
191
192
    val <<= nbits;
193
    val |= (block->data[block->byte]>>(block->bit-(nbits-1))) & ((1<<nbits)-1);
194
    block->bit -= nbits;
195
    return val;
196
#endif
197
198
#if 0
199
    /* Inefficient implementation! */
200
    //printf("{");
201
    for (i = 0; i < nbits; i++)
202
        //val = (val << 1) | get_bit_MSB(block);
203
        GET_BIT_MSB(block, val);
204
#endif
205
206
0
#if 1
207
    /* Combination of 1st two methods */
208
0
    if (nbits <= block->bit+1) {
209
0
        val = (block->data[block->byte]>>(block->bit-(nbits-1))) & ((1<<nbits)-1);
210
0
        if ((block->bit -= nbits) == -1) {
211
0
            block->bit = 7;
212
0
            block->byte++;
213
0
        }
214
0
        return val;
215
0
    }
216
217
0
    switch(nbits) {
218
//  case 15: GET_BIT_MSB(block, val); // fall through
219
//  case 14: GET_BIT_MSB(block, val); // fall through
220
//  case 13: GET_BIT_MSB(block, val); // fall through
221
//  case 12: GET_BIT_MSB(block, val); // fall through
222
//  case 11: GET_BIT_MSB(block, val); // fall through
223
//  case 10: GET_BIT_MSB(block, val); // fall through
224
//  case  9: GET_BIT_MSB(block, val); // fall through
225
0
    case  8: GET_BIT_MSB(block, val); // fall through
226
0
    case  7: GET_BIT_MSB(block, val); // fall through
227
0
    case  6: GET_BIT_MSB(block, val); // fall through
228
0
    case  5: GET_BIT_MSB(block, val); // fall through
229
0
    case  4: GET_BIT_MSB(block, val); // fall through
230
0
    case  3: GET_BIT_MSB(block, val); // fall through
231
0
    case  2: GET_BIT_MSB(block, val); // fall through
232
0
    case  1: GET_BIT_MSB(block, val);
233
0
        break;
234
235
0
    default:
236
0
        for (i = 0; i < nbits; i++)
237
            //val = (val << 1) | get_bit_MSB(block);
238
0
            GET_BIT_MSB(block, val);
239
0
    }
240
0
#endif
241
242
    //printf("=0x%x}", val);
243
244
0
    return val;
245
0
}
246
247
/*
248
 * Can store up to 24-bits worth of data encoded in an integer value
249
 * Possibly we'd want to have a less optimal store_bits function when dealing
250
 * with nbits > 24, but for now we assume the codes generated are never
251
 * that big. (Given this is only possible with 121392 or more
252
 * characters with exactly the correct frequency distribution we check
253
 * for it elsewhere.)
254
 */
255
0
static int store_bits_MSB(cram_block *block, uint64_t val, int nbits) {
256
    //fprintf(stderr, " store_bits: %02x %d\n", val, nbits);
257
258
    /*
259
     * Use slow mode until we tweak the huffman generator to never generate
260
     * codes longer than 24-bits.
261
     */
262
0
    unsigned int mask;
263
264
0
    if (block->byte+8 >= block->alloc) {
265
0
        if (block->byte) {
266
0
            block->alloc *= 2;
267
0
            block->data = realloc(block->data, block->alloc + 8);
268
0
            if (!block->data)
269
0
                return -1;
270
0
        } else {
271
0
            block->alloc = 1024;
272
0
            block->data = realloc(block->data, block->alloc + 8);
273
0
            if (!block->data)
274
0
                return -1;
275
0
            block->data[0] = 0; // initialise first byte of buffer
276
0
        }
277
0
    }
278
279
    /* fits in current bit-field */
280
0
    if (nbits <= block->bit+1) {
281
0
        block->data[block->byte] |= (val << (block->bit+1-nbits));
282
0
        if ((block->bit-=nbits) == -1) {
283
0
            block->bit = 7;
284
0
            block->byte++;
285
0
            block->data[block->byte] = 0;
286
0
        }
287
0
        return 0;
288
0
    }
289
290
0
    block->data[block->byte] |= (val >> (nbits -= block->bit+1));
291
0
    block->bit = 7;
292
0
    block->byte++;
293
0
    block->data[block->byte] = 0;
294
295
0
    mask = 1<<(nbits-1);
296
0
    do {
297
0
        if (val & mask)
298
0
            block->data[block->byte] |= (1 << block->bit);
299
0
        if (--block->bit == -1) {
300
0
            block->bit = 7;
301
0
            block->byte++;
302
0
            block->data[block->byte] = 0;
303
0
        }
304
0
        mask >>= 1;
305
0
    } while(--nbits);
306
307
0
    return 0;
308
0
}
309
310
/*
311
 * Returns the next 'size' bytes from a block, or NULL if insufficient
312
 * data left.This is just a pointer into the block data and not an
313
 * allocated object, so do not free the result.
314
 */
315
0
static char *cram_extract_block(cram_block *b, int size) {
316
0
    char *cp = (char *)b->data + b->idx;
317
0
    b->idx += size;
318
0
    if (b->idx > b->uncomp_size)
319
0
        return NULL;
320
321
0
    return cp;
322
0
}
323
324
/*
325
 * ---------------------------------------------------------------------------
326
 * EXTERNAL
327
 *
328
 * In CRAM 3.0 and earlier, E_EXTERNAL use the data type to determine the
329
 * size of the object being returned.  This type is hard coded in the
330
 * spec document (changing from uint32 to uint64 requires a spec change)
331
 * and there is no data format introspection so implementations have
332
 * to determine which size to use based on version numbers.   It also
333
 * doesn't support signed data.
334
 *
335
 * With CRAM 4.0 onwards the size and sign of the data is no longer stated
336
 * explicitly in the specification.  Instead EXTERNAL is replaced by three
337
 * new encodings, for bytes and signed / unsigned integers which used a
338
 * variable sized encoding.
339
 *
340
 * For simplicity we use the same encode and decode functions for
341
 * bytes (CRAM4) and external (CRAM3). Given we already had code to
342
 * replace codec + type into a function pointer it makes little
343
 * difference how we ended up at that function.  However we disallow
344
 * this codec to operate on integer data for CRAM4 onwards.
345
 */
346
int cram_external_decode_int(cram_slice *slice, cram_codec *c,
347
0
                             cram_block *in, char *out, int *out_size) {
348
0
    char *cp;
349
0
    cram_block *b;
350
351
    /* Find the external block */
352
0
    b = cram_get_block_by_id(slice, c->u.external.content_id);
353
0
    if (!b)
354
0
        return *out_size?-1:0;
355
356
0
    cp = (char *)b->data + b->idx;
357
    // E_INT and E_LONG are guaranteed single item queries
358
0
    int err = 0;
359
0
    *(int32_t *)out = c->vv->varint_get32(&cp, (char *)b->data + b->uncomp_size, &err);
360
0
    b->idx = cp - (char *)b->data;
361
0
    *out_size = 1;
362
363
0
    return err ? -1 : 0;
364
0
}
365
366
int cram_external_decode_long(cram_slice *slice, cram_codec *c,
367
0
                              cram_block *in, char *out, int *out_size) {
368
0
    char *cp;
369
0
    cram_block *b;
370
371
    /* Find the external block */
372
0
    b = cram_get_block_by_id(slice, c->u.external.content_id);
373
0
    if (!b)
374
0
        return *out_size?-1:0;
375
376
0
    cp = (char *)b->data + b->idx;
377
    // E_INT and E_LONG are guaranteed single item queries
378
0
    int err = 0;
379
0
    *(int64_t *)out = c->vv->varint_get64(&cp, (char *)b->data + b->uncomp_size, &err);
380
0
    b->idx = cp - (char *)b->data;
381
0
    *out_size = 1;
382
383
0
    return err ? -1 : 0;
384
0
}
385
386
int cram_external_decode_char(cram_slice *slice, cram_codec *c,
387
                              cram_block *in, char *out,
388
0
                              int *out_size) {
389
0
    char *cp;
390
0
    cram_block *b;
391
392
    /* Find the external block */
393
0
    b = cram_get_block_by_id(slice, c->u.external.content_id);
394
0
    if (!b)
395
0
        return *out_size?-1:0;
396
397
0
    cp = cram_extract_block(b, *out_size);
398
0
    if (!cp)
399
0
        return -1;
400
401
0
    if (out)
402
0
        memcpy(out, cp, *out_size);
403
0
    return 0;
404
0
}
405
406
static int cram_external_decode_block(cram_slice *slice, cram_codec *c,
407
                                      cram_block *in, char *out_,
408
0
                                      int *out_size) {
409
0
    char *cp;
410
0
    cram_block *out = (cram_block *)out_;
411
0
    cram_block *b = NULL;
412
413
    /* Find the external block */
414
0
    b = cram_get_block_by_id(slice, c->u.external.content_id);
415
0
    if (!b)
416
0
        return *out_size?-1:0;
417
418
0
    cp = cram_extract_block(b, *out_size);
419
0
    if (!cp)
420
0
        return -1;
421
422
0
    BLOCK_APPEND(out, cp, *out_size);
423
0
    return 0;
424
425
0
 block_err:
426
0
    return -1;
427
0
}
428
429
242
void cram_external_decode_free(cram_codec *c) {
430
242
    if (c)
431
242
        free(c);
432
242
}
433
434
435
0
int cram_external_decode_size(cram_slice *slice, cram_codec *c) {
436
0
    cram_block *b;
437
438
    /* Find the external block */
439
0
    b = cram_get_block_by_id(slice, c->u.external.content_id);
440
0
    if (!b)
441
0
        return -1;
442
443
0
    return b->uncomp_size;
444
0
}
445
446
0
cram_block *cram_external_get_block(cram_slice *slice, cram_codec *c) {
447
0
    return cram_get_block_by_id(slice, c->u.external.content_id);
448
0
}
449
450
cram_codec *cram_external_decode_init(cram_block_compression_hdr *hdr,
451
                                      char *data, int size,
452
                                      enum cram_encoding codec,
453
                                      enum cram_external_type option,
454
243
                                      int version, varint_vec *vv) {
455
243
    cram_codec *c = NULL;
456
243
    char *cp = data;
457
458
243
    if (size < 1)
459
0
        goto malformed;
460
461
243
    if (!(c = malloc(sizeof(*c))))
462
0
        return NULL;
463
464
243
    c->codec  = E_EXTERNAL;
465
243
    if (CRAM_MAJOR_VERS(version) >= 4) {
466
        // Version 4 does not permit integer data to be encoded as a
467
        // series of bytes.  This is used purely for bytes, either
468
        // singular or declared as arrays
469
0
        switch (codec) {
470
0
        case E_EXTERNAL:
471
0
            if (option == E_BYTE_ARRAY_BLOCK)
472
0
                c->decode = cram_external_decode_block;
473
0
            else if (option == E_BYTE || option == E_BYTE_ARRAY)
474
0
                c->decode = cram_external_decode_char;
475
0
            else
476
0
                return NULL;
477
0
            break;
478
0
        default:
479
0
            return NULL;
480
0
        }
481
243
    } else {
482
        // CRAM 3 and earlier encodes integers as EXTERNAL.  We need
483
        // use the option field to indicate the input data format so
484
        // we know which serialisation format to use.
485
243
        if (option == E_INT)
486
152
            c->decode = cram_external_decode_int;
487
91
        else if (option == E_LONG)
488
0
            c->decode = cram_external_decode_long;
489
91
        else if (option == E_BYTE_ARRAY || option == E_BYTE)
490
39
            c->decode = cram_external_decode_char;
491
52
        else
492
52
            c->decode = cram_external_decode_block;
493
243
    }
494
243
    c->free   = cram_external_decode_free;
495
243
    c->size   = cram_external_decode_size;
496
243
    c->get_block = cram_external_get_block;
497
498
243
    c->u.external.content_id = vv->varint_get32(&cp, data+size, NULL);
499
500
243
    if (cp - data != size)
501
1
        goto malformed;
502
503
242
    c->u.external.type = option;
504
505
242
    return c;
506
507
1
 malformed:
508
1
    hts_log_error("Malformed external header stream");
509
1
    free(c);
510
1
    return NULL;
511
243
}
512
513
int cram_external_encode_int(cram_slice *slice, cram_codec *c,
514
0
                             char *in, int in_size) {
515
0
    uint32_t *i32 = (uint32_t *)in;
516
0
    return c->vv->varint_put32_blk(c->out, *i32) >= 0 ? 0 : -1;
517
0
}
518
519
int cram_external_encode_sint(cram_slice *slice, cram_codec *c,
520
0
                             char *in, int in_size) {
521
0
    int32_t *i32 = (int32_t *)in;
522
0
    return c->vv->varint_put32s_blk(c->out, *i32) >= 0 ? 0 : -1;
523
0
}
524
525
int cram_external_encode_long(cram_slice *slice, cram_codec *c,
526
0
                             char *in, int in_size) {
527
0
    uint64_t *i64 = (uint64_t *)in;
528
0
    return c->vv->varint_put64_blk(c->out, *i64) >= 0 ? 0 : -1;
529
0
}
530
531
int cram_external_encode_slong(cram_slice *slice, cram_codec *c,
532
0
                               char *in, int in_size) {
533
0
    int64_t *i64 = (int64_t *)in;
534
0
    return c->vv->varint_put64s_blk(c->out, *i64) >= 0 ? 0 : -1;
535
0
}
536
537
int cram_external_encode_char(cram_slice *slice, cram_codec *c,
538
0
                              char *in, int in_size) {
539
0
    BLOCK_APPEND(c->out, in, in_size);
540
0
    return 0;
541
542
0
 block_err:
543
0
    return -1;
544
0
}
545
546
0
void cram_external_encode_free(cram_codec *c) {
547
0
    if (!c)
548
0
        return;
549
0
    free(c);
550
0
}
551
552
int cram_external_encode_store(cram_codec *c, cram_block *b, char *prefix,
553
0
                               int version) {
554
0
    char tmp[99], *tp = tmp, *tpend = tmp+99;
555
0
    int len = 0, r = 0, n;
556
557
0
    if (prefix) {
558
0
        size_t l = strlen(prefix);
559
0
        BLOCK_APPEND(b, prefix, l);
560
0
        len += l;
561
0
    }
562
563
0
    tp += c->vv->varint_put32(tp, tpend, c->u.e_external.content_id);
564
0
    len += (n = c->vv->varint_put32_blk(b, c->codec)); r |= n;
565
0
    len += (n = c->vv->varint_put32_blk(b, tp-tmp));   r |= n;
566
0
    BLOCK_APPEND(b, tmp, tp-tmp);
567
0
    len += tp-tmp;
568
569
0
    if (r > 0)
570
0
        return len;
571
572
0
 block_err:
573
0
    return -1;
574
0
}
575
576
cram_codec *cram_external_encode_init(cram_stats *st,
577
                                      enum cram_encoding codec,
578
                                      enum cram_external_type option,
579
                                      void *dat,
580
0
                                      int version, varint_vec *vv) {
581
0
    cram_codec *c;
582
583
0
    c = malloc(sizeof(*c));
584
0
    if (!c)
585
0
        return NULL;
586
0
    c->codec = E_EXTERNAL;
587
0
    c->free = cram_external_encode_free;
588
0
    if (CRAM_MAJOR_VERS(version) >= 4) {
589
        // Version 4 does not permit integer data to be encoded as a
590
        // series of bytes.  This is used purely for bytes, either
591
        // singular or declared as arrays
592
0
        switch (codec) {
593
0
        case E_EXTERNAL:
594
0
            if (option != E_BYTE && option != E_BYTE_ARRAY)
595
0
                return NULL;
596
0
            c->encode = cram_external_encode_char;
597
0
            break;
598
0
        default:
599
0
            return NULL;
600
0
        }
601
0
    } else {
602
        // CRAM 3 and earlier encodes integers as EXTERNAL.  We need
603
        // use the option field to indicate the input data format so
604
        // we know which serialisation format to use.
605
0
        if (option == E_INT)
606
0
            c->encode = cram_external_encode_int;
607
0
        else if (option == E_LONG)
608
0
            c->encode = cram_external_encode_long;
609
0
        else if (option == E_BYTE_ARRAY || option == E_BYTE)
610
0
            c->encode = cram_external_encode_char;
611
0
        else
612
0
            abort();
613
0
    }
614
0
    c->store = cram_external_encode_store;
615
0
    c->flush = NULL;
616
617
0
    c->u.e_external.content_id = (size_t)dat;
618
619
0
    return c;
620
0
}
621
622
/*
623
 * ---------------------------------------------------------------------------
624
 * VARINT
625
 *
626
 * In CRAM 3.0 and earlier, E_EXTERNAL stored both integers in ITF8
627
 * format as well as bytes.  In CRAM 4 EXTERNAL is only for bytes and
628
 * byte arrays, with two dedicated encodings for integers:
629
 * VARINT_SIGNED and VARINT_UNSIGNED.  These also differ a little to
630
 * EXTERNAL with the addition of an offset field, meaning we can store
631
 * values in, say, the range -2 to 1 million without needing to use
632
 * a signed zig-zag transformation.
633
 */
634
int cram_varint_decode_int(cram_slice *slice, cram_codec *c,
635
0
                           cram_block *in, char *out, int *out_size) {
636
0
    char *cp;
637
0
    cram_block *b;
638
639
    /* Find the data block */
640
0
    b = cram_get_block_by_id(slice, c->u.varint.content_id);
641
0
    if (!b)
642
0
        return *out_size?-1:0;
643
644
0
    cp = (char *)b->data + b->idx;
645
    // E_INT and E_LONG are guaranteed single item queries
646
0
    int err = 0;
647
0
    *(int32_t *)out = c->vv->varint_get32(&cp,
648
0
                                          (char *)b->data + b->uncomp_size,
649
0
                                          &err) + c->u.varint.offset;
650
0
    b->idx = cp - (char *)b->data;
651
0
    *out_size = 1;
652
653
0
    return err ? -1 : 0;
654
0
}
655
656
int cram_varint_decode_sint(cram_slice *slice, cram_codec *c,
657
0
                            cram_block *in, char *out, int *out_size) {
658
0
    char *cp;
659
0
    cram_block *b;
660
661
    /* Find the data block */
662
0
    b = cram_get_block_by_id(slice, c->u.varint.content_id);
663
0
    if (!b)
664
0
        return *out_size?-1:0;
665
666
0
    cp = (char *)b->data + b->idx;
667
    // E_INT and E_LONG are guaranteed single item queries
668
0
    int err = 0;
669
0
    *(int32_t *)out = c->vv->varint_get32s(&cp,
670
0
                                           (char *)b->data + b->uncomp_size,
671
0
                                           &err) + c->u.varint.offset;
672
0
    b->idx = cp - (char *)b->data;
673
0
    *out_size = 1;
674
675
0
    return err ? -1 : 0;
676
0
}
677
678
int cram_varint_decode_long(cram_slice *slice, cram_codec *c,
679
0
                            cram_block *in, char *out, int *out_size) {
680
0
    char *cp;
681
0
    cram_block *b;
682
683
    /* Find the data block */
684
0
    b = cram_get_block_by_id(slice, c->u.varint.content_id);
685
0
    if (!b)
686
0
        return *out_size?-1:0;
687
688
0
    cp = (char *)b->data + b->idx;
689
    // E_INT and E_LONG are guaranteed single item queries
690
0
    int err = 0;
691
0
    *(int64_t *)out = c->vv->varint_get64(&cp,
692
0
                                          (char *)b->data + b->uncomp_size,
693
0
                                          &err) + c->u.varint.offset;
694
0
    b->idx = cp - (char *)b->data;
695
0
    *out_size = 1;
696
697
0
    return err ? -1 : 0;
698
0
}
699
700
int cram_varint_decode_slong(cram_slice *slice, cram_codec *c,
701
0
                             cram_block *in, char *out, int *out_size) {
702
0
    char *cp;
703
0
    cram_block *b;
704
705
    /* Find the data block */
706
0
    b = cram_get_block_by_id(slice, c->u.varint.content_id);
707
0
    if (!b)
708
0
        return *out_size?-1:0;
709
710
0
    cp = (char *)b->data + b->idx;
711
    // E_INT and E_LONG are guaranteed single item queries
712
0
    int err = 0;
713
0
    *(int64_t *)out = c->vv->varint_get64s(&cp,
714
0
                                           (char *)b->data + b->uncomp_size,
715
0
                                           &err) + c->u.varint.offset;
716
0
    b->idx = cp - (char *)b->data;
717
0
    *out_size = 1;
718
719
0
    return err ? -1 : 0;
720
0
}
721
722
199
void cram_varint_decode_free(cram_codec *c) {
723
199
    if (c)
724
199
        free(c);
725
199
}
726
727
0
int cram_varint_decode_size(cram_slice *slice, cram_codec *c) {
728
0
    cram_block *b;
729
730
    /* Find the data block */
731
0
    b = cram_get_block_by_id(slice, c->u.varint.content_id);
732
0
    if (!b)
733
0
        return -1;
734
735
0
    return b->uncomp_size;
736
0
}
737
738
0
cram_block *cram_varint_get_block(cram_slice *slice, cram_codec *c) {
739
0
    return cram_get_block_by_id(slice, c->u.varint.content_id);
740
0
}
741
742
cram_codec *cram_varint_decode_init(cram_block_compression_hdr *hdr,
743
                                    char *data, int size,
744
                                    enum cram_encoding codec,
745
                                    enum cram_external_type option,
746
199
                                    int version, varint_vec *vv) {
747
199
    cram_codec *c;
748
199
    char *cp = data, *cp_end = data+size;
749
750
199
    if (!(c = malloc(sizeof(*c))))
751
0
        return NULL;
752
753
199
    c->codec  = codec;
754
755
    // Function pointer choice is theoretically by codec type.
756
    // Given we have some vars as int32 and some as int64 we
757
    // use option too for sizing, although on disk format
758
    // does not change.
759
199
    switch(codec) {
760
121
    case E_VARINT_UNSIGNED:
761
121
        c->decode = (option == E_INT)
762
121
            ? cram_varint_decode_int
763
121
            : cram_varint_decode_long;
764
121
        break;
765
78
    case E_VARINT_SIGNED:
766
78
        c->decode = (option == E_INT)
767
78
            ? cram_varint_decode_sint
768
78
            : cram_varint_decode_slong;
769
78
        break;
770
0
    default:
771
0
        return NULL;
772
199
    }
773
774
199
    c->free   = cram_varint_decode_free;
775
199
    c->size   = cram_varint_decode_size;
776
199
    c->get_block = cram_varint_get_block;
777
778
199
    c->u.varint.content_id = vv->varint_get32 (&cp, cp_end, NULL);
779
199
    c->u.varint.offset     = vv->varint_get64s(&cp, cp_end, NULL);
780
781
199
    if (cp - data != size) {
782
0
        fprintf(stderr, "Malformed varint header stream\n");
783
0
        free(c);
784
0
        return NULL;
785
0
    }
786
787
199
    c->u.varint.type = option;
788
789
199
    return c;
790
199
}
791
792
int cram_varint_encode_int(cram_slice *slice, cram_codec *c,
793
0
                           char *in, int in_size) {
794
0
    uint32_t *i32 = (uint32_t *)in;
795
0
    return c->vv->varint_put32_blk(c->out, *i32 - c->u.varint.offset) >= 0
796
0
        ? 0 : -1;
797
0
}
798
799
int cram_varint_encode_sint(cram_slice *slice, cram_codec *c,
800
0
                            char *in, int in_size) {
801
0
    int32_t *i32 = (int32_t *)in;
802
0
    return c->vv->varint_put32s_blk(c->out, *i32 - c->u.varint.offset) >= 0
803
0
        ? 0 : -1;
804
0
}
805
806
int cram_varint_encode_long(cram_slice *slice, cram_codec *c,
807
0
                            char *in, int in_size) {
808
0
    uint64_t *i64 = (uint64_t *)in;
809
0
    return c->vv->varint_put64_blk(c->out, *i64 - c->u.varint.offset) >= 0
810
0
        ? 0 : -1;
811
0
}
812
813
int cram_varint_encode_slong(cram_slice *slice, cram_codec *c,
814
0
                             char *in, int in_size) {
815
0
    int64_t *i64 = (int64_t *)in;
816
0
    return c->vv->varint_put64s_blk(c->out, *i64 - c->u.varint.offset) >= 0
817
0
        ? 0 : -1;
818
0
}
819
820
0
void cram_varint_encode_free(cram_codec *c) {
821
0
    if (!c)
822
0
        return;
823
0
    free(c);
824
0
}
825
826
int cram_varint_encode_store(cram_codec *c, cram_block *b, char *prefix,
827
0
                             int version) {
828
0
    char tmp[99], *tp = tmp;
829
0
    int len = 0;
830
831
0
    if (prefix) {
832
0
        size_t l = strlen(prefix);
833
0
        BLOCK_APPEND(b, prefix, l);
834
0
        len += l;
835
0
    }
836
837
0
    tp += c->vv->varint_put32 (tp, NULL, c->u.e_varint.content_id);
838
0
    tp += c->vv->varint_put64s(tp, NULL, c->u.e_varint.offset);
839
0
    len += c->vv->varint_put32_blk(b, c->codec);
840
0
    len += c->vv->varint_put32_blk(b, tp-tmp);
841
0
    BLOCK_APPEND(b, tmp, tp-tmp);
842
0
    len += tp-tmp;
843
844
0
    return len;
845
846
0
 block_err:
847
0
    return -1;
848
0
}
849
850
cram_codec *cram_varint_encode_init(cram_stats *st,
851
                                    enum cram_encoding codec,
852
                                    enum cram_external_type option,
853
                                    void *dat,
854
0
                                    int version, varint_vec *vv) {
855
0
    cram_codec *c;
856
857
0
    if (!(c = malloc(sizeof(*c))))
858
0
        return NULL;
859
860
0
    c->u.e_varint.offset = 0;
861
0
    if (st) {
862
        // Marginal difference so far! Not worth the hassle?
863
0
        if (st->min_val < 0 && st->min_val >= -127
864
0
            && st->max_val / -st->min_val > 100) {
865
0
            c->u.e_varint.offset = -st->min_val;
866
0
            codec = E_VARINT_UNSIGNED;
867
0
        } else if (st->min_val > 0) {
868
0
            c->u.e_varint.offset = -st->min_val;
869
0
        }
870
0
    }
871
872
0
    c->codec = codec;
873
0
    c->free = cram_varint_encode_free;
874
875
    // Function pointer choice is theoretically by codec type.
876
    // Given we have some vars as int32 and some as int64 we
877
    // use option too for sizing, although on disk format
878
    // does not change.
879
0
    switch (codec) {
880
0
    case E_VARINT_UNSIGNED:
881
0
        c->encode = (option == E_INT)
882
0
            ? cram_varint_encode_int
883
0
            : cram_varint_encode_long;
884
0
        break;
885
0
    case E_VARINT_SIGNED:
886
0
        c->encode = (option == E_INT)
887
0
            ? cram_varint_encode_sint
888
0
            : cram_varint_encode_slong;
889
0
        break;
890
0
    default:
891
0
        return NULL;
892
0
    }
893
0
    c->store = cram_varint_encode_store;
894
0
    c->flush = NULL;
895
896
0
    c->u.e_varint.content_id = (size_t)dat;
897
898
0
    return c;
899
0
}
900
/*
901
 * ---------------------------------------------------------------------------
902
 * CONST_BYTE and CONST_INT
903
 */
904
int cram_const_decode_byte(cram_slice *slice, cram_codec *c,
905
0
                           cram_block *in, char *out, int *out_size) {
906
0
    int i, n;
907
908
0
    for (i = 0, n = *out_size; i < n; i++)
909
0
        out[i] = c->u.xconst.val;
910
911
0
    return 0;
912
0
}
913
914
int cram_const_decode_int(cram_slice *slice, cram_codec *c,
915
0
                          cram_block *in, char *out, int *out_size) {
916
0
    int32_t *out_i = (int32_t *)out;
917
0
    int i, n;
918
919
0
    for (i = 0, n = *out_size; i < n; i++)
920
0
        out_i[i] = c->u.xconst.val;
921
922
0
    return 0;
923
0
}
924
925
int cram_const_decode_long(cram_slice *slice, cram_codec *c,
926
0
                           cram_block *in, char *out, int *out_size) {
927
0
    int64_t *out_i = (int64_t *)out;
928
0
    int i, n;
929
930
0
    for (i = 0, n = *out_size; i < n; i++)
931
0
        out_i[i] = c->u.xconst.val;
932
933
0
    return 0;
934
0
}
935
936
210
void cram_const_decode_free(cram_codec *c) {
937
210
    if (c)
938
210
        free(c);
939
210
}
940
941
0
int cram_const_decode_size(cram_slice *slice, cram_codec *c) {
942
0
    return 0;
943
0
}
944
945
cram_codec *cram_const_decode_init(cram_block_compression_hdr *hdr,
946
                                   char *data, int size,
947
                                   enum cram_encoding codec,
948
                                   enum cram_external_type option,
949
210
                                   int version, varint_vec *vv) {
950
210
    cram_codec *c;
951
210
    char *cp = data;
952
953
210
    if (!(c = malloc(sizeof(*c))))
954
0
        return NULL;
955
956
210
    c->codec  = codec;
957
210
    if (codec == E_CONST_BYTE)
958
0
        c->decode = cram_const_decode_byte;
959
210
    else if (option == E_INT)
960
79
        c->decode = cram_const_decode_int;
961
131
    else
962
131
        c->decode = cram_const_decode_long;
963
210
    c->free   = cram_const_decode_free;
964
210
    c->size   = cram_const_decode_size;
965
210
    c->get_block = NULL;
966
967
210
    c->u.xconst.val = vv->varint_get64s(&cp, data+size, NULL);
968
969
210
    if (cp - data != size) {
970
0
        fprintf(stderr, "Malformed const header stream\n");
971
0
        free(c);
972
0
        return NULL;
973
0
    }
974
975
210
    return c;
976
210
}
977
978
int cram_const_encode(cram_slice *slice, cram_codec *c,
979
0
                      char *in, int in_size) {
980
0
    return 0;
981
0
}
982
983
int cram_const_encode_store(cram_codec *c, cram_block *b, char *prefix,
984
0
                            int version) {
985
0
    char tmp[99], *tp = tmp;
986
0
    int len = 0;
987
988
0
    if (prefix) {
989
0
        size_t l = strlen(prefix);
990
0
        BLOCK_APPEND(b, prefix, l);
991
0
        len += l;
992
0
    }
993
994
0
    tp += c->vv->varint_put64s(tp, NULL, c->u.xconst.val);
995
0
    len += c->vv->varint_put32_blk(b, c->codec);
996
0
    len += c->vv->varint_put32_blk(b, tp-tmp);
997
0
    BLOCK_APPEND(b, tmp, tp-tmp);
998
0
    len += tp-tmp;
999
1000
0
    return len;
1001
1002
0
 block_err:
1003
0
    return -1;
1004
0
}
1005
1006
cram_codec *cram_const_encode_init(cram_stats *st,
1007
                                   enum cram_encoding codec,
1008
                                   enum cram_external_type option,
1009
                                   void *dat,
1010
0
                                   int version, varint_vec *vv) {
1011
0
    cram_codec *c;
1012
1013
0
    if (!(c = malloc(sizeof(*c))))
1014
0
        return NULL;
1015
1016
0
    c->codec = codec;
1017
0
    c->free = cram_const_decode_free; // as as decode
1018
0
    c->encode = cram_const_encode; // a nop
1019
0
    c->store = cram_const_encode_store;
1020
0
    c->flush = NULL;
1021
0
    c->u.e_xconst.val = st->min_val;
1022
1023
0
    return c;
1024
0
}
1025
1026
/*
1027
 * ---------------------------------------------------------------------------
1028
 * BETA
1029
 */
1030
0
int cram_beta_decode_long(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
1031
0
    int64_t *out_i = (int64_t *)out;
1032
0
    int i, n = *out_size;
1033
1034
0
    if (c->u.beta.nbits) {
1035
0
        if (cram_not_enough_bits(in, c->u.beta.nbits * n))
1036
0
            return -1;
1037
1038
0
        for (i = 0; i < n; i++)
1039
0
            out_i[i] = get_bits_MSB(in, c->u.beta.nbits) - c->u.beta.offset;
1040
0
    } else {
1041
0
        for (i = 0; i < n; i++)
1042
0
            out_i[i] = -c->u.beta.offset;
1043
0
    }
1044
1045
0
    return 0;
1046
0
}
1047
1048
0
int cram_beta_decode_int(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
1049
0
    int32_t *out_i = (int32_t *)out;
1050
0
    int i, n = *out_size;
1051
1052
0
    if (c->u.beta.nbits) {
1053
0
        if (cram_not_enough_bits(in, c->u.beta.nbits * n))
1054
0
            return -1;
1055
1056
0
        for (i = 0; i < n; i++)
1057
0
            out_i[i] = get_bits_MSB(in, c->u.beta.nbits) - c->u.beta.offset;
1058
0
    } else {
1059
0
        for (i = 0; i < n; i++)
1060
0
            out_i[i] = -c->u.beta.offset;
1061
0
    }
1062
1063
0
    return 0;
1064
0
}
1065
1066
0
int cram_beta_decode_char(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
1067
0
    int i, n = *out_size;
1068
1069
1070
0
    if (c->u.beta.nbits) {
1071
0
        if (cram_not_enough_bits(in, c->u.beta.nbits * n))
1072
0
            return -1;
1073
1074
0
        if (out)
1075
0
            for (i = 0; i < n; i++)
1076
0
                out[i] = get_bits_MSB(in, c->u.beta.nbits) - c->u.beta.offset;
1077
0
        else
1078
0
            for (i = 0; i < n; i++)
1079
0
                get_bits_MSB(in, c->u.beta.nbits);
1080
0
    } else {
1081
0
        if (out)
1082
0
            for (i = 0; i < n; i++)
1083
0
                out[i] = -c->u.beta.offset;
1084
0
    }
1085
1086
0
    return 0;
1087
0
}
1088
1089
112
void cram_beta_decode_free(cram_codec *c) {
1090
112
    if (c)
1091
112
        free(c);
1092
112
}
1093
1094
cram_codec *cram_beta_decode_init(cram_block_compression_hdr *hdr,
1095
                                  char *data, int size,
1096
                                  enum cram_encoding codec,
1097
                                  enum cram_external_type option,
1098
112
                                  int version, varint_vec *vv) {
1099
112
    cram_codec *c;
1100
112
    char *cp = data;
1101
1102
112
    if (!(c = malloc(sizeof(*c))))
1103
0
        return NULL;
1104
1105
112
    c->codec  = E_BETA;
1106
112
    if (option == E_INT || option == E_SINT)
1107
42
        c->decode = cram_beta_decode_int;
1108
70
    else if (option == E_LONG || option == E_SLONG)
1109
0
        c->decode = cram_beta_decode_long;
1110
70
    else if (option == E_BYTE_ARRAY || option == E_BYTE)
1111
70
        c->decode = cram_beta_decode_char;
1112
0
    else {
1113
0
        hts_log_error("BYTE_ARRAYs not supported by this codec");
1114
0
        free(c);
1115
0
        return NULL;
1116
0
    }
1117
112
    c->free   = cram_beta_decode_free;
1118
1119
112
    c->u.beta.nbits = -1;
1120
112
    c->u.beta.offset = vv->varint_get32(&cp, data + size, NULL);
1121
112
    if (cp < data + size) // Ensure test below works
1122
112
        c->u.beta.nbits  = vv->varint_get32(&cp, data + size, NULL);
1123
1124
112
    if (cp - data != size
1125
112
        || c->u.beta.nbits < 0 || c->u.beta.nbits > 8 * sizeof(int)) {
1126
0
        hts_log_error("Malformed beta header stream");
1127
0
        free(c);
1128
0
        return NULL;
1129
0
    }
1130
1131
112
    return c;
1132
112
}
1133
1134
int cram_beta_encode_store(cram_codec *c, cram_block *b,
1135
0
                           char *prefix, int version) {
1136
0
    int len = 0, r = 0, n;
1137
1138
0
    if (prefix) {
1139
0
        size_t l = strlen(prefix);
1140
0
        BLOCK_APPEND(b, prefix, l);
1141
0
        len += l;
1142
0
    }
1143
1144
0
    len += (n = c->vv->varint_put32_blk(b, c->codec)); r |= n;
1145
    // codec length
1146
0
    len += (n = c->vv->varint_put32_blk(b, c->vv->varint_size(c->u.e_beta.offset)
1147
0
                                         + c->vv->varint_size(c->u.e_beta.nbits)));
1148
0
    r |= n;
1149
0
    len += (n = c->vv->varint_put32_blk(b, c->u.e_beta.offset)); r |= n;
1150
0
    len += (n = c->vv->varint_put32_blk(b, c->u.e_beta.nbits));  r |= n;
1151
1152
0
    if (r > 0) return len;
1153
1154
0
 block_err:
1155
0
    return -1;
1156
0
}
1157
1158
int cram_beta_encode_long(cram_slice *slice, cram_codec *c,
1159
0
                          char *in, int in_size) {
1160
0
    int64_t *syms = (int64_t *)in;
1161
0
    int i, r = 0;
1162
1163
0
    for (i = 0; i < in_size; i++)
1164
0
        r |= store_bits_MSB(c->out, syms[i] + c->u.e_beta.offset,
1165
0
                            c->u.e_beta.nbits);
1166
1167
0
    return r;
1168
0
}
1169
1170
int cram_beta_encode_int(cram_slice *slice, cram_codec *c,
1171
0
                         char *in, int in_size) {
1172
0
    int *syms = (int *)in;
1173
0
    int i, r = 0;
1174
1175
0
    for (i = 0; i < in_size; i++)
1176
0
        r |= store_bits_MSB(c->out, syms[i] + c->u.e_beta.offset,
1177
0
                            c->u.e_beta.nbits);
1178
1179
0
    return r;
1180
0
}
1181
1182
int cram_beta_encode_char(cram_slice *slice, cram_codec *c,
1183
0
                          char *in, int in_size) {
1184
0
    unsigned char *syms = (unsigned char *)in;
1185
0
    int i, r = 0;
1186
1187
0
    for (i = 0; i < in_size; i++)
1188
0
        r |= store_bits_MSB(c->out, syms[i] + c->u.e_beta.offset,
1189
0
                            c->u.e_beta.nbits);
1190
1191
0
    return r;
1192
0
}
1193
1194
0
void cram_beta_encode_free(cram_codec *c) {
1195
0
    if (c) free(c);
1196
0
}
1197
1198
cram_codec *cram_beta_encode_init(cram_stats *st,
1199
                                  enum cram_encoding codec,
1200
                                  enum cram_external_type option,
1201
                                  void *dat,
1202
0
                                  int version, varint_vec *vv) {
1203
0
    cram_codec *c;
1204
0
    int min_val, max_val, len = 0;
1205
0
    int64_t range;
1206
1207
0
    c = malloc(sizeof(*c));
1208
0
    if (!c)
1209
0
        return NULL;
1210
0
    c->codec  = E_BETA;
1211
0
    c->free   = cram_beta_encode_free;
1212
0
    if (option == E_INT || option == E_SINT)
1213
0
        c->encode = cram_beta_encode_int;
1214
0
    else if (option == E_LONG || option == E_SLONG)
1215
0
        c->encode = cram_beta_encode_long;
1216
0
    else
1217
0
        c->encode = cram_beta_encode_char;
1218
0
    c->store  = cram_beta_encode_store;
1219
0
    c->flush = NULL;
1220
1221
0
    if (dat) {
1222
0
        min_val = ((int *)dat)[0];
1223
0
        max_val = ((int *)dat)[1];
1224
0
    } else {
1225
0
        min_val = INT_MAX;
1226
0
        max_val = INT_MIN;
1227
0
        int i;
1228
0
        for (i = 0; i < MAX_STAT_VAL; i++) {
1229
0
            if (!st->freqs[i])
1230
0
                continue;
1231
0
            if (min_val > i)
1232
0
                min_val = i;
1233
0
            max_val = i;
1234
0
        }
1235
0
        if (st->h) {
1236
0
            khint_t k;
1237
1238
0
            for (k = kh_begin(st->h); k != kh_end(st->h); k++) {
1239
0
                if (!kh_exist(st->h, k))
1240
0
                    continue;
1241
1242
0
                i = kh_key(st->h, k);
1243
0
                if (min_val > i)
1244
0
                    min_val = i;
1245
0
                if (max_val < i)
1246
0
                    max_val = i;
1247
0
            }
1248
0
        }
1249
0
    }
1250
1251
0
    assert(max_val >= min_val);
1252
0
    c->u.e_beta.offset = -min_val;
1253
0
    range = (int64_t) max_val - min_val;
1254
0
    while (range) {
1255
0
        len++;
1256
0
        range >>= 1;
1257
0
    }
1258
0
    c->u.e_beta.nbits = len;
1259
1260
0
    return c;
1261
0
}
1262
1263
/*
1264
 * ---------------------------------------------------------------------------
1265
 * XPACK: Packing multiple values into a single byte.  A fast transform that
1266
 * reduces time taken by entropy encoder and may also improve compression.
1267
 *
1268
 * This also has the additional requirement that the data series is not
1269
 * interleaved with another, permitting efficient encoding and decoding
1270
 * of all elements enmasse instead of needing to only extract the bits
1271
 * necessary per item.
1272
 */
1273
0
int cram_xpack_decode_long(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
1274
0
    int64_t *out_i = (int64_t *)out;
1275
0
    int i, n = *out_size;
1276
1277
0
    if (c->u.xpack.nbits) {
1278
0
        for (i = 0; i < n; i++)
1279
0
            out_i[i] = c->u.xpack.rmap[get_bits_MSB(in, c->u.xpack.nbits)];
1280
0
    } else {
1281
0
        for (i = 0; i < n; i++)
1282
0
            out_i[i] = c->u.xpack.rmap[0];
1283
0
    }
1284
1285
0
    return 0;
1286
0
}
1287
1288
0
int cram_xpack_decode_int(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
1289
0
    int32_t *out_i = (int32_t *)out;
1290
0
    int i, n = *out_size;
1291
1292
0
    if (c->u.xpack.nbits) {
1293
0
        if (cram_not_enough_bits(in, c->u.xpack.nbits * n))
1294
0
            return -1;
1295
1296
0
        for (i = 0; i < n; i++)
1297
0
            out_i[i] = c->u.xpack.rmap[get_bits_MSB(in, c->u.xpack.nbits)];
1298
0
    } else {
1299
0
        for (i = 0; i < n; i++)
1300
0
            out_i[i] = c->u.xpack.rmap[0];
1301
0
    }
1302
1303
0
    return 0;
1304
0
}
1305
1306
0
static int cram_xpack_decode_expand_char(cram_slice *slice, cram_codec *c) {
1307
0
    cram_block *b = slice->block_by_id[512 + c->codec_id];
1308
0
    if (b)
1309
0
        return 0;
1310
1311
    // get sub-codec data.
1312
0
    cram_block *sub_b = c->u.xpack.sub_codec->get_block(slice, c->u.xpack.sub_codec);
1313
0
    if (!sub_b)
1314
0
        return -1;
1315
1316
    // Allocate local block to expand into
1317
0
    b = slice->block_by_id[512 + c->codec_id] = cram_new_block(0, 0);
1318
0
    if (!b)
1319
0
        return -1;
1320
0
    int n = sub_b->uncomp_size * 8/c->u.xpack.nbits;
1321
0
    BLOCK_GROW(b, n);
1322
0
    b->uncomp_size = n;
1323
1324
0
    uint8_t p[256];
1325
0
    int z;
1326
0
    for (z = 0; z < 256; z++)
1327
0
        p[z] = c->u.xpack.rmap[z];
1328
0
    hts_unpack(sub_b->data, sub_b->uncomp_size, b->data, b->uncomp_size,
1329
0
               8 / c->u.xpack.nbits, p);
1330
1331
0
    return 0;
1332
1333
0
 block_err:
1334
0
    return -1;
1335
0
}
1336
1337
0
int cram_xpack_decode_char(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
1338
    // FIXME: we need to ban data-series interleaving in the spec for this to work.
1339
1340
    // Remember this may be called when threaded and multi-slice per container.
1341
    // Hence one cram_codec instance, multiple slices, multiple blocks.
1342
    // We therefore have to cache appropriate block info in slice and not codec.
1343
    //    b = cram_get_block_by_id(slice, c->external.content_id);
1344
0
    if (c->u.xpack.nval > 1) {
1345
0
        cram_xpack_decode_expand_char(slice, c);
1346
0
        cram_block *b = slice->block_by_id[512 + c->codec_id];
1347
0
        if (!b)
1348
0
            return -1;
1349
1350
0
        if (out)
1351
0
            memcpy(out, b->data + b->byte, *out_size);
1352
0
        b->byte += *out_size;
1353
0
    } else {
1354
0
        memset(out, c->u.xpack.rmap[0], *out_size);
1355
0
    }
1356
1357
0
    return 0;
1358
0
}
1359
1360
256
void cram_xpack_decode_free(cram_codec *c) {
1361
256
    if (!c) return;
1362
1363
256
    if (c->u.xpack.sub_codec)
1364
254
        c->u.xpack.sub_codec->free(c->u.xpack.sub_codec);
1365
1366
    //free(slice->block_by_id[512 + c->codec_id]);
1367
    //slice->block_by_id[512 + c->codec_id] = 0;
1368
1369
256
    free(c);
1370
256
}
1371
1372
0
int cram_xpack_decode_size(cram_slice *slice, cram_codec *c) {
1373
0
    cram_xpack_decode_expand_char(slice, c);
1374
0
    return slice->block_by_id[512 + c->codec_id]->uncomp_size;
1375
0
}
1376
1377
0
cram_block *cram_xpack_get_block(cram_slice *slice, cram_codec *c) {
1378
0
    cram_xpack_decode_expand_char(slice, c);
1379
0
    return slice->block_by_id[512 + c->codec_id];
1380
0
}
1381
1382
cram_codec *cram_xpack_decode_init(cram_block_compression_hdr *hdr,
1383
                                   char *data, int size,
1384
                                   enum cram_encoding codec,
1385
                                   enum cram_external_type option,
1386
256
                                   int version, varint_vec *vv) {
1387
256
    cram_codec *c;
1388
256
    char *cp = data;
1389
256
    char *endp = data+size;
1390
1391
256
    if (!(c = calloc(1, sizeof(*c))))
1392
0
        return NULL;
1393
1394
256
    c->codec  = E_XPACK;
1395
256
    if (option == E_LONG)
1396
0
        c->decode = cram_xpack_decode_long;
1397
256
    else if (option == E_INT)
1398
55
        c->decode = cram_xpack_decode_int;
1399
201
    else if (option == E_BYTE_ARRAY || option == E_BYTE)
1400
201
        c->decode = cram_xpack_decode_char;
1401
0
    else {
1402
0
        fprintf(stderr, "BYTE_ARRAYs not supported by this codec\n");
1403
0
        goto malformed;
1404
0
    }
1405
256
    c->free = cram_xpack_decode_free;
1406
256
    c->size = cram_xpack_decode_size;
1407
256
    c->get_block = cram_xpack_get_block;
1408
1409
256
    c->u.xpack.nbits = vv->varint_get32(&cp, endp, NULL);
1410
256
    c->u.xpack.nval  = vv->varint_get32(&cp, endp, NULL);
1411
256
    if (c->u.xpack.nbits >= 8  || c->u.xpack.nbits < 0 ||
1412
256
        c->u.xpack.nval  > 256 || c->u.xpack.nval < 0)
1413
0
        goto malformed;
1414
256
    int i;
1415
1.45k
    for (i = 0; i < c->u.xpack.nval; i++) {
1416
1.19k
        uint32_t v = vv->varint_get32(&cp, endp, NULL);
1417
1.19k
        if (v >= 256)
1418
1
            goto malformed;
1419
1.19k
        c->u.xpack.rmap[i] = v; // reverse map: e.g 0-3 to P,A,C,K
1420
1.19k
    }
1421
1422
255
    int encoding = vv->varint_get32(&cp, endp, NULL);
1423
255
    int sub_size = vv->varint_get32(&cp, endp, NULL);
1424
255
    if (sub_size < 0 || endp - cp < sub_size)
1425
0
        goto malformed;
1426
255
    c->u.xpack.sub_codec = cram_decoder_init(hdr, encoding, cp, sub_size,
1427
255
                                             option, version, vv);
1428
255
    if (c->u.xpack.sub_codec == NULL)
1429
1
        goto malformed;
1430
254
    cp += sub_size;
1431
1432
254
    if (cp - data != size
1433
254
        || c->u.xpack.nbits < 0 || c->u.xpack.nbits > 8 * sizeof(int64_t)) {
1434
4
    malformed:
1435
4
        fprintf(stderr, "Malformed xpack header stream\n");
1436
4
        cram_xpack_decode_free(c);
1437
4
        return NULL;
1438
2
    }
1439
1440
252
    return c;
1441
254
}
1442
1443
0
int cram_xpack_encode_flush(cram_codec *c) {
1444
    // Pack the buffered up data
1445
0
    int meta_len;
1446
0
    uint64_t out_len;
1447
0
    uint8_t out_meta[1024];
1448
0
    uint8_t *out = hts_pack(BLOCK_DATA(c->out), BLOCK_SIZE(c->out),
1449
0
                            out_meta, &meta_len, &out_len);
1450
1451
    // We now need to pass this through the next layer of transform
1452
0
    if (c->u.e_xpack.sub_codec->encode(NULL, // also indicates flush incoming
1453
0
                                     c->u.e_xpack.sub_codec,
1454
0
                                     (char *)out, out_len))
1455
0
        return -1;
1456
1457
0
    int r = 0;
1458
0
    if (c->u.e_xpack.sub_codec->flush)
1459
0
        r = c->u.e_xpack.sub_codec->flush(c->u.e_xpack.sub_codec);
1460
1461
0
    free(out);
1462
0
    return r;
1463
0
}
1464
1465
int cram_xpack_encode_store(cram_codec *c, cram_block *b,
1466
0
                            char *prefix, int version) {
1467
0
    int len = 0, r = 0, n;
1468
1469
0
    if (prefix) {
1470
0
        size_t l = strlen(prefix);
1471
0
        BLOCK_APPEND(b, prefix, l);
1472
0
        len += l;
1473
0
    }
1474
1475
    // Store sub-codec
1476
0
    cram_codec *tc = c->u.e_xpack.sub_codec;
1477
0
    cram_block *tb = cram_new_block(0, 0);
1478
0
    if (!tb)
1479
0
        return -1;
1480
0
    int len2 = tc->store(tc, tb, NULL, version);
1481
1482
0
    len += (n = c->vv->varint_put32_blk(b, c->codec)); r |= n;
1483
1484
    // codec length
1485
0
    int len1 = 0, i;
1486
0
    for (i = 0; i < c->u.e_xpack.nval; i++)
1487
0
        len1 += (n = c->vv->varint_size(c->u.e_xpack.rmap[i])), r |= n;
1488
0
    len += (n = c->vv->varint_put32_blk(b, c->vv->varint_size(c->u.e_xpack.nbits)
1489
0
                                        +  c->vv->varint_size(c->u.e_xpack.nval)
1490
0
                                        + len1 + len2)); r |= n;
1491
1492
    // The map and sub-codec
1493
0
    len += (n = c->vv->varint_put32_blk(b, c->u.e_xpack.nbits)); r |= n;
1494
0
    len += (n = c->vv->varint_put32_blk(b, c->u.e_xpack.nval));  r |= n;
1495
0
    for (i = 0; i < c->u.e_xpack.nval; i++)
1496
0
        len += (n = c->vv->varint_put32_blk(b, c->u.e_xpack.rmap[i])), r |= n;
1497
1498
0
    BLOCK_APPEND(b, BLOCK_DATA(tb), BLOCK_SIZE(tb));
1499
1500
0
    cram_free_block(tb);
1501
1502
0
    return r > 0 ? len + len2 : -1;
1503
1504
0
 block_err:
1505
0
    return -1;
1506
0
}
1507
1508
// Same as cram_beta_encode_long
1509
int cram_xpack_encode_long(cram_slice *slice, cram_codec *c,
1510
0
                           char *in, int in_size) {
1511
0
    int64_t *syms = (int64_t *)in;
1512
0
    int i, r = 0;
1513
1514
0
    for (i = 0; i < in_size; i++)
1515
0
        r |= store_bits_MSB(c->out, c->u.e_xpack.map[syms[i]], c->u.e_xpack.nbits);
1516
1517
0
    return r;
1518
0
}
1519
1520
int cram_xpack_encode_int(cram_slice *slice, cram_codec *c,
1521
0
                          char *in, int in_size) {
1522
0
    int *syms = (int *)in;
1523
0
    int i, r = 0;
1524
1525
0
    for (i = 0; i < in_size; i++)
1526
0
        r |= store_bits_MSB(c->out, c->u.e_xpack.map[syms[i]], c->u.e_xpack.nbits);
1527
1528
0
    return r;
1529
0
}
1530
1531
int cram_xpack_encode_char(cram_slice *slice, cram_codec *c,
1532
0
                           char *in, int in_size) {
1533
0
    BLOCK_APPEND(c->out, in, in_size);
1534
0
    return 0;
1535
1536
0
 block_err:
1537
0
    return -1;
1538
0
}
1539
1540
0
void cram_xpack_encode_free(cram_codec *c) {
1541
0
    if (!c) return;
1542
1543
0
    if (c->u.e_xpack.sub_codec)
1544
0
        c->u.e_xpack.sub_codec->free(c->u.e_xpack.sub_codec);
1545
1546
0
    cram_free_block(c->out);
1547
1548
0
    free(c);
1549
0
}
1550
1551
cram_codec *cram_xpack_encode_init(cram_stats *st,
1552
                                   enum cram_encoding codec,
1553
                                   enum cram_external_type option,
1554
                                   void *dat,
1555
0
                                   int version, varint_vec *vv) {
1556
0
    cram_codec *c;
1557
1558
0
    if (!(c = malloc(sizeof(*c))))
1559
0
        return NULL;
1560
1561
0
    c->codec  = E_XPACK;
1562
0
    c->free   = cram_xpack_encode_free;
1563
0
    if (option == E_LONG)
1564
0
        c->encode = cram_xpack_encode_long;
1565
0
    else if (option == E_INT)
1566
0
        c->encode = cram_xpack_encode_int;
1567
0
    else
1568
0
        c->encode = cram_xpack_encode_char;
1569
0
    c->store  = cram_xpack_encode_store;
1570
0
    c->flush  = cram_xpack_encode_flush;
1571
1572
0
    cram_xpack_encoder *e = (cram_xpack_encoder *)dat;
1573
0
    c->u.e_xpack.nbits = e->nbits;
1574
0
    c->u.e_xpack.nval = e->nval;
1575
0
    c->u.e_xpack.sub_codec = cram_encoder_init(e->sub_encoding, NULL,
1576
0
                                               E_BYTE_ARRAY, e->sub_codec_dat,
1577
0
                                               version, vv);
1578
1579
    // Initialise fwd and rev maps
1580
0
    memcpy(c->u.e_xpack.map, e->map, sizeof(e->map)); // P,A,C,K to 0,1,2,3
1581
0
    int i, n;
1582
0
    for (i = n = 0; i < 256; i++)
1583
0
        if (e->map[i] != -1)
1584
0
            c->u.e_xpack.rmap[n++] = i;               // 0,1,2,3 to P,A,C,K
1585
0
    if (n != e->nval) {
1586
0
        fprintf(stderr, "Incorrectly specified number of map items in PACK\n");
1587
0
        return NULL;
1588
0
    }
1589
1590
0
    return c;
1591
0
}
1592
1593
/*
1594
 * ---------------------------------------------------------------------------
1595
 * XDELTA: subtract successive values, zig-zag to turn +/- to + only,
1596
 * and then var-int encode the result.
1597
 *
1598
 * This also has the additional requirement that the data series is not
1599
 * interleaved with another, permitting efficient encoding and decoding
1600
 * of all elements enmasse instead of needing to only extract the bits
1601
 * necessary per item.
1602
 */
1603
1604
0
static uint8_t  zigzag8 (int8_t  x) { return (x << 1) ^ (x >>  7); }
1605
0
static uint16_t zigzag16(int16_t x) { return (x << 1) ^ (x >> 15); }
1606
0
static uint32_t zigzag32(int32_t x) { return (x << 1) ^ (x >> 31); }
1607
1608
//static int8_t  unzigzag8 (uint8_t  x) { return (x >> 1) ^ -(x & 1); }
1609
0
static int16_t unzigzag16(uint16_t x) { return (x >> 1) ^ -(x & 1); }
1610
0
static int32_t unzigzag32(uint32_t x) { return (x >> 1) ^ -(x & 1); }
1611
1612
0
int cram_xdelta_decode_long(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
1613
0
    return -1;
1614
0
}
1615
1616
0
int cram_xdelta_decode_int(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
1617
    // Slow value-by-value method for now
1618
0
    uint32_t *out32 = (uint32_t *)out;
1619
0
    int i;
1620
0
    for (i = 0; i < *out_size; i++) {
1621
0
        uint32_t v;
1622
0
        int one = 1;
1623
0
        if (c->u.e_xdelta.sub_codec->decode(slice, c->u.e_xdelta.sub_codec, in,
1624
0
                                          (char *)&v, &one) < 0)
1625
0
            return -1;
1626
0
        uint32_t d = unzigzag32(v);
1627
0
        c->u.xdelta.last = out32[i] = d + c->u.xdelta.last;
1628
0
    }
1629
1630
0
    return 0;
1631
0
}
1632
1633
0
static int cram_xdelta_decode_expand_char(cram_slice *slice, cram_codec *c) {
1634
0
    return -1;
1635
0
}
1636
1637
0
int cram_xdelta_decode_char(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
1638
0
    return -1;
1639
0
}
1640
1641
0
static inline int16_t le_int2(int16_t i) {
1642
0
    int16_t s;
1643
0
    i16_to_le(i, (uint8_t *)&s);
1644
0
    return s;
1645
0
}
1646
1647
int cram_xdelta_decode_block(cram_slice *slice, cram_codec *c, cram_block *in,
1648
0
                             char *out_, int *out_size) {
1649
0
    cram_block *out = (cram_block *)out_;
1650
0
    cram_block *b = c->u.e_xdelta.sub_codec->get_block(slice, c->u.e_xdelta.sub_codec);
1651
0
    int i = 0;
1652
1653
0
    const int w = c->u.xdelta.word_size;
1654
0
    uint32_t npad = (w - *out_size%w)%w;
1655
0
    uint32_t out_sz = *out_size + npad;
1656
0
    c->u.xdelta.last = 0;  // reset for each new array
1657
1658
0
    for (i = 0; i < out_sz; i += w) {
1659
0
        uint16_t v;
1660
        // Need better interface
1661
0
        char *cp = (char *)b->data + b->byte;
1662
0
        char *cp_end = (char *)b->data + b->uncomp_size;
1663
0
        int err = 0;
1664
0
        v = c->vv->varint_get32(&cp, cp_end, &err);
1665
0
        if (err)
1666
0
            return -1;
1667
0
        b->byte = cp - (char *)b->data;
1668
1669
0
        switch(w) {
1670
0
        case 2: {
1671
0
            int16_t d = unzigzag16(v), z;
1672
0
            c->u.xdelta.last = d + c->u.xdelta.last;
1673
0
            z = le_int2(c->u.xdelta.last);
1674
0
            BLOCK_APPEND(out, &z, 2-npad);
1675
0
            npad = 0;
1676
0
            break;
1677
0
        }
1678
0
        default:
1679
0
            fprintf(stderr, "Unsupported word size by XDELTA\n");
1680
0
            return -1;
1681
0
        }
1682
0
    }
1683
1684
0
    return 0;
1685
1686
0
 block_err:
1687
0
    return -1;
1688
0
}
1689
1690
65
void cram_xdelta_decode_free(cram_codec *c) {
1691
65
    if (!c) return;
1692
1693
65
    if (c->u.xdelta.sub_codec)
1694
63
        c->u.xdelta.sub_codec->free(c->u.xdelta.sub_codec);
1695
1696
65
    free(c);
1697
65
}
1698
1699
0
int cram_xdelta_decode_size(cram_slice *slice, cram_codec *c) {
1700
0
    cram_xdelta_decode_expand_char(slice, c);
1701
0
    return slice->block_by_id[512 + c->codec_id]->uncomp_size;
1702
0
}
1703
1704
0
cram_block *cram_xdelta_get_block(cram_slice *slice, cram_codec *c) {
1705
0
    cram_xdelta_decode_expand_char(slice, c);
1706
0
    return slice->block_by_id[512 + c->codec_id];
1707
0
}
1708
1709
cram_codec *cram_xdelta_decode_init(cram_block_compression_hdr *hdr,
1710
                                    char *data, int size,
1711
                                    enum cram_encoding codec,
1712
                                    enum cram_external_type option,
1713
65
                                    int version, varint_vec *vv) {
1714
65
    cram_codec *c;
1715
65
    char *cp = data;
1716
65
    char *endp = data+size;
1717
1718
65
    if (!(c = calloc(1, sizeof(*c))))
1719
0
        return NULL;
1720
1721
65
    c->codec  = E_XDELTA;
1722
65
    if (option == E_LONG)
1723
0
        c->decode = cram_xdelta_decode_long;
1724
65
    else if (option == E_INT)
1725
36
        c->decode = cram_xdelta_decode_int;
1726
29
    else if (option == E_BYTE_ARRAY || option == E_BYTE)
1727
27
        c->decode = cram_xdelta_decode_char;
1728
2
    else if (option == E_BYTE_ARRAY_BLOCK) {
1729
2
        option = E_BYTE_ARRAY;
1730
2
        c->decode = cram_xdelta_decode_block;
1731
2
    } else {
1732
0
        free(c);
1733
0
        return NULL;
1734
0
    }
1735
65
    c->free = cram_xdelta_decode_free;
1736
65
    c->size = cram_xdelta_decode_size;
1737
65
    c->get_block = cram_xdelta_get_block;
1738
1739
65
    c->u.xdelta.word_size = vv->varint_get32(&cp, endp, NULL);
1740
65
    c->u.xdelta.last = 0;
1741
1742
65
    int encoding = vv->varint_get32(&cp, endp, NULL);
1743
65
    int sub_size = vv->varint_get32(&cp, endp, NULL);
1744
65
    if (sub_size < 0 || endp - cp < sub_size)
1745
0
        goto malformed;
1746
65
    c->u.xdelta.sub_codec = cram_decoder_init(hdr, encoding, cp, sub_size,
1747
65
                                              option, version, vv);
1748
65
    if (c->u.xdelta.sub_codec == NULL)
1749
2
        goto malformed;
1750
63
    cp += sub_size;
1751
1752
63
    if (cp - data != size) {
1753
4
    malformed:
1754
4
        fprintf(stderr, "Malformed xdelta header stream\n");
1755
4
        cram_xdelta_decode_free(c);
1756
4
        return NULL;
1757
2
    }
1758
1759
61
    return c;
1760
63
}
1761
1762
0
int cram_xdelta_encode_flush(cram_codec *c) {
1763
0
    int r = -1;
1764
0
    cram_block *b = cram_new_block(0, 0);
1765
0
    if (!b)
1766
0
        return -1;
1767
1768
0
    switch (c->u.e_xdelta.word_size) {
1769
0
    case 2: {
1770
        // Delta + zigzag transform.
1771
        // Subtracting two 8-bit values has a 9-bit result (-255 to 255).
1772
        // However think of it as turning a wheel clockwise or anti-clockwise.
1773
        // If it has 256 gradations then a -ve rotation followed by a +ve
1774
        // rotation of the same amount reverses it regardless.
1775
        //
1776
        // Similarly the zig-zag transformation doesn't invent any extra bits,
1777
        // so the entire thing can be done in-situ.  This may permit faster
1778
        // SIMD loops if we break apart the steps.
1779
1780
        // uint16_t last = 0, d;
1781
        // for (i = 0; i < n; i++) {
1782
        //     d = io[i] - last;
1783
        //     last = io[i];
1784
        //     io[i] = zigzag16(vd);
1785
        // }
1786
1787
        // --- vs ---
1788
1789
        // for (i = n-1; i >= 1; i--)
1790
        //     io[i] -= io[i-1];
1791
        // for (i = 0; i < n; i++)
1792
        //     io[i] = zigzag16(io[i]);
1793
1794
        // varint: need array variant for speed here.
1795
        // With zig-zag
1796
0
        int i, n = BLOCK_SIZE(c->out)/2;;
1797
0
        uint16_t *dat = (uint16_t *)BLOCK_DATA(c->out), last = 0;
1798
1799
0
        if (n*2 < BLOCK_SIZE(c->out)) {
1800
            // half word
1801
0
            last = *(uint8_t *)dat;
1802
0
            c->vv->varint_put32_blk(b, zigzag16(last));
1803
0
            dat = (uint16_t *)(((uint8_t *)dat)+1);
1804
0
        }
1805
1806
0
        for (i = 0; i < n; i++) {
1807
0
            uint16_t d = dat[i] - last; // possibly unaligned
1808
0
            last = dat[i];
1809
0
            c->vv->varint_put32_blk(b, zigzag16(d));
1810
0
        }
1811
1812
0
        break;
1813
0
    }
1814
1815
0
    case 4: {
1816
0
        int i, n = BLOCK_SIZE(c->out)/4;;
1817
0
        uint32_t *dat = (uint32_t *)BLOCK_DATA(c->out), last = 0;
1818
1819
0
        for (i = 0; i < n; i++) {
1820
0
            uint32_t d = dat[i] - last;
1821
0
            last = dat[i];
1822
0
            c->vv->varint_put32_blk(b, zigzag32(d));
1823
0
        }
1824
1825
0
        break;
1826
0
    }
1827
1828
0
    case 1: {
1829
0
        int i, n = BLOCK_SIZE(c->out);;
1830
0
        uint8_t *dat = (uint8_t *)BLOCK_DATA(c->out), last = 0;
1831
1832
0
        for (i = 0; i < n; i++) {
1833
0
            uint32_t d = dat[i] - last;
1834
0
            last = dat[i];
1835
0
            c->vv->varint_put32_blk(b, zigzag8(d));
1836
0
        }
1837
1838
0
        break;
1839
0
    }
1840
1841
0
    default:
1842
0
        goto err;
1843
0
    }
1844
1845
0
    if (c->u.e_xdelta.sub_codec->encode(NULL, c->u.e_xdelta.sub_codec,
1846
0
                                      (char *)b->data, b->byte))
1847
0
        goto err;
1848
1849
0
    r = 0;
1850
1851
0
 err:
1852
0
    cram_free_block(b);
1853
0
    return r;
1854
1855
0
}
1856
1857
int cram_xdelta_encode_store(cram_codec *c, cram_block *b,
1858
0
                            char *prefix, int version) {
1859
0
    int len = 0, r = 0, n;
1860
1861
0
    if (prefix) {
1862
0
        size_t l = strlen(prefix);
1863
0
        BLOCK_APPEND(b, prefix, l);
1864
0
        len += l;
1865
0
    }
1866
1867
    // Store sub-codec
1868
0
    cram_codec *tc = c->u.e_xdelta.sub_codec;
1869
0
    cram_block *tb = cram_new_block(0, 0);
1870
0
    if (!tb)
1871
0
        return -1;
1872
0
    int len2 = tc->store(tc, tb, NULL, version);
1873
1874
0
    len += (n = c->vv->varint_put32_blk(b, c->codec)); r |= n;
1875
1876
    // codec length
1877
0
    len += (n = c->vv->varint_put32_blk(b, c->vv->varint_size(c->u.e_xdelta.word_size)
1878
0
                                        + len2)); r |= n;
1879
1880
    // This and sub-codec
1881
0
    len += (n = c->vv->varint_put32_blk(b, c->u.e_xdelta.word_size)); r |= n;
1882
0
    BLOCK_APPEND(b, BLOCK_DATA(tb), BLOCK_SIZE(tb));
1883
1884
0
    cram_free_block(tb);
1885
1886
0
    return r > 0 ? len + len2 : -1;
1887
1888
0
 block_err:
1889
0
    return -1;
1890
0
}
1891
1892
// Same as cram_beta_encode_long
1893
int cram_xdelta_encode_long(cram_slice *slice, cram_codec *c,
1894
0
                           char *in, int in_size) {
1895
0
    return -1;
1896
0
}
1897
1898
int cram_xdelta_encode_int(cram_slice *slice, cram_codec *c,
1899
0
                          char *in, int in_size) {
1900
0
    return -1;
1901
0
}
1902
1903
int cram_xdelta_encode_char(cram_slice *slice, cram_codec *c,
1904
0
                            char *in, int in_size) {
1905
0
    char *dat = malloc(in_size*5);
1906
0
    if (!dat)
1907
0
        return -1;
1908
0
    char *cp = dat, *cp_end = dat + in_size*5;
1909
1910
0
    c->u.e_xdelta.last = 0; // reset for each new array
1911
0
    if (c->u.e_xdelta.word_size == 2) {
1912
0
        int i, part;
1913
1914
0
        part = in_size%2;
1915
0
        if (part) {
1916
0
            uint16_t z = in[0];
1917
0
            c->u.e_xdelta.last = le_int2(z);
1918
0
            cp += c->vv->varint_put32(cp, cp_end, zigzag16(c->u.e_xdelta.last));
1919
0
        }
1920
1921
0
        uint16_t *in16 = (uint16_t *)(in+part);
1922
0
        for (i = 0; i < in_size/2; i++) {
1923
0
            uint16_t d = le_int2(in16[i]) - c->u.e_xdelta.last;
1924
0
            c->u.e_xdelta.last = le_int2(in16[i]);
1925
0
            cp += c->vv->varint_put32(cp, cp_end, zigzag16(d));
1926
0
        }
1927
0
    }
1928
0
    if (c->u.e_xdelta.sub_codec->encode(slice, c->u.e_xdelta.sub_codec,
1929
0
                                      (char *)dat, cp-dat)) {
1930
0
        free(dat);
1931
0
        return -1;
1932
0
    }
1933
1934
0
    free(dat);
1935
0
    return 0;
1936
0
}
1937
1938
0
void cram_xdelta_encode_free(cram_codec *c) {
1939
0
    if (!c) return;
1940
1941
0
    if (c->u.e_xdelta.sub_codec)
1942
0
        c->u.e_xdelta.sub_codec->free(c->u.e_xdelta.sub_codec);
1943
1944
0
    cram_free_block(c->out);
1945
1946
0
    free(c);
1947
0
}
1948
1949
cram_codec *cram_xdelta_encode_init(cram_stats *st,
1950
                                    enum cram_encoding codec,
1951
                                    enum cram_external_type option,
1952
                                    void *dat,
1953
0
                                    int version, varint_vec *vv) {
1954
0
    cram_codec *c;
1955
1956
0
    if (!(c = malloc(sizeof(*c))))
1957
0
        return NULL;
1958
1959
0
    c->codec  = E_XDELTA;
1960
0
    c->free   = cram_xdelta_encode_free;
1961
0
    if (option == E_LONG)
1962
0
        c->encode = cram_xdelta_encode_long;
1963
0
    else if (option == E_INT)
1964
0
        c->encode = cram_xdelta_encode_int;
1965
0
    else
1966
0
        c->encode = cram_xdelta_encode_char;
1967
0
    c->store  = cram_xdelta_encode_store;
1968
0
    c->flush  = cram_xdelta_encode_flush;
1969
1970
0
    cram_xdelta_encoder *e = (cram_xdelta_encoder *)dat;
1971
0
    c->u.e_xdelta.word_size = e->word_size;
1972
0
    c->u.e_xdelta.last = 0;
1973
0
    c->u.e_xdelta.sub_codec = cram_encoder_init(e->sub_encoding, NULL,
1974
0
                                                E_BYTE_ARRAY,
1975
0
                                                e->sub_codec_dat,
1976
0
                                                version, vv);
1977
1978
0
    return c;
1979
0
}
1980
1981
/*
1982
 * ---------------------------------------------------------------------------
1983
 * XRLE
1984
 *
1985
 * This also has the additional requirement that the data series is not
1986
 * interleaved with another, permitting efficient encoding and decoding
1987
 * of all elements enmasse instead of needing to only extract the bits
1988
 * necessary per item.
1989
 */
1990
0
int cram_xrle_decode_long(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
1991
    // TODO if and when needed
1992
0
    return -1;
1993
0
}
1994
1995
0
int cram_xrle_decode_int(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
1996
    // TODO if and when needed
1997
0
    return -1;
1998
0
}
1999
2000
// Expands an XRLE transform and caches result in slice->block_by_id[]
2001
0
static int cram_xrle_decode_expand_char(cram_slice *slice, cram_codec *c) {
2002
0
    cram_block *b = slice->block_by_id[512 + c->codec_id];
2003
0
    if (b)
2004
0
        return 0;
2005
2006
0
    b = slice->block_by_id[512 + c->codec_id] = cram_new_block(0, 0);
2007
0
    if (!b)
2008
0
        return -1;
2009
0
    cram_block *lit_b = c->u.xrle.lit_codec->get_block(slice, c->u.xrle.lit_codec);
2010
0
    if (!lit_b)
2011
0
        return -1;
2012
0
    unsigned char *lit_dat = lit_b->data;
2013
0
    unsigned int lit_sz = lit_b->uncomp_size;
2014
0
    unsigned int len_sz = c->u.xrle.len_codec->size(slice, c->u.xrle.len_codec);
2015
2016
0
    cram_block *len_b = c->u.xrle.len_codec->get_block(slice, c->u.xrle.len_codec);
2017
0
    if (!len_b)
2018
0
        return -1;
2019
0
    unsigned char *len_dat = len_b->data;
2020
2021
0
    uint8_t rle_syms[256];
2022
0
    int rle_nsyms = 0;
2023
0
    int i;
2024
0
    for (i = 0; i < 256; i++) {
2025
0
        if (c->u.xrle.rep_score[i] > 0)
2026
0
            rle_syms[rle_nsyms++] = i;
2027
0
    }
2028
2029
0
    uint64_t out_sz;
2030
0
    int nb = var_get_u64(len_dat, len_dat+len_sz, &out_sz);
2031
0
    if (!(b->data = malloc(out_sz)))
2032
0
        return -1;
2033
0
    hts_rle_decode(lit_dat, lit_sz,
2034
0
                   len_dat+nb, len_sz-nb,
2035
0
                   rle_syms, rle_nsyms,
2036
0
                   b->data, &out_sz);
2037
0
    b->uncomp_size = out_sz;
2038
2039
0
    return 0;
2040
0
}
2041
2042
0
int cram_xrle_decode_size(cram_slice *slice, cram_codec *c) {
2043
0
    cram_xrle_decode_expand_char(slice, c);
2044
0
    return slice->block_by_id[512 + c->codec_id]->uncomp_size;
2045
0
}
2046
2047
0
cram_block *cram_xrle_get_block(cram_slice *slice, cram_codec *c) {
2048
0
    cram_xrle_decode_expand_char(slice, c);
2049
0
    return slice->block_by_id[512 + c->codec_id];
2050
0
}
2051
2052
0
int cram_xrle_decode_char(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
2053
0
    int n = *out_size;
2054
2055
0
    cram_xrle_decode_expand_char(slice, c);
2056
0
    cram_block *b = slice->block_by_id[512 + c->codec_id];
2057
2058
0
    memcpy(out, b->data + b->idx, n);
2059
0
    b->idx += n;
2060
0
    return 0;
2061
2062
    // Old code when not cached
2063
0
    while (n > 0) {
2064
0
        if (c->u.xrle.cur_len == 0) {
2065
0
            unsigned char lit;
2066
0
            int one = 1;
2067
0
            if (c->u.xrle.lit_codec->decode(slice, c->u.xrle.lit_codec, in,
2068
0
                                          (char *)&lit, &one) < 0)
2069
0
                return -1;
2070
0
            c->u.xrle.cur_lit = lit;
2071
2072
0
            if (c->u.xrle.rep_score[lit] > 0) {
2073
0
                if (c->u.xrle.len_codec->decode(slice, c->u.xrle.len_codec, in,
2074
0
                                              (char *)&c->u.xrle.cur_len, &one) < 0)
2075
0
                    return -1;
2076
0
            } // else cur_len still zero
2077
            //else fprintf(stderr, "%d\n", lit);
2078
2079
0
            c->u.xrle.cur_len++;
2080
0
        }
2081
2082
0
        if (n >= c->u.xrle.cur_len) {
2083
0
            memset(out, c->u.xrle.cur_lit, c->u.xrle.cur_len);
2084
0
            out += c->u.xrle.cur_len;
2085
0
            n -= c->u.xrle.cur_len;
2086
0
            c->u.xrle.cur_len = 0;
2087
0
        } else {
2088
0
            memset(out, c->u.xrle.cur_lit, n);
2089
0
            out += n;
2090
0
            c->u.xrle.cur_len -= n;
2091
0
            n = 0;
2092
0
        }
2093
0
    }
2094
2095
0
    return 0;
2096
0
}
2097
2098
29
void cram_xrle_decode_free(cram_codec *c) {
2099
29
    if (!c) return;
2100
2101
29
    if (c->u.xrle.len_codec)
2102
26
        c->u.xrle.len_codec->free(c->u.xrle.len_codec);
2103
2104
29
    if (c->u.xrle.lit_codec)
2105
26
        c->u.xrle.lit_codec->free(c->u.xrle.lit_codec);
2106
2107
29
    free(c);
2108
29
}
2109
2110
cram_codec *cram_xrle_decode_init(cram_block_compression_hdr *hdr,
2111
                                  char *data, int size,
2112
                                  enum cram_encoding codec,
2113
                                  enum cram_external_type option,
2114
31
                                  int version, varint_vec *vv) {
2115
31
    cram_codec *c;
2116
31
    char *cp = data;
2117
31
    char *endp = data+size;
2118
31
    int err = 0;
2119
2120
31
    if (!(c = calloc(1, sizeof(*c))))
2121
0
        return NULL;
2122
2123
31
    c->codec  = E_XRLE;
2124
31
    if (option == E_LONG)
2125
0
        c->decode = cram_xrle_decode_long;
2126
31
    else if (option == E_INT)
2127
11
        c->decode = cram_xrle_decode_int;
2128
20
    else if (option == E_BYTE_ARRAY || option == E_BYTE)
2129
18
        c->decode = cram_xrle_decode_char;
2130
2
    else {
2131
2
        fprintf(stderr, "BYTE_ARRAYs not supported by this codec\n");
2132
2
        free(c);
2133
2
        return NULL;
2134
2
    }
2135
29
    c->free   = cram_xrle_decode_free;
2136
29
    c->size   = cram_xrle_decode_size;
2137
29
    c->get_block = cram_xrle_get_block;
2138
29
    c->u.xrle.cur_len = 0;
2139
29
    c->u.xrle.cur_lit = -1;
2140
2141
    // RLE map
2142
29
    int i, j, nrle = vv->varint_get32(&cp, endp, &err);
2143
29
    memset(c->u.xrle.rep_score, 0, 256*sizeof(*c->u.xrle.rep_score));
2144
206
    for (i = 0; i < nrle && i < 256; i++) {
2145
177
        j = vv->varint_get32(&cp, endp, &err);
2146
177
        if (j >= 0 && j < 256)
2147
110
            c->u.xrle.rep_score[j] = 1;
2148
177
    }
2149
2150
    // Length and literal sub encodings
2151
29
    c->u.xrle.len_encoding = vv->varint_get32(&cp, endp, &err);
2152
29
    int sub_size = vv->varint_get32(&cp, endp, &err);
2153
29
    if (sub_size < 0 || endp - cp < sub_size)
2154
2
        goto malformed;
2155
27
    c->u.xrle.len_codec = cram_decoder_init(hdr, c->u.xrle.len_encoding,
2156
27
                                            cp, sub_size, E_INT, version, vv);
2157
27
    if (c->u.xrle.len_codec == NULL)
2158
1
        goto malformed;
2159
26
    cp += sub_size;
2160
2161
26
    c->u.xrle.lit_encoding = vv->varint_get32(&cp, endp, &err);
2162
26
    sub_size = vv->varint_get32(&cp, endp, &err);
2163
26
    if (sub_size < 0 || endp - cp < sub_size)
2164
0
        goto malformed;
2165
26
    c->u.xrle.lit_codec = cram_decoder_init(hdr, c->u.xrle.lit_encoding,
2166
26
                                            cp, sub_size, option, version, vv);
2167
26
    if (c->u.xrle.lit_codec == NULL)
2168
0
        goto malformed;
2169
26
    cp += sub_size;
2170
2171
26
    if (err)
2172
0
        goto malformed;
2173
2174
26
    return c;
2175
2176
3
 malformed:
2177
3
    fprintf(stderr, "Malformed xrle header stream\n");
2178
3
    cram_xrle_decode_free(c);
2179
3
    return NULL;
2180
26
}
2181
2182
0
int cram_xrle_encode_flush(cram_codec *c) {
2183
0
    uint8_t *out_lit, *out_len;
2184
0
    uint64_t out_lit_size, out_len_size;
2185
0
    uint8_t rle_syms[256];
2186
0
    int rle_nsyms = 0, i;
2187
2188
0
    for (i = 0; i < 256; i++)
2189
0
        if (c->u.e_xrle.rep_score[i] > 0)
2190
0
            rle_syms[rle_nsyms++] = i;
2191
2192
0
    if (!c->u.e_xrle.to_flush) {
2193
0
        c->u.e_xrle.to_flush = (char *)BLOCK_DATA(c->out);
2194
0
        c->u.e_xrle.to_flush_size = BLOCK_SIZE(c->out);
2195
0
    }
2196
2197
0
    out_len = malloc(c->u.e_xrle.to_flush_size+8);
2198
0
    if (!out_len)
2199
0
        return -1;
2200
2201
0
    int nb = var_put_u64(out_len, NULL, c->u.e_xrle.to_flush_size);
2202
2203
0
    out_lit = hts_rle_encode((uint8_t *)c->u.e_xrle.to_flush, c->u.e_xrle.to_flush_size,
2204
0
                             out_len+nb, &out_len_size,
2205
0
                             rle_syms, &rle_nsyms,
2206
0
                             NULL, &out_lit_size);
2207
0
    out_len_size += nb;
2208
2209
2210
    // TODO: can maybe "gift" the sub codec the data block, to remove
2211
    // one level of memcpy.
2212
0
    if (c->u.e_xrle.len_codec->encode(NULL,
2213
0
                                      c->u.e_xrle.len_codec,
2214
0
                                      (char *)out_len, out_len_size))
2215
0
        return -1;
2216
2217
0
    if (c->u.e_xrle.lit_codec->encode(NULL,
2218
0
                                      c->u.e_xrle.lit_codec,
2219
0
                                      (char *)out_lit, out_lit_size))
2220
0
        return -1;
2221
2222
0
    free(out_len);
2223
0
    free(out_lit);
2224
2225
0
    return 0;
2226
0
}
2227
2228
int cram_xrle_encode_store(cram_codec *c, cram_block *b,
2229
0
                            char *prefix, int version) {
2230
0
    int len = 0, r = 0, n;
2231
0
    cram_codec *tc;
2232
0
    cram_block *b_rle, *b_len, *b_lit;
2233
2234
0
    if (prefix) {
2235
0
        size_t l = strlen(prefix);
2236
0
        BLOCK_APPEND(b, prefix, l);
2237
0
        len += l;
2238
0
    }
2239
2240
    // List of symbols to RLE
2241
0
    b_rle = cram_new_block(0, 0);
2242
0
    if (!b_rle)
2243
0
        return -1;
2244
0
    int i, nrle = 0, len1 = 0;
2245
0
    for (i = 0; i < 256; i++) {
2246
0
        if (c->u.e_xrle.rep_score[i] > 0) {
2247
0
            nrle++;
2248
0
            len1 += (n = c->vv->varint_put32_blk(b_rle,i)); r |= n;
2249
0
        }
2250
0
    }
2251
2252
    // Store length and literal sub-codecs to get encoded length
2253
0
    tc = c->u.e_xrle.len_codec;
2254
0
    b_len = cram_new_block(0, 0);
2255
0
    if (!b_len)
2256
0
        return -1;
2257
0
    int len2 = tc->store(tc, b_len, NULL, version);
2258
2259
0
    tc = c->u.e_xrle.lit_codec;
2260
0
    b_lit = cram_new_block(0, 0);
2261
0
    if (!b_lit)
2262
0
        return -1;
2263
0
    int len3 = tc->store(tc, b_lit, NULL, version);
2264
2265
0
    len += (n = c->vv->varint_put32_blk(b, c->codec)); r |= n;
2266
0
    len += (n = c->vv->varint_put32_blk(b, len1 + len2 + len3
2267
0
                                        + c->vv->varint_size(nrle))); r |= n;
2268
0
    len += (n = c->vv->varint_put32_blk(b, nrle)); r |= n;
2269
0
    BLOCK_APPEND(b, BLOCK_DATA(b_rle), BLOCK_SIZE(b_rle));
2270
0
    BLOCK_APPEND(b, BLOCK_DATA(b_len), BLOCK_SIZE(b_len));
2271
0
    BLOCK_APPEND(b, BLOCK_DATA(b_lit), BLOCK_SIZE(b_lit));
2272
2273
0
    cram_free_block(b_rle);
2274
0
    cram_free_block(b_len);
2275
0
    cram_free_block(b_lit);
2276
2277
0
    if (r > 0)
2278
0
        return len + len1 + len2 + len3;
2279
2280
0
 block_err:
2281
0
    return -1;
2282
0
}
2283
2284
int cram_xrle_encode_long(cram_slice *slice, cram_codec *c,
2285
0
                           char *in, int in_size) {
2286
    // TODO if and when needed
2287
0
    return -1;
2288
0
}
2289
2290
int cram_xrle_encode_int(cram_slice *slice, cram_codec *c,
2291
0
                          char *in, int in_size) {
2292
    // TODO if and when needed
2293
0
    return -1;
2294
0
}
2295
2296
int cram_xrle_encode_char(cram_slice *slice, cram_codec *c,
2297
0
                          char *in, int in_size) {
2298
0
    if (c->u.e_xrle.to_flush) {
2299
0
        if (!c->out && !(c->out = cram_new_block(0, 0)))
2300
0
            return -1;
2301
0
        BLOCK_APPEND(c->out, c->u.e_xrle.to_flush, c->u.e_xrle.to_flush_size);
2302
0
        c->u.e_xrle.to_flush = NULL;
2303
0
        c->u.e_xrle.to_flush_size = 0;
2304
0
    }
2305
2306
0
    if (c->out && BLOCK_SIZE(c->out) > 0) {
2307
        // Gathering data
2308
0
        BLOCK_APPEND(c->out, in, in_size);
2309
0
        return 0;
2310
0
    }
2311
2312
    // else cache copy of the data we're about to send to flush instead.
2313
0
    c->u.e_xrle.to_flush = in;
2314
0
    c->u.e_xrle.to_flush_size = in_size;
2315
0
    return 0;
2316
2317
0
 block_err:
2318
0
    return -1;
2319
0
}
2320
2321
0
void cram_xrle_encode_free(cram_codec *c) {
2322
0
    if (!c) return;
2323
2324
0
    if (c->u.e_xrle.len_codec)
2325
0
        c->u.e_xrle.len_codec->free(c->u.e_xrle.len_codec);
2326
0
    if (c->u.e_xrle.lit_codec)
2327
0
        c->u.e_xrle.lit_codec->free(c->u.e_xrle.lit_codec);
2328
2329
0
    cram_free_block(c->out);
2330
2331
0
    free(c);
2332
0
}
2333
2334
cram_codec *cram_xrle_encode_init(cram_stats *st,
2335
                                  enum cram_encoding codec,
2336
                                  enum cram_external_type option,
2337
                                  void *dat,
2338
0
                                  int version, varint_vec *vv) {
2339
0
    cram_codec *c;
2340
2341
0
    if (!(c = malloc(sizeof(*c))))
2342
0
        return NULL;
2343
2344
0
    c->codec  = E_XRLE;
2345
0
    c->free   = cram_xrle_encode_free;
2346
0
    if (option == E_LONG)
2347
0
        c->encode = cram_xrle_encode_long;
2348
0
    else if (option == E_INT)
2349
0
        c->encode = cram_xrle_encode_int;
2350
0
    else
2351
0
        c->encode = cram_xrle_encode_char;
2352
0
    c->store  = cram_xrle_encode_store;
2353
0
    c->flush  = cram_xrle_encode_flush;
2354
2355
0
    cram_xrle_encoder *e = (cram_xrle_encoder *)dat;
2356
2357
0
    c->u.e_xrle.len_codec = cram_encoder_init(e->len_encoding, NULL,
2358
0
                                              E_BYTE, e->len_dat,
2359
0
                                              version, vv);
2360
0
    c->u.e_xrle.lit_codec = cram_encoder_init(e->lit_encoding, NULL,
2361
0
                                              E_BYTE, e->lit_dat,
2362
0
                                              version, vv);
2363
0
    c->u.e_xrle.cur_lit = -1;
2364
0
    c->u.e_xrle.cur_len = -1;
2365
0
    c->u.e_xrle.to_flush = NULL;
2366
0
    c->u.e_xrle.to_flush_size = 0;
2367
2368
0
    memcpy(c->u.e_xrle.rep_score, e->rep_score, 256*sizeof(*c->u.e_xrle.rep_score));
2369
2370
0
    return c;
2371
0
}
2372
2373
/*
2374
 * ---------------------------------------------------------------------------
2375
 * SUBEXP
2376
 */
2377
0
int cram_subexp_decode(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
2378
0
    int32_t *out_i = (int32_t *)out;
2379
0
    int n, count;
2380
0
    int k = c->u.subexp.k;
2381
2382
0
    for (count = 0, n = *out_size; count < n; count++) {
2383
0
        int i = 0, tail;
2384
0
        int val;
2385
2386
        /* Get number of 1s */
2387
        //while (get_bit_MSB(in) == 1) i++;
2388
0
        i = get_one_bits_MSB(in);
2389
0
        if (i < 0 || cram_not_enough_bits(in, i > 0 ? i + k - 1 : k))
2390
0
            return -1;
2391
        /*
2392
         * Val is
2393
         * i > 0:  2^(k+i-1) + k+i-1 bits
2394
         * i = 0:  k bits
2395
         */
2396
0
        if (i) {
2397
0
            tail = i + k-1;
2398
0
            val = 0;
2399
0
            while (tail) {
2400
                //val = val<<1; val |= get_bit_MSB(in);
2401
0
                GET_BIT_MSB(in, val);
2402
0
                tail--;
2403
0
            }
2404
0
            val += 1 << (i + k-1);
2405
0
        } else {
2406
0
            tail = k;
2407
0
            val = 0;
2408
0
            while (tail) {
2409
                //val = val<<1; val |= get_bit_MSB(in);
2410
0
                GET_BIT_MSB(in, val);
2411
0
                tail--;
2412
0
            }
2413
0
        }
2414
2415
0
        out_i[count] = val - c->u.subexp.offset;
2416
0
    }
2417
2418
0
    return 0;
2419
0
}
2420
2421
71
void cram_subexp_decode_free(cram_codec *c) {
2422
71
    if (c)
2423
71
        free(c);
2424
71
}
2425
2426
cram_codec *cram_subexp_decode_init(cram_block_compression_hdr *hdr,
2427
                                    char *data, int size,
2428
                                    enum cram_encoding codec,
2429
                                    enum cram_external_type option,
2430
72
                                    int version, varint_vec *vv) {
2431
72
    cram_codec *c;
2432
72
    char *cp = data;
2433
2434
72
    if (option != E_INT) {
2435
0
        hts_log_error("This codec only supports INT encodings");
2436
0
        return NULL;
2437
0
    }
2438
2439
72
    if (!(c = malloc(sizeof(*c))))
2440
0
        return NULL;
2441
2442
72
    c->codec  = E_SUBEXP;
2443
72
    c->decode = cram_subexp_decode;
2444
72
    c->free   = cram_subexp_decode_free;
2445
72
    c->u.subexp.k = -1;
2446
2447
72
    c->u.subexp.offset = vv->varint_get32(&cp, data + size, NULL);
2448
72
    c->u.subexp.k      = vv->varint_get32(&cp, data + size, NULL);
2449
2450
72
    if (cp - data != size || c->u.subexp.k < 0) {
2451
1
        hts_log_error("Malformed subexp header stream");
2452
1
        free(c);
2453
1
        return NULL;
2454
1
    }
2455
2456
71
    return c;
2457
72
}
2458
2459
/*
2460
 * ---------------------------------------------------------------------------
2461
 * GAMMA
2462
 */
2463
0
int cram_gamma_decode(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
2464
0
    int32_t *out_i = (int32_t *)out;
2465
0
    int i, n;
2466
2467
0
    for (i = 0, n = *out_size; i < n; i++) {
2468
0
        int nz = 0;
2469
0
        int val;
2470
        //while (get_bit_MSB(in) == 0) nz++;
2471
0
        nz = get_zero_bits_MSB(in);
2472
0
        if (cram_not_enough_bits(in, nz))
2473
0
            return -1;
2474
0
        val = 1;
2475
0
        while (nz > 0) {
2476
            //val <<= 1; val |= get_bit_MSB(in);
2477
0
            GET_BIT_MSB(in, val);
2478
0
            nz--;
2479
0
        }
2480
2481
0
        out_i[i] = val - c->u.gamma.offset;
2482
0
    }
2483
2484
0
    return 0;
2485
0
}
2486
2487
72
void cram_gamma_decode_free(cram_codec *c) {
2488
72
    if (c)
2489
72
        free(c);
2490
72
}
2491
2492
cram_codec *cram_gamma_decode_init(cram_block_compression_hdr *hdr,
2493
                                   char *data, int size,
2494
                                   enum cram_encoding codec,
2495
                                   enum cram_external_type option,
2496
72
                                   int version, varint_vec *vv) {
2497
72
    cram_codec *c = NULL;
2498
72
    char *cp = data;
2499
2500
72
    if (option != E_INT) {
2501
0
        hts_log_error("This codec only supports INT encodings");
2502
0
        return NULL;
2503
0
    }
2504
2505
72
    if (size < 1)
2506
0
        goto malformed;
2507
2508
72
    if (!(c = malloc(sizeof(*c))))
2509
0
        return NULL;
2510
2511
72
    c->codec  = E_GAMMA;
2512
72
    c->decode = cram_gamma_decode;
2513
72
    c->free   = cram_gamma_decode_free;
2514
2515
72
    c->u.gamma.offset = vv->varint_get32(&cp, data+size, NULL);
2516
2517
72
    if (cp - data != size)
2518
0
        goto malformed;
2519
2520
72
    return c;
2521
2522
0
 malformed:
2523
0
    hts_log_error("Malformed gamma header stream");
2524
0
    free(c);
2525
0
    return NULL;
2526
72
}
2527
2528
/*
2529
 * ---------------------------------------------------------------------------
2530
 * HUFFMAN
2531
 */
2532
2533
28
static int code_sort(const void *vp1, const void *vp2) {
2534
28
    const cram_huffman_code *c1 = (const cram_huffman_code *)vp1;
2535
28
    const cram_huffman_code *c2 = (const cram_huffman_code *)vp2;
2536
2537
28
    if (c1->len != c2->len)
2538
1
        return c1->len - c2->len;
2539
27
    else
2540
27
        return c1->symbol < c2->symbol ? -1 : (c1->symbol > c2->symbol ? 1 : 0);
2541
28
}
2542
2543
59
void cram_huffman_decode_free(cram_codec *c) {
2544
59
    if (!c)
2545
0
        return;
2546
2547
59
    if (c->u.huffman.codes)
2548
44
        free(c->u.huffman.codes);
2549
59
    free(c);
2550
59
}
2551
2552
int cram_huffman_decode_null(cram_slice *slice, cram_codec *c,
2553
0
                             cram_block *in, char *out, int *out_size) {
2554
0
    return -1;
2555
0
}
2556
2557
int cram_huffman_decode_char0(cram_slice *slice, cram_codec *c,
2558
0
                              cram_block *in, char *out, int *out_size) {
2559
0
    int i, n;
2560
2561
0
    if (!out)
2562
0
        return 0;
2563
2564
    /* Special case of 0 length codes */
2565
0
    for (i = 0, n = *out_size; i < n; i++) {
2566
0
        out[i] = c->u.huffman.codes[0].symbol;
2567
0
    }
2568
0
    return 0;
2569
0
}
2570
2571
int cram_huffman_decode_char(cram_slice *slice, cram_codec *c,
2572
0
                             cram_block *in, char *out, int *out_size) {
2573
0
    int i, n, ncodes = c->u.huffman.ncodes;
2574
0
    const cram_huffman_code * const codes = c->u.huffman.codes;
2575
2576
0
    for (i = 0, n = *out_size; i < n; i++) {
2577
0
        int idx = 0;
2578
0
        int val = 0, len = 0, last_len = 0;
2579
2580
0
        for (;;) {
2581
0
            int dlen = codes[idx].len - last_len;
2582
0
            if (cram_not_enough_bits(in, dlen))
2583
0
                return -1;
2584
2585
            //val <<= dlen;
2586
            //val  |= get_bits_MSB(in, dlen);
2587
            //last_len = (len += dlen);
2588
2589
0
            last_len = (len += dlen);
2590
0
            for (; dlen; dlen--) GET_BIT_MSB(in, val);
2591
2592
0
            idx = val - codes[idx].p;
2593
0
            if (idx >= ncodes || idx < 0)
2594
0
                return -1;
2595
2596
0
            if (codes[idx].code == val && codes[idx].len == len) {
2597
0
                if (out) out[i] = codes[idx].symbol;
2598
0
                break;
2599
0
            }
2600
0
        }
2601
0
    }
2602
2603
0
    return 0;
2604
0
}
2605
2606
int cram_huffman_decode_int0(cram_slice *slice, cram_codec *c,
2607
0
                             cram_block *in, char *out, int *out_size) {
2608
0
    int32_t *out_i = (int32_t *)out;
2609
0
    int i, n;
2610
0
    const cram_huffman_code * const codes = c->u.huffman.codes;
2611
2612
    /* Special case of 0 length codes */
2613
0
    for (i = 0, n = *out_size; i < n; i++) {
2614
0
        out_i[i] = codes[0].symbol;
2615
0
    }
2616
0
    return 0;
2617
0
}
2618
2619
int cram_huffman_decode_int(cram_slice *slice, cram_codec *c,
2620
0
                            cram_block *in, char *out, int *out_size) {
2621
0
    int32_t *out_i = (int32_t *)out;
2622
0
    int i, n, ncodes = c->u.huffman.ncodes;
2623
0
    const cram_huffman_code * const codes = c->u.huffman.codes;
2624
2625
0
    for (i = 0, n = *out_size; i < n; i++) {
2626
0
        int idx = 0;
2627
0
        int val = 0, len = 0, last_len = 0;
2628
2629
        // Now one bit at a time for remaining checks
2630
0
        for (;;) {
2631
0
            int dlen = codes[idx].len - last_len;
2632
0
            if (cram_not_enough_bits(in, dlen))
2633
0
                return -1;
2634
2635
            //val <<= dlen;
2636
            //val  |= get_bits_MSB(in, dlen);
2637
            //last_len = (len += dlen);
2638
2639
0
            last_len = (len += dlen);
2640
0
            for (; dlen; dlen--) GET_BIT_MSB(in, val);
2641
2642
0
            idx = val - codes[idx].p;
2643
0
            if (idx >= ncodes || idx < 0)
2644
0
                return -1;
2645
2646
0
            if (codes[idx].code == val && codes[idx].len == len) {
2647
0
                out_i[i] = codes[idx].symbol;
2648
0
                break;
2649
0
            }
2650
0
        }
2651
0
    }
2652
2653
0
    return 0;
2654
0
}
2655
2656
int cram_huffman_decode_long0(cram_slice *slice, cram_codec *c,
2657
0
                              cram_block *in, char *out, int *out_size) {
2658
0
    int64_t *out_i = (int64_t *)out;
2659
0
    int i, n;
2660
0
    const cram_huffman_code * const codes = c->u.huffman.codes;
2661
2662
    /* Special case of 0 length codes */
2663
0
    for (i = 0, n = *out_size; i < n; i++) {
2664
0
        out_i[i] = codes[0].symbol;
2665
0
    }
2666
0
    return 0;
2667
0
}
2668
2669
int cram_huffman_decode_long(cram_slice *slice, cram_codec *c,
2670
0
                             cram_block *in, char *out, int *out_size) {
2671
0
    int64_t *out_i = (int64_t *)out;
2672
0
    int i, n, ncodes = c->u.huffman.ncodes;
2673
0
    const cram_huffman_code * const codes = c->u.huffman.codes;
2674
2675
0
    for (i = 0, n = *out_size; i < n; i++) {
2676
0
        int idx = 0;
2677
0
        int val = 0, len = 0, last_len = 0;
2678
2679
        // Now one bit at a time for remaining checks
2680
0
        for (;;) {
2681
0
            int dlen = codes[idx].len - last_len;
2682
0
            if (cram_not_enough_bits(in, dlen))
2683
0
                return -1;
2684
2685
            //val <<= dlen;
2686
            //val  |= get_bits_MSB(in, dlen);
2687
            //last_len = (len += dlen);
2688
2689
0
            last_len = (len += dlen);
2690
0
            for (; dlen; dlen--) GET_BIT_MSB(in, val);
2691
2692
0
            idx = val - codes[idx].p;
2693
0
            if (idx >= ncodes || idx < 0)
2694
0
                return -1;
2695
2696
0
            if (codes[idx].code == val && codes[idx].len == len) {
2697
0
                out_i[i] = codes[idx].symbol;
2698
0
                break;
2699
0
            }
2700
0
        }
2701
0
    }
2702
2703
0
    return 0;
2704
0
}
2705
2706
/*
2707
 * Initialises a huffman decoder from an encoding data stream.
2708
 */
2709
cram_codec *cram_huffman_decode_init(cram_block_compression_hdr *hdr,
2710
                                     char *data, int size,
2711
                                     enum cram_encoding codec,
2712
                                     enum cram_external_type option,
2713
60
                                     int version, varint_vec *vv) {
2714
60
    int32_t ncodes = 0, i, j;
2715
60
    char *cp = data, *data_end = &data[size];
2716
60
    cram_codec *h;
2717
60
    cram_huffman_code *codes = NULL;
2718
60
    int32_t val, last_len, max_len = 0;
2719
60
    uint32_t max_val; // needs one more bit than val
2720
60
    const int max_code_bits = sizeof(val) * 8 - 1;
2721
60
    int err = 0;
2722
2723
60
    if (option == E_BYTE_ARRAY_BLOCK) {
2724
0
        hts_log_error("BYTE_ARRAYs not supported by this codec");
2725
0
        return NULL;
2726
0
    }
2727
2728
60
    ncodes = vv->varint_get32(&cp, data_end, &err);
2729
60
    if (ncodes < 0) {
2730
0
        hts_log_error("Invalid number of symbols in huffman stream");
2731
0
        return NULL;
2732
0
    }
2733
60
    if (ncodes >= SIZE_MAX / sizeof(*codes)) {
2734
0
        errno = ENOMEM;
2735
0
        return NULL;
2736
0
    }
2737
2738
60
    h = calloc(1, sizeof(*h));
2739
60
    if (!h)
2740
0
        return NULL;
2741
2742
60
    h->codec  = E_HUFFMAN;
2743
60
    h->free   = cram_huffman_decode_free;
2744
2745
60
    h->u.huffman.ncodes = ncodes;
2746
60
    h->u.huffman.option = option;
2747
60
    if (ncodes) {
2748
45
        codes = h->u.huffman.codes = malloc(ncodes * sizeof(*codes));
2749
45
        if (!codes) {
2750
0
            free(h);
2751
0
            return NULL;
2752
0
        }
2753
45
    } else {
2754
15
        codes = h->u.huffman.codes = NULL;
2755
15
    }
2756
2757
    /* Read symbols and bit-lengths */
2758
60
    if (option == E_LONG) {
2759
0
        for (i = 0; i < ncodes; i++)
2760
0
            codes[i].symbol = vv->varint_get64(&cp, data_end, &err);
2761
60
    } else if (option == E_INT || option == E_BYTE) {
2762
133
        for (i = 0; i < ncodes; i++)
2763
73
            codes[i].symbol = vv->varint_get32(&cp, data_end, &err);
2764
60
    } else {
2765
0
        goto malformed;
2766
0
    }
2767
2768
60
    if (err)
2769
0
        goto malformed;
2770
2771
60
    i = vv->varint_get32(&cp, data_end, &err);
2772
60
    if (i != ncodes)
2773
0
        goto malformed;
2774
2775
60
    if (ncodes == 0) {
2776
        /* NULL huffman stream.  Ensure it returns an error if
2777
           anything tries to use it. */
2778
15
        h->decode = cram_huffman_decode_null;
2779
15
        return h;
2780
15
    }
2781
2782
118
    for (i = 0; i < ncodes; i++) {
2783
73
        codes[i].len = vv->varint_get32(&cp, data_end, &err);
2784
73
        if (err)
2785
0
            break;
2786
73
        if (codes[i].len < 0) {
2787
0
            hts_log_error("Huffman code length (%d) is negative", codes[i].len);
2788
0
            goto malformed;
2789
0
        }
2790
73
        if (max_len < codes[i].len)
2791
28
            max_len = codes[i].len;
2792
73
    }
2793
45
    if (err || cp - data != size || max_len >= ncodes)
2794
0
        goto malformed;
2795
2796
    /* 31 is max. bits available in val */
2797
45
    if (max_len > max_code_bits) {
2798
0
        hts_log_error("Huffman code length (%d) is greater "
2799
0
                      "than maximum supported (%d)", max_len, max_code_bits);
2800
0
        goto malformed;
2801
0
    }
2802
2803
    /* Sort by bit length and then by symbol value */
2804
45
    qsort(codes, ncodes, sizeof(*codes), code_sort);
2805
2806
    /* Assign canonical codes */
2807
45
    val = -1, last_len = 0, max_val = 0;
2808
117
    for (i = 0; i < ncodes; i++) {
2809
73
        val++;
2810
73
        if (val > max_val)
2811
1
            goto malformed;
2812
2813
72
        if (codes[i].len > last_len) {
2814
27
            val <<= (codes[i].len - last_len);
2815
27
            last_len = codes[i].len;
2816
27
            max_val = (1U << codes[i].len) - 1;
2817
27
        }
2818
72
        codes[i].code = val;
2819
72
    }
2820
2821
    /*
2822
     * Compute the next starting point, offset by the i'th value.
2823
     * For example if codes 10, 11, 12, 13 are 30, 31, 32, 33 then
2824
     * codes[10..13].p = 30 - 10.
2825
     */
2826
44
    last_len = 0;
2827
115
    for (i = j = 0; i < ncodes; i++) {
2828
71
        if (codes[i].len > last_len) {
2829
27
            j = codes[i].code - i;
2830
27
            last_len = codes[i].len;
2831
27
        }
2832
71
        codes[i].p = j;
2833
71
    }
2834
2835
    // puts("==HUFF LEN==");
2836
    // for (i = 0; i <= last_len+1; i++) {
2837
    //     printf("len %d=%d prefix %d\n", i, h->u.huffman.lengths[i], h->u.huffman.prefix[i]);
2838
    // }
2839
    // puts("===HUFFMAN CODES===");
2840
    // for (i = 0; i < ncodes; i++) {
2841
    //     int j;
2842
    //     printf("%d: %d %d %d ", i, codes[i].symbol, codes[i].len, codes[i].code);
2843
    //     j = codes[i].len;
2844
    //     while (j) {
2845
    //         putchar(codes[i].code & (1 << --j) ? '1' : '0');
2846
    //     }
2847
    //     printf(" %d\n", codes[i].code);
2848
    // }
2849
2850
44
    if (option == E_BYTE || option == E_BYTE_ARRAY) {
2851
39
        if (h->u.huffman.codes[0].len == 0)
2852
17
            h->decode = cram_huffman_decode_char0;
2853
22
        else
2854
22
            h->decode = cram_huffman_decode_char;
2855
39
    } else if (option == E_LONG || option == E_SLONG) {
2856
0
        if (h->u.huffman.codes[0].len == 0)
2857
0
            h->decode = cram_huffman_decode_long0;
2858
0
        else
2859
0
            h->decode = cram_huffman_decode_long;
2860
5
    } else if (option == E_INT || option == E_SINT || option == E_BYTE) {
2861
5
        if (h->u.huffman.codes[0].len == 0)
2862
0
            h->decode = cram_huffman_decode_int0;
2863
5
        else
2864
5
            h->decode = cram_huffman_decode_int;
2865
5
    } else {
2866
0
        return NULL;
2867
0
    }
2868
2869
44
    return (cram_codec *)h;
2870
2871
1
 malformed:
2872
1
    hts_log_error("Malformed huffman header stream");
2873
1
    free(codes);
2874
1
    free(h);
2875
1
    return NULL;
2876
44
}
2877
2878
int cram_huffman_encode_char0(cram_slice *slice, cram_codec *c,
2879
0
                              char *in, int in_size) {
2880
0
    return 0;
2881
0
}
2882
2883
int cram_huffman_encode_char(cram_slice *slice, cram_codec *c,
2884
0
                             char *in, int in_size) {
2885
0
    int i, code, len, r = 0;
2886
0
    unsigned char *syms = (unsigned char *)in;
2887
2888
0
    while (in_size--) {
2889
0
        int sym = *syms++;
2890
0
        if (sym >= -1 && sym < MAX_HUFF) {
2891
0
            i = c->u.e_huffman.val2code[sym+1];
2892
0
            assert(c->u.e_huffman.codes[i].symbol == sym);
2893
0
            code = c->u.e_huffman.codes[i].code;
2894
0
            len  = c->u.e_huffman.codes[i].len;
2895
0
        } else {
2896
            /* Slow - use a lookup table for when sym < MAX_HUFF? */
2897
0
            for (i = 0; i < c->u.e_huffman.nvals; i++) {
2898
0
                if (c->u.e_huffman.codes[i].symbol == sym)
2899
0
                    break;
2900
0
            }
2901
0
            if (i == c->u.e_huffman.nvals)
2902
0
                return -1;
2903
2904
0
            code = c->u.e_huffman.codes[i].code;
2905
0
            len  = c->u.e_huffman.codes[i].len;
2906
0
        }
2907
2908
0
        r |= store_bits_MSB(c->out, code, len);
2909
0
    }
2910
2911
0
    return r;
2912
0
}
2913
2914
int cram_huffman_encode_int0(cram_slice *slice, cram_codec *c,
2915
0
                             char *in, int in_size) {
2916
0
    return 0;
2917
0
}
2918
2919
int cram_huffman_encode_int(cram_slice *slice, cram_codec *c,
2920
0
                            char *in, int in_size) {
2921
0
    int i, code, len, r = 0;
2922
0
    int *syms = (int *)in;
2923
2924
0
    while (in_size--) {
2925
0
        int sym = *syms++;
2926
2927
0
        if (sym >= -1 && sym < MAX_HUFF) {
2928
0
            i = c->u.e_huffman.val2code[sym+1];
2929
0
            assert(c->u.e_huffman.codes[i].symbol == sym);
2930
0
            code = c->u.e_huffman.codes[i].code;
2931
0
            len  = c->u.e_huffman.codes[i].len;
2932
0
        } else {
2933
            /* Slow - use a lookup table for when sym < MAX_HUFFMAN_SYM? */
2934
0
            for (i = 0; i < c->u.e_huffman.nvals; i++) {
2935
0
                if (c->u.e_huffman.codes[i].symbol == sym)
2936
0
                    break;
2937
0
            }
2938
0
            if (i == c->u.e_huffman.nvals)
2939
0
                return -1;
2940
2941
0
            code = c->u.e_huffman.codes[i].code;
2942
0
            len  = c->u.e_huffman.codes[i].len;
2943
0
        }
2944
2945
0
        r |= store_bits_MSB(c->out, code, len);
2946
0
    }
2947
2948
0
    return r;
2949
0
}
2950
2951
int cram_huffman_encode_long0(cram_slice *slice, cram_codec *c,
2952
0
                              char *in, int in_size) {
2953
0
    return 0;
2954
0
}
2955
2956
int cram_huffman_encode_long(cram_slice *slice, cram_codec *c,
2957
0
                             char *in, int in_size) {
2958
0
    int i, code, len, r = 0;
2959
0
    int64_t *syms = (int64_t *)in;
2960
2961
0
    while (in_size--) {
2962
0
        int sym = *syms++;
2963
2964
0
        if (sym >= -1 && sym < MAX_HUFF) {
2965
0
            i = c->u.e_huffman.val2code[sym+1];
2966
0
            assert(c->u.e_huffman.codes[i].symbol == sym);
2967
0
            code = c->u.e_huffman.codes[i].code;
2968
0
            len  = c->u.e_huffman.codes[i].len;
2969
0
        } else {
2970
            /* Slow - use a lookup table for when sym < MAX_HUFFMAN_SYM? */
2971
0
            for (i = 0; i < c->u.e_huffman.nvals; i++) {
2972
0
                if (c->u.e_huffman.codes[i].symbol == sym)
2973
0
                    break;
2974
0
            }
2975
0
            if (i == c->u.e_huffman.nvals)
2976
0
                return -1;
2977
2978
0
            code = c->u.e_huffman.codes[i].code;
2979
0
            len  = c->u.e_huffman.codes[i].len;
2980
0
        }
2981
2982
0
        r |= store_bits_MSB(c->out, code, len);
2983
0
    }
2984
2985
0
    return r;
2986
0
}
2987
2988
0
void cram_huffman_encode_free(cram_codec *c) {
2989
0
    if (!c)
2990
0
        return;
2991
2992
0
    if (c->u.e_huffman.codes)
2993
0
        free(c->u.e_huffman.codes);
2994
0
    free(c);
2995
0
}
2996
2997
/*
2998
 * Encodes a huffman tree.
2999
 * Returns number of bytes written.
3000
 */
3001
int cram_huffman_encode_store(cram_codec *c, cram_block *b, char *prefix,
3002
0
                              int version) {
3003
0
    int i, len = 0, r = 0, n;
3004
0
    cram_huffman_code *codes = c->u.e_huffman.codes;
3005
    /*
3006
     * Up to code length 127 means 2.5e+26 bytes of data required (worst
3007
     * case huffman tree needs symbols with freqs matching the Fibonacci
3008
     * series). So guaranteed 1 byte per code.
3009
     *
3010
     * Symbols themselves could be 5 bytes (eg -1 is 5 bytes in itf8).
3011
     *
3012
     * Therefore 6*ncodes + 5 + 5 + 1 + 5 is max memory
3013
     */
3014
0
    char *tmp = malloc(6*c->u.e_huffman.nvals+16);
3015
0
    char *tp = tmp, *tpend = tmp+6*c->u.e_huffman.nvals+16;
3016
3017
0
    if (!tmp)
3018
0
        return -1;
3019
3020
0
    if (prefix) {
3021
0
        size_t l = strlen(prefix);
3022
0
        BLOCK_APPEND(b, prefix, l);
3023
0
        len += l;
3024
0
    }
3025
3026
0
    tp += c->vv->varint_put32(tp, tpend, c->u.e_huffman.nvals);
3027
0
    if (c->u.e_huffman.option == E_LONG) {
3028
0
        for (i = 0; i < c->u.e_huffman.nvals; i++) {
3029
0
            tp += c->vv->varint_put64(tp, tpend, codes[i].symbol);
3030
0
        }
3031
0
    } else if (c->u.e_huffman.option == E_SLONG) {
3032
0
        for (i = 0; i < c->u.e_huffman.nvals; i++) {
3033
0
            tp += c->vv->varint_put64s(tp, tpend, codes[i].symbol);
3034
0
        }
3035
0
    } else if (c->u.e_huffman.option == E_INT || c->u.e_huffman.option == E_BYTE) {
3036
0
        for (i = 0; i < c->u.e_huffman.nvals; i++) {
3037
0
            tp += c->vv->varint_put32(tp, tpend, codes[i].symbol);
3038
0
        }
3039
0
    } else if (c->u.e_huffman.option == E_SINT) {
3040
0
        for (i = 0; i < c->u.e_huffman.nvals; i++) {
3041
0
            tp += c->vv->varint_put32s(tp, tpend, codes[i].symbol);
3042
0
        }
3043
0
    } else {
3044
0
        return -1;
3045
0
    }
3046
3047
0
    tp += c->vv->varint_put32(tp, tpend, c->u.e_huffman.nvals);
3048
0
    for (i = 0; i < c->u.e_huffman.nvals; i++)
3049
0
        tp += c->vv->varint_put32(tp, tpend, codes[i].len);
3050
3051
0
    len += (n = c->vv->varint_put32_blk(b, c->codec)); r |= n;
3052
0
    len += (n = c->vv->varint_put32_blk(b, tp-tmp));   r |= n;
3053
0
    BLOCK_APPEND(b, tmp, tp-tmp);
3054
0
    len += tp-tmp;
3055
3056
0
    free(tmp);
3057
3058
0
    if (r > 0)
3059
0
        return len;
3060
3061
0
 block_err:
3062
0
    return -1;
3063
0
}
3064
3065
cram_codec *cram_huffman_encode_init(cram_stats *st,
3066
                                     enum cram_encoding codec,
3067
                                     enum cram_external_type option,
3068
                                     void *dat,
3069
0
                                     int version, varint_vec *vv) {
3070
0
    int *vals = NULL, *freqs = NULL, *lens = NULL, code, len;
3071
0
    int *new_vals, *new_freqs;
3072
0
    int i, max_val = 0, min_val = INT_MAX, k;
3073
0
    size_t nvals, vals_alloc = 0;
3074
0
    cram_codec *c;
3075
0
    cram_huffman_code *codes;
3076
3077
0
    c = malloc(sizeof(*c));
3078
0
    if (!c)
3079
0
        return NULL;
3080
0
    c->codec = E_HUFFMAN;
3081
3082
    /* Count number of unique symbols */
3083
0
    for (nvals = i = 0; i < MAX_STAT_VAL; i++) {
3084
0
        if (!st->freqs[i])
3085
0
            continue;
3086
0
        if (nvals >= vals_alloc) {
3087
0
            vals_alloc = vals_alloc ? vals_alloc*2 : 1024;
3088
0
            new_vals  = realloc(vals,  vals_alloc * sizeof(int));
3089
0
            if (!new_vals) goto nomem;
3090
0
            vals = new_vals;
3091
0
            new_freqs = realloc(freqs, vals_alloc * sizeof(int));
3092
0
            if (!new_freqs) goto nomem;
3093
0
            freqs = new_freqs;
3094
0
        }
3095
0
        vals[nvals] = i;
3096
0
        freqs[nvals] = st->freqs[i];
3097
0
        assert(st->freqs[i] > 0);
3098
0
        if (max_val < i) max_val = i;
3099
0
        if (min_val > i) min_val = i;
3100
0
        nvals++;
3101
0
    }
3102
0
    if (st->h) {
3103
0
        khint_t k;
3104
3105
0
        for (k = kh_begin(st->h); k != kh_end(st->h); k++) {
3106
0
            if (!kh_exist(st->h, k))
3107
0
                continue;
3108
0
            if (nvals >= vals_alloc) {
3109
0
                vals_alloc = vals_alloc ? vals_alloc*2 : 1024;
3110
0
                new_vals  = realloc(vals,  vals_alloc * sizeof(int));
3111
0
                if (!new_vals) goto nomem;
3112
0
                vals = new_vals;
3113
0
                new_freqs = realloc(freqs, vals_alloc * sizeof(int));
3114
0
                if (!new_freqs) goto nomem;
3115
0
                freqs = new_freqs;
3116
0
            }
3117
0
            vals[nvals]= kh_key(st->h, k);
3118
0
            freqs[nvals] = kh_val(st->h, k);
3119
0
            assert(freqs[nvals] > 0);
3120
0
            if (max_val < i) max_val = i;
3121
0
            if (min_val > i) min_val = i;
3122
0
            nvals++;
3123
0
        }
3124
0
    }
3125
3126
0
    assert(nvals > 0);
3127
3128
0
    new_freqs = realloc(freqs, 2*nvals*sizeof(*freqs));
3129
0
    if (!new_freqs) goto nomem;
3130
0
    freqs = new_freqs;
3131
0
    lens = calloc(2*nvals, sizeof(*lens));
3132
0
    if (!lens) goto nomem;
3133
3134
    /* Inefficient, use pointers to form chain so we can insert and maintain
3135
     * a sorted list? This is currently O(nvals^2) complexity.
3136
     */
3137
0
    for (;;) {
3138
0
        int low1 = INT_MAX, low2 = INT_MAX;
3139
0
        int ind1 = 0, ind2 = 0;
3140
0
        for (i = 0; i < nvals; i++) {
3141
0
            if (freqs[i] < 0)
3142
0
                continue;
3143
0
            if (low1 > freqs[i])
3144
0
                low2 = low1, ind2 = ind1, low1 = freqs[i], ind1 = i;
3145
0
            else if (low2 > freqs[i])
3146
0
                low2 = freqs[i], ind2 = i;
3147
0
        }
3148
0
        if (low2 == INT_MAX)
3149
0
            break;
3150
3151
0
        freqs[nvals] = low1 + low2;
3152
0
        lens[ind1] = nvals;
3153
0
        lens[ind2] = nvals;
3154
0
        freqs[ind1] *= -1;
3155
0
        freqs[ind2] *= -1;
3156
0
        nvals++;
3157
0
    }
3158
0
    nvals = nvals/2+1;
3159
3160
    /* Assign lengths */
3161
0
    for (i = 0; i < nvals; i++) {
3162
0
        int code_len = 0;
3163
0
        for (k = lens[i]; k; k = lens[k])
3164
0
            code_len++;
3165
0
        lens[i] = code_len;
3166
0
        freqs[i] *= -1;
3167
        //fprintf(stderr, "%d / %d => %d\n", vals[i], freqs[i], lens[i]);
3168
0
    }
3169
3170
3171
    /* Sort, need in a struct */
3172
0
    if (!(codes = malloc(nvals * sizeof(*codes))))
3173
0
        goto nomem;
3174
0
    for (i = 0; i < nvals; i++) {
3175
0
        codes[i].symbol = vals[i];
3176
0
        codes[i].len = lens[i];
3177
0
    }
3178
0
    qsort(codes, nvals, sizeof(*codes), code_sort);
3179
3180
    /*
3181
     * Generate canonical codes from lengths.
3182
     * Sort by length.
3183
     * Start with 0.
3184
     * Every new code of same length is +1.
3185
     * Every new code of new length is +1 then <<1 per extra length.
3186
     *
3187
     * /\
3188
     * a/\
3189
     * /\/\
3190
     * bcd/\
3191
     *    ef
3192
     *
3193
     * a 1  0
3194
     * b 3  4 (0+1)<<2
3195
     * c 3  5
3196
     * d 3  6
3197
     * e 4  14  (6+1)<<1
3198
     * f 5  15
3199
     */
3200
0
    code = 0; len = codes[0].len;
3201
0
    for (i = 0; i < nvals; i++) {
3202
0
        while (len != codes[i].len) {
3203
0
            code<<=1;
3204
0
            len++;
3205
0
        }
3206
0
        codes[i].code = code++;
3207
3208
0
        if (codes[i].symbol >= -1 && codes[i].symbol < MAX_HUFF)
3209
0
            c->u.e_huffman.val2code[codes[i].symbol+1] = i;
3210
3211
        //fprintf(stderr, "sym %d, code %d, len %d\n",
3212
        //      codes[i].symbol, codes[i].code, codes[i].len);
3213
0
    }
3214
3215
0
    free(lens);
3216
0
    free(vals);
3217
0
    free(freqs);
3218
3219
0
    c->u.e_huffman.codes = codes;
3220
0
    c->u.e_huffman.nvals = nvals;
3221
0
    c->u.e_huffman.option = option;
3222
3223
0
    c->free = cram_huffman_encode_free;
3224
0
    if (option == E_BYTE || option == E_BYTE_ARRAY) {
3225
0
        if (c->u.e_huffman.codes[0].len == 0)
3226
0
            c->encode = cram_huffman_encode_char0;
3227
0
        else
3228
0
            c->encode = cram_huffman_encode_char;
3229
0
    } else if (option == E_INT || option == E_SINT) {
3230
0
        if (c->u.e_huffman.codes[0].len == 0)
3231
0
            c->encode = cram_huffman_encode_int0;
3232
0
        else
3233
0
            c->encode = cram_huffman_encode_int;
3234
0
    } else if (option == E_LONG || option == E_SLONG) {
3235
0
        if (c->u.e_huffman.codes[0].len == 0)
3236
0
            c->encode = cram_huffman_encode_long0;
3237
0
        else
3238
0
            c->encode = cram_huffman_encode_long;
3239
0
    } else {
3240
0
        return NULL;
3241
0
    }
3242
0
    c->store = cram_huffman_encode_store;
3243
0
    c->flush = NULL;
3244
3245
0
    return c;
3246
3247
0
 nomem:
3248
0
    hts_log_error("Out of memory");
3249
0
    free(vals);
3250
0
    free(freqs);
3251
0
    free(lens);
3252
0
    free(c);
3253
0
    return NULL;
3254
0
}
3255
3256
/*
3257
 * ---------------------------------------------------------------------------
3258
 * BYTE_ARRAY_LEN
3259
 */
3260
int cram_byte_array_len_decode(cram_slice *slice, cram_codec *c,
3261
                               cram_block *in, char *out,
3262
0
                               int *out_size) {
3263
    /* Fetch length */
3264
0
    int32_t len = 0, one = 1;
3265
0
    int r;
3266
3267
0
    r = c->u.byte_array_len.len_codec->decode(slice, c->u.byte_array_len.len_codec,
3268
0
                                              in, (char *)&len, &one);
3269
    //printf("ByteArray Len=%d\n", len);
3270
3271
0
    if (!r && c->u.byte_array_len.val_codec && len >= 0) {
3272
0
        r = c->u.byte_array_len.val_codec->decode(slice,
3273
0
                                                  c->u.byte_array_len.val_codec,
3274
0
                                                  in, out, &len);
3275
0
    } else {
3276
0
        return -1;
3277
0
    }
3278
3279
0
    *out_size = len;
3280
3281
0
    return r;
3282
0
}
3283
3284
41
void cram_byte_array_len_decode_free(cram_codec *c) {
3285
41
    if (!c) return;
3286
3287
41
    if (c->u.byte_array_len.len_codec)
3288
36
        c->u.byte_array_len.len_codec->free(c->u.byte_array_len.len_codec);
3289
3290
41
    if (c->u.byte_array_len.val_codec)
3291
36
        c->u.byte_array_len.val_codec->free(c->u.byte_array_len.val_codec);
3292
3293
41
    free(c);
3294
41
}
3295
3296
cram_codec *cram_byte_array_len_decode_init(cram_block_compression_hdr *hdr,
3297
                                            char *data, int size,
3298
                                            enum cram_encoding codec,
3299
                                            enum cram_external_type option,
3300
41
                                            int version, varint_vec *vv) {
3301
41
    cram_codec *c;
3302
41
    char *cp   = data;
3303
41
    char *endp = data + size;
3304
3305
41
    if (!(c = malloc(sizeof(*c))))
3306
0
        return NULL;
3307
3308
41
    c->codec  = E_BYTE_ARRAY_LEN;
3309
41
    c->decode = cram_byte_array_len_decode;
3310
41
    c->free   = cram_byte_array_len_decode_free;
3311
41
    c->u.byte_array_len.len_codec = NULL;
3312
41
    c->u.byte_array_len.val_codec = NULL;
3313
3314
41
    int encoding = vv->varint_get32(&cp, endp, NULL);
3315
41
    int sub_size = vv->varint_get32(&cp, endp, NULL);
3316
41
    if (sub_size < 0 || endp - cp < sub_size)
3317
2
        goto malformed;
3318
39
    c->u.byte_array_len.len_codec = cram_decoder_init(hdr, encoding, cp, sub_size,
3319
39
                                                      E_INT, version, vv);
3320
39
    if (c->u.byte_array_len.len_codec == NULL)
3321
3
        goto no_codec;
3322
36
    cp += sub_size;
3323
3324
36
    encoding = vv->varint_get32(&cp, endp, NULL);
3325
36
    sub_size = vv->varint_get32(&cp, endp, NULL);
3326
36
    if (sub_size < 0 || endp - cp < sub_size)
3327
0
        goto malformed;
3328
36
    c->u.byte_array_len.val_codec = cram_decoder_init(hdr, encoding, cp, sub_size,
3329
36
                                                      option, version, vv);
3330
36
    if (c->u.byte_array_len.val_codec == NULL)
3331
0
        goto no_codec;
3332
36
    cp += sub_size;
3333
3334
36
    if (cp - data != size)
3335
0
        goto malformed;
3336
3337
36
    return c;
3338
3339
2
 malformed:
3340
2
    hts_log_error("Malformed byte_array_len header stream");
3341
5
 no_codec:
3342
5
    cram_byte_array_len_decode_free(c);
3343
5
    return NULL;
3344
2
}
3345
3346
int cram_byte_array_len_encode(cram_slice *slice, cram_codec *c,
3347
0
                               char *in, int in_size) {
3348
0
    int32_t i32 = in_size;
3349
0
    int r = 0;
3350
3351
0
    r |= c->u.e_byte_array_len.len_codec->encode(slice,
3352
0
                                                 c->u.e_byte_array_len.len_codec,
3353
0
                                                 (char *)&i32, 1);
3354
0
    r |= c->u.e_byte_array_len.val_codec->encode(slice,
3355
0
                                                 c->u.e_byte_array_len.val_codec,
3356
0
                                                 in, in_size);
3357
0
    return r;
3358
0
}
3359
3360
0
void cram_byte_array_len_encode_free(cram_codec *c) {
3361
0
    if (!c)
3362
0
        return;
3363
3364
0
    if (c->u.e_byte_array_len.len_codec)
3365
0
        c->u.e_byte_array_len.len_codec->free(c->u.e_byte_array_len.len_codec);
3366
3367
0
    if (c->u.e_byte_array_len.val_codec)
3368
0
        c->u.e_byte_array_len.val_codec->free(c->u.e_byte_array_len.val_codec);
3369
3370
0
    free(c);
3371
0
}
3372
3373
int cram_byte_array_len_encode_store(cram_codec *c, cram_block *b,
3374
0
                                     char *prefix, int version) {
3375
0
    int len = 0, len2, len3, r = 0, n;
3376
0
    cram_codec *tc;
3377
0
    cram_block *b_len = NULL, *b_val = NULL;
3378
3379
0
    if (prefix) {
3380
0
        size_t l = strlen(prefix);
3381
0
        BLOCK_APPEND(b, prefix, l);
3382
0
        len += l;
3383
0
    }
3384
3385
0
    tc = c->u.e_byte_array_len.len_codec;
3386
0
    b_len = cram_new_block(0, 0);
3387
0
    if (!b_len) goto block_err;
3388
0
    len2 = tc->store(tc, b_len, NULL, version);
3389
0
    if (len2 < 0) goto block_err;
3390
3391
0
    tc = c->u.e_byte_array_len.val_codec;
3392
0
    b_val = cram_new_block(0, 0);
3393
0
    if (!b_val) goto block_err;
3394
0
    len3 = tc->store(tc, b_val, NULL, version);
3395
0
    if (len3 < 0) goto block_err;
3396
3397
0
    len += (n = c->vv->varint_put32_blk(b, c->codec));  r |= n;
3398
0
    len += (n = c->vv->varint_put32_blk(b, len2+len3)); r |= n;
3399
0
    BLOCK_APPEND(b, BLOCK_DATA(b_len), BLOCK_SIZE(b_len));
3400
0
    BLOCK_APPEND(b, BLOCK_DATA(b_val), BLOCK_SIZE(b_val));
3401
3402
0
    cram_free_block(b_len);
3403
0
    cram_free_block(b_val);
3404
3405
0
    if (r > 0)
3406
0
        return len + len2 + len3;
3407
3408
0
 block_err:
3409
0
    if (b_len) cram_free_block(b_len);
3410
0
    if (b_val) cram_free_block(b_val);
3411
0
    return -1;
3412
0
}
3413
3414
cram_codec *cram_byte_array_len_encode_init(cram_stats *st,
3415
                                            enum cram_encoding codec,
3416
                                            enum cram_external_type option,
3417
                                            void *dat,
3418
0
                                            int version, varint_vec *vv) {
3419
0
    cram_codec *c;
3420
0
    cram_byte_array_len_encoder *e = (cram_byte_array_len_encoder *)dat;
3421
3422
0
    c = malloc(sizeof(*c));
3423
0
    if (!c)
3424
0
        return NULL;
3425
0
    c->codec = E_BYTE_ARRAY_LEN;
3426
0
    c->free = cram_byte_array_len_encode_free;
3427
0
    c->encode = cram_byte_array_len_encode;
3428
0
    c->store = cram_byte_array_len_encode_store;
3429
0
    c->flush = NULL;
3430
3431
0
    c->u.e_byte_array_len.len_codec = cram_encoder_init(e->len_encoding,
3432
0
                                                        st, E_INT,
3433
0
                                                        e->len_dat,
3434
0
                                                        version, vv);
3435
0
    c->u.e_byte_array_len.val_codec = cram_encoder_init(e->val_encoding,
3436
0
                                                        NULL, E_BYTE_ARRAY,
3437
0
                                                        e->val_dat,
3438
0
                                                        version, vv);
3439
3440
0
    if (!c->u.e_byte_array_len.len_codec ||
3441
0
        !c->u.e_byte_array_len.val_codec) {
3442
0
        cram_byte_array_len_encode_free(c);
3443
0
        return NULL;
3444
0
    }
3445
3446
0
    return c;
3447
0
}
3448
3449
/*
3450
 * ---------------------------------------------------------------------------
3451
 * BYTE_ARRAY_STOP
3452
 */
3453
static int cram_byte_array_stop_decode_char(cram_slice *slice, cram_codec *c,
3454
                                            cram_block *in, char *out,
3455
0
                                            int *out_size) {
3456
0
    char *cp, ch;
3457
0
    cram_block *b = NULL;
3458
3459
0
    b = cram_get_block_by_id(slice, c->u.byte_array_stop.content_id);
3460
0
    if (!b)
3461
0
        return *out_size?-1:0;
3462
3463
0
    if (b->idx >= b->uncomp_size)
3464
0
        return -1;
3465
3466
0
    cp = (char *)b->data + b->idx;
3467
0
    if (out) {
3468
0
        while ((ch = *cp) != (char)c->u.byte_array_stop.stop) {
3469
0
            if (cp - (char *)b->data >= b->uncomp_size)
3470
0
                return -1;
3471
0
            *out++ = ch;
3472
0
            cp++;
3473
0
        }
3474
0
    } else {
3475
        // Consume input, but produce no output
3476
0
        while ((ch = *cp) != (char)c->u.byte_array_stop.stop) {
3477
0
            if (cp - (char *)b->data >= b->uncomp_size)
3478
0
                return -1;
3479
0
            cp++;
3480
0
        }
3481
0
    }
3482
3483
0
    *out_size = cp - (char *)(b->data + b->idx);
3484
0
    b->idx = cp - (char *)b->data + 1;
3485
3486
0
    return 0;
3487
0
}
3488
3489
int cram_byte_array_stop_decode_block(cram_slice *slice, cram_codec *c,
3490
                                      cram_block *in, char *out_,
3491
0
                                      int *out_size) {
3492
0
    cram_block *b;
3493
0
    cram_block *out = (cram_block *)out_;
3494
0
    unsigned char *cp, *cp_end;
3495
0
    unsigned char stop;
3496
3497
0
    b = cram_get_block_by_id(slice, c->u.byte_array_stop.content_id);
3498
0
    if (!b)
3499
0
        return *out_size?-1:0;
3500
3501
0
    if (b->idx >= b->uncomp_size)
3502
0
        return -1;
3503
0
    cp = b->data + b->idx;
3504
0
    cp_end = b->data + b->uncomp_size;
3505
3506
0
    stop = c->u.byte_array_stop.stop;
3507
0
    if (cp_end - cp < out->alloc - out->byte) {
3508
0
        unsigned char *out_cp = BLOCK_END(out);
3509
0
        while (cp != cp_end && *cp != stop)
3510
0
            *out_cp++ = *cp++;
3511
0
        BLOCK_SIZE(out) = out_cp - BLOCK_DATA(out);
3512
0
    } else {
3513
0
        unsigned char *cp_start;
3514
0
        for (cp_start = cp; cp != cp_end && *cp != stop; cp++)
3515
0
            ;
3516
0
        BLOCK_APPEND(out, cp_start, cp - cp_start);
3517
0
        BLOCK_GROW(out, cp - cp_start);
3518
0
    }
3519
3520
0
    *out_size = cp - (b->data + b->idx);
3521
0
    b->idx = cp - b->data + 1;
3522
3523
0
    return 0;
3524
3525
0
 block_err:
3526
0
    return -1;
3527
0
}
3528
3529
167
void cram_byte_array_stop_decode_free(cram_codec *c) {
3530
167
    if (!c) return;
3531
3532
167
    free(c);
3533
167
}
3534
3535
cram_codec *cram_byte_array_stop_decode_init(cram_block_compression_hdr *hdr,
3536
                                             char *data, int size,
3537
                                             enum cram_encoding codec,
3538
                                             enum cram_external_type option,
3539
168
                                             int version, varint_vec *vv) {
3540
168
    cram_codec *c = NULL;
3541
168
    unsigned char *cp = (unsigned char *)data;
3542
168
    int err = 0;
3543
3544
168
    if (size < (CRAM_MAJOR_VERS(version) == 1 ? 5 : 2))
3545
1
        goto malformed;
3546
3547
167
    if (!(c = malloc(sizeof(*c))))
3548
0
        return NULL;
3549
3550
167
    c->codec  = E_BYTE_ARRAY_STOP;
3551
167
    switch (option) {
3552
157
    case E_BYTE_ARRAY_BLOCK:
3553
157
        c->decode = cram_byte_array_stop_decode_block;
3554
157
        break;
3555
10
    case E_BYTE_ARRAY:
3556
10
        c->decode = cram_byte_array_stop_decode_char;
3557
10
        break;
3558
0
    default:
3559
0
        hts_log_error("The byte_array_stop codec only supports BYTE_ARRAYs");
3560
0
        free(c);
3561
0
        return NULL;
3562
167
    }
3563
167
    c->free   = cram_byte_array_stop_decode_free;
3564
3565
167
    c->u.byte_array_stop.stop = *cp++;
3566
167
    if (CRAM_MAJOR_VERS(version) == 1) {
3567
167
        c->u.byte_array_stop.content_id = cp[0] + (cp[1]<<8) + (cp[2]<<16)
3568
167
            + ((unsigned int) cp[3]<<24);
3569
167
        cp += 4;
3570
167
    } else {
3571
0
        c->u.byte_array_stop.content_id = vv->varint_get32((char **)&cp, data+size, &err);
3572
0
    }
3573
3574
167
    if ((char *)cp - data != size || err)
3575
0
        goto malformed;
3576
3577
167
    return c;
3578
3579
1
 malformed:
3580
1
    hts_log_error("Malformed byte_array_stop header stream");
3581
1
    free(c);
3582
1
    return NULL;
3583
167
}
3584
3585
int cram_byte_array_stop_encode(cram_slice *slice, cram_codec *c,
3586
0
                                char *in, int in_size) {
3587
0
    BLOCK_APPEND(c->out, in, in_size);
3588
0
    BLOCK_APPEND_CHAR(c->out, c->u.e_byte_array_stop.stop);
3589
0
    return 0;
3590
3591
0
 block_err:
3592
0
    return -1;
3593
0
}
3594
3595
0
void cram_byte_array_stop_encode_free(cram_codec *c) {
3596
0
    if (!c)
3597
0
        return;
3598
0
    free(c);
3599
0
}
3600
3601
int cram_byte_array_stop_encode_store(cram_codec *c, cram_block *b,
3602
0
                                      char *prefix, int version) {
3603
0
    int len = 0;
3604
0
    char buf[20], *cp = buf;
3605
3606
0
    if (prefix) {
3607
0
        size_t l = strlen(prefix);
3608
0
        BLOCK_APPEND(b, prefix, l);
3609
0
        len += l;
3610
0
    }
3611
3612
0
    cp += c->vv->varint_put32(cp, buf+20, c->codec);
3613
3614
0
    if (CRAM_MAJOR_VERS(version) == 1) {
3615
0
        cp += c->vv->varint_put32(cp, buf+20, 5);
3616
0
        *cp++ = c->u.e_byte_array_stop.stop;
3617
0
        *cp++ = (c->u.e_byte_array_stop.content_id >>  0) & 0xff;
3618
0
        *cp++ = (c->u.e_byte_array_stop.content_id >>  8) & 0xff;
3619
0
        *cp++ = (c->u.e_byte_array_stop.content_id >> 16) & 0xff;
3620
0
        *cp++ = (c->u.e_byte_array_stop.content_id >> 24) & 0xff;
3621
0
    } else {
3622
0
        cp += c->vv->varint_put32(cp, buf+20, 1 +
3623
0
                                  c->vv->varint_size(c->u.e_byte_array_stop.content_id));
3624
0
        *cp++ = c->u.e_byte_array_stop.stop;
3625
0
        cp += c->vv->varint_put32(cp, buf+20, c->u.e_byte_array_stop.content_id);
3626
0
    }
3627
3628
0
    BLOCK_APPEND(b, buf, cp-buf);
3629
0
    len += cp-buf;
3630
3631
0
    return len;
3632
3633
0
 block_err:
3634
0
    return -1;
3635
0
}
3636
3637
cram_codec *cram_byte_array_stop_encode_init(cram_stats *st,
3638
                                             enum cram_encoding codec,
3639
                                             enum cram_external_type option,
3640
                                             void *dat,
3641
0
                                             int version, varint_vec *vv) {
3642
0
    cram_codec *c;
3643
3644
0
    c = malloc(sizeof(*c));
3645
0
    if (!c)
3646
0
        return NULL;
3647
0
    c->codec = E_BYTE_ARRAY_STOP;
3648
0
    c->free = cram_byte_array_stop_encode_free;
3649
0
    c->encode = cram_byte_array_stop_encode;
3650
0
    c->store = cram_byte_array_stop_encode_store;
3651
0
    c->flush = NULL;
3652
3653
0
    c->u.e_byte_array_stop.stop = ((int *)dat)[0];
3654
0
    c->u.e_byte_array_stop.content_id = ((int *)dat)[1];
3655
3656
0
    return c;
3657
0
}
3658
3659
/*
3660
 * ---------------------------------------------------------------------------
3661
 */
3662
3663
7
const char *cram_encoding2str(enum cram_encoding t) {
3664
7
    switch (t) {
3665
3
    case E_NULL:            return "NULL";
3666
0
    case E_EXTERNAL:        return "EXTERNAL";
3667
1
    case E_GOLOMB:          return "GOLOMB";
3668
0
    case E_HUFFMAN:         return "HUFFMAN";
3669
0
    case E_BYTE_ARRAY_LEN:  return "BYTE_ARRAY_LEN";
3670
0
    case E_BYTE_ARRAY_STOP: return "BYTE_ARRAY_STOP";
3671
0
    case E_BETA:            return "BETA";
3672
0
    case E_SUBEXP:          return "SUBEXP";
3673
0
    case E_GOLOMB_RICE:     return "GOLOMB_RICE";
3674
0
    case E_GAMMA:           return "GAMMA";
3675
3676
0
    case E_VARINT_UNSIGNED: return "VARINT_UNSIGNED";
3677
0
    case E_VARINT_SIGNED:   return "VARINT_SIGNED";
3678
0
    case E_CONST_BYTE:      return "CONST_BYTE";
3679
0
    case E_CONST_INT:       return "CONST_INT";
3680
3681
0
    case E_NUM_CODECS:
3682
3
    default:                return "?";
3683
7
    }
3684
7
}
3685
3686
static cram_codec *(*decode_init[])(cram_block_compression_hdr *hdr,
3687
                                    char *data,
3688
                                    int size,
3689
                                    enum cram_encoding codec,
3690
                                    enum cram_external_type option,
3691
                                    int version, varint_vec *vv) = {
3692
    // CRAM 3.0 valid codecs
3693
    NULL, // null codec
3694
    cram_external_decode_init,
3695
    NULL, // golomb
3696
    cram_huffman_decode_init,
3697
    cram_byte_array_len_decode_init,
3698
    cram_byte_array_stop_decode_init,
3699
    cram_beta_decode_init,
3700
    cram_subexp_decode_init,
3701
    NULL, // golomb rice
3702
    cram_gamma_decode_init,
3703
3704
    // Gap between CRAM 3 and CRAM 4; 9 to 39 inclusive
3705
    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
3706
    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
3707
    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
3708
3709
    NULL,                      // was xbyte
3710
    cram_varint_decode_init,   // varint unsigned
3711
    cram_varint_decode_init,   // varint signed
3712
    cram_const_decode_init,    // const byte
3713
    cram_const_decode_init,    // const int
3714
3715
    // Gap to CRAM 4 transfomrations; 45 to 49 inclusive
3716
    NULL, NULL, NULL, NULL, NULL,
3717
3718
    NULL, // xhuffman
3719
    cram_xpack_decode_init,
3720
    cram_xrle_decode_init,
3721
    cram_xdelta_decode_init,
3722
};
3723
3724
cram_codec *cram_decoder_init(cram_block_compression_hdr *hdr,
3725
                              enum cram_encoding codec,
3726
                              char *data, int size,
3727
                              enum cram_external_type option,
3728
1.53k
                              int version, varint_vec *vv) {
3729
1.53k
    if (codec >= E_NULL && codec < E_NUM_CODECS && decode_init[codec]) {
3730
1.52k
        cram_codec *r = decode_init[codec](hdr, data, size, codec,
3731
1.52k
                                           option, version, vv);
3732
1.52k
        if (r) {
3733
1.50k
            r->vv = vv;
3734
1.50k
            r->codec_id = hdr->ncodecs++;
3735
1.50k
        }
3736
1.52k
        return r;
3737
1.52k
    } else {
3738
7
        hts_log_error("Unimplemented codec of type %s", cram_encoding2str(codec));
3739
7
        return NULL;
3740
7
    }
3741
1.53k
}
3742
3743
static cram_codec *(*encode_init[])(cram_stats *stx,
3744
                                    enum cram_encoding codec,
3745
                                    enum cram_external_type option,
3746
                                    void *opt,
3747
                                    int version, varint_vec *vv) = {
3748
    // CRAM 3.0 valid codecs
3749
    NULL, // null codec
3750
    cram_external_encode_init, // int/bytes in cram 3, byte only in cram 4
3751
    NULL, // golomb
3752
    cram_huffman_encode_init,
3753
    cram_byte_array_len_encode_init,
3754
    cram_byte_array_stop_encode_init,
3755
    cram_beta_encode_init,
3756
    NULL, // subexponential (we support decode only)
3757
    NULL, // golomb rice
3758
    NULL, // gamma (we support decode only)
3759
3760
    // Gap between CRAM 3 and CRAM 4; 9 to 39 inclusive
3761
    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
3762
    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
3763
    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
3764
3765
    NULL, // was xbyte
3766
    cram_varint_encode_init, // varint unsigned
3767
    cram_varint_encode_init, // varint signed
3768
    cram_const_encode_init,  // const byte
3769
    cram_const_encode_init,  // const int
3770
3771
    // Gap to CRAM 4 transfomrations; 45 to 49 inclusive
3772
    NULL, NULL, NULL, NULL, NULL,
3773
3774
    NULL, // xhuffman
3775
    cram_xpack_encode_init,
3776
    cram_xrle_encode_init,
3777
    cram_xdelta_encode_init,
3778
};
3779
3780
cram_codec *cram_encoder_init(enum cram_encoding codec,
3781
                              cram_stats *st,
3782
                              enum cram_external_type option,
3783
                              void *dat,
3784
0
                              int version, varint_vec *vv) {
3785
0
    if (st && !st->nvals)
3786
0
        return NULL;
3787
3788
    // cram_stats_encoding assumes integer data, but if option
3789
    // is E_BYTE then tweak the requested encoding.  This ought
3790
    // to be fixed in cram_stats_encoding instead.
3791
0
    if (option == E_BYTE || option == E_BYTE_ARRAY ||
3792
0
       option == E_BYTE_ARRAY_BLOCK) {
3793
0
       if (codec == E_VARINT_SIGNED || codec == E_VARINT_UNSIGNED)
3794
0
           codec = E_EXTERNAL;
3795
0
       else if (codec == E_CONST_INT)
3796
0
           codec = E_CONST_BYTE;
3797
0
    }
3798
3799
0
    if (encode_init[codec]) {
3800
0
        cram_codec *r;
3801
0
        if ((r = encode_init[codec](st, codec, option, dat, version, vv)))
3802
0
            r->out = NULL;
3803
0
        if (!r) {
3804
0
            hts_log_error("Unable to initialise codec of type %s", cram_encoding2str(codec));
3805
0
            return NULL;
3806
0
        }
3807
0
        r->vv = vv;
3808
0
        return r;
3809
0
    } else {
3810
0
        hts_log_error("Unimplemented codec of type %s", cram_encoding2str(codec));
3811
0
        abort();
3812
0
    }
3813
0
}
3814
3815
/*
3816
 * Returns the content_id used by this codec, also in id2 if byte_array_len.
3817
 * Returns -1 for the CORE block and -2 for unneeded.
3818
 * id2 is only filled out for BYTE_ARRAY_LEN which uses 2 codecs.
3819
 */
3820
0
int cram_codec_to_id(cram_codec *c, int *id2) {
3821
0
    int bnum1, bnum2 = -2;
3822
3823
0
    switch (c->codec) {
3824
0
    case E_CONST_INT:
3825
0
    case E_CONST_BYTE:
3826
0
       bnum1 = -2; // no blocks used
3827
3828
0
    case E_HUFFMAN:
3829
0
        bnum1 = c->u.huffman.ncodes == 1 ? -2 : -1;
3830
0
        break;
3831
3832
0
    case E_GOLOMB:
3833
0
    case E_BETA:
3834
0
    case E_SUBEXP:
3835
0
    case E_GOLOMB_RICE:
3836
0
    case E_GAMMA:
3837
        // CORE block
3838
0
        bnum1 = -1;
3839
0
        break;
3840
3841
0
    case E_EXTERNAL:
3842
0
    case E_VARINT_UNSIGNED:
3843
0
    case E_VARINT_SIGNED:
3844
0
        bnum1 = c->u.external.content_id;
3845
0
        break;
3846
3847
0
    case E_BYTE_ARRAY_LEN:
3848
0
        bnum1 = cram_codec_to_id(c->u.byte_array_len.len_codec, NULL);
3849
0
        bnum2 = cram_codec_to_id(c->u.byte_array_len.val_codec, NULL);
3850
0
        break;
3851
3852
0
    case E_BYTE_ARRAY_STOP:
3853
0
        bnum1 = c->u.byte_array_stop.content_id;
3854
0
        break;
3855
3856
0
    case E_NULL:
3857
0
        bnum1 = -2;
3858
0
        break;
3859
3860
0
    default:
3861
0
        hts_log_error("Unknown codec type %d", c->codec);
3862
0
        bnum1 = -1;
3863
0
    }
3864
3865
0
    if (id2)
3866
0
        *id2 = bnum2;
3867
0
    return bnum1;
3868
0
}
3869
3870
3871
/*
3872
 * cram_codec structures are specialised for decoding or encoding.
3873
 * Unfortunately this makes turning a decoder into an encoder (such as
3874
 * when transcoding files) problematic.
3875
 *
3876
 * This function converts a cram decoder codec into an encoder version
3877
 * in-place (ie it modifiers the codec itself).
3878
 *
3879
 * Returns 0 on success;
3880
 *        -1 on failure.
3881
 */
3882
0
int cram_codec_decoder2encoder(cram_fd *fd, cram_codec *c) {
3883
0
    int j;
3884
3885
0
    switch (c->codec) {
3886
0
    case E_CONST_INT:
3887
0
    case E_CONST_BYTE:
3888
        // shares struct with decode
3889
0
        c->store = cram_const_encode_store;
3890
0
        break;
3891
3892
0
    case E_EXTERNAL:
3893
        // shares struct with decode
3894
0
        c->free = cram_external_encode_free;
3895
0
        c->store = cram_external_encode_store;
3896
0
        if (c->decode == cram_external_decode_int)
3897
0
            c->encode = cram_external_encode_int;
3898
0
        else if (c->decode == cram_external_decode_long)
3899
0
            c->encode = cram_external_encode_long;
3900
0
        else if (c->decode == cram_external_decode_char)
3901
0
            c->encode = cram_external_encode_char;
3902
0
        else if (c->decode == cram_external_decode_block)
3903
0
            c->encode = cram_external_encode_char;
3904
0
        else
3905
0
            return -1;
3906
0
        break;
3907
3908
0
    case E_VARINT_SIGNED:
3909
0
    case E_VARINT_UNSIGNED:
3910
        // shares struct with decode
3911
0
        c->free = cram_varint_encode_free;
3912
0
        c->store = cram_varint_encode_store;
3913
0
        if (c->decode == cram_varint_decode_int)
3914
0
            c->encode = cram_varint_encode_int;
3915
0
        else if (c->decode == cram_varint_decode_sint)
3916
0
            c->encode = cram_varint_encode_sint;
3917
0
        else if (c->decode == cram_varint_decode_long)
3918
0
            c->encode = cram_varint_encode_long;
3919
0
        else if (c->decode == cram_varint_decode_slong)
3920
0
            c->encode = cram_varint_encode_slong;
3921
0
        else
3922
0
            return -1;
3923
0
        break;
3924
3925
0
    case E_HUFFMAN: {
3926
        // New structure, so switch.
3927
        // FIXME: we huffman and e_huffman structs amended, we could
3928
        // unify this.
3929
0
        cram_codec *t = malloc(sizeof(*t));
3930
0
        if (!t) return -1;
3931
0
        t->vv     = c->vv;
3932
0
        t->codec = E_HUFFMAN;
3933
0
        t->free = cram_huffman_encode_free;
3934
0
        t->store = cram_huffman_encode_store;
3935
0
        t->u.e_huffman.codes = c->u.huffman.codes;
3936
0
        t->u.e_huffman.nvals = c->u.huffman.ncodes;
3937
0
        t->u.e_huffman.option = c->u.huffman.option;
3938
0
        for (j = 0; j < t->u.e_huffman.nvals; j++) {
3939
0
            int32_t sym = t->u.e_huffman.codes[j].symbol;
3940
0
            if (sym >= -1 && sym < MAX_HUFF)
3941
0
                t->u.e_huffman.val2code[sym+1] = j;
3942
0
        }
3943
3944
0
        if (c->decode == cram_huffman_decode_char0)
3945
0
            t->encode = cram_huffman_encode_char0;
3946
0
        else if (c->decode == cram_huffman_decode_char)
3947
0
            t->encode = cram_huffman_encode_char;
3948
0
        else if (c->decode == cram_huffman_decode_int0)
3949
0
            t->encode = cram_huffman_encode_int0;
3950
0
        else if (c->decode == cram_huffman_decode_int)
3951
0
            t->encode = cram_huffman_encode_int;
3952
0
        else if (c->decode == cram_huffman_decode_long0)
3953
0
            t->encode = cram_huffman_encode_long0;
3954
0
        else if (c->decode == cram_huffman_decode_long)
3955
0
            t->encode = cram_huffman_encode_long;
3956
0
        else {
3957
0
            free(t);
3958
0
            return -1;
3959
0
        }
3960
0
        *c = *t;
3961
0
        free(t);
3962
0
        break;
3963
0
    }
3964
3965
0
    case E_BETA:
3966
        // shares struct with decode
3967
0
        c->free = cram_beta_encode_free;
3968
0
        c->store = cram_beta_encode_store;
3969
0
        if (c->decode == cram_beta_decode_int)
3970
0
            c->encode = cram_beta_encode_int;
3971
0
        else if (c->decode == cram_beta_decode_long)
3972
0
            c->encode = cram_beta_encode_long;
3973
0
        else if (c->decode == cram_beta_decode_char)
3974
0
            c->encode = cram_beta_encode_char;
3975
0
        else
3976
0
            return -1;
3977
0
        break;
3978
3979
0
    case E_XPACK: {
3980
        // shares struct with decode
3981
0
        cram_codec t = *c;
3982
0
        t.free = cram_xpack_encode_free;
3983
0
        t.store = cram_xpack_encode_store;
3984
0
        if (t.decode == cram_xpack_decode_long)
3985
0
            t.encode = cram_xpack_encode_long;
3986
0
        else if (t.decode == cram_xpack_decode_int)
3987
0
            t.encode = cram_xpack_encode_int;
3988
0
        else if (t.decode == cram_xpack_decode_char)
3989
0
            t.encode = cram_xpack_encode_char;
3990
0
        else
3991
0
            return -1;
3992
0
        t.u.e_xpack.sub_codec = t.u.xpack.sub_codec;
3993
0
        if (cram_codec_decoder2encoder(fd, t.u.e_xpack.sub_codec) == -1)
3994
0
            return -1;
3995
0
        *c = t;
3996
0
        break;
3997
0
    }
3998
3999
0
    case E_BYTE_ARRAY_LEN: {
4000
0
        cram_codec *t = malloc(sizeof(*t));
4001
0
        if (!t) return -1;
4002
0
        t->vv     = c->vv;
4003
0
        t->codec  = E_BYTE_ARRAY_LEN;
4004
0
        t->free   = cram_byte_array_len_encode_free;
4005
0
        t->store  = cram_byte_array_len_encode_store;
4006
0
        t->encode = cram_byte_array_len_encode;
4007
0
        t->u.e_byte_array_len.len_codec = c->u.byte_array_len.len_codec;
4008
0
        t->u.e_byte_array_len.val_codec = c->u.byte_array_len.val_codec;
4009
0
        if (cram_codec_decoder2encoder(fd, t->u.e_byte_array_len.len_codec) == -1 ||
4010
0
            cram_codec_decoder2encoder(fd, t->u.e_byte_array_len.val_codec) == -1) {
4011
0
            t->free(t);
4012
0
            return -1;
4013
0
        }
4014
4015
        // {len,val}_{encoding,dat} are undefined, but unused.
4016
        // Leaving them unset here means we can test that assertion.
4017
0
        *c = *t;
4018
0
        free(t);
4019
0
        break;
4020
0
    }
4021
4022
0
    case E_BYTE_ARRAY_STOP:
4023
        // shares struct with decode
4024
0
        c->free   = cram_byte_array_stop_encode_free;
4025
0
        c->store  = cram_byte_array_stop_encode_store;
4026
0
        c->encode = cram_byte_array_stop_encode;
4027
0
        break;
4028
4029
0
    default:
4030
0
        return -1;
4031
0
    }
4032
4033
0
    return 0;
4034
0
}