Coverage Report

Created: 2025-07-09 06:49

/src/htslib/cram/cram_codecs.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
Copyright (c) 2012-2021,2023 Genome Research Ltd.
3
Author: James Bonfield <jkb@sanger.ac.uk>
4
5
Redistribution and use in source and binary forms, with or without
6
modification, are permitted provided that the following conditions are met:
7
8
   1. Redistributions of source code must retain the above copyright notice,
9
this list of conditions and the following disclaimer.
10
11
   2. Redistributions in binary form must reproduce the above copyright notice,
12
this list of conditions and the following disclaimer in the documentation
13
and/or other materials provided with the distribution.
14
15
   3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
16
Institute nor the names of its contributors may be used to endorse or promote
17
products derived from this software without specific prior written permission.
18
19
THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
20
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
23
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
*/
30
31
/*
32
 * FIXME: add checking of cram_external_type to return NULL on unsupported
33
 * {codec,type} tuples.
34
 */
35
36
#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
37
#include <config.h>
38
39
#include <stdlib.h>
40
#include <string.h>
41
#include <assert.h>
42
#include <limits.h>
43
#include <stdint.h>
44
#include <errno.h>
45
#include <stddef.h>
46
47
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
48
#include "../fuzz_settings.h"
49
#endif
50
51
#include "../htslib/hts_endian.h"
52
53
#if defined(HAVE_EXTERNAL_LIBHTSCODECS)
54
#include <htscodecs/varint.h>
55
#include <htscodecs/pack.h>
56
#include <htscodecs/rle.h>
57
#else
58
#include "../htscodecs/htscodecs/varint.h"
59
#include "../htscodecs/htscodecs/pack.h"
60
#include "../htscodecs/htscodecs/rle.h"
61
#endif
62
63
#include "cram.h"
64
65
/*
66
 * ---------------------------------------------------------------------------
67
 * Block bit-level I/O functions.
68
 * All defined static here to promote easy inlining by the compiler.
69
 */
70
71
#if 0
72
/* Get a single bit, MSB first */
73
/*
 * Fetches the bit at the block's current bit cursor and steps the cursor
 * on by one bit, moving to bit 7 of the following byte once bit 0 of the
 * current byte has been consumed.
 *
 * Returns the bit value (0 or 1) on success;
 *        -1 if the byte cursor is greater than block->alloc.
 *
 * NOTE(review): this function sits inside an "#if 0" region, so it is
 * not compiled.
 */
static signed int get_bit_MSB(cram_block *block) {
    unsigned int shifted;

    if (block->byte > block->alloc)
        return -1;

    shifted = block->data[block->byte] >> block->bit;

    block->bit--;
    if (block->bit == -1) {
        /* Current byte exhausted; restart at the top of the next one */
        block->bit = 7;
        block->byte++;
    }

    return shifted & 1;
}
90
#endif
91
92
/*
93
 * Count number of successive 0 and 1 bits
94
 */
95
0
/*
 * Consumes a run of 1 bits together with the 0 bit that terminates it,
 * reading from the block's current bit cursor.
 *
 * Returns the number of 1 bits preceding the terminating 0 bit;
 *        -1 if the cursor starts at or beyond uncomp_size, or if the
 *           data ends (byte cursor reaches uncomp_size) on a 1 bit.
 */
static int get_one_bits_MSB(cram_block *block) {
    int ones = 0;

    if (block->byte >= block->uncomp_size)
        return -1;

    for (;;) {
        int bit = (block->data[block->byte] >> block->bit) & 1;

        block->bit--;
        if (block->bit == -1) {
            block->bit = 7;
            block->byte++;
            /* Ran out of data before seeing the terminating 0 */
            if (bit && block->byte == block->uncomp_size)
                return -1;
        }

        if (!bit)
            return ones;
        ones++;
    }
}
112
113
0
/*
 * Consumes a run of 0 bits together with the 1 bit that terminates it,
 * reading from the block's current bit cursor.
 *
 * Returns the number of 0 bits preceding the terminating 1 bit;
 *        -1 if the cursor starts at or beyond uncomp_size, or if the
 *           data ends (byte cursor reaches uncomp_size) on a 0 bit.
 */
static int get_zero_bits_MSB(cram_block *block) {
    int zeros = 0;

    if (block->byte >= block->uncomp_size)
        return -1;

    for (;;) {
        int bit = (block->data[block->byte] >> block->bit) & 1;

        block->bit--;
        if (block->bit == -1) {
            block->bit = 7;
            block->byte++;
            /* Ran out of data before seeing the terminating 1 */
            if (!bit && block->byte == block->uncomp_size)
                return -1;
        }

        if (bit)
            return zeros;
        zeros++;
    }
}
130
131
#if 0
132
/* Stores a single bit */
133
/*
 * Writes one bit at the block's bit cursor and advances the cursor.
 * The buffer is doubled (or set to 1024 bytes when empty) whenever the
 * byte cursor has reached block->alloc.  Whenever the cursor moves on
 * to a new byte, that byte is zeroed ready for subsequent bits.
 *
 * NOTE(review): this function sits inside an "#if 0" region, so it is
 * not compiled; the realloc() result is not checked.
 */
static void store_bit_MSB(cram_block *block, unsigned int bit) {
    if (block->byte >= block->alloc) {
        if (block->alloc)
            block->alloc = block->alloc*2;
        else
            block->alloc = 1024;
        block->data = realloc(block->data, block->alloc);
    }

    if (bit)
        block->data[block->byte] |= (1 << block->bit);

    block->bit--;
    if (block->bit == -1) {
        block->bit = 7;
        block->byte++;
        block->data[block->byte] = 0;
    }
}
148
#endif
149
150
#if 0
151
/* Rounds to the next whole byte boundary first */
152
/*
 * Appends 'len' bytes from 'bytes' to the block.  If the bit cursor is
 * part way through a byte, that byte is skipped first so the copy starts
 * on a byte boundary.  The buffer is grown by doubling (starting at 1024)
 * until the copy fits.
 *
 * NOTE(review): this function sits inside an "#if 0" region, so it is
 * not compiled; the realloc() result is not checked.
 */
static void store_bytes_MSB(cram_block *block, char *bytes, int len) {
    if (block->bit != 7) {
        block->byte++;
        block->bit = 7;
    }

    while (block->byte + len >= block->alloc) {
        if (block->alloc)
            block->alloc = block->alloc*2;
        else
            block->alloc = 1024;
        block->data = realloc(block->data, block->alloc);
    }

    memcpy(&block->data[block->byte], bytes, len);
    block->byte += len;
}
166
#endif
167
168
/* Local optimised copy for inlining */
169
0
/*
 * Reads 'nbits' bits from the block, most significant bit first, and
 * returns them as an integer.
 *
 * Requests that fit inside what remains of the current byte are served
 * with a single shift and mask.  Anything else is assembled one bit at
 * a time via GET_BIT_MSB; lengths of 8 or fewer use an unrolled switch
 * with deliberate fall through, longer lengths use a plain loop.
 *
 * No bounds checking against the block size is done in this function
 * itself.
 */
static inline int64_t get_bits_MSB(cram_block *block, int nbits) {
    uint64_t val = 0;
    int i;

    /* Fast path: everything we need is in the current byte */
    if (nbits <= block->bit+1) {
        val = (block->data[block->byte] >> (block->bit-(nbits-1)))
            & ((1<<nbits)-1);

        block->bit -= nbits;
        if (block->bit == -1) {
            block->bit = 7;
            block->byte++;
        }

        return val;
    }

    /* Slow path: the request straddles a byte boundary */
    switch (nbits) {
    case 8:
        GET_BIT_MSB(block, val);
        // fall through
    case 7:
        GET_BIT_MSB(block, val);
        // fall through
    case 6:
        GET_BIT_MSB(block, val);
        // fall through
    case 5:
        GET_BIT_MSB(block, val);
        // fall through
    case 4:
        GET_BIT_MSB(block, val);
        // fall through
    case 3:
        GET_BIT_MSB(block, val);
        // fall through
    case 2:
        GET_BIT_MSB(block, val);
        // fall through
    case 1:
        GET_BIT_MSB(block, val);
        break;

    default:
        for (i = 0; i < nbits; i++)
            GET_BIT_MSB(block, val);
    }

    return val;
}
250
251
/*
252
 * Can store up to 24-bits worth of data encoded in an integer value
253
 * Possibly we'd want to have a less optimal store_bits function when dealing
254
 * with nbits > 24, but for now we assume the codes generated are never
255
 * that big. (Given this is only possible with 121392 or more
256
 * characters with exactly the correct frequency distribution we check
257
 * for it elsewhere.)
258
 */
259
6.67k
/*
 * Appends the low 'nbits' bits of 'val' to the block, most significant
 * bit first, growing the buffer as needed.
 *
 * The buffer is kept at least 8 bytes larger than block->alloc so that
 * zeroing the "next" byte after a write never runs off the end.
 *
 * Returns 0 on success;
 *        -1 on memory allocation failure.  In that case block->data and
 *           block->alloc are left untouched, so the caller still owns
 *           (and can free) the existing buffer.
 */
static int store_bits_MSB(cram_block *block, uint64_t val, int nbits) {
    /* 64-bit so the shift below is well defined for any nbits <= 64 */
    uint64_t mask;

    if (block->byte+8 >= block->alloc) {
        /*
         * Go through a temporary: assigning realloc()'s result directly
         * to block->data would leak the old buffer on failure.
         */
        if (block->byte) {
            void *grown = realloc(block->data, block->alloc * 2 + 8);
            if (!grown)
                return -1;
            block->data = grown;
            block->alloc *= 2;
        } else {
            void *fresh = realloc(block->data, 1024 + 8);
            if (!fresh)
                return -1;
            block->data = fresh;
            block->alloc = 1024;
            block->data[0] = 0; // initialise first byte of buffer
        }
    }

    /* fits in current bit-field */
    if (nbits <= block->bit+1) {
        block->data[block->byte] |= (val << (block->bit+1-nbits));
        if ((block->bit-=nbits) == -1) {
            block->bit = 7;
            block->byte++;
            block->data[block->byte] = 0;
        }
        return 0;
    }

    /* Top bits complete the current byte; nbits becomes what is left */
    block->data[block->byte] |= (val >> (nbits -= block->bit+1));
    block->bit = 7;
    block->byte++;
    block->data[block->byte] = 0;

    /* Remaining bits are written one at a time, highest first */
    mask = (uint64_t)1 << (nbits-1);
    do {
        if (val & mask)
            block->data[block->byte] |= (1 << block->bit);
        if (--block->bit == -1) {
            block->bit = 7;
            block->byte++;
            block->data[block->byte] = 0;
        }
        mask >>= 1;
    } while(--nbits);

    return 0;
}
313
314
/*
315
 * Returns the next 'size' bytes from a block, or NULL if insufficient
316
 * data left.This is just a pointer into the block data and not an
317
 * allocated object, so do not free the result.
318
 */
319
0
/*
 * Hands back a pointer to the next 'size' bytes of the block and moves
 * the block's read index past them.  The index is advanced even when
 * the request overruns the data.
 *
 * Returns a pointer into b->data (not to be freed) on success;
 *         NULL if the advanced index exceeds b->uncomp_size.
 */
static char *cram_extract_block(cram_block *b, int size) {
    char *start = (char *)b->data + b->idx;

    b->idx += size;

    return (b->idx > b->uncomp_size) ? NULL : start;
}
327
328
/*
329
 * ---------------------------------------------------------------------------
330
 * EXTERNAL
331
 *
332
 * In CRAM 3.0 and earlier, E_EXTERNAL use the data type to determine the
333
 * size of the object being returned.  This type is hard coded in the
334
 * spec document (changing from uint32 to uint64 requires a spec change)
335
 * and there is no data format introspection so implementations have
336
 * to determine which size to use based on version numbers.   It also
337
 * doesn't support signed data.
338
 *
339
 * With CRAM 4.0 onwards the size and sign of the data is no longer stated
340
 * explicitly in the specification.  Instead EXTERNAL is replaced by three
341
 * new encodings, for bytes and signed / unsigned integers which used a
342
 * variable sized encoding.
343
 *
344
 * For simplicity we use the same encode and decode functions for
345
 * bytes (CRAM4) and external (CRAM3). Given we already had code to
346
 * replace codec + type into a function pointer it makes little
347
 * difference how we ended up at that function.  However we disallow
348
 * this codec to operate on integer data for CRAM4 onwards.
349
 */
350
int cram_external_decode_int(cram_slice *slice, cram_codec *c,
351
0
                             cram_block *in, char *out, int *out_size) {
352
0
    char *cp;
353
0
    cram_block *b;
354
355
    /* Find the external block */
356
0
    b = cram_get_block_by_id(slice, c->u.external.content_id);
357
0
    if (!b)
358
0
        return *out_size?-1:0;
359
360
0
    cp = (char *)b->data + b->idx;
361
    // E_INT and E_LONG are guaranteed single item queries
362
0
    int err = 0;
363
0
    *(int32_t *)out = c->vv->varint_get32(&cp, (char *)b->data + b->uncomp_size, &err);
364
0
    b->idx = cp - (char *)b->data;
365
0
    *out_size = 1;
366
367
0
    return err ? -1 : 0;
368
0
}
369
370
int cram_external_decode_long(cram_slice *slice, cram_codec *c,
371
0
                              cram_block *in, char *out, int *out_size) {
372
0
    char *cp;
373
0
    cram_block *b;
374
375
    /* Find the external block */
376
0
    b = cram_get_block_by_id(slice, c->u.external.content_id);
377
0
    if (!b)
378
0
        return *out_size?-1:0;
379
380
0
    cp = (char *)b->data + b->idx;
381
    // E_INT and E_LONG are guaranteed single item queries
382
0
    int err = 0;
383
0
    *(int64_t *)out = c->vv->varint_get64(&cp, (char *)b->data + b->uncomp_size, &err);
384
0
    b->idx = cp - (char *)b->data;
385
0
    *out_size = 1;
386
387
0
    return err ? -1 : 0;
388
0
}
389
390
int cram_external_decode_char(cram_slice *slice, cram_codec *c,
391
                              cram_block *in, char *out,
392
0
                              int *out_size) {
393
0
    char *cp;
394
0
    cram_block *b;
395
396
    /* Find the external block */
397
0
    b = cram_get_block_by_id(slice, c->u.external.content_id);
398
0
    if (!b)
399
0
        return *out_size?-1:0;
400
401
0
    cp = cram_extract_block(b, *out_size);
402
0
    if (!cp)
403
0
        return -1;
404
405
0
    if (out)
406
0
        memcpy(out, cp, *out_size);
407
0
    return 0;
408
0
}
409
410
static int cram_external_decode_block(cram_slice *slice, cram_codec *c,
411
                                      cram_block *in, char *out_,
412
0
                                      int *out_size) {
413
0
    char *cp;
414
0
    cram_block *out = (cram_block *)out_;
415
0
    cram_block *b = NULL;
416
417
    /* Find the external block */
418
0
    b = cram_get_block_by_id(slice, c->u.external.content_id);
419
0
    if (!b)
420
0
        return *out_size?-1:0;
421
422
0
    cp = cram_extract_block(b, *out_size);
423
0
    if (!cp)
424
0
        return -1;
425
426
0
    BLOCK_APPEND(out, cp, *out_size);
427
0
    return 0;
428
429
0
 block_err:
430
0
    return -1;
431
0
}
432
433
1.48k
/*
 * Releases an EXTERNAL decoder.  Passing NULL is harmless.
 */
void cram_external_decode_free(cram_codec *c) {
    /* free(NULL) is defined to do nothing, so no guard is required */
    free(c);
}
437
438
439
0
/*
 * Reports the uncompressed size of the external block identified by the
 * codec's content_id.
 *
 * Returns the block's uncomp_size on success;
 *        -1 if no such block is found.
 */
int cram_external_decode_size(cram_slice *slice, cram_codec *c) {
    cram_block *b = cram_get_block_by_id(slice, c->u.external.content_id);

    return b ? b->uncomp_size : -1;
}
449
450
0
/*
 * Returns whatever cram_get_block_by_id() yields for the codec's
 * external content_id within this slice.
 */
cram_block *cram_external_get_block(cram_slice *slice, cram_codec *c) {
    cram_block *b = cram_get_block_by_id(slice, c->u.external.content_id);

    return b;
}
453
454
0
/*
 * Appends a textual description of an EXTERNAL codec, in the form
 * "EXTERNAL(id=N)", to 'ks'.
 *
 * Returns 0 on success;
 *        -1 if ksprintf reports a failure.
 */
int cram_external_describe(cram_codec *c, kstring_t *ks) {
    int r = ksprintf(ks, "EXTERNAL(id=%d)", c->u.external.content_id);

    if (r < 0)
        return -1;
    return 0;
}
458
459
/*
 * Builds an EXTERNAL decoder from its serialised parameters.
 *
 * 'data' / 'size' hold the encoded parameter bytes, which must consist
 * of exactly one varint: the content_id of the block to read from.
 * 'option' selects the decode function:
 *
 *   CRAM major version < 4:
 *     E_INT                 -> cram_external_decode_int
 *     E_LONG                -> cram_external_decode_long
 *     E_BYTE, E_BYTE_ARRAY  -> cram_external_decode_char
 *     anything else         -> cram_external_decode_block
 *
 *   CRAM major version >= 4 (bytes only, codec must be E_EXTERNAL):
 *     E_BYTE_ARRAY_BLOCK    -> cram_external_decode_block
 *     E_BYTE, E_BYTE_ARRAY  -> cram_external_decode_char
 *     anything else         -> rejected as malformed
 *
 * 'hdr' is not used.
 *
 * Returns a newly allocated codec on success;
 *         NULL on allocation failure or malformed input (the latter is
 *         also logged).
 */
cram_codec *cram_external_decode_init(cram_block_compression_hdr *hdr,
                                      char *data, int size,
                                      enum cram_encoding codec,
                                      enum cram_external_type option,
                                      int version, varint_vec *vv) {
    cram_codec *c = NULL;
    char *cp = data;

    if (size < 1)
        goto malformed;

    c = malloc(sizeof(*c));
    if (c == NULL)
        return NULL;

    c->codec = E_EXTERNAL;

    if (CRAM_MAJOR_VERS(version) < 4) {
        // CRAM 3 and earlier encodes integers as EXTERNAL, so the
        // option field tells us which serialisation format applies.
        if (option == E_INT)
            c->decode = cram_external_decode_int;
        else if (option == E_LONG)
            c->decode = cram_external_decode_long;
        else if (option == E_BYTE_ARRAY || option == E_BYTE)
            c->decode = cram_external_decode_char;
        else
            c->decode = cram_external_decode_block;
    } else {
        // Version 4 does not permit integer data to be encoded as a
        // series of bytes; only single bytes and byte arrays are valid.
        if (codec != E_EXTERNAL)
            goto malformed;

        if (option == E_BYTE_ARRAY_BLOCK)
            c->decode = cram_external_decode_block;
        else if (option == E_BYTE || option == E_BYTE_ARRAY)
            c->decode = cram_external_decode_char;
        else
            goto malformed;
    }

    c->free      = cram_external_decode_free;
    c->size      = cram_external_decode_size;
    c->get_block = cram_external_get_block;
    c->describe  = cram_external_describe;

    c->u.external.content_id = vv->varint_get32(&cp, data+size, NULL);

    /* The content id must account for every parameter byte */
    if (cp - data != size)
        goto malformed;

    c->u.external.type = option;

    return c;

 malformed:
    hts_log_error("Malformed external header stream");
    free(c);
    return NULL;
}
522
523
/*
 * Appends one unsigned 32-bit integer, read from 'in', to the codec's
 * output block using the codec's varint_put32_blk writer.  'slice' and
 * 'in_size' are not used.
 *
 * Returns 0 on success;
 *        -1 if the writer returns a negative value.
 */
int cram_external_encode_int(cram_slice *slice, cram_codec *c,
                             char *in, int in_size) {
    uint32_t value = *(uint32_t *)in;

    if (c->vv->varint_put32_blk(c->out, value) < 0)
        return -1;
    return 0;
}
528
529
/*
 * Appends one signed 32-bit integer, read from 'in', to the codec's
 * output block using the codec's varint_put32s_blk writer.  'slice' and
 * 'in_size' are not used.
 *
 * Returns 0 on success;
 *        -1 if the writer returns a negative value.
 */
int cram_external_encode_sint(cram_slice *slice, cram_codec *c,
                             char *in, int in_size) {
    int32_t value = *(int32_t *)in;

    if (c->vv->varint_put32s_blk(c->out, value) < 0)
        return -1;
    return 0;
}
534
535
/*
 * Appends one unsigned 64-bit integer, read from 'in', to the codec's
 * output block using the codec's varint_put64_blk writer.  'slice' and
 * 'in_size' are not used.
 *
 * Returns 0 on success;
 *        -1 if the writer returns a negative value.
 */
int cram_external_encode_long(cram_slice *slice, cram_codec *c,
                             char *in, int in_size) {
    uint64_t value = *(uint64_t *)in;

    if (c->vv->varint_put64_blk(c->out, value) < 0)
        return -1;
    return 0;
}
540
541
/*
 * Appends one signed 64-bit integer, read from 'in', to the codec's
 * output block using the codec's varint_put64s_blk writer.  'slice' and
 * 'in_size' are not used.
 *
 * Returns 0 on success;
 *        -1 if the writer returns a negative value.
 */
int cram_external_encode_slong(cram_slice *slice, cram_codec *c,
                               char *in, int in_size) {
    int64_t value = *(int64_t *)in;

    if (c->vv->varint_put64s_blk(c->out, value) < 0)
        return -1;
    return 0;
}
546
547
/*
 * Appends 'in_size' raw bytes from 'in' to the codec's output block.
 * 'slice' is not used.
 *
 * Returns 0 on success;
 *        -1 if the append fails.
 */
int cram_external_encode_char(cram_slice *slice, cram_codec *c,
                              char *in, int in_size) {
    /* BLOCK_APPEND jumps to block_err on failure */
    BLOCK_APPEND(c->out, in, in_size);

    return 0;

 block_err:
    return -1;
}
555
556
181k
/*
 * Releases an EXTERNAL encoder.  Passing NULL is harmless.
 */
void cram_external_encode_free(cram_codec *c) {
    if (c != NULL)
        free(c);
}
561
562
/*
 * Serialises an EXTERNAL encoder's description into block 'b'.
 *
 * Written in order: the optional 'prefix' string (without its NUL), the
 * codec id, the byte length of the parameters, then the parameters
 * themselves (the content_id as a varint).  'version' is not used.
 *
 * Returns the number of bytes added to 'b' on success;
 *        -1 on failure.
 */
int cram_external_encode_store(cram_codec *c, cram_block *b, char *prefix,
                               int version) {
    char tmp[99];
    char *tp = tmp;
    char *tpend = tmp+99;
    int len = 0, r = 0, n;

    if (prefix) {
        size_t l = strlen(prefix);
        BLOCK_APPEND(b, prefix, l);
        len += l;
    }

    /* Stage the parameters first so their length is known */
    tp += c->vv->varint_put32(tp, tpend, c->u.e_external.content_id);

    n = c->vv->varint_put32_blk(b, c->codec);
    len += n;
    r |= n;

    n = c->vv->varint_put32_blk(b, tp-tmp);
    len += n;
    r |= n;

    BLOCK_APPEND(b, tmp, tp-tmp);
    len += tp-tmp;

    /* A negative writer result leaves the OR-ed status negative */
    if (r <= 0)
        goto block_err;

    return len;

 block_err:
    return -1;
}
585
586
/*
 * Creates an EXTERNAL encoder.
 *
 * 'dat' carries the content_id of the destination block, passed as an
 * integer cast to a pointer.  'option' selects the encode function:
 *
 *   CRAM major version >= 4: only E_BYTE and E_BYTE_ARRAY are accepted
 *     (with codec E_EXTERNAL), both using cram_external_encode_char.
 *
 *   CRAM major version < 4:
 *     E_INT                 -> cram_external_encode_int
 *     E_LONG                -> cram_external_encode_long
 *     E_BYTE, E_BYTE_ARRAY  -> cram_external_encode_char
 *     anything else         -> abort()
 *
 * 'st' is not used.  The 'vv' argument is not stored by this function.
 *
 * Returns a newly allocated codec on success;
 *         NULL on allocation failure or an unsupported codec / option
 *         combination for CRAM v4.
 */
cram_codec *cram_external_encode_init(cram_stats *st,
                                      enum cram_encoding codec,
                                      enum cram_external_type option,
                                      void *dat,
                                      int version, varint_vec *vv) {
    cram_codec *c;

    c = malloc(sizeof(*c));
    if (!c)
        return NULL;
    c->codec = E_EXTERNAL;
    c->free = cram_external_encode_free;
    if (CRAM_MAJOR_VERS(version) >= 4) {
        // Version 4 does not permit integer data to be encoded as a
        // series of bytes.  This is used purely for bytes, either
        // singular or declared as arrays
        if (codec != E_EXTERNAL
            || (option != E_BYTE && option != E_BYTE_ARRAY)) {
            // Release the half-built codec rather than leaking it
            free(c);
            return NULL;
        }
        c->encode = cram_external_encode_char;
    } else {
        // CRAM 3 and earlier encodes integers as EXTERNAL.  We need
        // use the option field to indicate the input data format so
        // we know which serialisation format to use.
        if (option == E_INT)
            c->encode = cram_external_encode_int;
        else if (option == E_LONG)
            c->encode = cram_external_encode_long;
        else if (option == E_BYTE_ARRAY || option == E_BYTE)
            c->encode = cram_external_encode_char;
        else
            abort();
    }
    c->store = cram_external_encode_store;
    c->flush = NULL;

    c->u.e_external.content_id = (size_t)dat;

    return c;
}
631
632
/*
633
 * ---------------------------------------------------------------------------
634
 * VARINT
635
 *
636
 * In CRAM 3.0 and earlier, E_EXTERNAL stored both integers in ITF8
637
 * format as well as bytes.  In CRAM 4 EXTERNAL is only for bytes and
638
 * byte arrays, with two dedicated encodings for integers:
639
 * VARINT_SIGNED and VARINT_UNSIGNED.  These also differ a little to
640
 * EXTERNAL with the addition of an offset field, meaning we can store
641
 * values in, say, the range -2 to 1 million without needing to use
642
 * a signed zig-zag transformation.
643
 */
644
int cram_varint_decode_int(cram_slice *slice, cram_codec *c,
645
0
                           cram_block *in, char *out, int *out_size) {
646
0
    char *cp;
647
0
    cram_block *b;
648
649
    /* Find the data block */
650
0
    b = cram_get_block_by_id(slice, c->u.varint.content_id);
651
0
    if (!b)
652
0
        return *out_size?-1:0;
653
654
0
    cp = (char *)b->data + b->idx;
655
    // E_INT and E_LONG are guaranteed single item queries
656
0
    int err = 0;
657
0
    *(int32_t *)out = c->vv->varint_get32(&cp,
658
0
                                          (char *)b->data + b->uncomp_size,
659
0
                                          &err) + c->u.varint.offset;
660
0
    b->idx = cp - (char *)b->data;
661
0
    *out_size = 1;
662
663
0
    return err ? -1 : 0;
664
0
}
665
666
int cram_varint_decode_sint(cram_slice *slice, cram_codec *c,
667
0
                            cram_block *in, char *out, int *out_size) {
668
0
    char *cp;
669
0
    cram_block *b;
670
671
    /* Find the data block */
672
0
    b = cram_get_block_by_id(slice, c->u.varint.content_id);
673
0
    if (!b)
674
0
        return *out_size?-1:0;
675
676
0
    cp = (char *)b->data + b->idx;
677
    // E_INT and E_LONG are guaranteed single item queries
678
0
    int err = 0;
679
0
    *(int32_t *)out = c->vv->varint_get32s(&cp,
680
0
                                           (char *)b->data + b->uncomp_size,
681
0
                                           &err) + c->u.varint.offset;
682
0
    b->idx = cp - (char *)b->data;
683
0
    *out_size = 1;
684
685
0
    return err ? -1 : 0;
686
0
}
687
688
int cram_varint_decode_long(cram_slice *slice, cram_codec *c,
689
0
                            cram_block *in, char *out, int *out_size) {
690
0
    char *cp;
691
0
    cram_block *b;
692
693
    /* Find the data block */
694
0
    b = cram_get_block_by_id(slice, c->u.varint.content_id);
695
0
    if (!b)
696
0
        return *out_size?-1:0;
697
698
0
    cp = (char *)b->data + b->idx;
699
    // E_INT and E_LONG are guaranteed single item queries
700
0
    int err = 0;
701
0
    *(int64_t *)out = c->vv->varint_get64(&cp,
702
0
                                          (char *)b->data + b->uncomp_size,
703
0
                                          &err) + c->u.varint.offset;
704
0
    b->idx = cp - (char *)b->data;
705
0
    *out_size = 1;
706
707
0
    return err ? -1 : 0;
708
0
}
709
710
int cram_varint_decode_slong(cram_slice *slice, cram_codec *c,
711
0
                             cram_block *in, char *out, int *out_size) {
712
0
    char *cp;
713
0
    cram_block *b;
714
715
    /* Find the data block */
716
0
    b = cram_get_block_by_id(slice, c->u.varint.content_id);
717
0
    if (!b)
718
0
        return *out_size?-1:0;
719
720
0
    cp = (char *)b->data + b->idx;
721
    // E_INT and E_LONG are guaranteed single item queries
722
0
    int err = 0;
723
0
    *(int64_t *)out = c->vv->varint_get64s(&cp,
724
0
                                           (char *)b->data + b->uncomp_size,
725
0
                                           &err) + c->u.varint.offset;
726
0
    b->idx = cp - (char *)b->data;
727
0
    *out_size = 1;
728
729
0
    return err ? -1 : 0;
730
0
}
731
732
438
/*
 * Releases a VARINT decoder.  Passing NULL is harmless.
 */
void cram_varint_decode_free(cram_codec *c) {
    /* free(NULL) is defined to do nothing, so no guard is required */
    free(c);
}
736
737
0
/*
 * Reports the uncompressed size of the block identified by the codec's
 * varint content_id.
 *
 * Returns the block's uncomp_size on success;
 *        -1 if no such block is found.
 */
int cram_varint_decode_size(cram_slice *slice, cram_codec *c) {
    cram_block *b = cram_get_block_by_id(slice, c->u.varint.content_id);

    return b ? b->uncomp_size : -1;
}
747
748
0
/*
 * Returns whatever cram_get_block_by_id() yields for the codec's
 * varint content_id within this slice.
 */
cram_block *cram_varint_get_block(cram_slice *slice, cram_codec *c) {
    cram_block *b = cram_get_block_by_id(slice, c->u.varint.content_id);

    return b;
}
751
752
0
/*
 * Appends a textual description of a VARINT codec, in the form
 * "VARINT(id=N,offset=N,type=N)", to 'ks'.
 *
 * Returns 0 on success;
 *        -1 if ksprintf reports a failure.
 */
int cram_varint_describe(cram_codec *c, kstring_t *ks) {
    int r = ksprintf(ks, "VARINT(id=%d,offset=%"PRId64",type=%d)",
                     c->u.varint.content_id,
                     c->u.varint.offset,
                     c->u.varint.type);

    if (r < 0)
        return -1;
    return 0;
}
759
760
/*
 * Builds a VARINT decoder from its serialised parameters.
 *
 * 'data' / 'size' hold the encoded parameter bytes, which must consist
 * of exactly two varints: the content_id of the block to read from and
 * a signed 64-bit offset that is added to every decoded value.
 *
 * 'codec' selects signed or unsigned decoding; 'option' selects the
 * width (E_INT gives the 32-bit variants, anything else the 64-bit
 * ones).  'hdr' and 'version' are not used.
 *
 * Returns a newly allocated codec on success;
 *         NULL on allocation failure, an unsupported codec, or
 *         malformed input (the latter is also logged).
 */
cram_codec *cram_varint_decode_init(cram_block_compression_hdr *hdr,
                                    char *data, int size,
                                    enum cram_encoding codec,
                                    enum cram_external_type option,
                                    int version, varint_vec *vv) {
    cram_codec *c;
    char *cp = data, *cp_end = data+size;

    if (!(c = malloc(sizeof(*c))))
        return NULL;

    c->codec  = codec;

    // Function pointer choice is theoretically by codec type.
    // Given we have some vars as int32 and some as int64 we
    // use option too for sizing, although on disk format
    // does not change.
    switch(codec) {
    case E_VARINT_UNSIGNED:
        c->decode = (option == E_INT)
            ? cram_varint_decode_int
            : cram_varint_decode_long;
        break;
    case E_VARINT_SIGNED:
        c->decode = (option == E_INT)
            ? cram_varint_decode_sint
            : cram_varint_decode_slong;
        break;
    default:
        // Release the half-built codec rather than leaking it
        free(c);
        return NULL;
    }

    c->free   = cram_varint_decode_free;
    c->size   = cram_varint_decode_size;
    c->get_block = cram_varint_get_block;
    c->describe = cram_varint_describe;

    c->u.varint.content_id = vv->varint_get32 (&cp, cp_end, NULL);
    c->u.varint.offset     = vv->varint_get64s(&cp, cp_end, NULL);

    // The two fields must account for every parameter byte
    if (cp - data != size) {
        // Report through the library logger, as the EXTERNAL decoder
        // initialiser does, rather than writing to stderr directly.
        hts_log_error("Malformed varint header stream");
        free(c);
        return NULL;
    }

    c->u.varint.type = option;

    return c;
}
810
811
/*
 * Reads one unsigned 32-bit integer from 'in', subtracts the codec's
 * offset, and appends the result to the codec's output block using the
 * varint_put32_blk writer.  'slice' and 'in_size' are not used.
 *
 * Returns 0 on success;
 *        -1 if the writer returns a negative value.
 */
int cram_varint_encode_int(cram_slice *slice, cram_codec *c,
                           char *in, int in_size) {
    uint32_t value = *(uint32_t *)in;

    if (c->vv->varint_put32_blk(c->out, value - c->u.varint.offset) < 0)
        return -1;
    return 0;
}
817
818
/*
 * Reads one signed 32-bit integer from 'in', subtracts the codec's
 * offset, and appends the result to the codec's output block using the
 * varint_put32s_blk writer.  'slice' and 'in_size' are not used.
 *
 * Returns 0 on success;
 *        -1 if the writer returns a negative value.
 */
int cram_varint_encode_sint(cram_slice *slice, cram_codec *c,
                            char *in, int in_size) {
    int32_t value = *(int32_t *)in;

    if (c->vv->varint_put32s_blk(c->out, value - c->u.varint.offset) < 0)
        return -1;
    return 0;
}
824
825
/*
 * Reads one unsigned 64-bit integer from 'in', subtracts the codec's
 * offset, and appends the result to the codec's output block using the
 * varint_put64_blk writer.  'slice' and 'in_size' are not used.
 *
 * Returns 0 on success;
 *        -1 if the writer returns a negative value.
 */
int cram_varint_encode_long(cram_slice *slice, cram_codec *c,
                            char *in, int in_size) {
    uint64_t value = *(uint64_t *)in;

    if (c->vv->varint_put64_blk(c->out, value - c->u.varint.offset) < 0)
        return -1;
    return 0;
}
831
832
/*
 * Reads one signed 64-bit integer from 'in', subtracts the codec's
 * offset, and appends the result to the codec's output block using the
 * varint_put64s_blk writer.  'slice' and 'in_size' are not used.
 *
 * Returns 0 on success;
 *        -1 if the writer returns a negative value.
 */
int cram_varint_encode_slong(cram_slice *slice, cram_codec *c,
                             char *in, int in_size) {
    int64_t value = *(int64_t *)in;

    if (c->vv->varint_put64s_blk(c->out, value - c->u.varint.offset) < 0)
        return -1;
    return 0;
}
838
839
0
/*
 * Releases a VARINT encoder.  Passing NULL is harmless.
 */
void cram_varint_encode_free(cram_codec *c) {
    if (c != NULL)
        free(c);
}
844
845
/*
 * Serialises a VARINT encoder's description into block 'b'.
 *
 * Written in order: the optional 'prefix' string (without its NUL), the
 * codec id, the byte length of the parameters, then the parameters
 * themselves (content_id followed by the signed offset).  'version' is
 * not used.
 *
 * Returns the number of bytes added to 'b' on success;
 *        -1 on failure.
 */
int cram_varint_encode_store(cram_codec *c, cram_block *b, char *prefix,
                             int version) {
    char tmp[99], *tp = tmp, *tpend = tmp + sizeof(tmp);
    int len = 0, n;

    if (prefix) {
        size_t l = strlen(prefix);
        BLOCK_APPEND(b, prefix, l);
        len += l;
    }

    // Stage the parameters first so their length is known.  The end of
    // the scratch buffer is passed so the writers can bound themselves,
    // as cram_external_encode_store does.
    tp += c->vv->varint_put32 (tp, tpend, c->u.e_varint.content_id);
    tp += c->vv->varint_put64s(tp, tpend, c->u.e_varint.offset);

    // A negative return from the block writers signals failure; do not
    // fold it into the byte count.
    if ((n = c->vv->varint_put32_blk(b, c->codec)) < 0)
        goto block_err;
    len += n;

    if ((n = c->vv->varint_put32_blk(b, tp-tmp)) < 0)
        goto block_err;
    len += n;

    BLOCK_APPEND(b, tmp, tp-tmp);
    len += tp-tmp;

    return len;

 block_err:
    return -1;
}
868
869
/*
 * Creates a VARINT encoder.
 *
 * 'dat' carries the content_id of the destination block, passed as an
 * integer cast to a pointer.  'codec' selects signed or unsigned
 * encoding and 'option' the width (E_INT gives the 32-bit variants,
 * anything else the 64-bit ones).
 *
 * When statistics are supplied in 'st', an offset may be chosen from
 * st->min_val; for small negative minimums with a much larger maximum
 * the codec is also switched to E_VARINT_UNSIGNED.  'version' is not
 * used, and the 'vv' argument is not stored by this function.
 *
 * Returns a newly allocated codec on success;
 *         NULL on allocation failure or an unsupported codec.
 */
cram_codec *cram_varint_encode_init(cram_stats *st,
                                    enum cram_encoding codec,
                                    enum cram_external_type option,
                                    void *dat,
                                    int version, varint_vec *vv) {
    cram_codec *c;

    if (!(c = malloc(sizeof(*c))))
        return NULL;

    c->u.e_varint.offset = 0;
    if (st) {
        // Marginal difference so far! Not worth the hassle?
        // NOTE(review): the encoders in this file write
        // "value - offset" and the decoders add the offset back, so
        // an offset of -min_val moves values away from zero rather
        // than towards it.  Confirm the intended sign convention.
        if (st->min_val < 0 && st->min_val >= -127
            && st->max_val / -st->min_val > 100) {
            c->u.e_varint.offset = -st->min_val;
            codec = E_VARINT_UNSIGNED;
        } else if (st->min_val > 0) {
            c->u.e_varint.offset = -st->min_val;
        }
    }

    c->codec = codec;
    c->free = cram_varint_encode_free;

    // Function pointer choice is theoretically by codec type.
    // Given we have some vars as int32 and some as int64 we
    // use option too for sizing, although on disk format
    // does not change.
    switch (codec) {
    case E_VARINT_UNSIGNED:
        c->encode = (option == E_INT)
            ? cram_varint_encode_int
            : cram_varint_encode_long;
        break;
    case E_VARINT_SIGNED:
        c->encode = (option == E_INT)
            ? cram_varint_encode_sint
            : cram_varint_encode_slong;
        break;
    default:
        // Release the half-built codec rather than leaking it
        free(c);
        return NULL;
    }
    c->store = cram_varint_encode_store;
    c->flush = NULL;

    c->u.e_varint.content_id = (size_t)dat;

    return c;
}
919
/*
920
 * ---------------------------------------------------------------------------
921
 * CONST_BYTE and CONST_INT
922
 */
923
int cram_const_decode_byte(cram_slice *slice, cram_codec *c,
924
0
                           cram_block *in, char *out, int *out_size) {
925
0
    int i, n;
926
927
0
    for (i = 0, n = *out_size; i < n; i++)
928
0
        out[i] = c->u.xconst.val;
929
930
0
    return 0;
931
0
}
932
933
int cram_const_decode_int(cram_slice *slice, cram_codec *c,
934
0
                          cram_block *in, char *out, int *out_size) {
935
0
    int32_t *out_i = (int32_t *)out;
936
0
    int i, n;
937
938
0
    for (i = 0, n = *out_size; i < n; i++)
939
0
        out_i[i] = c->u.xconst.val;
940
941
0
    return 0;
942
0
}
943
944
int cram_const_decode_long(cram_slice *slice, cram_codec *c,
945
0
                           cram_block *in, char *out, int *out_size) {
946
0
    int64_t *out_i = (int64_t *)out;
947
0
    int i, n;
948
949
0
    for (i = 0, n = *out_size; i < n; i++)
950
0
        out_i[i] = c->u.xconst.val;
951
952
0
    return 0;
953
0
}
954
955
363
/*
 * Releases a CONST codec.  Passing NULL is harmless.
 */
void cram_const_decode_free(cram_codec *c) {
    free(c);  // free(NULL) is a no-op, so no guard is required
}
959
960
0
/*
 * Size callback for the CONST codec; it always reports 0.
 */
int cram_const_decode_size(cram_slice *slice, cram_codec *c) {
    return 0;
}
963
964
0
/*
 * Appends a textual description of the CONST codec ("CONST(val=N)") to ks.
 *
 * Returns 0 on success, -1 if ksprintf reports a failure.
 */
int cram_const_describe(cram_codec *c, kstring_t *ks) {
    int rc = ksprintf(ks, "CONST(val=%"PRId64")", c->u.xconst.val);

    if (rc < 0)
        return -1;
    return 0;
}
968
969
/*
 * Creates a CONST decoder from its serialised parameters.
 *
 * data/size  Encoded parameters: a single signed 64-bit varint holding the
 *            constant.  Exactly size bytes must be consumed.
 * codec      Stored in the codec; E_CONST_BYTE selects the byte decoder.
 * option     For other codecs, E_INT selects the 32-bit decoder and any
 *            other value the 64-bit one.
 * hdr, version are not used here.
 *
 * Returns the new codec on success,
 *         NULL on allocation failure or malformed input.
 */
cram_codec *cram_const_decode_init(cram_block_compression_hdr *hdr,
                                   char *data, int size,
                                   enum cram_encoding codec,
                                   enum cram_external_type option,
                                   int version, varint_vec *vv) {
    char *cp = data;
    cram_codec *c = malloc(sizeof(*c));

    if (c == NULL)
        return NULL;

    c->codec     = codec;
    c->free      = cram_const_decode_free;
    c->size      = cram_const_decode_size;
    c->get_block = NULL;
    c->describe  = cram_const_describe;

    if (codec == E_CONST_BYTE) {
        c->decode = cram_const_decode_byte;
    } else {
        c->decode = (option == E_INT)
            ? cram_const_decode_int
            : cram_const_decode_long;
    }

    c->u.xconst.val = vv->varint_get64s(&cp, data+size, NULL);

    // The varint must account for the whole parameter block.
    if (cp - data == size)
        return c;

    fprintf(stderr, "Malformed const header stream\n");
    free(c);
    return NULL;
}
1002
1003
/*
 * Encode callback for the CONST codec.  It does nothing with its input
 * and always returns 0.
 */
int cram_const_encode(cram_slice *slice, cram_codec *c,
                      char *in, int in_size) {
    return 0;
}
1007
1008
/*
 * Serialises the CONST codec description into block b.
 *
 * Layout written: optional prefix string, codec id, byte length of the
 * parameters, then the constant as a signed 64-bit varint.
 *
 * c        Codec to describe.
 * b        Destination block.
 * prefix   Optional NUL-terminated string written first (may be NULL).
 * version  Not used by this function.
 *
 * Returns the number of bytes written on success,
 *         -1 on failure.
 */
int cram_const_encode_store(cram_codec *c, cram_block *b, char *prefix,
                            int version) {
    char tmp[99], *tp = tmp;
    int len = 0, r = 0, n;

    if (prefix) {
        size_t l = strlen(prefix);
        BLOCK_APPEND(b, prefix, l);
        len += l;
    }

    tp += c->vv->varint_put64s(tp, NULL, c->u.xconst.val);

    // Accumulate the results in r so that a failing write (negative
    // return) is detected, matching the other *_encode_store functions.
    len += (n = c->vv->varint_put32_blk(b, c->codec)); r |= n;
    len += (n = c->vv->varint_put32_blk(b, tp-tmp));   r |= n;
    BLOCK_APPEND(b, tmp, tp-tmp);
    len += tp-tmp;

    if (r > 0)
        return len;

 block_err:
    return -1;
}
1030
1031
/*
 * Creates a CONST encoder whose constant is st->min_val.
 *
 * codec is stored in the new codec; option, dat, version and vv are not
 * used here.  st must not be NULL.
 *
 * Returns the new codec on success,
 *         NULL on allocation failure.
 */
cram_codec *cram_const_encode_init(cram_stats *st,
                                   enum cram_encoding codec,
                                   enum cram_external_type option,
                                   void *dat,
                                   int version, varint_vec *vv) {
    cram_codec *c = malloc(sizeof(*c));

    if (c == NULL)
        return NULL;

    c->codec  = codec;
    c->encode = cram_const_encode;      // a no-op
    c->store  = cram_const_encode_store;
    c->flush  = NULL;
    c->free   = cram_const_decode_free; // same as decode
    c->u.e_xconst.val = st->min_val;

    return c;
}
1050
1051
/*
1052
 * ---------------------------------------------------------------------------
1053
 * BETA
1054
 */
1055
0
/*
 * BETA decoder producing int64_t values.
 *
 * Reads *out_size values of c->u.beta.nbits bits each from in, subtracts
 * the codec offset and stores them in out (treated as int64_t[]).  When
 * nbits is zero nothing is read and every value is -offset.
 *
 * Returns 0 on success,
 *         -1 if the input block does not hold enough bits.
 */
int cram_beta_decode_long(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
    int64_t *dst = (int64_t *)out;
    const int count = *out_size;
    int k;

    if (!c->u.beta.nbits) {
        // Zero-width symbols: the only representable value is -offset.
        for (k = 0; k < count; k++)
            dst[k] = -c->u.beta.offset;
        return 0;
    }

    if (cram_not_enough_bits(in, c->u.beta.nbits * count))
        return -1;

    for (k = 0; k < count; k++)
        dst[k] = get_bits_MSB(in, c->u.beta.nbits) - c->u.beta.offset;

    return 0;
}
1072
1073
0
/*
 * BETA decoder producing int32_t values.
 *
 * Reads *out_size values of c->u.beta.nbits bits each from in, subtracts
 * the codec offset and stores them in out (treated as int32_t[]).  When
 * nbits is zero nothing is read and every value is -offset.
 *
 * Returns 0 on success,
 *         -1 if the input block does not hold enough bits.
 */
int cram_beta_decode_int(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
    int32_t *dst = (int32_t *)out;
    const int count = *out_size;
    int k;

    if (!c->u.beta.nbits) {
        // Zero-width symbols: the only representable value is -offset.
        for (k = 0; k < count; k++)
            dst[k] = -c->u.beta.offset;
        return 0;
    }

    if (cram_not_enough_bits(in, c->u.beta.nbits * count))
        return -1;

    for (k = 0; k < count; k++)
        dst[k] = get_bits_MSB(in, c->u.beta.nbits) - c->u.beta.offset;

    return 0;
}
1090
1091
0
/*
 * BETA decoder producing bytes.
 *
 * Reads *out_size values of c->u.beta.nbits bits each from in and stores
 * value - offset into out.  out may be NULL, in which case the bits are
 * still consumed but discarded.  When nbits is zero nothing is read and
 * every output byte (if out is given) is -offset.
 *
 * Returns 0 on success,
 *         -1 if the input block does not hold enough bits.
 */
int cram_beta_decode_char(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
    const int count = *out_size;
    int k;

    if (!c->u.beta.nbits) {
        if (out) {
            for (k = 0; k < count; k++)
                out[k] = -c->u.beta.offset;
        }
        return 0;
    }

    if (cram_not_enough_bits(in, c->u.beta.nbits * count))
        return -1;

    if (!out) {
        // No destination: just advance past the encoded values.
        for (k = 0; k < count; k++)
            get_bits_MSB(in, c->u.beta.nbits);
        return 0;
    }

    for (k = 0; k < count; k++)
        out[k] = get_bits_MSB(in, c->u.beta.nbits) - c->u.beta.offset;

    return 0;
}
1113
1114
174
/*
 * Releases a BETA decoder.  Passing NULL is harmless.
 */
void cram_beta_decode_free(cram_codec *c) {
    free(c);  // free(NULL) is a no-op, so no guard is required
}
1118
1119
0
/*
 * Appends a textual description of the BETA codec
 * ("BETA(offset=N, nbits=M)") to ks.
 *
 * Returns 0 on success, -1 if ksprintf reports a failure.
 */
int cram_beta_describe(cram_codec *c, kstring_t *ks) {
    int rc = ksprintf(ks, "BETA(offset=%d, nbits=%d)",
                      c->u.beta.offset, c->u.beta.nbits);

    if (rc < 0)
        return -1;
    return 0;
}
1124
1125
/*
 * Creates a BETA decoder from its serialised parameters.
 *
 * data/size  Encoded parameters: two 32-bit varints, offset then nbits.
 *            Exactly size bytes must be consumed and nbits must lie in
 *            the range 0 to 8*sizeof(int).
 * option     Selects the output width: E_INT/E_SINT (32-bit),
 *            E_LONG/E_SLONG (64-bit), E_BYTE_ARRAY/E_BYTE (bytes).  Any
 *            other value is rejected.
 * hdr, codec and version are not used here.
 *
 * Returns the new codec on success,
 *         NULL on allocation failure, unsupported option or malformed
 *         input.
 */
cram_codec *cram_beta_decode_init(cram_block_compression_hdr *hdr,
                                  char *data, int size,
                                  enum cram_encoding codec,
                                  enum cram_external_type option,
                                  int version, varint_vec *vv) {
    cram_codec *c;
    char *cp = data;

    if (!(c = malloc(sizeof(*c))))
        return NULL;

    c->codec  = E_BETA;
    if (option == E_INT || option == E_SINT)
        c->decode = cram_beta_decode_int;
    else if (option == E_LONG || option == E_SLONG)
        c->decode = cram_beta_decode_long;
    else if (option == E_BYTE_ARRAY || option == E_BYTE)
        c->decode = cram_beta_decode_char;
    else {
        hts_log_error("BYTE_ARRAYs not supported by this codec");
        free(c);
        return NULL;
    }
    c->free   = cram_beta_decode_free;
    c->describe = cram_beta_describe;
    // The codec comes from malloc, so give the remaining callbacks a
    // defined value.  Wrapping codecs invoke sub_codec->get_block and
    // must not be handed an indeterminate pointer.
    c->size      = NULL;
    c->get_block = NULL;

    c->u.beta.nbits = -1;
    c->u.beta.offset = vv->varint_get32(&cp, data + size, NULL);
    if (cp < data + size) // Ensure test below works
        c->u.beta.nbits  = vv->varint_get32(&cp, data + size, NULL);

    if (cp - data != size
        || c->u.beta.nbits < 0 || c->u.beta.nbits > 8 * sizeof(int)) {
        hts_log_error("Malformed beta header stream");
        free(c);
        return NULL;
    }

    return c;
}
1165
1166
/*
 * Serialises the BETA codec description into block b.
 *
 * Layout written: optional prefix string, codec id, byte length of the
 * parameters, offset, nbits.
 *
 * c        Codec to describe.
 * b        Destination block.
 * prefix   Optional NUL-terminated string written first (may be NULL).
 * version  Not used by this function.
 *
 * Returns the number of bytes written on success,
 *         -1 on failure.
 */
int cram_beta_encode_store(cram_codec *c, cram_block *b,
                           char *prefix, int version) {
    int total = 0, status = 0, n;

    if (prefix) {
        size_t plen = strlen(prefix);
        BLOCK_APPEND(b, prefix, plen);
        total += plen;
    }

    n = c->vv->varint_put32_blk(b, c->codec);
    total += n;
    status |= n;

    // codec length
    n = c->vv->varint_put32_blk(b, c->vv->varint_size(c->u.e_beta.offset)
                                 + c->vv->varint_size(c->u.e_beta.nbits));
    total += n;
    status |= n;

    n = c->vv->varint_put32_blk(b, c->u.e_beta.offset);
    total += n;
    status |= n;

    n = c->vv->varint_put32_blk(b, c->u.e_beta.nbits);
    total += n;
    status |= n;

    if (status <= 0)
        goto block_err;

    return total;

 block_err:
    return -1;
}
1189
1190
/*
 * BETA encoder for int64_t input.
 *
 * Adds the codec offset to each of the in_size values in in and appends
 * the result to c->out as an nbits-wide field.  slice is not used.
 *
 * Returns the bitwise OR of the store_bits_MSB results (0 when every
 * store returned 0).
 */
int cram_beta_encode_long(cram_slice *slice, cram_codec *c,
                          char *in, int in_size) {
    const int64_t *vals = (int64_t *)in;
    int status = 0;
    int k = 0;

    while (k < in_size) {
        status |= store_bits_MSB(c->out, vals[k] + c->u.e_beta.offset,
                                 c->u.e_beta.nbits);
        k++;
    }

    return status;
}
1201
1202
/*
 * BETA encoder for int input.
 *
 * Adds the codec offset to each of the in_size values in in and appends
 * the result to c->out as an nbits-wide field.  slice is not used.
 *
 * Returns the bitwise OR of the store_bits_MSB results (0 when every
 * store returned 0).
 */
int cram_beta_encode_int(cram_slice *slice, cram_codec *c,
                         char *in, int in_size) {
    const int *vals = (int *)in;
    int status = 0;
    int k = 0;

    while (k < in_size) {
        status |= store_bits_MSB(c->out, vals[k] + c->u.e_beta.offset,
                                 c->u.e_beta.nbits);
        k++;
    }

    return status;
}
1213
1214
/*
 * BETA encoder for byte input.
 *
 * Each of the in_size bytes in in is read as unsigned, has the codec
 * offset added and is appended to c->out as an nbits-wide field.  slice
 * is not used.
 *
 * Returns the bitwise OR of the store_bits_MSB results (0 when every
 * store returned 0).
 */
int cram_beta_encode_char(cram_slice *slice, cram_codec *c,
                          char *in, int in_size) {
    const unsigned char *vals = (unsigned char *)in;
    int status = 0;
    int k = 0;

    while (k < in_size) {
        status |= store_bits_MSB(c->out, vals[k] + c->u.e_beta.offset,
                                 c->u.e_beta.nbits);
        k++;
    }

    return status;
}
1225
1226
643
/*
 * Releases a BETA encoder.  Passing NULL is harmless.
 */
void cram_beta_encode_free(cram_codec *c) {
    free(c);  // free(NULL) is a no-op, so no guard is required
}
1229
1230
/*
 * Creates a BETA encoder sized to hold a given value range.
 *
 * st       Statistics scanned for the minimum and maximum value when dat
 *          is NULL (the freqs[] table and, if present, the hash st->h).
 * codec    Not used; the codec is always E_BETA.
 * option   Selects the input width: E_INT/E_SINT, E_LONG/E_SLONG, or
 *          bytes for anything else.  It also selects the range check.
 * dat      Optional pointer to two hts_pos_t values {min, max}, which
 *          take precedence over st.
 * version, vv are not used here.
 *
 * The stored offset is -min and nbits is the number of bits needed to
 * represent max - min.
 *
 * Returns the new codec on success,
 *         NULL on allocation failure, an empty range (max < min) or a
 *         range that does not fit the requested integer type.
 */
cram_codec *cram_beta_encode_init(cram_stats *st,
                                  enum cram_encoding codec,
                                  enum cram_external_type option,
                                  void *dat,
                                  int version, varint_vec *vv) {
    cram_codec *c;
    hts_pos_t min_val, max_val;
    int64_t range;
    int nbits;

    if (!(c = malloc(sizeof(*c))))
        return NULL;

    c->codec = E_BETA;
    c->free  = cram_beta_encode_free;
    c->store = cram_beta_encode_store;
    c->flush = NULL;

    switch (option) {
    case E_INT:
    case E_SINT:
        c->encode = cram_beta_encode_int;
        break;
    case E_LONG:
    case E_SLONG:
        c->encode = cram_beta_encode_long;
        break;
    default:
        c->encode = cram_beta_encode_char;
        break;
    }

    if (dat) {
        // Caller supplied explicit limits.
        const hts_pos_t *limits = (hts_pos_t *)dat;
        min_val = limits[0];
        max_val = limits[1];
    } else {
        int i;

        min_val = INT_MAX;
        max_val = INT_MIN;

        // Small values live in the direct frequency table ...
        for (i = 0; i < MAX_STAT_VAL; i++) {
            if (st->freqs[i]) {
                if (min_val > i)
                    min_val = i;
                max_val = i;
            }
        }

        // ... and everything else in the optional hash table.
        if (st->h) {
            khint_t k;

            for (k = kh_begin(st->h); k != kh_end(st->h); k++) {
                if (kh_exist(st->h, k)) {
                    i = kh_key(st->h, k);
                    if (min_val > i)
                        min_val = i;
                    if (max_val < i)
                        max_val = i;
                }
            }
        }
    }

    if (max_val < min_val)
        goto err;

    range = (int64_t) max_val - min_val;
    if (option == E_SINT) {
        if (min_val < INT_MIN || range > INT_MAX)
            goto err;
    } else if (option == E_INT) {
        if (max_val > UINT_MAX || range > UINT_MAX)
            goto err;
    }

    c->u.e_beta.offset = -min_val;

    // Count the bits needed to represent the range.
    for (nbits = 0; range; range >>= 1)
        nbits++;
    c->u.e_beta.nbits = nbits;

    return c;

 err:
    free(c);
    return NULL;
}
1316
1317
/*
1318
 * ---------------------------------------------------------------------------
1319
 * XPACK: Packing multiple values into a single byte.  A fast transform that
1320
 * reduces time taken by entropy encoder and may also improve compression.
1321
 *
1322
 * This also has the additional requirement that the data series is not
1323
 * interleaved with another, permitting efficient encoding and decoding
1324
 * of all elements en masse instead of needing to only extract the bits
1325
 * necessary per item.
1326
 */
1327
0
/*
 * XPACK decoder producing int64_t values.
 *
 * Reads *out_size packed symbols of c->u.xpack.nbits bits each from in
 * and maps them through the reverse map into out (treated as int64_t[]).
 * When nbits is zero nothing is read and every value is rmap[0].
 *
 * Returns 0 on success,
 *         -1 if the input block does not hold enough bits.
 */
int cram_xpack_decode_long(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
    int64_t *out_i = (int64_t *)out;
    int i, n = *out_size;

    if (c->u.xpack.nbits) {
        // Same guard as cram_xpack_decode_int: never read past the end of
        // the input block.
        if (cram_not_enough_bits(in, c->u.xpack.nbits * n))
            return -1;

        for (i = 0; i < n; i++)
            out_i[i] = c->u.xpack.rmap[get_bits_MSB(in, c->u.xpack.nbits)];
    } else {
        for (i = 0; i < n; i++)
            out_i[i] = c->u.xpack.rmap[0];
    }

    return 0;
}
1341
1342
0
/*
 * XPACK decoder producing int32_t values.
 *
 * Reads *out_size packed symbols of c->u.xpack.nbits bits each from in
 * and maps them through the reverse map into out (treated as int32_t[]).
 * When nbits is zero nothing is read and every value is rmap[0].
 *
 * Returns 0 on success,
 *         -1 if the input block does not hold enough bits.
 */
int cram_xpack_decode_int(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
    int32_t *dst = (int32_t *)out;
    const int count = *out_size;
    int k;

    if (!c->u.xpack.nbits) {
        // Zero-width symbols: only rmap[0] can be produced.
        for (k = 0; k < count; k++)
            dst[k] = c->u.xpack.rmap[0];
        return 0;
    }

    if (cram_not_enough_bits(in, c->u.xpack.nbits * count))
        return -1;

    for (k = 0; k < count; k++)
        dst[k] = c->u.xpack.rmap[get_bits_MSB(in, c->u.xpack.nbits)];

    return 0;
}
1359
1360
0
/*
 * Unpacks the sub-codec's block into a per-slice block of one symbol per
 * byte, cached in slice->block_by_id[512 + c->codec_id].
 *
 * If the cached block already exists the function returns immediately.
 * The block is only published in the slice once it has been fully
 * populated, so a failed expansion does not leave a partial block behind.
 *
 * Returns 0 on success (or if already expanded),
 *         -1 on failure: zero bit width, sub-codec without a get_block
 *         callback, missing sub-block, size overflow, allocation failure
 *         or unpack failure.
 */
static int cram_xpack_decode_expand_char(cram_slice *slice, cram_codec *c) {
    cram_block *b = slice->block_by_id[512 + c->codec_id];
    cram_block *sub_b;
    cram_codec *sub = c->u.xpack.sub_codec;
    int64_t n;
    uint8_t p[256];
    int z;

    if (b)
        return 0;

    // nbits is used as a divisor below; a zero width cannot be expanded.
    if (c->u.xpack.nbits <= 0)
        return -1;

    // Not every codec provides get_block (e.g. CONST sets it to NULL).
    if (!sub || !sub->get_block)
        return -1;

    // get sub-codec data.
    sub_b = sub->get_block(slice, sub);
    if (!sub_b)
        return -1;

    // Compute the expanded size in 64 bits so uncomp_size * 8 cannot
    // overflow an int.
    n = (int64_t)sub_b->uncomp_size * 8 / c->u.xpack.nbits;
    if (n < 0 || n > INT_MAX)
        return -1;

    // Allocate local block to expand into
    b = cram_new_block(0, 0);
    if (!b)
        return -1;
    BLOCK_GROW(b, n);
    b->uncomp_size = n;

    for (z = 0; z < 256; z++)
        p[z] = c->u.xpack.rmap[z];

    // An empty sub-block has nothing to unpack.
    if (n > 0 &&
        !hts_unpack(sub_b->data, sub_b->uncomp_size, b->data, b->uncomp_size,
                    8 / c->u.xpack.nbits, p))
        goto block_err;

    slice->block_by_id[512 + c->codec_id] = b;
    return 0;

 block_err:
    cram_free_block(b);
    return -1;
}
1390
1391
0
/*
 * XPACK decoder producing bytes.
 *
 * With more than one mapped value the packed data is expanded once per
 * slice and *out_size bytes are copied from the expanded block,
 * advancing its read position.  With at most one mapped value the output
 * is filled with rmap[0].  out may be NULL, in which case the data is
 * skipped rather than copied.  in is not used.
 *
 * Returns 0 on success,
 *         -1 if expansion fails or the request runs past the end of the
 *         expanded data.
 */
int cram_xpack_decode_char(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
    // FIXME: we need to ban data-series interleaving in the spec for this to work.

    // Remember this may be called when threaded and multi-slice per container.
    // Hence one cram_codec instance, multiple slices, multiple blocks.
    // We therefore have to cache appropriate block info in slice and not codec.
    //    b = cram_get_block_by_id(slice, c->external.content_id);
    if (c->u.xpack.nval > 1) {
        if (cram_xpack_decode_expand_char(slice, c) < 0)
            return -1;

        cram_block *b = slice->block_by_id[512 + c->codec_id];
        if (!b)
            return -1;

        // Refuse to read beyond the expanded data.
        if (*out_size < 0
            || (int64_t)b->byte + *out_size > (int64_t)b->uncomp_size)
            return -1;

        if (out)
            memcpy(out, b->data + b->byte, *out_size);
        b->byte += *out_size;
    } else if (out) {
        memset(out, c->u.xpack.rmap[0], *out_size);
    }

    return 0;
}
1413
1414
618
/*
 * Releases an XPACK decoder together with its sub-codec (if any).
 * Passing NULL is harmless.  The per-slice expanded block is not freed
 * here.
 */
void cram_xpack_decode_free(cram_codec *c) {
    cram_codec *sub;

    if (c == NULL)
        return;

    sub = c->u.xpack.sub_codec;
    if (sub != NULL)
        sub->free(sub);

    //free(slice->block_by_id[512 + c->codec_id]);
    //slice->block_by_id[512 + c->codec_id] = 0;

    free(c);
}
1425
1426
0
/*
 * Returns the uncompressed size of the expanded XPACK block for this
 * slice, expanding it first if necessary.
 *
 * NOTE(review): the result of the expansion is not checked, so the block
 * pointer is dereferenced even if expansion failed - confirm callers can
 * only reach this once expansion is known to succeed.
 */
int cram_xpack_decode_size(cram_slice *slice, cram_codec *c) {
    cram_block *expanded;

    cram_xpack_decode_expand_char(slice, c);
    expanded = slice->block_by_id[512 + c->codec_id];
    return expanded->uncomp_size;
}
1430
1431
0
/*
 * Returns the expanded XPACK block cached for this slice, expanding it
 * first if necessary.  The result is whatever is stored in the slice's
 * block table afterwards; the expansion result itself is not checked.
 */
cram_block *cram_xpack_get_block(cram_slice *slice, cram_codec *c) {
    cram_block *expanded;

    cram_xpack_decode_expand_char(slice, c);
    expanded = slice->block_by_id[512 + c->codec_id];
    return expanded;
}
1435
1436
/*
 * Creates an XPACK decoder from its serialised parameters.
 *
 * data/size  Encoded parameters: nbits, nval, nval map entries, then the
 *            sub-codec's encoding id, its parameter size and parameters.
 *            Exactly size bytes must be consumed.
 * option     Selects the output width: E_LONG, E_INT, or
 *            E_BYTE_ARRAY/E_BYTE.  Any other value is rejected.  It is
 *            also passed on to the sub-codec.
 * hdr, version, vv are forwarded to cram_decoder_init for the sub-codec.
 * codec      Not used; the codec is always E_XPACK.
 *
 * Accepted ranges: 0 <= nbits < 8, 0 <= nval <= 256, each map entry < 256.
 *
 * Returns the new codec on success,
 *         NULL on allocation failure, unsupported option, malformed
 *         input or sub-codec initialisation failure.
 */
cram_codec *cram_xpack_decode_init(cram_block_compression_hdr *hdr,
                                   char *data, int size,
                                   enum cram_encoding codec,
                                   enum cram_external_type option,
                                   int version, varint_vec *vv) {
    char *cp = data;
    char *endp = data+size;
    int i, encoding, sub_size;
    cram_codec *c = calloc(1, sizeof(*c));

    if (c == NULL)
        return NULL;

    c->codec = E_XPACK;
    switch (option) {
    case E_LONG:
        c->decode = cram_xpack_decode_long;
        break;
    case E_INT:
        c->decode = cram_xpack_decode_int;
        break;
    case E_BYTE_ARRAY:
    case E_BYTE:
        c->decode = cram_xpack_decode_char;
        break;
    default:
        fprintf(stderr, "BYTE_ARRAYs not supported by this codec\n");
        goto malformed;
    }
    c->free      = cram_xpack_decode_free;
    c->size      = cram_xpack_decode_size;
    c->get_block = cram_xpack_get_block;
    c->describe  = NULL;

    c->u.xpack.nbits = vv->varint_get32(&cp, endp, NULL);
    c->u.xpack.nval  = vv->varint_get32(&cp, endp, NULL);
    if (c->u.xpack.nbits < 0 || c->u.xpack.nbits >= 8 ||
        c->u.xpack.nval  < 0 || c->u.xpack.nval  > 256)
        goto malformed;

    for (i = 0; i < c->u.xpack.nval; i++) {
        uint32_t v = vv->varint_get32(&cp, endp, NULL);
        if (v >= 256)
            goto malformed;
        c->u.xpack.rmap[i] = v; // reverse map: e.g 0-3 to P,A,C,K
    }

    encoding = vv->varint_get32(&cp, endp, NULL);
    sub_size = vv->varint_get32(&cp, endp, NULL);
    if (sub_size < 0 || endp - cp < sub_size)
        goto malformed;

    c->u.xpack.sub_codec = cram_decoder_init(hdr, encoding, cp, sub_size,
                                             option, version, vv);
    if (c->u.xpack.sub_codec == NULL)
        goto malformed;
    cp += sub_size;

    // Everything must have been consumed and the bit width be sane.
    if (cp - data == size
        && c->u.xpack.nbits >= 0
        && c->u.xpack.nbits <= 8 * sizeof(int64_t))
        return c;

 malformed:
    fprintf(stderr, "Malformed xpack header stream\n");
    cram_xpack_decode_free(c);
    return NULL;
}
1497
1498
0
/*
 * Packs the data buffered in c->out and passes the packed bytes to the
 * sub-codec, flushing the sub-codec afterwards if it has a flush
 * callback.
 *
 * Returns the sub-codec's flush result (0 if it has no flush callback)
 *         on success,
 *         -1 if packing fails or the sub-codec's encode reports failure.
 */
int cram_xpack_encode_flush(cram_codec *c) {
    // Pack the buffered up data
    int meta_len;
    uint64_t out_len;
    uint8_t out_meta[1024];
    cram_codec *sub = c->u.e_xpack.sub_codec;
    int r = -1;
    uint8_t *out = hts_pack(BLOCK_DATA(c->out), BLOCK_SIZE(c->out),
                            out_meta, &meta_len, &out_len);
    if (!out)
        return -1;

    // We now need to pass this through the next layer of transform
    if (sub->encode(NULL, // also indicates flush incoming
                    sub, (char *)out, out_len) == 0) {
        r = 0;
        if (sub->flush)
            r = sub->flush(sub);
    }

    // Release the packed buffer on every path, including encode failure.
    free(out);
    return r;
}
1519
1520
/*
 * Serialises the XPACK codec description into block b.
 *
 * Layout written: optional prefix string, codec id, byte length of the
 * parameters, nbits, nval, the nval reverse-map entries, then the
 * sub-codec's own description.
 *
 * c        Codec to describe.
 * b        Destination block.
 * prefix   Optional NUL-terminated string written first (may be NULL).
 * version  Passed through to the sub-codec's store callback.
 *
 * Returns the number of bytes written on success,
 *         -1 on failure.
 */
int cram_xpack_encode_store(cram_codec *c, cram_block *b,
                            char *prefix, int version) {
    int len = 0, r = 0, n;
    int len1 = 0, len2, i;
    cram_codec *tc = c->u.e_xpack.sub_codec;
    cram_block *tb = NULL;

    if (prefix) {
        size_t l = strlen(prefix);
        BLOCK_APPEND(b, prefix, l);
        len += l;
    }

    // Store sub-codec into a temporary block so its length is known
    // before the enclosing parameter length is written.
    tb = cram_new_block(0, 0);
    if (!tb)
        return -1;
    len2 = tc->store(tc, tb, NULL, version);
    if (len2 < 0)
        goto block_err;

    len += (n = c->vv->varint_put32_blk(b, c->codec)); r |= n;

    // codec length
    for (i = 0; i < c->u.e_xpack.nval; i++)
        len1 += (n = c->vv->varint_size(c->u.e_xpack.rmap[i])), r |= n;
    len += (n = c->vv->varint_put32_blk(b, c->vv->varint_size(c->u.e_xpack.nbits)
                                        +  c->vv->varint_size(c->u.e_xpack.nval)
                                        + len1 + len2)); r |= n;

    // The map and sub-codec
    len += (n = c->vv->varint_put32_blk(b, c->u.e_xpack.nbits)); r |= n;
    len += (n = c->vv->varint_put32_blk(b, c->u.e_xpack.nval));  r |= n;
    for (i = 0; i < c->u.e_xpack.nval; i++)
        len += (n = c->vv->varint_put32_blk(b, c->u.e_xpack.rmap[i])), r |= n;

    BLOCK_APPEND(b, BLOCK_DATA(tb), BLOCK_SIZE(tb));

    cram_free_block(tb);

    return r > 0 ? len + len2 : -1;

 block_err:
    // tb is still NULL if the failure happened while writing the prefix.
    if (tb)
        cram_free_block(tb);
    return -1;
}
1562
1563
// Same as cram_beta_encode_long
1564
/*
 * XPACK encoder for int64_t input.
 *
 * Each of the in_size values in in is translated through the forward map
 * and appended to c->out as an nbits-wide field.  slice is not used.
 *
 * Returns the bitwise OR of the store_bits_MSB results (0 when every
 * store returned 0),
 *         -1 if a value lies outside the 0..255 range covered by the map.
 */
int cram_xpack_encode_long(cram_slice *slice, cram_codec *c,
                           char *in, int in_size) {
    int64_t *syms = (int64_t *)in;
    int i, r = 0;

    for (i = 0; i < in_size; i++) {
        // The forward map has 256 entries; never index outside it.
        if (syms[i] < 0 || syms[i] >= 256)
            return -1;
        r |= store_bits_MSB(c->out, c->u.e_xpack.map[syms[i]], c->u.e_xpack.nbits);
    }

    return r;
}
1574
1575
/*
 * XPACK encoder for int input.
 *
 * Each of the in_size values in in is translated through the forward map
 * and appended to c->out as an nbits-wide field.  slice is not used.
 *
 * Returns the bitwise OR of the store_bits_MSB results (0 when every
 * store returned 0),
 *         -1 if a value lies outside the 0..255 range covered by the map.
 */
int cram_xpack_encode_int(cram_slice *slice, cram_codec *c,
                          char *in, int in_size) {
    int *syms = (int *)in;
    int i, r = 0;

    for (i = 0; i < in_size; i++) {
        // The forward map has 256 entries; never index outside it.
        if (syms[i] < 0 || syms[i] >= 256)
            return -1;
        r |= store_bits_MSB(c->out, c->u.e_xpack.map[syms[i]], c->u.e_xpack.nbits);
    }

    return r;
}
1585
1586
/*
 * XPACK encoder for byte input.
 *
 * The in_size bytes of in are appended unmodified to c->out; packing is
 * deferred to the flush callback.  slice is not used.
 *
 * Returns 0 on success,
 *         -1 if the append fails.
 */
int cram_xpack_encode_char(cram_slice *slice, cram_codec *c,
                           char *in, int in_size) {
    cram_block *dest = c->out;

    BLOCK_APPEND(dest, in, in_size);
    return 0;

 block_err:
    return -1;
}
1594
1595
0
/*
 * Releases an XPACK encoder: its sub-codec (if any), its output block
 * and the codec itself.  Passing NULL is harmless.
 */
void cram_xpack_encode_free(cram_codec *c) {
    cram_codec *sub;

    if (c == NULL)
        return;

    sub = c->u.e_xpack.sub_codec;
    if (sub != NULL)
        sub->free(sub);

    cram_free_block(c->out);

    free(c);
}
1605
1606
/*
 * Creates an XPACK encoder.
 *
 * st       Not used by this function.
 * codec    Not used; the codec is always E_XPACK.
 * option   Selects the input width: E_LONG, E_INT, or bytes for anything
 *          else.
 * dat      Pointer to a cram_xpack_encoder holding nbits, nval, the
 *          forward map (entries of -1 mark unmapped symbols) and the
 *          sub-codec's encoding and parameters.
 * version, vv are forwarded to cram_encoder_init for the sub-codec.
 *
 * Returns the new codec on success,
 *         NULL on allocation failure, sub-codec initialisation failure,
 *         or if the number of mapped symbols does not equal nval.
 */
cram_codec *cram_xpack_encode_init(cram_stats *st,
                                   enum cram_encoding codec,
                                   enum cram_external_type option,
                                   void *dat,
                                   int version, varint_vec *vv) {
    cram_codec *c;

    if (!(c = malloc(sizeof(*c))))
        return NULL;

    c->codec  = E_XPACK;
    c->free   = cram_xpack_encode_free;
    if (option == E_LONG)
        c->encode = cram_xpack_encode_long;
    else if (option == E_INT)
        c->encode = cram_xpack_encode_int;
    else
        c->encode = cram_xpack_encode_char;
    c->store  = cram_xpack_encode_store;
    c->flush  = cram_xpack_encode_flush;

    cram_xpack_encoder *e = (cram_xpack_encoder *)dat;
    c->u.e_xpack.nbits = e->nbits;
    c->u.e_xpack.nval = e->nval;
    c->u.e_xpack.sub_codec = cram_encoder_init(e->sub_encoding, NULL,
                                               E_BYTE_ARRAY, e->sub_codec_dat,
                                               version, vv);
    // store and flush dereference the sub-codec unconditionally, so an
    // encoder without one is unusable.
    if (!c->u.e_xpack.sub_codec) {
        free(c);
        return NULL;
    }

    // Initialise fwd and rev maps
    memcpy(c->u.e_xpack.map, e->map, sizeof(e->map)); // P,A,C,K to 0,1,2,3
    int i, n;
    for (i = n = 0; i < 256; i++)
        if (e->map[i] != -1)
            c->u.e_xpack.rmap[n++] = i;               // 0,1,2,3 to P,A,C,K
    if (n != e->nval) {
        fprintf(stderr, "Incorrectly specified number of map items in PACK\n");
        // Release what has been built so far.  c->out has not been set
        // yet, so the pieces are freed individually rather than through
        // cram_xpack_encode_free.
        c->u.e_xpack.sub_codec->free(c->u.e_xpack.sub_codec);
        free(c);
        return NULL;
    }

    return c;
}
1647
1648
/*
1649
 * ---------------------------------------------------------------------------
1650
 * XDELTA: subtract successive values, zig-zag to turn +/- to + only,
1651
 * and then var-int encode the result.
1652
 *
1653
 * This also has the additional requirement that the data series is not
1654
 * interleaved with another, permitting efficient encoding and decoding
1655
 * of all elements en masse instead of needing to only extract the bits
1656
 * necessary per item.
1657
 */
1658
1659
0
/*
 * Zig-zag transforms: map signed values onto unsigned ones so that small
 * magnitudes of either sign become small numbers
 * (0 -> 0, -1 -> 1, 1 -> 2, -2 -> 3, ...).
 *
 * The arithmetic is done on the unsigned representation because
 * left-shifting a negative signed value is undefined behaviour in C.
 * The "0 - sign bit" term yields an all-ones mask for negative inputs
 * and zero otherwise.
 */
static uint8_t  zigzag8 (int8_t  x) {
    uint8_t u = (uint8_t)x;
    return (uint8_t)((u << 1) ^ (0u - (u >> 7)));
}
static uint16_t zigzag16(int16_t x) {
    uint16_t u = (uint16_t)x;
    return (uint16_t)(((uint32_t)u << 1) ^ (0u - (uint32_t)(u >> 15)));
}
static uint32_t zigzag32(int32_t x) {
    uint32_t u = (uint32_t)x;
    return (u << 1) ^ (0u - (u >> 31));
}
1662
1663
//static int8_t  unzigzag8 (uint8_t  x) { return (x >> 1) ^ -(x & 1); }
1664
0
/*
 * Inverse zig-zag transforms: even inputs decode to x/2, odd inputs to
 * the bitwise complement of x/2 (0 -> 0, 1 -> -1, 2 -> 1, 3 -> -2, ...).
 */
static int16_t unzigzag16(uint16_t x) {
    return (x & 1) ? ~(x >> 1) : (x >> 1);
}
static int32_t unzigzag32(uint32_t x) {
    return (x & 1) ? ~(x >> 1) : (x >> 1);
}
1666
1667
0
/*
 * XDELTA decoder for 64-bit values.  Not implemented: it always fails.
 *
 * Returns -1.
 */
int cram_xdelta_decode_long(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
    return -1;
}
1670
1671
0
/*
 * XDELTA decoder producing 32-bit values.
 *
 * For each of the *out_size outputs one value is decoded through the
 * sub-codec, un-zig-zagged and added to the running total kept in
 * c->u.xdelta.last; the new total is stored in out (treated as
 * uint32_t[]).
 *
 * Returns 0 on success,
 *         -1 if the sub-codec fails to decode a value.
 */
int cram_xdelta_decode_int(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
    // Slow value-by-value method for now
    uint32_t *out32 = (uint32_t *)out;
    // Use the decoder view of the union; that is the member filled in by
    // cram_xdelta_decode_init.
    cram_codec *sub = c->u.xdelta.sub_codec;
    int i;

    for (i = 0; i < *out_size; i++) {
        uint32_t v;
        int one = 1;
        if (sub->decode(slice, sub, in, (char *)&v, &one) < 0)
            return -1;
        uint32_t d = unzigzag32(v);
        c->u.xdelta.last = out32[i] = d + c->u.xdelta.last;
    }

    return 0;
}
1687
1688
0
/*
 * Placeholder for expanding XDELTA data into a per-slice block.
 * Not implemented: it always fails without touching the slice.
 *
 * Returns -1.
 */
static int cram_xdelta_decode_expand_char(cram_slice *slice, cram_codec *c) {
    return -1;
}
1691
1692
0
/*
 * XDELTA decoder for byte data.  Not implemented: it always fails.
 *
 * Returns -1.
 */
int cram_xdelta_decode_char(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
    return -1;
}
1695
1696
0
/*
 * Passes i through i16_to_le, writing into the bytes of a local int16_t,
 * and returns that int16_t.
 * NOTE(review): presumably this yields the little-endian byte layout of
 * i regardless of host order - confirm against i16_to_le.
 */
static inline int16_t le_int2(int16_t i) {
    int16_t le;
    uint8_t *bytes = (uint8_t *)&le;

    i16_to_le(i, bytes);
    return le;
}
1701
1702
/*
 * XDELTA decoder that appends a whole array to a block.
 *
 * out_ is really a cram_block *.  *out_size bytes are produced by
 * reading varints from the sub-codec's block, un-zig-zagging each one,
 * adding it to a running total (reset to zero on every call) and
 * appending the total in 16-bit units.  If *out_size is odd the first
 * unit contributes only one byte.  Only a word size of 2 is supported.
 * in is not used.
 *
 * Returns 0 on success,
 *         -1 on an invalid word size or length, a sub-codec without a
 *         usable block, a varint read error, an unsupported word size
 *         or an append failure.
 */
int cram_xdelta_decode_block(cram_slice *slice, cram_codec *c, cram_block *in,
                             char *out_, int *out_size) {
    cram_block *out = (cram_block *)out_;
    // Use the decoder view of the union; that is the member filled in by
    // cram_xdelta_decode_init.
    cram_codec *sub = c->u.xdelta.sub_codec;
    const int w = c->u.xdelta.word_size;
    cram_block *b;
    uint32_t i, npad, out_sz;

    // w is used as a divisor below and comes straight from the file.
    if (w <= 0 || *out_size < 0)
        return -1;

    // Not every codec provides get_block, and it may return NULL.
    if (!sub || !sub->get_block)
        return -1;
    b = sub->get_block(slice, sub);
    if (!b)
        return -1;

    npad = (w - *out_size%w)%w;
    out_sz = *out_size + npad;
    c->u.xdelta.last = 0;  // reset for each new array

    for (i = 0; i < out_sz; i += w) {
        uint16_t v;
        // Need better interface
        char *cp = (char *)b->data + b->byte;
        char *cp_end = (char *)b->data + b->uncomp_size;
        int err = 0;
        v = c->vv->varint_get32(&cp, cp_end, &err);
        if (err)
            return -1;
        b->byte = cp - (char *)b->data;

        switch(w) {
        case 2: {
            int16_t d = unzigzag16(v), z;
            c->u.xdelta.last = d + c->u.xdelta.last;
            z = le_int2(c->u.xdelta.last);
            // Only the first unit may be short (odd total length).
            BLOCK_APPEND(out, &z, 2-npad);
            npad = 0;
            break;
        }
        default:
            fprintf(stderr, "Unsupported word size by XDELTA\n");
            return -1;
        }
    }

    return 0;

 block_err:
    return -1;
}
1744
1745
99
/*
 * Releases an XDELTA decoder together with its sub-codec (if any).
 * Passing NULL is harmless.
 */
void cram_xdelta_decode_free(cram_codec *c) {
    cram_codec *sub;

    if (c == NULL)
        return;

    sub = c->u.xdelta.sub_codec;
    if (sub != NULL)
        sub->free(sub);

    free(c);
}
1753
1754
0
/*
 * Returns the uncompressed size of the block stored for this codec in
 * the slice's block table, after calling the expand helper.
 *
 * NOTE(review): the expand helper's result is not checked before the
 * block pointer is dereferenced - confirm the block is populated
 * elsewhere before this can be called.
 */
int cram_xdelta_decode_size(cram_slice *slice, cram_codec *c) {
    cram_block *expanded;

    cram_xdelta_decode_expand_char(slice, c);
    expanded = slice->block_by_id[512 + c->codec_id];
    return expanded->uncomp_size;
}
1758
1759
0
/*
 * Returns the block stored for this codec in the slice's block table,
 * after calling the expand helper.  The helper's result is not checked;
 * the return value is whatever the table holds.
 */
cram_block *cram_xdelta_get_block(cram_slice *slice, cram_codec *c) {
    cram_block *expanded;

    cram_xdelta_decode_expand_char(slice, c);
    expanded = slice->block_by_id[512 + c->codec_id];
    return expanded;
}
1763
1764
/*
 * Creates an XDELTA decoder from its serialised parameters.
 *
 * data/size  Encoded parameters: word size, then the sub-codec's
 *            encoding id, its parameter size and parameters.  Exactly
 *            size bytes must be consumed.
 * option     Selects the decode function: E_LONG, E_INT,
 *            E_BYTE_ARRAY/E_BYTE, or E_BYTE_ARRAY_BLOCK.  For the last
 *            one the sub-codec is created with E_BYTE_ARRAY.  Any other
 *            value is rejected.
 * hdr, version, vv are forwarded to cram_decoder_init for the sub-codec.
 * codec      Not used; the codec is always E_XDELTA.
 *
 * Returns the new codec on success,
 *         NULL on allocation failure, unsupported option, malformed
 *         input or sub-codec initialisation failure.
 */
cram_codec *cram_xdelta_decode_init(cram_block_compression_hdr *hdr,
                                    char *data, int size,
                                    enum cram_encoding codec,
                                    enum cram_external_type option,
                                    int version, varint_vec *vv) {
    char *cp = data;
    char *endp = data+size;
    int encoding, sub_size;
    cram_codec *c = calloc(1, sizeof(*c));

    if (c == NULL)
        return NULL;

    c->codec = E_XDELTA;
    switch (option) {
    case E_LONG:
        c->decode = cram_xdelta_decode_long;
        break;
    case E_INT:
        c->decode = cram_xdelta_decode_int;
        break;
    case E_BYTE_ARRAY:
    case E_BYTE:
        c->decode = cram_xdelta_decode_char;
        break;
    case E_BYTE_ARRAY_BLOCK:
        // The sub-codec is initialised as a plain byte array.
        option = E_BYTE_ARRAY;
        c->decode = cram_xdelta_decode_block;
        break;
    default:
        free(c);
        return NULL;
    }
    c->free      = cram_xdelta_decode_free;
    c->size      = cram_xdelta_decode_size;
    c->get_block = cram_xdelta_get_block;
    c->describe  = NULL;

    c->u.xdelta.word_size = vv->varint_get32(&cp, endp, NULL);
    c->u.xdelta.last = 0;

    encoding = vv->varint_get32(&cp, endp, NULL);
    sub_size = vv->varint_get32(&cp, endp, NULL);
    if (sub_size < 0 || endp - cp < sub_size)
        goto malformed;

    c->u.xdelta.sub_codec = cram_decoder_init(hdr, encoding, cp, sub_size,
                                              option, version, vv);
    if (c->u.xdelta.sub_codec == NULL)
        goto malformed;
    cp += sub_size;

    // Everything must have been consumed.
    if (cp - data == size)
        return c;

 malformed:
    fprintf(stderr, "Malformed xdelta header stream\n");
    cram_xdelta_decode_free(c);
    return NULL;
}
1817
1818
0
/*
 * Flushes the data gathered in c->out.  The buffered bytes are treated as
 * an array of words of c->u.e_xdelta.word_size bytes; each word is replaced
 * by the zig-zag encoded difference from its predecessor, written as a
 * varint into a temporary block, and that block is passed to the sub-codec.
 *
 * Returns 0 on success,
 *        -1 on failure (allocation failure, unsupported word size or
 *           sub-codec encode error).
 */
int cram_xdelta_encode_flush(cram_codec *c) {
    int r = -1;
    cram_block *b = cram_new_block(0, 0);
    if (!b)
        return -1;

    switch (c->u.e_xdelta.word_size) {
    case 2: {
        // Delta + zigzag transform.
        // Subtracting two 8-bit values has a 9-bit result (-255 to 255).
        // However think of it as turning a wheel clockwise or
        // anti-clockwise.  If it has 256 gradations then a -ve rotation
        // followed by a +ve rotation of the same amount reverses it
        // regardless.  The same wrap-around argument applies to 16-bit.
        //
        // Similarly the zig-zag transformation doesn't invent any extra
        // bits, so the entire thing could be done in-situ.  This may permit
        // faster SIMD loops if we break apart the steps:
        //
        //     uint16_t last = 0, d;
        //     for (i = 0; i < n; i++) {
        //         d = io[i] - last;
        //         last = io[i];
        //         io[i] = zigzag16(d);
        //     }
        //
        // --- vs ---
        //
        //     for (i = n-1; i >= 1; i--)
        //         io[i] -= io[i-1];
        //     for (i = 0; i < n; i++)
        //         io[i] = zigzag16(io[i]);

        // varint: need array variant for speed here.
        // With zig-zag
        int i, n = BLOCK_SIZE(c->out)/2;
        const uint8_t *src = (const uint8_t *)BLOCK_DATA(c->out);
        uint16_t last = 0;

        if (n*2 < BLOCK_SIZE(c->out)) {
            // Odd byte count: the leading byte is emitted on its own
            // (half word) and the 16-bit words start one byte later.
            last = *src;
            c->vv->varint_put32_blk(b, zigzag16(last));
            src++;
        }

        for (i = 0; i < n; i++) {
            // The words may be misaligned (see the half-word case above),
            // so load them with memcpy instead of via a uint16_t pointer.
            uint16_t cur;
            memcpy(&cur, src + 2*(size_t)i, sizeof(cur));
            uint16_t d = cur - last;
            last = cur;
            c->vv->varint_put32_blk(b, zigzag16(d));
        }

        break;
    }

    case 4: {
        int i, n = BLOCK_SIZE(c->out)/4;
        uint32_t *dat = (uint32_t *)BLOCK_DATA(c->out), last = 0;

        for (i = 0; i < n; i++) {
            uint32_t d = dat[i] - last;
            last = dat[i];
            c->vv->varint_put32_blk(b, zigzag32(d));
        }

        break;
    }

    case 1: {
        int i, n = BLOCK_SIZE(c->out);
        uint8_t *dat = (uint8_t *)BLOCK_DATA(c->out), last = 0;

        for (i = 0; i < n; i++) {
            uint32_t d = dat[i] - last;
            last = dat[i];
            c->vv->varint_put32_blk(b, zigzag8(d));
        }

        break;
    }

    default:
        // Only word sizes 1, 2 and 4 are handled.
        goto err;
    }

    if (c->u.e_xdelta.sub_codec->encode(NULL, c->u.e_xdelta.sub_codec,
                                      (char *)b->data, b->byte))
        goto err;

    r = 0;

 err:
    // The temporary block is released on both success and failure.
    cram_free_block(b);
    return r;

}
1912
1913
/*
 * Serialises the XDELTA codec description into block b.
 *
 * Layout written: optional prefix string, codec id, length of the
 * remaining payload, word size, then the sub-codec's own stored
 * description.
 *
 * Returns the number of bytes written on success,
 *        -1 on failure.
 */
int cram_xdelta_encode_store(cram_codec *c, cram_block *b,
                            char *prefix, int version) {
    int len = 0, r = 0, n;
    // Declared up front so the block_err path can release it no matter
    // which BLOCK_APPEND jumped there.
    cram_block *tb = NULL;

    if (prefix) {
        size_t l = strlen(prefix);
        BLOCK_APPEND(b, prefix, l);
        len += l;
    }

    // Store sub-codec into a temporary block first; its size is needed for
    // the length field that precedes it.
    cram_codec *tc = c->u.e_xdelta.sub_codec;
    tb = cram_new_block(0, 0);
    if (!tb)
        return -1;
    int len2 = tc->store(tc, tb, NULL, version);
    if (len2 < 0)
        goto block_err;

    len += (n = c->vv->varint_put32_blk(b, c->codec)); r |= n;

    // codec length
    len += (n = c->vv->varint_put32_blk(b, c->vv->varint_size(c->u.e_xdelta.word_size)
                                        + len2)); r |= n;

    // This and sub-codec
    len += (n = c->vv->varint_put32_blk(b, c->u.e_xdelta.word_size)); r |= n;
    BLOCK_APPEND(b, BLOCK_DATA(tb), BLOCK_SIZE(tb));

    cram_free_block(tb);

    return r > 0 ? len + len2 : -1;

 block_err:
    if (tb)
        cram_free_block(tb);
    return -1;
}
1947
1948
// Same as cram_beta_encode_long
1949
int cram_xdelta_encode_long(cram_slice *slice, cram_codec *c,
1950
0
                           char *in, int in_size) {
1951
0
    return -1;
1952
0
}
1953
1954
/*
 * XDELTA encoder entry point installed by cram_xdelta_encode_init() for
 * E_INT data.  Not implemented: no argument is used and the call always
 * fails.
 *
 * Returns -1.
 */
int cram_xdelta_encode_int(cram_slice *slice, cram_codec *c,
                          char *in, int in_size) {
    (void)slice;
    (void)c;
    (void)in;
    (void)in_size;

    return -1;
}
1958
1959
/*
 * Delta-encodes one byte array and passes the result to the sub-codec.
 *
 * For word_size 2 the input is read as little-endian 16-bit words (a
 * leading odd byte is emitted on its own), each word is replaced by the
 * zig-zag encoded difference from the previous one and written as a
 * varint.  For any other word size nothing is written to the temporary
 * buffer, so the sub-codec receives zero bytes.
 *
 * Returns 0 on success,
 *        -1 on failure (allocation or sub-codec error).
 */
int cram_xdelta_encode_char(cram_slice *slice, cram_codec *c,
                            char *in, int in_size) {
    // Worst case varint size is 5 bytes per input byte.  Compute in size_t
    // so the multiplication cannot overflow int, and never ask malloc for
    // zero bytes as that may legitimately return NULL.
    size_t cap = (size_t)in_size * 5;
    char *dat = malloc(cap ? cap : 1);
    if (!dat)
        return -1;
    char *cp = dat, *cp_end = dat + cap;

    c->u.e_xdelta.last = 0; // reset for each new array
    if (c->u.e_xdelta.word_size == 2) {
        int i, part;

        part = in_size%2;
        if (part) {
            // NOTE(review): in[0] is a plain char, so on signed-char
            // platforms this sign-extends; confirm against the decoder.
            uint16_t z = in[0];
            c->u.e_xdelta.last = le_int2(z);
            cp += c->vv->varint_put32(cp, cp_end, zigzag16(c->u.e_xdelta.last));
        }

        // "in + part" need not be 2-byte aligned, so fetch each word with
        // memcpy rather than dereferencing a uint16_t pointer.
        const char *words = in + part;
        for (i = 0; i < in_size/2; i++) {
            uint16_t raw;
            memcpy(&raw, words + 2*(size_t)i, sizeof(raw));
            uint16_t d = le_int2(raw) - c->u.e_xdelta.last;
            c->u.e_xdelta.last = le_int2(raw);
            cp += c->vv->varint_put32(cp, cp_end, zigzag16(d));
        }
    }
    if (c->u.e_xdelta.sub_codec->encode(slice, c->u.e_xdelta.sub_codec,
                                      (char *)dat, cp-dat)) {
        free(dat);
        return -1;
    }

    free(dat);
    return 0;
}
1993
1994
0
/*
 * Releases an XDELTA encoder: its sub-codec (when present), its output
 * block and the codec structure itself.  A NULL codec is ignored.
 */
void cram_xdelta_encode_free(cram_codec *c) {
    if (c != NULL) {
        cram_codec *sub = c->u.e_xdelta.sub_codec;

        if (sub != NULL)
            sub->free(sub);

        cram_free_block(c->out);
        free(c);
    }
}
2004
2005
/*
 * Creates an XDELTA encoder.
 *
 * dat points to a cram_xdelta_encoder holding the word size plus the
 * encoding and parameters of the sub-codec, which is always created for
 * E_BYTE_ARRAY data.  option selects the encode callback: the long and int
 * variants for E_LONG / E_INT, the char variant for everything else.
 *
 * Returns the new codec on success,
 *         NULL on failure (allocation or sub-codec initialisation).
 */
cram_codec *cram_xdelta_encode_init(cram_stats *st,
                                    enum cram_encoding codec,
                                    enum cram_external_type option,
                                    void *dat,
                                    int version, varint_vec *vv) {
    cram_codec *c;

    if (!(c = malloc(sizeof(*c))))
        return NULL;

    c->codec  = E_XDELTA;
    c->free   = cram_xdelta_encode_free;
    if (option == E_LONG)
        c->encode = cram_xdelta_encode_long;
    else if (option == E_INT)
        c->encode = cram_xdelta_encode_int;
    else
        c->encode = cram_xdelta_encode_char;
    c->store  = cram_xdelta_encode_store;
    c->flush  = cram_xdelta_encode_flush;

    cram_xdelta_encoder *e = (cram_xdelta_encoder *)dat;
    c->u.e_xdelta.word_size = e->word_size;
    c->u.e_xdelta.last = 0;
    c->u.e_xdelta.sub_codec = cram_encoder_init(e->sub_encoding, NULL,
                                                E_BYTE_ARRAY,
                                                e->sub_codec_dat,
                                                version, vv);
    if (!c->u.e_xdelta.sub_codec) {
        // Without a sub-codec every later encode/flush/store call would
        // dereference NULL.  Only c itself has been set up so far (c->out
        // is not initialised here), so a plain free() is the right cleanup.
        free(c);
        return NULL;
    }

    return c;
}
2036
2037
/*
2038
 * ---------------------------------------------------------------------------
2039
 * XRLE
2040
 *
2041
 * This also has the additional requirement that the data series is not
2042
 * interleaved with another, permitting efficient encoding and decoding
2043
 * of all elements en masse instead of needing to extract only the bits
2044
 * necessary per item.
2045
 */
2046
0
/*
 * XRLE decoder entry point for E_LONG data.  Not implemented (to be added
 * if and when needed): no argument is used and the call always fails.
 *
 * Returns -1.
 */
int cram_xrle_decode_long(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
    (void)slice;
    (void)c;
    (void)in;
    (void)out;
    (void)out_size;

    return -1;
}
2050
2051
0
/*
 * XRLE decoder entry point for E_INT data.  Not implemented (to be added
 * if and when needed): no argument is used and the call always fails.
 *
 * Returns -1.
 */
int cram_xrle_decode_int(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
    (void)slice;
    (void)c;
    (void)in;
    (void)out;
    (void)out_size;

    return -1;
}
2055
2056
// Expands an XRLE transform and caches result in slice->block_by_id[]
2057
0
static int cram_xrle_decode_expand_char(cram_slice *slice, cram_codec *c) {
2058
0
    cram_block *b = slice->block_by_id[512 + c->codec_id];
2059
0
    if (b)
2060
0
        return 0;
2061
2062
0
    b = slice->block_by_id[512 + c->codec_id] = cram_new_block(0, 0);
2063
0
    if (!b)
2064
0
        return -1;
2065
0
    cram_block *lit_b = c->u.xrle.lit_codec->get_block(slice, c->u.xrle.lit_codec);
2066
0
    if (!lit_b)
2067
0
        return -1;
2068
0
    unsigned char *lit_dat = lit_b->data;
2069
0
    unsigned int lit_sz = lit_b->uncomp_size;
2070
0
    unsigned int len_sz = c->u.xrle.len_codec->size(slice, c->u.xrle.len_codec);
2071
2072
0
    cram_block *len_b = c->u.xrle.len_codec->get_block(slice, c->u.xrle.len_codec);
2073
0
    if (!len_b)
2074
0
        return -1;
2075
0
    unsigned char *len_dat = len_b->data;
2076
2077
0
    uint8_t rle_syms[256];
2078
0
    int rle_nsyms = 0;
2079
0
    int i;
2080
0
    for (i = 0; i < 256; i++) {
2081
0
        if (c->u.xrle.rep_score[i] > 0)
2082
0
            rle_syms[rle_nsyms++] = i;
2083
0
    }
2084
2085
0
    uint64_t out_sz;
2086
0
    int nb = var_get_u64(len_dat, len_dat+len_sz, &out_sz);
2087
0
    if (!(b->data = malloc(out_sz)))
2088
0
        return -1;
2089
0
    hts_rle_decode(lit_dat, lit_sz,
2090
0
                   len_dat+nb, len_sz-nb,
2091
0
                   rle_syms, rle_nsyms,
2092
0
                   b->data, &out_sz);
2093
0
    b->uncomp_size = out_sz;
2094
2095
0
    return 0;
2096
0
}
2097
2098
0
/*
 * Returns the size in bytes of the expanded XRLE data for this codec,
 * expanding (and caching) it first if that has not been done yet.
 *
 * If the data cannot be expanded, 0 is returned instead of dereferencing
 * a missing block.
 * NOTE(review): 0 was chosen because callers in this file store the result
 * in an unsigned length; confirm no caller expects a negative error value.
 */
int cram_xrle_decode_size(cram_slice *slice, cram_codec *c) {
    cram_block *b;

    if (cram_xrle_decode_expand_char(slice, c) < 0)
        return 0;

    b = slice->block_by_id[512 + c->codec_id];
    return b ? b->uncomp_size : 0;
}
2102
2103
0
/*
 * Returns the cached block holding the expanded XRLE data for this codec,
 * expanding it first if necessary.
 *
 * Returns NULL if the expansion fails.
 */
cram_block *cram_xrle_get_block(cram_slice *slice, cram_codec *c) {
    if (cram_xrle_decode_expand_char(slice, c) < 0)
        return NULL;

    return slice->block_by_id[512 + c->codec_id];
}
2107
2108
0
/*
 * Copies the next *out_size bytes of expanded XRLE data into out and
 * advances the read position of the cached block.  The expansion is done
 * once per slice and cached; "in" is not used.
 *
 * Returns 0 on success,
 *        -1 if the data cannot be expanded or fewer than *out_size bytes
 *           remain in the expanded block.
 */
int cram_xrle_decode_char(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
    int n = *out_size;

    if (cram_xrle_decode_expand_char(slice, c) < 0)
        return -1;

    cram_block *b = slice->block_by_id[512 + c->codec_id];
    if (!b)
        return -1;

    // The request size comes from the data stream, so never read beyond
    // what was actually expanded.
    int64_t avail = (int64_t)b->uncomp_size - (int64_t)b->idx;
    if (n < 0 || avail < n)
        return -1;

    if (n > 0) {
        if (!b->data)
            return -1;
        memcpy(out, b->data + b->idx, n);
        b->idx += n;
    }

    return 0;
}
2153
2154
87
/*
 * Releases an XRLE decoder together with whichever of its length and
 * literal sub-codecs have been created.  A NULL codec is ignored.
 */
void cram_xrle_decode_free(cram_codec *c) {
    if (c == NULL)
        return;

    cram_codec *len_sub = c->u.xrle.len_codec;
    if (len_sub != NULL)
        len_sub->free(len_sub);

    cram_codec *lit_sub = c->u.xrle.lit_codec;
    if (lit_sub != NULL)
        lit_sub->free(lit_sub);

    free(c);
}
2165
2166
/*
 * Builds an XRLE decoder from its serialised header (data, size).
 *
 * Header layout as parsed here: a count of run-length encoded symbols,
 * that many symbol values, then for the length codec and for the literal
 * codec in turn an encoding id, a parameter size and the parameter bytes.
 * The length codec is always created for E_INT; the literal codec uses the
 * caller's option.
 *
 * Returns the new codec on success,
 *         NULL on failure (allocation, unsupported option or malformed
 *         header).
 */
cram_codec *cram_xrle_decode_init(cram_block_compression_hdr *hdr,
                                  char *data, int size,
                                  enum cram_encoding codec,
                                  enum cram_external_type option,
                                  int version, varint_vec *vv) {
    char *cur = data;
    char *const limit = data + size;
    int bad = 0;
    cram_codec *xc = calloc(1, sizeof(*xc));

    if (xc == NULL)
        return NULL;

    xc->codec = E_XRLE;
    if (option == E_LONG) {
        xc->decode = cram_xrle_decode_long;
    } else if (option == E_INT) {
        xc->decode = cram_xrle_decode_int;
    } else if (option == E_BYTE || option == E_BYTE_ARRAY) {
        xc->decode = cram_xrle_decode_char;
    } else {
        fprintf(stderr, "BYTE_ARRAYs not supported by this codec\n");
        free(xc);
        return NULL;
    }
    xc->free      = cram_xrle_decode_free;
    xc->size      = cram_xrle_decode_size;
    xc->get_block = cram_xrle_get_block;
    xc->describe  = NULL;
    xc->u.xrle.cur_len = 0;
    xc->u.xrle.cur_lit = -1;

    // RLE map: flag every listed symbol that fits in a byte.  At most 256
    // entries are read.
    int nrle = vv->varint_get32(&cur, limit, &bad);
    memset(xc->u.xrle.rep_score, 0, 256*sizeof(*xc->u.xrle.rep_score));
    int idx = 0;
    while (idx < nrle && idx < 256) {
        int sym = vv->varint_get32(&cur, limit, &bad);
        if (sym >= 0 && sym < 256)
            xc->u.xrle.rep_score[sym] = 1;
        idx++;
    }

    // Length sub-encoding
    xc->u.xrle.len_encoding = vv->varint_get32(&cur, limit, &bad);
    int param_len = vv->varint_get32(&cur, limit, &bad);
    if (param_len < 0 || param_len > limit - cur)
        goto malformed;
    xc->u.xrle.len_codec = cram_decoder_init(hdr, xc->u.xrle.len_encoding,
                                             cur, param_len, E_INT,
                                             version, vv);
    if (!xc->u.xrle.len_codec)
        goto malformed;
    cur += param_len;

    // Literal sub-encoding
    xc->u.xrle.lit_encoding = vv->varint_get32(&cur, limit, &bad);
    param_len = vv->varint_get32(&cur, limit, &bad);
    if (param_len < 0 || param_len > limit - cur)
        goto malformed;
    xc->u.xrle.lit_codec = cram_decoder_init(hdr, xc->u.xrle.lit_encoding,
                                             cur, param_len, option,
                                             version, vv);
    if (!xc->u.xrle.lit_codec)
        goto malformed;
    cur += param_len;

    // Any varint read error seen along the way invalidates the header.
    if (!bad)
        return xc;

 malformed:
    fprintf(stderr, "Malformed xrle header stream\n");
    cram_xrle_decode_free(xc);
    return NULL;
}
2238
2239
0
/*
 * Run-length encodes the pending data and feeds the two resulting streams
 * to the sub-codecs.
 *
 * The data is the buffer cached in c->u.e_xrle.to_flush or, when none is
 * cached, the contents of c->out.  The length stream is prefixed with the
 * uncompressed size as a varint.
 *
 * Returns 0 on success,
 *        -1 on failure (allocation, RLE or sub-codec error).
 */
int cram_xrle_encode_flush(cram_codec *c) {
    uint8_t *out_lit = NULL, *out_len = NULL;
    uint64_t out_lit_size, out_len_size;
    uint8_t rle_syms[256];
    int rle_nsyms = 0, i;
    int ret = -1;

    // Collect the symbols flagged for run-length encoding.
    for (i = 0; i < 256; i++)
        if (c->u.e_xrle.rep_score[i] > 0)
            rle_syms[rle_nsyms++] = i;

    if (!c->u.e_xrle.to_flush) {
        c->u.e_xrle.to_flush = (char *)BLOCK_DATA(c->out);
        c->u.e_xrle.to_flush_size = BLOCK_SIZE(c->out);
    }

    out_len = malloc(c->u.e_xrle.to_flush_size+8);
    if (!out_len)
        return -1;

    int nb = var_put_u64(out_len, NULL, c->u.e_xrle.to_flush_size);

    // Passing NULL as the literal buffer makes hts_rle_encode return a
    // buffer that is released with free() below.
    out_lit = hts_rle_encode((uint8_t *)c->u.e_xrle.to_flush, c->u.e_xrle.to_flush_size,
                             out_len+nb, &out_len_size,
                             rle_syms, &rle_nsyms,
                             NULL, &out_lit_size);
    if (!out_lit)
        goto done;
    out_len_size += nb;


    // TODO: can maybe "gift" the sub codec the data block, to remove
    // one level of memcpy.
    if (c->u.e_xrle.len_codec->encode(NULL,
                                      c->u.e_xrle.len_codec,
                                      (char *)out_len, out_len_size))
        goto done;

    if (c->u.e_xrle.lit_codec->encode(NULL,
                                      c->u.e_xrle.lit_codec,
                                      (char *)out_lit, out_lit_size))
        goto done;

    ret = 0;

 done:
    // Both temporary buffers are released on every path.
    free(out_len);
    free(out_lit);

    return ret;
}
2284
2285
/*
 * Serialises the XRLE codec description into block b.
 *
 * Layout written: optional prefix string, codec id, length of the
 * remaining payload, number of run-length encoded symbols, the symbols
 * themselves, then the stored descriptions of the length and literal
 * sub-codecs.
 *
 * Returns the number of bytes written on success,
 *        -1 on failure.
 */
int cram_xrle_encode_store(cram_codec *c, cram_block *b,
                            char *prefix, int version) {
    int len = 0, r = 0, n;
    int ret = -1;
    cram_codec *tc;
    // Start as NULL so the shared exit can free whichever exist.
    cram_block *b_rle = NULL, *b_len = NULL, *b_lit = NULL;

    if (prefix) {
        size_t l = strlen(prefix);
        BLOCK_APPEND(b, prefix, l);
        len += l;
    }

    // List of symbols to RLE
    b_rle = cram_new_block(0, 0);
    if (!b_rle)
        goto block_err;
    int i, nrle = 0, len1 = 0;
    for (i = 0; i < 256; i++) {
        if (c->u.e_xrle.rep_score[i] > 0) {
            nrle++;
            len1 += (n = c->vv->varint_put32_blk(b_rle,i)); r |= n;
        }
    }

    // Store length and literal sub-codecs to get encoded length
    tc = c->u.e_xrle.len_codec;
    b_len = cram_new_block(0, 0);
    if (!b_len)
        goto block_err;
    int len2 = tc->store(tc, b_len, NULL, version);
    if (len2 < 0)
        goto block_err;

    tc = c->u.e_xrle.lit_codec;
    b_lit = cram_new_block(0, 0);
    if (!b_lit)
        goto block_err;
    int len3 = tc->store(tc, b_lit, NULL, version);
    if (len3 < 0)
        goto block_err;

    len += (n = c->vv->varint_put32_blk(b, c->codec)); r |= n;
    len += (n = c->vv->varint_put32_blk(b, len1 + len2 + len3
                                        + c->vv->varint_size(nrle))); r |= n;
    len += (n = c->vv->varint_put32_blk(b, nrle)); r |= n;
    BLOCK_APPEND(b, BLOCK_DATA(b_rle), BLOCK_SIZE(b_rle));
    BLOCK_APPEND(b, BLOCK_DATA(b_len), BLOCK_SIZE(b_len));
    BLOCK_APPEND(b, BLOCK_DATA(b_lit), BLOCK_SIZE(b_lit));

    if (r > 0)
        ret = len + len1 + len2 + len3;

    // Shared exit: reached on success and from every failure above,
    // including the BLOCK_APPEND failures.
 block_err:
    if (b_rle)
        cram_free_block(b_rle);
    if (b_len)
        cram_free_block(b_len);
    if (b_lit)
        cram_free_block(b_lit);

    return ret;
}
2340
2341
/*
 * XRLE encoder entry point installed by cram_xrle_encode_init() for
 * E_LONG data.  Not implemented (to be added if and when needed): no
 * argument is used and the call always fails.
 *
 * Returns -1.
 */
int cram_xrle_encode_long(cram_slice *slice, cram_codec *c,
                           char *in, int in_size) {
    (void)slice;
    (void)c;
    (void)in;
    (void)in_size;

    return -1;
}
2346
2347
/*
 * XRLE encoder entry point installed by cram_xrle_encode_init() for
 * E_INT data.  Not implemented (to be added if and when needed): no
 * argument is used and the call always fails.
 *
 * Returns -1.
 */
int cram_xrle_encode_int(cram_slice *slice, cram_codec *c,
                          char *in, int in_size) {
    (void)slice;
    (void)c;
    (void)in;
    (void)in_size;

    return -1;
}
2352
2353
/*
 * Queues a byte array for XRLE encoding; the actual encoding happens in
 * the flush callback.
 *
 * The first array seen is not copied: only its pointer and size are
 * remembered in to_flush / to_flush_size.  If another array arrives while
 * one is remembered, the remembered one is first copied into c->out, and
 * from then on arrays are appended to c->out.
 *
 * Returns 0 on success,
 *        -1 on allocation failure.
 */
int cram_xrle_encode_char(cram_slice *slice, cram_codec *c,
                          char *in, int in_size) {
    if (c->u.e_xrle.to_flush != NULL) {
        // A previous array is only held by pointer; move it into c->out.
        if (c->out == NULL) {
            c->out = cram_new_block(0, 0);
            if (c->out == NULL)
                return -1;
        }
        BLOCK_APPEND(c->out, c->u.e_xrle.to_flush, c->u.e_xrle.to_flush_size);
        c->u.e_xrle.to_flush_size = 0;
        c->u.e_xrle.to_flush = NULL;
    }

    if (!c->out || !(BLOCK_SIZE(c->out) > 0)) {
        // Nothing gathered so far: cache a reference to the caller's data
        // for flush to use directly.
        c->u.e_xrle.to_flush_size = in_size;
        c->u.e_xrle.to_flush = in;
        return 0;
    }

    // Gathering data
    BLOCK_APPEND(c->out, in, in_size);
    return 0;

 block_err:
    return -1;
}
2377
2378
0
/*
 * Releases an XRLE encoder: both sub-codecs (when present), its output
 * block and the codec structure itself.  A NULL codec is ignored.
 */
void cram_xrle_encode_free(cram_codec *c) {
    if (c != NULL) {
        cram_codec *len_sub = c->u.e_xrle.len_codec;
        if (len_sub != NULL)
            len_sub->free(len_sub);

        cram_codec *lit_sub = c->u.e_xrle.lit_codec;
        if (lit_sub != NULL)
            lit_sub->free(lit_sub);

        cram_free_block(c->out);
        free(c);
    }
}
2390
2391
/*
 * Creates an XRLE encoder.
 *
 * dat points to a cram_xrle_encoder supplying the encodings and parameters
 * of the length and literal sub-codecs (both created for E_BYTE data) and
 * the table of symbols to run-length encode.  option selects the encode
 * callback: the long and int variants for E_LONG / E_INT, the char variant
 * for everything else.
 *
 * Returns the new codec on success,
 *         NULL on failure (allocation or sub-codec initialisation).
 */
cram_codec *cram_xrle_encode_init(cram_stats *st,
                                  enum cram_encoding codec,
                                  enum cram_external_type option,
                                  void *dat,
                                  int version, varint_vec *vv) {
    cram_codec *c;

    if (!(c = malloc(sizeof(*c))))
        return NULL;

    c->codec  = E_XRLE;
    c->free   = cram_xrle_encode_free;
    if (option == E_LONG)
        c->encode = cram_xrle_encode_long;
    else if (option == E_INT)
        c->encode = cram_xrle_encode_int;
    else
        c->encode = cram_xrle_encode_char;
    c->store  = cram_xrle_encode_store;
    c->flush  = cram_xrle_encode_flush;

    cram_xrle_encoder *e = (cram_xrle_encoder *)dat;

    // Flush and store dereference both sub-codecs unconditionally, so fail
    // here if either cannot be created.  c->out is not initialised in this
    // function, hence the manual cleanup rather than cram_xrle_encode_free.
    c->u.e_xrle.len_codec = cram_encoder_init(e->len_encoding, NULL,
                                              E_BYTE, e->len_dat,
                                              version, vv);
    if (!c->u.e_xrle.len_codec) {
        free(c);
        return NULL;
    }

    c->u.e_xrle.lit_codec = cram_encoder_init(e->lit_encoding, NULL,
                                              E_BYTE, e->lit_dat,
                                              version, vv);
    if (!c->u.e_xrle.lit_codec) {
        c->u.e_xrle.len_codec->free(c->u.e_xrle.len_codec);
        free(c);
        return NULL;
    }

    c->u.e_xrle.cur_lit = -1;
    c->u.e_xrle.cur_len = -1;
    c->u.e_xrle.to_flush = NULL;
    c->u.e_xrle.to_flush_size = 0;

    memcpy(c->u.e_xrle.rep_score, e->rep_score, 256*sizeof(*c->u.e_xrle.rep_score));

    return c;
}
2429
2430
/*
2431
 * ---------------------------------------------------------------------------
2432
 * SUBEXP
2433
 */
2434
0
/*
 * Decodes *out_size sub-exponential coded integers from the bit stream
 * "in" into out, which is written as an array of int32_t.
 *
 * Each value starts with a unary count i of 1 bits:
 *   i > 0:  value = 2^(k+i-1) + the next k+i-1 bits
 *   i = 0:  value = the next k bits
 * and the codec's offset is subtracted from the result.
 *
 * Returns 0 on success,
 *        -1 on failure (truncated input or a code too wide for 32 bits).
 */
int cram_subexp_decode(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
    int32_t *out_i = (int32_t *)out;
    int n, count;
    int k = c->u.subexp.k;

    for (count = 0, n = *out_size; count < n; count++) {
        int i = 0, tail;
        int val;

        /* Get number of 1s */
        //while (get_bit_MSB(in) == 1) i++;
        i = get_one_bits_MSB(in);
        if (i < 0)
            return -1;

        // k comes from the file header and i from the data.  Reject any
        // combination where i + k - 1 exceeds 31: the shift below would be
        // undefined and the value cannot fit in 32 bits.  Written so the
        // sum itself cannot overflow.
        if (i > 0 && (k > 32 || i > 32 - k))
            return -1;

        if (cram_not_enough_bits(in, i > 0 ? i + k - 1 : k))
            return -1;
        /*
         * Val is
         * i > 0:  2^(k+i-1) + k+i-1 bits
         * i = 0:  k bits
         */
        if (i) {
            tail = i + k-1;
            val = 0;
            while (tail) {
                //val = val<<1; val |= get_bit_MSB(in);
                GET_BIT_MSB(in, val);
                tail--;
            }
            // Unsigned arithmetic so that a shift count of 31 is well
            // defined.
            val = (int)((unsigned int)val + (1u << (i + k-1)));
        } else {
            tail = k;
            val = 0;
            while (tail) {
                //val = val<<1; val |= get_bit_MSB(in);
                GET_BIT_MSB(in, val);
                tail--;
            }
        }

        out_i[count] = val - c->u.subexp.offset;
    }

    return 0;
}
2477
2478
1.23k
/*
 * Releases a SUBEXP decoder.  The codec owns no other allocations; a NULL
 * pointer is accepted because free(NULL) does nothing.
 */
void cram_subexp_decode_free(cram_codec *c) {
    free(c);
}
2482
2483
0
/*
 * Appends a textual description of a SUBEXP codec (its offset and k
 * parameters) to ks.
 *
 * Returns 0 on success,
 *        -1 if ksprintf reports a failure.
 */
int cram_subexp_describe(cram_codec *c, kstring_t *ks) {
    int written = ksprintf(ks, "SUBEXP(offset=%d,k=%d)",
                           c->u.subexp.offset,
                           c->u.subexp.k);

    if (written < 0)
        return -1;

    return 0;
}
2489
2490
/*
 * Builds a SUBEXP decoder from its serialised header (data, size), which
 * holds two varints: the offset followed by k.
 *
 * Only E_INT data is supported.  The header must be consumed exactly and
 * k must not be negative.
 *
 * Returns the new codec on success,
 *         NULL on failure.
 */
cram_codec *cram_subexp_decode_init(cram_block_compression_hdr *hdr,
                                    char *data, int size,
                                    enum cram_encoding codec,
                                    enum cram_external_type option,
                                    int version, varint_vec *vv) {
    char *cur = data;
    char *const limit = data + size;
    cram_codec *sc;

    if (option != E_INT) {
        hts_log_error("This codec only supports INT encodings");
        return NULL;
    }

    sc = malloc(sizeof(*sc));
    if (sc == NULL)
        return NULL;

    sc->codec    = E_SUBEXP;
    sc->decode   = cram_subexp_decode;
    sc->free     = cram_subexp_decode_free;
    sc->describe = cram_subexp_describe;

    sc->u.subexp.offset = vv->varint_get32(&cur, limit, NULL);
    sc->u.subexp.k      = vv->varint_get32(&cur, limit, NULL);

    if (cur - data == size && sc->u.subexp.k >= 0)
        return sc;

    hts_log_error("Malformed subexp header stream");
    free(sc);
    return NULL;
}
2523
2524
/*
2525
 * ---------------------------------------------------------------------------
2526
 * GAMMA
2527
 */
2528
0
/*
 * Decodes *out_size gamma coded integers from the bit stream "in" into
 * out, which is written as an array of int32_t.
 *
 * Each value is a run of nz zero bits, then an implicit leading 1 followed
 * by nz further bits; the codec's offset is subtracted from the result.
 *
 * Returns 0 on success,
 *        -1 if the input does not hold enough bits.
 */
int cram_gamma_decode(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
    int32_t *dst = (int32_t *)out;
    const int total = *out_size;
    int idx;

    for (idx = 0; idx < total; idx++) {
        int zeros = get_zero_bits_MSB(in);
        int val = 1;

        if (cram_not_enough_bits(in, zeros))
            return -1;

        // Shift in one payload bit per leading zero.
        for (; zeros > 0; zeros--) {
            GET_BIT_MSB(in, val);
        }

        dst[idx] = val - c->u.gamma.offset;
    }

    return 0;
}
2551
2552
2.04k
/*
 * Releases a GAMMA decoder.  The codec owns no other allocations; a NULL
 * pointer is accepted because free(NULL) does nothing.
 */
void cram_gamma_decode_free(cram_codec *c) {
    free(c);
}
2556
2557
0
/*
 * Appends a textual description of a GAMMA codec (its offset parameter)
 * to ks.
 *
 * The offset is read from the gamma member of the codec union, the same
 * member the GAMMA decoder and initialiser use.
 *
 * Returns 0 on success,
 *        -1 if ksprintf reports a failure.
 */
int cram_gamma_describe(cram_codec *c, kstring_t *ks) {
    return ksprintf(ks, "GAMMA(offset=%d)", c->u.gamma.offset)
        < 0 ? -1 : 0;
}
2561
2562
/*
 * Builds a GAMMA decoder from its serialised header (data, size), which
 * holds a single varint: the offset.
 *
 * Only E_INT data is supported.  The header must be non-empty and be
 * consumed exactly.
 *
 * Returns the new codec on success,
 *         NULL on failure.
 */
cram_codec *cram_gamma_decode_init(cram_block_compression_hdr *hdr,
                                   char *data, int size,
                                   enum cram_encoding codec,
                                   enum cram_external_type option,
                                   int version, varint_vec *vv) {
    char *cur = data;
    cram_codec *gc;

    if (option != E_INT) {
        hts_log_error("This codec only supports INT encodings");
        return NULL;
    }

    if (size < 1) {
        // Nothing has been allocated yet, so there is nothing to release.
        hts_log_error("Malformed gamma header stream");
        return NULL;
    }

    gc = malloc(sizeof(*gc));
    if (gc == NULL)
        return NULL;

    gc->codec    = E_GAMMA;
    gc->decode   = cram_gamma_decode;
    gc->free     = cram_gamma_decode_free;
    gc->describe = cram_gamma_describe;

    gc->u.gamma.offset = vv->varint_get32(&cur, data+size, NULL);

    if (cur - data != size) {
        hts_log_error("Malformed gamma header stream");
        free(gc);
        return NULL;
    }

    return gc;
}
2598
2599
/*
2600
 * ---------------------------------------------------------------------------
2601
 * HUFFMAN
2602
 */
2603
2604
2.05k
static int code_sort(const void *vp1, const void *vp2) {
2605
2.05k
    const cram_huffman_code *c1 = (const cram_huffman_code *)vp1;
2606
2.05k
    const cram_huffman_code *c2 = (const cram_huffman_code *)vp2;
2607
2608
2.05k
    if (c1->len != c2->len)
2609
471
        return c1->len - c2->len;
2610
1.58k
    else
2611
1.58k
        return c1->symbol < c2->symbol ? -1 : (c1->symbol > c2->symbol ? 1 : 0);
2612
2.05k
}
2613
2614
768
/*
 * Releases a HUFFMAN decoder and its code table.  A NULL codec is
 * ignored; a NULL code table is fine because free(NULL) does nothing.
 */
void cram_huffman_decode_free(cram_codec *c) {
    if (c != NULL) {
        free(c->u.huffman.codes);
        free(c);
    }
}
2622
2623
int cram_huffman_decode_null(cram_slice *slice, cram_codec *c,
2624
0
                             cram_block *in, char *out, int *out_size) {
2625
0
    return -1;
2626
0
}
2627
2628
int cram_huffman_decode_char0(cram_slice *slice, cram_codec *c,
2629
0
                              cram_block *in, char *out, int *out_size) {
2630
0
    int i, n;
2631
2632
0
    if (!out)
2633
0
        return 0;
2634
2635
    /* Special case of 0 length codes */
2636
0
    for (i = 0, n = *out_size; i < n; i++) {
2637
0
        out[i] = c->u.huffman.codes[0].symbol;
2638
0
    }
2639
0
    return 0;
2640
0
}
2641
2642
int cram_huffman_decode_char(cram_slice *slice, cram_codec *c,
2643
0
                             cram_block *in, char *out, int *out_size) {
2644
0
    int i, n, ncodes = c->u.huffman.ncodes;
2645
0
    const cram_huffman_code * const codes = c->u.huffman.codes;
2646
2647
0
    for (i = 0, n = *out_size; i < n; i++) {
2648
0
        int idx = 0;
2649
0
        int val = 0, len = 0, last_len = 0;
2650
2651
0
        for (;;) {
2652
0
            int dlen = codes[idx].len - last_len;
2653
0
            if (cram_not_enough_bits(in, dlen))
2654
0
                return -1;
2655
2656
            //val <<= dlen;
2657
            //val  |= get_bits_MSB(in, dlen);
2658
            //last_len = (len += dlen);
2659
2660
0
            last_len = (len += dlen);
2661
0
            for (; dlen; dlen--) GET_BIT_MSB(in, val);
2662
2663
0
            idx = val - codes[idx].p;
2664
0
            if (idx >= ncodes || idx < 0)
2665
0
                return -1;
2666
2667
0
            if (codes[idx].code == val && codes[idx].len == len) {
2668
0
                if (out) out[i] = codes[idx].symbol;
2669
0
                break;
2670
0
            }
2671
0
        }
2672
0
    }
2673
2674
0
    return 0;
2675
0
}
2676
2677
int cram_huffman_decode_int0(cram_slice *slice, cram_codec *c,
2678
0
                             cram_block *in, char *out, int *out_size) {
2679
0
    int32_t *out_i = (int32_t *)out;
2680
0
    int i, n;
2681
0
    const cram_huffman_code * const codes = c->u.huffman.codes;
2682
2683
    /* Special case of 0 length codes */
2684
0
    for (i = 0, n = *out_size; i < n; i++) {
2685
0
        out_i[i] = codes[0].symbol;
2686
0
    }
2687
0
    return 0;
2688
0
}
2689
2690
int cram_huffman_decode_int(cram_slice *slice, cram_codec *c,
2691
0
                            cram_block *in, char *out, int *out_size) {
2692
0
    int32_t *out_i = (int32_t *)out;
2693
0
    int i, n, ncodes = c->u.huffman.ncodes;
2694
0
    const cram_huffman_code * const codes = c->u.huffman.codes;
2695
2696
0
    for (i = 0, n = *out_size; i < n; i++) {
2697
0
        int idx = 0;
2698
0
        int val = 0, len = 0, last_len = 0;
2699
2700
        // Now one bit at a time for remaining checks
2701
0
        for (;;) {
2702
0
            int dlen = codes[idx].len - last_len;
2703
0
            if (cram_not_enough_bits(in, dlen))
2704
0
                return -1;
2705
2706
            //val <<= dlen;
2707
            //val  |= get_bits_MSB(in, dlen);
2708
            //last_len = (len += dlen);
2709
2710
0
            last_len = (len += dlen);
2711
0
            for (; dlen; dlen--) GET_BIT_MSB(in, val);
2712
2713
0
            idx = val - codes[idx].p;
2714
0
            if (idx >= ncodes || idx < 0)
2715
0
                return -1;
2716
2717
0
            if (codes[idx].code == val && codes[idx].len == len) {
2718
0
                out_i[i] = codes[idx].symbol;
2719
0
                break;
2720
0
            }
2721
0
        }
2722
0
    }
2723
2724
0
    return 0;
2725
0
}
2726
2727
int cram_huffman_decode_long0(cram_slice *slice, cram_codec *c,
2728
0
                              cram_block *in, char *out, int *out_size) {
2729
0
    int64_t *out_i = (int64_t *)out;
2730
0
    int i, n;
2731
0
    const cram_huffman_code * const codes = c->u.huffman.codes;
2732
2733
    /* Special case of 0 length codes */
2734
0
    for (i = 0, n = *out_size; i < n; i++) {
2735
0
        out_i[i] = codes[0].symbol;
2736
0
    }
2737
0
    return 0;
2738
0
}
2739
2740
int cram_huffman_decode_long(cram_slice *slice, cram_codec *c,
2741
0
                             cram_block *in, char *out, int *out_size) {
2742
0
    int64_t *out_i = (int64_t *)out;
2743
0
    int i, n, ncodes = c->u.huffman.ncodes;
2744
0
    const cram_huffman_code * const codes = c->u.huffman.codes;
2745
2746
0
    for (i = 0, n = *out_size; i < n; i++) {
2747
0
        int idx = 0;
2748
0
        int val = 0, len = 0, last_len = 0;
2749
2750
        // Now one bit at a time for remaining checks
2751
0
        for (;;) {
2752
0
            int dlen = codes[idx].len - last_len;
2753
0
            if (cram_not_enough_bits(in, dlen))
2754
0
                return -1;
2755
2756
            //val <<= dlen;
2757
            //val  |= get_bits_MSB(in, dlen);
2758
            //last_len = (len += dlen);
2759
2760
0
            last_len = (len += dlen);
2761
0
            for (; dlen; dlen--) GET_BIT_MSB(in, val);
2762
2763
0
            idx = val - codes[idx].p;
2764
0
            if (idx >= ncodes || idx < 0)
2765
0
                return -1;
2766
2767
0
            if (codes[idx].code == val && codes[idx].len == len) {
2768
0
                out_i[i] = codes[idx].symbol;
2769
0
                break;
2770
0
            }
2771
0
        }
2772
0
    }
2773
2774
0
    return 0;
2775
0
}
2776
2777
0
/*
 * Appends a textual description of a huffman decoder to 'ks', in the form
 * "HUFFMAN(codes={s0,s1,...},lengths={l0,l1,...})".
 *
 * Returns 0 on success;
 *         non-zero if any ksprintf call reported failure.
 */
int cram_huffman_describe(cram_codec *c, kstring_t *ks) {
    int r = 0, n;
    r |= ksprintf(ks, "HUFFMAN(codes={") < 0;
    for (n = 0; n < c->u.huffman.ncodes; n++) {
        // As with the other calls here, only a negative ksprintf result
        // is a failure; the raw return value must not be OR-ed into r.
        r |= ksprintf(ks, "%s%"PRId64, n?",":"",
                      c->u.huffman.codes[n].symbol) < 0;
    }
    r |= ksprintf(ks, "},lengths={") < 0;
    for (n = 0; n < c->u.huffman.ncodes; n++) {
        r |= ksprintf(ks, "%s%d", n?",":"",
                      c->u.huffman.codes[n].len) < 0;
    }
    r |= ksprintf(ks, "})") < 0;
    return r;
}
2792
2793
/*
2794
 * Initialises a huffman decoder from an encoding data stream.
2795
 */
2796
cram_codec *cram_huffman_decode_init(cram_block_compression_hdr *hdr,
2797
                                     char *data, int size,
2798
                                     enum cram_encoding codec,
2799
                                     enum cram_external_type option,
2800
918
                                     int version, varint_vec *vv) {
2801
918
    int32_t ncodes = 0, i, j;
2802
918
    char *cp = data, *data_end = &data[size];
2803
918
    cram_codec *h;
2804
918
    cram_huffman_code *codes = NULL;
2805
918
    int32_t val, last_len, max_len = 0;
2806
918
    uint32_t max_val; // needs one more bit than val
2807
918
    const int max_code_bits = sizeof(val) * 8 - 1;
2808
918
    int err = 0;
2809
2810
918
    if (option == E_BYTE_ARRAY_BLOCK) {
2811
3
        hts_log_error("BYTE_ARRAYs not supported by this codec");
2812
3
        return NULL;
2813
3
    }
2814
2815
915
    ncodes = vv->varint_get32(&cp, data_end, &err);
2816
915
    if (ncodes < 0) {
2817
0
        hts_log_error("Invalid number of symbols in huffman stream");
2818
0
        return NULL;
2819
0
    }
2820
915
    if (ncodes >= SIZE_MAX / sizeof(*codes)) {
2821
0
        errno = ENOMEM;
2822
0
        return NULL;
2823
0
    }
2824
915
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
2825
915
    if (ncodes > FUZZ_ALLOC_LIMIT / sizeof(*codes)) {
2826
6
        errno = ENOMEM;
2827
6
        return NULL;
2828
6
    }
2829
909
#endif
2830
909
    h = calloc(1, sizeof(*h));
2831
909
    if (!h)
2832
0
        return NULL;
2833
2834
909
    h->codec  = E_HUFFMAN;
2835
909
    h->free   = cram_huffman_decode_free;
2836
2837
909
    h->u.huffman.ncodes = ncodes;
2838
909
    h->u.huffman.option = option;
2839
909
    if (ncodes) {
2840
711
        codes = h->u.huffman.codes = malloc(ncodes * sizeof(*codes));
2841
711
        if (!codes) {
2842
0
            free(h);
2843
0
            return NULL;
2844
0
        }
2845
711
    } else {
2846
198
        codes = h->u.huffman.codes = NULL;
2847
198
    }
2848
2849
    /* Read symbols and bit-lengths */
2850
909
    if (option == E_LONG) {
2851
12.6M
        for (i = 0; i < ncodes; i++)
2852
12.6M
            codes[i].symbol = vv->varint_get64(&cp, data_end, &err);
2853
870
    } else if (option == E_INT || option == E_BYTE) {
2854
3.56k
        for (i = 0; i < ncodes; i++)
2855
2.70k
            codes[i].symbol = vv->varint_get32(&cp, data_end, &err);
2856
861
    } else {
2857
9
        goto malformed;
2858
9
    }
2859
2860
900
    if (err)
2861
63
        goto malformed;
2862
2863
837
    i = vv->varint_get32(&cp, data_end, &err);
2864
837
    if (i != ncodes)
2865
12
        goto malformed;
2866
2867
825
    if (ncodes == 0) {
2868
        /* NULL huffman stream.  Ensure it returns an error if
2869
           anything tries to use it. */
2870
189
        h->decode = cram_huffman_decode_null;
2871
189
        return h;
2872
189
    }
2873
2874
2.37k
    for (i = 0; i < ncodes; i++) {
2875
1.75k
        codes[i].len = vv->varint_get32(&cp, data_end, &err);
2876
1.75k
        if (err)
2877
9
            break;
2878
1.74k
        if (codes[i].len < 0) {
2879
12
            hts_log_error("Huffman code length (%d) is negative", codes[i].len);
2880
12
            goto malformed;
2881
12
        }
2882
1.73k
        if (max_len < codes[i].len)
2883
423
            max_len = codes[i].len;
2884
1.73k
    }
2885
624
    if (err || cp - data != size || max_len >= ncodes)
2886
21
        goto malformed;
2887
2888
    /* 31 is max. bits available in val */
2889
603
    if (max_len > max_code_bits) {
2890
3
        hts_log_error("Huffman code length (%d) is greater "
2891
3
                      "than maximum supported (%d)", max_len, max_code_bits);
2892
3
        goto malformed;
2893
3
    }
2894
2895
    /* Sort by bit length and then by symbol value */
2896
600
    qsort(codes, ncodes, sizeof(*codes), code_sort);
2897
2898
    /* Assign canonical codes */
2899
600
    val = -1, last_len = 0, max_val = 0;
2900
1.75k
    for (i = 0; i < ncodes; i++) {
2901
1.17k
        val++;
2902
1.17k
        if (val > max_val)
2903
21
            goto malformed;
2904
2905
1.15k
        if (codes[i].len > last_len) {
2906
360
            val <<= (codes[i].len - last_len);
2907
360
            last_len = codes[i].len;
2908
360
            max_val = (1U << codes[i].len) - 1;
2909
360
        }
2910
1.15k
        codes[i].code = val;
2911
1.15k
    }
2912
2913
    /*
2914
     * Compute the next starting point, offset by the i'th value.
2915
     * For example if codes 10, 11, 12, 13 are 30, 31, 32, 33 then
2916
     * codes[10..13].p = 30 - 10.
2917
     */
2918
579
    last_len = 0;
2919
1.71k
    for (i = j = 0; i < ncodes; i++) {
2920
1.13k
        if (codes[i].len > last_len) {
2921
360
            j = codes[i].code - i;
2922
360
            last_len = codes[i].len;
2923
360
        }
2924
1.13k
        codes[i].p = j;
2925
1.13k
    }
2926
2927
    // puts("==HUFF LEN==");
2928
    // for (i = 0; i <= last_len+1; i++) {
2929
    //     printf("len %d=%d prefix %d\n", i, h->u.huffman.lengths[i], h->u.huffman.prefix[i]);
2930
    // }
2931
    // puts("===HUFFMAN CODES===");
2932
    // for (i = 0; i < ncodes; i++) {
2933
    //     int j;
2934
    //     printf("%d: %d %d %d ", i, codes[i].symbol, codes[i].len, codes[i].code);
2935
    //     j = codes[i].len;
2936
    //     while (j) {
2937
    //         putchar(codes[i].code & (1 << --j) ? '1' : '0');
2938
    //     }
2939
    //     printf(" %d\n", codes[i].code);
2940
    // }
2941
2942
579
    if (option == E_BYTE || option == E_BYTE_ARRAY) {
2943
450
        if (h->u.huffman.codes[0].len == 0)
2944
216
            h->decode = cram_huffman_decode_char0;
2945
234
        else
2946
234
            h->decode = cram_huffman_decode_char;
2947
450
    } else if (option == E_LONG || option == E_SLONG) {
2948
0
        if (h->u.huffman.codes[0].len == 0)
2949
0
            h->decode = cram_huffman_decode_long0;
2950
0
        else
2951
0
            h->decode = cram_huffman_decode_long;
2952
129
    } else if (option == E_INT || option == E_SINT || option == E_BYTE) {
2953
129
        if (h->u.huffman.codes[0].len == 0)
2954
33
            h->decode = cram_huffman_decode_int0;
2955
96
        else
2956
96
            h->decode = cram_huffman_decode_int;
2957
129
    } else {
2958
0
        return NULL;
2959
0
    }
2960
579
    h->describe = cram_huffman_describe;
2961
2962
579
    return (cram_codec *)h;
2963
2964
141
 malformed:
2965
141
    hts_log_error("Malformed huffman header stream");
2966
141
    free(codes);
2967
141
    free(h);
2968
141
    return NULL;
2969
579
}
2970
2971
/*
 * Byte encoder used when the single huffman code has zero length:
 * no bits need to be emitted, so this does nothing.
 *
 * Always returns 0.
 */
int cram_huffman_encode_char0(cram_slice *slice, cram_codec *c,
                              char *in, int in_size) {
    return 0;
}
2975
2976
/*
 * Huffman-encodes in_size bytes from 'in', appending the code bits for
 * each byte to c->out.
 *
 * Returns the OR of all store_bits_MSB() results (0 if they were all 0);
 *         -1 if a symbol has no entry in the code table.
 */
int cram_huffman_encode_char(cram_slice *slice, cram_codec *c,
                             char *in, int in_size) {
    const unsigned char *cursor = (unsigned char *)in;
    int status = 0;
    int remaining;

    for (remaining = in_size; remaining != 0; remaining--) {
        int sym = *cursor++;
        int slot, code, len;

        if (sym >= -1 && sym < MAX_HUFF) {
            /* Direct lookup; the table is offset by one to admit -1 */
            slot = c->u.e_huffman.val2code[sym+1];
            assert(c->u.e_huffman.codes[slot].symbol == sym);
        } else {
            /* Slow - use a lookup table for when sym < MAX_HUFF? */
            slot = 0;
            while (slot < c->u.e_huffman.nvals &&
                   c->u.e_huffman.codes[slot].symbol != sym)
                slot++;
            if (slot == c->u.e_huffman.nvals)
                return -1;
        }

        code = c->u.e_huffman.codes[slot].code;
        len  = c->u.e_huffman.codes[slot].len;
        status |= store_bits_MSB(c->out, code, len);
    }

    return status;
}
3006
3007
/*
 * Integer encoder used when the single huffman code has zero length:
 * no bits need to be emitted, so this does nothing.
 *
 * Always returns 0.
 */
int cram_huffman_encode_int0(cram_slice *slice, cram_codec *c,
                             char *in, int in_size) {
    return 0;
}
3011
3012
/*
 * Huffman-encodes in_size ints read from 'in', appending the code bits
 * for each value to c->out.
 *
 * Returns the OR of all store_bits_MSB() results (0 if they were all 0);
 *         -1 if a symbol has no entry in the code table.
 */
int cram_huffman_encode_int(cram_slice *slice, cram_codec *c,
                            char *in, int in_size) {
    int *cursor = (int *)in;
    int status = 0;
    int remaining;

    for (remaining = in_size; remaining != 0; remaining--) {
        int sym = *cursor++;
        int slot, code, len;

        if (sym >= -1 && sym < MAX_HUFF) {
            /* Direct lookup; the table is offset by one to admit -1 */
            slot = c->u.e_huffman.val2code[sym+1];
            assert(c->u.e_huffman.codes[slot].symbol == sym);
        } else {
            /* Slow - use a lookup table for when sym < MAX_HUFFMAN_SYM? */
            slot = 0;
            while (slot < c->u.e_huffman.nvals &&
                   c->u.e_huffman.codes[slot].symbol != sym)
                slot++;
            if (slot == c->u.e_huffman.nvals)
                return -1;
        }

        code = c->u.e_huffman.codes[slot].code;
        len  = c->u.e_huffman.codes[slot].len;
        status |= store_bits_MSB(c->out, code, len);
    }

    return status;
}
3043
3044
/*
 * 64-bit integer encoder used when the single huffman code has zero
 * length: no bits need to be emitted, so this does nothing.
 *
 * Always returns 0.
 */
int cram_huffman_encode_long0(cram_slice *slice, cram_codec *c,
                              char *in, int in_size) {
    return 0;
}
3048
3049
/*
 * Huffman-encodes in_size 64-bit integers read from 'in', appending the
 * code bits for each value to c->out.
 *
 * Returns the OR of all store_bits_MSB() results (0 if they were all 0);
 *         -1 if a symbol has no entry in the code table.
 */
int cram_huffman_encode_long(cram_slice *slice, cram_codec *c,
                             char *in, int in_size) {
    int i, code, len, r = 0;
    int64_t *syms = (int64_t *)in;

    while (in_size--) {
        // Keep the full 64-bit value: narrowing to int would make large
        // symbols alias small ones in the lookups below.
        int64_t sym = *syms++;

        if (sym >= -1 && sym < MAX_HUFF) {
            /* Direct lookup; the table is offset by one to admit -1 */
            i = c->u.e_huffman.val2code[sym+1];
            assert(c->u.e_huffman.codes[i].symbol == sym);
            code = c->u.e_huffman.codes[i].code;
            len  = c->u.e_huffman.codes[i].len;
        } else {
            /* Slow - use a lookup table for when sym < MAX_HUFFMAN_SYM? */
            for (i = 0; i < c->u.e_huffman.nvals; i++) {
                if (c->u.e_huffman.codes[i].symbol == sym)
                    break;
            }
            if (i == c->u.e_huffman.nvals)
                return -1;

            code = c->u.e_huffman.codes[i].code;
            len  = c->u.e_huffman.codes[i].len;
        }

        r |= store_bits_MSB(c->out, code, len);
    }

    return r;
}
3080
3081
516k
/*
 * Releases a huffman encoder: its code table and the codec itself.
 * A NULL codec is ignored.
 */
void cram_huffman_encode_free(cram_codec *c) {
    if (c) {
        /* free(NULL) is a no-op, so no separate check is needed */
        free(c->u.e_huffman.codes);
        free(c);
    }
}
3089
3090
/*
3091
 * Encodes a huffman tree.
3092
 * Returns number of bytes written.
3093
 */
3094
int cram_huffman_encode_store(cram_codec *c, cram_block *b, char *prefix,
3095
515k
                              int version) {
3096
515k
    int i, len = 0, r = 0, n;
3097
515k
    cram_huffman_code *codes = c->u.e_huffman.codes;
3098
    /*
3099
     * Up to code length 127 means 2.5e+26 bytes of data required (worst
3100
     * case huffman tree needs symbols with freqs matching the Fibonacci
3101
     * series). So guaranteed 1 byte per code.
3102
     *
3103
     * Symbols themselves could be 5 bytes (eg -1 is 5 bytes in itf8).
3104
     *
3105
     * Therefore 6*ncodes + 5 + 5 + 1 + 5 is max memory
3106
     */
3107
515k
    char *tmp = malloc(6*c->u.e_huffman.nvals+16);
3108
515k
    char *tp = tmp, *tpend = tmp+6*c->u.e_huffman.nvals+16;
3109
3110
515k
    if (!tmp)
3111
0
        return -1;
3112
3113
515k
    if (prefix) {
3114
465k
        size_t l = strlen(prefix);
3115
465k
        BLOCK_APPEND(b, prefix, l);
3116
465k
        len += l;
3117
465k
    }
3118
3119
515k
    tp += c->vv->varint_put32(tp, tpend, c->u.e_huffman.nvals);
3120
515k
    if (c->u.e_huffman.option == E_LONG) {
3121
0
        for (i = 0; i < c->u.e_huffman.nvals; i++) {
3122
0
            tp += c->vv->varint_put64(tp, tpend, codes[i].symbol);
3123
0
        }
3124
515k
    } else if (c->u.e_huffman.option == E_SLONG) {
3125
0
        for (i = 0; i < c->u.e_huffman.nvals; i++) {
3126
0
            tp += c->vv->varint_put64s(tp, tpend, codes[i].symbol);
3127
0
        }
3128
515k
    } else if (c->u.e_huffman.option == E_INT || c->u.e_huffman.option == E_BYTE) {
3129
1.03M
        for (i = 0; i < c->u.e_huffman.nvals; i++) {
3130
515k
            tp += c->vv->varint_put32(tp, tpend, codes[i].symbol);
3131
515k
        }
3132
515k
    } else if (c->u.e_huffman.option == E_SINT) {
3133
0
        for (i = 0; i < c->u.e_huffman.nvals; i++) {
3134
0
            tp += c->vv->varint_put32s(tp, tpend, codes[i].symbol);
3135
0
        }
3136
0
    } else {
3137
0
        return -1;
3138
0
    }
3139
3140
515k
    tp += c->vv->varint_put32(tp, tpend, c->u.e_huffman.nvals);
3141
1.03M
    for (i = 0; i < c->u.e_huffman.nvals; i++)
3142
515k
        tp += c->vv->varint_put32(tp, tpend, codes[i].len);
3143
3144
515k
    len += (n = c->vv->varint_put32_blk(b, c->codec)); r |= n;
3145
515k
    len += (n = c->vv->varint_put32_blk(b, tp-tmp));   r |= n;
3146
515k
    BLOCK_APPEND(b, tmp, tp-tmp);
3147
515k
    len += tp-tmp;
3148
3149
515k
    free(tmp);
3150
3151
515k
    if (r > 0)
3152
515k
        return len;
3153
3154
0
 block_err:
3155
0
    return -1;
3156
515k
}
3157
3158
/*
 * Builds a huffman encoder from the symbol frequencies held in 'st'.
 *
 * Symbols are gathered from st->freqs[] and, when present, the st->h
 * hash.  A huffman tree is built to obtain code lengths, the codes are
 * sorted with code_sort and then given canonical code values.
 *
 * Returns a newly allocated codec on success;
 *         NULL on memory failure or if 'option' is not one of the
 *         supported data types.
 */
cram_codec *cram_huffman_encode_init(cram_stats *st,
                                     enum cram_encoding codec,
                                     enum cram_external_type option,
                                     void *dat,
                                     int version, varint_vec *vv) {
    int *vals = NULL, *freqs = NULL, *lens = NULL, code, len;
    int *new_vals, *new_freqs;
    int i, k;
    size_t nvals, vals_alloc = 0;
    cram_codec *c;
    cram_huffman_code *codes;

    c = malloc(sizeof(*c));
    if (!c)
        return NULL;
    c->codec = E_HUFFMAN;

    /* Count number of unique symbols */
    for (nvals = i = 0; i < MAX_STAT_VAL; i++) {
        if (!st->freqs[i])
            continue;
        if (nvals >= vals_alloc) {
            vals_alloc = vals_alloc ? vals_alloc*2 : 1024;
            new_vals  = realloc(vals,  vals_alloc * sizeof(int));
            if (!new_vals) goto nomem;
            vals = new_vals;
            new_freqs = realloc(freqs, vals_alloc * sizeof(int));
            if (!new_freqs) goto nomem;
            freqs = new_freqs;
        }
        vals[nvals] = i;
        freqs[nvals] = st->freqs[i];
        assert(st->freqs[i] > 0);
        nvals++;
    }
    if (st->h) {
        khint_t hk;

        for (hk = kh_begin(st->h); hk != kh_end(st->h); hk++) {
            if (!kh_exist(st->h, hk))
                continue;
            if (nvals >= vals_alloc) {
                vals_alloc = vals_alloc ? vals_alloc*2 : 1024;
                new_vals  = realloc(vals,  vals_alloc * sizeof(int));
                if (!new_vals) goto nomem;
                vals = new_vals;
                new_freqs = realloc(freqs, vals_alloc * sizeof(int));
                if (!new_freqs) goto nomem;
                freqs = new_freqs;
            }
            vals[nvals]= kh_key(st->h, hk);
            freqs[nvals] = kh_val(st->h, hk);
            assert(freqs[nvals] > 0);
            nvals++;
        }
    }

    assert(nvals > 0);

    /* Room for the leaves plus the internal nodes added below */
    new_freqs = realloc(freqs, 2*nvals*sizeof(*freqs));
    if (!new_freqs) goto nomem;
    freqs = new_freqs;
    lens = calloc(2*nvals, sizeof(*lens));
    if (!lens) goto nomem;

    /* Inefficient, use pointers to form chain so we can insert and maintain
     * a sorted list? This is currently O(nvals^2) complexity.
     */
    for (;;) {
        int low1 = INT_MAX, low2 = INT_MAX;
        int ind1 = 0, ind2 = 0;
        for (i = 0; i < nvals; i++) {
            /* Negative frequency marks a node already merged */
            if (freqs[i] < 0)
                continue;
            if (low1 > freqs[i])
                low2 = low1, ind2 = ind1, low1 = freqs[i], ind1 = i;
            else if (low2 > freqs[i])
                low2 = freqs[i], ind2 = i;
        }
        if (low2 == INT_MAX)
            break;

        /* Merge the two least frequent nodes; lens[] holds parent links */
        freqs[nvals] = low1 + low2;
        lens[ind1] = nvals;
        lens[ind2] = nvals;
        freqs[ind1] *= -1;
        freqs[ind2] *= -1;
        nvals++;
    }
    nvals = nvals/2+1;

    /* Assign lengths */
    for (i = 0; i < nvals; i++) {
        int code_len = 0;
        for (k = lens[i]; k; k = lens[k])
            code_len++;
        lens[i] = code_len;
        freqs[i] *= -1;
    }


    /* Sort, need in a struct */
    if (!(codes = malloc(nvals * sizeof(*codes))))
        goto nomem;
    for (i = 0; i < nvals; i++) {
        codes[i].symbol = vals[i];
        codes[i].len = lens[i];
    }
    qsort(codes, nvals, sizeof(*codes), code_sort);

    /*
     * Generate canonical codes from lengths.
     * Sort by length.
     * Start with 0.
     * Every new code of same length is +1.
     * Every new code of new length is +1 then <<1 per extra length.
     *
     * /\
     * a/\
     * /\/\
     * bcd/\
     *    ef
     *
     * a 1  0
     * b 3  4 (0+1)<<2
     * c 3  5
     * d 3  6
     * e 4  14  (6+1)<<1
     * f 5  15
     */
    code = 0; len = codes[0].len;
    for (i = 0; i < nvals; i++) {
        while (len != codes[i].len) {
            code<<=1;
            len++;
        }
        codes[i].code = code++;

        if (codes[i].symbol >= -1 && codes[i].symbol < MAX_HUFF)
            c->u.e_huffman.val2code[codes[i].symbol+1] = i;
    }

    free(lens);
    free(vals);
    free(freqs);

    c->u.e_huffman.codes = codes;
    c->u.e_huffman.nvals = nvals;
    c->u.e_huffman.option = option;

    c->free = cram_huffman_encode_free;
    if (option == E_BYTE || option == E_BYTE_ARRAY) {
        if (c->u.e_huffman.codes[0].len == 0)
            c->encode = cram_huffman_encode_char0;
        else
            c->encode = cram_huffman_encode_char;
    } else if (option == E_INT || option == E_SINT) {
        if (c->u.e_huffman.codes[0].len == 0)
            c->encode = cram_huffman_encode_int0;
        else
            c->encode = cram_huffman_encode_int;
    } else if (option == E_LONG || option == E_SLONG) {
        if (c->u.e_huffman.codes[0].len == 0)
            c->encode = cram_huffman_encode_long0;
        else
            c->encode = cram_huffman_encode_long;
    } else {
        // Unsupported data type: release the codec and its code table
        // rather than leaking them.
        cram_huffman_encode_free(c);
        return NULL;
    }
    c->store = cram_huffman_encode_store;
    c->flush = NULL;

    return c;

 nomem:
    hts_log_error("Out of memory");
    free(vals);
    free(freqs);
    free(lens);
    free(c);
    return NULL;
}
3348
3349
/*
3350
 * ---------------------------------------------------------------------------
3351
 * BYTE_ARRAY_LEN
3352
 */
3353
int cram_byte_array_len_decode(cram_slice *slice, cram_codec *c,
3354
                               cram_block *in, char *out,
3355
0
                               int *out_size) {
3356
    /* Fetch length */
3357
0
    int32_t len = 0, one = 1;
3358
0
    int r;
3359
3360
0
    r = c->u.byte_array_len.len_codec->decode(slice, c->u.byte_array_len.len_codec,
3361
0
                                              in, (char *)&len, &one);
3362
    //printf("ByteArray Len=%d\n", len);
3363
3364
0
    if (!r && c->u.byte_array_len.val_codec && len >= 0) {
3365
0
        r = c->u.byte_array_len.val_codec->decode(slice,
3366
0
                                                  c->u.byte_array_len.val_codec,
3367
0
                                                  in, out, &len);
3368
0
    } else {
3369
0
        return -1;
3370
0
    }
3371
3372
0
    *out_size = len;
3373
3374
0
    return r;
3375
0
}
3376
3377
1.45k
/*
 * Releases a BYTE_ARRAY_LEN decoder together with whichever of its two
 * sub-codecs have been created.  A NULL codec is ignored.
 */
void cram_byte_array_len_decode_free(cram_codec *c) {
    if (c) {
        cram_codec *sub;

        sub = c->u.byte_array_len.len_codec;
        if (sub)
            sub->free(sub);

        sub = c->u.byte_array_len.val_codec;
        if (sub)
            sub->free(sub);

        free(c);
    }
}
3388
3389
0
/*
 * Appends a textual description of a BYTE_ARRAY_LEN decoder to 'ks' in
 * the form "BYTE_ARRAY_LEN(len_codec={...},val_codec={...}".  A sub-codec
 * with no describe callback is shown as "?".
 *
 * Returns 0 on success, non-zero if any step reported failure.
 */
int cram_byte_array_len_describe(cram_codec *c, kstring_t *ks) {
    cram_byte_array_len_decoder *sub = &c->u.byte_array_len;
    int failed = 0;

    failed |= ksprintf(ks, "BYTE_ARRAY_LEN(len_codec={") < 0;
    if (sub->len_codec->describe)
        failed |= sub->len_codec->describe(sub->len_codec, ks);
    else
        failed |= (ksprintf(ks, "?")<0);

    failed |= ksprintf(ks, "},val_codec={") < 0;
    if (sub->val_codec->describe)
        failed |= sub->val_codec->describe(sub->val_codec, ks);
    else
        failed |= (ksprintf(ks, "?")<0);

    failed |= ksprintf(ks, "}") < 0;

    return failed;
}
3404
3405
/*
 * Initialises a BYTE_ARRAY_LEN decoder from data[0..size-1].
 *
 * The stream holds two nested codec descriptions, each as an encoding
 * id, a byte size and that many bytes of parameters: first the codec
 * for the lengths (initialised as E_INT), then the codec for the values
 * (initialised with 'option').
 *
 * Returns a newly allocated codec on success;
 *         NULL on allocation failure, a malformed stream, or if either
 *         sub-codec could not be created.
 */
cram_codec *cram_byte_array_len_decode_init(cram_block_compression_hdr *hdr,
                                            char *data, int size,
                                            enum cram_encoding codec,
                                            enum cram_external_type option,
                                            int version, varint_vec *vv) {
    char *cursor = data;
    char *limit  = data + size;
    int enc_id, enc_bytes;
    cram_codec *dec = malloc(sizeof(*dec));

    if (dec == NULL)
        return NULL;

    /* Sub-codecs start as NULL so the free function is safe at any point */
    dec->u.byte_array_len.len_codec = NULL;
    dec->u.byte_array_len.val_codec = NULL;
    dec->describe = cram_byte_array_len_describe;
    dec->free     = cram_byte_array_len_decode_free;
    dec->decode   = cram_byte_array_len_decode;
    dec->codec    = E_BYTE_ARRAY_LEN;

    /* Length sub-codec */
    enc_id    = vv->varint_get32(&cursor, limit, NULL);
    enc_bytes = vv->varint_get32(&cursor, limit, NULL);
    if (enc_bytes < 0 || limit - cursor < enc_bytes)
        goto malformed;
    dec->u.byte_array_len.len_codec = cram_decoder_init(hdr, enc_id, cursor,
                                                        enc_bytes, E_INT,
                                                        version, vv);
    if (!dec->u.byte_array_len.len_codec)
        goto no_codec;
    cursor += enc_bytes;

    /* Value sub-codec */
    enc_id    = vv->varint_get32(&cursor, limit, NULL);
    enc_bytes = vv->varint_get32(&cursor, limit, NULL);
    if (enc_bytes < 0 || limit - cursor < enc_bytes)
        goto malformed;
    dec->u.byte_array_len.val_codec = cram_decoder_init(hdr, enc_id, cursor,
                                                        enc_bytes, option,
                                                        version, vv);
    if (!dec->u.byte_array_len.val_codec)
        goto no_codec;
    cursor += enc_bytes;

    /* The two descriptions must account for the whole stream */
    if (cursor - data != size)
        goto malformed;

    return dec;

 malformed:
    hts_log_error("Malformed byte_array_len header stream");
 no_codec:
    cram_byte_array_len_decode_free(dec);
    return NULL;
}
3455
3456
/*
 * Encodes one BYTE_ARRAY_LEN item: in_size is passed to the length
 * sub-codec as a single 32-bit value, then the in_size bytes at 'in'
 * are passed to the value sub-codec.
 *
 * Returns the OR of the two sub-codec results (0 if both returned 0).
 */
int cram_byte_array_len_encode(cram_slice *slice, cram_codec *c,
                               char *in, int in_size) {
    int32_t n_bytes = in_size;
    int len_status, val_status;

    len_status = c->u.e_byte_array_len.len_codec->encode(slice,
                                                         c->u.e_byte_array_len.len_codec,
                                                         (char *)&n_bytes, 1);
    val_status = c->u.e_byte_array_len.val_codec->encode(slice,
                                                         c->u.e_byte_array_len.val_codec,
                                                         in, in_size);

    return len_status | val_status;
}
3469
3470
86.7k
/*
 * Releases a BYTE_ARRAY_LEN encoder together with whichever of its two
 * sub-codecs are present.  A NULL codec is ignored.
 */
void cram_byte_array_len_encode_free(cram_codec *c) {
    if (c) {
        cram_codec *sub;

        sub = c->u.e_byte_array_len.len_codec;
        if (sub)
            sub->free(sub);

        sub = c->u.e_byte_array_len.val_codec;
        if (sub)
            sub->free(sub);

        free(c);
    }
}
3482
3483
/*
 * Serialises a BYTE_ARRAY_LEN encoder description to block 'b'.
 *
 * Appends the optional 'prefix' string, the codec id, the combined size
 * of the two sub-codec descriptions, then the length and value sub-codec
 * descriptions themselves (each produced by that codec's store callback
 * into a temporary block).
 *
 * Returns number of bytes written on success;
 *         -1 on failure.
 */
int cram_byte_array_len_encode_store(cram_codec *c, cram_block *b,
                                     char *prefix, int version) {
    int len = 0, len2, len3, r = 0, n;
    cram_codec *tc;
    cram_block *b_len = NULL, *b_val = NULL;

    if (prefix) {
        size_t l = strlen(prefix);
        BLOCK_APPEND(b, prefix, l);
        len += l;
    }

    tc = c->u.e_byte_array_len.len_codec;
    b_len = cram_new_block(0, 0);
    if (!b_len) goto block_err;
    len2 = tc->store(tc, b_len, NULL, version);
    if (len2 < 0) goto block_err;

    tc = c->u.e_byte_array_len.val_codec;
    b_val = cram_new_block(0, 0);
    if (!b_val) goto block_err;
    len3 = tc->store(tc, b_val, NULL, version);
    if (len3 < 0) goto block_err;

    len += (n = c->vv->varint_put32_blk(b, c->codec));  r |= n;
    len += (n = c->vv->varint_put32_blk(b, len2+len3)); r |= n;
    BLOCK_APPEND(b, BLOCK_DATA(b_len), BLOCK_SIZE(b_len));
    BLOCK_APPEND(b, BLOCK_DATA(b_val), BLOCK_SIZE(b_val));

    cram_free_block(b_len);
    cram_free_block(b_val);
    // Clear the pointers so that falling through to block_err below
    // (when r <= 0) cannot free the same blocks a second time.
    b_len = b_val = NULL;

    if (r > 0)
        return len + len2 + len3;

 block_err:
    if (b_len) cram_free_block(b_len);
    if (b_val) cram_free_block(b_val);
    return -1;
}
3523
3524
/*
 * Creates a BYTE_ARRAY_LEN encoder.
 *
 * 'dat' points to a cram_byte_array_len_encoder naming the encodings and
 * parameters for the two sub-codecs.  The length sub-codec is created
 * with 'st' and E_INT; the value sub-codec with no stats and
 * E_BYTE_ARRAY.
 *
 * Returns a newly allocated codec on success;
 *         NULL on allocation failure or if either sub-codec could not
 *         be created.
 */
cram_codec *cram_byte_array_len_encode_init(cram_stats *st,
                                            enum cram_encoding codec,
                                            enum cram_external_type option,
                                            void *dat,
                                            int version, varint_vec *vv) {
    cram_byte_array_len_encoder *params = (cram_byte_array_len_encoder *)dat;
    cram_codec *enc = malloc(sizeof(*enc));

    if (enc == NULL)
        return NULL;

    enc->flush  = NULL;
    enc->store  = cram_byte_array_len_encode_store;
    enc->encode = cram_byte_array_len_encode;
    enc->free   = cram_byte_array_len_encode_free;
    enc->codec  = E_BYTE_ARRAY_LEN;

    /* Both sub-codecs are always attempted before checking for failure */
    enc->u.e_byte_array_len.len_codec = cram_encoder_init(params->len_encoding,
                                                          st, E_INT,
                                                          params->len_dat,
                                                          version, vv);
    enc->u.e_byte_array_len.val_codec = cram_encoder_init(params->val_encoding,
                                                          NULL, E_BYTE_ARRAY,
                                                          params->val_dat,
                                                          version, vv);

    if (enc->u.e_byte_array_len.len_codec &&
        enc->u.e_byte_array_len.val_codec)
        return enc;

    cram_byte_array_len_encode_free(enc);
    return NULL;
}
3558
3559
/*
3560
 * ---------------------------------------------------------------------------
3561
 * BYTE_ARRAY_STOP
3562
 */
3563
static int cram_byte_array_stop_decode_char(cram_slice *slice, cram_codec *c,
3564
                                            cram_block *in, char *out,
3565
0
                                            int *out_size) {
3566
0
    char *cp, ch;
3567
0
    cram_block *b = NULL;
3568
3569
0
    b = cram_get_block_by_id(slice, c->u.byte_array_stop.content_id);
3570
0
    if (!b)
3571
0
        return *out_size?-1:0;
3572
3573
0
    if (b->idx >= b->uncomp_size)
3574
0
        return -1;
3575
3576
0
    cp = (char *)b->data + b->idx;
3577
0
    if (out) {
3578
       // memccpy equivalent but without copying the terminating byte
3579
0
        ssize_t term = MIN(*out_size, b->uncomp_size - b->idx);
3580
0
        while ((ch = *cp) != (char)c->u.byte_array_stop.stop) {
3581
0
            if (term-- < 0)
3582
0
                break;
3583
0
            *out++ = ch;
3584
0
            cp++;
3585
0
        }
3586
3587
        // Attempted overrun on input or output
3588
0
        if (ch != (char)c->u.byte_array_stop.stop)
3589
0
            return -1;
3590
0
    } else {
3591
        // Consume input, but produce no output
3592
0
        while ((ch = *cp) != (char)c->u.byte_array_stop.stop) {
3593
0
            if (cp - (char *)b->data >= b->uncomp_size)
3594
0
                return -1;
3595
0
            cp++;
3596
0
        }
3597
0
    }
3598
3599
0
    *out_size = cp - (char *)(b->data + b->idx);
3600
0
    b->idx = cp - (char *)b->data + 1;
3601
3602
0
    return 0;
3603
0
}
3604
3605
int cram_byte_array_stop_decode_block(cram_slice *slice, cram_codec *c,
3606
                                      cram_block *in, char *out_,
3607
0
                                      int *out_size) {
3608
0
    cram_block *b;
3609
0
    cram_block *out = (cram_block *)out_;
3610
0
    unsigned char *cp, *cp_end;
3611
0
    unsigned char stop;
3612
3613
0
    b = cram_get_block_by_id(slice, c->u.byte_array_stop.content_id);
3614
0
    if (!b)
3615
0
        return *out_size?-1:0;
3616
3617
0
    if (b->idx >= b->uncomp_size)
3618
0
        return -1;
3619
0
    cp = b->data + b->idx;
3620
0
    cp_end = b->data + b->uncomp_size;
3621
3622
    // STOP byte is hard-coded as zero by our name tokeniser decoder
3623
    // implementation, so we may ignore what was requested.
3624
0
    stop = b->orig_method == TOK3 ? 0 : c->u.byte_array_stop.stop;
3625
3626
0
    if (cp_end - cp < out->alloc - out->byte) {
3627
0
        unsigned char *out_cp = BLOCK_END(out);
3628
0
        while (cp != cp_end && *cp != stop)
3629
0
            *out_cp++ = *cp++;
3630
0
        BLOCK_SIZE(out) = out_cp - BLOCK_DATA(out);
3631
0
    } else {
3632
0
        unsigned char *cp_start;
3633
0
        for (cp_start = cp; cp != cp_end && *cp != stop; cp++)
3634
0
            ;
3635
0
        BLOCK_APPEND(out, cp_start, cp - cp_start);
3636
0
        BLOCK_GROW(out, cp - cp_start);
3637
0
    }
3638
3639
0
    *out_size = cp - (b->data + b->idx);
3640
0
    b->idx = cp - b->data + 1;
3641
3642
0
    return 0;
3643
3644
0
 block_err:
3645
0
    return -1;
3646
0
}
3647
3648
393
/*
 * Releases a BYTE_ARRAY_STOP decoder.  The codec holds no other
 * allocations.  A NULL codec is accepted, as free(NULL) does nothing.
 */
void cram_byte_array_stop_decode_free(cram_codec *c) {
    free(c);
}
3653
3654
0
/*
 * Appends a textual description of a BYTE_ARRAY_STOP decoder (its stop
 * byte and content id) to 'ks'.
 *
 * Returns 0 on success;
 *        -1 on failure.
 */
int cram_byte_array_stop_describe(cram_codec *c, kstring_t *ks) {
    int written = ksprintf(ks, "BYTE_ARRAY_STOP(stop=%d,id=%d)",
                           c->u.byte_array_stop.stop,
                           c->u.byte_array_stop.content_id);

    if (written < 0)
        return -1;

    return 0;
}
3660
3661
/*
 * Creates a BYTE_ARRAY_STOP decoder from its serialised parameters.
 *
 * 'data' / 'size' hold one stop byte followed by the content id.  The id
 * is a 4 byte little-endian integer in CRAM v1, and a varint otherwise.
 * The parameters must occupy exactly 'size' bytes.
 *
 * 'option' selects the output form: E_BYTE_ARRAY_BLOCK or E_BYTE_ARRAY.
 * Any other value is rejected.  'hdr' and 'codec' are not used.
 *
 * Returns the new codec on success;
 *         NULL on failure.
 */
cram_codec *cram_byte_array_stop_decode_init(cram_block_compression_hdr *hdr,
                                             char *data, int size,
                                             enum cram_encoding codec,
                                             enum cram_external_type option,
                                             int version, varint_vec *vv) {
    cram_codec *c = NULL;
    unsigned char *cp = (unsigned char *)data;
    int err = 0;
    int is_v1 = (CRAM_MAJOR_VERS(version) == 1);
    int min_size = is_v1 ? 5 : 2;

    if (size < min_size)
        goto malformed;

    c = malloc(sizeof(*c));
    if (!c)
        return NULL;

    c->codec = E_BYTE_ARRAY_STOP;

    if (option == E_BYTE_ARRAY_BLOCK) {
        c->decode = cram_byte_array_stop_decode_block;
    } else if (option == E_BYTE_ARRAY) {
        c->decode = cram_byte_array_stop_decode_char;
    } else {
        hts_log_error("The byte_array_stop codec only supports BYTE_ARRAYs");
        free(c);
        return NULL;
    }

    c->free     = cram_byte_array_stop_decode_free;
    c->describe = cram_byte_array_stop_describe;

    c->u.byte_array_stop.stop = *cp++;

    if (is_v1) {
        // Fixed width little-endian content id
        c->u.byte_array_stop.content_id = cp[0] + (cp[1]<<8) + (cp[2]<<16)
            + ((unsigned int) cp[3]<<24);
        cp += 4;
    } else {
        c->u.byte_array_stop.content_id =
            vv->varint_get32((char **)&cp, data+size, &err);
    }

    // The parameters must account for every byte we were given.
    if (err || (char *)cp - data != size)
        goto malformed;

    return c;

 malformed:
    hts_log_error("Malformed byte_array_stop header stream");
    free(c);
    return NULL;
}
3711
3712
/*
 * Encodes one byte array: appends the 'in_size' bytes at 'in' to the
 * codec's output block, followed by the codec's stop byte.
 * 'slice' is not used.
 *
 * Returns 0 on success;
 *        -1 on failure.
 */
int cram_byte_array_stop_encode(cram_slice *slice, cram_codec *c,
                                char *in, int in_size) {
    cram_block *dst = c->out;

    BLOCK_APPEND(dst, in, in_size);
    BLOCK_APPEND_CHAR(dst, c->u.e_byte_array_stop.stop);

    return 0;

 block_err:
    return -1;
}
3721
3722
134k
/*
 * Releases a BYTE_ARRAY_STOP encoder.  The codec holds no other
 * allocations.  A NULL codec is accepted, as free(NULL) does nothing.
 */
void cram_byte_array_stop_encode_free(cram_codec *c) {
    free(c);
}
3727
3728
/*
 * Serialises the BYTE_ARRAY_STOP encoder parameters to block 'b'.
 *
 * Writes the optional 'prefix' string, then the codec id, the parameter
 * byte count, the stop byte and the content id.  The content id is a
 * 4 byte little-endian integer for CRAM v1, and a varint otherwise.
 *
 * Returns the number of bytes appended on success;
 *        -1 on failure.
 */
int cram_byte_array_stop_encode_store(cram_codec *c, cram_block *b,
                                      char *prefix, int version) {
    char buf[20];
    char *cp = buf, *buf_end = buf + 20;
    int len = 0;

    if (prefix) {
        size_t prefix_len = strlen(prefix);
        BLOCK_APPEND(b, prefix, prefix_len);
        len += prefix_len;
    }

    cp += c->vv->varint_put32(cp, buf_end, c->codec);

    if (CRAM_MAJOR_VERS(version) == 1) {
        int shift;

        // 5 parameter bytes: stop + 32-bit little-endian content id
        cp += c->vv->varint_put32(cp, buf_end, 5);
        *cp++ = c->u.e_byte_array_stop.stop;
        for (shift = 0; shift < 32; shift += 8)
            *cp++ = (c->u.e_byte_array_stop.content_id >> shift) & 0xff;
    } else {
        // 1 stop byte plus however many bytes the varint id needs
        cp += c->vv->varint_put32(cp, buf_end, 1 +
                                  c->vv->varint_size(c->u.e_byte_array_stop.content_id));
        *cp++ = c->u.e_byte_array_stop.stop;
        cp += c->vv->varint_put32(cp, buf_end,
                                  c->u.e_byte_array_stop.content_id);
    }

    BLOCK_APPEND(b, buf, cp-buf);
    len += cp-buf;

    return len;

 block_err:
    return -1;
}
3763
3764
/*
 * Creates a BYTE_ARRAY_STOP encoder.
 *
 * 'dat' must point to two ints: the stop byte value followed by the
 * content id.  'st', 'codec', 'option', 'version' and 'vv' are not used
 * here.
 *
 * Returns the new codec on success;
 *         NULL on allocation failure.
 */
cram_codec *cram_byte_array_stop_encode_init(cram_stats *st,
                                             enum cram_encoding codec,
                                             enum cram_external_type option,
                                             void *dat,
                                             int version, varint_vec *vv) {
    int *params = dat;
    cram_codec *c = malloc(sizeof(*c));

    if (c == NULL)
        return NULL;

    c->codec  = E_BYTE_ARRAY_STOP;
    c->free   = cram_byte_array_stop_encode_free;
    c->encode = cram_byte_array_stop_encode;
    c->store  = cram_byte_array_stop_encode_store;
    c->flush  = NULL;

    c->u.e_byte_array_stop.stop       = params[0];
    c->u.e_byte_array_stop.content_id = params[1];

    return c;
}
3785
3786
/*
3787
 * ---------------------------------------------------------------------------
3788
 */
3789
3790
152
const char *cram_encoding2str(enum cram_encoding t) {
3791
152
    switch (t) {
3792
54
    case E_NULL:            return "NULL";
3793
0
    case E_EXTERNAL:        return "EXTERNAL";
3794
6
    case E_GOLOMB:          return "GOLOMB";
3795
0
    case E_HUFFMAN:         return "HUFFMAN";
3796
0
    case E_BYTE_ARRAY_LEN:  return "BYTE_ARRAY_LEN";
3797
0
    case E_BYTE_ARRAY_STOP: return "BYTE_ARRAY_STOP";
3798
39
    case E_BETA:            return "BETA";
3799
0
    case E_SUBEXP:          return "SUBEXP";
3800
6
    case E_GOLOMB_RICE:     return "GOLOMB_RICE";
3801
0
    case E_GAMMA:           return "GAMMA";
3802
3803
0
    case E_VARINT_UNSIGNED: return "VARINT_UNSIGNED";
3804
0
    case E_VARINT_SIGNED:   return "VARINT_SIGNED";
3805
0
    case E_CONST_BYTE:      return "CONST_BYTE";
3806
0
    case E_CONST_INT:       return "CONST_INT";
3807
3808
0
    case E_NUM_CODECS:
3809
47
    default:                return "?";
3810
152
    }
3811
152
}
3812
3813
/*
 * Table of decoder constructors.  cram_decoder_init indexes it with the
 * enum cram_encoding value, so the position of each entry matters.
 * A NULL entry means no decoder is available for that codec number.
 */
static cram_codec *(*decode_init[])(cram_block_compression_hdr *hdr,
3814
                                    char *data,
3815
                                    int size,
3816
                                    enum cram_encoding codec,
3817
                                    enum cram_external_type option,
3818
                                    int version, varint_vec *vv) = {
3819
    // CRAM 3.0 valid codecs; entries 0 to 9 inclusive
3820
    NULL, // null codec
3821
    cram_external_decode_init,
3822
    NULL, // golomb
3823
    cram_huffman_decode_init,
3824
    cram_byte_array_len_decode_init,
3825
    cram_byte_array_stop_decode_init,
3826
    cram_beta_decode_init,
3827
    cram_subexp_decode_init,
3828
    NULL, // golomb rice
3829
    cram_gamma_decode_init,
3830
3831
    // Gap between CRAM 3 and CRAM 4; 10 to 39 inclusive
3832
    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
3833
    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
3834
    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
3835
3836
    NULL,                      // was xbyte
3837
    cram_varint_decode_init,   // varint unsigned
3838
    cram_varint_decode_init,   // varint signed
3839
    cram_const_decode_init,    // const byte
3840
    cram_const_decode_init,    // const int
3841
3842
    // Gap to CRAM 4 transformations; 45 to 49 inclusive
3843
    NULL, NULL, NULL, NULL, NULL,
3844
3845
    NULL, // xhuffman
3846
    cram_xpack_decode_init,
3847
    cram_xrle_decode_init,
3848
    cram_xdelta_decode_init,
3849
};
3850
3851
/*
 * Creates a decoder for 'codec' by dispatching to the matching entry of
 * the decode_init table, passing the remaining arguments through.
 *
 * On success the new codec has its vv set to 'vv' and is given the next
 * codec_id from 'hdr' (hdr->ncodecs is incremented).
 *
 * Returns the new codec on success;
 *         NULL if the codec is out of range, has no decoder, or its
 *         constructor failed.
 */
cram_codec *cram_decoder_init(cram_block_compression_hdr *hdr,
                              enum cram_encoding codec,
                              char *data, int size,
                              enum cram_external_type option,
                              int version, varint_vec *vv) {
    cram_codec *r;

    if (!(codec >= E_NULL && codec < E_NUM_CODECS && decode_init[codec])) {
        hts_log_error("Unimplemented codec of type %s", cram_encoding2str(codec));
        return NULL;
    }

    r = decode_init[codec](hdr, data, size, codec, option, version, vv);
    if (!r)
        return NULL;

    r->vv = vv;
    r->codec_id = hdr->ncodecs++;

    return r;
}
3869
3870
/*
 * Table of encoder constructors.  cram_encoder_init indexes it with the
 * enum cram_encoding value, so the position of each entry matters.
 * A NULL entry means no encoder is available for that codec number.
 */
static cram_codec *(*encode_init[])(cram_stats *stx,
3871
                                    enum cram_encoding codec,
3872
                                    enum cram_external_type option,
3873
                                    void *opt,
3874
                                    int version, varint_vec *vv) = {
3875
    // CRAM 3.0 valid codecs; entries 0 to 9 inclusive
3876
    NULL, // null codec
3877
    cram_external_encode_init, // int/bytes in cram 3, byte only in cram 4
3878
    NULL, // golomb
3879
    cram_huffman_encode_init,
3880
    cram_byte_array_len_encode_init,
3881
    cram_byte_array_stop_encode_init,
3882
    cram_beta_encode_init,
3883
    NULL, // subexponential (we support decode only)
3884
    NULL, // golomb rice
3885
    NULL, // gamma (we support decode only)
3886
3887
    // Gap between CRAM 3 and CRAM 4; 10 to 39 inclusive
3888
    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
3889
    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
3890
    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
3891
3892
    NULL, // was xbyte
3893
    cram_varint_encode_init, // varint unsigned
3894
    cram_varint_encode_init, // varint signed
3895
    cram_const_encode_init,  // const byte
3896
    cram_const_encode_init,  // const int
3897
3898
    // Gap to CRAM 4 transformations; 45 to 49 inclusive
3899
    NULL, NULL, NULL, NULL, NULL,
3900
3901
    NULL, // xhuffman
3902
    cram_xpack_encode_init,
3903
    cram_xrle_encode_init,
3904
    cram_xdelta_encode_init,
3905
};
3906
3907
cram_codec *cram_encoder_init(enum cram_encoding codec,
3908
                              cram_stats *st,
3909
                              enum cram_external_type option,
3910
                              void *dat,
3911
1.22M
                              int version, varint_vec *vv) {
3912
1.22M
    if (st && !st->nvals)
3913
302k
        return NULL;
3914
3915
    // cram_stats_encoding assumes integer data, but if option
3916
    // is E_BYTE then tweak the requested encoding.  This ought
3917
    // to be fixed in cram_stats_encoding instead.
3918
920k
    if (option == E_BYTE || option == E_BYTE_ARRAY ||
3919
920k
       option == E_BYTE_ARRAY_BLOCK) {
3920
362k
       if (codec == E_VARINT_SIGNED || codec == E_VARINT_UNSIGNED)
3921
0
           codec = E_EXTERNAL;
3922
362k
       else if (codec == E_CONST_INT)
3923
0
           codec = E_CONST_BYTE;
3924
362k
    }
3925
3926
920k
    if (encode_init[codec]) {
3927
920k
        cram_codec *r;
3928
920k
        if ((r = encode_init[codec](st, codec, option, dat, version, vv)))
3929
920k
            r->out = NULL;
3930
920k
        if (!r) {
3931
39
            hts_log_error("Unable to initialise codec of type %s", cram_encoding2str(codec));
3932
39
            return NULL;
3933
39
        }
3934
920k
        r->vv = vv;
3935
920k
        return r;
3936
920k
    } else {
3937
0
        hts_log_error("Unimplemented codec of type %s", cram_encoding2str(codec));
3938
0
        abort();
3939
0
    }
3940
920k
}
3941
3942
/*
3943
 * Returns the content_id used by this codec, also in id2 if byte_array_len.
3944
 * Returns -1 for the CORE block and -2 for unneeded.
3945
 * id2 is only filled out for BYTE_ARRAY_LEN which uses 2 codecs.
3946
 */
3947
0
int cram_codec_to_id(cram_codec *c, int *id2) {
3948
0
    int bnum1, bnum2 = -2;
3949
3950
0
    switch (c->codec) {
3951
0
    case E_CONST_INT:
3952
0
    case E_CONST_BYTE:
3953
0
        bnum1 = -2; // no blocks used
3954
0
        break;
3955
3956
0
    case E_HUFFMAN:
3957
0
        bnum1 = c->u.huffman.ncodes == 1 ? -2 : -1;
3958
0
        break;
3959
3960
0
    case E_GOLOMB:
3961
0
    case E_BETA:
3962
0
    case E_SUBEXP:
3963
0
    case E_GOLOMB_RICE:
3964
0
    case E_GAMMA:
3965
        // CORE block
3966
0
        bnum1 = -1;
3967
0
        break;
3968
3969
0
    case E_EXTERNAL:
3970
0
    case E_VARINT_UNSIGNED:
3971
0
    case E_VARINT_SIGNED:
3972
0
        bnum1 = c->u.external.content_id;
3973
0
        break;
3974
3975
0
    case E_BYTE_ARRAY_LEN:
3976
0
        bnum1 = cram_codec_to_id(c->u.byte_array_len.len_codec, NULL);
3977
0
        bnum2 = cram_codec_to_id(c->u.byte_array_len.val_codec, NULL);
3978
0
        break;
3979
3980
0
    case E_BYTE_ARRAY_STOP:
3981
0
        bnum1 = c->u.byte_array_stop.content_id;
3982
0
        break;
3983
3984
0
    case E_NULL:
3985
0
        bnum1 = -2;
3986
0
        break;
3987
3988
0
    default:
3989
0
        hts_log_error("Unknown codec type %d", c->codec);
3990
0
        bnum1 = -1;
3991
0
    }
3992
3993
0
    if (id2)
3994
0
        *id2 = bnum2;
3995
0
    return bnum1;
3996
0
}
3997
3998
3999
/*
4000
 * cram_codec structures are specialised for decoding or encoding.
4001
 * Unfortunately this makes turning a decoder into an encoder (such as
4002
 * when transcoding files) problematic.
4003
 *
4004
 * This function converts a cram decoder codec into an encoder version
4005
 * in-place (ie it modifiers the codec itself).
4006
 *
4007
 * Returns 0 on success;
4008
 *        -1 on failure.
4009
 */
4010
0
int cram_codec_decoder2encoder(cram_fd *fd, cram_codec *c) {
4011
0
    int j;
4012
4013
0
    switch (c->codec) {
4014
0
    case E_CONST_INT:
4015
0
    case E_CONST_BYTE:
4016
        // shares struct with decode
4017
0
        c->store = cram_const_encode_store;
4018
0
        break;
4019
4020
0
    case E_EXTERNAL:
4021
        // shares struct with decode
4022
0
        c->free = cram_external_encode_free;
4023
0
        c->store = cram_external_encode_store;
4024
0
        if (c->decode == cram_external_decode_int)
4025
0
            c->encode = cram_external_encode_int;
4026
0
        else if (c->decode == cram_external_decode_long)
4027
0
            c->encode = cram_external_encode_long;
4028
0
        else if (c->decode == cram_external_decode_char)
4029
0
            c->encode = cram_external_encode_char;
4030
0
        else if (c->decode == cram_external_decode_block)
4031
0
            c->encode = cram_external_encode_char;
4032
0
        else
4033
0
            return -1;
4034
0
        break;
4035
4036
0
    case E_VARINT_SIGNED:
4037
0
    case E_VARINT_UNSIGNED:
4038
        // shares struct with decode
4039
0
        c->free = cram_varint_encode_free;
4040
0
        c->store = cram_varint_encode_store;
4041
0
        if (c->decode == cram_varint_decode_int)
4042
0
            c->encode = cram_varint_encode_int;
4043
0
        else if (c->decode == cram_varint_decode_sint)
4044
0
            c->encode = cram_varint_encode_sint;
4045
0
        else if (c->decode == cram_varint_decode_long)
4046
0
            c->encode = cram_varint_encode_long;
4047
0
        else if (c->decode == cram_varint_decode_slong)
4048
0
            c->encode = cram_varint_encode_slong;
4049
0
        else
4050
0
            return -1;
4051
0
        break;
4052
4053
0
    case E_HUFFMAN: {
4054
        // New structure, so switch.
4055
        // FIXME: we huffman and e_huffman structs amended, we could
4056
        // unify this.
4057
0
        cram_codec *t = malloc(sizeof(*t));
4058
0
        if (!t) return -1;
4059
0
        t->vv     = c->vv;
4060
0
        t->codec = E_HUFFMAN;
4061
0
        t->free = cram_huffman_encode_free;
4062
0
        t->store = cram_huffman_encode_store;
4063
0
        t->u.e_huffman.codes = c->u.huffman.codes;
4064
0
        t->u.e_huffman.nvals = c->u.huffman.ncodes;
4065
0
        t->u.e_huffman.option = c->u.huffman.option;
4066
0
        for (j = 0; j < t->u.e_huffman.nvals; j++) {
4067
0
            int32_t sym = t->u.e_huffman.codes[j].symbol;
4068
0
            if (sym >= -1 && sym < MAX_HUFF)
4069
0
                t->u.e_huffman.val2code[sym+1] = j;
4070
0
        }
4071
4072
0
        if (c->decode == cram_huffman_decode_char0)
4073
0
            t->encode = cram_huffman_encode_char0;
4074
0
        else if (c->decode == cram_huffman_decode_char)
4075
0
            t->encode = cram_huffman_encode_char;
4076
0
        else if (c->decode == cram_huffman_decode_int0)
4077
0
            t->encode = cram_huffman_encode_int0;
4078
0
        else if (c->decode == cram_huffman_decode_int)
4079
0
            t->encode = cram_huffman_encode_int;
4080
0
        else if (c->decode == cram_huffman_decode_long0)
4081
0
            t->encode = cram_huffman_encode_long0;
4082
0
        else if (c->decode == cram_huffman_decode_long)
4083
0
            t->encode = cram_huffman_encode_long;
4084
0
        else {
4085
0
            free(t);
4086
0
            return -1;
4087
0
        }
4088
0
        *c = *t;
4089
0
        free(t);
4090
0
        break;
4091
0
    }
4092
4093
0
    case E_BETA:
4094
        // shares struct with decode
4095
0
        c->free = cram_beta_encode_free;
4096
0
        c->store = cram_beta_encode_store;
4097
0
        if (c->decode == cram_beta_decode_int)
4098
0
            c->encode = cram_beta_encode_int;
4099
0
        else if (c->decode == cram_beta_decode_long)
4100
0
            c->encode = cram_beta_encode_long;
4101
0
        else if (c->decode == cram_beta_decode_char)
4102
0
            c->encode = cram_beta_encode_char;
4103
0
        else
4104
0
            return -1;
4105
0
        break;
4106
4107
0
    case E_XPACK: {
4108
        // shares struct with decode
4109
0
        cram_codec t = *c;
4110
0
        t.free = cram_xpack_encode_free;
4111
0
        t.store = cram_xpack_encode_store;
4112
0
        if (t.decode == cram_xpack_decode_long)
4113
0
            t.encode = cram_xpack_encode_long;
4114
0
        else if (t.decode == cram_xpack_decode_int)
4115
0
            t.encode = cram_xpack_encode_int;
4116
0
        else if (t.decode == cram_xpack_decode_char)
4117
0
            t.encode = cram_xpack_encode_char;
4118
0
        else
4119
0
            return -1;
4120
0
        t.u.e_xpack.sub_codec = t.u.xpack.sub_codec;
4121
0
        if (cram_codec_decoder2encoder(fd, t.u.e_xpack.sub_codec) == -1)
4122
0
            return -1;
4123
0
        *c = t;
4124
0
        break;
4125
0
    }
4126
4127
0
    case E_BYTE_ARRAY_LEN: {
4128
0
        cram_codec *t = malloc(sizeof(*t));
4129
0
        if (!t) return -1;
4130
0
        t->vv     = c->vv;
4131
0
        t->codec  = E_BYTE_ARRAY_LEN;
4132
0
        t->free   = cram_byte_array_len_encode_free;
4133
0
        t->store  = cram_byte_array_len_encode_store;
4134
0
        t->encode = cram_byte_array_len_encode;
4135
0
        t->u.e_byte_array_len.len_codec = c->u.byte_array_len.len_codec;
4136
0
        t->u.e_byte_array_len.val_codec = c->u.byte_array_len.val_codec;
4137
0
        if (cram_codec_decoder2encoder(fd, t->u.e_byte_array_len.len_codec) == -1 ||
4138
0
            cram_codec_decoder2encoder(fd, t->u.e_byte_array_len.val_codec) == -1) {
4139
0
            t->free(t);
4140
0
            return -1;
4141
0
        }
4142
4143
        // {len,val}_{encoding,dat} are undefined, but unused.
4144
        // Leaving them unset here means we can test that assertion.
4145
0
        *c = *t;
4146
0
        free(t);
4147
0
        break;
4148
0
    }
4149
4150
0
    case E_BYTE_ARRAY_STOP:
4151
        // shares struct with decode
4152
0
        c->free   = cram_byte_array_stop_encode_free;
4153
0
        c->store  = cram_byte_array_stop_encode_store;
4154
0
        c->encode = cram_byte_array_stop_encode;
4155
0
        break;
4156
4157
0
    default:
4158
0
        return -1;
4159
0
    }
4160
4161
0
    return 0;
4162
0
}
4163
4164
0
/*
 * Appends a textual description of codec 'c' to 'ks'.
 *
 * Codecs providing a describe callback are asked to describe themselves.
 * A NULL codec, or one without a describe callback, is written as "?".
 *
 * Returns the describe callback's result when one is used.  For the "?"
 * fallback, returns 0 on success and -1 on failure, matching the
 * convention of cram_byte_array_stop_describe, rather than passing back
 * ksprintf's character count.
 */
int cram_codec_describe(cram_codec *c, kstring_t *ks) {
    if (c && c->describe)
        return c->describe(c, ks);

    return ksprintf(ks, "?") < 0 ? -1 : 0;
}