Coverage Report

Created: 2025-08-10 06:30

/src/htslib/cram/cram_codecs.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
Copyright (c) 2012-2021,2023 Genome Research Ltd.
3
Author: James Bonfield <jkb@sanger.ac.uk>
4
5
Redistribution and use in source and binary forms, with or without
6
modification, are permitted provided that the following conditions are met:
7
8
   1. Redistributions of source code must retain the above copyright notice,
9
this list of conditions and the following disclaimer.
10
11
   2. Redistributions in binary form must reproduce the above copyright notice,
12
this list of conditions and the following disclaimer in the documentation
13
and/or other materials provided with the distribution.
14
15
   3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
16
Institute nor the names of its contributors may be used to endorse or promote
17
products derived from this software without specific prior written permission.
18
19
THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
20
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
23
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
*/
30
31
/*
32
 * FIXME: add checking of cram_external_type to return NULL on unsupported
33
 * {codec,type} tuples.
34
 */
35
36
#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
37
#include <config.h>
38
39
#include <stdlib.h>
40
#include <string.h>
41
#include <assert.h>
42
#include <limits.h>
43
#include <stdint.h>
44
#include <errno.h>
45
#include <stddef.h>
46
47
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
48
#include "../fuzz_settings.h"
49
#endif
50
51
#include "../htslib/hts_endian.h"
52
53
#if defined(HAVE_EXTERNAL_LIBHTSCODECS)
54
#include <htscodecs/varint.h>
55
#include <htscodecs/pack.h>
56
#include <htscodecs/rle.h>
57
#else
58
#include "../htscodecs/htscodecs/varint.h"
59
#include "../htscodecs/htscodecs/pack.h"
60
#include "../htscodecs/htscodecs/rle.h"
61
#endif
62
63
#include "cram.h"
64
65
/*
66
 * ---------------------------------------------------------------------------
67
 * Block bit-level I/O functions.
68
 * All defined static here to promote easy inlining by the compiler.
69
 */
70
71
#if 0
72
/* Get a single bit, MSB first */
73
static signed int get_bit_MSB(cram_block *block) {
74
    unsigned int val;
75
76
    if (block->byte > block->alloc)
77
        return -1;
78
79
    val = block->data[block->byte] >> block->bit;
80
    if (--block->bit == -1) {
81
        block->bit = 7;
82
        block->byte++;
83
        //printf("(%02X)", block->data[block->byte]);
84
    }
85
86
    //printf("-B%d-", val&1);
87
88
    return val & 1;
89
}
90
#endif
91
92
/*
93
 * Count number of successive 0 and 1 bits
94
 */
95
0
static int get_one_bits_MSB(cram_block *block) {
96
0
    int n = 0, b;
97
0
    if (block->byte >= block->uncomp_size)
98
0
        return -1;
99
0
    do {
100
0
        b = block->data[block->byte] >> block->bit;
101
0
        if (--block->bit == -1) {
102
0
            block->bit = 7;
103
0
            block->byte++;
104
0
            if (block->byte == block->uncomp_size && (b&1))
105
0
                return -1;
106
0
        }
107
0
        n++;
108
0
    } while (b&1);
109
110
0
    return n-1;
111
0
}
112
113
0
static int get_zero_bits_MSB(cram_block *block) {
114
0
    int n = 0, b;
115
0
    if (block->byte >= block->uncomp_size)
116
0
        return -1;
117
0
    do {
118
0
        b = block->data[block->byte] >> block->bit;
119
0
        if (--block->bit == -1) {
120
0
            block->bit = 7;
121
0
            block->byte++;
122
0
            if (block->byte == block->uncomp_size && !(b&1))
123
0
                return -1;
124
0
        }
125
0
        n++;
126
0
    } while (!(b&1));
127
128
0
    return n-1;
129
0
}
130
131
#if 0
132
/* Stores a single bit */
133
static void store_bit_MSB(cram_block *block, unsigned int bit) {
134
    if (block->byte >= block->alloc) {
135
        block->alloc = block->alloc ? block->alloc*2 : 1024;
136
        block->data = realloc(block->data, block->alloc);
137
    }
138
139
    if (bit)
140
        block->data[block->byte] |= (1 << block->bit);
141
142
    if (--block->bit == -1) {
143
        block->bit = 7;
144
        block->byte++;
145
        block->data[block->byte] = 0;
146
    }
147
}
148
#endif
149
150
#if 0
151
/* Rounds to the next whole byte boundary first */
152
static void store_bytes_MSB(cram_block *block, char *bytes, int len) {
153
    if (block->bit != 7) {
154
        block->bit = 7;
155
        block->byte++;
156
    }
157
158
    while (block->byte + len >= block->alloc) {
159
        block->alloc = block->alloc ? block->alloc*2 : 1024;
160
        block->data = realloc(block->data, block->alloc);
161
    }
162
163
    memcpy(&block->data[block->byte], bytes, len);
164
    block->byte += len;
165
}
166
#endif
167
168
/* Local optimised copy for inlining */
169
0
static inline int64_t get_bits_MSB(cram_block *block, int nbits) {
170
0
    uint64_t val = 0;
171
0
    int i;
172
173
#if 0
174
    // Fits within the current byte */
175
    if (nbits <= block->bit+1) {
176
        val = (block->data[block->byte]>>(block->bit-(nbits-1))) & ((1<<nbits)-1);
177
        if ((block->bit -= nbits) == -1) {
178
            block->bit = 7;
179
            block->byte++;
180
        }
181
        return val;
182
    }
183
184
    // partial first byte
185
    val = block->data[block->byte] & ((1<<(block->bit+1))-1);
186
    nbits -= block->bit+1;
187
    block->bit = 7;
188
    block->byte++;
189
190
    // whole middle bytes
191
    while (nbits >= 8) {
192
        val = (val << 8) | block->data[block->byte++];
193
        nbits -= 8;
194
    }
195
196
    val <<= nbits;
197
    val |= (block->data[block->byte]>>(block->bit-(nbits-1))) & ((1<<nbits)-1);
198
    block->bit -= nbits;
199
    return val;
200
#endif
201
202
#if 0
203
    /* Inefficient implementation! */
204
    //printf("{");
205
    for (i = 0; i < nbits; i++)
206
        //val = (val << 1) | get_bit_MSB(block);
207
        GET_BIT_MSB(block, val);
208
#endif
209
210
0
#if 1
211
    /* Combination of 1st two methods */
212
0
    if (nbits <= block->bit+1) {
213
0
        val = (block->data[block->byte]>>(block->bit-(nbits-1))) & ((1<<nbits)-1);
214
0
        if ((block->bit -= nbits) == -1) {
215
0
            block->bit = 7;
216
0
            block->byte++;
217
0
        }
218
0
        return val;
219
0
    }
220
221
0
    switch(nbits) {
222
//  case 15: GET_BIT_MSB(block, val); // fall through
223
//  case 14: GET_BIT_MSB(block, val); // fall through
224
//  case 13: GET_BIT_MSB(block, val); // fall through
225
//  case 12: GET_BIT_MSB(block, val); // fall through
226
//  case 11: GET_BIT_MSB(block, val); // fall through
227
//  case 10: GET_BIT_MSB(block, val); // fall through
228
//  case  9: GET_BIT_MSB(block, val); // fall through
229
0
    case  8: GET_BIT_MSB(block, val); // fall through
230
0
    case  7: GET_BIT_MSB(block, val); // fall through
231
0
    case  6: GET_BIT_MSB(block, val); // fall through
232
0
    case  5: GET_BIT_MSB(block, val); // fall through
233
0
    case  4: GET_BIT_MSB(block, val); // fall through
234
0
    case  3: GET_BIT_MSB(block, val); // fall through
235
0
    case  2: GET_BIT_MSB(block, val); // fall through
236
0
    case  1: GET_BIT_MSB(block, val);
237
0
        break;
238
239
0
    default:
240
0
        for (i = 0; i < nbits; i++)
241
            //val = (val << 1) | get_bit_MSB(block);
242
0
            GET_BIT_MSB(block, val);
243
0
    }
244
0
#endif
245
246
    //printf("=0x%x}", val);
247
248
0
    return val;
249
0
}
250
251
/*
252
 * Can store up to 24-bits worth of data encoded in an integer value
253
 * Possibly we'd want to have a less optimal store_bits function when dealing
254
 * with nbits > 24, but for now we assume the codes generated are never
255
 * that big. (Given this is only possible with 121392 or more
256
 * characters with exactly the correct frequency distribution we check
257
 * for it elsewhere.)
258
 */
259
17.7k
static int store_bits_MSB(cram_block *block, uint64_t val, int nbits) {
260
    //fprintf(stderr, " store_bits: %02x %d\n", val, nbits);
261
262
    /*
263
     * Use slow mode until we tweak the huffman generator to never generate
264
     * codes longer than 24-bits.
265
     */
266
17.7k
    unsigned int mask;
267
268
17.7k
    if (block->byte+8 >= block->alloc) {
269
1.26k
        if (block->byte) {
270
6
            block->alloc *= 2;
271
6
            block->data = realloc(block->data, block->alloc + 8);
272
6
            if (!block->data)
273
0
                return -1;
274
1.25k
        } else {
275
1.25k
            block->alloc = 1024;
276
1.25k
            block->data = realloc(block->data, block->alloc + 8);
277
1.25k
            if (!block->data)
278
0
                return -1;
279
1.25k
            block->data[0] = 0; // initialise first byte of buffer
280
1.25k
        }
281
1.26k
    }
282
283
    /* fits in current bit-field */
284
17.7k
    if (nbits <= block->bit+1) {
285
8.63k
        block->data[block->byte] |= (val << (block->bit+1-nbits));
286
8.63k
        if ((block->bit-=nbits) == -1) {
287
2.52k
            block->bit = 7;
288
2.52k
            block->byte++;
289
2.52k
            block->data[block->byte] = 0;
290
2.52k
        }
291
8.63k
        return 0;
292
8.63k
    }
293
294
9.14k
    block->data[block->byte] |= (val >> (nbits -= block->bit+1));
295
9.14k
    block->bit = 7;
296
9.14k
    block->byte++;
297
9.14k
    block->data[block->byte] = 0;
298
299
9.14k
    mask = 1<<(nbits-1);
300
56.1k
    do {
301
56.1k
        if (val & mask)
302
15.7k
            block->data[block->byte] |= (1 << block->bit);
303
56.1k
        if (--block->bit == -1) {
304
3.15k
            block->bit = 7;
305
3.15k
            block->byte++;
306
3.15k
            block->data[block->byte] = 0;
307
3.15k
        }
308
56.1k
        mask >>= 1;
309
56.1k
    } while(--nbits);
310
311
9.14k
    return 0;
312
17.7k
}
313
314
/*
315
 * Returns the next 'size' bytes from a block, or NULL if insufficient
316
 * data left.This is just a pointer into the block data and not an
317
 * allocated object, so do not free the result.
318
 */
319
0
static char *cram_extract_block(cram_block *b, int size) {
320
0
    char *cp = (char *)b->data + b->idx;
321
0
    b->idx += size;
322
0
    if (b->idx > b->uncomp_size)
323
0
        return NULL;
324
325
0
    return cp;
326
0
}
327
328
/*
329
 * ---------------------------------------------------------------------------
330
 * EXTERNAL
331
 *
332
 * In CRAM 3.0 and earlier, E_EXTERNAL use the data type to determine the
333
 * size of the object being returned.  This type is hard coded in the
334
 * spec document (changing from uint32 to uint64 requires a spec change)
335
 * and there is no data format introspection so implementations have
336
 * to determine which size to use based on version numbers.   It also
337
 * doesn't support signed data.
338
 *
339
 * With CRAM 4.0 onwards the size and sign of the data is no longer stated
340
 * explicitly in the specification.  Instead EXTERNAL is replaced by three
341
 * new encodings, for bytes and signed / unsigned integers which used a
342
 * variable sized encoding.
343
 *
344
 * For simplicity we use the same encode and decode functions for
345
 * bytes (CRAM4) and external (CRAM3). Given we already had code to
346
 * replace codec + type into a function pointer it makes little
347
 * difference how we ended up at that function.  However we disallow
348
 * this codec to operate on integer data for CRAM4 onwards.
349
 */
350
int cram_external_decode_int(cram_slice *slice, cram_codec *c,
351
0
                             cram_block *in, char *out, int *out_size) {
352
0
    char *cp;
353
0
    cram_block *b;
354
355
    /* Find the external block */
356
0
    b = cram_get_block_by_id(slice, c->u.external.content_id);
357
0
    if (!b)
358
0
        return *out_size?-1:0;
359
360
0
    cp = (char *)b->data + b->idx;
361
    // E_INT and E_LONG are guaranteed single item queries
362
0
    int err = 0;
363
0
    *(int32_t *)out = c->vv->varint_get32(&cp, (char *)b->data + b->uncomp_size, &err);
364
0
    b->idx = cp - (char *)b->data;
365
0
    *out_size = 1;
366
367
0
    return err ? -1 : 0;
368
0
}
369
370
int cram_external_decode_long(cram_slice *slice, cram_codec *c,
371
0
                              cram_block *in, char *out, int *out_size) {
372
0
    char *cp;
373
0
    cram_block *b;
374
375
    /* Find the external block */
376
0
    b = cram_get_block_by_id(slice, c->u.external.content_id);
377
0
    if (!b)
378
0
        return *out_size?-1:0;
379
380
0
    cp = (char *)b->data + b->idx;
381
    // E_INT and E_LONG are guaranteed single item queries
382
0
    int err = 0;
383
0
    *(int64_t *)out = c->vv->varint_get64(&cp, (char *)b->data + b->uncomp_size, &err);
384
0
    b->idx = cp - (char *)b->data;
385
0
    *out_size = 1;
386
387
0
    return err ? -1 : 0;
388
0
}
389
390
int cram_external_decode_char(cram_slice *slice, cram_codec *c,
391
                              cram_block *in, char *out,
392
0
                              int *out_size) {
393
0
    char *cp;
394
0
    cram_block *b;
395
396
    /* Find the external block */
397
0
    b = cram_get_block_by_id(slice, c->u.external.content_id);
398
0
    if (!b)
399
0
        return *out_size?-1:0;
400
401
0
    cp = cram_extract_block(b, *out_size);
402
0
    if (!cp)
403
0
        return -1;
404
405
0
    if (out)
406
0
        memcpy(out, cp, *out_size);
407
0
    return 0;
408
0
}
409
410
static int cram_external_decode_block(cram_slice *slice, cram_codec *c,
411
                                      cram_block *in, char *out_,
412
0
                                      int *out_size) {
413
0
    char *cp;
414
0
    cram_block *out = (cram_block *)out_;
415
0
    cram_block *b = NULL;
416
417
    /* Find the external block */
418
0
    b = cram_get_block_by_id(slice, c->u.external.content_id);
419
0
    if (!b)
420
0
        return *out_size?-1:0;
421
422
0
    cp = cram_extract_block(b, *out_size);
423
0
    if (!cp)
424
0
        return -1;
425
426
0
    BLOCK_APPEND(out, cp, *out_size);
427
0
    return 0;
428
429
0
 block_err:
430
0
    return -1;
431
0
}
432
433
2.70k
void cram_external_decode_free(cram_codec *c) {
434
2.70k
    if (c)
435
2.70k
        free(c);
436
2.70k
}
437
438
439
0
int cram_external_decode_size(cram_slice *slice, cram_codec *c) {
440
0
    cram_block *b;
441
442
    /* Find the external block */
443
0
    b = cram_get_block_by_id(slice, c->u.external.content_id);
444
0
    if (!b)
445
0
        return -1;
446
447
0
    return b->uncomp_size;
448
0
}
449
450
0
cram_block *cram_external_get_block(cram_slice *slice, cram_codec *c) {
451
0
    return cram_get_block_by_id(slice, c->u.external.content_id);
452
0
}
453
454
0
int cram_external_describe(cram_codec *c, kstring_t *ks) {
455
0
    return ksprintf(ks, "EXTERNAL(id=%d)",
456
0
                    c->u.external.content_id) < 0 ? -1 : 0;
457
0
}
458
459
cram_codec *cram_external_decode_init(cram_block_compression_hdr *hdr,
460
                                      char *data, int size,
461
                                      enum cram_encoding codec,
462
                                      enum cram_external_type option,
463
2.74k
                                      int version, varint_vec *vv) {
464
2.74k
    cram_codec *c = NULL;
465
2.74k
    char *cp = data;
466
467
2.74k
    if (size < 1)
468
9
        goto malformed;
469
470
2.73k
    if (!(c = malloc(sizeof(*c))))
471
0
        return NULL;
472
473
2.73k
    c->codec  = E_EXTERNAL;
474
2.73k
    if (CRAM_MAJOR_VERS(version) >= 4) {
475
        // Version 4 does not permit integer data to be encoded as a
476
        // series of bytes.  This is used purely for bytes, either
477
        // singular or declared as arrays
478
63
        switch (codec) {
479
63
        case E_EXTERNAL:
480
63
            if (option == E_BYTE_ARRAY_BLOCK)
481
48
                c->decode = cram_external_decode_block;
482
15
            else if (option == E_BYTE || option == E_BYTE_ARRAY)
483
3
                c->decode = cram_external_decode_char;
484
12
            else
485
12
                goto malformed;
486
51
            break;
487
51
        default:
488
0
            goto malformed;
489
63
        }
490
2.67k
    } else {
491
        // CRAM 3 and earlier encodes integers as EXTERNAL.  We need
492
        // use the option field to indicate the input data format so
493
        // we know which serialisation format to use.
494
2.67k
        if (option == E_INT)
495
1.97k
            c->decode = cram_external_decode_int;
496
699
        else if (option == E_LONG)
497
0
            c->decode = cram_external_decode_long;
498
699
        else if (option == E_BYTE_ARRAY || option == E_BYTE)
499
33
            c->decode = cram_external_decode_char;
500
666
        else
501
666
            c->decode = cram_external_decode_block;
502
2.67k
    }
503
2.72k
    c->free   = cram_external_decode_free;
504
2.72k
    c->size   = cram_external_decode_size;
505
2.72k
    c->get_block = cram_external_get_block;
506
2.72k
    c->describe = cram_external_describe;
507
508
2.72k
    c->u.external.content_id = vv->varint_get32(&cp, data+size, NULL);
509
510
2.72k
    if (cp - data != size)
511
18
        goto malformed;
512
513
2.70k
    c->u.external.type = option;
514
515
2.70k
    return c;
516
517
39
 malformed:
518
39
    hts_log_error("Malformed external header stream");
519
39
    free(c);
520
39
    return NULL;
521
2.72k
}
522
523
int cram_external_encode_int(cram_slice *slice, cram_codec *c,
524
14.1M
                             char *in, int in_size) {
525
14.1M
    uint32_t *i32 = (uint32_t *)in;
526
14.1M
    return c->vv->varint_put32_blk(c->out, *i32) >= 0 ? 0 : -1;
527
14.1M
}
528
529
int cram_external_encode_sint(cram_slice *slice, cram_codec *c,
530
0
                             char *in, int in_size) {
531
0
    int32_t *i32 = (int32_t *)in;
532
0
    return c->vv->varint_put32s_blk(c->out, *i32) >= 0 ? 0 : -1;
533
0
}
534
535
int cram_external_encode_long(cram_slice *slice, cram_codec *c,
536
0
                             char *in, int in_size) {
537
0
    uint64_t *i64 = (uint64_t *)in;
538
0
    return c->vv->varint_put64_blk(c->out, *i64) >= 0 ? 0 : -1;
539
0
}
540
541
int cram_external_encode_slong(cram_slice *slice, cram_codec *c,
542
0
                               char *in, int in_size) {
543
0
    int64_t *i64 = (int64_t *)in;
544
0
    return c->vv->varint_put64s_blk(c->out, *i64) >= 0 ? 0 : -1;
545
0
}
546
547
int cram_external_encode_char(cram_slice *slice, cram_codec *c,
548
451k
                              char *in, int in_size) {
549
451k
    BLOCK_APPEND(c->out, in, in_size);
550
451k
    return 0;
551
552
0
 block_err:
553
0
    return -1;
554
451k
}
555
556
350k
void cram_external_encode_free(cram_codec *c) {
557
350k
    if (!c)
558
0
        return;
559
350k
    free(c);
560
350k
}
561
562
int cram_external_encode_store(cram_codec *c, cram_block *b, char *prefix,
563
349k
                               int version) {
564
349k
    char tmp[99], *tp = tmp, *tpend = tmp+99;
565
349k
    int len = 0, r = 0, n;
566
567
349k
    if (prefix) {
568
107k
        size_t l = strlen(prefix);
569
107k
        BLOCK_APPEND(b, prefix, l);
570
107k
        len += l;
571
107k
    }
572
573
349k
    tp += c->vv->varint_put32(tp, tpend, c->u.e_external.content_id);
574
349k
    len += (n = c->vv->varint_put32_blk(b, c->codec)); r |= n;
575
349k
    len += (n = c->vv->varint_put32_blk(b, tp-tmp));   r |= n;
576
349k
    BLOCK_APPEND(b, tmp, tp-tmp);
577
349k
    len += tp-tmp;
578
579
349k
    if (r > 0)
580
349k
        return len;
581
582
0
 block_err:
583
0
    return -1;
584
349k
}
585
586
cram_codec *cram_external_encode_init(cram_stats *st,
587
                                      enum cram_encoding codec,
588
                                      enum cram_external_type option,
589
                                      void *dat,
590
350k
                                      int version, varint_vec *vv) {
591
350k
    cram_codec *c;
592
593
350k
    c = malloc(sizeof(*c));
594
350k
    if (!c)
595
0
        return NULL;
596
350k
    c->codec = E_EXTERNAL;
597
350k
    c->free = cram_external_encode_free;
598
350k
    if (CRAM_MAJOR_VERS(version) >= 4) {
599
        // Version 4 does not permit integer data to be encoded as a
600
        // series of bytes.  This is used purely for bytes, either
601
        // singular or declared as arrays
602
0
        switch (codec) {
603
0
        case E_EXTERNAL:
604
0
            if (option != E_BYTE && option != E_BYTE_ARRAY)
605
0
                return NULL;
606
0
            c->encode = cram_external_encode_char;
607
0
            break;
608
0
        default:
609
0
            return NULL;
610
0
        }
611
350k
    } else {
612
        // CRAM 3 and earlier encodes integers as EXTERNAL.  We need
613
        // use the option field to indicate the input data format so
614
        // we know which serialisation format to use.
615
350k
        if (option == E_INT)
616
103k
            c->encode = cram_external_encode_int;
617
247k
        else if (option == E_LONG)
618
0
            c->encode = cram_external_encode_long;
619
247k
        else if (option == E_BYTE_ARRAY || option == E_BYTE)
620
247k
            c->encode = cram_external_encode_char;
621
0
        else
622
0
            abort();
623
350k
    }
624
350k
    c->store = cram_external_encode_store;
625
350k
    c->flush = NULL;
626
627
350k
    c->u.e_external.content_id = (size_t)dat;
628
629
350k
    return c;
630
350k
}
631
632
/*
633
 * ---------------------------------------------------------------------------
634
 * VARINT
635
 *
636
 * In CRAM 3.0 and earlier, E_EXTERNAL stored both integers in ITF8
637
 * format as well as bytes.  In CRAM 4 EXTERNAL is only for bytes and
638
 * byte arrays, with two dedicated encodings for integers:
639
 * VARINT_SIGNED and VARINT_UNSIGNED.  These also differ a little to
640
 * EXTERNAL with the addition of an offset field, meaning we can store
641
 * values in, say, the range -2 to 1 million without needing to use
642
 * a signed zig-zag transformation.
643
 */
644
int cram_varint_decode_int(cram_slice *slice, cram_codec *c,
645
0
                           cram_block *in, char *out, int *out_size) {
646
0
    char *cp;
647
0
    cram_block *b;
648
649
    /* Find the data block */
650
0
    b = cram_get_block_by_id(slice, c->u.varint.content_id);
651
0
    if (!b)
652
0
        return *out_size?-1:0;
653
654
0
    cp = (char *)b->data + b->idx;
655
    // E_INT and E_LONG are guaranteed single item queries
656
0
    int err = 0;
657
0
    *(int32_t *)out = c->vv->varint_get32(&cp,
658
0
                                          (char *)b->data + b->uncomp_size,
659
0
                                          &err) + c->u.varint.offset;
660
0
    b->idx = cp - (char *)b->data;
661
0
    *out_size = 1;
662
663
0
    return err ? -1 : 0;
664
0
}
665
666
int cram_varint_decode_sint(cram_slice *slice, cram_codec *c,
667
0
                            cram_block *in, char *out, int *out_size) {
668
0
    char *cp;
669
0
    cram_block *b;
670
671
    /* Find the data block */
672
0
    b = cram_get_block_by_id(slice, c->u.varint.content_id);
673
0
    if (!b)
674
0
        return *out_size?-1:0;
675
676
0
    cp = (char *)b->data + b->idx;
677
    // E_INT and E_LONG are guaranteed single item queries
678
0
    int err = 0;
679
0
    *(int32_t *)out = c->vv->varint_get32s(&cp,
680
0
                                           (char *)b->data + b->uncomp_size,
681
0
                                           &err) + c->u.varint.offset;
682
0
    b->idx = cp - (char *)b->data;
683
0
    *out_size = 1;
684
685
0
    return err ? -1 : 0;
686
0
}
687
688
int cram_varint_decode_long(cram_slice *slice, cram_codec *c,
689
0
                            cram_block *in, char *out, int *out_size) {
690
0
    char *cp;
691
0
    cram_block *b;
692
693
    /* Find the data block */
694
0
    b = cram_get_block_by_id(slice, c->u.varint.content_id);
695
0
    if (!b)
696
0
        return *out_size?-1:0;
697
698
0
    cp = (char *)b->data + b->idx;
699
    // E_INT and E_LONG are guaranteed single item queries
700
0
    int err = 0;
701
0
    *(int64_t *)out = c->vv->varint_get64(&cp,
702
0
                                          (char *)b->data + b->uncomp_size,
703
0
                                          &err) + c->u.varint.offset;
704
0
    b->idx = cp - (char *)b->data;
705
0
    *out_size = 1;
706
707
0
    return err ? -1 : 0;
708
0
}
709
710
int cram_varint_decode_slong(cram_slice *slice, cram_codec *c,
711
0
                             cram_block *in, char *out, int *out_size) {
712
0
    char *cp;
713
0
    cram_block *b;
714
715
    /* Find the data block */
716
0
    b = cram_get_block_by_id(slice, c->u.varint.content_id);
717
0
    if (!b)
718
0
        return *out_size?-1:0;
719
720
0
    cp = (char *)b->data + b->idx;
721
    // E_INT and E_LONG are guaranteed single item queries
722
0
    int err = 0;
723
0
    *(int64_t *)out = c->vv->varint_get64s(&cp,
724
0
                                           (char *)b->data + b->uncomp_size,
725
0
                                           &err) + c->u.varint.offset;
726
0
    b->idx = cp - (char *)b->data;
727
0
    *out_size = 1;
728
729
0
    return err ? -1 : 0;
730
0
}
731
732
779
void cram_varint_decode_free(cram_codec *c) {
733
779
    if (c)
734
779
        free(c);
735
779
}
736
737
0
int cram_varint_decode_size(cram_slice *slice, cram_codec *c) {
738
0
    cram_block *b;
739
740
    /* Find the data block */
741
0
    b = cram_get_block_by_id(slice, c->u.varint.content_id);
742
0
    if (!b)
743
0
        return -1;
744
745
0
    return b->uncomp_size;
746
0
}
747
748
0
cram_block *cram_varint_get_block(cram_slice *slice, cram_codec *c) {
749
0
    return cram_get_block_by_id(slice, c->u.varint.content_id);
750
0
}
751
752
0
int cram_varint_describe(cram_codec *c, kstring_t *ks) {
753
0
    return ksprintf(ks, "VARINT(id=%d,offset=%"PRId64",type=%d)",
754
0
                    c->u.varint.content_id,
755
0
                    c->u.varint.offset,
756
0
                    c->u.varint.type)
757
0
        < 0 ? -1 : 0;
758
0
}
759
760
cram_codec *cram_varint_decode_init(cram_block_compression_hdr *hdr,
761
                                    char *data, int size,
762
                                    enum cram_encoding codec,
763
                                    enum cram_external_type option,
764
815
                                    int version, varint_vec *vv) {
765
815
    cram_codec *c;
766
815
    char *cp = data, *cp_end = data+size;
767
768
815
    if (!(c = malloc(sizeof(*c))))
769
0
        return NULL;
770
771
815
    c->codec  = codec;
772
773
    // Function pointer choice is theoretically by codec type.
774
    // Given we have some vars as int32 and some as int64 we
775
    // use option too for sizing, although on disk format
776
    // does not change.
777
815
    switch(codec) {
778
564
    case E_VARINT_UNSIGNED:
779
564
        c->decode = (option == E_INT)
780
564
            ? cram_varint_decode_int
781
564
            : cram_varint_decode_long;
782
564
        break;
783
251
    case E_VARINT_SIGNED:
784
251
        c->decode = (option == E_INT)
785
251
            ? cram_varint_decode_sint
786
251
            : cram_varint_decode_slong;
787
251
        break;
788
0
    default:
789
0
        return NULL;
790
815
    }
791
792
815
    c->free   = cram_varint_decode_free;
793
815
    c->size   = cram_varint_decode_size;
794
815
    c->get_block = cram_varint_get_block;
795
815
    c->describe = cram_varint_describe;
796
797
815
    c->u.varint.content_id = vv->varint_get32 (&cp, cp_end, NULL);
798
815
    c->u.varint.offset     = vv->varint_get64s(&cp, cp_end, NULL);
799
800
815
    if (cp - data != size) {
801
36
        fprintf(stderr, "Malformed varint header stream\n");
802
36
        free(c);
803
36
        return NULL;
804
36
    }
805
806
779
    c->u.varint.type = option;
807
808
779
    return c;
809
815
}
810
811
int cram_varint_encode_int(cram_slice *slice, cram_codec *c,
812
0
                           char *in, int in_size) {
813
0
    uint32_t *i32 = (uint32_t *)in;
814
0
    return c->vv->varint_put32_blk(c->out, *i32 - c->u.varint.offset) >= 0
815
0
        ? 0 : -1;
816
0
}
817
818
int cram_varint_encode_sint(cram_slice *slice, cram_codec *c,
819
0
                            char *in, int in_size) {
820
0
    int32_t *i32 = (int32_t *)in;
821
0
    return c->vv->varint_put32s_blk(c->out, *i32 - c->u.varint.offset) >= 0
822
0
        ? 0 : -1;
823
0
}
824
825
int cram_varint_encode_long(cram_slice *slice, cram_codec *c,
826
0
                            char *in, int in_size) {
827
0
    uint64_t *i64 = (uint64_t *)in;
828
0
    return c->vv->varint_put64_blk(c->out, *i64 - c->u.varint.offset) >= 0
829
0
        ? 0 : -1;
830
0
}
831
832
int cram_varint_encode_slong(cram_slice *slice, cram_codec *c,
833
0
                             char *in, int in_size) {
834
0
    int64_t *i64 = (int64_t *)in;
835
0
    return c->vv->varint_put64s_blk(c->out, *i64 - c->u.varint.offset) >= 0
836
0
        ? 0 : -1;
837
0
}
838
839
0
void cram_varint_encode_free(cram_codec *c) {
840
0
    if (!c)
841
0
        return;
842
0
    free(c);
843
0
}
844
845
int cram_varint_encode_store(cram_codec *c, cram_block *b, char *prefix,
846
0
                             int version) {
847
0
    char tmp[99], *tp = tmp;
848
0
    int len = 0;
849
850
0
    if (prefix) {
851
0
        size_t l = strlen(prefix);
852
0
        BLOCK_APPEND(b, prefix, l);
853
0
        len += l;
854
0
    }
855
856
0
    tp += c->vv->varint_put32 (tp, NULL, c->u.e_varint.content_id);
857
0
    tp += c->vv->varint_put64s(tp, NULL, c->u.e_varint.offset);
858
0
    len += c->vv->varint_put32_blk(b, c->codec);
859
0
    len += c->vv->varint_put32_blk(b, tp-tmp);
860
0
    BLOCK_APPEND(b, tmp, tp-tmp);
861
0
    len += tp-tmp;
862
863
0
    return len;
864
865
0
 block_err:
866
0
    return -1;
867
0
}
868
869
cram_codec *cram_varint_encode_init(cram_stats *st,
870
                                    enum cram_encoding codec,
871
                                    enum cram_external_type option,
872
                                    void *dat,
873
0
                                    int version, varint_vec *vv) {
874
0
    cram_codec *c;
875
876
0
    if (!(c = malloc(sizeof(*c))))
877
0
        return NULL;
878
879
0
    c->u.e_varint.offset = 0;
880
0
    if (st) {
881
        // Marginal difference so far! Not worth the hassle?
882
0
        if (st->min_val < 0 && st->min_val >= -127
883
0
            && st->max_val / -st->min_val > 100) {
884
0
            c->u.e_varint.offset = -st->min_val;
885
0
            codec = E_VARINT_UNSIGNED;
886
0
        } else if (st->min_val > 0) {
887
0
            c->u.e_varint.offset = -st->min_val;
888
0
        }
889
0
    }
890
891
0
    c->codec = codec;
892
0
    c->free = cram_varint_encode_free;
893
894
    // Function pointer choice is theoretically by codec type.
895
    // Given we have some vars as int32 and some as int64 we
896
    // use option too for sizing, although on disk format
897
    // does not change.
898
0
    switch (codec) {
899
0
    case E_VARINT_UNSIGNED:
900
0
        c->encode = (option == E_INT)
901
0
            ? cram_varint_encode_int
902
0
            : cram_varint_encode_long;
903
0
        break;
904
0
    case E_VARINT_SIGNED:
905
0
        c->encode = (option == E_INT)
906
0
            ? cram_varint_encode_sint
907
0
            : cram_varint_encode_slong;
908
0
        break;
909
0
    default:
910
0
        return NULL;
911
0
    }
912
0
    c->store = cram_varint_encode_store;
913
0
    c->flush = NULL;
914
915
0
    c->u.e_varint.content_id = (size_t)dat;
916
917
0
    return c;
918
0
}
919
/*
920
 * ---------------------------------------------------------------------------
921
 * CONST_BYTE and CONST_INT
922
 */
923
int cram_const_decode_byte(cram_slice *slice, cram_codec *c,
924
0
                           cram_block *in, char *out, int *out_size) {
925
0
    int i, n;
926
927
0
    for (i = 0, n = *out_size; i < n; i++)
928
0
        out[i] = c->u.xconst.val;
929
930
0
    return 0;
931
0
}
932
933
int cram_const_decode_int(cram_slice *slice, cram_codec *c,
934
0
                          cram_block *in, char *out, int *out_size) {
935
0
    int32_t *out_i = (int32_t *)out;
936
0
    int i, n;
937
938
0
    for (i = 0, n = *out_size; i < n; i++)
939
0
        out_i[i] = c->u.xconst.val;
940
941
0
    return 0;
942
0
}
943
944
int cram_const_decode_long(cram_slice *slice, cram_codec *c,
945
0
                           cram_block *in, char *out, int *out_size) {
946
0
    int64_t *out_i = (int64_t *)out;
947
0
    int i, n;
948
949
0
    for (i = 0, n = *out_size; i < n; i++)
950
0
        out_i[i] = c->u.xconst.val;
951
952
0
    return 0;
953
0
}
954
955
465
void cram_const_decode_free(cram_codec *c) {
956
465
    if (c)
957
465
        free(c);
958
465
}
959
960
0
int cram_const_decode_size(cram_slice *slice, cram_codec *c) {
961
0
    return 0;
962
0
}
963
964
0
int cram_const_describe(cram_codec *c, kstring_t *ks) {
965
0
    return ksprintf(ks, "CONST(val=%"PRId64")",
966
0
                    c->u.xconst.val) < 0 ? -1 : 0;
967
0
}
968
969
cram_codec *cram_const_decode_init(cram_block_compression_hdr *hdr,
970
                                   char *data, int size,
971
                                   enum cram_encoding codec,
972
                                   enum cram_external_type option,
973
477
                                   int version, varint_vec *vv) {
974
477
    cram_codec *c;
975
477
    char *cp = data;
976
977
477
    if (!(c = malloc(sizeof(*c))))
978
0
        return NULL;
979
980
477
    c->codec  = codec;
981
477
    if (codec == E_CONST_BYTE)
982
6
        c->decode = cram_const_decode_byte;
983
471
    else if (option == E_INT)
984
303
        c->decode = cram_const_decode_int;
985
168
    else
986
168
        c->decode = cram_const_decode_long;
987
477
    c->free   = cram_const_decode_free;
988
477
    c->size   = cram_const_decode_size;
989
477
    c->get_block = NULL;
990
477
    c->describe = cram_const_describe;
991
992
477
    c->u.xconst.val = vv->varint_get64s(&cp, data+size, NULL);
993
994
477
    if (cp - data != size) {
995
12
        fprintf(stderr, "Malformed const header stream\n");
996
12
        free(c);
997
12
        return NULL;
998
12
    }
999
1000
465
    return c;
1001
477
}
1002
1003
int cram_const_encode(cram_slice *slice, cram_codec *c,
1004
0
                      char *in, int in_size) {
1005
0
    return 0;
1006
0
}
1007
1008
int cram_const_encode_store(cram_codec *c, cram_block *b, char *prefix,
1009
0
                            int version) {
1010
0
    char tmp[99], *tp = tmp;
1011
0
    int len = 0;
1012
1013
0
    if (prefix) {
1014
0
        size_t l = strlen(prefix);
1015
0
        BLOCK_APPEND(b, prefix, l);
1016
0
        len += l;
1017
0
    }
1018
1019
0
    tp += c->vv->varint_put64s(tp, NULL, c->u.xconst.val);
1020
0
    len += c->vv->varint_put32_blk(b, c->codec);
1021
0
    len += c->vv->varint_put32_blk(b, tp-tmp);
1022
0
    BLOCK_APPEND(b, tmp, tp-tmp);
1023
0
    len += tp-tmp;
1024
1025
0
    return len;
1026
1027
0
 block_err:
1028
0
    return -1;
1029
0
}
1030
1031
cram_codec *cram_const_encode_init(cram_stats *st,
1032
                                   enum cram_encoding codec,
1033
                                   enum cram_external_type option,
1034
                                   void *dat,
1035
0
                                   int version, varint_vec *vv) {
1036
0
    cram_codec *c;
1037
1038
0
    if (!(c = malloc(sizeof(*c))))
1039
0
        return NULL;
1040
1041
0
    c->codec = codec;
1042
0
    c->free = cram_const_decode_free; // as as decode
1043
0
    c->encode = cram_const_encode; // a nop
1044
0
    c->store = cram_const_encode_store;
1045
0
    c->flush = NULL;
1046
0
    c->u.e_xconst.val = st->min_val;
1047
1048
0
    return c;
1049
0
}
1050
1051
/*
1052
 * ---------------------------------------------------------------------------
1053
 * BETA
1054
 */
1055
0
int cram_beta_decode_long(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
1056
0
    int64_t *out_i = (int64_t *)out;
1057
0
    int i, n = *out_size;
1058
1059
0
    if (c->u.beta.nbits) {
1060
0
        if (cram_not_enough_bits(in, c->u.beta.nbits * n))
1061
0
            return -1;
1062
1063
0
        for (i = 0; i < n; i++)
1064
0
            out_i[i] = get_bits_MSB(in, c->u.beta.nbits) - c->u.beta.offset;
1065
0
    } else {
1066
0
        for (i = 0; i < n; i++)
1067
0
            out_i[i] = -c->u.beta.offset;
1068
0
    }
1069
1070
0
    return 0;
1071
0
}
1072
1073
0
int cram_beta_decode_int(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
1074
0
    int32_t *out_i = (int32_t *)out;
1075
0
    int i, n = *out_size;
1076
1077
0
    if (c->u.beta.nbits) {
1078
0
        if (cram_not_enough_bits(in, c->u.beta.nbits * n))
1079
0
            return -1;
1080
1081
0
        for (i = 0; i < n; i++)
1082
0
            out_i[i] = get_bits_MSB(in, c->u.beta.nbits) - c->u.beta.offset;
1083
0
    } else {
1084
0
        for (i = 0; i < n; i++)
1085
0
            out_i[i] = -c->u.beta.offset;
1086
0
    }
1087
1088
0
    return 0;
1089
0
}
1090
1091
0
int cram_beta_decode_char(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
1092
0
    int i, n = *out_size;
1093
1094
1095
0
    if (c->u.beta.nbits) {
1096
0
        if (cram_not_enough_bits(in, c->u.beta.nbits * n))
1097
0
            return -1;
1098
1099
0
        if (out)
1100
0
            for (i = 0; i < n; i++)
1101
0
                out[i] = get_bits_MSB(in, c->u.beta.nbits) - c->u.beta.offset;
1102
0
        else
1103
0
            for (i = 0; i < n; i++)
1104
0
                get_bits_MSB(in, c->u.beta.nbits);
1105
0
    } else {
1106
0
        if (out)
1107
0
            for (i = 0; i < n; i++)
1108
0
                out[i] = -c->u.beta.offset;
1109
0
    }
1110
1111
0
    return 0;
1112
0
}
1113
1114
258
void cram_beta_decode_free(cram_codec *c) {
1115
258
    if (c)
1116
258
        free(c);
1117
258
}
1118
1119
0
int cram_beta_describe(cram_codec *c, kstring_t *ks) {
1120
0
    return ksprintf(ks, "BETA(offset=%d, nbits=%d)",
1121
0
                    c->u.beta.offset, c->u.beta.nbits)
1122
0
        < 0 ? -1 : 0;
1123
0
}
1124
1125
cram_codec *cram_beta_decode_init(cram_block_compression_hdr *hdr,
1126
                                  char *data, int size,
1127
                                  enum cram_encoding codec,
1128
                                  enum cram_external_type option,
1129
279
                                  int version, varint_vec *vv) {
1130
279
    cram_codec *c;
1131
279
    char *cp = data;
1132
1133
279
    if (!(c = malloc(sizeof(*c))))
1134
0
        return NULL;
1135
1136
279
    c->codec  = E_BETA;
1137
279
    if (option == E_INT || option == E_SINT)
1138
57
        c->decode = cram_beta_decode_int;
1139
222
    else if (option == E_LONG || option == E_SLONG)
1140
3
        c->decode = cram_beta_decode_long;
1141
219
    else if (option == E_BYTE_ARRAY || option == E_BYTE)
1142
216
        c->decode = cram_beta_decode_char;
1143
3
    else {
1144
3
        hts_log_error("BYTE_ARRAYs not supported by this codec");
1145
3
        free(c);
1146
3
        return NULL;
1147
3
    }
1148
276
    c->free   = cram_beta_decode_free;
1149
276
    c->describe = cram_beta_describe;
1150
1151
276
    c->u.beta.nbits = -1;
1152
276
    c->u.beta.offset = vv->varint_get32(&cp, data + size, NULL);
1153
276
    if (cp < data + size) // Ensure test below works
1154
273
        c->u.beta.nbits  = vv->varint_get32(&cp, data + size, NULL);
1155
1156
276
    if (cp - data != size
1157
276
        || c->u.beta.nbits < 0 || c->u.beta.nbits > 8 * sizeof(int)) {
1158
18
        hts_log_error("Malformed beta header stream");
1159
18
        free(c);
1160
18
        return NULL;
1161
18
    }
1162
1163
258
    return c;
1164
276
}
1165
1166
int cram_beta_encode_store(cram_codec *c, cram_block *b,
1167
1.26k
                           char *prefix, int version) {
1168
1.26k
    int len = 0, r = 0, n;
1169
1170
1.26k
    if (prefix) {
1171
1.26k
        size_t l = strlen(prefix);
1172
1.26k
        BLOCK_APPEND(b, prefix, l);
1173
1.26k
        len += l;
1174
1.26k
    }
1175
1176
1.26k
    len += (n = c->vv->varint_put32_blk(b, c->codec)); r |= n;
1177
    // codec length
1178
1.26k
    len += (n = c->vv->varint_put32_blk(b, c->vv->varint_size(c->u.e_beta.offset)
1179
1.26k
                                         + c->vv->varint_size(c->u.e_beta.nbits)));
1180
1.26k
    r |= n;
1181
1.26k
    len += (n = c->vv->varint_put32_blk(b, c->u.e_beta.offset)); r |= n;
1182
1.26k
    len += (n = c->vv->varint_put32_blk(b, c->u.e_beta.nbits));  r |= n;
1183
1184
1.26k
    if (r > 0) return len;
1185
1186
0
 block_err:
1187
0
    return -1;
1188
1.26k
}
1189
1190
int cram_beta_encode_long(cram_slice *slice, cram_codec *c,
1191
0
                          char *in, int in_size) {
1192
0
    int64_t *syms = (int64_t *)in;
1193
0
    int i, r = 0;
1194
1195
0
    for (i = 0; i < in_size; i++)
1196
0
        r |= store_bits_MSB(c->out, syms[i] + c->u.e_beta.offset,
1197
0
                            c->u.e_beta.nbits);
1198
1199
0
    return r;
1200
0
}
1201
1202
int cram_beta_encode_int(cram_slice *slice, cram_codec *c,
1203
17.7k
                         char *in, int in_size) {
1204
17.7k
    int *syms = (int *)in;
1205
17.7k
    int i, r = 0;
1206
1207
35.5k
    for (i = 0; i < in_size; i++)
1208
17.7k
        r |= store_bits_MSB(c->out, syms[i] + c->u.e_beta.offset,
1209
17.7k
                            c->u.e_beta.nbits);
1210
1211
17.7k
    return r;
1212
17.7k
}
1213
1214
int cram_beta_encode_char(cram_slice *slice, cram_codec *c,
1215
0
                          char *in, int in_size) {
1216
0
    unsigned char *syms = (unsigned char *)in;
1217
0
    int i, r = 0;
1218
1219
0
    for (i = 0; i < in_size; i++)
1220
0
        r |= store_bits_MSB(c->out, syms[i] + c->u.e_beta.offset,
1221
0
                            c->u.e_beta.nbits);
1222
1223
0
    return r;
1224
0
}
1225
1226
1.26k
void cram_beta_encode_free(cram_codec *c) {
1227
1.26k
    if (c) free(c);
1228
1.26k
}
1229
1230
cram_codec *cram_beta_encode_init(cram_stats *st,
1231
                                  enum cram_encoding codec,
1232
                                  enum cram_external_type option,
1233
                                  void *dat,
1234
1.35k
                                  int version, varint_vec *vv) {
1235
1.35k
    cram_codec *c;
1236
1.35k
    hts_pos_t min_val, max_val;
1237
1.35k
    int len = 0;
1238
1.35k
    int64_t range;
1239
1240
1.35k
    c = malloc(sizeof(*c));
1241
1.35k
    if (!c)
1242
0
        return NULL;
1243
1.35k
    c->codec  = E_BETA;
1244
1.35k
    c->free   = cram_beta_encode_free;
1245
1.35k
    if (option == E_INT || option == E_SINT)
1246
1.35k
        c->encode = cram_beta_encode_int;
1247
0
    else if (option == E_LONG || option == E_SLONG)
1248
0
        c->encode = cram_beta_encode_long;
1249
0
    else
1250
0
        c->encode = cram_beta_encode_char;
1251
1.35k
    c->store  = cram_beta_encode_store;
1252
1.35k
    c->flush = NULL;
1253
1254
1.35k
    if (dat) {
1255
1.35k
        min_val = ((hts_pos_t *)dat)[0];
1256
1.35k
        max_val = ((hts_pos_t *)dat)[1];
1257
1.35k
    } else {
1258
0
        min_val = INT_MAX;
1259
0
        max_val = INT_MIN;
1260
0
        int i;
1261
0
        for (i = 0; i < MAX_STAT_VAL; i++) {
1262
0
            if (!st->freqs[i])
1263
0
                continue;
1264
0
            if (min_val > i)
1265
0
                min_val = i;
1266
0
            max_val = i;
1267
0
        }
1268
0
        if (st->h) {
1269
0
            khint_t k;
1270
1271
0
            for (k = kh_begin(st->h); k != kh_end(st->h); k++) {
1272
0
                if (!kh_exist(st->h, k))
1273
0
                    continue;
1274
1275
0
                i = kh_key(st->h, k);
1276
0
                if (min_val > i)
1277
0
                    min_val = i;
1278
0
                if (max_val < i)
1279
0
                    max_val = i;
1280
0
            }
1281
0
        }
1282
0
    }
1283
1284
1.35k
    if (max_val < min_val)
1285
0
        goto err;
1286
1287
1.35k
    range = (int64_t) max_val - min_val;
1288
1.35k
    switch (option) {
1289
0
    case E_SINT:
1290
0
        if (min_val < INT_MIN || range > INT_MAX)
1291
0
            goto err;
1292
0
        break;
1293
1294
1.35k
    case E_INT:
1295
1.35k
        if (max_val > UINT_MAX || range > UINT_MAX)
1296
93
            goto err;
1297
1.26k
        break;
1298
1299
1.26k
    default:
1300
0
        break;
1301
1.35k
    }
1302
1303
1.26k
    c->u.e_beta.offset = -min_val;
1304
5.20k
    while (range) {
1305
3.94k
        len++;
1306
3.94k
        range >>= 1;
1307
3.94k
    }
1308
1.26k
    c->u.e_beta.nbits = len;
1309
1310
1.26k
    return c;
1311
1312
93
 err:
1313
93
    free(c);
1314
93
    return NULL;
1315
1.35k
}
1316
1317
/*
1318
 * ---------------------------------------------------------------------------
1319
 * XPACK: Packing multiple values into a single byte.  A fast transform that
1320
 * reduces time taken by entropy encoder and may also improve compression.
1321
 *
1322
 * This also has the additional requirement that the data series is not
1323
 * interleaved with another, permitting efficient encoding and decoding
1324
 * of all elements enmasse instead of needing to only extract the bits
1325
 * necessary per item.
1326
 */
1327
0
int cram_xpack_decode_long(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
1328
0
    int64_t *out_i = (int64_t *)out;
1329
0
    int i, n = *out_size;
1330
1331
0
    if (c->u.xpack.nbits) {
1332
0
        for (i = 0; i < n; i++)
1333
0
            out_i[i] = c->u.xpack.rmap[get_bits_MSB(in, c->u.xpack.nbits)];
1334
0
    } else {
1335
0
        for (i = 0; i < n; i++)
1336
0
            out_i[i] = c->u.xpack.rmap[0];
1337
0
    }
1338
1339
0
    return 0;
1340
0
}
1341
1342
0
int cram_xpack_decode_int(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
1343
0
    int32_t *out_i = (int32_t *)out;
1344
0
    int i, n = *out_size;
1345
1346
0
    if (c->u.xpack.nbits) {
1347
0
        if (cram_not_enough_bits(in, c->u.xpack.nbits * n))
1348
0
            return -1;
1349
1350
0
        for (i = 0; i < n; i++)
1351
0
            out_i[i] = c->u.xpack.rmap[get_bits_MSB(in, c->u.xpack.nbits)];
1352
0
    } else {
1353
0
        for (i = 0; i < n; i++)
1354
0
            out_i[i] = c->u.xpack.rmap[0];
1355
0
    }
1356
1357
0
    return 0;
1358
0
}
1359
1360
0
static int cram_xpack_decode_expand_char(cram_slice *slice, cram_codec *c) {
1361
0
    cram_block *b = slice->block_by_id[512 + c->codec_id];
1362
0
    if (b)
1363
0
        return 0;
1364
1365
    // get sub-codec data.
1366
0
    cram_block *sub_b = c->u.xpack.sub_codec->get_block(slice, c->u.xpack.sub_codec);
1367
0
    if (!sub_b)
1368
0
        return -1;
1369
1370
    // Allocate local block to expand into
1371
0
    b = slice->block_by_id[512 + c->codec_id] = cram_new_block(0, 0);
1372
0
    if (!b)
1373
0
        return -1;
1374
0
    int n = sub_b->uncomp_size * 8/c->u.xpack.nbits;
1375
0
    BLOCK_GROW(b, n);
1376
0
    b->uncomp_size = n;
1377
1378
0
    uint8_t p[256];
1379
0
    int z;
1380
0
    for (z = 0; z < 256; z++)
1381
0
        p[z] = c->u.xpack.rmap[z];
1382
0
    hts_unpack(sub_b->data, sub_b->uncomp_size, b->data, b->uncomp_size,
1383
0
               8 / c->u.xpack.nbits, p);
1384
1385
0
    return 0;
1386
1387
0
 block_err:
1388
0
    return -1;
1389
0
}
1390
1391
0
int cram_xpack_decode_char(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
1392
    // FIXME: we need to ban data-series interleaving in the spec for this to work.
1393
1394
    // Remember this may be called when threaded and multi-slice per container.
1395
    // Hence one cram_codec instance, multiple slices, multiple blocks.
1396
    // We therefore have to cache appropriate block info in slice and not codec.
1397
    //    b = cram_get_block_by_id(slice, c->external.content_id);
1398
0
    if (c->u.xpack.nval > 1) {
1399
0
        cram_xpack_decode_expand_char(slice, c);
1400
0
        cram_block *b = slice->block_by_id[512 + c->codec_id];
1401
0
        if (!b)
1402
0
            return -1;
1403
1404
0
        if (out)
1405
0
            memcpy(out, b->data + b->byte, *out_size);
1406
0
        b->byte += *out_size;
1407
0
    } else {
1408
0
        memset(out, c->u.xpack.rmap[0], *out_size);
1409
0
    }
1410
1411
0
    return 0;
1412
0
}
1413
1414
939
void cram_xpack_decode_free(cram_codec *c) {
1415
939
    if (!c) return;
1416
1417
939
    if (c->u.xpack.sub_codec)
1418
831
        c->u.xpack.sub_codec->free(c->u.xpack.sub_codec);
1419
1420
    //free(slice->block_by_id[512 + c->codec_id]);
1421
    //slice->block_by_id[512 + c->codec_id] = 0;
1422
1423
939
    free(c);
1424
939
}
1425
1426
0
int cram_xpack_decode_size(cram_slice *slice, cram_codec *c) {
1427
0
    cram_xpack_decode_expand_char(slice, c);
1428
0
    return slice->block_by_id[512 + c->codec_id]->uncomp_size;
1429
0
}
1430
1431
0
cram_block *cram_xpack_get_block(cram_slice *slice, cram_codec *c) {
1432
0
    cram_xpack_decode_expand_char(slice, c);
1433
0
    return slice->block_by_id[512 + c->codec_id];
1434
0
}
1435
1436
cram_codec *cram_xpack_decode_init(cram_block_compression_hdr *hdr,
1437
                                   char *data, int size,
1438
                                   enum cram_encoding codec,
1439
                                   enum cram_external_type option,
1440
939
                                   int version, varint_vec *vv) {
1441
939
    cram_codec *c;
1442
939
    char *cp = data;
1443
939
    char *endp = data+size;
1444
1445
939
    if (!(c = calloc(1, sizeof(*c))))
1446
0
        return NULL;
1447
1448
939
    c->codec  = E_XPACK;
1449
939
    if (option == E_LONG)
1450
12
        c->decode = cram_xpack_decode_long;
1451
927
    else if (option == E_INT)
1452
558
        c->decode = cram_xpack_decode_int;
1453
369
    else if (option == E_BYTE_ARRAY || option == E_BYTE)
1454
366
        c->decode = cram_xpack_decode_char;
1455
3
    else {
1456
3
        fprintf(stderr, "BYTE_ARRAYs not supported by this codec\n");
1457
3
        goto malformed;
1458
3
    }
1459
936
    c->free = cram_xpack_decode_free;
1460
936
    c->size = cram_xpack_decode_size;
1461
936
    c->get_block = cram_xpack_get_block;
1462
936
    c->describe = NULL;
1463
1464
936
    c->u.xpack.nbits = vv->varint_get32(&cp, endp, NULL);
1465
936
    c->u.xpack.nval  = vv->varint_get32(&cp, endp, NULL);
1466
936
    if (c->u.xpack.nbits >= 8  || c->u.xpack.nbits < 0 ||
1467
936
        c->u.xpack.nval  > 256 || c->u.xpack.nval < 0)
1468
39
        goto malformed;
1469
897
    int i;
1470
3.95k
    for (i = 0; i < c->u.xpack.nval; i++) {
1471
3.09k
        uint32_t v = vv->varint_get32(&cp, endp, NULL);
1472
3.09k
        if (v >= 256)
1473
33
            goto malformed;
1474
3.06k
        c->u.xpack.rmap[i] = v; // reverse map: e.g 0-3 to P,A,C,K
1475
3.06k
    }
1476
1477
864
    int encoding = vv->varint_get32(&cp, endp, NULL);
1478
864
    int sub_size = vv->varint_get32(&cp, endp, NULL);
1479
864
    if (sub_size < 0 || endp - cp < sub_size)
1480
6
        goto malformed;
1481
858
    c->u.xpack.sub_codec = cram_decoder_init(hdr, encoding, cp, sub_size,
1482
858
                                             option, version, vv);
1483
858
    if (c->u.xpack.sub_codec == NULL)
1484
27
        goto malformed;
1485
831
    cp += sub_size;
1486
1487
831
    if (cp - data != size
1488
831
        || c->u.xpack.nbits < 0 || c->u.xpack.nbits > 8 * sizeof(int64_t)) {
1489
123
    malformed:
1490
123
        fprintf(stderr, "Malformed xpack header stream\n");
1491
123
        cram_xpack_decode_free(c);
1492
123
        return NULL;
1493
15
    }
1494
1495
816
    return c;
1496
831
}
1497
1498
0
int cram_xpack_encode_flush(cram_codec *c) {
1499
    // Pack the buffered up data
1500
0
    int meta_len;
1501
0
    uint64_t out_len;
1502
0
    uint8_t out_meta[1024];
1503
0
    uint8_t *out = hts_pack(BLOCK_DATA(c->out), BLOCK_SIZE(c->out),
1504
0
                            out_meta, &meta_len, &out_len);
1505
1506
    // We now need to pass this through the next layer of transform
1507
0
    if (c->u.e_xpack.sub_codec->encode(NULL, // also indicates flush incoming
1508
0
                                     c->u.e_xpack.sub_codec,
1509
0
                                     (char *)out, out_len))
1510
0
        return -1;
1511
1512
0
    int r = 0;
1513
0
    if (c->u.e_xpack.sub_codec->flush)
1514
0
        r = c->u.e_xpack.sub_codec->flush(c->u.e_xpack.sub_codec);
1515
1516
0
    free(out);
1517
0
    return r;
1518
0
}
1519
1520
int cram_xpack_encode_store(cram_codec *c, cram_block *b,
1521
0
                            char *prefix, int version) {
1522
0
    int len = 0, r = 0, n;
1523
1524
0
    if (prefix) {
1525
0
        size_t l = strlen(prefix);
1526
0
        BLOCK_APPEND(b, prefix, l);
1527
0
        len += l;
1528
0
    }
1529
1530
    // Store sub-codec
1531
0
    cram_codec *tc = c->u.e_xpack.sub_codec;
1532
0
    cram_block *tb = cram_new_block(0, 0);
1533
0
    if (!tb)
1534
0
        return -1;
1535
0
    int len2 = tc->store(tc, tb, NULL, version);
1536
1537
0
    len += (n = c->vv->varint_put32_blk(b, c->codec)); r |= n;
1538
1539
    // codec length
1540
0
    int len1 = 0, i;
1541
0
    for (i = 0; i < c->u.e_xpack.nval; i++)
1542
0
        len1 += (n = c->vv->varint_size(c->u.e_xpack.rmap[i])), r |= n;
1543
0
    len += (n = c->vv->varint_put32_blk(b, c->vv->varint_size(c->u.e_xpack.nbits)
1544
0
                                        +  c->vv->varint_size(c->u.e_xpack.nval)
1545
0
                                        + len1 + len2)); r |= n;
1546
1547
    // The map and sub-codec
1548
0
    len += (n = c->vv->varint_put32_blk(b, c->u.e_xpack.nbits)); r |= n;
1549
0
    len += (n = c->vv->varint_put32_blk(b, c->u.e_xpack.nval));  r |= n;
1550
0
    for (i = 0; i < c->u.e_xpack.nval; i++)
1551
0
        len += (n = c->vv->varint_put32_blk(b, c->u.e_xpack.rmap[i])), r |= n;
1552
1553
0
    BLOCK_APPEND(b, BLOCK_DATA(tb), BLOCK_SIZE(tb));
1554
1555
0
    cram_free_block(tb);
1556
1557
0
    return r > 0 ? len + len2 : -1;
1558
1559
0
 block_err:
1560
0
    return -1;
1561
0
}
1562
1563
// Same as cram_beta_encode_long
1564
int cram_xpack_encode_long(cram_slice *slice, cram_codec *c,
1565
0
                           char *in, int in_size) {
1566
0
    int64_t *syms = (int64_t *)in;
1567
0
    int i, r = 0;
1568
1569
0
    for (i = 0; i < in_size; i++)
1570
0
        r |= store_bits_MSB(c->out, c->u.e_xpack.map[syms[i]], c->u.e_xpack.nbits);
1571
1572
0
    return r;
1573
0
}
1574
1575
int cram_xpack_encode_int(cram_slice *slice, cram_codec *c,
1576
0
                          char *in, int in_size) {
1577
0
    int *syms = (int *)in;
1578
0
    int i, r = 0;
1579
1580
0
    for (i = 0; i < in_size; i++)
1581
0
        r |= store_bits_MSB(c->out, c->u.e_xpack.map[syms[i]], c->u.e_xpack.nbits);
1582
1583
0
    return r;
1584
0
}
1585
1586
int cram_xpack_encode_char(cram_slice *slice, cram_codec *c,
1587
0
                           char *in, int in_size) {
1588
0
    BLOCK_APPEND(c->out, in, in_size);
1589
0
    return 0;
1590
1591
0
 block_err:
1592
0
    return -1;
1593
0
}
1594
1595
0
void cram_xpack_encode_free(cram_codec *c) {
1596
0
    if (!c) return;
1597
1598
0
    if (c->u.e_xpack.sub_codec)
1599
0
        c->u.e_xpack.sub_codec->free(c->u.e_xpack.sub_codec);
1600
1601
0
    cram_free_block(c->out);
1602
1603
0
    free(c);
1604
0
}
1605
1606
cram_codec *cram_xpack_encode_init(cram_stats *st,
1607
                                   enum cram_encoding codec,
1608
                                   enum cram_external_type option,
1609
                                   void *dat,
1610
0
                                   int version, varint_vec *vv) {
1611
0
    cram_codec *c;
1612
1613
0
    if (!(c = malloc(sizeof(*c))))
1614
0
        return NULL;
1615
1616
0
    c->codec  = E_XPACK;
1617
0
    c->free   = cram_xpack_encode_free;
1618
0
    if (option == E_LONG)
1619
0
        c->encode = cram_xpack_encode_long;
1620
0
    else if (option == E_INT)
1621
0
        c->encode = cram_xpack_encode_int;
1622
0
    else
1623
0
        c->encode = cram_xpack_encode_char;
1624
0
    c->store  = cram_xpack_encode_store;
1625
0
    c->flush  = cram_xpack_encode_flush;
1626
1627
0
    cram_xpack_encoder *e = (cram_xpack_encoder *)dat;
1628
0
    c->u.e_xpack.nbits = e->nbits;
1629
0
    c->u.e_xpack.nval = e->nval;
1630
0
    c->u.e_xpack.sub_codec = cram_encoder_init(e->sub_encoding, NULL,
1631
0
                                               E_BYTE_ARRAY, e->sub_codec_dat,
1632
0
                                               version, vv);
1633
1634
    // Initialise fwd and rev maps
1635
0
    memcpy(c->u.e_xpack.map, e->map, sizeof(e->map)); // P,A,C,K to 0,1,2,3
1636
0
    int i, n;
1637
0
    for (i = n = 0; i < 256; i++)
1638
0
        if (e->map[i] != -1)
1639
0
            c->u.e_xpack.rmap[n++] = i;               // 0,1,2,3 to P,A,C,K
1640
0
    if (n != e->nval) {
1641
0
        fprintf(stderr, "Incorrectly specified number of map items in PACK\n");
1642
0
        return NULL;
1643
0
    }
1644
1645
0
    return c;
1646
0
}
1647
1648
/*
1649
 * ---------------------------------------------------------------------------
1650
 * XDELTA: subtract successive values, zig-zag to turn +/- to + only,
1651
 * and then var-int encode the result.
1652
 *
1653
 * This also has the additional requirement that the data series is not
1654
 * interleaved with another, permitting efficient encoding and decoding
1655
 * of all elements enmasse instead of needing to only extract the bits
1656
 * necessary per item.
1657
 */
1658
1659
0
static uint8_t  zigzag8 (int8_t  x) { return (x << 1) ^ (x >>  7); }
1660
0
static uint16_t zigzag16(int16_t x) { return (x << 1) ^ (x >> 15); }
1661
0
static uint32_t zigzag32(int32_t x) { return (x << 1) ^ (x >> 31); }
1662
1663
//static int8_t  unzigzag8 (uint8_t  x) { return (x >> 1) ^ -(x & 1); }
1664
0
static int16_t unzigzag16(uint16_t x) { return (x >> 1) ^ -(x & 1); }
1665
0
static int32_t unzigzag32(uint32_t x) { return (x >> 1) ^ -(x & 1); }
1666
1667
0
int cram_xdelta_decode_long(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
1668
0
    return -1;
1669
0
}
1670
1671
0
int cram_xdelta_decode_int(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
1672
    // Slow value-by-value method for now
1673
0
    uint32_t *out32 = (uint32_t *)out;
1674
0
    int i;
1675
0
    for (i = 0; i < *out_size; i++) {
1676
0
        uint32_t v;
1677
0
        int one = 1;
1678
0
        if (c->u.e_xdelta.sub_codec->decode(slice, c->u.e_xdelta.sub_codec, in,
1679
0
                                          (char *)&v, &one) < 0)
1680
0
            return -1;
1681
0
        uint32_t d = unzigzag32(v);
1682
0
        c->u.xdelta.last = out32[i] = d + c->u.xdelta.last;
1683
0
    }
1684
1685
0
    return 0;
1686
0
}
1687
1688
0
static int cram_xdelta_decode_expand_char(cram_slice *slice, cram_codec *c) {
1689
0
    return -1;
1690
0
}
1691
1692
0
int cram_xdelta_decode_char(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
1693
0
    return -1;
1694
0
}
1695
1696
0
static inline int16_t le_int2(int16_t i) {
1697
0
    int16_t s;
1698
0
    i16_to_le(i, (uint8_t *)&s);
1699
0
    return s;
1700
0
}
1701
1702
int cram_xdelta_decode_block(cram_slice *slice, cram_codec *c, cram_block *in,
1703
0
                             char *out_, int *out_size) {
1704
0
    cram_block *out = (cram_block *)out_;
1705
0
    cram_block *b = c->u.e_xdelta.sub_codec->get_block(slice, c->u.e_xdelta.sub_codec);
1706
0
    int i = 0;
1707
1708
0
    const int w = c->u.xdelta.word_size;
1709
0
    uint32_t npad = (w - *out_size%w)%w;
1710
0
    uint32_t out_sz = *out_size + npad;
1711
0
    c->u.xdelta.last = 0;  // reset for each new array
1712
1713
0
    for (i = 0; i < out_sz; i += w) {
1714
0
        uint16_t v;
1715
        // Need better interface
1716
0
        char *cp = (char *)b->data + b->byte;
1717
0
        char *cp_end = (char *)b->data + b->uncomp_size;
1718
0
        int err = 0;
1719
0
        v = c->vv->varint_get32(&cp, cp_end, &err);
1720
0
        if (err)
1721
0
            return -1;
1722
0
        b->byte = cp - (char *)b->data;
1723
1724
0
        switch(w) {
1725
0
        case 2: {
1726
0
            int16_t d = unzigzag16(v), z;
1727
0
            c->u.xdelta.last = d + c->u.xdelta.last;
1728
0
            z = le_int2(c->u.xdelta.last);
1729
0
            BLOCK_APPEND(out, &z, 2-npad);
1730
0
            npad = 0;
1731
0
            break;
1732
0
        }
1733
0
        default:
1734
0
            fprintf(stderr, "Unsupported word size by XDELTA\n");
1735
0
            return -1;
1736
0
        }
1737
0
    }
1738
1739
0
    return 0;
1740
1741
0
 block_err:
1742
0
    return -1;
1743
0
}
1744
1745
141
void cram_xdelta_decode_free(cram_codec *c) {
1746
141
    if (!c) return;
1747
1748
141
    if (c->u.xdelta.sub_codec)
1749
78
        c->u.xdelta.sub_codec->free(c->u.xdelta.sub_codec);
1750
1751
141
    free(c);
1752
141
}
1753
1754
0
int cram_xdelta_decode_size(cram_slice *slice, cram_codec *c) {
1755
0
    cram_xdelta_decode_expand_char(slice, c);
1756
0
    return slice->block_by_id[512 + c->codec_id]->uncomp_size;
1757
0
}
1758
1759
0
cram_block *cram_xdelta_get_block(cram_slice *slice, cram_codec *c) {
1760
0
    cram_xdelta_decode_expand_char(slice, c);
1761
0
    return slice->block_by_id[512 + c->codec_id];
1762
0
}
1763
1764
cram_codec *cram_xdelta_decode_init(cram_block_compression_hdr *hdr,
1765
                                    char *data, int size,
1766
                                    enum cram_encoding codec,
1767
                                    enum cram_external_type option,
1768
141
                                    int version, varint_vec *vv) {
1769
141
    cram_codec *c;
1770
141
    char *cp = data;
1771
141
    char *endp = data+size;
1772
1773
141
    if (!(c = calloc(1, sizeof(*c))))
1774
0
        return NULL;
1775
1776
141
    c->codec  = E_XDELTA;
1777
141
    if (option == E_LONG)
1778
3
        c->decode = cram_xdelta_decode_long;
1779
138
    else if (option == E_INT)
1780
36
        c->decode = cram_xdelta_decode_int;
1781
102
    else if (option == E_BYTE_ARRAY || option == E_BYTE)
1782
60
        c->decode = cram_xdelta_decode_char;
1783
42
    else if (option == E_BYTE_ARRAY_BLOCK) {
1784
42
        option = E_BYTE_ARRAY;
1785
42
        c->decode = cram_xdelta_decode_block;
1786
42
    } else {
1787
0
        free(c);
1788
0
        return NULL;
1789
0
    }
1790
141
    c->free = cram_xdelta_decode_free;
1791
141
    c->size = cram_xdelta_decode_size;
1792
141
    c->get_block = cram_xdelta_get_block;
1793
141
    c->describe = NULL;
1794
1795
141
    c->u.xdelta.word_size = vv->varint_get32(&cp, endp, NULL);
1796
141
    c->u.xdelta.last = 0;
1797
1798
141
    int encoding = vv->varint_get32(&cp, endp, NULL);
1799
141
    int sub_size = vv->varint_get32(&cp, endp, NULL);
1800
141
    if (sub_size < 0 || endp - cp < sub_size)
1801
24
        goto malformed;
1802
117
    c->u.xdelta.sub_codec = cram_decoder_init(hdr, encoding, cp, sub_size,
1803
117
                                              option, version, vv);
1804
117
    if (c->u.xdelta.sub_codec == NULL)
1805
39
        goto malformed;
1806
78
    cp += sub_size;
1807
1808
78
    if (cp - data != size) {
1809
81
    malformed:
1810
81
        fprintf(stderr, "Malformed xdelta header stream\n");
1811
81
        cram_xdelta_decode_free(c);
1812
81
        return NULL;
1813
18
    }
1814
1815
60
    return c;
1816
78
}
1817
1818
0
int cram_xdelta_encode_flush(cram_codec *c) {
1819
0
    int r = -1;
1820
0
    cram_block *b = cram_new_block(0, 0);
1821
0
    if (!b)
1822
0
        return -1;
1823
1824
0
    switch (c->u.e_xdelta.word_size) {
1825
0
    case 2: {
1826
        // Delta + zigzag transform.
1827
        // Subtracting two 8-bit values has a 9-bit result (-255 to 255).
1828
        // However think of it as turning a wheel clockwise or anti-clockwise.
1829
        // If it has 256 gradations then a -ve rotation followed by a +ve
1830
        // rotation of the same amount reverses it regardless.
1831
        //
1832
        // Similarly the zig-zag transformation doesn't invent any extra bits,
1833
        // so the entire thing can be done in-situ.  This may permit faster
1834
        // SIMD loops if we break apart the steps.
1835
1836
        // uint16_t last = 0, d;
1837
        // for (i = 0; i < n; i++) {
1838
        //     d = io[i] - last;
1839
        //     last = io[i];
1840
        //     io[i] = zigzag16(vd);
1841
        // }
1842
1843
        // --- vs ---
1844
1845
        // for (i = n-1; i >= 1; i--)
1846
        //     io[i] -= io[i-1];
1847
        // for (i = 0; i < n; i++)
1848
        //     io[i] = zigzag16(io[i]);
1849
1850
        // varint: need array variant for speed here.
1851
        // With zig-zag
1852
0
        int i, n = BLOCK_SIZE(c->out)/2;;
1853
0
        uint16_t *dat = (uint16_t *)BLOCK_DATA(c->out), last = 0;
1854
1855
0
        if (n*2 < BLOCK_SIZE(c->out)) {
1856
            // half word
1857
0
            last = *(uint8_t *)dat;
1858
0
            c->vv->varint_put32_blk(b, zigzag16(last));
1859
0
            dat = (uint16_t *)(((uint8_t *)dat)+1);
1860
0
        }
1861
1862
0
        for (i = 0; i < n; i++) {
1863
0
            uint16_t d = dat[i] - last; // possibly unaligned
1864
0
            last = dat[i];
1865
0
            c->vv->varint_put32_blk(b, zigzag16(d));
1866
0
        }
1867
1868
0
        break;
1869
0
    }
1870
1871
0
    case 4: {
1872
0
        int i, n = BLOCK_SIZE(c->out)/4;;
1873
0
        uint32_t *dat = (uint32_t *)BLOCK_DATA(c->out), last = 0;
1874
1875
0
        for (i = 0; i < n; i++) {
1876
0
            uint32_t d = dat[i] - last;
1877
0
            last = dat[i];
1878
0
            c->vv->varint_put32_blk(b, zigzag32(d));
1879
0
        }
1880
1881
0
        break;
1882
0
    }
1883
1884
0
    case 1: {
1885
0
        int i, n = BLOCK_SIZE(c->out);;
1886
0
        uint8_t *dat = (uint8_t *)BLOCK_DATA(c->out), last = 0;
1887
1888
0
        for (i = 0; i < n; i++) {
1889
0
            uint32_t d = dat[i] - last;
1890
0
            last = dat[i];
1891
0
            c->vv->varint_put32_blk(b, zigzag8(d));
1892
0
        }
1893
1894
0
        break;
1895
0
    }
1896
1897
0
    default:
1898
0
        goto err;
1899
0
    }
1900
1901
0
    if (c->u.e_xdelta.sub_codec->encode(NULL, c->u.e_xdelta.sub_codec,
1902
0
                                      (char *)b->data, b->byte))
1903
0
        goto err;
1904
1905
0
    r = 0;
1906
1907
0
 err:
1908
0
    cram_free_block(b);
1909
0
    return r;
1910
1911
0
}
1912
1913
int cram_xdelta_encode_store(cram_codec *c, cram_block *b,
1914
0
                            char *prefix, int version) {
1915
0
    int len = 0, r = 0, n;
1916
1917
0
    if (prefix) {
1918
0
        size_t l = strlen(prefix);
1919
0
        BLOCK_APPEND(b, prefix, l);
1920
0
        len += l;
1921
0
    }
1922
1923
    // Store sub-codec
1924
0
    cram_codec *tc = c->u.e_xdelta.sub_codec;
1925
0
    cram_block *tb = cram_new_block(0, 0);
1926
0
    if (!tb)
1927
0
        return -1;
1928
0
    int len2 = tc->store(tc, tb, NULL, version);
1929
1930
0
    len += (n = c->vv->varint_put32_blk(b, c->codec)); r |= n;
1931
1932
    // codec length
1933
0
    len += (n = c->vv->varint_put32_blk(b, c->vv->varint_size(c->u.e_xdelta.word_size)
1934
0
                                        + len2)); r |= n;
1935
1936
    // This and sub-codec
1937
0
    len += (n = c->vv->varint_put32_blk(b, c->u.e_xdelta.word_size)); r |= n;
1938
0
    BLOCK_APPEND(b, BLOCK_DATA(tb), BLOCK_SIZE(tb));
1939
1940
0
    cram_free_block(tb);
1941
1942
0
    return r > 0 ? len + len2 : -1;
1943
1944
0
 block_err:
1945
0
    return -1;
1946
0
}
1947
1948
// Same as cram_beta_encode_long
1949
int cram_xdelta_encode_long(cram_slice *slice, cram_codec *c,
1950
0
                           char *in, int in_size) {
1951
0
    return -1;
1952
0
}
1953
1954
int cram_xdelta_encode_int(cram_slice *slice, cram_codec *c,
1955
0
                          char *in, int in_size) {
1956
0
    return -1;
1957
0
}
1958
1959
int cram_xdelta_encode_char(cram_slice *slice, cram_codec *c,
1960
0
                            char *in, int in_size) {
1961
0
    char *dat = malloc(in_size*5);
1962
0
    if (!dat)
1963
0
        return -1;
1964
0
    char *cp = dat, *cp_end = dat + in_size*5;
1965
1966
0
    c->u.e_xdelta.last = 0; // reset for each new array
1967
0
    if (c->u.e_xdelta.word_size == 2) {
1968
0
        int i, part;
1969
1970
0
        part = in_size%2;
1971
0
        if (part) {
1972
0
            uint16_t z = in[0];
1973
0
            c->u.e_xdelta.last = le_int2(z);
1974
0
            cp += c->vv->varint_put32(cp, cp_end, zigzag16(c->u.e_xdelta.last));
1975
0
        }
1976
1977
0
        uint16_t *in16 = (uint16_t *)(in+part);
1978
0
        for (i = 0; i < in_size/2; i++) {
1979
0
            uint16_t d = le_int2(in16[i]) - c->u.e_xdelta.last;
1980
0
            c->u.e_xdelta.last = le_int2(in16[i]);
1981
0
            cp += c->vv->varint_put32(cp, cp_end, zigzag16(d));
1982
0
        }
1983
0
    }
1984
0
    if (c->u.e_xdelta.sub_codec->encode(slice, c->u.e_xdelta.sub_codec,
1985
0
                                      (char *)dat, cp-dat)) {
1986
0
        free(dat);
1987
0
        return -1;
1988
0
    }
1989
1990
0
    free(dat);
1991
0
    return 0;
1992
0
}
1993
1994
0
void cram_xdelta_encode_free(cram_codec *c) {
1995
0
    if (!c) return;
1996
1997
0
    if (c->u.e_xdelta.sub_codec)
1998
0
        c->u.e_xdelta.sub_codec->free(c->u.e_xdelta.sub_codec);
1999
2000
0
    cram_free_block(c->out);
2001
2002
0
    free(c);
2003
0
}
2004
2005
cram_codec *cram_xdelta_encode_init(cram_stats *st,
2006
                                    enum cram_encoding codec,
2007
                                    enum cram_external_type option,
2008
                                    void *dat,
2009
0
                                    int version, varint_vec *vv) {
2010
0
    cram_codec *c;
2011
2012
0
    if (!(c = malloc(sizeof(*c))))
2013
0
        return NULL;
2014
2015
0
    c->codec  = E_XDELTA;
2016
0
    c->free   = cram_xdelta_encode_free;
2017
0
    if (option == E_LONG)
2018
0
        c->encode = cram_xdelta_encode_long;
2019
0
    else if (option == E_INT)
2020
0
        c->encode = cram_xdelta_encode_int;
2021
0
    else
2022
0
        c->encode = cram_xdelta_encode_char;
2023
0
    c->store  = cram_xdelta_encode_store;
2024
0
    c->flush  = cram_xdelta_encode_flush;
2025
2026
0
    cram_xdelta_encoder *e = (cram_xdelta_encoder *)dat;
2027
0
    c->u.e_xdelta.word_size = e->word_size;
2028
0
    c->u.e_xdelta.last = 0;
2029
0
    c->u.e_xdelta.sub_codec = cram_encoder_init(e->sub_encoding, NULL,
2030
0
                                                E_BYTE_ARRAY,
2031
0
                                                e->sub_codec_dat,
2032
0
                                                version, vv);
2033
2034
0
    return c;
2035
0
}
2036
2037
/*
2038
 * ---------------------------------------------------------------------------
2039
 * XRLE
2040
 *
2041
 * This also has the additional requirement that the data series is not
2042
 * interleaved with another, permitting efficient encoding and decoding
2043
 * of all elements enmasse instead of needing to only extract the bits
2044
 * necessary per item.
2045
 */
2046
0
int cram_xrle_decode_long(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
2047
    // TODO if and when needed
2048
0
    return -1;
2049
0
}
2050
2051
0
int cram_xrle_decode_int(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
2052
    // TODO if and when needed
2053
0
    return -1;
2054
0
}
2055
2056
// Expands an XRLE transform and caches result in slice->block_by_id[]
2057
0
static int cram_xrle_decode_expand_char(cram_slice *slice, cram_codec *c) {
2058
0
    cram_block *b = slice->block_by_id[512 + c->codec_id];
2059
0
    if (b)
2060
0
        return 0;
2061
2062
0
    b = slice->block_by_id[512 + c->codec_id] = cram_new_block(0, 0);
2063
0
    if (!b)
2064
0
        return -1;
2065
0
    cram_block *lit_b = c->u.xrle.lit_codec->get_block(slice, c->u.xrle.lit_codec);
2066
0
    if (!lit_b)
2067
0
        return -1;
2068
0
    unsigned char *lit_dat = lit_b->data;
2069
0
    unsigned int lit_sz = lit_b->uncomp_size;
2070
0
    unsigned int len_sz = c->u.xrle.len_codec->size(slice, c->u.xrle.len_codec);
2071
2072
0
    cram_block *len_b = c->u.xrle.len_codec->get_block(slice, c->u.xrle.len_codec);
2073
0
    if (!len_b)
2074
0
        return -1;
2075
0
    unsigned char *len_dat = len_b->data;
2076
2077
0
    uint8_t rle_syms[256];
2078
0
    int rle_nsyms = 0;
2079
0
    int i;
2080
0
    for (i = 0; i < 256; i++) {
2081
0
        if (c->u.xrle.rep_score[i] > 0)
2082
0
            rle_syms[rle_nsyms++] = i;
2083
0
    }
2084
2085
0
    uint64_t out_sz;
2086
0
    int nb = var_get_u64(len_dat, len_dat+len_sz, &out_sz);
2087
0
    if (!(b->data = malloc(out_sz)))
2088
0
        return -1;
2089
0
    hts_rle_decode(lit_dat, lit_sz,
2090
0
                   len_dat+nb, len_sz-nb,
2091
0
                   rle_syms, rle_nsyms,
2092
0
                   b->data, &out_sz);
2093
0
    b->uncomp_size = out_sz;
2094
2095
0
    return 0;
2096
0
}
2097
2098
0
int cram_xrle_decode_size(cram_slice *slice, cram_codec *c) {
2099
0
    cram_xrle_decode_expand_char(slice, c);
2100
0
    return slice->block_by_id[512 + c->codec_id]->uncomp_size;
2101
0
}
2102
2103
0
cram_block *cram_xrle_get_block(cram_slice *slice, cram_codec *c) {
2104
0
    cram_xrle_decode_expand_char(slice, c);
2105
0
    return slice->block_by_id[512 + c->codec_id];
2106
0
}
2107
2108
0
int cram_xrle_decode_char(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
2109
0
    int n = *out_size;
2110
2111
0
    cram_xrle_decode_expand_char(slice, c);
2112
0
    cram_block *b = slice->block_by_id[512 + c->codec_id];
2113
2114
0
    memcpy(out, b->data + b->idx, n);
2115
0
    b->idx += n;
2116
0
    return 0;
2117
2118
    // Old code when not cached
2119
0
    while (n > 0) {
2120
0
        if (c->u.xrle.cur_len == 0) {
2121
0
            unsigned char lit;
2122
0
            int one = 1;
2123
0
            if (c->u.xrle.lit_codec->decode(slice, c->u.xrle.lit_codec, in,
2124
0
                                          (char *)&lit, &one) < 0)
2125
0
                return -1;
2126
0
            c->u.xrle.cur_lit = lit;
2127
2128
0
            if (c->u.xrle.rep_score[lit] > 0) {
2129
0
                if (c->u.xrle.len_codec->decode(slice, c->u.xrle.len_codec, in,
2130
0
                                              (char *)&c->u.xrle.cur_len, &one) < 0)
2131
0
                    return -1;
2132
0
            } // else cur_len still zero
2133
            //else fprintf(stderr, "%d\n", lit);
2134
2135
0
            c->u.xrle.cur_len++;
2136
0
        }
2137
2138
0
        if (n >= c->u.xrle.cur_len) {
2139
0
            memset(out, c->u.xrle.cur_lit, c->u.xrle.cur_len);
2140
0
            out += c->u.xrle.cur_len;
2141
0
            n -= c->u.xrle.cur_len;
2142
0
            c->u.xrle.cur_len = 0;
2143
0
        } else {
2144
0
            memset(out, c->u.xrle.cur_lit, n);
2145
0
            out += n;
2146
0
            c->u.xrle.cur_len -= n;
2147
0
            n = 0;
2148
0
        }
2149
0
    }
2150
2151
0
    return 0;
2152
0
}
2153
2154
186
void cram_xrle_decode_free(cram_codec *c) {
2155
186
    if (!c) return;
2156
2157
186
    if (c->u.xrle.len_codec)
2158
48
        c->u.xrle.len_codec->free(c->u.xrle.len_codec);
2159
2160
186
    if (c->u.xrle.lit_codec)
2161
27
        c->u.xrle.lit_codec->free(c->u.xrle.lit_codec);
2162
2163
186
    free(c);
2164
186
}
2165
2166
cram_codec *cram_xrle_decode_init(cram_block_compression_hdr *hdr,
2167
                                  char *data, int size,
2168
                                  enum cram_encoding codec,
2169
                                  enum cram_external_type option,
2170
195
                                  int version, varint_vec *vv) {
2171
195
    cram_codec *c;
2172
195
    char *cp = data;
2173
195
    char *endp = data+size;
2174
195
    int err = 0;
2175
2176
195
    if (!(c = calloc(1, sizeof(*c))))
2177
0
        return NULL;
2178
2179
195
    c->codec  = E_XRLE;
2180
195
    if (option == E_LONG)
2181
6
        c->decode = cram_xrle_decode_long;
2182
189
    else if (option == E_INT)
2183
84
        c->decode = cram_xrle_decode_int;
2184
105
    else if (option == E_BYTE_ARRAY || option == E_BYTE)
2185
96
        c->decode = cram_xrle_decode_char;
2186
9
    else {
2187
9
        fprintf(stderr, "BYTE_ARRAYs not supported by this codec\n");
2188
9
        free(c);
2189
9
        return NULL;
2190
9
    }
2191
186
    c->free   = cram_xrle_decode_free;
2192
186
    c->size   = cram_xrle_decode_size;
2193
186
    c->get_block = cram_xrle_get_block;
2194
186
    c->describe = NULL;
2195
186
    c->u.xrle.cur_len = 0;
2196
186
    c->u.xrle.cur_lit = -1;
2197
2198
    // RLE map
2199
186
    int i, j, nrle = vv->varint_get32(&cp, endp, &err);
2200
186
    memset(c->u.xrle.rep_score, 0, 256*sizeof(*c->u.xrle.rep_score));
2201
5.08k
    for (i = 0; i < nrle && i < 256; i++) {
2202
4.89k
        j = vv->varint_get32(&cp, endp, &err);
2203
4.89k
        if (j >= 0 && j < 256)
2204
4.52k
            c->u.xrle.rep_score[j] = 1;
2205
4.89k
    }
2206
2207
    // Length and literal sub encodings
2208
186
    c->u.xrle.len_encoding = vv->varint_get32(&cp, endp, &err);
2209
186
    int sub_size = vv->varint_get32(&cp, endp, &err);
2210
186
    if (sub_size < 0 || endp - cp < sub_size)
2211
30
        goto malformed;
2212
156
    c->u.xrle.len_codec = cram_decoder_init(hdr, c->u.xrle.len_encoding,
2213
156
                                            cp, sub_size, E_INT, version, vv);
2214
156
    if (c->u.xrle.len_codec == NULL)
2215
108
        goto malformed;
2216
48
    cp += sub_size;
2217
2218
48
    c->u.xrle.lit_encoding = vv->varint_get32(&cp, endp, &err);
2219
48
    sub_size = vv->varint_get32(&cp, endp, &err);
2220
48
    if (sub_size < 0 || endp - cp < sub_size)
2221
12
        goto malformed;
2222
36
    c->u.xrle.lit_codec = cram_decoder_init(hdr, c->u.xrle.lit_encoding,
2223
36
                                            cp, sub_size, option, version, vv);
2224
36
    if (c->u.xrle.lit_codec == NULL)
2225
9
        goto malformed;
2226
27
    cp += sub_size;
2227
2228
27
    if (err)
2229
0
        goto malformed;
2230
2231
27
    return c;
2232
2233
159
 malformed:
2234
159
    fprintf(stderr, "Malformed xrle header stream\n");
2235
159
    cram_xrle_decode_free(c);
2236
159
    return NULL;
2237
27
}
2238
2239
0
int cram_xrle_encode_flush(cram_codec *c) {
2240
0
    uint8_t *out_lit, *out_len;
2241
0
    uint64_t out_lit_size, out_len_size;
2242
0
    uint8_t rle_syms[256];
2243
0
    int rle_nsyms = 0, i;
2244
2245
0
    for (i = 0; i < 256; i++)
2246
0
        if (c->u.e_xrle.rep_score[i] > 0)
2247
0
            rle_syms[rle_nsyms++] = i;
2248
2249
0
    if (!c->u.e_xrle.to_flush) {
2250
0
        c->u.e_xrle.to_flush = (char *)BLOCK_DATA(c->out);
2251
0
        c->u.e_xrle.to_flush_size = BLOCK_SIZE(c->out);
2252
0
    }
2253
2254
0
    out_len = malloc(c->u.e_xrle.to_flush_size+8);
2255
0
    if (!out_len)
2256
0
        return -1;
2257
2258
0
    int nb = var_put_u64(out_len, NULL, c->u.e_xrle.to_flush_size);
2259
2260
0
    out_lit = hts_rle_encode((uint8_t *)c->u.e_xrle.to_flush, c->u.e_xrle.to_flush_size,
2261
0
                             out_len+nb, &out_len_size,
2262
0
                             rle_syms, &rle_nsyms,
2263
0
                             NULL, &out_lit_size);
2264
0
    out_len_size += nb;
2265
2266
2267
    // TODO: can maybe "gift" the sub codec the data block, to remove
2268
    // one level of memcpy.
2269
0
    if (c->u.e_xrle.len_codec->encode(NULL,
2270
0
                                      c->u.e_xrle.len_codec,
2271
0
                                      (char *)out_len, out_len_size))
2272
0
        return -1;
2273
2274
0
    if (c->u.e_xrle.lit_codec->encode(NULL,
2275
0
                                      c->u.e_xrle.lit_codec,
2276
0
                                      (char *)out_lit, out_lit_size))
2277
0
        return -1;
2278
2279
0
    free(out_len);
2280
0
    free(out_lit);
2281
2282
0
    return 0;
2283
0
}
2284
2285
int cram_xrle_encode_store(cram_codec *c, cram_block *b,
2286
0
                            char *prefix, int version) {
2287
0
    int len = 0, r = 0, n;
2288
0
    cram_codec *tc;
2289
0
    cram_block *b_rle, *b_len, *b_lit;
2290
2291
0
    if (prefix) {
2292
0
        size_t l = strlen(prefix);
2293
0
        BLOCK_APPEND(b, prefix, l);
2294
0
        len += l;
2295
0
    }
2296
2297
    // List of symbols to RLE
2298
0
    b_rle = cram_new_block(0, 0);
2299
0
    if (!b_rle)
2300
0
        return -1;
2301
0
    int i, nrle = 0, len1 = 0;
2302
0
    for (i = 0; i < 256; i++) {
2303
0
        if (c->u.e_xrle.rep_score[i] > 0) {
2304
0
            nrle++;
2305
0
            len1 += (n = c->vv->varint_put32_blk(b_rle,i)); r |= n;
2306
0
        }
2307
0
    }
2308
2309
    // Store length and literal sub-codecs to get encoded length
2310
0
    tc = c->u.e_xrle.len_codec;
2311
0
    b_len = cram_new_block(0, 0);
2312
0
    if (!b_len)
2313
0
        return -1;
2314
0
    int len2 = tc->store(tc, b_len, NULL, version);
2315
2316
0
    tc = c->u.e_xrle.lit_codec;
2317
0
    b_lit = cram_new_block(0, 0);
2318
0
    if (!b_lit)
2319
0
        return -1;
2320
0
    int len3 = tc->store(tc, b_lit, NULL, version);
2321
2322
0
    len += (n = c->vv->varint_put32_blk(b, c->codec)); r |= n;
2323
0
    len += (n = c->vv->varint_put32_blk(b, len1 + len2 + len3
2324
0
                                        + c->vv->varint_size(nrle))); r |= n;
2325
0
    len += (n = c->vv->varint_put32_blk(b, nrle)); r |= n;
2326
0
    BLOCK_APPEND(b, BLOCK_DATA(b_rle), BLOCK_SIZE(b_rle));
2327
0
    BLOCK_APPEND(b, BLOCK_DATA(b_len), BLOCK_SIZE(b_len));
2328
0
    BLOCK_APPEND(b, BLOCK_DATA(b_lit), BLOCK_SIZE(b_lit));
2329
2330
0
    cram_free_block(b_rle);
2331
0
    cram_free_block(b_len);
2332
0
    cram_free_block(b_lit);
2333
2334
0
    if (r > 0)
2335
0
        return len + len1 + len2 + len3;
2336
2337
0
 block_err:
2338
0
    return -1;
2339
0
}
2340
2341
int cram_xrle_encode_long(cram_slice *slice, cram_codec *c,
2342
0
                           char *in, int in_size) {
2343
    // TODO if and when needed
2344
0
    return -1;
2345
0
}
2346
2347
int cram_xrle_encode_int(cram_slice *slice, cram_codec *c,
2348
0
                          char *in, int in_size) {
2349
    // TODO if and when needed
2350
0
    return -1;
2351
0
}
2352
2353
int cram_xrle_encode_char(cram_slice *slice, cram_codec *c,
2354
0
                          char *in, int in_size) {
2355
0
    if (c->u.e_xrle.to_flush) {
2356
0
        if (!c->out && !(c->out = cram_new_block(0, 0)))
2357
0
            return -1;
2358
0
        BLOCK_APPEND(c->out, c->u.e_xrle.to_flush, c->u.e_xrle.to_flush_size);
2359
0
        c->u.e_xrle.to_flush = NULL;
2360
0
        c->u.e_xrle.to_flush_size = 0;
2361
0
    }
2362
2363
0
    if (c->out && BLOCK_SIZE(c->out) > 0) {
2364
        // Gathering data
2365
0
        BLOCK_APPEND(c->out, in, in_size);
2366
0
        return 0;
2367
0
    }
2368
2369
    // else cache copy of the data we're about to send to flush instead.
2370
0
    c->u.e_xrle.to_flush = in;
2371
0
    c->u.e_xrle.to_flush_size = in_size;
2372
0
    return 0;
2373
2374
0
 block_err:
2375
0
    return -1;
2376
0
}
2377
2378
0
void cram_xrle_encode_free(cram_codec *c) {
2379
0
    if (!c) return;
2380
2381
0
    if (c->u.e_xrle.len_codec)
2382
0
        c->u.e_xrle.len_codec->free(c->u.e_xrle.len_codec);
2383
0
    if (c->u.e_xrle.lit_codec)
2384
0
        c->u.e_xrle.lit_codec->free(c->u.e_xrle.lit_codec);
2385
2386
0
    cram_free_block(c->out);
2387
2388
0
    free(c);
2389
0
}
2390
2391
cram_codec *cram_xrle_encode_init(cram_stats *st,
2392
                                  enum cram_encoding codec,
2393
                                  enum cram_external_type option,
2394
                                  void *dat,
2395
0
                                  int version, varint_vec *vv) {
2396
0
    cram_codec *c;
2397
2398
0
    if (!(c = malloc(sizeof(*c))))
2399
0
        return NULL;
2400
2401
0
    c->codec  = E_XRLE;
2402
0
    c->free   = cram_xrle_encode_free;
2403
0
    if (option == E_LONG)
2404
0
        c->encode = cram_xrle_encode_long;
2405
0
    else if (option == E_INT)
2406
0
        c->encode = cram_xrle_encode_int;
2407
0
    else
2408
0
        c->encode = cram_xrle_encode_char;
2409
0
    c->store  = cram_xrle_encode_store;
2410
0
    c->flush  = cram_xrle_encode_flush;
2411
2412
0
    cram_xrle_encoder *e = (cram_xrle_encoder *)dat;
2413
2414
0
    c->u.e_xrle.len_codec = cram_encoder_init(e->len_encoding, NULL,
2415
0
                                              E_BYTE, e->len_dat,
2416
0
                                              version, vv);
2417
0
    c->u.e_xrle.lit_codec = cram_encoder_init(e->lit_encoding, NULL,
2418
0
                                              E_BYTE, e->lit_dat,
2419
0
                                              version, vv);
2420
0
    c->u.e_xrle.cur_lit = -1;
2421
0
    c->u.e_xrle.cur_len = -1;
2422
0
    c->u.e_xrle.to_flush = NULL;
2423
0
    c->u.e_xrle.to_flush_size = 0;
2424
2425
0
    memcpy(c->u.e_xrle.rep_score, e->rep_score, 256*sizeof(*c->u.e_xrle.rep_score));
2426
2427
0
    return c;
2428
0
}
2429
2430
/*
2431
 * ---------------------------------------------------------------------------
2432
 * SUBEXP
2433
 */
2434
0
int cram_subexp_decode(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
2435
0
    int32_t *out_i = (int32_t *)out;
2436
0
    int n, count;
2437
0
    int k = c->u.subexp.k;
2438
2439
0
    for (count = 0, n = *out_size; count < n; count++) {
2440
0
        int i = 0, tail;
2441
0
        int val;
2442
2443
        /* Get number of 1s */
2444
        //while (get_bit_MSB(in) == 1) i++;
2445
0
        i = get_one_bits_MSB(in);
2446
0
        if (i < 0 || cram_not_enough_bits(in, i > 0 ? i + k - 1 : k))
2447
0
            return -1;
2448
        /*
2449
         * Val is
2450
         * i > 0:  2^(k+i-1) + k+i-1 bits
2451
         * i = 0:  k bits
2452
         */
2453
0
        if (i) {
2454
0
            tail = i + k-1;
2455
0
            val = 0;
2456
0
            while (tail) {
2457
                //val = val<<1; val |= get_bit_MSB(in);
2458
0
                GET_BIT_MSB(in, val);
2459
0
                tail--;
2460
0
            }
2461
0
            val += 1 << (i + k-1);
2462
0
        } else {
2463
0
            tail = k;
2464
0
            val = 0;
2465
0
            while (tail) {
2466
                //val = val<<1; val |= get_bit_MSB(in);
2467
0
                GET_BIT_MSB(in, val);
2468
0
                tail--;
2469
0
            }
2470
0
        }
2471
2472
0
        out_i[count] = val - c->u.subexp.offset;
2473
0
    }
2474
2475
0
    return 0;
2476
0
}
2477
2478
2.02k
void cram_subexp_decode_free(cram_codec *c) {
2479
2.02k
    if (c)
2480
2.02k
        free(c);
2481
2.02k
}
2482
2483
0
int cram_subexp_describe(cram_codec *c, kstring_t *ks) {
2484
0
    return ksprintf(ks, "SUBEXP(offset=%d,k=%d)",
2485
0
                    c->u.subexp.offset,
2486
0
                    c->u.subexp.k)
2487
0
        < 0 ? -1 : 0;
2488
0
}
2489
2490
cram_codec *cram_subexp_decode_init(cram_block_compression_hdr *hdr,
2491
                                    char *data, int size,
2492
                                    enum cram_encoding codec,
2493
                                    enum cram_external_type option,
2494
2.03k
                                    int version, varint_vec *vv) {
2495
2.03k
    cram_codec *c;
2496
2.03k
    char *cp = data;
2497
2498
2.03k
    if (option != E_INT) {
2499
3
        hts_log_error("This codec only supports INT encodings");
2500
3
        return NULL;
2501
3
    }
2502
2503
2.03k
    if (!(c = malloc(sizeof(*c))))
2504
0
        return NULL;
2505
2506
2.03k
    c->codec  = E_SUBEXP;
2507
2.03k
    c->decode = cram_subexp_decode;
2508
2.03k
    c->free   = cram_subexp_decode_free;
2509
2.03k
    c->describe = cram_subexp_describe;
2510
2.03k
    c->u.subexp.k = -1;
2511
2512
2.03k
    c->u.subexp.offset = vv->varint_get32(&cp, data + size, NULL);
2513
2.03k
    c->u.subexp.k      = vv->varint_get32(&cp, data + size, NULL);
2514
2515
2.03k
    if (cp - data != size || c->u.subexp.k < 0) {
2516
6
        hts_log_error("Malformed subexp header stream");
2517
6
        free(c);
2518
6
        return NULL;
2519
6
    }
2520
2521
2.02k
    return c;
2522
2.03k
}
2523
2524
/*
2525
 * ---------------------------------------------------------------------------
2526
 * GAMMA
2527
 */
2528
0
int cram_gamma_decode(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
2529
0
    int32_t *out_i = (int32_t *)out;
2530
0
    int i, n;
2531
2532
0
    for (i = 0, n = *out_size; i < n; i++) {
2533
0
        int nz = 0;
2534
0
        int val;
2535
        //while (get_bit_MSB(in) == 0) nz++;
2536
0
        nz = get_zero_bits_MSB(in);
2537
0
        if (cram_not_enough_bits(in, nz))
2538
0
            return -1;
2539
0
        val = 1;
2540
0
        while (nz > 0) {
2541
            //val <<= 1; val |= get_bit_MSB(in);
2542
0
            GET_BIT_MSB(in, val);
2543
0
            nz--;
2544
0
        }
2545
2546
0
        out_i[i] = val - c->u.gamma.offset;
2547
0
    }
2548
2549
0
    return 0;
2550
0
}
2551
2552
3.19k
void cram_gamma_decode_free(cram_codec *c) {
2553
3.19k
    if (c)
2554
3.19k
        free(c);
2555
3.19k
}
2556
2557
0
int cram_gamma_describe(cram_codec *c, kstring_t *ks) {
2558
0
    return ksprintf(ks, "GAMMA(offset=%d)", c->u.subexp.offset)
2559
0
        < 0 ? -1 : 0;
2560
0
}
2561
2562
cram_codec *cram_gamma_decode_init(cram_block_compression_hdr *hdr,
2563
                                   char *data, int size,
2564
                                   enum cram_encoding codec,
2565
                                   enum cram_external_type option,
2566
3.20k
                                   int version, varint_vec *vv) {
2567
3.20k
    cram_codec *c = NULL;
2568
3.20k
    char *cp = data;
2569
2570
3.20k
    if (option != E_INT) {
2571
3
        hts_log_error("This codec only supports INT encodings");
2572
3
        return NULL;
2573
3
    }
2574
2575
3.19k
    if (size < 1)
2576
3
        goto malformed;
2577
2578
3.19k
    if (!(c = malloc(sizeof(*c))))
2579
0
        return NULL;
2580
2581
3.19k
    c->codec  = E_GAMMA;
2582
3.19k
    c->decode = cram_gamma_decode;
2583
3.19k
    c->free   = cram_gamma_decode_free;
2584
3.19k
    c->describe = cram_gamma_describe;
2585
2586
3.19k
    c->u.gamma.offset = vv->varint_get32(&cp, data+size, NULL);
2587
2588
3.19k
    if (cp - data != size)
2589
3
        goto malformed;
2590
2591
3.19k
    return c;
2592
2593
6
 malformed:
2594
6
    hts_log_error("Malformed gamma header stream");
2595
6
    free(c);
2596
6
    return NULL;
2597
3.19k
}
2598
2599
/*
2600
 * ---------------------------------------------------------------------------
2601
 * HUFFMAN
2602
 */
2603
2604
3.18k
static int code_sort(const void *vp1, const void *vp2) {
2605
3.18k
    const cram_huffman_code *c1 = (const cram_huffman_code *)vp1;
2606
3.18k
    const cram_huffman_code *c2 = (const cram_huffman_code *)vp2;
2607
2608
3.18k
    if (c1->len != c2->len)
2609
642
        return c1->len - c2->len;
2610
2.53k
    else
2611
2.53k
        return c1->symbol < c2->symbol ? -1 : (c1->symbol > c2->symbol ? 1 : 0);
2612
3.18k
}
2613
2614
768
void cram_huffman_decode_free(cram_codec *c) {
2615
768
    if (!c)
2616
0
        return;
2617
2618
768
    if (c->u.huffman.codes)
2619
435
        free(c->u.huffman.codes);
2620
768
    free(c);
2621
768
}
2622
2623
int cram_huffman_decode_null(cram_slice *slice, cram_codec *c,
2624
0
                             cram_block *in, char *out, int *out_size) {
2625
0
    return -1;
2626
0
}
2627
2628
int cram_huffman_decode_char0(cram_slice *slice, cram_codec *c,
2629
0
                              cram_block *in, char *out, int *out_size) {
2630
0
    int i, n;
2631
2632
0
    if (!out)
2633
0
        return 0;
2634
2635
    /* Special case of 0 length codes */
2636
0
    for (i = 0, n = *out_size; i < n; i++) {
2637
0
        out[i] = c->u.huffman.codes[0].symbol;
2638
0
    }
2639
0
    return 0;
2640
0
}
2641
2642
int cram_huffman_decode_char(cram_slice *slice, cram_codec *c,
2643
0
                             cram_block *in, char *out, int *out_size) {
2644
0
    int i, n, ncodes = c->u.huffman.ncodes;
2645
0
    const cram_huffman_code * const codes = c->u.huffman.codes;
2646
2647
0
    for (i = 0, n = *out_size; i < n; i++) {
2648
0
        int idx = 0;
2649
0
        int val = 0, len = 0, last_len = 0;
2650
2651
0
        for (;;) {
2652
0
            int dlen = codes[idx].len - last_len;
2653
0
            if (cram_not_enough_bits(in, dlen))
2654
0
                return -1;
2655
2656
            //val <<= dlen;
2657
            //val  |= get_bits_MSB(in, dlen);
2658
            //last_len = (len += dlen);
2659
2660
0
            last_len = (len += dlen);
2661
0
            for (; dlen; dlen--) GET_BIT_MSB(in, val);
2662
2663
0
            idx = val - codes[idx].p;
2664
0
            if (idx >= ncodes || idx < 0)
2665
0
                return -1;
2666
2667
0
            if (codes[idx].code == val && codes[idx].len == len) {
2668
0
                if (out) out[i] = codes[idx].symbol;
2669
0
                break;
2670
0
            }
2671
0
        }
2672
0
    }
2673
2674
0
    return 0;
2675
0
}
2676
2677
int cram_huffman_decode_int0(cram_slice *slice, cram_codec *c,
2678
0
                             cram_block *in, char *out, int *out_size) {
2679
0
    int32_t *out_i = (int32_t *)out;
2680
0
    int i, n;
2681
0
    const cram_huffman_code * const codes = c->u.huffman.codes;
2682
2683
    /* Special case of 0 length codes */
2684
0
    for (i = 0, n = *out_size; i < n; i++) {
2685
0
        out_i[i] = codes[0].symbol;
2686
0
    }
2687
0
    return 0;
2688
0
}
2689
2690
int cram_huffman_decode_int(cram_slice *slice, cram_codec *c,
2691
0
                            cram_block *in, char *out, int *out_size) {
2692
0
    int32_t *out_i = (int32_t *)out;
2693
0
    int i, n, ncodes = c->u.huffman.ncodes;
2694
0
    const cram_huffman_code * const codes = c->u.huffman.codes;
2695
2696
0
    for (i = 0, n = *out_size; i < n; i++) {
2697
0
        int idx = 0;
2698
0
        int val = 0, len = 0, last_len = 0;
2699
2700
        // Now one bit at a time for remaining checks
2701
0
        for (;;) {
2702
0
            int dlen = codes[idx].len - last_len;
2703
0
            if (cram_not_enough_bits(in, dlen))
2704
0
                return -1;
2705
2706
            //val <<= dlen;
2707
            //val  |= get_bits_MSB(in, dlen);
2708
            //last_len = (len += dlen);
2709
2710
0
            last_len = (len += dlen);
2711
0
            for (; dlen; dlen--) GET_BIT_MSB(in, val);
2712
2713
0
            idx = val - codes[idx].p;
2714
0
            if (idx >= ncodes || idx < 0)
2715
0
                return -1;
2716
2717
0
            if (codes[idx].code == val && codes[idx].len == len) {
2718
0
                out_i[i] = codes[idx].symbol;
2719
0
                break;
2720
0
            }
2721
0
        }
2722
0
    }
2723
2724
0
    return 0;
2725
0
}
2726
2727
int cram_huffman_decode_long0(cram_slice *slice, cram_codec *c,
2728
0
                              cram_block *in, char *out, int *out_size) {
2729
0
    int64_t *out_i = (int64_t *)out;
2730
0
    int i, n;
2731
0
    const cram_huffman_code * const codes = c->u.huffman.codes;
2732
2733
    /* Special case of 0 length codes */
2734
0
    for (i = 0, n = *out_size; i < n; i++) {
2735
0
        out_i[i] = codes[0].symbol;
2736
0
    }
2737
0
    return 0;
2738
0
}
2739
2740
int cram_huffman_decode_long(cram_slice *slice, cram_codec *c,
2741
0
                             cram_block *in, char *out, int *out_size) {
2742
0
    int64_t *out_i = (int64_t *)out;
2743
0
    int i, n, ncodes = c->u.huffman.ncodes;
2744
0
    const cram_huffman_code * const codes = c->u.huffman.codes;
2745
2746
0
    for (i = 0, n = *out_size; i < n; i++) {
2747
0
        int idx = 0;
2748
0
        int val = 0, len = 0, last_len = 0;
2749
2750
        // Now one bit at a time for remaining checks
2751
0
        for (;;) {
2752
0
            int dlen = codes[idx].len - last_len;
2753
0
            if (cram_not_enough_bits(in, dlen))
2754
0
                return -1;
2755
2756
            //val <<= dlen;
2757
            //val  |= get_bits_MSB(in, dlen);
2758
            //last_len = (len += dlen);
2759
2760
0
            last_len = (len += dlen);
2761
0
            for (; dlen; dlen--) GET_BIT_MSB(in, val);
2762
2763
0
            idx = val - codes[idx].p;
2764
0
            if (idx >= ncodes || idx < 0)
2765
0
                return -1;
2766
2767
0
            if (codes[idx].code == val && codes[idx].len == len) {
2768
0
                out_i[i] = codes[idx].symbol;
2769
0
                break;
2770
0
            }
2771
0
        }
2772
0
    }
2773
2774
0
    return 0;
2775
0
}
2776
2777
0
int cram_huffman_describe(cram_codec *c, kstring_t *ks) {
2778
0
    int r = 0, n;
2779
0
    r |= ksprintf(ks, "HUFFMAN(codes={") < 0;
2780
0
    for (n = 0; n < c->u.huffman.ncodes; n++) {
2781
0
        r |= ksprintf(ks, "%s%"PRId64, n?",":"",
2782
0
                      c->u.huffman.codes[n].symbol);
2783
0
    }
2784
0
    r |= ksprintf(ks, "},lengths={") < 0;
2785
0
    for (n = 0; n < c->u.huffman.ncodes; n++) {
2786
0
        r |= ksprintf(ks, "%s%d", n?",":"",
2787
0
                      c->u.huffman.codes[n].len);
2788
0
    }
2789
0
    r |= ksprintf(ks, "})") < 0;
2790
0
    return r;
2791
0
}
2792
2793
/*
2794
 * Initialises a huffman decoder from an encoding data stream.
2795
 */
2796
cram_codec *cram_huffman_decode_init(cram_block_compression_hdr *hdr,
2797
                                     char *data, int size,
2798
                                     enum cram_encoding codec,
2799
                                     enum cram_external_type option,
2800
1.00k
                                     int version, varint_vec *vv) {
2801
1.00k
    int32_t ncodes = 0, i, j;
2802
1.00k
    char *cp = data, *data_end = &data[size];
2803
1.00k
    cram_codec *h;
2804
1.00k
    cram_huffman_code *codes = NULL;
2805
1.00k
    int32_t val, last_len, max_len = 0;
2806
1.00k
    uint32_t max_val; // needs one more bit than val
2807
1.00k
    const int max_code_bits = sizeof(val) * 8 - 1;
2808
1.00k
    int err = 0;
2809
2810
1.00k
    if (option == E_BYTE_ARRAY_BLOCK) {
2811
3
        hts_log_error("BYTE_ARRAYs not supported by this codec");
2812
3
        return NULL;
2813
3
    }
2814
2815
1.00k
    ncodes = vv->varint_get32(&cp, data_end, &err);
2816
1.00k
    if (ncodes < 0) {
2817
3
        hts_log_error("Invalid number of symbols in huffman stream");
2818
3
        return NULL;
2819
3
    }
2820
999
    if (ncodes >= SIZE_MAX / sizeof(*codes)) {
2821
0
        errno = ENOMEM;
2822
0
        return NULL;
2823
0
    }
2824
999
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
2825
999
    if (ncodes > FUZZ_ALLOC_LIMIT / sizeof(*codes)) {
2826
6
        errno = ENOMEM;
2827
6
        return NULL;
2828
6
    }
2829
993
#endif
2830
993
    h = calloc(1, sizeof(*h));
2831
993
    if (!h)
2832
0
        return NULL;
2833
2834
993
    h->codec  = E_HUFFMAN;
2835
993
    h->free   = cram_huffman_decode_free;
2836
2837
993
    h->u.huffman.ncodes = ncodes;
2838
993
    h->u.huffman.option = option;
2839
993
    if (ncodes) {
2840
651
        codes = h->u.huffman.codes = malloc(ncodes * sizeof(*codes));
2841
651
        if (!codes) {
2842
0
            free(h);
2843
0
            return NULL;
2844
0
        }
2845
651
    } else {
2846
342
        codes = h->u.huffman.codes = NULL;
2847
342
    }
2848
2849
    /* Read symbols and bit-lengths */
2850
993
    if (option == E_LONG) {
2851
6.35M
        for (i = 0; i < ncodes; i++)
2852
6.35M
            codes[i].symbol = vv->varint_get64(&cp, data_end, &err);
2853
930
    } else if (option == E_INT || option == E_BYTE) {
2854
5.42k
        for (i = 0; i < ncodes; i++)
2855
4.50k
            codes[i].symbol = vv->varint_get32(&cp, data_end, &err);
2856
918
    } else {
2857
12
        goto malformed;
2858
12
    }
2859
2860
981
    if (err)
2861
87
        goto malformed;
2862
2863
894
    i = vv->varint_get32(&cp, data_end, &err);
2864
894
    if (i != ncodes)
2865
15
        goto malformed;
2866
2867
879
    if (ncodes == 0) {
2868
        /* NULL huffman stream.  Ensure it returns an error if
2869
           anything tries to use it. */
2870
333
        h->decode = cram_huffman_decode_null;
2871
333
        return h;
2872
333
    }
2873
2874
3.34k
    for (i = 0; i < ncodes; i++) {
2875
2.85k
        codes[i].len = vv->varint_get32(&cp, data_end, &err);
2876
2.85k
        if (err)
2877
33
            break;
2878
2.82k
        if (codes[i].len < 0) {
2879
21
            hts_log_error("Huffman code length (%d) is negative", codes[i].len);
2880
21
            goto malformed;
2881
21
        }
2882
2.79k
        if (max_len < codes[i].len)
2883
498
            max_len = codes[i].len;
2884
2.79k
    }
2885
525
    if (err || cp - data != size || max_len >= ncodes)
2886
57
        goto malformed;
2887
2888
    /* 31 is max. bits available in val */
2889
468
    if (max_len > max_code_bits) {
2890
3
        hts_log_error("Huffman code length (%d) is greater "
2891
3
                      "than maximum supported (%d)", max_len, max_code_bits);
2892
3
        goto malformed;
2893
3
    }
2894
2895
    /* Sort by bit length and then by symbol value */
2896
465
    qsort(codes, ncodes, sizeof(*codes), code_sort);
2897
2898
    /* Assign canonical codes */
2899
465
    val = -1, last_len = 0, max_val = 0;
2900
1.52k
    for (i = 0; i < ncodes; i++) {
2901
1.08k
        val++;
2902
1.08k
        if (val > max_val)
2903
30
            goto malformed;
2904
2905
1.05k
        if (codes[i].len > last_len) {
2906
291
            val <<= (codes[i].len - last_len);
2907
291
            last_len = codes[i].len;
2908
291
            max_val = (1U << codes[i].len) - 1;
2909
291
        }
2910
1.05k
        codes[i].code = val;
2911
1.05k
    }
2912
2913
    /*
2914
     * Compute the next starting point, offset by the i'th value.
2915
     * For example if codes 10, 11, 12, 13 are 30, 31, 32, 33 then
2916
     * codes[10..13].p = 30 - 10.
2917
     */
2918
435
    last_len = 0;
2919
1.46k
    for (i = j = 0; i < ncodes; i++) {
2920
1.02k
        if (codes[i].len > last_len) {
2921
291
            j = codes[i].code - i;
2922
291
            last_len = codes[i].len;
2923
291
        }
2924
1.02k
        codes[i].p = j;
2925
1.02k
    }
2926
2927
    // puts("==HUFF LEN==");
2928
    // for (i = 0; i <= last_len+1; i++) {
2929
    //     printf("len %d=%d prefix %d\n", i, h->u.huffman.lengths[i], h->u.huffman.prefix[i]);
2930
    // }
2931
    // puts("===HUFFMAN CODES===");
2932
    // for (i = 0; i < ncodes; i++) {
2933
    //     int j;
2934
    //     printf("%d: %d %d %d ", i, codes[i].symbol, codes[i].len, codes[i].code);
2935
    //     j = codes[i].len;
2936
    //     while (j) {
2937
    //         putchar(codes[i].code & (1 << --j) ? '1' : '0');
2938
    //     }
2939
    //     printf(" %d\n", codes[i].code);
2940
    // }
2941
2942
435
    if (option == E_BYTE || option == E_BYTE_ARRAY) {
2943
249
        if (h->u.huffman.codes[0].len == 0)
2944
120
            h->decode = cram_huffman_decode_char0;
2945
129
        else
2946
129
            h->decode = cram_huffman_decode_char;
2947
249
    } else if (option == E_LONG || option == E_SLONG) {
2948
0
        if (h->u.huffman.codes[0].len == 0)
2949
0
            h->decode = cram_huffman_decode_long0;
2950
0
        else
2951
0
            h->decode = cram_huffman_decode_long;
2952
186
    } else if (option == E_INT || option == E_SINT || option == E_BYTE) {
2953
186
        if (h->u.huffman.codes[0].len == 0)
2954
57
            h->decode = cram_huffman_decode_int0;
2955
129
        else
2956
129
            h->decode = cram_huffman_decode_int;
2957
186
    } else {
2958
0
        return NULL;
2959
0
    }
2960
435
    h->describe = cram_huffman_describe;
2961
2962
435
    return (cram_codec *)h;
2963
2964
225
 malformed:
2965
225
    hts_log_error("Malformed huffman header stream");
2966
225
    free(codes);
2967
225
    free(h);
2968
225
    return NULL;
2969
435
}
2970
2971
int cram_huffman_encode_char0(cram_slice *slice, cram_codec *c,
2972
77.2k
                              char *in, int in_size) {
2973
77.2k
    return 0;
2974
77.2k
}
2975
2976
int cram_huffman_encode_char(cram_slice *slice, cram_codec *c,
2977
0
                             char *in, int in_size) {
2978
0
    int i, code, len, r = 0;
2979
0
    unsigned char *syms = (unsigned char *)in;
2980
2981
0
    while (in_size--) {
2982
0
        int sym = *syms++;
2983
0
        if (sym >= -1 && sym < MAX_HUFF) {
2984
0
            i = c->u.e_huffman.val2code[sym+1];
2985
0
            assert(c->u.e_huffman.codes[i].symbol == sym);
2986
0
            code = c->u.e_huffman.codes[i].code;
2987
0
            len  = c->u.e_huffman.codes[i].len;
2988
0
        } else {
2989
            /* Slow - use a lookup table for when sym < MAX_HUFF? */
2990
0
            for (i = 0; i < c->u.e_huffman.nvals; i++) {
2991
0
                if (c->u.e_huffman.codes[i].symbol == sym)
2992
0
                    break;
2993
0
            }
2994
0
            if (i == c->u.e_huffman.nvals)
2995
0
                return -1;
2996
2997
0
            code = c->u.e_huffman.codes[i].code;
2998
0
            len  = c->u.e_huffman.codes[i].len;
2999
0
        }
3000
3001
0
        r |= store_bits_MSB(c->out, code, len);
3002
0
    }
3003
3004
0
    return r;
3005
0
}
3006
3007
int cram_huffman_encode_int0(cram_slice *slice, cram_codec *c,
3008
55.7M
                             char *in, int in_size) {
3009
55.7M
    return 0;
3010
55.7M
}
3011
3012
int cram_huffman_encode_int(cram_slice *slice, cram_codec *c,
3013
0
                            char *in, int in_size) {
3014
0
    int i, code, len, r = 0;
3015
0
    int *syms = (int *)in;
3016
3017
0
    while (in_size--) {
3018
0
        int sym = *syms++;
3019
3020
0
        if (sym >= -1 && sym < MAX_HUFF) {
3021
0
            i = c->u.e_huffman.val2code[sym+1];
3022
0
            assert(c->u.e_huffman.codes[i].symbol == sym);
3023
0
            code = c->u.e_huffman.codes[i].code;
3024
0
            len  = c->u.e_huffman.codes[i].len;
3025
0
        } else {
3026
            /* Slow - use a lookup table for when sym < MAX_HUFFMAN_SYM? */
3027
0
            for (i = 0; i < c->u.e_huffman.nvals; i++) {
3028
0
                if (c->u.e_huffman.codes[i].symbol == sym)
3029
0
                    break;
3030
0
            }
3031
0
            if (i == c->u.e_huffman.nvals)
3032
0
                return -1;
3033
3034
0
            code = c->u.e_huffman.codes[i].code;
3035
0
            len  = c->u.e_huffman.codes[i].len;
3036
0
        }
3037
3038
0
        r |= store_bits_MSB(c->out, code, len);
3039
0
    }
3040
3041
0
    return r;
3042
0
}
3043
3044
int cram_huffman_encode_long0(cram_slice *slice, cram_codec *c,
3045
0
                              char *in, int in_size) {
3046
0
    return 0;
3047
0
}
3048
3049
int cram_huffman_encode_long(cram_slice *slice, cram_codec *c,
3050
0
                             char *in, int in_size) {
3051
0
    int i, code, len, r = 0;
3052
0
    int64_t *syms = (int64_t *)in;
3053
3054
0
    while (in_size--) {
3055
0
        int sym = *syms++;
3056
3057
0
        if (sym >= -1 && sym < MAX_HUFF) {
3058
0
            i = c->u.e_huffman.val2code[sym+1];
3059
0
            assert(c->u.e_huffman.codes[i].symbol == sym);
3060
0
            code = c->u.e_huffman.codes[i].code;
3061
0
            len  = c->u.e_huffman.codes[i].len;
3062
0
        } else {
3063
            /* Slow - use a lookup table for when sym < MAX_HUFFMAN_SYM? */
3064
0
            for (i = 0; i < c->u.e_huffman.nvals; i++) {
3065
0
                if (c->u.e_huffman.codes[i].symbol == sym)
3066
0
                    break;
3067
0
            }
3068
0
            if (i == c->u.e_huffman.nvals)
3069
0
                return -1;
3070
3071
0
            code = c->u.e_huffman.codes[i].code;
3072
0
            len  = c->u.e_huffman.codes[i].len;
3073
0
        }
3074
3075
0
        r |= store_bits_MSB(c->out, code, len);
3076
0
    }
3077
3078
0
    return r;
3079
0
}
3080
3081
922k
void cram_huffman_encode_free(cram_codec *c) {
3082
922k
    if (!c)
3083
0
        return;
3084
3085
922k
    if (c->u.e_huffman.codes)
3086
922k
        free(c->u.e_huffman.codes);
3087
922k
    free(c);
3088
922k
}
3089
3090
/*
3091
 * Encodes a huffman tree.
3092
 * Returns number of bytes written.
3093
 */
3094
int cram_huffman_encode_store(cram_codec *c, cram_block *b, char *prefix,
3095
920k
                              int version) {
3096
920k
    int i, len = 0, r = 0, n;
3097
920k
    cram_huffman_code *codes = c->u.e_huffman.codes;
3098
    /*
3099
     * Up to code length 127 means 2.5e+26 bytes of data required (worst
3100
     * case huffman tree needs symbols with freqs matching the Fibonacci
3101
     * series). So guaranteed 1 byte per code.
3102
     *
3103
     * Symbols themselves could be 5 bytes (eg -1 is 5 bytes in itf8).
3104
     *
3105
     * Therefore 6*ncodes + 5 + 5 + 1 + 5 is max memory
3106
     */
3107
920k
    char *tmp = malloc(6*c->u.e_huffman.nvals+16);
3108
920k
    char *tp = tmp, *tpend = tmp+6*c->u.e_huffman.nvals+16;
3109
3110
920k
    if (!tmp)
3111
0
        return -1;
3112
3113
920k
    if (prefix) {
3114
807k
        size_t l = strlen(prefix);
3115
807k
        BLOCK_APPEND(b, prefix, l);
3116
807k
        len += l;
3117
807k
    }
3118
3119
920k
    tp += c->vv->varint_put32(tp, tpend, c->u.e_huffman.nvals);
3120
920k
    if (c->u.e_huffman.option == E_LONG) {
3121
0
        for (i = 0; i < c->u.e_huffman.nvals; i++) {
3122
0
            tp += c->vv->varint_put64(tp, tpend, codes[i].symbol);
3123
0
        }
3124
920k
    } else if (c->u.e_huffman.option == E_SLONG) {
3125
0
        for (i = 0; i < c->u.e_huffman.nvals; i++) {
3126
0
            tp += c->vv->varint_put64s(tp, tpend, codes[i].symbol);
3127
0
        }
3128
920k
    } else if (c->u.e_huffman.option == E_INT || c->u.e_huffman.option == E_BYTE) {
3129
1.84M
        for (i = 0; i < c->u.e_huffman.nvals; i++) {
3130
920k
            tp += c->vv->varint_put32(tp, tpend, codes[i].symbol);
3131
920k
        }
3132
920k
    } else if (c->u.e_huffman.option == E_SINT) {
3133
0
        for (i = 0; i < c->u.e_huffman.nvals; i++) {
3134
0
            tp += c->vv->varint_put32s(tp, tpend, codes[i].symbol);
3135
0
        }
3136
0
    } else {
3137
0
        return -1;
3138
0
    }
3139
3140
920k
    tp += c->vv->varint_put32(tp, tpend, c->u.e_huffman.nvals);
3141
1.84M
    for (i = 0; i < c->u.e_huffman.nvals; i++)
3142
920k
        tp += c->vv->varint_put32(tp, tpend, codes[i].len);
3143
3144
920k
    len += (n = c->vv->varint_put32_blk(b, c->codec)); r |= n;
3145
920k
    len += (n = c->vv->varint_put32_blk(b, tp-tmp));   r |= n;
3146
920k
    BLOCK_APPEND(b, tmp, tp-tmp);
3147
920k
    len += tp-tmp;
3148
3149
920k
    free(tmp);
3150
3151
920k
    if (r > 0)
3152
920k
        return len;
3153
3154
0
 block_err:
3155
0
    return -1;
3156
920k
}
3157
3158
cram_codec *cram_huffman_encode_init(cram_stats *st,
3159
                                     enum cram_encoding codec,
3160
                                     enum cram_external_type option,
3161
                                     void *dat,
3162
922k
                                     int version, varint_vec *vv) {
3163
922k
    int *vals = NULL, *freqs = NULL, *lens = NULL, code, len;
3164
922k
    int *new_vals, *new_freqs;
3165
922k
    int i, max_val = 0, min_val = INT_MAX, k;
3166
922k
    size_t nvals, vals_alloc = 0;
3167
922k
    cram_codec *c;
3168
922k
    cram_huffman_code *codes;
3169
3170
922k
    c = malloc(sizeof(*c));
3171
922k
    if (!c)
3172
0
        return NULL;
3173
922k
    c->codec = E_HUFFMAN;
3174
3175
    /* Count number of unique symbols */
3176
945M
    for (nvals = i = 0; i < MAX_STAT_VAL; i++) {
3177
944M
        if (!st->freqs[i])
3178
944M
            continue;
3179
762k
        if (nvals >= vals_alloc) {
3180
762k
            vals_alloc = vals_alloc ? vals_alloc*2 : 1024;
3181
762k
            new_vals  = realloc(vals,  vals_alloc * sizeof(int));
3182
762k
            if (!new_vals) goto nomem;
3183
762k
            vals = new_vals;
3184
762k
            new_freqs = realloc(freqs, vals_alloc * sizeof(int));
3185
762k
            if (!new_freqs) goto nomem;
3186
762k
            freqs = new_freqs;
3187
762k
        }
3188
762k
        vals[nvals] = i;
3189
762k
        freqs[nvals] = st->freqs[i];
3190
762k
        assert(st->freqs[i] > 0);
3191
762k
        if (max_val < i) max_val = i;
3192
762k
        if (min_val > i) min_val = i;
3193
762k
        nvals++;
3194
762k
    }
3195
922k
    if (st->h) {
3196
160k
        khint_t k;
3197
3198
801k
        for (k = kh_begin(st->h); k != kh_end(st->h); k++) {
3199
641k
            if (!kh_exist(st->h, k))
3200
481k
                continue;
3201
160k
            if (nvals >= vals_alloc) {
3202
160k
                vals_alloc = vals_alloc ? vals_alloc*2 : 1024;
3203
160k
                new_vals  = realloc(vals,  vals_alloc * sizeof(int));
3204
160k
                if (!new_vals) goto nomem;
3205
160k
                vals = new_vals;
3206
160k
                new_freqs = realloc(freqs, vals_alloc * sizeof(int));
3207
160k
                if (!new_freqs) goto nomem;
3208
160k
                freqs = new_freqs;
3209
160k
            }
3210
160k
            vals[nvals]= kh_key(st->h, k);
3211
160k
            freqs[nvals] = kh_val(st->h, k);
3212
160k
            assert(freqs[nvals] > 0);
3213
160k
            if (max_val < i) max_val = i;
3214
160k
            if (min_val > i) min_val = i;
3215
160k
            nvals++;
3216
160k
        }
3217
160k
    }
3218
3219
922k
    assert(nvals > 0);
3220
3221
922k
    new_freqs = realloc(freqs, 2*nvals*sizeof(*freqs));
3222
922k
    if (!new_freqs) goto nomem;
3223
922k
    freqs = new_freqs;
3224
922k
    lens = calloc(2*nvals, sizeof(*lens));
3225
922k
    if (!lens) goto nomem;
3226
3227
    /* Inefficient, use pointers to form chain so we can insert and maintain
3228
     * a sorted list? This is currently O(nvals^2) complexity.
3229
     */
3230
922k
    for (;;) {
3231
922k
        int low1 = INT_MAX, low2 = INT_MAX;
3232
922k
        int ind1 = 0, ind2 = 0;
3233
1.84M
        for (i = 0; i < nvals; i++) {
3234
922k
            if (freqs[i] < 0)
3235
0
                continue;
3236
922k
            if (low1 > freqs[i])
3237
922k
                low2 = low1, ind2 = ind1, low1 = freqs[i], ind1 = i;
3238
0
            else if (low2 > freqs[i])
3239
0
                low2 = freqs[i], ind2 = i;
3240
922k
        }
3241
922k
        if (low2 == INT_MAX)
3242
922k
            break;
3243
3244
0
        freqs[nvals] = low1 + low2;
3245
0
        lens[ind1] = nvals;
3246
0
        lens[ind2] = nvals;
3247
0
        freqs[ind1] *= -1;
3248
0
        freqs[ind2] *= -1;
3249
0
        nvals++;
3250
0
    }
3251
922k
    nvals = nvals/2+1;
3252
3253
    /* Assign lengths */
3254
1.84M
    for (i = 0; i < nvals; i++) {
3255
922k
        int code_len = 0;
3256
922k
        for (k = lens[i]; k; k = lens[k])
3257
0
            code_len++;
3258
922k
        lens[i] = code_len;
3259
922k
        freqs[i] *= -1;
3260
        //fprintf(stderr, "%d / %d => %d\n", vals[i], freqs[i], lens[i]);
3261
922k
    }
3262
3263
3264
    /* Sort, need in a struct */
3265
922k
    if (!(codes = malloc(nvals * sizeof(*codes))))
3266
0
        goto nomem;
3267
1.84M
    for (i = 0; i < nvals; i++) {
3268
922k
        codes[i].symbol = vals[i];
3269
922k
        codes[i].len = lens[i];
3270
922k
    }
3271
922k
    qsort(codes, nvals, sizeof(*codes), code_sort);
3272
3273
    /*
3274
     * Generate canonical codes from lengths.
3275
     * Sort by length.
3276
     * Start with 0.
3277
     * Every new code of same length is +1.
3278
     * Every new code of new length is +1 then <<1 per extra length.
3279
     *
3280
     * /\
3281
     * a/\
3282
     * /\/\
3283
     * bcd/\
3284
     *    ef
3285
     *
3286
     * a 1  0
3287
     * b 3  4 (0+1)<<2
3288
     * c 3  5
3289
     * d 3  6
3290
     * e 4  14  (6+1)<<1
3291
     * f 5  15
3292
     */
3293
922k
    code = 0; len = codes[0].len;
3294
1.84M
    for (i = 0; i < nvals; i++) {
3295
922k
        while (len != codes[i].len) {
3296
0
            code<<=1;
3297
0
            len++;
3298
0
        }
3299
922k
        codes[i].code = code++;
3300
3301
922k
        if (codes[i].symbol >= -1 && codes[i].symbol < MAX_HUFF)
3302
916k
            c->u.e_huffman.val2code[codes[i].symbol+1] = i;
3303
3304
        //fprintf(stderr, "sym %d, code %d, len %d\n",
3305
        //      codes[i].symbol, codes[i].code, codes[i].len);
3306
922k
    }
3307
3308
922k
    free(lens);
3309
922k
    free(vals);
3310
922k
    free(freqs);
3311
3312
922k
    c->u.e_huffman.codes = codes;
3313
922k
    c->u.e_huffman.nvals = nvals;
3314
922k
    c->u.e_huffman.option = option;
3315
3316
922k
    c->free = cram_huffman_encode_free;
3317
922k
    if (option == E_BYTE || option == E_BYTE_ARRAY) {
3318
26.4k
        if (c->u.e_huffman.codes[0].len == 0)
3319
26.4k
            c->encode = cram_huffman_encode_char0;
3320
0
        else
3321
0
            c->encode = cram_huffman_encode_char;
3322
896k
    } else if (option == E_INT || option == E_SINT) {
3323
896k
        if (c->u.e_huffman.codes[0].len == 0)
3324
896k
            c->encode = cram_huffman_encode_int0;
3325
0
        else
3326
0
            c->encode = cram_huffman_encode_int;
3327
896k
    } else if (option == E_LONG || option == E_SLONG) {
3328
0
        if (c->u.e_huffman.codes[0].len == 0)
3329
0
            c->encode = cram_huffman_encode_long0;
3330
0
        else
3331
0
            c->encode = cram_huffman_encode_long;
3332
0
    } else {
3333
0
        return NULL;
3334
0
    }
3335
922k
    c->store = cram_huffman_encode_store;
3336
922k
    c->flush = NULL;
3337
3338
922k
    return c;
3339
3340
0
 nomem:
3341
0
    hts_log_error("Out of memory");
3342
0
    free(vals);
3343
0
    free(freqs);
3344
0
    free(lens);
3345
0
    free(c);
3346
0
    return NULL;
3347
922k
}
3348
3349
/*
3350
 * ---------------------------------------------------------------------------
3351
 * BYTE_ARRAY_LEN
3352
 */
3353
int cram_byte_array_len_decode(cram_slice *slice, cram_codec *c,
3354
                               cram_block *in, char *out,
3355
0
                               int *out_size) {
3356
    /* Fetch length */
3357
0
    int32_t len = 0, one = 1;
3358
0
    int r;
3359
3360
0
    r = c->u.byte_array_len.len_codec->decode(slice, c->u.byte_array_len.len_codec,
3361
0
                                              in, (char *)&len, &one);
3362
    //printf("ByteArray Len=%d\n", len);
3363
3364
0
    if (!r && c->u.byte_array_len.val_codec && len >= 0) {
3365
0
        r = c->u.byte_array_len.val_codec->decode(slice,
3366
0
                                                  c->u.byte_array_len.val_codec,
3367
0
                                                  in, out, &len);
3368
0
    } else {
3369
0
        return -1;
3370
0
    }
3371
3372
0
    *out_size = len;
3373
3374
0
    return r;
3375
0
}
3376
3377
2.32k
void cram_byte_array_len_decode_free(cram_codec *c) {
3378
2.32k
    if (!c) return;
3379
3380
2.32k
    if (c->u.byte_array_len.len_codec)
3381
2.27k
        c->u.byte_array_len.len_codec->free(c->u.byte_array_len.len_codec);
3382
3383
2.32k
    if (c->u.byte_array_len.val_codec)
3384
2.26k
        c->u.byte_array_len.val_codec->free(c->u.byte_array_len.val_codec);
3385
3386
2.32k
    free(c);
3387
2.32k
}
3388
3389
0
int cram_byte_array_len_describe(cram_codec *c, kstring_t *ks) {
3390
0
    int r = 0;
3391
0
    r |= ksprintf(ks, "BYTE_ARRAY_LEN(len_codec={") < 0;
3392
0
    cram_byte_array_len_decoder *l = &c->u.byte_array_len;
3393
0
    r |=  l->len_codec->describe
3394
0
        ? l->len_codec->describe(l->len_codec, ks)
3395
0
        : (ksprintf(ks, "?")<0);
3396
0
    r |= ksprintf(ks, "},val_codec={") < 0;
3397
0
    r |=  l->val_codec->describe
3398
0
        ? l->val_codec->describe(l->val_codec, ks)
3399
0
        : (ksprintf(ks, "?")<0);
3400
0
    r |= ksprintf(ks, "}") < 0;
3401
3402
0
    return r;
3403
0
}
3404
3405
cram_codec *cram_byte_array_len_decode_init(cram_block_compression_hdr *hdr,
3406
                                            char *data, int size,
3407
                                            enum cram_encoding codec,
3408
                                            enum cram_external_type option,
3409
2.32k
                                            int version, varint_vec *vv) {
3410
2.32k
    cram_codec *c;
3411
2.32k
    char *cp   = data;
3412
2.32k
    char *endp = data + size;
3413
3414
2.32k
    if (!(c = malloc(sizeof(*c))))
3415
0
        return NULL;
3416
3417
2.32k
    c->codec  = E_BYTE_ARRAY_LEN;
3418
2.32k
    c->decode = cram_byte_array_len_decode;
3419
2.32k
    c->free   = cram_byte_array_len_decode_free;
3420
2.32k
    c->describe = cram_byte_array_len_describe;
3421
2.32k
    c->u.byte_array_len.len_codec = NULL;
3422
2.32k
    c->u.byte_array_len.val_codec = NULL;
3423
3424
2.32k
    int encoding = vv->varint_get32(&cp, endp, NULL);
3425
2.32k
    int sub_size = vv->varint_get32(&cp, endp, NULL);
3426
2.32k
    if (sub_size < 0 || endp - cp < sub_size)
3427
9
        goto malformed;
3428
2.31k
    c->u.byte_array_len.len_codec = cram_decoder_init(hdr, encoding, cp, sub_size,
3429
2.31k
                                                      E_INT, version, vv);
3430
2.31k
    if (c->u.byte_array_len.len_codec == NULL)
3431
45
        goto no_codec;
3432
2.27k
    cp += sub_size;
3433
3434
2.27k
    encoding = vv->varint_get32(&cp, endp, NULL);
3435
2.27k
    sub_size = vv->varint_get32(&cp, endp, NULL);
3436
2.27k
    if (sub_size < 0 || endp - cp < sub_size)
3437
6
        goto malformed;
3438
2.26k
    c->u.byte_array_len.val_codec = cram_decoder_init(hdr, encoding, cp, sub_size,
3439
2.26k
                                                      option, version, vv);
3440
2.26k
    if (c->u.byte_array_len.val_codec == NULL)
3441
3
        goto no_codec;
3442
2.26k
    cp += sub_size;
3443
3444
2.26k
    if (cp - data != size)
3445
6
        goto malformed;
3446
3447
2.25k
    return c;
3448
3449
21
 malformed:
3450
21
    hts_log_error("Malformed byte_array_len header stream");
3451
69
 no_codec:
3452
69
    cram_byte_array_len_decode_free(c);
3453
69
    return NULL;
3454
21
}
3455
3456
int cram_byte_array_len_encode(cram_slice *slice, cram_codec *c,
3457
39.3k
                               char *in, int in_size) {
3458
39.3k
    int32_t i32 = in_size;
3459
39.3k
    int r = 0;
3460
3461
39.3k
    r |= c->u.e_byte_array_len.len_codec->encode(slice,
3462
39.3k
                                                 c->u.e_byte_array_len.len_codec,
3463
39.3k
                                                 (char *)&i32, 1);
3464
39.3k
    r |= c->u.e_byte_array_len.val_codec->encode(slice,
3465
39.3k
                                                 c->u.e_byte_array_len.val_codec,
3466
39.3k
                                                 in, in_size);
3467
39.3k
    return r;
3468
39.3k
}
3469
3470
177k
void cram_byte_array_len_encode_free(cram_codec *c) {
3471
177k
    if (!c)
3472
0
        return;
3473
3474
177k
    if (c->u.e_byte_array_len.len_codec)
3475
177k
        c->u.e_byte_array_len.len_codec->free(c->u.e_byte_array_len.len_codec);
3476
3477
177k
    if (c->u.e_byte_array_len.val_codec)
3478
177k
        c->u.e_byte_array_len.val_codec->free(c->u.e_byte_array_len.val_codec);
3479
3480
177k
    free(c);
3481
177k
}
3482
3483
int cram_byte_array_len_encode_store(cram_codec *c, cram_block *b,
3484
177k
                                     char *prefix, int version) {
3485
177k
    int len = 0, len2, len3, r = 0, n;
3486
177k
    cram_codec *tc;
3487
177k
    cram_block *b_len = NULL, *b_val = NULL;
3488
3489
177k
    if (prefix) {
3490
62.8k
        size_t l = strlen(prefix);
3491
62.8k
        BLOCK_APPEND(b, prefix, l);
3492
62.8k
        len += l;
3493
62.8k
    }
3494
3495
177k
    tc = c->u.e_byte_array_len.len_codec;
3496
177k
    b_len = cram_new_block(0, 0);
3497
177k
    if (!b_len) goto block_err;
3498
177k
    len2 = tc->store(tc, b_len, NULL, version);
3499
177k
    if (len2 < 0) goto block_err;
3500
3501
177k
    tc = c->u.e_byte_array_len.val_codec;
3502
177k
    b_val = cram_new_block(0, 0);
3503
177k
    if (!b_val) goto block_err;
3504
177k
    len3 = tc->store(tc, b_val, NULL, version);
3505
177k
    if (len3 < 0) goto block_err;
3506
3507
177k
    len += (n = c->vv->varint_put32_blk(b, c->codec));  r |= n;
3508
177k
    len += (n = c->vv->varint_put32_blk(b, len2+len3)); r |= n;
3509
177k
    BLOCK_APPEND(b, BLOCK_DATA(b_len), BLOCK_SIZE(b_len));
3510
177k
    BLOCK_APPEND(b, BLOCK_DATA(b_val), BLOCK_SIZE(b_val));
3511
3512
177k
    cram_free_block(b_len);
3513
177k
    cram_free_block(b_val);
3514
3515
177k
    if (r > 0)
3516
177k
        return len + len2 + len3;
3517
3518
0
 block_err:
3519
0
    if (b_len) cram_free_block(b_len);
3520
0
    if (b_val) cram_free_block(b_val);
3521
0
    return -1;
3522
177k
}
3523
3524
cram_codec *cram_byte_array_len_encode_init(cram_stats *st,
3525
                                            enum cram_encoding codec,
3526
                                            enum cram_external_type option,
3527
                                            void *dat,
3528
177k
                                            int version, varint_vec *vv) {
3529
177k
    cram_codec *c;
3530
177k
    cram_byte_array_len_encoder *e = (cram_byte_array_len_encoder *)dat;
3531
3532
177k
    c = malloc(sizeof(*c));
3533
177k
    if (!c)
3534
0
        return NULL;
3535
177k
    c->codec = E_BYTE_ARRAY_LEN;
3536
177k
    c->free = cram_byte_array_len_encode_free;
3537
177k
    c->encode = cram_byte_array_len_encode;
3538
177k
    c->store = cram_byte_array_len_encode_store;
3539
177k
    c->flush = NULL;
3540
3541
177k
    c->u.e_byte_array_len.len_codec = cram_encoder_init(e->len_encoding,
3542
177k
                                                        st, E_INT,
3543
177k
                                                        e->len_dat,
3544
177k
                                                        version, vv);
3545
177k
    c->u.e_byte_array_len.val_codec = cram_encoder_init(e->val_encoding,
3546
177k
                                                        NULL, E_BYTE_ARRAY,
3547
177k
                                                        e->val_dat,
3548
177k
                                                        version, vv);
3549
3550
177k
    if (!c->u.e_byte_array_len.len_codec ||
3551
177k
        !c->u.e_byte_array_len.val_codec) {
3552
0
        cram_byte_array_len_encode_free(c);
3553
0
        return NULL;
3554
0
    }
3555
3556
177k
    return c;
3557
177k
}
3558
3559
/*
3560
 * ---------------------------------------------------------------------------
3561
 * BYTE_ARRAY_STOP
3562
 */
3563
static int cram_byte_array_stop_decode_char(cram_slice *slice, cram_codec *c,
3564
                                            cram_block *in, char *out,
3565
0
                                            int *out_size) {
3566
0
    char *cp, ch;
3567
0
    cram_block *b = NULL;
3568
3569
0
    b = cram_get_block_by_id(slice, c->u.byte_array_stop.content_id);
3570
0
    if (!b)
3571
0
        return *out_size?-1:0;
3572
3573
0
    if (b->idx >= b->uncomp_size)
3574
0
        return -1;
3575
3576
0
    cp = (char *)b->data + b->idx;
3577
0
    if (out) {
3578
       // memccpy equivalent but without copying the terminating byte
3579
0
        ssize_t term = MIN(*out_size, b->uncomp_size - b->idx);
3580
0
        while ((ch = *cp) != (char)c->u.byte_array_stop.stop) {
3581
0
            if (term-- < 0)
3582
0
                break;
3583
0
            *out++ = ch;
3584
0
            cp++;
3585
0
        }
3586
3587
        // Attempted overrun on input or output
3588
0
        if (ch != (char)c->u.byte_array_stop.stop)
3589
0
            return -1;
3590
0
    } else {
3591
        // Consume input, but produce no output
3592
0
        while ((ch = *cp) != (char)c->u.byte_array_stop.stop) {
3593
0
            if (cp - (char *)b->data >= b->uncomp_size)
3594
0
                return -1;
3595
0
            cp++;
3596
0
        }
3597
0
    }
3598
3599
0
    *out_size = cp - (char *)(b->data + b->idx);
3600
0
    b->idx = cp - (char *)b->data + 1;
3601
3602
0
    return 0;
3603
0
}
3604
3605
int cram_byte_array_stop_decode_block(cram_slice *slice, cram_codec *c,
3606
                                      cram_block *in, char *out_,
3607
0
                                      int *out_size) {
3608
0
    cram_block *b;
3609
0
    cram_block *out = (cram_block *)out_;
3610
0
    unsigned char *cp, *cp_end;
3611
0
    unsigned char stop;
3612
3613
0
    b = cram_get_block_by_id(slice, c->u.byte_array_stop.content_id);
3614
0
    if (!b)
3615
0
        return *out_size?-1:0;
3616
3617
0
    if (b->idx >= b->uncomp_size)
3618
0
        return -1;
3619
0
    cp = b->data + b->idx;
3620
0
    cp_end = b->data + b->uncomp_size;
3621
3622
    // STOP byte is hard-coded as zero by our name tokeniser decoder
3623
    // implementation, so we may ignore what was requested.
3624
0
    stop = b->orig_method == TOK3 ? 0 : c->u.byte_array_stop.stop;
3625
3626
0
    if (cp_end - cp < out->alloc - out->byte) {
3627
0
        unsigned char *out_cp = BLOCK_END(out);
3628
0
        while (cp != cp_end && *cp != stop)
3629
0
            *out_cp++ = *cp++;
3630
0
        BLOCK_SIZE(out) = out_cp - BLOCK_DATA(out);
3631
0
    } else {
3632
0
        unsigned char *cp_start;
3633
0
        for (cp_start = cp; cp != cp_end && *cp != stop; cp++)
3634
0
            ;
3635
0
        BLOCK_APPEND(out, cp_start, cp - cp_start);
3636
0
        BLOCK_GROW(out, cp - cp_start);
3637
0
    }
3638
3639
0
    *out_size = cp - (b->data + b->idx);
3640
0
    b->idx = cp - b->data + 1;
3641
3642
0
    return 0;
3643
3644
0
 block_err:
3645
0
    return -1;
3646
0
}
3647
3648
663
void cram_byte_array_stop_decode_free(cram_codec *c) {
3649
663
    if (!c) return;
3650
3651
663
    free(c);
3652
663
}
3653
3654
0
int cram_byte_array_stop_describe(cram_codec *c, kstring_t *ks) {
3655
0
    return ksprintf(ks, "BYTE_ARRAY_STOP(stop=%d,id=%d)",
3656
0
                    c->u.byte_array_stop.stop,
3657
0
                    c->u.byte_array_stop.content_id)
3658
0
        < 0 ? -1 : 0;
3659
0
}
3660
3661
cram_codec *cram_byte_array_stop_decode_init(cram_block_compression_hdr *hdr,
3662
                                             char *data, int size,
3663
                                             enum cram_encoding codec,
3664
                                             enum cram_external_type option,
3665
675
                                             int version, varint_vec *vv) {
3666
675
    cram_codec *c = NULL;
3667
675
    unsigned char *cp = (unsigned char *)data;
3668
675
    int err = 0;
3669
3670
675
    if (size < (CRAM_MAJOR_VERS(version) == 1 ? 5 : 2))
3671
3
        goto malformed;
3672
3673
672
    if (!(c = malloc(sizeof(*c))))
3674
0
        return NULL;
3675
3676
672
    c->codec  = E_BYTE_ARRAY_STOP;
3677
672
    switch (option) {
3678
621
    case E_BYTE_ARRAY_BLOCK:
3679
621
        c->decode = cram_byte_array_stop_decode_block;
3680
621
        break;
3681
48
    case E_BYTE_ARRAY:
3682
48
        c->decode = cram_byte_array_stop_decode_char;
3683
48
        break;
3684
3
    default:
3685
3
        hts_log_error("The byte_array_stop codec only supports BYTE_ARRAYs");
3686
3
        free(c);
3687
3
        return NULL;
3688
672
    }
3689
669
    c->free   = cram_byte_array_stop_decode_free;
3690
669
    c->describe = cram_byte_array_stop_describe;
3691
3692
669
    c->u.byte_array_stop.stop = *cp++;
3693
669
    if (CRAM_MAJOR_VERS(version) == 1) {
3694
666
        c->u.byte_array_stop.content_id = cp[0] + (cp[1]<<8) + (cp[2]<<16)
3695
666
            + ((unsigned int) cp[3]<<24);
3696
666
        cp += 4;
3697
666
    } else {
3698
3
        c->u.byte_array_stop.content_id = vv->varint_get32((char **)&cp, data+size, &err);
3699
3
    }
3700
3701
669
    if ((char *)cp - data != size || err)
3702
6
        goto malformed;
3703
3704
663
    return c;
3705
3706
9
 malformed:
3707
9
    hts_log_error("Malformed byte_array_stop header stream");
3708
9
    free(c);
3709
9
    return NULL;
3710
669
}
3711
3712
int cram_byte_array_stop_encode(cram_slice *slice, cram_codec *c,
3713
216k
                                char *in, int in_size) {
3714
216k
    BLOCK_APPEND(c->out, in, in_size);
3715
216k
    BLOCK_APPEND_CHAR(c->out, c->u.e_byte_array_stop.stop);
3716
216k
    return 0;
3717
3718
0
 block_err:
3719
0
    return -1;
3720
216k
}
3721
3722
263k
void cram_byte_array_stop_encode_free(cram_codec *c) {
3723
263k
    if (!c)
3724
0
        return;
3725
263k
    free(c);
3726
263k
}
3727
3728
int cram_byte_array_stop_encode_store(cram_codec *c, cram_block *b,
3729
263k
                                      char *prefix, int version) {
3730
263k
    int len = 0;
3731
263k
    char buf[20], *cp = buf;
3732
3733
263k
    if (prefix) {
3734
188k
        size_t l = strlen(prefix);
3735
188k
        BLOCK_APPEND(b, prefix, l);
3736
188k
        len += l;
3737
188k
    }
3738
3739
263k
    cp += c->vv->varint_put32(cp, buf+20, c->codec);
3740
3741
263k
    if (CRAM_MAJOR_VERS(version) == 1) {
3742
0
        cp += c->vv->varint_put32(cp, buf+20, 5);
3743
0
        *cp++ = c->u.e_byte_array_stop.stop;
3744
0
        *cp++ = (c->u.e_byte_array_stop.content_id >>  0) & 0xff;
3745
0
        *cp++ = (c->u.e_byte_array_stop.content_id >>  8) & 0xff;
3746
0
        *cp++ = (c->u.e_byte_array_stop.content_id >> 16) & 0xff;
3747
0
        *cp++ = (c->u.e_byte_array_stop.content_id >> 24) & 0xff;
3748
263k
    } else {
3749
263k
        cp += c->vv->varint_put32(cp, buf+20, 1 +
3750
263k
                                  c->vv->varint_size(c->u.e_byte_array_stop.content_id));
3751
263k
        *cp++ = c->u.e_byte_array_stop.stop;
3752
263k
        cp += c->vv->varint_put32(cp, buf+20, c->u.e_byte_array_stop.content_id);
3753
263k
    }
3754
3755
263k
    BLOCK_APPEND(b, buf, cp-buf);
3756
263k
    len += cp-buf;
3757
3758
263k
    return len;
3759
3760
0
 block_err:
3761
0
    return -1;
3762
263k
}
3763
3764
cram_codec *cram_byte_array_stop_encode_init(cram_stats *st,
3765
                                             enum cram_encoding codec,
3766
                                             enum cram_external_type option,
3767
                                             void *dat,
3768
263k
                                             int version, varint_vec *vv) {
3769
263k
    cram_codec *c;
3770
3771
263k
    c = malloc(sizeof(*c));
3772
263k
    if (!c)
3773
0
        return NULL;
3774
263k
    c->codec = E_BYTE_ARRAY_STOP;
3775
263k
    c->free = cram_byte_array_stop_encode_free;
3776
263k
    c->encode = cram_byte_array_stop_encode;
3777
263k
    c->store = cram_byte_array_stop_encode_store;
3778
263k
    c->flush = NULL;
3779
3780
263k
    c->u.e_byte_array_stop.stop = ((int *)dat)[0];
3781
263k
    c->u.e_byte_array_stop.content_id = ((int *)dat)[1];
3782
3783
263k
    return c;
3784
263k
}
3785
3786
/*
3787
 * ---------------------------------------------------------------------------
3788
 */
3789
3790
328
const char *cram_encoding2str(enum cram_encoding t) {
3791
328
    switch (t) {
3792
128
    case E_NULL:            return "NULL";
3793
0
    case E_EXTERNAL:        return "EXTERNAL";
3794
6
    case E_GOLOMB:          return "GOLOMB";
3795
0
    case E_HUFFMAN:         return "HUFFMAN";
3796
0
    case E_BYTE_ARRAY_LEN:  return "BYTE_ARRAY_LEN";
3797
0
    case E_BYTE_ARRAY_STOP: return "BYTE_ARRAY_STOP";
3798
93
    case E_BETA:            return "BETA";
3799
0
    case E_SUBEXP:          return "SUBEXP";
3800
3
    case E_GOLOMB_RICE:     return "GOLOMB_RICE";
3801
0
    case E_GAMMA:           return "GAMMA";
3802
3803
0
    case E_VARINT_UNSIGNED: return "VARINT_UNSIGNED";
3804
0
    case E_VARINT_SIGNED:   return "VARINT_SIGNED";
3805
0
    case E_CONST_BYTE:      return "CONST_BYTE";
3806
0
    case E_CONST_INT:       return "CONST_INT";
3807
3808
0
    case E_NUM_CODECS:
3809
98
    default:                return "?";
3810
328
    }
3811
328
}
3812
3813
static cram_codec *(*decode_init[])(cram_block_compression_hdr *hdr,
3814
                                    char *data,
3815
                                    int size,
3816
                                    enum cram_encoding codec,
3817
                                    enum cram_external_type option,
3818
                                    int version, varint_vec *vv) = {
3819
    // CRAM 3.0 valid codecs
3820
    NULL, // null codec
3821
    cram_external_decode_init,
3822
    NULL, // golomb
3823
    cram_huffman_decode_init,
3824
    cram_byte_array_len_decode_init,
3825
    cram_byte_array_stop_decode_init,
3826
    cram_beta_decode_init,
3827
    cram_subexp_decode_init,
3828
    NULL, // golomb rice
3829
    cram_gamma_decode_init,
3830
3831
    // Gap between CRAM 3 and CRAM 4; 9 to 39 inclusive
3832
    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
3833
    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
3834
    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
3835
3836
    NULL,                      // was xbyte
3837
    cram_varint_decode_init,   // varint unsigned
3838
    cram_varint_decode_init,   // varint signed
3839
    cram_const_decode_init,    // const byte
3840
    cram_const_decode_init,    // const int
3841
3842
    // Gap to CRAM 4 transfomrations; 45 to 49 inclusive
3843
    NULL, NULL, NULL, NULL, NULL,
3844
3845
    NULL, // xhuffman
3846
    cram_xpack_decode_init,
3847
    cram_xrle_decode_init,
3848
    cram_xdelta_decode_init,
3849
};
3850
3851
cram_codec *cram_decoder_init(cram_block_compression_hdr *hdr,
3852
                              enum cram_encoding codec,
3853
                              char *data, int size,
3854
                              enum cram_external_type option,
3855
15.0k
                              int version, varint_vec *vv) {
3856
15.0k
    if (codec >= E_NULL && codec < E_NUM_CODECS && decode_init[codec]) {
3857
14.8k
        cram_codec *r = decode_init[codec](hdr, data, size, codec,
3858
14.8k
                                           option, version, vv);
3859
14.8k
        if (r) {
3860
14.0k
            r->vv = vv;
3861
14.0k
            r->codec_id = hdr->ncodecs++;
3862
14.0k
        }
3863
14.8k
        return r;
3864
14.8k
    } else {
3865
235
        hts_log_error("Unimplemented codec of type %s", cram_encoding2str(codec));
3866
235
        return NULL;
3867
235
    }
3868
15.0k
}
3869
3870
static cram_codec *(*encode_init[])(cram_stats *stx,
3871
                                    enum cram_encoding codec,
3872
                                    enum cram_external_type option,
3873
                                    void *opt,
3874
                                    int version, varint_vec *vv) = {
3875
    // CRAM 3.0 valid codecs
3876
    NULL, // null codec
3877
    cram_external_encode_init, // int/bytes in cram 3, byte only in cram 4
3878
    NULL, // golomb
3879
    cram_huffman_encode_init,
3880
    cram_byte_array_len_encode_init,
3881
    cram_byte_array_stop_encode_init,
3882
    cram_beta_encode_init,
3883
    NULL, // subexponential (we support decode only)
3884
    NULL, // golomb rice
3885
    NULL, // gamma (we support decode only)
3886
3887
    // Gap between CRAM 3 and CRAM 4; 9 to 39 inclusive
3888
    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
3889
    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
3890
    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
3891
3892
    NULL, // was xbyte
3893
    cram_varint_encode_init, // varint unsigned
3894
    cram_varint_encode_init, // varint signed
3895
    cram_const_encode_init,  // const byte
3896
    cram_const_encode_init,  // const int
3897
3898
    // Gap to CRAM 4 transfomrations; 45 to 49 inclusive
3899
    NULL, NULL, NULL, NULL, NULL,
3900
3901
    NULL, // xhuffman
3902
    cram_xpack_encode_init,
3903
    cram_xrle_encode_init,
3904
    cram_xdelta_encode_init,
3905
};
3906
3907
cram_codec *cram_encoder_init(enum cram_encoding codec,
3908
                              cram_stats *st,
3909
                              enum cram_external_type option,
3910
                              void *dat,
3911
2.24M
                              int version, varint_vec *vv) {
3912
2.24M
    if (st && !st->nvals)
3913
530k
        return NULL;
3914
3915
    // cram_stats_encoding assumes integer data, but if option
3916
    // is E_BYTE then tweak the requested encoding.  This ought
3917
    // to be fixed in cram_stats_encoding instead.
3918
1.71M
    if (option == E_BYTE || option == E_BYTE_ARRAY ||
3919
1.71M
       option == E_BYTE_ARRAY_BLOCK) {
3920
715k
       if (codec == E_VARINT_SIGNED || codec == E_VARINT_UNSIGNED)
3921
0
           codec = E_EXTERNAL;
3922
715k
       else if (codec == E_CONST_INT)
3923
0
           codec = E_CONST_BYTE;
3924
715k
    }
3925
3926
1.71M
    if (encode_init[codec]) {
3927
1.71M
        cram_codec *r;
3928
1.71M
        if ((r = encode_init[codec](st, codec, option, dat, version, vv)))
3929
1.71M
            r->out = NULL;
3930
1.71M
        if (!r) {
3931
93
            hts_log_error("Unable to initialise codec of type %s", cram_encoding2str(codec));
3932
93
            return NULL;
3933
93
        }
3934
1.71M
        r->vv = vv;
3935
1.71M
        return r;
3936
1.71M
    } else {
3937
0
        hts_log_error("Unimplemented codec of type %s", cram_encoding2str(codec));
3938
0
        abort();
3939
0
    }
3940
1.71M
}
3941
3942
/*
3943
 * Returns the content_id used by this codec, also in id2 if byte_array_len.
3944
 * Returns -1 for the CORE block and -2 for unneeded.
3945
 * id2 is only filled out for BYTE_ARRAY_LEN which uses 2 codecs.
3946
 */
3947
0
int cram_codec_to_id(cram_codec *c, int *id2) {
3948
0
    int bnum1, bnum2 = -2;
3949
3950
0
    switch (c->codec) {
3951
0
    case E_CONST_INT:
3952
0
    case E_CONST_BYTE:
3953
0
        bnum1 = -2; // no blocks used
3954
0
        break;
3955
3956
0
    case E_HUFFMAN:
3957
0
        bnum1 = c->u.huffman.ncodes == 1 ? -2 : -1;
3958
0
        break;
3959
3960
0
    case E_GOLOMB:
3961
0
    case E_BETA:
3962
0
    case E_SUBEXP:
3963
0
    case E_GOLOMB_RICE:
3964
0
    case E_GAMMA:
3965
        // CORE block
3966
0
        bnum1 = -1;
3967
0
        break;
3968
3969
0
    case E_EXTERNAL:
3970
0
    case E_VARINT_UNSIGNED:
3971
0
    case E_VARINT_SIGNED:
3972
0
        bnum1 = c->u.external.content_id;
3973
0
        break;
3974
3975
0
    case E_BYTE_ARRAY_LEN:
3976
0
        bnum1 = cram_codec_to_id(c->u.byte_array_len.len_codec, NULL);
3977
0
        bnum2 = cram_codec_to_id(c->u.byte_array_len.val_codec, NULL);
3978
0
        break;
3979
3980
0
    case E_BYTE_ARRAY_STOP:
3981
0
        bnum1 = c->u.byte_array_stop.content_id;
3982
0
        break;
3983
3984
0
    case E_NULL:
3985
0
        bnum1 = -2;
3986
0
        break;
3987
3988
0
    default:
3989
0
        hts_log_error("Unknown codec type %d", c->codec);
3990
0
        bnum1 = -1;
3991
0
    }
3992
3993
0
    if (id2)
3994
0
        *id2 = bnum2;
3995
0
    return bnum1;
3996
0
}
3997
3998
3999
/*
4000
 * cram_codec structures are specialised for decoding or encoding.
4001
 * Unfortunately this makes turning a decoder into an encoder (such as
4002
 * when transcoding files) problematic.
4003
 *
4004
 * This function converts a cram decoder codec into an encoder version
4005
 * in-place (ie it modifiers the codec itself).
4006
 *
4007
 * Returns 0 on success;
4008
 *        -1 on failure.
4009
 */
4010
0
int cram_codec_decoder2encoder(cram_fd *fd, cram_codec *c) {
4011
0
    int j;
4012
4013
0
    switch (c->codec) {
4014
0
    case E_CONST_INT:
4015
0
    case E_CONST_BYTE:
4016
        // shares struct with decode
4017
0
        c->store = cram_const_encode_store;
4018
0
        break;
4019
4020
0
    case E_EXTERNAL:
4021
        // shares struct with decode
4022
0
        c->free = cram_external_encode_free;
4023
0
        c->store = cram_external_encode_store;
4024
0
        if (c->decode == cram_external_decode_int)
4025
0
            c->encode = cram_external_encode_int;
4026
0
        else if (c->decode == cram_external_decode_long)
4027
0
            c->encode = cram_external_encode_long;
4028
0
        else if (c->decode == cram_external_decode_char)
4029
0
            c->encode = cram_external_encode_char;
4030
0
        else if (c->decode == cram_external_decode_block)
4031
0
            c->encode = cram_external_encode_char;
4032
0
        else
4033
0
            return -1;
4034
0
        break;
4035
4036
0
    case E_VARINT_SIGNED:
4037
0
    case E_VARINT_UNSIGNED:
4038
        // shares struct with decode
4039
0
        c->free = cram_varint_encode_free;
4040
0
        c->store = cram_varint_encode_store;
4041
0
        if (c->decode == cram_varint_decode_int)
4042
0
            c->encode = cram_varint_encode_int;
4043
0
        else if (c->decode == cram_varint_decode_sint)
4044
0
            c->encode = cram_varint_encode_sint;
4045
0
        else if (c->decode == cram_varint_decode_long)
4046
0
            c->encode = cram_varint_encode_long;
4047
0
        else if (c->decode == cram_varint_decode_slong)
4048
0
            c->encode = cram_varint_encode_slong;
4049
0
        else
4050
0
            return -1;
4051
0
        break;
4052
4053
0
    case E_HUFFMAN: {
4054
        // New structure, so switch.
4055
        // FIXME: we huffman and e_huffman structs amended, we could
4056
        // unify this.
4057
0
        cram_codec *t = malloc(sizeof(*t));
4058
0
        if (!t) return -1;
4059
0
        t->vv     = c->vv;
4060
0
        t->codec = E_HUFFMAN;
4061
0
        t->free = cram_huffman_encode_free;
4062
0
        t->store = cram_huffman_encode_store;
4063
0
        t->u.e_huffman.codes = c->u.huffman.codes;
4064
0
        t->u.e_huffman.nvals = c->u.huffman.ncodes;
4065
0
        t->u.e_huffman.option = c->u.huffman.option;
4066
0
        for (j = 0; j < t->u.e_huffman.nvals; j++) {
4067
0
            int32_t sym = t->u.e_huffman.codes[j].symbol;
4068
0
            if (sym >= -1 && sym < MAX_HUFF)
4069
0
                t->u.e_huffman.val2code[sym+1] = j;
4070
0
        }
4071
4072
0
        if (c->decode == cram_huffman_decode_char0)
4073
0
            t->encode = cram_huffman_encode_char0;
4074
0
        else if (c->decode == cram_huffman_decode_char)
4075
0
            t->encode = cram_huffman_encode_char;
4076
0
        else if (c->decode == cram_huffman_decode_int0)
4077
0
            t->encode = cram_huffman_encode_int0;
4078
0
        else if (c->decode == cram_huffman_decode_int)
4079
0
            t->encode = cram_huffman_encode_int;
4080
0
        else if (c->decode == cram_huffman_decode_long0)
4081
0
            t->encode = cram_huffman_encode_long0;
4082
0
        else if (c->decode == cram_huffman_decode_long)
4083
0
            t->encode = cram_huffman_encode_long;
4084
0
        else {
4085
0
            free(t);
4086
0
            return -1;
4087
0
        }
4088
0
        *c = *t;
4089
0
        free(t);
4090
0
        break;
4091
0
    }
4092
4093
0
    case E_BETA:
4094
        // shares struct with decode
4095
0
        c->free = cram_beta_encode_free;
4096
0
        c->store = cram_beta_encode_store;
4097
0
        if (c->decode == cram_beta_decode_int)
4098
0
            c->encode = cram_beta_encode_int;
4099
0
        else if (c->decode == cram_beta_decode_long)
4100
0
            c->encode = cram_beta_encode_long;
4101
0
        else if (c->decode == cram_beta_decode_char)
4102
0
            c->encode = cram_beta_encode_char;
4103
0
        else
4104
0
            return -1;
4105
0
        break;
4106
4107
0
    case E_XPACK: {
4108
        // shares struct with decode
4109
0
        cram_codec t = *c;
4110
0
        t.free = cram_xpack_encode_free;
4111
0
        t.store = cram_xpack_encode_store;
4112
0
        if (t.decode == cram_xpack_decode_long)
4113
0
            t.encode = cram_xpack_encode_long;
4114
0
        else if (t.decode == cram_xpack_decode_int)
4115
0
            t.encode = cram_xpack_encode_int;
4116
0
        else if (t.decode == cram_xpack_decode_char)
4117
0
            t.encode = cram_xpack_encode_char;
4118
0
        else
4119
0
            return -1;
4120
0
        t.u.e_xpack.sub_codec = t.u.xpack.sub_codec;
4121
0
        if (cram_codec_decoder2encoder(fd, t.u.e_xpack.sub_codec) == -1)
4122
0
            return -1;
4123
0
        *c = t;
4124
0
        break;
4125
0
    }
4126
4127
0
    case E_BYTE_ARRAY_LEN: {
4128
0
        cram_codec *t = malloc(sizeof(*t));
4129
0
        if (!t) return -1;
4130
0
        t->vv     = c->vv;
4131
0
        t->codec  = E_BYTE_ARRAY_LEN;
4132
0
        t->free   = cram_byte_array_len_encode_free;
4133
0
        t->store  = cram_byte_array_len_encode_store;
4134
0
        t->encode = cram_byte_array_len_encode;
4135
0
        t->u.e_byte_array_len.len_codec = c->u.byte_array_len.len_codec;
4136
0
        t->u.e_byte_array_len.val_codec = c->u.byte_array_len.val_codec;
4137
0
        if (cram_codec_decoder2encoder(fd, t->u.e_byte_array_len.len_codec) == -1 ||
4138
0
            cram_codec_decoder2encoder(fd, t->u.e_byte_array_len.val_codec) == -1) {
4139
0
            t->free(t);
4140
0
            return -1;
4141
0
        }
4142
4143
        // {len,val}_{encoding,dat} are undefined, but unused.
4144
        // Leaving them unset here means we can test that assertion.
4145
0
        *c = *t;
4146
0
        free(t);
4147
0
        break;
4148
0
    }
4149
4150
0
    case E_BYTE_ARRAY_STOP:
4151
        // shares struct with decode
4152
0
        c->free   = cram_byte_array_stop_encode_free;
4153
0
        c->store  = cram_byte_array_stop_encode_store;
4154
0
        c->encode = cram_byte_array_stop_encode;
4155
0
        break;
4156
4157
0
    default:
4158
0
        return -1;
4159
0
    }
4160
4161
0
    return 0;
4162
0
}
4163
4164
0
int cram_codec_describe(cram_codec *c, kstring_t *ks) {
4165
0
    if (c && c->describe)
4166
0
        return c->describe(c, ks);
4167
0
    else
4168
0
        return ksprintf(ks, "?");
4169
0
}