Coverage Report

Created: 2025-07-18 07:26

/src/htslib/cram/cram_codecs.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
Copyright (c) 2012-2021,2023 Genome Research Ltd.
3
Author: James Bonfield <jkb@sanger.ac.uk>
4
5
Redistribution and use in source and binary forms, with or without
6
modification, are permitted provided that the following conditions are met:
7
8
   1. Redistributions of source code must retain the above copyright notice,
9
this list of conditions and the following disclaimer.
10
11
   2. Redistributions in binary form must reproduce the above copyright notice,
12
this list of conditions and the following disclaimer in the documentation
13
and/or other materials provided with the distribution.
14
15
   3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
16
Institute nor the names of its contributors may be used to endorse or promote
17
products derived from this software without specific prior written permission.
18
19
THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
20
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
23
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
*/
30
31
/*
32
 * FIXME: add checking of cram_external_type to return NULL on unsupported
33
 * {codec,type} tuples.
34
 */
35
36
#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
37
#include <config.h>
38
39
#include <stdlib.h>
40
#include <string.h>
41
#include <assert.h>
42
#include <limits.h>
43
#include <stdint.h>
44
#include <errno.h>
45
#include <stddef.h>
46
47
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
48
#include "../fuzz_settings.h"
49
#endif
50
51
#include "../htslib/hts_endian.h"
52
53
#if defined(HAVE_EXTERNAL_LIBHTSCODECS)
54
#include <htscodecs/varint.h>
55
#include <htscodecs/pack.h>
56
#include <htscodecs/rle.h>
57
#else
58
#include "../htscodecs/htscodecs/varint.h"
59
#include "../htscodecs/htscodecs/pack.h"
60
#include "../htscodecs/htscodecs/rle.h"
61
#endif
62
63
#include "cram.h"
64
65
/*
66
 * ---------------------------------------------------------------------------
67
 * Block bit-level I/O functions.
68
 * All defined static here to promote easy inlining by the compiler.
69
 */
70
71
#if 0
72
/* Get a single bit, MSB first */
73
static signed int get_bit_MSB(cram_block *block) {
74
    unsigned int val;
75
76
    if (block->byte > block->alloc)
77
        return -1;
78
79
    val = block->data[block->byte] >> block->bit;
80
    if (--block->bit == -1) {
81
        block->bit = 7;
82
        block->byte++;
83
        //printf("(%02X)", block->data[block->byte]);
84
    }
85
86
    //printf("-B%d-", val&1);
87
88
    return val & 1;
89
}
90
#endif
91
92
/*
93
 * Count number of successive 0 and 1 bits
94
 */
95
0
/*
 * Counts the run of consecutive 1 bits at the current bit position,
 * reading most-significant-bit first, and consumes the run together with
 * the 0 bit that terminates it.
 *
 * Returns the number of 1 bits seen, or -1 if the read position is already
 * at/after uncomp_size or the data ends while the run is still open.
 */
static int get_one_bits_MSB(cram_block *block) {
    int ones = 0;

    if (block->byte >= block->uncomp_size)
        return -1;

    for (;;) {
        int bit = (block->data[block->byte] >> block->bit) & 1;

        /* Step to the next bit, rolling over to the next byte as needed. */
        if (--block->bit == -1) {
            block->bit = 7;
            block->byte++;
            /* Ran off the end with the run still unterminated. */
            if (bit && block->byte == block->uncomp_size)
                return -1;
        }

        if (!bit)
            break;
        ones++;
    }

    return ones;
}
112
113
0
/*
 * Counts the run of consecutive 0 bits at the current bit position,
 * reading most-significant-bit first, and consumes the run together with
 * the 1 bit that terminates it.
 *
 * Returns the number of 0 bits seen, or -1 if the read position is already
 * at/after uncomp_size or the data ends while the run is still open.
 */
static int get_zero_bits_MSB(cram_block *block) {
    int zeros = 0;

    if (block->byte >= block->uncomp_size)
        return -1;

    for (;;) {
        int bit = (block->data[block->byte] >> block->bit) & 1;

        /* Step to the next bit, rolling over to the next byte as needed. */
        if (--block->bit == -1) {
            block->bit = 7;
            block->byte++;
            /* Ran off the end with the run still unterminated. */
            if (!bit && block->byte == block->uncomp_size)
                return -1;
        }

        if (bit)
            break;
        zeros++;
    }

    return zeros;
}
130
131
#if 0
132
/* Stores a single bit */
133
static void store_bit_MSB(cram_block *block, unsigned int bit) {
134
    if (block->byte >= block->alloc) {
135
        block->alloc = block->alloc ? block->alloc*2 : 1024;
136
        block->data = realloc(block->data, block->alloc);
137
    }
138
139
    if (bit)
140
        block->data[block->byte] |= (1 << block->bit);
141
142
    if (--block->bit == -1) {
143
        block->bit = 7;
144
        block->byte++;
145
        block->data[block->byte] = 0;
146
    }
147
}
148
#endif
149
150
#if 0
151
/* Rounds to the next whole byte boundary first */
152
static void store_bytes_MSB(cram_block *block, char *bytes, int len) {
153
    if (block->bit != 7) {
154
        block->bit = 7;
155
        block->byte++;
156
    }
157
158
    while (block->byte + len >= block->alloc) {
159
        block->alloc = block->alloc ? block->alloc*2 : 1024;
160
        block->data = realloc(block->data, block->alloc);
161
    }
162
163
    memcpy(&block->data[block->byte], bytes, len);
164
    block->byte += len;
165
}
166
#endif
167
168
/* Local optimised copy for inlining */
169
0
/*
 * Reads nbits bits from the block, most significant bit first, and returns
 * them as an integer.  The block's byte/bit cursor is advanced past the
 * bits consumed.
 *
 * No bounds checking is performed here; callers must ensure enough data
 * remains.  GET_BIT_MSB is defined outside this file section; it is
 * presumably the single-bit "shift into accumulator" primitive.
 */
static inline int64_t get_bits_MSB(cram_block *block, int nbits) {
    uint64_t acc = 0;
    int k;

    /* Fast path: the whole field lies within the current byte. */
    if (nbits <= block->bit+1) {
        int shift      = block->bit - (nbits - 1);
        int field_mask = (1 << nbits) - 1;

        acc = (block->data[block->byte] >> shift) & field_mask;

        block->bit -= nbits;
        if (block->bit == -1) {
            block->bit = 7;
            block->byte++;
        }
        return acc;
    }

    /* Field straddles a byte boundary: pull bits one at a time.  Small
     * widths use an unrolled fall-through ladder, others a plain loop. */
    switch (nbits) {
    case  8: GET_BIT_MSB(block, acc); // fall through
    case  7: GET_BIT_MSB(block, acc); // fall through
    case  6: GET_BIT_MSB(block, acc); // fall through
    case  5: GET_BIT_MSB(block, acc); // fall through
    case  4: GET_BIT_MSB(block, acc); // fall through
    case  3: GET_BIT_MSB(block, acc); // fall through
    case  2: GET_BIT_MSB(block, acc); // fall through
    case  1: GET_BIT_MSB(block, acc);
        break;

    default:
        for (k = 0; k < nbits; k++)
            GET_BIT_MSB(block, acc);
    }

    return acc;
}
250
251
/*
252
 * Can store up to 24-bits worth of data encoded in an integer value
253
 * Possibly we'd want to have a less optimal store_bits function when dealing
254
 * with nbits > 24, but for now we assume the codes generated are never
255
 * that big. (Given this is only possible with 121392 or more
256
 * characters with exactly the correct frequency distribution we check
257
 * for it elsewhere.)
258
 */
259
17.6k
/*
 * Appends the low nbits bits of val to the block, most significant bit
 * first, growing the block's buffer when fewer than ~8 spare bytes remain.
 *
 * Returns 0 on success, -1 if the buffer could not be grown.  On failure
 * the block is left untouched: the existing buffer and its recorded
 * allocation size are preserved so the caller can still free it.
 */
static int store_bits_MSB(cram_block *block, uint64_t val, int nbits) {
    unsigned int mask;

    if (block->byte+8 >= block->alloc) {
        /* First use starts at 1024 bytes; afterwards double. */
        size_t new_alloc = block->byte ? block->alloc * 2 : 1024;
        /* Allocate into a temporary so a failed realloc does not lose
         * (and leak) the current buffer. */
        void *grown = realloc(block->data, new_alloc + 8);
        if (!grown)
            return -1;

        block->data  = grown;
        block->alloc = new_alloc;
        if (!block->byte)
            block->data[0] = 0; // initialise first byte of buffer
    }

    /* fits in current bit-field */
    if (nbits <= block->bit+1) {
        block->data[block->byte] |= (val << (block->bit+1-nbits));
        if ((block->bit-=nbits) == -1) {
            block->bit = 7;
            block->byte++;
            block->data[block->byte] = 0;
        }
        return 0;
    }

    /* Fill the remainder of the current byte with the top bits of val;
     * nbits becomes the number of bits still to be written. */
    block->data[block->byte] |= (val >> (nbits -= block->bit+1));
    block->bit = 7;
    block->byte++;
    block->data[block->byte] = 0;

    /* Emit the remaining bits one at a time, highest first.  Unsigned
     * shift avoids signed-overflow when the top bit is selected. */
    mask = 1u<<(nbits-1);
    do {
        if (val & mask)
            block->data[block->byte] |= (1 << block->bit);
        if (--block->bit == -1) {
            block->bit = 7;
            block->byte++;
            block->data[block->byte] = 0;
        }
        mask >>= 1;
    } while(--nbits);

    return 0;
}
313
314
/*
315
 * Returns the next 'size' bytes from a block, or NULL if insufficient
316
 * data left.  This is just a pointer into the block data and not an
317
 * allocated object, so do not free the result.
318
 */
319
0
/*
 * Hands back a pointer to the next 'size' bytes of the block and advances
 * the block's read index past them.  The pointer aliases the block's own
 * data.  Returns NULL when the advanced index exceeds uncomp_size; note
 * that the index has already been advanced in that case.
 */
static char *cram_extract_block(cram_block *b, int size) {
    char *start = (char *)b->data + b->idx;

    b->idx += size;

    return (b->idx > b->uncomp_size) ? NULL : start;
}
327
328
/*
329
 * ---------------------------------------------------------------------------
330
 * EXTERNAL
331
 *
332
 * In CRAM 3.0 and earlier, E_EXTERNAL uses the data type to determine the
333
 * size of the object being returned.  This type is hard coded in the
334
 * spec document (changing from uint32 to uint64 requires a spec change)
335
 * and there is no data format introspection so implementations have
336
 * to determine which size to use based on version numbers.   It also
337
 * doesn't support signed data.
338
 *
339
 * With CRAM 4.0 onwards the size and sign of the data is no longer stated
340
 * explicitly in the specification.  Instead EXTERNAL is replaced by three
341
 * new encodings, for bytes and signed / unsigned integers which used a
342
 * variable sized encoding.
343
 *
344
 * For simplicity we use the same encode and decode functions for
345
 * bytes (CRAM4) and external (CRAM3). Given we already had code to
346
 * replace codec + type into a function pointer it makes little
347
 * difference how we ended up at that function.  However we disallow
348
 * this codec to operate on integer data for CRAM4 onwards.
349
 */
350
int cram_external_decode_int(cram_slice *slice, cram_codec *c,
351
0
                             cram_block *in, char *out, int *out_size) {
352
0
    char *cp;
353
0
    cram_block *b;
354
355
    /* Find the external block */
356
0
    b = cram_get_block_by_id(slice, c->u.external.content_id);
357
0
    if (!b)
358
0
        return *out_size?-1:0;
359
360
0
    cp = (char *)b->data + b->idx;
361
    // E_INT and E_LONG are guaranteed single item queries
362
0
    int err = 0;
363
0
    *(int32_t *)out = c->vv->varint_get32(&cp, (char *)b->data + b->uncomp_size, &err);
364
0
    b->idx = cp - (char *)b->data;
365
0
    *out_size = 1;
366
367
0
    return err ? -1 : 0;
368
0
}
369
370
int cram_external_decode_long(cram_slice *slice, cram_codec *c,
371
0
                              cram_block *in, char *out, int *out_size) {
372
0
    char *cp;
373
0
    cram_block *b;
374
375
    /* Find the external block */
376
0
    b = cram_get_block_by_id(slice, c->u.external.content_id);
377
0
    if (!b)
378
0
        return *out_size?-1:0;
379
380
0
    cp = (char *)b->data + b->idx;
381
    // E_INT and E_LONG are guaranteed single item queries
382
0
    int err = 0;
383
0
    *(int64_t *)out = c->vv->varint_get64(&cp, (char *)b->data + b->uncomp_size, &err);
384
0
    b->idx = cp - (char *)b->data;
385
0
    *out_size = 1;
386
387
0
    return err ? -1 : 0;
388
0
}
389
390
int cram_external_decode_char(cram_slice *slice, cram_codec *c,
391
                              cram_block *in, char *out,
392
0
                              int *out_size) {
393
0
    char *cp;
394
0
    cram_block *b;
395
396
    /* Find the external block */
397
0
    b = cram_get_block_by_id(slice, c->u.external.content_id);
398
0
    if (!b)
399
0
        return *out_size?-1:0;
400
401
0
    cp = cram_extract_block(b, *out_size);
402
0
    if (!cp)
403
0
        return -1;
404
405
0
    if (out)
406
0
        memcpy(out, cp, *out_size);
407
0
    return 0;
408
0
}
409
410
static int cram_external_decode_block(cram_slice *slice, cram_codec *c,
411
                                      cram_block *in, char *out_,
412
0
                                      int *out_size) {
413
0
    char *cp;
414
0
    cram_block *out = (cram_block *)out_;
415
0
    cram_block *b = NULL;
416
417
    /* Find the external block */
418
0
    b = cram_get_block_by_id(slice, c->u.external.content_id);
419
0
    if (!b)
420
0
        return *out_size?-1:0;
421
422
0
    cp = cram_extract_block(b, *out_size);
423
0
    if (!cp)
424
0
        return -1;
425
426
0
    BLOCK_APPEND(out, cp, *out_size);
427
0
    return 0;
428
429
0
 block_err:
430
0
    return -1;
431
0
}
432
433
2.68k
/* Releases an EXTERNAL decoder.  A NULL codec is accepted and ignored. */
void cram_external_decode_free(cram_codec *c) {
    free(c);  /* free(NULL) is a no-op, so no guard is needed */
}
437
438
439
0
/*
 * Reports the uncompressed size of the external block this codec reads
 * from, or -1 if the slice has no block with the codec's content id.
 */
int cram_external_decode_size(cram_slice *slice, cram_codec *c) {
    cram_block *blk = cram_get_block_by_id(slice, c->u.external.content_id);

    return blk ? blk->uncomp_size : -1;
}
449
450
0
/*
 * Looks up the slice block whose id matches this codec's external content
 * id and returns whatever cram_get_block_by_id yields for it.
 */
cram_block *cram_external_get_block(cram_slice *slice, cram_codec *c) {
    cram_block *blk = cram_get_block_by_id(slice, c->u.external.content_id);
    return blk;
}
453
454
0
/*
 * Appends a textual description of this EXTERNAL codec to ks.
 * Returns 0 on success, -1 if the formatted append reports failure.
 */
int cram_external_describe(cram_codec *c, kstring_t *ks) {
    int rc = ksprintf(ks, "EXTERNAL(id=%d)", c->u.external.content_id);

    return rc < 0 ? -1 : 0;
}
458
459
/*
 * Builds an EXTERNAL decoder from its serialised parameters.
 *
 * data/size  the encoded parameter bytes (a single varint content id)
 * codec      encoding requested; for CRAM >= 4 only E_EXTERNAL is accepted
 * option     data type being decoded; selects the decode callback
 * version    CRAM version, used to distinguish v4+ from earlier formats
 * vv         varint function table used to parse the content id
 *
 * Returns a newly allocated codec, or NULL on allocation failure or when
 * the parameters are malformed (an error is logged in the latter case).
 */
cram_codec *cram_external_decode_init(cram_block_compression_hdr *hdr,
                                      char *data, int size,
                                      enum cram_encoding codec,
                                      enum cram_external_type option,
                                      int version, varint_vec *vv) {
    cram_codec *dec = NULL;
    char *cur = data;

    if (size < 1)
        goto malformed;

    dec = malloc(sizeof(*dec));
    if (!dec)
        return NULL;

    dec->codec = E_EXTERNAL;

    if (CRAM_MAJOR_VERS(version) < 4) {
        // CRAM 3 and earlier encodes integers as EXTERNAL.  We need to
        // use the option field to indicate the input data format so
        // we know which serialisation format to use.
        switch (option) {
        case E_INT:
            dec->decode = cram_external_decode_int;
            break;
        case E_LONG:
            dec->decode = cram_external_decode_long;
            break;
        case E_BYTE_ARRAY:
        case E_BYTE:
            dec->decode = cram_external_decode_char;
            break;
        default:
            dec->decode = cram_external_decode_block;
            break;
        }
    } else {
        // Version 4 does not permit integer data to be encoded as a
        // series of bytes.  This is used purely for bytes, either
        // singular or declared as arrays
        if (codec != E_EXTERNAL)
            goto malformed;

        switch (option) {
        case E_BYTE_ARRAY_BLOCK:
            dec->decode = cram_external_decode_block;
            break;
        case E_BYTE:
        case E_BYTE_ARRAY:
            dec->decode = cram_external_decode_char;
            break;
        default:
            goto malformed;
        }
    }

    dec->free      = cram_external_decode_free;
    dec->size      = cram_external_decode_size;
    dec->get_block = cram_external_get_block;
    dec->describe  = cram_external_describe;

    dec->u.external.content_id = vv->varint_get32(&cur, data+size, NULL);

    /* The content id must account for every parameter byte. */
    if (cur - data != size)
        goto malformed;

    dec->u.external.type = option;

    return dec;

 malformed:
    hts_log_error("Malformed external header stream");
    free(dec);
    return NULL;
}
522
523
/*
 * Writes the unsigned 32-bit value found at 'in' to the codec's output
 * block via varint_put32_blk.  Returns 0 on success, -1 on failure.
 */
int cram_external_encode_int(cram_slice *slice, cram_codec *c,
                             char *in, int in_size) {
    uint32_t *src = (uint32_t *)in;

    if (c->vv->varint_put32_blk(c->out, *src) < 0)
        return -1;
    return 0;
}
528
529
/*
 * Writes the signed 32-bit value found at 'in' to the codec's output
 * block via varint_put32s_blk.  Returns 0 on success, -1 on failure.
 */
int cram_external_encode_sint(cram_slice *slice, cram_codec *c,
                             char *in, int in_size) {
    int32_t *src = (int32_t *)in;

    if (c->vv->varint_put32s_blk(c->out, *src) < 0)
        return -1;
    return 0;
}
534
535
/*
 * Writes the unsigned 64-bit value found at 'in' to the codec's output
 * block via varint_put64_blk.  Returns 0 on success, -1 on failure.
 */
int cram_external_encode_long(cram_slice *slice, cram_codec *c,
                             char *in, int in_size) {
    uint64_t *src = (uint64_t *)in;

    if (c->vv->varint_put64_blk(c->out, *src) < 0)
        return -1;
    return 0;
}
540
541
/*
 * Writes the signed 64-bit value found at 'in' to the codec's output
 * block via varint_put64s_blk.  Returns 0 on success, -1 on failure.
 */
int cram_external_encode_slong(cram_slice *slice, cram_codec *c,
                               char *in, int in_size) {
    int64_t *src = (int64_t *)in;

    if (c->vv->varint_put64s_blk(c->out, *src) < 0)
        return -1;
    return 0;
}
546
547
/*
 * Appends in_size raw bytes from 'in' to the codec's output block.
 * Returns 0 on success, -1 if the append fails.
 */
int cram_external_encode_char(cram_slice *slice, cram_codec *c,
                              char *in, int in_size) {
    /* BLOCK_APPEND jumps to block_err on failure. */
    BLOCK_APPEND(c->out, in, in_size);
    return 0;

 block_err:
    return -1;
}
555
556
337k
/* Releases an EXTERNAL encoder.  A NULL codec is accepted and ignored. */
void cram_external_encode_free(cram_codec *c) {
    free(c);  /* free(NULL) is a no-op, so no guard is needed */
}
561
562
/*
 * Serialises this EXTERNAL encoder's description into block b: the
 * optional prefix string, then the codec id, the parameter length and the
 * parameter bytes (the varint-encoded content id).
 *
 * Returns the number of bytes accounted for, or -1 on failure.
 */
int cram_external_encode_store(cram_codec *c, cram_block *b, char *prefix,
                               int version) {
    char params[99];
    int total = 0, status = 0, wrote;

    if (prefix) {
        size_t plen = strlen(prefix);
        BLOCK_APPEND(b, prefix, plen);
        total += plen;
    }

    /* Encode the parameters into a scratch buffer first so that their
     * length is known before they are written. */
    int params_len = c->vv->varint_put32(params, params + 99,
                                         c->u.e_external.content_id);

    wrote = c->vv->varint_put32_blk(b, c->codec);
    total += wrote;
    status |= wrote;

    wrote = c->vv->varint_put32_blk(b, params_len);
    total += wrote;
    status |= wrote;

    BLOCK_APPEND(b, params, params_len);
    total += params_len;

    /* A negative write result leaves the sign bit set in status. */
    if (status > 0)
        return total;

 block_err:
    return -1;
}
585
586
/*
 * Creates an EXTERNAL encoder.
 *
 * codec    encoding requested; for CRAM >= 4 only E_EXTERNAL is accepted
 * option   data type being encoded; selects the encode callback
 * dat      the content id, passed as an integer cast to a pointer
 * version  CRAM version, used to distinguish v4+ from earlier formats
 *
 * Returns a newly allocated codec, or NULL on allocation failure or when
 * the codec/option combination is not permitted for CRAM >= 4.  For
 * earlier versions an unsupported option aborts the program.
 */
cram_codec *cram_external_encode_init(cram_stats *st,
                                      enum cram_encoding codec,
                                      enum cram_external_type option,
                                      void *dat,
                                      int version, varint_vec *vv) {
    cram_codec *c;

    c = malloc(sizeof(*c));
    if (!c)
        return NULL;
    c->codec = E_EXTERNAL;
    c->free = cram_external_encode_free;
    if (CRAM_MAJOR_VERS(version) >= 4) {
        // Version 4 does not permit integer data to be encoded as a
        // series of bytes.  This is used purely for bytes, either
        // singular or declared as arrays
        switch (codec) {
        case E_EXTERNAL:
            if (option != E_BYTE && option != E_BYTE_ARRAY) {
                free(c);  // do not leak the codec on rejection
                return NULL;
            }
            c->encode = cram_external_encode_char;
            break;
        default:
            free(c);      // do not leak the codec on rejection
            return NULL;
        }
    } else {
        // CRAM 3 and earlier encodes integers as EXTERNAL.  We need to
        // use the option field to indicate the input data format so
        // we know which serialisation format to use.
        if (option == E_INT)
            c->encode = cram_external_encode_int;
        else if (option == E_LONG)
            c->encode = cram_external_encode_long;
        else if (option == E_BYTE_ARRAY || option == E_BYTE)
            c->encode = cram_external_encode_char;
        else
            abort();
    }
    c->store = cram_external_encode_store;
    c->flush = NULL;

    c->u.e_external.content_id = (size_t)dat;

    return c;
}
631
632
/*
633
 * ---------------------------------------------------------------------------
634
 * VARINT
635
 *
636
 * In CRAM 3.0 and earlier, E_EXTERNAL stored both integers in ITF8
637
 * format as well as bytes.  In CRAM 4 EXTERNAL is only for bytes and
638
 * byte arrays, with two dedicated encodings for integers:
639
 * VARINT_SIGNED and VARINT_UNSIGNED.  These also differ a little to
640
 * EXTERNAL with the addition of an offset field, meaning we can store
641
 * values in, say, the range -2 to 1 million without needing to use
642
 * a signed zig-zag transformation.
643
 */
644
int cram_varint_decode_int(cram_slice *slice, cram_codec *c,
645
0
                           cram_block *in, char *out, int *out_size) {
646
0
    char *cp;
647
0
    cram_block *b;
648
649
    /* Find the data block */
650
0
    b = cram_get_block_by_id(slice, c->u.varint.content_id);
651
0
    if (!b)
652
0
        return *out_size?-1:0;
653
654
0
    cp = (char *)b->data + b->idx;
655
    // E_INT and E_LONG are guaranteed single item queries
656
0
    int err = 0;
657
0
    *(int32_t *)out = c->vv->varint_get32(&cp,
658
0
                                          (char *)b->data + b->uncomp_size,
659
0
                                          &err) + c->u.varint.offset;
660
0
    b->idx = cp - (char *)b->data;
661
0
    *out_size = 1;
662
663
0
    return err ? -1 : 0;
664
0
}
665
666
int cram_varint_decode_sint(cram_slice *slice, cram_codec *c,
667
0
                            cram_block *in, char *out, int *out_size) {
668
0
    char *cp;
669
0
    cram_block *b;
670
671
    /* Find the data block */
672
0
    b = cram_get_block_by_id(slice, c->u.varint.content_id);
673
0
    if (!b)
674
0
        return *out_size?-1:0;
675
676
0
    cp = (char *)b->data + b->idx;
677
    // E_INT and E_LONG are guaranteed single item queries
678
0
    int err = 0;
679
0
    *(int32_t *)out = c->vv->varint_get32s(&cp,
680
0
                                           (char *)b->data + b->uncomp_size,
681
0
                                           &err) + c->u.varint.offset;
682
0
    b->idx = cp - (char *)b->data;
683
0
    *out_size = 1;
684
685
0
    return err ? -1 : 0;
686
0
}
687
688
int cram_varint_decode_long(cram_slice *slice, cram_codec *c,
689
0
                            cram_block *in, char *out, int *out_size) {
690
0
    char *cp;
691
0
    cram_block *b;
692
693
    /* Find the data block */
694
0
    b = cram_get_block_by_id(slice, c->u.varint.content_id);
695
0
    if (!b)
696
0
        return *out_size?-1:0;
697
698
0
    cp = (char *)b->data + b->idx;
699
    // E_INT and E_LONG are guaranteed single item queries
700
0
    int err = 0;
701
0
    *(int64_t *)out = c->vv->varint_get64(&cp,
702
0
                                          (char *)b->data + b->uncomp_size,
703
0
                                          &err) + c->u.varint.offset;
704
0
    b->idx = cp - (char *)b->data;
705
0
    *out_size = 1;
706
707
0
    return err ? -1 : 0;
708
0
}
709
710
int cram_varint_decode_slong(cram_slice *slice, cram_codec *c,
711
0
                             cram_block *in, char *out, int *out_size) {
712
0
    char *cp;
713
0
    cram_block *b;
714
715
    /* Find the data block */
716
0
    b = cram_get_block_by_id(slice, c->u.varint.content_id);
717
0
    if (!b)
718
0
        return *out_size?-1:0;
719
720
0
    cp = (char *)b->data + b->idx;
721
    // E_INT and E_LONG are guaranteed single item queries
722
0
    int err = 0;
723
0
    *(int64_t *)out = c->vv->varint_get64s(&cp,
724
0
                                           (char *)b->data + b->uncomp_size,
725
0
                                           &err) + c->u.varint.offset;
726
0
    b->idx = cp - (char *)b->data;
727
0
    *out_size = 1;
728
729
0
    return err ? -1 : 0;
730
0
}
731
732
813
/* Releases a VARINT decoder.  A NULL codec is accepted and ignored. */
void cram_varint_decode_free(cram_codec *c) {
    free(c);  /* free(NULL) is a no-op, so no guard is needed */
}
736
737
0
/*
 * Reports the uncompressed size of the block this VARINT codec reads
 * from, or -1 if the slice has no block with the codec's content id.
 */
int cram_varint_decode_size(cram_slice *slice, cram_codec *c) {
    cram_block *blk = cram_get_block_by_id(slice, c->u.varint.content_id);

    return blk ? blk->uncomp_size : -1;
}
747
748
0
/*
 * Looks up the slice block whose id matches this codec's varint content
 * id and returns whatever cram_get_block_by_id yields for it.
 */
cram_block *cram_varint_get_block(cram_slice *slice, cram_codec *c) {
    cram_block *blk = cram_get_block_by_id(slice, c->u.varint.content_id);
    return blk;
}
751
752
0
/*
 * Appends a textual description of this VARINT codec (content id, offset
 * and type) to ks.  Returns 0 on success, -1 if the formatted append
 * reports failure.
 */
int cram_varint_describe(cram_codec *c, kstring_t *ks) {
    int rc = ksprintf(ks, "VARINT(id=%d,offset=%"PRId64",type=%d)",
                      c->u.varint.content_id,
                      c->u.varint.offset,
                      c->u.varint.type);

    return rc < 0 ? -1 : 0;
}
759
760
/*
 * Builds a VARINT decoder from its serialised parameters.
 *
 * data/size  the encoded parameter bytes: a varint content id followed by
 *            a signed 64-bit varint offset
 * codec      E_VARINT_UNSIGNED or E_VARINT_SIGNED; anything else is rejected
 * option     E_INT selects the 32-bit decode callbacks, any other value the
 *            64-bit ones
 * vv         varint function table used to parse the parameters
 *
 * Returns a newly allocated codec, or NULL on allocation failure, an
 * unsupported codec, or malformed parameters (an error is logged for the
 * latter).
 */
cram_codec *cram_varint_decode_init(cram_block_compression_hdr *hdr,
                                    char *data, int size,
                                    enum cram_encoding codec,
                                    enum cram_external_type option,
                                    int version, varint_vec *vv) {
    cram_codec *c;
    char *cp = data, *cp_end = data+size;
    int err_id = 0, err_off = 0;

    if (!(c = malloc(sizeof(*c))))
        return NULL;

    c->codec  = codec;

    // Function pointer choice is theoretically by codec type.
    // Given we have some vars as int32 and some as int64 we
    // use option too for sizing, although on disk format
    // does not change.
    switch(codec) {
    case E_VARINT_UNSIGNED:
        c->decode = (option == E_INT)
            ? cram_varint_decode_int
            : cram_varint_decode_long;
        break;
    case E_VARINT_SIGNED:
        c->decode = (option == E_INT)
            ? cram_varint_decode_sint
            : cram_varint_decode_slong;
        break;
    default:
        free(c);  // do not leak the codec on an unsupported encoding
        return NULL;
    }

    c->free   = cram_varint_decode_free;
    c->size   = cram_varint_decode_size;
    c->get_block = cram_varint_get_block;
    c->describe = cram_varint_describe;

    c->u.varint.content_id = vv->varint_get32 (&cp, cp_end, &err_id);
    c->u.varint.offset     = vv->varint_get64s(&cp, cp_end, &err_off);

    // Reject truncated input as well as trailing or missing bytes.
    if (err_id || err_off || cp - data != size) {
        hts_log_error("Malformed varint header stream");
        free(c);
        return NULL;
    }

    c->u.varint.type = option;

    return c;
}
810
811
/*
 * Reads an unsigned 32-bit value from 'in', subtracts the codec's offset
 * and writes the result to the output block via varint_put32_blk.
 * Returns 0 on success, -1 on failure.
 */
int cram_varint_encode_int(cram_slice *slice, cram_codec *c,
                           char *in, int in_size) {
    uint32_t *src = (uint32_t *)in;

    if (c->vv->varint_put32_blk(c->out, *src - c->u.varint.offset) < 0)
        return -1;
    return 0;
}
817
818
/*
 * Reads a signed 32-bit value from 'in', subtracts the codec's offset
 * and writes the result to the output block via varint_put32s_blk.
 * Returns 0 on success, -1 on failure.
 */
int cram_varint_encode_sint(cram_slice *slice, cram_codec *c,
                            char *in, int in_size) {
    int32_t *src = (int32_t *)in;

    if (c->vv->varint_put32s_blk(c->out, *src - c->u.varint.offset) < 0)
        return -1;
    return 0;
}
824
825
/*
 * Reads an unsigned 64-bit value from 'in', subtracts the codec's offset
 * and writes the result to the output block via varint_put64_blk.
 * Returns 0 on success, -1 on failure.
 */
int cram_varint_encode_long(cram_slice *slice, cram_codec *c,
                            char *in, int in_size) {
    uint64_t *src = (uint64_t *)in;

    if (c->vv->varint_put64_blk(c->out, *src - c->u.varint.offset) < 0)
        return -1;
    return 0;
}
831
832
/*
 * Reads a signed 64-bit value from 'in', subtracts the codec's offset
 * and writes the result to the output block via varint_put64s_blk.
 * Returns 0 on success, -1 on failure.
 */
int cram_varint_encode_slong(cram_slice *slice, cram_codec *c,
                             char *in, int in_size) {
    int64_t *src = (int64_t *)in;

    if (c->vv->varint_put64s_blk(c->out, *src - c->u.varint.offset) < 0)
        return -1;
    return 0;
}
838
839
0
/* Releases a VARINT encoder.  A NULL codec is accepted and ignored. */
void cram_varint_encode_free(cram_codec *c) {
    free(c);  /* free(NULL) is a no-op, so no guard is needed */
}
844
845
/*
 * Serialises this VARINT encoder's description into block b: the optional
 * prefix string, then the codec id, the parameter length and the parameter
 * bytes (varint content id followed by the signed varint offset).
 *
 * Returns the number of bytes accounted for, or -1 if any write fails.
 */
int cram_varint_encode_store(cram_codec *c, cram_block *b, char *prefix,
                             int version) {
    char tmp[99], *tp = tmp, *tpend = tmp + sizeof(tmp);
    int len = 0, n;

    if (prefix) {
        size_t l = strlen(prefix);
        BLOCK_APPEND(b, prefix, l);
        len += l;
    }

    /* Build the parameters in a bounded scratch buffer so their length is
     * known before they are written. */
    tp += c->vv->varint_put32 (tp, tpend, c->u.e_varint.content_id);
    tp += c->vv->varint_put64s(tp, tpend, c->u.e_varint.offset);

    /* A negative result signals failure; do not fold it into len. */
    if ((n = c->vv->varint_put32_blk(b, c->codec)) < 0)
        goto block_err;
    len += n;

    if ((n = c->vv->varint_put32_blk(b, tp-tmp)) < 0)
        goto block_err;
    len += n;

    BLOCK_APPEND(b, tmp, tp-tmp);
    len += tp-tmp;

    return len;

 block_err:
    return -1;
}
868
869
/*
 * Creates a VARINT encoder.
 *
 * st       optional value statistics; when present, min_val/max_val may
 *          select a non-zero offset and, for small negative minima, force
 *          the unsigned codec
 * codec    E_VARINT_UNSIGNED or E_VARINT_SIGNED; anything else is rejected
 * option   E_INT selects the 32-bit encode callbacks, any other value the
 *          64-bit ones
 * dat      the content id, passed as an integer cast to a pointer
 *
 * Returns a newly allocated codec, or NULL on allocation failure or an
 * unsupported codec.
 */
cram_codec *cram_varint_encode_init(cram_stats *st,
                                    enum cram_encoding codec,
                                    enum cram_external_type option,
                                    void *dat,
                                    int version, varint_vec *vv) {
    cram_codec *c;

    if (!(c = malloc(sizeof(*c))))
        return NULL;

    c->u.e_varint.offset = 0;
    if (st) {
        // Marginal difference so far! Not worth the hassle?
        // NOTE(review): the encoders subtract this offset and the decoders
        // add it back; confirm the sign chosen here gives the intended
        // (non-negative, smaller) stored values.
        if (st->min_val < 0 && st->min_val >= -127
            && st->max_val / -st->min_val > 100) {
            c->u.e_varint.offset = -st->min_val;
            codec = E_VARINT_UNSIGNED;
        } else if (st->min_val > 0) {
            c->u.e_varint.offset = -st->min_val;
        }
    }

    c->codec = codec;
    c->free = cram_varint_encode_free;

    // Function pointer choice is theoretically by codec type.
    // Given we have some vars as int32 and some as int64 we
    // use option too for sizing, although on disk format
    // does not change.
    switch (codec) {
    case E_VARINT_UNSIGNED:
        c->encode = (option == E_INT)
            ? cram_varint_encode_int
            : cram_varint_encode_long;
        break;
    case E_VARINT_SIGNED:
        c->encode = (option == E_INT)
            ? cram_varint_encode_sint
            : cram_varint_encode_slong;
        break;
    default:
        free(c);  // do not leak the codec on an unsupported encoding
        return NULL;
    }
    c->store = cram_varint_encode_store;
    c->flush = NULL;

    c->u.e_varint.content_id = (size_t)dat;

    return c;
}
919
/*
920
 * ---------------------------------------------------------------------------
921
 * CONST_BYTE and CONST_INT
922
 */
923
int cram_const_decode_byte(cram_slice *slice, cram_codec *c,
924
0
                           cram_block *in, char *out, int *out_size) {
925
0
    int i, n;
926
927
0
    for (i = 0, n = *out_size; i < n; i++)
928
0
        out[i] = c->u.xconst.val;
929
930
0
    return 0;
931
0
}
932
933
int cram_const_decode_int(cram_slice *slice, cram_codec *c,
934
0
                          cram_block *in, char *out, int *out_size) {
935
0
    int32_t *out_i = (int32_t *)out;
936
0
    int i, n;
937
938
0
    for (i = 0, n = *out_size; i < n; i++)
939
0
        out_i[i] = c->u.xconst.val;
940
941
0
    return 0;
942
0
}
943
944
int cram_const_decode_long(cram_slice *slice, cram_codec *c,
945
0
                           cram_block *in, char *out, int *out_size) {
946
0
    int64_t *out_i = (int64_t *)out;
947
0
    int i, n;
948
949
0
    for (i = 0, n = *out_size; i < n; i++)
950
0
        out_i[i] = c->u.xconst.val;
951
952
0
    return 0;
953
0
}
954
955
534
/*
 * Releases a CONST codec.  A NULL codec is accepted.
 */
void cram_const_decode_free(cram_codec *c) {
    free(c); // free(NULL) is a no-op
}
959
960
0
/*
 * Size callback for the CONST decoder.
 *
 * Returns 0 always; neither argument is inspected.
 */
int cram_const_decode_size(cram_slice *slice, cram_codec *c) {
    (void) slice;
    (void) c;

    return 0;
}
963
964
0
/*
 * Appends a textual description of a CONST codec to ks.
 *
 * Returns 0 on success,
 *        -1 if ksprintf reported failure.
 */
int cram_const_describe(cram_codec *c, kstring_t *ks) {
    int r = ksprintf(ks, "CONST(val=%"PRId64")", c->u.xconst.val);

    if (r < 0)
        return -1;

    return 0;
}
968
969
/*
 * Builds a CONST decoder from its serialised parameters.
 *
 * data/size  The encoded parameters: one signed 64-bit varint holding the
 *            constant.  It must occupy exactly size bytes.
 * codec      E_CONST_BYTE selects byte output; otherwise option decides
 *            between 32-bit (E_INT) and 64-bit output.
 *
 * Returns the new codec on success,
 *         NULL on allocation failure or a malformed parameter stream.
 */
cram_codec *cram_const_decode_init(cram_block_compression_hdr *hdr,
                                   char *data, int size,
                                   enum cram_encoding codec,
                                   enum cram_external_type option,
                                   int version, varint_vec *vv) {
    char *cursor = data;
    cram_codec *dec = malloc(sizeof(*dec));

    if (dec == NULL)
        return NULL;

    dec->codec = codec;
    dec->decode = codec == E_CONST_BYTE ? cram_const_decode_byte
                : option == E_INT       ? cram_const_decode_int
                :                         cram_const_decode_long;
    dec->free      = cram_const_decode_free;
    dec->size      = cram_const_decode_size;
    dec->get_block = NULL;
    dec->describe  = cram_const_describe;

    dec->u.xconst.val = vv->varint_get64s(&cursor, data+size, NULL);

    // The varint must consume the parameter buffer exactly.
    if (cursor - data == size)
        return dec;

    fprintf(stderr, "Malformed const header stream\n");
    free(dec);
    return NULL;
}
1002
1003
/*
 * CONST encoder: a no-op, as the value is recorded by the store function
 * rather than per item.
 *
 * Returns 0 always.
 */
int cram_const_encode(cram_slice *slice, cram_codec *c,
                      char *in, int in_size) {
    (void) slice;
    (void) c;
    (void) in;
    (void) in_size;

    return 0;
}
1007
1008
/*
 * Serialises a CONST codec description into block b.
 *
 * Written in order: the optional prefix string, the codec id, the byte
 * length of the parameters, then the constant as a signed 64-bit varint.
 *
 * Returns the number of bytes appended on success,
 *        -1 on failure.
 */
int cram_const_encode_store(cram_codec *c, cram_block *b, char *prefix,
                            int version) {
    char tmp[99], *tp = tmp;
    int len = 0, r = 0, n;

    if (prefix) {
        size_t l = strlen(prefix);
        BLOCK_APPEND(b, prefix, l);
        len += l;
    }

    tp += c->vv->varint_put64s(tp, NULL, c->u.xconst.val);

    // Accumulate the varint writer results in r so that a failed write is
    // reported, using the same idiom as cram_beta_encode_store.
    len += (n = c->vv->varint_put32_blk(b, c->codec)); r |= n;
    len += (n = c->vv->varint_put32_blk(b, tp-tmp));   r |= n;
    BLOCK_APPEND(b, tmp, tp-tmp);
    len += tp-tmp;

    if (r > 0)
        return len;

 block_err:
    return -1;
}
1030
1031
/*
 * Allocates a CONST encoder whose constant is st->min_val.
 *
 * Returns the new codec on success,
 *         NULL on allocation failure.
 */
cram_codec *cram_const_encode_init(cram_stats *st,
                                   enum cram_encoding codec,
                                   enum cram_external_type option,
                                   void *dat,
                                   int version, varint_vec *vv) {
    cram_codec *enc = malloc(sizeof(*enc));

    if (enc == NULL)
        return NULL;

    enc->codec  = codec;
    enc->free   = cram_const_decode_free; // same as decode
    enc->encode = cram_const_encode;      // a nop
    enc->store  = cram_const_encode_store;
    enc->flush  = NULL;
    enc->u.e_xconst.val = st->min_val;

    return enc;
}
1050
1051
/*
1052
 * ---------------------------------------------------------------------------
1053
 * BETA
1054
 */
1055
0
/*
 * BETA decoder, 64-bit output.  Each of the *out_size values is read as
 * nbits bits from the input block and has the codec offset subtracted.
 * With nbits == 0 nothing is read and every value is -offset.
 *
 * Returns 0 on success,
 *        -1 if the input block holds too few bits.
 */
int cram_beta_decode_long(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
    int64_t *dst = (int64_t *)out;
    const int count = *out_size;
    int k;

    if (!c->u.beta.nbits) {
        for (k = 0; k < count; k++)
            dst[k] = -c->u.beta.offset;
        return 0;
    }

    if (cram_not_enough_bits(in, c->u.beta.nbits * count))
        return -1;

    for (k = 0; k < count; k++)
        dst[k] = get_bits_MSB(in, c->u.beta.nbits) - c->u.beta.offset;

    return 0;
}
1072
1073
0
/*
 * BETA decoder, 32-bit output.  Each of the *out_size values is read as
 * nbits bits from the input block and has the codec offset subtracted.
 * With nbits == 0 nothing is read and every value is -offset.
 *
 * Returns 0 on success,
 *        -1 if the input block holds too few bits.
 */
int cram_beta_decode_int(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
    int32_t *dst = (int32_t *)out;
    const int count = *out_size;
    int k;

    if (!c->u.beta.nbits) {
        for (k = 0; k < count; k++)
            dst[k] = -c->u.beta.offset;
        return 0;
    }

    if (cram_not_enough_bits(in, c->u.beta.nbits * count))
        return -1;

    for (k = 0; k < count; k++)
        dst[k] = get_bits_MSB(in, c->u.beta.nbits) - c->u.beta.offset;

    return 0;
}
1090
1091
0
/*
 * BETA decoder, byte output.  As the int/long variants, except that out
 * may be NULL: the bits are then consumed from the input block and
 * discarded.
 *
 * Returns 0 on success,
 *        -1 if the input block holds too few bits.
 */
int cram_beta_decode_char(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
    const int count = *out_size;
    int k;

    if (!c->u.beta.nbits) {
        // Zero-width values: nothing to read from the input.
        if (out) {
            for (k = 0; k < count; k++)
                out[k] = -c->u.beta.offset;
        }
        return 0;
    }

    if (cram_not_enough_bits(in, c->u.beta.nbits * count))
        return -1;

    if (!out) {
        // No destination; still advance through the input.
        for (k = 0; k < count; k++)
            get_bits_MSB(in, c->u.beta.nbits);
        return 0;
    }

    for (k = 0; k < count; k++)
        out[k] = get_bits_MSB(in, c->u.beta.nbits) - c->u.beta.offset;

    return 0;
}
1113
1114
96
/*
 * Releases a BETA decoder.  A NULL codec is accepted.
 */
void cram_beta_decode_free(cram_codec *c) {
    free(c); // free(NULL) is a no-op
}
1118
1119
0
/*
 * Appends a textual description of a BETA codec to ks.
 *
 * Returns 0 on success,
 *        -1 if ksprintf reported failure.
 */
int cram_beta_describe(cram_codec *c, kstring_t *ks) {
    int r = ksprintf(ks, "BETA(offset=%d, nbits=%d)",
                     c->u.beta.offset, c->u.beta.nbits);

    if (r < 0)
        return -1;

    return 0;
}
1124
1125
/*
 * Builds a BETA decoder from its serialised parameters.
 *
 * data/size  Two varints, offset then nbits, which together must occupy
 *            exactly size bytes.
 * option     Selects the output width: E_INT/E_SINT, E_LONG/E_SLONG or
 *            E_BYTE_ARRAY/E_BYTE.  Anything else is rejected.
 *
 * Returns the new codec on success,
 *         NULL on allocation failure, an unsupported option or malformed
 *         parameters (including nbits outside 0..8*sizeof(int)).
 */
cram_codec *cram_beta_decode_init(cram_block_compression_hdr *hdr,
                                  char *data, int size,
                                  enum cram_encoding codec,
                                  enum cram_external_type option,
                                  int version, varint_vec *vv) {
    char *cursor = data;
    cram_codec *dec = malloc(sizeof(*dec));

    if (dec == NULL)
        return NULL;

    dec->codec = E_BETA;
    switch (option) {
    case E_INT:
    case E_SINT:
        dec->decode = cram_beta_decode_int;
        break;

    case E_LONG:
    case E_SLONG:
        dec->decode = cram_beta_decode_long;
        break;

    case E_BYTE_ARRAY:
    case E_BYTE:
        dec->decode = cram_beta_decode_char;
        break;

    default:
        hts_log_error("BYTE_ARRAYs not supported by this codec");
        free(dec);
        return NULL;
    }
    dec->free     = cram_beta_decode_free;
    dec->describe = cram_beta_describe;

    // nbits starts out invalid so that a stream ending after the offset
    // is caught by the range test below.
    dec->u.beta.nbits  = -1;
    dec->u.beta.offset = vv->varint_get32(&cursor, data + size, NULL);
    if (cursor < data + size)
        dec->u.beta.nbits = vv->varint_get32(&cursor, data + size, NULL);

    if (cursor - data != size
        || dec->u.beta.nbits < 0 || dec->u.beta.nbits > 8 * sizeof(int)) {
        hts_log_error("Malformed beta header stream");
        free(dec);
        return NULL;
    }

    return dec;
}
1165
1166
/*
 * Serialises a BETA codec description into block b.
 *
 * Written in order: the optional prefix string, the codec id, the byte
 * length of the parameters, then the offset and nbits varints.
 *
 * Returns the number of bytes appended on success,
 *        -1 on failure.
 */
int cram_beta_encode_store(cram_codec *c, cram_block *b,
                           char *prefix, int version) {
    int total = 0, status = 0, wrote;

    if (prefix) {
        size_t plen = strlen(prefix);
        BLOCK_APPEND(b, prefix, plen);
        total += plen;
    }

    wrote = c->vv->varint_put32_blk(b, c->codec);
    total += wrote;
    status |= wrote;

    // codec length
    wrote = c->vv->varint_put32_blk(b, c->vv->varint_size(c->u.e_beta.offset)
                                       + c->vv->varint_size(c->u.e_beta.nbits));
    total += wrote;
    status |= wrote;

    wrote = c->vv->varint_put32_blk(b, c->u.e_beta.offset);
    total += wrote;
    status |= wrote;

    wrote = c->vv->varint_put32_blk(b, c->u.e_beta.nbits);
    total += wrote;
    status |= wrote;

    if (status <= 0)
        goto block_err;

    return total;

 block_err:
    return -1;
}
1189
1190
/*
 * BETA encoder, 64-bit input: stores each of the in_size values plus the
 * codec offset as nbits bits in c->out.
 *
 * Returns the bitwise OR of the store_bits_MSB results (0 if all were 0).
 */
int cram_beta_encode_long(cram_slice *slice, cram_codec *c,
                          char *in, int in_size) {
    const int64_t *vals = (int64_t *)in;
    int status = 0;
    int k = 0;

    while (k < in_size) {
        status |= store_bits_MSB(c->out, vals[k] + c->u.e_beta.offset,
                                 c->u.e_beta.nbits);
        k++;
    }

    return status;
}
1201
1202
/*
 * BETA encoder, int input: stores each of the in_size values plus the
 * codec offset as nbits bits in c->out.
 *
 * Returns the bitwise OR of the store_bits_MSB results (0 if all were 0).
 */
int cram_beta_encode_int(cram_slice *slice, cram_codec *c,
                         char *in, int in_size) {
    const int *vals = (int *)in;
    int status = 0;
    int k = 0;

    while (k < in_size) {
        status |= store_bits_MSB(c->out, vals[k] + c->u.e_beta.offset,
                                 c->u.e_beta.nbits);
        k++;
    }

    return status;
}
1213
1214
/*
 * BETA encoder, byte input: stores each of the in_size bytes (read as
 * unsigned) plus the codec offset as nbits bits in c->out.
 *
 * Returns the bitwise OR of the store_bits_MSB results (0 if all were 0).
 */
int cram_beta_encode_char(cram_slice *slice, cram_codec *c,
                          char *in, int in_size) {
    const unsigned char *vals = (unsigned char *)in;
    int status = 0;
    int k = 0;

    while (k < in_size) {
        status |= store_bits_MSB(c->out, vals[k] + c->u.e_beta.offset,
                                 c->u.e_beta.nbits);
        k++;
    }

    return status;
}
1225
1226
1.07k
/*
 * Releases a BETA encoder.  A NULL codec is accepted.
 */
void cram_beta_encode_free(cram_codec *c) {
    free(c); // free(NULL) is a no-op
}
1229
1230
/*
 * Allocates a BETA encoder sized for the range of values to be stored.
 *
 * st      Value statistics; only consulted when dat is NULL, in which case
 *         the minimum and maximum are found by scanning st->freqs and,
 *         if present, the st->h hash.
 * option  Selects the input width (int, long or byte) and the range
 *         checks applied for E_INT and E_SINT.
 * dat     When non-NULL, a pair of hts_pos_t values: { min, max }.
 *
 * The offset is set to -min and nbits to the bit length of (max - min).
 *
 * Returns the new codec on success,
 *         NULL on allocation failure, an empty range (max < min) or a
 *         range that fails the check for the requested type.
 */
cram_codec *cram_beta_encode_init(cram_stats *st,
                                  enum cram_encoding codec,
                                  enum cram_external_type option,
                                  void *dat,
                                  int version, varint_vec *vv) {
    hts_pos_t min_val, max_val;
    int64_t range;
    int nbits;
    cram_codec *enc = malloc(sizeof(*enc));

    if (!enc)
        return NULL;

    enc->codec = E_BETA;
    enc->free  = cram_beta_encode_free;
    switch (option) {
    case E_INT:
    case E_SINT:
        enc->encode = cram_beta_encode_int;
        break;

    case E_LONG:
    case E_SLONG:
        enc->encode = cram_beta_encode_long;
        break;

    default:
        enc->encode = cram_beta_encode_char;
        break;
    }
    enc->store = cram_beta_encode_store;
    enc->flush = NULL;

    if (dat) {
        const hts_pos_t *bounds = (hts_pos_t *)dat;

        min_val = bounds[0];
        max_val = bounds[1];
    } else {
        int sym;

        min_val = INT_MAX;
        max_val = INT_MIN;

        // Symbols counted in the fixed-size frequency table
        for (sym = 0; sym < MAX_STAT_VAL; sym++) {
            if (st->freqs[sym]) {
                if (min_val > sym)
                    min_val = sym;
                max_val = sym;
            }
        }

        // Symbols held in the hash
        if (st->h) {
            khint_t k;

            for (k = kh_begin(st->h); k != kh_end(st->h); k++) {
                if (kh_exist(st->h, k)) {
                    sym = kh_key(st->h, k);
                    if (min_val > sym)
                        min_val = sym;
                    if (max_val < sym)
                        max_val = sym;
                }
            }
        }
    }

    if (max_val < min_val)
        goto err;

    range = (int64_t) max_val - min_val;
    if (option == E_SINT) {
        if (min_val < INT_MIN || range > INT_MAX)
            goto err;
    } else if (option == E_INT) {
        if (max_val > UINT_MAX || range > UINT_MAX)
            goto err;
    }

    enc->u.e_beta.offset = -min_val;

    // Number of bits needed to represent the range
    for (nbits = 0; range; range >>= 1)
        nbits++;
    enc->u.e_beta.nbits = nbits;

    return enc;

 err:
    free(enc);
    return NULL;
}
1316
1317
/*
1318
 * ---------------------------------------------------------------------------
1319
 * XPACK: Packing multiple values into a single byte.  A fast transform that
1320
 * reduces time taken by entropy encoder and may also improve compression.
1321
 *
1322
 * This also has the additional requirement that the data series is not
1323
 * interleaved with another, permitting efficient encoding and decoding
1324
 * of all elements enmasse instead of needing to only extract the bits
1325
 * necessary per item.
1326
 */
1327
0
/*
 * XPACK decoder, 64-bit output.  Each of the *out_size values is read as
 * nbits bits from the input block and translated through the reverse map.
 * With nbits == 0 nothing is read and every value is rmap[0].
 *
 * Returns 0 on success,
 *        -1 if the input block holds too few bits.
 */
int cram_xpack_decode_long(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
    int64_t *out_i = (int64_t *)out;
    int i, n = *out_size;

    if (c->u.xpack.nbits) {
        // Check the input holds enough bits before reading, exactly as
        // cram_xpack_decode_int does.
        if (cram_not_enough_bits(in, c->u.xpack.nbits * n))
            return -1;

        for (i = 0; i < n; i++)
            out_i[i] = c->u.xpack.rmap[get_bits_MSB(in, c->u.xpack.nbits)];
    } else {
        for (i = 0; i < n; i++)
            out_i[i] = c->u.xpack.rmap[0];
    }

    return 0;
}
1341
1342
0
/*
 * XPACK decoder, 32-bit output.  Each of the *out_size values is read as
 * nbits bits from the input block and translated through the reverse map.
 * With nbits == 0 nothing is read and every value is rmap[0].
 *
 * Returns 0 on success,
 *        -1 if the input block holds too few bits.
 */
int cram_xpack_decode_int(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
    int32_t *dst = (int32_t *)out;
    const int count = *out_size;
    int k;

    if (!c->u.xpack.nbits) {
        for (k = 0; k < count; k++)
            dst[k] = c->u.xpack.rmap[0];
        return 0;
    }

    if (cram_not_enough_bits(in, c->u.xpack.nbits * count))
        return -1;

    for (k = 0; k < count; k++)
        dst[k] = c->u.xpack.rmap[get_bits_MSB(in, c->u.xpack.nbits)];

    return 0;
}
1359
1360
0
/*
 * Unpacks the sub-codec's block into a per-slice block stored at
 * slice->block_by_id[512 + c->codec_id].  If that slot is already
 * populated nothing is done.
 *
 * Returns 0 on success (or if already expanded),
 *        -1 on failure; the slot is left NULL in that case.
 */
static int cram_xpack_decode_expand_char(cram_slice *slice, cram_codec *c) {
    cram_block *b = slice->block_by_id[512 + c->codec_id];
    if (b)
        return 0;

    // nbits is a divisor below; zero would be a division by zero.
    if (c->u.xpack.nbits <= 0)
        return -1;

    // Not every decoder provides get_block (the CONST decoder sets it
    // to NULL), so check before calling through the pointer.
    cram_codec *sub = c->u.xpack.sub_codec;
    if (!sub || !sub->get_block)
        return -1;

    // get sub-codec data.
    cram_block *sub_b = sub->get_block(slice, sub);
    if (!sub_b)
        return -1;

    // Guard the "* 8" below against overflowing int.
    if (sub_b->uncomp_size < 0 || sub_b->uncomp_size > INT_MAX / 8)
        return -1;

    // Allocate local block to expand into
    b = slice->block_by_id[512 + c->codec_id] = cram_new_block(0, 0);
    if (!b)
        return -1;
    int n = sub_b->uncomp_size * 8/c->u.xpack.nbits;
    BLOCK_GROW(b, n);
    b->uncomp_size = n;

    uint8_t p[256];
    int z;
    for (z = 0; z < 256; z++)
        p[z] = c->u.xpack.rmap[z];
    hts_unpack(sub_b->data, sub_b->uncomp_size, b->data, b->uncomp_size,
               8 / c->u.xpack.nbits, p);

    return 0;

 block_err:
    // Do not leave a half-initialised block cached in the slice; a later
    // call would otherwise mistake it for a completed expansion.
    cram_free_block(b);
    slice->block_by_id[512 + c->codec_id] = NULL;
    return -1;
}
1390
1391
0
/*
 * XPACK decoder, byte output.  With more than one mapped value the
 * sub-codec's block is expanded once per slice and *out_size bytes are
 * copied from it; otherwise the output is filled with rmap[0].
 * out may be NULL, in which case the data is skipped rather than copied.
 *
 * Returns 0 on success,
 *        -1 if the block cannot be expanded or has too few bytes left.
 */
int cram_xpack_decode_char(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
    // FIXME: we need to ban data-series interleaving in the spec for this to work.

    // Remember this may be called when threaded and multi-slice per container.
    // Hence one cram_codec instance, multiple slices, multiple blocks.
    // We therefore have to cache appropriate block info in slice and not codec.
    //    b = cram_get_block_by_id(slice, c->external.content_id);
    if (c->u.xpack.nval > 1) {
        if (cram_xpack_decode_expand_char(slice, c) < 0)
            return -1;

        cram_block *b = slice->block_by_id[512 + c->codec_id];
        if (!b)
            return -1;

        // Never read beyond the end of the expanded data.
        int64_t remaining = (int64_t)b->uncomp_size - (int64_t)b->byte;
        if (*out_size < 0 || remaining < *out_size)
            return -1;

        if (out)
            memcpy(out, b->data + b->byte, *out_size);
        b->byte += *out_size;
    } else {
        // A NULL out means "skip"; there is nothing to consume here.
        if (out)
            memset(out, c->u.xpack.rmap[0], *out_size);
    }

    return 0;
}
1413
1414
942
/*
 * Releases an XPACK decoder together with its sub-codec, if it has one.
 * A NULL codec is accepted.
 */
void cram_xpack_decode_free(cram_codec *c) {
    if (c == NULL)
        return;

    cram_codec *sub = c->u.xpack.sub_codec;
    if (sub != NULL)
        sub->free(sub);

    //free(slice->block_by_id[512 + c->codec_id]);
    //slice->block_by_id[512 + c->codec_id] = 0;

    free(c);
}
1425
1426
0
/*
 * Returns the uncompressed size of the expanded XPACK block for this
 * slice, expanding it first if necessary,
 * or -1 if the block could not be produced.
 *
 * NOTE(review): callers are assumed to treat a negative size as an
 * error - confirm against the users of cram_codec::size.
 */
int cram_xpack_decode_size(cram_slice *slice, cram_codec *c) {
    if (cram_xpack_decode_expand_char(slice, c) < 0)
        return -1;

    // Expansion leaves the slot NULL on failure; never dereference it.
    cram_block *b = slice->block_by_id[512 + c->codec_id];
    if (!b)
        return -1;

    return b->uncomp_size;
}
1430
1431
0
/*
 * Returns the expanded XPACK block cached for this slice, attempting the
 * expansion first.  The expansion result is not checked here; the slot
 * contents are returned as-is and may be NULL.
 */
cram_block *cram_xpack_get_block(cram_slice *slice, cram_codec *c) {
    const int slot = 512 + c->codec_id;

    (void) cram_xpack_decode_expand_char(slice, c);

    return slice->block_by_id[slot];
}
1435
1436
/*
 * Builds an XPACK decoder from its serialised parameters.
 *
 * data/size  Varints in order: nbits, nval, nval map entries (each below
 *            256), the sub-codec encoding, the sub-codec parameter size,
 *            then the sub-codec parameters themselves.  The whole must
 *            occupy exactly size bytes.
 * option     E_LONG, E_INT or E_BYTE_ARRAY/E_BYTE; also handed on to the
 *            sub-codec.
 *
 * Returns the new codec on success,
 *         NULL on allocation failure, an unsupported option, malformed
 *         parameters or failure to create the sub-codec.
 */
cram_codec *cram_xpack_decode_init(cram_block_compression_hdr *hdr,
                                   char *data, int size,
                                   enum cram_encoding codec,
                                   enum cram_external_type option,
                                   int version, varint_vec *vv) {
    char *cp = data;
    char *endp = data+size;
    cram_codec *c = calloc(1, sizeof(*c));

    if (c == NULL)
        return NULL;

    c->codec = E_XPACK;
    switch (option) {
    case E_LONG:
        c->decode = cram_xpack_decode_long;
        break;

    case E_INT:
        c->decode = cram_xpack_decode_int;
        break;

    case E_BYTE_ARRAY:
    case E_BYTE:
        c->decode = cram_xpack_decode_char;
        break;

    default:
        fprintf(stderr, "BYTE_ARRAYs not supported by this codec\n");
        goto malformed;
    }
    c->free      = cram_xpack_decode_free;
    c->size      = cram_xpack_decode_size;
    c->get_block = cram_xpack_get_block;
    c->describe  = NULL;

    c->u.xpack.nbits = vv->varint_get32(&cp, endp, NULL);
    c->u.xpack.nval  = vv->varint_get32(&cp, endp, NULL);
    if (c->u.xpack.nbits >= 8  || c->u.xpack.nbits < 0 ||
        c->u.xpack.nval  > 256 || c->u.xpack.nval < 0)
        goto malformed;

    int idx;
    for (idx = 0; idx < c->u.xpack.nval; idx++) {
        uint32_t sym = vv->varint_get32(&cp, endp, NULL);
        if (sym >= 256)
            goto malformed;
        c->u.xpack.rmap[idx] = sym; // reverse map: e.g 0-3 to P,A,C,K
    }

    int encoding = vv->varint_get32(&cp, endp, NULL);
    int sub_size = vv->varint_get32(&cp, endp, NULL);
    if (sub_size < 0 || endp - cp < sub_size)
        goto malformed;

    c->u.xpack.sub_codec = cram_decoder_init(hdr, encoding, cp, sub_size,
                                             option, version, vv);
    if (c->u.xpack.sub_codec == NULL)
        goto malformed;
    cp += sub_size;

    if (cp - data != size
        || c->u.xpack.nbits < 0 || c->u.xpack.nbits > 8 * sizeof(int64_t))
        goto malformed;

    return c;

 malformed:
    fprintf(stderr, "Malformed xpack header stream\n");
    cram_xpack_decode_free(c);
    return NULL;
}
1497
1498
0
/*
 * Packs the data buffered in c->out, passes the packed bytes to the
 * sub-codec's encode function and then flushes the sub-codec if it has a
 * flush function.
 *
 * Returns 0 (or the sub-codec's flush result) on success,
 *        -1 if packing or the sub-codec encode fails.
 */
int cram_xpack_encode_flush(cram_codec *c) {
    // Pack the buffered up data
    int meta_len;
    uint64_t out_len;
    uint8_t out_meta[1024];
    uint8_t *out = hts_pack(BLOCK_DATA(c->out), BLOCK_SIZE(c->out),
                            out_meta, &meta_len, &out_len);
    // NOTE(review): hts_pack is taken to return NULL on failure -
    // confirm against the htscodecs documentation.
    if (!out)
        return -1;

    int r = -1;

    // We now need to pass this through the next layer of transform
    if (c->u.e_xpack.sub_codec->encode(NULL, // also indicates flush incoming
                                       c->u.e_xpack.sub_codec,
                                       (char *)out, out_len) == 0) {
        r = 0;
        if (c->u.e_xpack.sub_codec->flush)
            r = c->u.e_xpack.sub_codec->flush(c->u.e_xpack.sub_codec);
    }

    // The packed buffer is released on the failure path too.
    free(out);
    return r;
}
1519
1520
/*
 * Serialises an XPACK codec description into block b.
 *
 * Written in order: the optional prefix string, the codec id, the byte
 * length of the parameters, nbits, nval, the nval reverse-map entries and
 * finally the sub-codec's own serialised description.
 *
 * Returns the number of bytes appended on success,
 *        -1 on failure.
 */
int cram_xpack_encode_store(cram_codec *c, cram_block *b,
                            char *prefix, int version) {
    int len = 0, r = 0, n;
    cram_block *tb = NULL; // temporary block; freed on every exit path

    if (prefix) {
        size_t l = strlen(prefix);
        BLOCK_APPEND(b, prefix, l);
        len += l;
    }

    // Store sub-codec
    cram_codec *tc = c->u.e_xpack.sub_codec;
    tb = cram_new_block(0, 0);
    if (!tb)
        return -1;
    int len2 = tc->store(tc, tb, NULL, version);
    if (len2 < 0)
        goto block_err; // sub-codec failed to serialise itself

    len += (n = c->vv->varint_put32_blk(b, c->codec)); r |= n;

    // codec length
    int len1 = 0, i;
    for (i = 0; i < c->u.e_xpack.nval; i++)
        len1 += (n = c->vv->varint_size(c->u.e_xpack.rmap[i])), r |= n;
    len += (n = c->vv->varint_put32_blk(b, c->vv->varint_size(c->u.e_xpack.nbits)
                                        +  c->vv->varint_size(c->u.e_xpack.nval)
                                        + len1 + len2)); r |= n;

    // The map and sub-codec
    len += (n = c->vv->varint_put32_blk(b, c->u.e_xpack.nbits)); r |= n;
    len += (n = c->vv->varint_put32_blk(b, c->u.e_xpack.nval));  r |= n;
    for (i = 0; i < c->u.e_xpack.nval; i++)
        len += (n = c->vv->varint_put32_blk(b, c->u.e_xpack.rmap[i])), r |= n;

    BLOCK_APPEND(b, BLOCK_DATA(tb), BLOCK_SIZE(tb));

    cram_free_block(tb);

    return r > 0 ? len + len2 : -1;

 block_err:
    if (tb)
        cram_free_block(tb);
    return -1;
}
1562
1563
// Same as cram_beta_encode_long
1564
/*
 * XPACK encoder, 64-bit input: looks each of the in_size values up in the
 * forward map and stores the mapped code as nbits bits in c->out.
 *
 * Returns the bitwise OR of the store_bits_MSB results (0 if all were 0).
 */
int cram_xpack_encode_long(cram_slice *slice, cram_codec *c,
                           char *in, int in_size) {
    const int64_t *vals = (int64_t *)in;
    int status = 0;
    int k = 0;

    while (k < in_size) {
        status |= store_bits_MSB(c->out, c->u.e_xpack.map[vals[k]],
                                 c->u.e_xpack.nbits);
        k++;
    }

    return status;
}
1574
1575
/*
 * XPACK encoder, int input: looks each of the in_size values up in the
 * forward map and stores the mapped code as nbits bits in c->out.
 *
 * Returns the bitwise OR of the store_bits_MSB results (0 if all were 0).
 */
int cram_xpack_encode_int(cram_slice *slice, cram_codec *c,
                          char *in, int in_size) {
    const int *vals = (int *)in;
    int status = 0;
    int k = 0;

    while (k < in_size) {
        status |= store_bits_MSB(c->out, c->u.e_xpack.map[vals[k]],
                                 c->u.e_xpack.nbits);
        k++;
    }

    return status;
}
1585
1586
/*
 * XPACK encoder, byte input: appends the in_size raw bytes to c->out.
 * No mapping is applied here; cram_xpack_encode_flush packs the buffer.
 *
 * Returns 0 on success,
 *        -1 if the append fails.
 */
int cram_xpack_encode_char(cram_slice *slice, cram_codec *c,
                           char *in, int in_size) {
    int status = -1;

    BLOCK_APPEND(c->out, in, in_size);
    status = 0;

 block_err:
    return status;
}
1594
1595
0
/*
 * Releases an XPACK encoder: its sub-codec (if it has one), its output
 * block and the codec itself.  A NULL codec is accepted.
 */
void cram_xpack_encode_free(cram_codec *c) {
    if (c == NULL)
        return;

    cram_codec *sub = c->u.e_xpack.sub_codec;
    if (sub != NULL)
        sub->free(sub);

    cram_free_block(c->out);
    free(c);
}
1605
1606
/*
 * Allocates an XPACK encoder.
 *
 * option  E_LONG or E_INT select the integer encode functions; anything
 *         else selects the byte encoder.
 * dat     A cram_xpack_encoder giving nbits, nval, the forward map and
 *         the sub-codec to create.
 *
 * The reverse map is derived from the forward map; the number of mapped
 * entries must equal e->nval.
 *
 * Returns the new codec on success,
 *         NULL on allocation failure, failure to create the sub-codec or
 *         an inconsistent map.
 */
cram_codec *cram_xpack_encode_init(cram_stats *st,
                                   enum cram_encoding codec,
                                   enum cram_external_type option,
                                   void *dat,
                                   int version, varint_vec *vv) {
    cram_codec *c;

    if (!(c = malloc(sizeof(*c))))
        return NULL;

    c->codec  = E_XPACK;
    c->free   = cram_xpack_encode_free;
    if (option == E_LONG)
        c->encode = cram_xpack_encode_long;
    else if (option == E_INT)
        c->encode = cram_xpack_encode_int;
    else
        c->encode = cram_xpack_encode_char;
    c->store  = cram_xpack_encode_store;
    c->flush  = cram_xpack_encode_flush;

    cram_xpack_encoder *e = (cram_xpack_encoder *)dat;
    c->u.e_xpack.nbits = e->nbits;
    c->u.e_xpack.nval = e->nval;
    c->u.e_xpack.sub_codec = cram_encoder_init(e->sub_encoding, NULL,
                                               E_BYTE_ARRAY, e->sub_codec_dat,
                                               version, vv);
    if (!c->u.e_xpack.sub_codec) {
        // The flush and store functions dereference the sub-codec, so an
        // encoder without one is unusable.
        free(c);
        return NULL;
    }

    // Initialise fwd and rev maps
    memcpy(c->u.e_xpack.map, e->map, sizeof(e->map)); // P,A,C,K to 0,1,2,3
    int i, n;
    for (i = n = 0; i < 256; i++)
        if (e->map[i] != -1)
            c->u.e_xpack.rmap[n++] = i;               // 0,1,2,3 to P,A,C,K
    if (n != e->nval) {
        fprintf(stderr, "Incorrectly specified number of map items in PACK\n");
        // c->out has not been set up yet, so cram_xpack_encode_free cannot
        // be used; release the two allocations individually.
        c->u.e_xpack.sub_codec->free(c->u.e_xpack.sub_codec);
        free(c);
        return NULL;
    }

    return c;
}
1647
1648
/*
1649
 * ---------------------------------------------------------------------------
1650
 * XDELTA: subtract successive values, zig-zag to turn +/- to + only,
1651
 * and then var-int encode the result.
1652
 *
1653
 * This also has the additional requirement that the data series is not
1654
 * interleaved with another, permitting efficient encoding and decoding
1655
 * of all elements enmasse instead of needing to only extract the bits
1656
 * necessary per item.
1657
 */
1658
1659
0
/*
 * Zig-zag transforms: map signed values to unsigned ones so that values of
 * small magnitude, positive or negative, become small unsigned numbers
 * (0, -1, 1, -2, ... => 0, 1, 2, 3, ...).
 *
 * The arithmetic is done on unsigned values.  Left-shifting a negative
 * signed value, or shifting a signed value into overflow, is undefined
 * behaviour in C, and right-shifting a negative value is
 * implementation-defined.
 */
static uint8_t  zigzag8 (int8_t  x) {
    uint32_t u = (uint32_t)x;
    return (uint8_t)((u << 1) ^ (0u - (u >> 31)));
}
static uint16_t zigzag16(int16_t x) {
    uint32_t u = (uint32_t)x;
    return (uint16_t)((u << 1) ^ (0u - (u >> 31)));
}
static uint32_t zigzag32(int32_t x) {
    uint32_t u = (uint32_t)x;
    return (u << 1) ^ (0u - (u >> 31));
}

/* Inverse transforms: recover the signed value from its zig-zag form. */
//static int8_t  unzigzag8 (uint8_t  x) { return (x >> 1) ^ -(x & 1); }
static int16_t unzigzag16(uint16_t x) { return (x >> 1) ^ -(x & 1); }
static int32_t unzigzag32(uint32_t x) { return (x >> 1) ^ -(x & 1); }
1666
1667
0
/*
 * XDELTA decoder, 64-bit output: not implemented.
 *
 * Returns -1 always.
 */
int cram_xdelta_decode_long(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
    (void) slice;
    (void) c;
    (void) in;
    (void) out;
    (void) out_size;

    return -1;
}
1670
1671
0
/*
 * XDELTA decoder, 32-bit output.  For each of the *out_size items one
 * value is decoded via the sub-codec, un-zig-zagged and added to the
 * running total held in the codec; the total is the value written out.
 *
 * Returns 0 on success,
 *        -1 if the sub-codec fails to decode.
 */
int cram_xdelta_decode_int(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
    // Slow value-by-value method for now
    uint32_t *out32 = (uint32_t *)out;
    // This is a decoder, so read the decoder's union member (u.xdelta), as
    // the init and free functions do, rather than the encoder's.
    cram_codec *sub = c->u.xdelta.sub_codec;
    int i;
    for (i = 0; i < *out_size; i++) {
        uint32_t v;
        int one = 1;
        if (sub->decode(slice, sub, in, (char *)&v, &one) < 0)
            return -1;
        uint32_t d = unzigzag32(v);
        c->u.xdelta.last = out32[i] = d + c->u.xdelta.last;
    }

    return 0;
}
1687
1688
0
/*
 * Block expansion for XDELTA: not implemented.
 *
 * Returns -1 always.
 */
static int cram_xdelta_decode_expand_char(cram_slice *slice, cram_codec *c) {
    (void) slice;
    (void) c;

    return -1;
}
1691
1692
0
/*
 * XDELTA decoder, byte output: not implemented.
 *
 * Returns -1 always.
 */
int cram_xdelta_decode_char(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
    (void) slice;
    (void) c;
    (void) in;
    (void) out;
    (void) out_size;

    return -1;
}
1695
1696
0
/*
 * Returns the 16-bit value whose in-memory bytes are those written by
 * i16_to_le() for i.
 */
static inline int16_t le_int2(int16_t i) {
    int16_t le;

    i16_to_le(i, (uint8_t *)&le);

    return le;
}
1701
1702
/*
 * XDELTA decoder producing a whole byte array into a cram_block.
 *
 * out_      Really a cram_block *, to which the decoded bytes are appended.
 * out_size  Number of bytes to produce.
 *
 * Varints are read from the sub-codec's block, un-zig-zagged and summed
 * into a running total that restarts at zero for every array.  Only a
 * word size of 2 is supported.  If *out_size is not a multiple of the
 * word size, the first word is emitted short.
 *
 * Returns 0 on success,
 *        -1 on failure.
 */
int cram_xdelta_decode_block(cram_slice *slice, cram_codec *c, cram_block *in,
                             char *out_, int *out_size) {
    cram_block *out = (cram_block *)out_;
    // This is a decoder, so read the decoder's union member (u.xdelta), as
    // the init and free functions do, rather than the encoder's.
    cram_codec *sub = c->u.xdelta.sub_codec;
    int i = 0;

    const int w = c->u.xdelta.word_size;
    // word_size comes from the file and is used as a divisor below.
    if (w <= 0) {
        fprintf(stderr, "Unsupported word size by XDELTA\n");
        return -1;
    }

    // Not every decoder provides get_block, and it may fail.
    if (!sub || !sub->get_block)
        return -1;
    cram_block *b = sub->get_block(slice, sub);
    if (!b)
        return -1;

    uint32_t npad = (w - *out_size%w)%w;
    uint32_t out_sz = *out_size + npad;
    c->u.xdelta.last = 0;  // reset for each new array

    for (i = 0; i < out_sz; i += w) {
        uint16_t v;
        // Need better interface
        char *cp = (char *)b->data + b->byte;
        char *cp_end = (char *)b->data + b->uncomp_size;
        int err = 0;
        v = c->vv->varint_get32(&cp, cp_end, &err);
        if (err)
            return -1;
        b->byte = cp - (char *)b->data;

        switch(w) {
        case 2: {
            int16_t d = unzigzag16(v), z;
            c->u.xdelta.last = d + c->u.xdelta.last;
            z = le_int2(c->u.xdelta.last);
            BLOCK_APPEND(out, &z, 2-npad);
            npad = 0;
            break;
        }
        default:
            fprintf(stderr, "Unsupported word size by XDELTA\n");
            return -1;
        }
    }

    return 0;

 block_err:
    return -1;
}
1744
1745
72
/*
 * Releases an XDELTA decoder together with its sub-codec, if it has one.
 * A NULL codec is accepted.
 */
void cram_xdelta_decode_free(cram_codec *c) {
    if (c == NULL)
        return;

    cram_codec *sub = c->u.xdelta.sub_codec;
    if (sub != NULL)
        sub->free(sub);

    free(c);
}
1753
1754
0
/*
 * Returns the uncompressed size of the XDELTA block cached for this
 * slice, or -1 if there is none.
 *
 * cram_xdelta_decode_expand_char is not implemented and always fails, so
 * its result is not used; a block is only found here if something else
 * populated the slot.
 *
 * NOTE(review): callers are assumed to treat a negative size as an
 * error - confirm against the users of cram_codec::size.
 */
int cram_xdelta_decode_size(cram_slice *slice, cram_codec *c) {
    cram_xdelta_decode_expand_char(slice, c);

    // The slot may be empty; never dereference a NULL block.
    cram_block *b = slice->block_by_id[512 + c->codec_id];
    if (!b)
        return -1;

    return b->uncomp_size;
}
1758
1759
0
/*
 * Returns the XDELTA block cached for this slice, attempting expansion
 * first.  The expansion result is not checked here; the slot contents are
 * returned as-is and may be NULL.
 */
cram_block *cram_xdelta_get_block(cram_slice *slice, cram_codec *c) {
    const int slot = 512 + c->codec_id;

    (void) cram_xdelta_decode_expand_char(slice, c);

    return slice->block_by_id[slot];
}
1763
1764
/*
 * Builds an XDELTA decoder from its serialised parameters.
 *
 * data/size  Varints in order: the word size, the sub-codec encoding and
 *            the sub-codec parameter size, followed by the sub-codec
 *            parameters.  The whole must occupy exactly size bytes.
 * option     E_LONG, E_INT, E_BYTE_ARRAY/E_BYTE or E_BYTE_ARRAY_BLOCK.
 *            The last is handed to the sub-codec as E_BYTE_ARRAY.
 *
 * Returns the new codec on success,
 *         NULL on allocation failure, an unsupported option, malformed
 *         parameters or failure to create the sub-codec.
 */
cram_codec *cram_xdelta_decode_init(cram_block_compression_hdr *hdr,
                                    char *data, int size,
                                    enum cram_encoding codec,
                                    enum cram_external_type option,
                                    int version, varint_vec *vv) {
    char *cp = data;
    char *endp = data+size;
    cram_codec *c = calloc(1, sizeof(*c));

    if (c == NULL)
        return NULL;

    c->codec = E_XDELTA;
    switch (option) {
    case E_LONG:
        c->decode = cram_xdelta_decode_long;
        break;

    case E_INT:
        c->decode = cram_xdelta_decode_int;
        break;

    case E_BYTE_ARRAY:
    case E_BYTE:
        c->decode = cram_xdelta_decode_char;
        break;

    case E_BYTE_ARRAY_BLOCK:
        option = E_BYTE_ARRAY;
        c->decode = cram_xdelta_decode_block;
        break;

    default:
        free(c);
        return NULL;
    }
    c->free      = cram_xdelta_decode_free;
    c->size      = cram_xdelta_decode_size;
    c->get_block = cram_xdelta_get_block;
    c->describe  = NULL;

    c->u.xdelta.word_size = vv->varint_get32(&cp, endp, NULL);
    c->u.xdelta.last = 0;

    int encoding = vv->varint_get32(&cp, endp, NULL);
    int sub_size = vv->varint_get32(&cp, endp, NULL);
    if (sub_size < 0 || endp - cp < sub_size)
        goto malformed;

    c->u.xdelta.sub_codec = cram_decoder_init(hdr, encoding, cp, sub_size,
                                              option, version, vv);
    if (c->u.xdelta.sub_codec == NULL)
        goto malformed;
    cp += sub_size;

    // The parameters must consume the buffer exactly.
    if (cp - data != size)
        goto malformed;

    return c;

 malformed:
    fprintf(stderr, "Malformed xdelta header stream\n");
    cram_xdelta_decode_free(c);
    return NULL;
}
1817
1818
0
/*
 * Flushes the data accumulated in c->out through the XDELTA transform.
 *
 * Every word (c->u.e_xdelta.word_size bytes wide) is replaced by the
 * zig-zag transformed difference from the previous word and written as a
 * varint into a temporary block, which is then handed to the sub-codec's
 * encode function.
 *
 * Returns 0 on success,
 *        -1 on failure (no memory, unsupported word size, a failed
 *           varint write or a sub-codec failure).
 */
int cram_xdelta_encode_flush(cram_codec *c) {
    int r = -1;
    cram_block *b = cram_new_block(0, 0);
    if (!b)
        return -1;

    switch (c->u.e_xdelta.word_size) {
    case 2: {
        // Delta + zigzag transform.
        // Subtracting two 8-bit values has a 9-bit result (-255 to 255).
        // However think of it as turning a wheel clockwise or anti-clockwise.
        // If it has 256 gradations then a -ve rotation followed by a +ve
        // rotation of the same amount reverses it regardless.
        //
        // Similarly the zig-zag transformation doesn't invent any extra bits,
        // so the entire thing can be done in-situ.  This may permit faster
        // SIMD loops if we break apart the steps.

        // uint16_t last = 0, d;
        // for (i = 0; i < n; i++) {
        //     d = io[i] - last;
        //     last = io[i];
        //     io[i] = zigzag16(d);
        // }

        // --- vs ---

        // for (i = n-1; i >= 1; i--)
        //     io[i] -= io[i-1];
        // for (i = 0; i < n; i++)
        //     io[i] = zigzag16(io[i]);

        // varint: need array variant for speed here.
        // With zig-zag
        int i, n = BLOCK_SIZE(c->out)/2;
        const uint8_t *dat = (const uint8_t *)BLOCK_DATA(c->out);
        uint16_t last = 0;

        if (n*2 < BLOCK_SIZE(c->out)) {
            // Odd number of bytes: the leading byte is emitted on its own
            // (half word) and the 16-bit words start one byte later.
            last = *dat++;
            if (c->vv->varint_put32_blk(b, zigzag16(last)) < 0)
                goto err;
        }

        for (i = 0; i < n; i++) {
            // After a half word the data is no longer 2-byte aligned, so
            // copy each word out instead of dereferencing a uint16_t *.
            uint16_t w, d;
            memcpy(&w, dat + 2*(size_t)i, sizeof(w));
            d = w - last;
            last = w;
            if (c->vv->varint_put32_blk(b, zigzag16(d)) < 0)
                goto err;
        }

        break;
    }

    case 4: {
        int i, n = BLOCK_SIZE(c->out)/4;
        const uint8_t *dat = (const uint8_t *)BLOCK_DATA(c->out);
        uint32_t last = 0;

        for (i = 0; i < n; i++) {
            uint32_t w, d;
            memcpy(&w, dat + 4*(size_t)i, sizeof(w));
            d = w - last;
            last = w;
            if (c->vv->varint_put32_blk(b, zigzag32(d)) < 0)
                goto err;
        }

        break;
    }

    case 1: {
        int i, n = BLOCK_SIZE(c->out);
        uint8_t *dat = (uint8_t *)BLOCK_DATA(c->out), last = 0;

        for (i = 0; i < n; i++) {
            uint32_t d = dat[i] - last;
            last = dat[i];
            if (c->vv->varint_put32_blk(b, zigzag8(d)) < 0)
                goto err;
        }

        break;
    }

    default:
        // Only 1, 2 and 4 byte words are handled.
        goto err;
    }

    // Pass the transformed stream on to the sub-codec.
    if (c->u.e_xdelta.sub_codec->encode(NULL, c->u.e_xdelta.sub_codec,
                                      (char *)b->data, b->byte))
        goto err;

    r = 0;

 err:
    // The temporary block is released on both success and failure.
    cram_free_block(b);
    return r;

}
1912
1913
/*
 * Serialises the XDELTA codec description into block b.
 *
 * Layout written: optional prefix, codec id, byte length of the
 * parameters, word size, then the sub-codec's own serialisation.
 *
 * Returns the number of bytes accounted for on success,
 *        -1 on failure.  The temporary sub-codec block is released on
 *           every path.
 */
int cram_xdelta_encode_store(cram_codec *c, cram_block *b,
                            char *prefix, int version) {
    int len = 0, r = 0, n, len2, ret = -1;
    cram_codec *tc = c->u.e_xdelta.sub_codec;
    cram_block *tb = NULL;

    if (prefix) {
        size_t l = strlen(prefix);
        BLOCK_APPEND(b, prefix, l);
        len += l;
    }

    // Store sub-codec into a scratch block first so its length is known.
    tb = cram_new_block(0, 0);
    if (!tb)
        goto block_err;
    len2 = tc->store(tc, tb, NULL, version);
    if (len2 < 0)
        goto block_err;

    len += (n = c->vv->varint_put32_blk(b, c->codec)); r |= n;

    // codec length
    len += (n = c->vv->varint_put32_blk(b, c->vv->varint_size(c->u.e_xdelta.word_size)
                                        + len2)); r |= n;

    // This and sub-codec
    len += (n = c->vv->varint_put32_blk(b, c->u.e_xdelta.word_size)); r |= n;
    BLOCK_APPEND(b, BLOCK_DATA(tb), BLOCK_SIZE(tb));

    // r goes negative if any varint write returned a negative value.
    if (r > 0)
        ret = len + len2;

 block_err:
    // Shared exit: BLOCK_APPEND jumps here on failure too.
    if (tb)
        cram_free_block(tb);
    return ret;
}
1947
1948
// Same as cram_beta_encode_long
1949
/*
 * XDELTA encoding of long (E_LONG) data is not implemented.
 *
 * Returns -1 unconditionally.
 */
int cram_xdelta_encode_long(cram_slice *slice, cram_codec *c,
                           char *in, int in_size) {
    (void) slice;
    (void) c;
    (void) in;
    (void) in_size;

    return -1;
}
1953
1954
/*
 * XDELTA encoding of int (E_INT) data is not implemented.
 *
 * Returns -1 unconditionally.
 */
int cram_xdelta_encode_int(cram_slice *slice, cram_codec *c,
                          char *in, int in_size) {
    (void) slice;
    (void) c;
    (void) in;
    (void) in_size;

    return -1;
}
1958
1959
/*
 * Delta + zig-zag encodes one byte array and passes the resulting varint
 * stream to the sub-codec.
 *
 * Only a word size of 2 is transformed here; for any other word size no
 * bytes are produced and the sub-codec is called with an empty buffer.
 * The running "last" value is reset at the start of every call.
 *
 * Returns 0 on success,
 *        -1 on failure (bad size, no memory or sub-codec failure).
 */
int cram_xdelta_encode_char(cram_slice *slice, cram_codec *c,
                            char *in, int in_size) {
    // Scratch space of 5 bytes per input byte, computed in size_t so the
    // multiplication cannot overflow an int.
    if (in_size < 0 || (size_t)in_size > SIZE_MAX / 5)
        return -1;
    size_t dat_sz = (size_t)in_size * 5;

    // malloc(0) may return NULL; always request at least one byte so an
    // empty array is not reported as an allocation failure.
    char *dat = malloc(dat_sz ? dat_sz : 1);
    if (!dat)
        return -1;
    char *cp = dat, *cp_end = dat + dat_sz;

    c->u.e_xdelta.last = 0; // reset for each new array
    if (c->u.e_xdelta.word_size == 2) {
        int i, part;

        part = in_size%2;
        if (part) {
            // Odd length: the first byte is emitted on its own.
            // NOTE(review): in[0] is a plain char, so this sign-extends
            // where char is signed, whereas cram_xdelta_encode_flush reads
            // the half word as uint8_t - confirm which is intended.
            uint16_t z = in[0];
            c->u.e_xdelta.last = le_int2(z);
            cp += c->vv->varint_put32(cp, cp_end, zigzag16(c->u.e_xdelta.last));
        }

        for (i = 0; i < in_size/2; i++) {
            // in+part may not be 2-byte aligned, so copy each word out
            // rather than dereferencing a uint16_t pointer.
            uint16_t w;
            memcpy(&w, in + part + 2*(size_t)i, sizeof(w));
            uint16_t d = le_int2(w) - c->u.e_xdelta.last;
            c->u.e_xdelta.last = le_int2(w);
            cp += c->vv->varint_put32(cp, cp_end, zigzag16(d));
        }
    }
    if (c->u.e_xdelta.sub_codec->encode(slice, c->u.e_xdelta.sub_codec,
                                      (char *)dat, cp-dat)) {
        free(dat);
        return -1;
    }

    free(dat);
    return 0;
}
1993
1994
0
/*
 * Releases an XDELTA encoder: its sub-codec (via that codec's own free
 * callback), its output block and finally the codec structure itself.
 * A NULL codec is ignored.
 */
void cram_xdelta_encode_free(cram_codec *c) {
    if (c == NULL)
        return;

    cram_codec *sub = c->u.e_xdelta.sub_codec;
    if (sub != NULL)
        sub->free(sub);

    cram_free_block(c->out);
    free(c);
}
2004
2005
/*
 * Creates an XDELTA encoder.
 *
 * dat points to a cram_xdelta_encoder giving the word size plus the
 * encoding and parameters of the sub-codec, which is created here with
 * external type E_BYTE_ARRAY.  The encode callback is chosen from option.
 *
 * Returns the new codec on success,
 *         NULL on failure (no memory, or the sub-codec could not be
 *         created).
 */
cram_codec *cram_xdelta_encode_init(cram_stats *st,
                                    enum cram_encoding codec,
                                    enum cram_external_type option,
                                    void *dat,
                                    int version, varint_vec *vv) {
    cram_codec *c;

    if (!(c = malloc(sizeof(*c))))
        return NULL;

    c->codec  = E_XDELTA;
    c->free   = cram_xdelta_encode_free;
    if (option == E_LONG)
        c->encode = cram_xdelta_encode_long;
    else if (option == E_INT)
        c->encode = cram_xdelta_encode_int;
    else
        c->encode = cram_xdelta_encode_char;
    c->store  = cram_xdelta_encode_store;
    c->flush  = cram_xdelta_encode_flush;

    cram_xdelta_encoder *e = (cram_xdelta_encoder *)dat;
    c->u.e_xdelta.word_size = e->word_size;
    c->u.e_xdelta.last = 0;
    c->u.e_xdelta.sub_codec = cram_encoder_init(e->sub_encoding, NULL,
                                                E_BYTE_ARRAY,
                                                e->sub_codec_dat,
                                                version, vv);

    // The store, flush and encode callbacks all dereference the sub-codec
    // unconditionally, so fail here rather than crash later.  Only the
    // structure itself is owned at this point.
    if (!c->u.e_xdelta.sub_codec) {
        free(c);
        return NULL;
    }

    return c;
}
2036
2037
/*
2038
 * ---------------------------------------------------------------------------
2039
 * XRLE
2040
 *
2041
 * This also has the additional requirement that the data series is not
2042
 * interleaved with another, permitting efficient encoding and decoding
2043
 * of all elements en masse instead of needing to only extract the bits
2044
 * necessary per item.
2045
 */
2046
0
/*
 * XRLE decoding of long (E_LONG) data is not implemented.
 *
 * Returns -1 unconditionally.
 */
int cram_xrle_decode_long(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
    // TODO if and when needed
    (void) slice;
    (void) c;
    (void) in;
    (void) out;
    (void) out_size;

    return -1;
}
2050
2051
0
/*
 * XRLE decoding of int (E_INT) data is not implemented.
 *
 * Returns -1 unconditionally.
 */
int cram_xrle_decode_int(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
    // TODO if and when needed
    (void) slice;
    (void) c;
    (void) in;
    (void) out;
    (void) out_size;

    return -1;
}
2055
2056
// Expands an XRLE transform and caches result in slice->block_by_id[]
2057
0
/*
 * Expands the run-length encoded data for codec c and caches the result
 * in slice->block_by_id[512 + c->codec_id].  If that slot is already
 * populated nothing is done.
 *
 * The literal and run-length streams are fetched from the two sub-codecs;
 * the run-length stream starts with the expanded size as a varint.
 *
 * Returns 0 on success,
 *        -1 on failure.  On failure the (empty) block may remain in the
 *           cache slot, so callers must check this return value rather
 *           than rely on the slot being NULL.
 */
static int cram_xrle_decode_expand_char(cram_slice *slice, cram_codec *c) {
    cram_block *b = slice->block_by_id[512 + c->codec_id];
    if (b)
        return 0;

    b = slice->block_by_id[512 + c->codec_id] = cram_new_block(0, 0);
    if (!b)
        return -1;
    cram_block *lit_b = c->u.xrle.lit_codec->get_block(slice, c->u.xrle.lit_codec);
    if (!lit_b)
        return -1;
    unsigned char *lit_dat = lit_b->data;
    unsigned int lit_sz = lit_b->uncomp_size;
    unsigned int len_sz = c->u.xrle.len_codec->size(slice, c->u.xrle.len_codec);

    cram_block *len_b = c->u.xrle.len_codec->get_block(slice, c->u.xrle.len_codec);
    if (!len_b)
        return -1;
    unsigned char *len_dat = len_b->data;

    // Symbols with a positive rep_score are the ones that carry run lengths.
    uint8_t rle_syms[256];
    int rle_nsyms = 0;
    int i;
    for (i = 0; i < 256; i++) {
        if (c->u.xrle.rep_score[i] > 0)
            rle_syms[rle_nsyms++] = i;
    }

    uint64_t out_sz;
    int nb = var_get_u64(len_dat, len_dat+len_sz, &out_sz);

    // The size comes from the data stream.  cram_xrle_decode_size reports
    // it as an int, so anything larger cannot be represented.
    if (out_sz > INT32_MAX)
        return -1;

    // malloc(0) may return NULL; ask for at least one byte so an empty
    // result is not mistaken for an allocation failure.
    if (!(b->data = malloc(out_sz ? out_sz : 1)))
        return -1;

    // hts_rle_decode returns NULL if the streams do not fit the declared
    // size.  Do not publish the partially filled buffer in that case.
    if (!hts_rle_decode(lit_dat, lit_sz,
                        len_dat+nb, len_sz-nb,
                        rle_syms, rle_nsyms,
                        b->data, &out_sz)) {
        b->uncomp_size = 0;
        return -1;
    }
    b->uncomp_size = out_sz;

    return 0;
}
2097
2098
0
/*
 * Returns the size in bytes of the expanded XRLE data for this codec,
 * expanding and caching it first if necessary.
 *
 * If no expanded block exists (its allocation failed) 0 is returned
 * instead of dereferencing a NULL block.
 */
int cram_xrle_decode_size(cram_slice *slice, cram_codec *c) {
    cram_xrle_decode_expand_char(slice, c);

    cram_block *b = slice->block_by_id[512 + c->codec_id];
    return b ? b->uncomp_size : 0;
}
2102
2103
0
/*
 * Returns the block holding the expanded XRLE data for this codec,
 * expanding and caching it first if necessary.
 *
 * Returns NULL if the expansion fails, so that callers which test the
 * result for NULL do not consume a partially built block.
 */
cram_block *cram_xrle_get_block(cram_slice *slice, cram_codec *c) {
    if (cram_xrle_decode_expand_char(slice, c) < 0)
        return NULL;

    return slice->block_by_id[512 + c->codec_id];
}
2107
2108
0
/*
 * Copies the next *out_size bytes of expanded XRLE data into out and
 * advances the cached block's read index.
 *
 * The data is expanded in full (and cached in the slice) on first use.
 *
 * Returns 0 on success,
 *        -1 on failure (expansion failed, or fewer than *out_size bytes
 *           remain in the expanded data).
 */
int cram_xrle_decode_char(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
    int n = *out_size;

    if (cram_xrle_decode_expand_char(slice, c) < 0)
        return -1;

    cram_block *b = slice->block_by_id[512 + c->codec_id];

    // Never read beyond the end of the expanded data.
    if (!b || n < 0 || b->idx < 0 || b->idx > b->uncomp_size
        || b->uncomp_size - b->idx < n)
        return -1;

    if (n > 0) {
        memcpy(out, b->data + b->idx, n);
        b->idx += n;
    }

    return 0;
}
2153
2154
3
/*
 * Releases an XRLE decoder: the run-length and literal sub-codecs (when
 * present, via their own free callbacks) followed by the codec itself.
 * A NULL codec is ignored.
 */
void cram_xrle_decode_free(cram_codec *c) {
    if (c == NULL)
        return;

    cram_codec *runs = c->u.xrle.len_codec;
    if (runs != NULL)
        runs->free(runs);

    cram_codec *lits = c->u.xrle.lit_codec;
    if (lits != NULL)
        lits->free(lits);

    free(c);
}
2165
2166
/*
 * Creates an XRLE decoder from its serialised parameters.
 *
 * The parameter stream holds: the number of run-length symbols, that many
 * symbol values, then (encoding, size, parameters) for the run-length
 * sub-codec followed by the same triple for the literal sub-codec.
 *
 * Returns the new codec on success,
 *         NULL on failure (no memory, unsupported external type or a
 *         malformed parameter stream).
 */
cram_codec *cram_xrle_decode_init(cram_block_compression_hdr *hdr,
                                  char *data, int size,
                                  enum cram_encoding codec,
                                  enum cram_external_type option,
                                  int version, varint_vec *vv) {
    char *cur = data;
    char *limit = data + size;
    int vi_err = 0;
    cram_codec *xc = calloc(1, sizeof(*xc));

    if (xc == NULL)
        return NULL;

    xc->codec = E_XRLE;

    // Pick the decode callback for the requested external type.
    switch (option) {
    case E_LONG:
        xc->decode = cram_xrle_decode_long;
        break;

    case E_INT:
        xc->decode = cram_xrle_decode_int;
        break;

    case E_BYTE_ARRAY:
    case E_BYTE:
        xc->decode = cram_xrle_decode_char;
        break;

    default:
        fprintf(stderr, "BYTE_ARRAYs not supported by this codec\n");
        free(xc);
        return NULL;
    }

    xc->free          = cram_xrle_decode_free;
    xc->size          = cram_xrle_decode_size;
    xc->get_block     = cram_xrle_get_block;
    xc->describe      = NULL;
    xc->u.xrle.cur_len = 0;
    xc->u.xrle.cur_lit = -1;

    // RLE map: flag each listed symbol; values outside 0..255 are ignored
    // and at most 256 entries are read.
    int nsyms = vv->varint_get32(&cur, limit, &vi_err);
    memset(xc->u.xrle.rep_score, 0, 256*sizeof(*xc->u.xrle.rep_score));

    int k;
    for (k = 0; k < 256 && k < nsyms; k++) {
        int sym = vv->varint_get32(&cur, limit, &vi_err);
        if (sym < 0 || sym >= 256)
            continue;
        xc->u.xrle.rep_score[sym] = 1;
    }

    // Run-length sub-codec, always decoded as integers.
    xc->u.xrle.len_encoding = vv->varint_get32(&cur, limit, &vi_err);
    int param_len = vv->varint_get32(&cur, limit, &vi_err);
    if (param_len < 0 || limit - cur < param_len)
        goto malformed;

    xc->u.xrle.len_codec = cram_decoder_init(hdr, xc->u.xrle.len_encoding,
                                             cur, param_len, E_INT,
                                             version, vv);
    if (xc->u.xrle.len_codec == NULL)
        goto malformed;
    cur += param_len;

    // Literal sub-codec, decoded with the caller's external type.
    xc->u.xrle.lit_encoding = vv->varint_get32(&cur, limit, &vi_err);
    param_len = vv->varint_get32(&cur, limit, &vi_err);
    if (param_len < 0 || limit - cur < param_len)
        goto malformed;

    xc->u.xrle.lit_codec = cram_decoder_init(hdr, xc->u.xrle.lit_encoding,
                                             cur, param_len, option,
                                             version, vv);
    if (xc->u.xrle.lit_codec == NULL)
        goto malformed;
    cur += param_len;

    // Any varint read failure recorded along the way invalidates the codec.
    if (vi_err)
        goto malformed;

    return xc;

 malformed:
    fprintf(stderr, "Malformed xrle header stream\n");
    cram_xrle_decode_free(xc);
    return NULL;
}
2238
2239
0
/*
 * Run-length encodes the pending data and feeds the two resulting streams
 * (run lengths, prefixed by the total input size, and literals) to the
 * length and literal sub-codecs.
 *
 * The data to encode is the cached to_flush pointer if one is set,
 * otherwise the contents of c->out.
 *
 * Returns 0 on success,
 *        -1 on failure.  Both temporary buffers are released on every
 *           path.
 */
int cram_xrle_encode_flush(cram_codec *c) {
    uint8_t *out_lit = NULL, *out_len = NULL;
    uint64_t out_lit_size, out_len_size;
    uint8_t rle_syms[256];
    int rle_nsyms = 0, i, ret = -1;

    // Symbols with a positive rep_score get run lengths.
    for (i = 0; i < 256; i++)
        if (c->u.e_xrle.rep_score[i] > 0)
            rle_syms[rle_nsyms++] = i;

    if (!c->u.e_xrle.to_flush) {
        c->u.e_xrle.to_flush = (char *)BLOCK_DATA(c->out);
        c->u.e_xrle.to_flush_size = BLOCK_SIZE(c->out);
    }

    out_len = malloc(c->u.e_xrle.to_flush_size+8);
    if (!out_len)
        return -1;

    // The run-length stream begins with the uncompressed size.
    int nb = var_put_u64(out_len, NULL, c->u.e_xrle.to_flush_size);

    // Passing NULL for the literal buffer makes hts_rle_encode allocate
    // it; NULL is returned if that fails.
    out_lit = hts_rle_encode((uint8_t *)c->u.e_xrle.to_flush, c->u.e_xrle.to_flush_size,
                             out_len+nb, &out_len_size,
                             rle_syms, &rle_nsyms,
                             NULL, &out_lit_size);
    if (!out_lit)
        goto done;
    out_len_size += nb;


    // TODO: can maybe "gift" the sub codec the data block, to remove
    // one level of memcpy.
    if (c->u.e_xrle.len_codec->encode(NULL,
                                      c->u.e_xrle.len_codec,
                                      (char *)out_len, out_len_size))
        goto done;

    if (c->u.e_xrle.lit_codec->encode(NULL,
                                      c->u.e_xrle.lit_codec,
                                      (char *)out_lit, out_lit_size))
        goto done;

    ret = 0;

 done:
    free(out_len);
    free(out_lit);

    return ret;
}
2284
2285
/*
 * Serialises the XRLE codec description into block b.
 *
 * Layout written: optional prefix, codec id, byte length of the
 * parameters, number of run-length symbols, the symbols themselves, then
 * the serialised length sub-codec and literal sub-codec.
 *
 * Returns the number of bytes accounted for on success,
 *        -1 on failure.  The three temporary blocks are released on
 *           every path.
 */
int cram_xrle_encode_store(cram_codec *c, cram_block *b,
                            char *prefix, int version) {
    int len = 0, r = 0, n, ret = -1;
    int i, nrle = 0, len1 = 0, len2, len3;
    cram_codec *tc;
    cram_block *b_rle = NULL, *b_len = NULL, *b_lit = NULL;

    if (prefix) {
        size_t l = strlen(prefix);
        BLOCK_APPEND(b, prefix, l);
        len += l;
    }

    // List of symbols to RLE
    b_rle = cram_new_block(0, 0);
    if (!b_rle)
        goto block_err;
    for (i = 0; i < 256; i++) {
        if (c->u.e_xrle.rep_score[i] > 0) {
            nrle++;
            len1 += (n = c->vv->varint_put32_blk(b_rle,i)); r |= n;
        }
    }

    // Store length and literal sub-codecs to get encoded length
    tc = c->u.e_xrle.len_codec;
    b_len = cram_new_block(0, 0);
    if (!b_len)
        goto block_err;
    len2 = tc->store(tc, b_len, NULL, version);
    if (len2 < 0)
        goto block_err;

    tc = c->u.e_xrle.lit_codec;
    b_lit = cram_new_block(0, 0);
    if (!b_lit)
        goto block_err;
    len3 = tc->store(tc, b_lit, NULL, version);
    if (len3 < 0)
        goto block_err;

    len += (n = c->vv->varint_put32_blk(b, c->codec)); r |= n;
    len += (n = c->vv->varint_put32_blk(b, len1 + len2 + len3
                                        + c->vv->varint_size(nrle))); r |= n;
    len += (n = c->vv->varint_put32_blk(b, nrle)); r |= n;
    BLOCK_APPEND(b, BLOCK_DATA(b_rle), BLOCK_SIZE(b_rle));
    BLOCK_APPEND(b, BLOCK_DATA(b_len), BLOCK_SIZE(b_len));
    BLOCK_APPEND(b, BLOCK_DATA(b_lit), BLOCK_SIZE(b_lit));

    // r goes negative if any varint write returned a negative value.
    if (r > 0)
        ret = len + len1 + len2 + len3;

 block_err:
    // Shared exit: BLOCK_APPEND jumps here on failure too.
    if (b_rle)
        cram_free_block(b_rle);
    if (b_len)
        cram_free_block(b_len);
    if (b_lit)
        cram_free_block(b_lit);

    return ret;
}
2340
2341
/*
 * XRLE encoding of long (E_LONG) data is not implemented.
 *
 * Returns -1 unconditionally.
 */
int cram_xrle_encode_long(cram_slice *slice, cram_codec *c,
                           char *in, int in_size) {
    // TODO if and when needed
    (void) slice;
    (void) c;
    (void) in;
    (void) in_size;

    return -1;
}
2346
2347
/*
 * XRLE encoding of int (E_INT) data is not implemented.
 *
 * Returns -1 unconditionally.
 */
int cram_xrle_encode_int(cram_slice *slice, cram_codec *c,
                          char *in, int in_size) {
    // TODO if and when needed
    (void) slice;
    (void) c;
    (void) in;
    (void) in_size;

    return -1;
}
2352
2353
/*
 * Queues a byte array for XRLE encoding at flush time.
 *
 * The first array is not copied: only its pointer and size are cached in
 * to_flush / to_flush_size.  If further data arrives, the cached array is
 * first appended to c->out (created on demand) and everything from then
 * on is gathered in c->out.
 *
 * Returns 0 on success,
 *        -1 on failure (block allocation or growth failed).
 */
int cram_xrle_encode_char(cram_slice *slice, cram_codec *c,
                          char *in, int in_size) {
    // A previously cached array must be moved into the output block
    // before anything else is added.
    if (c->u.e_xrle.to_flush) {
        if (!c->out) {
            c->out = cram_new_block(0, 0);
            if (!c->out)
                return -1;
        }
        BLOCK_APPEND(c->out, c->u.e_xrle.to_flush, c->u.e_xrle.to_flush_size);
        c->u.e_xrle.to_flush = NULL;
        c->u.e_xrle.to_flush_size = 0;
    }

    if (!c->out || !(BLOCK_SIZE(c->out) > 0)) {
        // Nothing gathered so far: cache a reference to the data we're
        // about to send to flush instead of copying it.
        c->u.e_xrle.to_flush = in;
        c->u.e_xrle.to_flush_size = in_size;
        return 0;
    }

    // Gathering data
    BLOCK_APPEND(c->out, in, in_size);
    return 0;

 block_err:
    return -1;
}
2377
2378
0
/*
 * Releases an XRLE encoder: the run-length and literal sub-codecs (when
 * present, via their own free callbacks), the output block and finally
 * the codec itself.  A NULL codec is ignored.
 */
void cram_xrle_encode_free(cram_codec *c) {
    if (c == NULL)
        return;

    cram_codec *runs = c->u.e_xrle.len_codec;
    if (runs != NULL)
        runs->free(runs);

    cram_codec *lits = c->u.e_xrle.lit_codec;
    if (lits != NULL)
        lits->free(lits);

    cram_free_block(c->out);
    free(c);
}
2390
2391
/*
 * Creates an XRLE encoder.
 *
 * dat points to a cram_xrle_encoder giving the encodings and parameters
 * of the run-length and literal sub-codecs (both created here with
 * external type E_BYTE) and the table of symbols to run-length encode.
 * The encode callback is chosen from option.
 *
 * Returns the new codec on success,
 *         NULL on failure (no memory, or a sub-codec could not be
 *         created).
 */
cram_codec *cram_xrle_encode_init(cram_stats *st,
                                  enum cram_encoding codec,
                                  enum cram_external_type option,
                                  void *dat,
                                  int version, varint_vec *vv) {
    cram_codec *c;

    if (!(c = malloc(sizeof(*c))))
        return NULL;

    c->codec  = E_XRLE;
    c->free   = cram_xrle_encode_free;
    if (option == E_LONG)
        c->encode = cram_xrle_encode_long;
    else if (option == E_INT)
        c->encode = cram_xrle_encode_int;
    else
        c->encode = cram_xrle_encode_char;
    c->store  = cram_xrle_encode_store;
    c->flush  = cram_xrle_encode_flush;

    cram_xrle_encoder *e = (cram_xrle_encoder *)dat;

    c->u.e_xrle.len_codec = cram_encoder_init(e->len_encoding, NULL,
                                              E_BYTE, e->len_dat,
                                              version, vv);
    c->u.e_xrle.lit_codec = cram_encoder_init(e->lit_encoding, NULL,
                                              E_BYTE, e->lit_dat,
                                              version, vv);

    // The store and flush callbacks dereference both sub-codecs
    // unconditionally, so fail here rather than crash later.  Release
    // whichever one was created.
    if (!c->u.e_xrle.len_codec || !c->u.e_xrle.lit_codec) {
        if (c->u.e_xrle.len_codec)
            c->u.e_xrle.len_codec->free(c->u.e_xrle.len_codec);
        if (c->u.e_xrle.lit_codec)
            c->u.e_xrle.lit_codec->free(c->u.e_xrle.lit_codec);
        free(c);
        return NULL;
    }

    c->u.e_xrle.cur_lit = -1;
    c->u.e_xrle.cur_len = -1;
    c->u.e_xrle.to_flush = NULL;
    c->u.e_xrle.to_flush_size = 0;

    memcpy(c->u.e_xrle.rep_score, e->rep_score, 256*sizeof(*c->u.e_xrle.rep_score));

    return c;
}
2429
2430
/*
2431
 * ---------------------------------------------------------------------------
2432
 * SUBEXP
2433
 */
2434
0
/*
 * Decodes *out_size sub-exponential coded integers from the bit stream
 * in block "in", storing them as int32_t in out.
 *
 * Each value is a unary count i of 1 bits followed by
 *   i > 0:  i+k-1 payload bits, value = 2^(i+k-1) + payload
 *   i = 0:  k payload bits,     value = payload
 * and c->u.subexp.offset is subtracted from the result.
 *
 * Returns 0 on success,
 *        -1 on failure (truncated input, or a bit count too large for a
 *           32-bit value).
 */
int cram_subexp_decode(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
    int32_t *out_i = (int32_t *)out;
    int n, count;
    int k = c->u.subexp.k;

    for (count = 0, n = *out_size; count < n; count++) {
        int i, tail, nbits;
        // Unsigned so that shifting bits in can never overflow.
        uint32_t val = 0;

        /* Get number of 1s */
        //while (get_bit_MSB(in) == 1) i++;
        i = get_one_bits_MSB(in);

        // Both k and the unary count come from the input.  Bound them
        // before adding so that neither the sum nor the shifts below can
        // overflow.
        if (i < 0 || i > 32 || k > 31)
            return -1;

        tail = i > 0 ? i + k - 1 : k;
        if (tail > 31 || cram_not_enough_bits(in, tail))
            return -1;

        for (nbits = tail; nbits; nbits--) {
            //val = val<<1; val |= get_bit_MSB(in);
            GET_BIT_MSB(in, val);
        }

        if (i)
            val += (uint32_t)1 << tail;

        // Modular arithmetic gives the same result as the signed
        // subtraction wherever that would not have overflowed.
        out_i[count] = (int32_t)(val - (uint32_t)c->u.subexp.offset);
    }

    return 0;
}
2477
2478
1.77k
/*
 * Releases a SUBEXP decoder.  The codec owns no other allocations.
 * A NULL codec is accepted.
 */
void cram_subexp_decode_free(cram_codec *c) {
    // free(NULL) is a no-op, so no explicit test is needed.
    free(c);
}
2482
2483
0
/*
 * Appends a textual description of a SUBEXP codec, in the form
 * "SUBEXP(offset=<offset>,k=<k>)", to ks.
 *
 * Returns 0 on success,
 *        -1 if ksprintf reports failure.
 */
int cram_subexp_describe(cram_codec *c, kstring_t *ks) {
    int written = ksprintf(ks, "SUBEXP(offset=%d,k=%d)",
                           c->u.subexp.offset,
                           c->u.subexp.k);

    if (written < 0)
        return -1;

    return 0;
}
2489
2490
/*
 * Creates a SUBEXP decoder from its serialised parameters: two varints
 * giving the offset and k, which must occupy exactly "size" bytes.
 *
 * Only the E_INT external type is supported.
 *
 * Returns the new codec on success,
 *         NULL on failure (unsupported type, no memory, trailing or
 *         missing parameter bytes, or a negative k).
 */
cram_codec *cram_subexp_decode_init(cram_block_compression_hdr *hdr,
                                    char *data, int size,
                                    enum cram_encoding codec,
                                    enum cram_external_type option,
                                    int version, varint_vec *vv) {
    if (option != E_INT) {
        hts_log_error("This codec only supports INT encodings");
        return NULL;
    }

    cram_codec *sc = malloc(sizeof(*sc));
    if (sc == NULL)
        return NULL;

    sc->codec    = E_SUBEXP;
    sc->decode   = cram_subexp_decode;
    sc->free     = cram_subexp_decode_free;
    sc->describe = cram_subexp_describe;

    char *cur = data;
    char *limit = data + size;
    sc->u.subexp.offset = vv->varint_get32(&cur, limit, NULL);
    sc->u.subexp.k      = vv->varint_get32(&cur, limit, NULL);

    // The two values must consume the parameter bytes exactly.
    int consumed_all = (cur - data == size);
    if (!consumed_all || sc->u.subexp.k < 0) {
        hts_log_error("Malformed subexp header stream");
        free(sc);
        return NULL;
    }

    return sc;
}
2523
2524
/*
2525
 * ---------------------------------------------------------------------------
2526
 * GAMMA
2527
 */
2528
0
/*
 * Decodes *out_size Elias-gamma coded integers from the bit stream in
 * block "in", storing them as int32_t in out.
 *
 * Each value is a count nz of leading zero bits followed by nz payload
 * bits appended to an implicit leading 1; c->u.gamma.offset is then
 * subtracted.
 *
 * Returns 0 on success,
 *        -1 on failure (truncated input, or a bit count too large for a
 *           32-bit value).
 */
int cram_gamma_decode(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
    int32_t *out_i = (int32_t *)out;
    int i, n;

    for (i = 0, n = *out_size; i < n; i++) {
        // Unsigned so that shifting bits in can never overflow.
        uint32_t val = 1;
        int nz;

        //while (get_bit_MSB(in) == 0) nz++;
        nz = get_zero_bits_MSB(in);

        // nz comes from the input.  With the implicit leading 1, more
        // than 31 payload bits cannot fit in 32 bits.
        if (nz < 0 || nz > 31 || cram_not_enough_bits(in, nz))
            return -1;

        while (nz > 0) {
            //val <<= 1; val |= get_bit_MSB(in);
            GET_BIT_MSB(in, val);
            nz--;
        }

        // Modular arithmetic gives the same result as the signed
        // subtraction wherever that would not have overflowed.
        out_i[i] = (int32_t)(val - (uint32_t)c->u.gamma.offset);
    }

    return 0;
}
2551
2552
4.20k
/*
 * Releases a GAMMA decoder.  The codec owns no other allocations.
 * A NULL codec is accepted.
 */
void cram_gamma_decode_free(cram_codec *c) {
    // free(NULL) is a no-op, so no explicit test is needed.
    free(c);
}
2556
2557
0
/*
 * Appends a textual description of a GAMMA codec, in the form
 * "GAMMA(offset=<offset>)", to ks.
 *
 * Returns 0 on success,
 *        -1 if ksprintf reports failure.
 */
int cram_gamma_describe(cram_codec *c, kstring_t *ks) {
    // Read the gamma member of the union: that is the one the gamma
    // initialiser and decoder use.
    return ksprintf(ks, "GAMMA(offset=%d)", c->u.gamma.offset)
        < 0 ? -1 : 0;
}
2561
2562
/*
 * Creates a GAMMA decoder from its serialised parameters: a single varint
 * giving the offset, which must occupy exactly "size" bytes.
 *
 * Only the E_INT external type is supported.
 *
 * Returns the new codec on success,
 *         NULL on failure (unsupported type, no memory or a malformed
 *         parameter stream).
 */
cram_codec *cram_gamma_decode_init(cram_block_compression_hdr *hdr,
                                   char *data, int size,
                                   enum cram_encoding codec,
                                   enum cram_external_type option,
                                   int version, varint_vec *vv) {
    if (option != E_INT) {
        hts_log_error("This codec only supports INT encodings");
        return NULL;
    }

    // At least one byte is needed to hold the offset.
    if (size < 1) {
        hts_log_error("Malformed gamma header stream");
        return NULL;
    }

    cram_codec *gc = malloc(sizeof(*gc));
    if (gc == NULL)
        return NULL;

    gc->codec    = E_GAMMA;
    gc->decode   = cram_gamma_decode;
    gc->free     = cram_gamma_decode_free;
    gc->describe = cram_gamma_describe;

    char *cur = data;
    gc->u.gamma.offset = vv->varint_get32(&cur, data+size, NULL);

    // The offset must consume the parameter bytes exactly.
    if (cur - data != size) {
        hts_log_error("Malformed gamma header stream");
        free(gc);
        return NULL;
    }

    return gc;
}
2598
2599
/*
2600
 * ---------------------------------------------------------------------------
2601
 * HUFFMAN
2602
 */
2603
2604
402
static int code_sort(const void *vp1, const void *vp2) {
2605
402
    const cram_huffman_code *c1 = (const cram_huffman_code *)vp1;
2606
402
    const cram_huffman_code *c2 = (const cram_huffman_code *)vp2;
2607
2608
402
    if (c1->len != c2->len)
2609
3
        return c1->len - c2->len;
2610
399
    else
2611
399
        return c1->symbol < c2->symbol ? -1 : (c1->symbol > c2->symbol ? 1 : 0);
2612
402
}
2613
2614
1.00k
/*
 * Releases a HUFFMAN decoder together with its code table.
 * A NULL codec is ignored.
 */
void cram_huffman_decode_free(cram_codec *c) {
    if (c == NULL)
        return;

    // free(NULL) is a no-op, so an absent code table needs no special case.
    free(c->u.huffman.codes);
    free(c);
}
2622
2623
int cram_huffman_decode_null(cram_slice *slice, cram_codec *c,
2624
0
                             cram_block *in, char *out, int *out_size) {
2625
0
    return -1;
2626
0
}
2627
2628
int cram_huffman_decode_char0(cram_slice *slice, cram_codec *c,
2629
0
                              cram_block *in, char *out, int *out_size) {
2630
0
    int i, n;
2631
2632
0
    if (!out)
2633
0
        return 0;
2634
2635
    /* Special case of 0 length codes */
2636
0
    for (i = 0, n = *out_size; i < n; i++) {
2637
0
        out[i] = c->u.huffman.codes[0].symbol;
2638
0
    }
2639
0
    return 0;
2640
0
}
2641
2642
int cram_huffman_decode_char(cram_slice *slice, cram_codec *c,
2643
0
                             cram_block *in, char *out, int *out_size) {
2644
0
    int i, n, ncodes = c->u.huffman.ncodes;
2645
0
    const cram_huffman_code * const codes = c->u.huffman.codes;
2646
2647
0
    for (i = 0, n = *out_size; i < n; i++) {
2648
0
        int idx = 0;
2649
0
        int val = 0, len = 0, last_len = 0;
2650
2651
0
        for (;;) {
2652
0
            int dlen = codes[idx].len - last_len;
2653
0
            if (cram_not_enough_bits(in, dlen))
2654
0
                return -1;
2655
2656
            //val <<= dlen;
2657
            //val  |= get_bits_MSB(in, dlen);
2658
            //last_len = (len += dlen);
2659
2660
0
            last_len = (len += dlen);
2661
0
            for (; dlen; dlen--) GET_BIT_MSB(in, val);
2662
2663
0
            idx = val - codes[idx].p;
2664
0
            if (idx >= ncodes || idx < 0)
2665
0
                return -1;
2666
2667
0
            if (codes[idx].code == val && codes[idx].len == len) {
2668
0
                if (out) out[i] = codes[idx].symbol;
2669
0
                break;
2670
0
            }
2671
0
        }
2672
0
    }
2673
2674
0
    return 0;
2675
0
}
2676
2677
int cram_huffman_decode_int0(cram_slice *slice, cram_codec *c,
2678
0
                             cram_block *in, char *out, int *out_size) {
2679
0
    int32_t *out_i = (int32_t *)out;
2680
0
    int i, n;
2681
0
    const cram_huffman_code * const codes = c->u.huffman.codes;
2682
2683
    /* Special case of 0 length codes */
2684
0
    for (i = 0, n = *out_size; i < n; i++) {
2685
0
        out_i[i] = codes[0].symbol;
2686
0
    }
2687
0
    return 0;
2688
0
}
2689
2690
int cram_huffman_decode_int(cram_slice *slice, cram_codec *c,
2691
0
                            cram_block *in, char *out, int *out_size) {
2692
0
    int32_t *out_i = (int32_t *)out;
2693
0
    int i, n, ncodes = c->u.huffman.ncodes;
2694
0
    const cram_huffman_code * const codes = c->u.huffman.codes;
2695
2696
0
    for (i = 0, n = *out_size; i < n; i++) {
2697
0
        int idx = 0;
2698
0
        int val = 0, len = 0, last_len = 0;
2699
2700
        // Now one bit at a time for remaining checks
2701
0
        for (;;) {
2702
0
            int dlen = codes[idx].len - last_len;
2703
0
            if (cram_not_enough_bits(in, dlen))
2704
0
                return -1;
2705
2706
            //val <<= dlen;
2707
            //val  |= get_bits_MSB(in, dlen);
2708
            //last_len = (len += dlen);
2709
2710
0
            last_len = (len += dlen);
2711
0
            for (; dlen; dlen--) GET_BIT_MSB(in, val);
2712
2713
0
            idx = val - codes[idx].p;
2714
0
            if (idx >= ncodes || idx < 0)
2715
0
                return -1;
2716
2717
0
            if (codes[idx].code == val && codes[idx].len == len) {
2718
0
                out_i[i] = codes[idx].symbol;
2719
0
                break;
2720
0
            }
2721
0
        }
2722
0
    }
2723
2724
0
    return 0;
2725
0
}
2726
2727
int cram_huffman_decode_long0(cram_slice *slice, cram_codec *c,
2728
0
                              cram_block *in, char *out, int *out_size) {
2729
0
    int64_t *out_i = (int64_t *)out;
2730
0
    int i, n;
2731
0
    const cram_huffman_code * const codes = c->u.huffman.codes;
2732
2733
    /* Special case of 0 length codes */
2734
0
    for (i = 0, n = *out_size; i < n; i++) {
2735
0
        out_i[i] = codes[0].symbol;
2736
0
    }
2737
0
    return 0;
2738
0
}
2739
2740
int cram_huffman_decode_long(cram_slice *slice, cram_codec *c,
2741
0
                             cram_block *in, char *out, int *out_size) {
2742
0
    int64_t *out_i = (int64_t *)out;
2743
0
    int i, n, ncodes = c->u.huffman.ncodes;
2744
0
    const cram_huffman_code * const codes = c->u.huffman.codes;
2745
2746
0
    for (i = 0, n = *out_size; i < n; i++) {
2747
0
        int idx = 0;
2748
0
        int val = 0, len = 0, last_len = 0;
2749
2750
        // Now one bit at a time for remaining checks
2751
0
        for (;;) {
2752
0
            int dlen = codes[idx].len - last_len;
2753
0
            if (cram_not_enough_bits(in, dlen))
2754
0
                return -1;
2755
2756
            //val <<= dlen;
2757
            //val  |= get_bits_MSB(in, dlen);
2758
            //last_len = (len += dlen);
2759
2760
0
            last_len = (len += dlen);
2761
0
            for (; dlen; dlen--) GET_BIT_MSB(in, val);
2762
2763
0
            idx = val - codes[idx].p;
2764
0
            if (idx >= ncodes || idx < 0)
2765
0
                return -1;
2766
2767
0
            if (codes[idx].code == val && codes[idx].len == len) {
2768
0
                out_i[i] = codes[idx].symbol;
2769
0
                break;
2770
0
            }
2771
0
        }
2772
0
    }
2773
2774
0
    return 0;
2775
0
}
2776
2777
0
/*
 * Appends a textual description of a huffman decoder to 'ks', in the form
 * "HUFFMAN(codes={s0,s1,...},lengths={l0,l1,...})".
 *
 * Returns 0 on success,
 *         non-zero if any ksprintf call reported a failure.
 */
int cram_huffman_describe(cram_codec *c, kstring_t *ks) {
    int r = 0, n;
    r |= ksprintf(ks, "HUFFMAN(codes={") < 0;
    for (n = 0; n < c->u.huffman.ncodes; n++) {
        // Compare against 0 like the other calls: OR-ing in the raw return
        // value would make a successful (positive) write look like an error.
        r |= ksprintf(ks, "%s%"PRId64, n?",":"",
                      c->u.huffman.codes[n].symbol) < 0;
    }
    r |= ksprintf(ks, "},lengths={") < 0;
    for (n = 0; n < c->u.huffman.ncodes; n++) {
        r |= ksprintf(ks, "%s%d", n?",":"",
                      c->u.huffman.codes[n].len) < 0;
    }
    r |= ksprintf(ks, "})") < 0;
    return r;
}
2792
2793
/*
2794
 * Initialises a huffman decoder from an encoding data stream.
2795
 */
2796
cram_codec *cram_huffman_decode_init(cram_block_compression_hdr *hdr,
2797
                                     char *data, int size,
2798
                                     enum cram_encoding codec,
2799
                                     enum cram_external_type option,
2800
1.02k
                                     int version, varint_vec *vv) {
2801
1.02k
    int32_t ncodes = 0, i, j;
2802
1.02k
    char *cp = data, *data_end = &data[size];
2803
1.02k
    cram_codec *h;
2804
1.02k
    cram_huffman_code *codes = NULL;
2805
1.02k
    int32_t val, last_len, max_len = 0;
2806
1.02k
    uint32_t max_val; // needs one more bit than val
2807
1.02k
    const int max_code_bits = sizeof(val) * 8 - 1;
2808
1.02k
    int err = 0;
2809
2810
1.02k
    if (option == E_BYTE_ARRAY_BLOCK) {
2811
0
        hts_log_error("BYTE_ARRAYs not supported by this codec");
2812
0
        return NULL;
2813
0
    }
2814
2815
1.02k
    ncodes = vv->varint_get32(&cp, data_end, &err);
2816
1.02k
    if (ncodes < 0) {
2817
0
        hts_log_error("Invalid number of symbols in huffman stream");
2818
0
        return NULL;
2819
0
    }
2820
1.02k
    if (ncodes >= SIZE_MAX / sizeof(*codes)) {
2821
0
        errno = ENOMEM;
2822
0
        return NULL;
2823
0
    }
2824
1.02k
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
2825
1.02k
    if (ncodes > FUZZ_ALLOC_LIMIT / sizeof(*codes)) {
2826
3
        errno = ENOMEM;
2827
3
        return NULL;
2828
3
    }
2829
1.02k
#endif
2830
1.02k
    h = calloc(1, sizeof(*h));
2831
1.02k
    if (!h)
2832
0
        return NULL;
2833
2834
1.02k
    h->codec  = E_HUFFMAN;
2835
1.02k
    h->free   = cram_huffman_decode_free;
2836
2837
1.02k
    h->u.huffman.ncodes = ncodes;
2838
1.02k
    h->u.huffman.option = option;
2839
1.02k
    if (ncodes) {
2840
750
        codes = h->u.huffman.codes = malloc(ncodes * sizeof(*codes));
2841
750
        if (!codes) {
2842
0
            free(h);
2843
0
            return NULL;
2844
0
        }
2845
750
    } else {
2846
276
        codes = h->u.huffman.codes = NULL;
2847
276
    }
2848
2849
    /* Read symbols and bit-lengths */
2850
1.02k
    if (option == E_LONG) {
2851
0
        for (i = 0; i < ncodes; i++)
2852
0
            codes[i].symbol = vv->varint_get64(&cp, data_end, &err);
2853
1.02k
    } else if (option == E_INT || option == E_BYTE) {
2854
2.18k
        for (i = 0; i < ncodes; i++)
2855
1.15k
            codes[i].symbol = vv->varint_get32(&cp, data_end, &err);
2856
1.02k
    } else {
2857
0
        goto malformed;
2858
0
    }
2859
2860
1.02k
    if (err)
2861
9
        goto malformed;
2862
2863
1.01k
    i = vv->varint_get32(&cp, data_end, &err);
2864
1.01k
    if (i != ncodes)
2865
0
        goto malformed;
2866
2867
1.01k
    if (ncodes == 0) {
2868
        /* NULL huffman stream.  Ensure it returns an error if
2869
           anything tries to use it. */
2870
267
        h->decode = cram_huffman_decode_null;
2871
267
        return h;
2872
267
    }
2873
2874
1.90k
    for (i = 0; i < ncodes; i++) {
2875
1.15k
        codes[i].len = vv->varint_get32(&cp, data_end, &err);
2876
1.15k
        if (err)
2877
3
            break;
2878
1.15k
        if (codes[i].len < 0) {
2879
0
            hts_log_error("Huffman code length (%d) is negative", codes[i].len);
2880
0
            goto malformed;
2881
0
        }
2882
1.15k
        if (max_len < codes[i].len)
2883
411
            max_len = codes[i].len;
2884
1.15k
    }
2885
750
    if (err || cp - data != size || max_len >= ncodes)
2886
9
        goto malformed;
2887
2888
    /* 31 is max. bits available in val */
2889
741
    if (max_len > max_code_bits) {
2890
0
        hts_log_error("Huffman code length (%d) is greater "
2891
0
                      "than maximum supported (%d)", max_len, max_code_bits);
2892
0
        goto malformed;
2893
0
    }
2894
2895
    /* Sort by bit length and then by symbol value */
2896
741
    qsort(codes, ncodes, sizeof(*codes), code_sort);
2897
2898
    /* Assign canonical codes */
2899
741
    val = -1, last_len = 0, max_val = 0;
2900
1.88k
    for (i = 0; i < ncodes; i++) {
2901
1.14k
        val++;
2902
1.14k
        if (val > max_val)
2903
3
            goto malformed;
2904
2905
1.14k
        if (codes[i].len > last_len) {
2906
399
            val <<= (codes[i].len - last_len);
2907
399
            last_len = codes[i].len;
2908
399
            max_val = (1U << codes[i].len) - 1;
2909
399
        }
2910
1.14k
        codes[i].code = val;
2911
1.14k
    }
2912
2913
    /*
2914
     * Compute the next starting point, offset by the i'th value.
2915
     * For example if codes 10, 11, 12, 13 are 30, 31, 32, 33 then
2916
     * codes[10..13].p = 30 - 10.
2917
     */
2918
738
    last_len = 0;
2919
1.87k
    for (i = j = 0; i < ncodes; i++) {
2920
1.13k
        if (codes[i].len > last_len) {
2921
399
            j = codes[i].code - i;
2922
399
            last_len = codes[i].len;
2923
399
        }
2924
1.13k
        codes[i].p = j;
2925
1.13k
    }
2926
2927
    // puts("==HUFF LEN==");
2928
    // for (i = 0; i <= last_len+1; i++) {
2929
    //     printf("len %d=%d prefix %d\n", i, h->u.huffman.lengths[i], h->u.huffman.prefix[i]);
2930
    // }
2931
    // puts("===HUFFMAN CODES===");
2932
    // for (i = 0; i < ncodes; i++) {
2933
    //     int j;
2934
    //     printf("%d: %d %d %d ", i, codes[i].symbol, codes[i].len, codes[i].code);
2935
    //     j = codes[i].len;
2936
    //     while (j) {
2937
    //         putchar(codes[i].code & (1 << --j) ? '1' : '0');
2938
    //     }
2939
    //     printf(" %d\n", codes[i].code);
2940
    // }
2941
2942
738
    if (option == E_BYTE || option == E_BYTE_ARRAY) {
2943
525
        if (h->u.huffman.codes[0].len == 0)
2944
270
            h->decode = cram_huffman_decode_char0;
2945
255
        else
2946
255
            h->decode = cram_huffman_decode_char;
2947
525
    } else if (option == E_LONG || option == E_SLONG) {
2948
0
        if (h->u.huffman.codes[0].len == 0)
2949
0
            h->decode = cram_huffman_decode_long0;
2950
0
        else
2951
0
            h->decode = cram_huffman_decode_long;
2952
213
    } else if (option == E_INT || option == E_SINT || option == E_BYTE) {
2953
213
        if (h->u.huffman.codes[0].len == 0)
2954
69
            h->decode = cram_huffman_decode_int0;
2955
144
        else
2956
144
            h->decode = cram_huffman_decode_int;
2957
213
    } else {
2958
0
        return NULL;
2959
0
    }
2960
738
    h->describe = cram_huffman_describe;
2961
2962
738
    return (cram_codec *)h;
2963
2964
21
 malformed:
2965
21
    hts_log_error("Malformed huffman header stream");
2966
21
    free(codes);
2967
21
    free(h);
2968
21
    return NULL;
2969
738
}
2970
2971
/*
 * Byte encoder for the zero-length-code case: nothing is written to the
 * output and the input is not examined.
 *
 * Always returns 0.
 */
int cram_huffman_encode_char0(cram_slice *slice, cram_codec *c,
                              char *in, int in_size) {
    const int status = 0;   // zero bits per symbol, so there is no work to do

    return status;
}
2975
2976
/*
 * Huffman-encodes in_size bytes from 'in', appending each symbol's code to
 * c->out via store_bits_MSB.
 *
 * Symbols in the range [-1, MAX_HUFF) are located through the val2code
 * lookup table; any other symbol is found by a linear scan of the codes.
 *
 * Returns the OR of the store_bits_MSB results (0 when all succeeded),
 *         or -1 if a symbol has no code.
 */
int cram_huffman_encode_char(cram_slice *slice, cram_codec *c,
                             char *in, int in_size) {
    unsigned char *src = (unsigned char *)in;
    int status = 0;
    int remaining;

    for (remaining = in_size; remaining != 0; remaining--) {
        int sym = *src++;
        int pos, code, len;

        if (sym < -1 || sym >= MAX_HUFF) {
            /* Slow - use a lookup table for when sym < MAX_HUFF? */
            pos = 0;
            while (pos < c->u.e_huffman.nvals &&
                   c->u.e_huffman.codes[pos].symbol != sym)
                pos++;

            if (pos == c->u.e_huffman.nvals)
                return -1;
        } else {
            /* Direct lookup; slot 0 is reserved for symbol -1 */
            pos = c->u.e_huffman.val2code[sym+1];
            assert(c->u.e_huffman.codes[pos].symbol == sym);
        }

        code = c->u.e_huffman.codes[pos].code;
        len  = c->u.e_huffman.codes[pos].len;
        status |= store_bits_MSB(c->out, code, len);
    }

    return status;
}
3006
3007
/*
 * Integer encoder for the zero-length-code case: nothing is written to the
 * output and the input is not examined.
 *
 * Always returns 0.
 */
int cram_huffman_encode_int0(cram_slice *slice, cram_codec *c,
                             char *in, int in_size) {
    const int status = 0;   // zero bits per symbol, so there is no work to do

    return status;
}
3011
3012
/*
 * Huffman-encodes in_size ints from 'in' (read as an int array), appending
 * each symbol's code to c->out via store_bits_MSB.
 *
 * Symbols in the range [-1, MAX_HUFF) are located through the val2code
 * lookup table; any other symbol is found by a linear scan of the codes.
 *
 * Returns the OR of the store_bits_MSB results (0 when all succeeded),
 *         or -1 if a symbol has no code.
 */
int cram_huffman_encode_int(cram_slice *slice, cram_codec *c,
                            char *in, int in_size) {
    int *src = (int *)in;
    int status = 0;
    int remaining;

    for (remaining = in_size; remaining != 0; remaining--) {
        int sym = *src++;
        int pos, code, len;

        if (sym < -1 || sym >= MAX_HUFF) {
            /* Slow - use a lookup table for when sym < MAX_HUFFMAN_SYM? */
            pos = 0;
            while (pos < c->u.e_huffman.nvals &&
                   c->u.e_huffman.codes[pos].symbol != sym)
                pos++;

            if (pos == c->u.e_huffman.nvals)
                return -1;
        } else {
            /* Direct lookup; slot 0 is reserved for symbol -1 */
            pos = c->u.e_huffman.val2code[sym+1];
            assert(c->u.e_huffman.codes[pos].symbol == sym);
        }

        code = c->u.e_huffman.codes[pos].code;
        len  = c->u.e_huffman.codes[pos].len;
        status |= store_bits_MSB(c->out, code, len);
    }

    return status;
}
3043
3044
/*
 * 64-bit integer encoder for the zero-length-code case: nothing is written
 * to the output and the input is not examined.
 *
 * Always returns 0.
 */
int cram_huffman_encode_long0(cram_slice *slice, cram_codec *c,
                              char *in, int in_size) {
    const int status = 0;   // zero bits per symbol, so there is no work to do

    return status;
}
3048
3049
/*
 * Huffman-encodes in_size 64-bit integers from 'in' (read as an int64_t
 * array), appending each symbol's code to c->out via store_bits_MSB.
 *
 * Symbols in the range [-1, MAX_HUFF) are located through the val2code
 * lookup table; any other symbol is found by a linear scan of the codes.
 *
 * Returns the OR of the store_bits_MSB results (0 when all succeeded),
 *         or -1 if a symbol has no code.
 */
int cram_huffman_encode_long(cram_slice *slice, cram_codec *c,
                             char *in, int in_size) {
    int i, code, len, r = 0;
    int64_t *syms = (int64_t *)in;

    while (in_size--) {
        // Keep the full 64-bit value: narrowing to int would make large
        // symbols alias unrelated table entries.
        int64_t sym = *syms++;

        if (sym >= -1 && sym < MAX_HUFF) {
            i = c->u.e_huffman.val2code[sym+1];
            assert(c->u.e_huffman.codes[i].symbol == sym);
            code = c->u.e_huffman.codes[i].code;
            len  = c->u.e_huffman.codes[i].len;
        } else {
            /* Slow - use a lookup table for when sym < MAX_HUFFMAN_SYM? */
            for (i = 0; i < c->u.e_huffman.nvals; i++) {
                if (c->u.e_huffman.codes[i].symbol == sym)
                    break;
            }
            if (i == c->u.e_huffman.nvals)
                return -1;

            code = c->u.e_huffman.codes[i].code;
            len  = c->u.e_huffman.codes[i].len;
        }

        r |= store_bits_MSB(c->out, code, len);
    }

    return r;
}
3080
3081
943k
/*
 * Releases a huffman encoder: its code table followed by the codec itself.
 * A NULL codec is accepted and ignored.
 */
void cram_huffman_encode_free(cram_codec *c) {
    if (c != NULL) {
        /* free() ignores NULL, so no separate check on the table */
        free(c->u.e_huffman.codes);
        free(c);
    }
}
3089
3090
/*
3091
 * Encodes a huffman tree.
3092
 * Returns number of bytes written.
3093
 */
3094
int cram_huffman_encode_store(cram_codec *c, cram_block *b, char *prefix,
3095
942k
                              int version) {
3096
942k
    int i, len = 0, r = 0, n;
3097
942k
    cram_huffman_code *codes = c->u.e_huffman.codes;
3098
    /*
3099
     * Up to code length 127 means 2.5e+26 bytes of data required (worst
3100
     * case huffman tree needs symbols with freqs matching the Fibonacci
3101
     * series). So guaranteed 1 byte per code.
3102
     *
3103
     * Symbols themselves could be 5 bytes (eg -1 is 5 bytes in itf8).
3104
     *
3105
     * Therefore 6*ncodes + 5 + 5 + 1 + 5 is max memory
3106
     */
3107
942k
    char *tmp = malloc(6*c->u.e_huffman.nvals+16);
3108
942k
    char *tp = tmp, *tpend = tmp+6*c->u.e_huffman.nvals+16;
3109
3110
942k
    if (!tmp)
3111
0
        return -1;
3112
3113
942k
    if (prefix) {
3114
834k
        size_t l = strlen(prefix);
3115
834k
        BLOCK_APPEND(b, prefix, l);
3116
834k
        len += l;
3117
834k
    }
3118
3119
942k
    tp += c->vv->varint_put32(tp, tpend, c->u.e_huffman.nvals);
3120
942k
    if (c->u.e_huffman.option == E_LONG) {
3121
0
        for (i = 0; i < c->u.e_huffman.nvals; i++) {
3122
0
            tp += c->vv->varint_put64(tp, tpend, codes[i].symbol);
3123
0
        }
3124
942k
    } else if (c->u.e_huffman.option == E_SLONG) {
3125
0
        for (i = 0; i < c->u.e_huffman.nvals; i++) {
3126
0
            tp += c->vv->varint_put64s(tp, tpend, codes[i].symbol);
3127
0
        }
3128
942k
    } else if (c->u.e_huffman.option == E_INT || c->u.e_huffman.option == E_BYTE) {
3129
1.88M
        for (i = 0; i < c->u.e_huffman.nvals; i++) {
3130
942k
            tp += c->vv->varint_put32(tp, tpend, codes[i].symbol);
3131
942k
        }
3132
942k
    } else if (c->u.e_huffman.option == E_SINT) {
3133
0
        for (i = 0; i < c->u.e_huffman.nvals; i++) {
3134
0
            tp += c->vv->varint_put32s(tp, tpend, codes[i].symbol);
3135
0
        }
3136
0
    } else {
3137
0
        return -1;
3138
0
    }
3139
3140
942k
    tp += c->vv->varint_put32(tp, tpend, c->u.e_huffman.nvals);
3141
1.88M
    for (i = 0; i < c->u.e_huffman.nvals; i++)
3142
942k
        tp += c->vv->varint_put32(tp, tpend, codes[i].len);
3143
3144
942k
    len += (n = c->vv->varint_put32_blk(b, c->codec)); r |= n;
3145
942k
    len += (n = c->vv->varint_put32_blk(b, tp-tmp));   r |= n;
3146
942k
    BLOCK_APPEND(b, tmp, tp-tmp);
3147
942k
    len += tp-tmp;
3148
3149
942k
    free(tmp);
3150
3151
942k
    if (r > 0)
3152
942k
        return len;
3153
3154
0
 block_err:
3155
0
    return -1;
3156
942k
}
3157
3158
/*
 * Builds a huffman encoder from the symbol statistics in 'st'.
 *
 * Symbols and their frequencies are gathered from st->freqs[] and, when
 * present, the st->h hash.  A huffman tree is built to derive a code
 * length per symbol, the symbols are sorted with code_sort, and canonical
 * codes are assigned.  The encode callback is chosen from 'option' and
 * from whether the first code has zero length.
 *
 * Returns a newly allocated codec on success,
 *         NULL on allocation failure or if 'option' has no matching
 *         encoder; nothing is left allocated in that case.
 */
cram_codec *cram_huffman_encode_init(cram_stats *st,
                                     enum cram_encoding codec,
                                     enum cram_external_type option,
                                     void *dat,
                                     int version, varint_vec *vv) {
    int *vals = NULL, *freqs = NULL, *lens = NULL, code, len;
    int *new_vals, *new_freqs;
    int i, k;
    size_t nvals, vals_alloc = 0;
    cram_codec *c;
    cram_huffman_code *codes;

    c = malloc(sizeof(*c));
    if (!c)
        return NULL;
    c->codec = E_HUFFMAN;

    /* Count number of unique symbols */
    for (nvals = i = 0; i < MAX_STAT_VAL; i++) {
        if (!st->freqs[i])
            continue;
        if (nvals >= vals_alloc) {
            vals_alloc = vals_alloc ? vals_alloc*2 : 1024;
            new_vals  = realloc(vals,  vals_alloc * sizeof(int));
            if (!new_vals) goto nomem;
            vals = new_vals;
            new_freqs = realloc(freqs, vals_alloc * sizeof(int));
            if (!new_freqs) goto nomem;
            freqs = new_freqs;
        }
        vals[nvals] = i;
        freqs[nvals] = st->freqs[i];
        assert(st->freqs[i] > 0);
        nvals++;
    }
    if (st->h) {
        khint_t hk;

        /* Symbols recorded in the hash rather than in the freqs[] array */
        for (hk = kh_begin(st->h); hk != kh_end(st->h); hk++) {
            if (!kh_exist(st->h, hk))
                continue;
            if (nvals >= vals_alloc) {
                vals_alloc = vals_alloc ? vals_alloc*2 : 1024;
                new_vals  = realloc(vals,  vals_alloc * sizeof(int));
                if (!new_vals) goto nomem;
                vals = new_vals;
                new_freqs = realloc(freqs, vals_alloc * sizeof(int));
                if (!new_freqs) goto nomem;
                freqs = new_freqs;
            }
            vals[nvals]= kh_key(st->h, hk);
            freqs[nvals] = kh_val(st->h, hk);
            assert(freqs[nvals] > 0);
            nvals++;
        }
    }

    assert(nvals > 0);

    /* Room for the leaves plus the internal nodes created below */
    new_freqs = realloc(freqs, 2*nvals*sizeof(*freqs));
    if (!new_freqs) goto nomem;
    freqs = new_freqs;
    lens = calloc(2*nvals, sizeof(*lens));
    if (!lens) goto nomem;

    /* Inefficient, use pointers to form chain so we can insert and maintain
     * a sorted list? This is currently O(nvals^2) complexity.
     *
     * Each pass merges the two lowest-frequency live nodes into a new node
     * appended at index nvals.  lens[] temporarily holds each node's parent
     * index, and a merged node's frequency is negated to mark it as used.
     */
    for (;;) {
        int low1 = INT_MAX, low2 = INT_MAX;
        int ind1 = 0, ind2 = 0;
        for (i = 0; i < nvals; i++) {
            if (freqs[i] < 0)
                continue;
            if (low1 > freqs[i])
                low2 = low1, ind2 = ind1, low1 = freqs[i], ind1 = i;
            else if (low2 > freqs[i])
                low2 = freqs[i], ind2 = i;
        }
        if (low2 == INT_MAX)
            break;

        freqs[nvals] = low1 + low2;
        lens[ind1] = nvals;
        lens[ind2] = nvals;
        freqs[ind1] *= -1;
        freqs[ind2] *= -1;
        nvals++;
    }
    /* Back to the number of leaf symbols */
    nvals = nvals/2+1;

    /* Assign lengths */
    for (i = 0; i < nvals; i++) {
        int code_len = 0;
        /* Count the parent links followed from this leaf */
        for (k = lens[i]; k; k = lens[k])
            code_len++;
        lens[i] = code_len;
        freqs[i] *= -1;
    }


    /* Sort, need in a struct */
    if (!(codes = malloc(nvals * sizeof(*codes))))
        goto nomem;
    for (i = 0; i < nvals; i++) {
        codes[i].symbol = vals[i];
        codes[i].len = lens[i];
    }
    qsort(codes, nvals, sizeof(*codes), code_sort);

    /*
     * Generate canonical codes from lengths.
     * Sort by length.
     * Start with 0.
     * Every new code of same length is +1.
     * Every new code of new length is +1 then <<1 per extra length.
     *
     * /\
     * a/\
     * /\/\
     * bcd/\
     *    ef
     *
     * a 1  0
     * b 3  4 (0+1)<<2
     * c 3  5
     * d 3  6
     * e 4  14  (6+1)<<1
     * f 5  15
     */
    code = 0; len = codes[0].len;
    for (i = 0; i < nvals; i++) {
        while (len != codes[i].len) {
            code<<=1;
            len++;
        }
        codes[i].code = code++;

        if (codes[i].symbol >= -1 && codes[i].symbol < MAX_HUFF)
            c->u.e_huffman.val2code[codes[i].symbol+1] = i;
    }

    free(lens);
    free(vals);
    free(freqs);

    c->u.e_huffman.codes = codes;
    c->u.e_huffman.nvals = nvals;
    c->u.e_huffman.option = option;

    c->free = cram_huffman_encode_free;
    if (option == E_BYTE || option == E_BYTE_ARRAY) {
        if (c->u.e_huffman.codes[0].len == 0)
            c->encode = cram_huffman_encode_char0;
        else
            c->encode = cram_huffman_encode_char;
    } else if (option == E_INT || option == E_SINT) {
        if (c->u.e_huffman.codes[0].len == 0)
            c->encode = cram_huffman_encode_int0;
        else
            c->encode = cram_huffman_encode_int;
    } else if (option == E_LONG || option == E_SLONG) {
        if (c->u.e_huffman.codes[0].len == 0)
            c->encode = cram_huffman_encode_long0;
        else
            c->encode = cram_huffman_encode_long;
    } else {
        /* No encoder for this option.  lens/vals/freqs are already freed;
           release the code table and the codec too instead of leaking. */
        free(codes);
        free(c);
        return NULL;
    }
    c->store = cram_huffman_encode_store;
    c->flush = NULL;

    return c;

 nomem:
    hts_log_error("Out of memory");
    free(vals);
    free(freqs);
    free(lens);
    free(c);
    return NULL;
}
3348
3349
/*
3350
 * ---------------------------------------------------------------------------
3351
 * BYTE_ARRAY_LEN
3352
 */
3353
int cram_byte_array_len_decode(cram_slice *slice, cram_codec *c,
3354
                               cram_block *in, char *out,
3355
0
                               int *out_size) {
3356
    /* Fetch length */
3357
0
    int32_t len = 0, one = 1;
3358
0
    int r;
3359
3360
0
    r = c->u.byte_array_len.len_codec->decode(slice, c->u.byte_array_len.len_codec,
3361
0
                                              in, (char *)&len, &one);
3362
    //printf("ByteArray Len=%d\n", len);
3363
3364
0
    if (!r && c->u.byte_array_len.val_codec && len >= 0) {
3365
0
        r = c->u.byte_array_len.val_codec->decode(slice,
3366
0
                                                  c->u.byte_array_len.val_codec,
3367
0
                                                  in, out, &len);
3368
0
    } else {
3369
0
        return -1;
3370
0
    }
3371
3372
0
    *out_size = len;
3373
3374
0
    return r;
3375
0
}
3376
3377
3.02k
/*
 * Releases a BYTE_ARRAY_LEN decoder: each sub-codec that is present is
 * released through its own free callback, then the codec itself is freed.
 * A NULL codec is accepted and ignored.
 */
void cram_byte_array_len_decode_free(cram_codec *c) {
    cram_codec *sub;

    if (c == NULL)
        return;

    sub = c->u.byte_array_len.len_codec;
    if (sub != NULL)
        sub->free(sub);

    sub = c->u.byte_array_len.val_codec;
    if (sub != NULL)
        sub->free(sub);

    free(c);
}
3388
3389
0
/*
 * Appends a textual description of a BYTE_ARRAY_LEN decoder to 'ks', in
 * the form "BYTE_ARRAY_LEN(len_codec={...},val_codec={...}".  Each
 * sub-codec is described by its own describe callback, or shown as "?"
 * when it has none.
 *
 * Returns 0 on success, non-zero if any step reported a failure.
 */
int cram_byte_array_len_describe(cram_codec *c, kstring_t *ks) {
    cram_byte_array_len_decoder *bal = &c->u.byte_array_len;
    int failed = 0;

    failed |= ksprintf(ks, "BYTE_ARRAY_LEN(len_codec={") < 0;
    if (bal->len_codec->describe)
        failed |= bal->len_codec->describe(bal->len_codec, ks);
    else
        failed |= (ksprintf(ks, "?")<0);

    failed |= ksprintf(ks, "},val_codec={") < 0;
    if (bal->val_codec->describe)
        failed |= bal->val_codec->describe(bal->val_codec, ks);
    else
        failed |= (ksprintf(ks, "?")<0);

    failed |= ksprintf(ks, "}") < 0;

    return failed;
}
3404
3405
/*
 * Initialises a BYTE_ARRAY_LEN decoder from an encoding data stream.
 *
 * data[0..size-1] holds two consecutive sub-encodings, each stored as an
 * encoding id, a byte size, and that many bytes of parameters.  The first
 * is initialised as the length codec (with E_INT), the second as the value
 * codec (with 'option').  The two together must consume exactly 'size'
 * bytes.
 *
 * Returns a newly allocated codec on success,
 *         NULL on allocation failure, a malformed stream, or failure to
 *         initialise either sub-codec.
 */
cram_codec *cram_byte_array_len_decode_init(cram_block_compression_hdr *hdr,
                                            char *data, int size,
                                            enum cram_encoding codec,
                                            enum cram_external_type option,
                                            int version, varint_vec *vv) {
    char *cur = data;
    char *limit = data + size;
    int sub_encoding, sub_bytes;
    cram_codec *dec = malloc(sizeof(*dec));

    if (dec == NULL)
        return NULL;

    dec->codec    = E_BYTE_ARRAY_LEN;
    dec->decode   = cram_byte_array_len_decode;
    dec->free     = cram_byte_array_len_decode_free;
    dec->describe = cram_byte_array_len_describe;
    /* Both start out NULL so the free callback is safe on a partial codec */
    dec->u.byte_array_len.len_codec = NULL;
    dec->u.byte_array_len.val_codec = NULL;

    /* First sub-encoding: the lengths */
    sub_encoding = vv->varint_get32(&cur, limit, NULL);
    sub_bytes    = vv->varint_get32(&cur, limit, NULL);
    if (sub_bytes < 0 || limit - cur < sub_bytes)
        goto malformed;

    dec->u.byte_array_len.len_codec =
        cram_decoder_init(hdr, sub_encoding, cur, sub_bytes,
                          E_INT, version, vv);
    if (dec->u.byte_array_len.len_codec == NULL)
        goto no_codec;
    cur += sub_bytes;

    /* Second sub-encoding: the values */
    sub_encoding = vv->varint_get32(&cur, limit, NULL);
    sub_bytes    = vv->varint_get32(&cur, limit, NULL);
    if (sub_bytes < 0 || limit - cur < sub_bytes)
        goto malformed;

    dec->u.byte_array_len.val_codec =
        cram_decoder_init(hdr, sub_encoding, cur, sub_bytes,
                          option, version, vv);
    if (dec->u.byte_array_len.val_codec == NULL)
        goto no_codec;
    cur += sub_bytes;

    /* Trailing or missing bytes mean the header is not what we expect */
    if (cur - data != size)
        goto malformed;

    return dec;

 malformed:
    hts_log_error("Malformed byte_array_len header stream");
 no_codec:
    cram_byte_array_len_decode_free(dec);
    return NULL;
}
3455
3456
/*
 * Encodes one BYTE_ARRAY_LEN item: in_size is passed to the length
 * sub-codec as a single 32-bit value, then the in_size bytes at 'in' are
 * passed to the value sub-codec.
 *
 * Returns the OR of the two sub-codec results (0 when both succeeded).
 */
int cram_byte_array_len_encode(cram_slice *slice, cram_codec *c,
                               char *in, int in_size) {
    int32_t item_len = in_size;
    int len_status, val_status;

    len_status = c->u.e_byte_array_len.len_codec->encode(
                     slice, c->u.e_byte_array_len.len_codec,
                     (char *)&item_len, 1);
    val_status = c->u.e_byte_array_len.val_codec->encode(
                     slice, c->u.e_byte_array_len.val_codec,
                     in, in_size);

    return len_status | val_status;
}
3469
3470
173k
/*
 * Releases a BYTE_ARRAY_LEN encoder: each sub-codec that is present is
 * released through its own free callback, then the codec itself is freed.
 * A NULL codec is accepted and ignored.
 */
void cram_byte_array_len_encode_free(cram_codec *c) {
    cram_codec *sub;

    if (c == NULL)
        return;

    sub = c->u.e_byte_array_len.len_codec;
    if (sub != NULL)
        sub->free(sub);

    sub = c->u.e_byte_array_len.val_codec;
    if (sub != NULL)
        sub->free(sub);

    free(c);
}
3482
3483
/*
 * Serialises a BYTE_ARRAY_LEN encoder into block 'b'.
 *
 * Each sub-codec is first stored into its own temporary block.  Then the
 * optional 'prefix', the codec id, the combined size of the two
 * sub-encodings, and the two temporary blocks' contents are appended to
 * 'b'.
 *
 * Returns the number of bytes written,
 *         or -1 on failure; the temporary blocks are released either way.
 */
int cram_byte_array_len_encode_store(cram_codec *c, cram_block *b,
                                     char *prefix, int version) {
    int len = 0, len2, len3, r = 0, n;
    cram_codec *tc;
    cram_block *b_len = NULL, *b_val = NULL;

    if (prefix) {
        size_t l = strlen(prefix);
        BLOCK_APPEND(b, prefix, l);
        len += l;
    }

    /* Length sub-codec, stored to a scratch block so its size is known */
    tc = c->u.e_byte_array_len.len_codec;
    b_len = cram_new_block(0, 0);
    if (!b_len) goto block_err;
    len2 = tc->store(tc, b_len, NULL, version);
    if (len2 < 0) goto block_err;

    /* Value sub-codec, likewise */
    tc = c->u.e_byte_array_len.val_codec;
    b_val = cram_new_block(0, 0);
    if (!b_val) goto block_err;
    len3 = tc->store(tc, b_val, NULL, version);
    if (len3 < 0) goto block_err;

    len += (n = c->vv->varint_put32_blk(b, c->codec));  r |= n;
    len += (n = c->vv->varint_put32_blk(b, len2+len3)); r |= n;
    BLOCK_APPEND(b, BLOCK_DATA(b_len), BLOCK_SIZE(b_len));
    BLOCK_APPEND(b, BLOCK_DATA(b_val), BLOCK_SIZE(b_val));

    cram_free_block(b_len);
    cram_free_block(b_val);

    /* Both scratch blocks are gone: return directly rather than falling
       into block_err, which would free them a second time. */
    return r > 0 ? len + len2 + len3 : -1;

 block_err:
    if (b_len) cram_free_block(b_len);
    if (b_val) cram_free_block(b_val);
    return -1;
}
3523
3524
/*
 * Creates a BYTE_ARRAY_LEN encoder.
 *
 * 'dat' is read as a cram_byte_array_len_encoder naming the encodings and
 * parameters for the two sub-codecs.  The length codec is initialised with
 * 'st' and E_INT; the value codec with no stats and E_BYTE_ARRAY.
 *
 * Returns a newly allocated codec on success,
 *         NULL on allocation failure or if either sub-codec could not be
 *         initialised.
 */
cram_codec *cram_byte_array_len_encode_init(cram_stats *st,
                                            enum cram_encoding codec,
                                            enum cram_external_type option,
                                            void *dat,
                                            int version, varint_vec *vv) {
    cram_byte_array_len_encoder *spec = (cram_byte_array_len_encoder *)dat;
    cram_codec *enc = malloc(sizeof(*enc));

    if (enc == NULL)
        return NULL;

    enc->codec  = E_BYTE_ARRAY_LEN;
    enc->free   = cram_byte_array_len_encode_free;
    enc->encode = cram_byte_array_len_encode;
    enc->store  = cram_byte_array_len_encode_store;
    enc->flush  = NULL;

    /* Both sub-codecs are created before either result is checked */
    enc->u.e_byte_array_len.len_codec =
        cram_encoder_init(spec->len_encoding, st, E_INT,
                          spec->len_dat, version, vv);
    enc->u.e_byte_array_len.val_codec =
        cram_encoder_init(spec->val_encoding, NULL, E_BYTE_ARRAY,
                          spec->val_dat, version, vv);

    if (enc->u.e_byte_array_len.len_codec &&
        enc->u.e_byte_array_len.val_codec)
        return enc;

    /* The free callback also releases whichever sub-codec did succeed */
    cram_byte_array_len_encode_free(enc);
    return NULL;
}
3558
3559
/*
3560
 * ---------------------------------------------------------------------------
3561
 * BYTE_ARRAY_STOP
3562
 */
3563
static int cram_byte_array_stop_decode_char(cram_slice *slice, cram_codec *c,
3564
                                            cram_block *in, char *out,
3565
0
                                            int *out_size) {
3566
0
    char *cp, ch;
3567
0
    cram_block *b = NULL;
3568
3569
0
    b = cram_get_block_by_id(slice, c->u.byte_array_stop.content_id);
3570
0
    if (!b)
3571
0
        return *out_size?-1:0;
3572
3573
0
    if (b->idx >= b->uncomp_size)
3574
0
        return -1;
3575
3576
0
    cp = (char *)b->data + b->idx;
3577
0
    if (out) {
3578
       // memccpy equivalent but without copying the terminating byte
3579
0
        ssize_t term = MIN(*out_size, b->uncomp_size - b->idx);
3580
0
        while ((ch = *cp) != (char)c->u.byte_array_stop.stop) {
3581
0
            if (term-- < 0)
3582
0
                break;
3583
0
            *out++ = ch;
3584
0
            cp++;
3585
0
        }
3586
3587
        // Attempted overrun on input or output
3588
0
        if (ch != (char)c->u.byte_array_stop.stop)
3589
0
            return -1;
3590
0
    } else {
3591
        // Consume input, but produce no output
3592
0
        while ((ch = *cp) != (char)c->u.byte_array_stop.stop) {
3593
0
            if (cp - (char *)b->data >= b->uncomp_size)
3594
0
                return -1;
3595
0
            cp++;
3596
0
        }
3597
0
    }
3598
3599
0
    *out_size = cp - (char *)(b->data + b->idx);
3600
0
    b->idx = cp - (char *)b->data + 1;
3601
3602
0
    return 0;
3603
0
}
3604
3605
int cram_byte_array_stop_decode_block(cram_slice *slice, cram_codec *c,
3606
                                      cram_block *in, char *out_,
3607
0
                                      int *out_size) {
3608
0
    cram_block *b;
3609
0
    cram_block *out = (cram_block *)out_;
3610
0
    unsigned char *cp, *cp_end;
3611
0
    unsigned char stop;
3612
3613
0
    b = cram_get_block_by_id(slice, c->u.byte_array_stop.content_id);
3614
0
    if (!b)
3615
0
        return *out_size?-1:0;
3616
3617
0
    if (b->idx >= b->uncomp_size)
3618
0
        return -1;
3619
0
    cp = b->data + b->idx;
3620
0
    cp_end = b->data + b->uncomp_size;
3621
3622
    // STOP byte is hard-coded as zero by our name tokeniser decoder
3623
    // implementation, so we may ignore what was requested.
3624
0
    stop = b->orig_method == TOK3 ? 0 : c->u.byte_array_stop.stop;
3625
3626
0
    if (cp_end - cp < out->alloc - out->byte) {
3627
0
        unsigned char *out_cp = BLOCK_END(out);
3628
0
        while (cp != cp_end && *cp != stop)
3629
0
            *out_cp++ = *cp++;
3630
0
        BLOCK_SIZE(out) = out_cp - BLOCK_DATA(out);
3631
0
    } else {
3632
0
        unsigned char *cp_start;
3633
0
        for (cp_start = cp; cp != cp_end && *cp != stop; cp++)
3634
0
            ;
3635
0
        BLOCK_APPEND(out, cp_start, cp - cp_start);
3636
0
        BLOCK_GROW(out, cp - cp_start);
3637
0
    }
3638
3639
0
    *out_size = cp - (b->data + b->idx);
3640
0
    b->idx = cp - b->data + 1;
3641
3642
0
    return 0;
3643
3644
0
 block_err:
3645
0
    return -1;
3646
0
}
3647
3648
642
/*
 * Releases a BYTE_ARRAY_STOP decoder.  A NULL codec is accepted.
 */
void cram_byte_array_stop_decode_free(cram_codec *c) {
    // free(NULL) is a no-op, so no explicit NULL test is required.
    free(c);
}
3653
3654
0
/*
 * Appends a textual description of a BYTE_ARRAY_STOP decoder (its stop
 * byte and content id) to ks.
 *
 * Returns 0 on success;
 *        -1 if ksprintf reports a failure.
 */
int cram_byte_array_stop_describe(cram_codec *c, kstring_t *ks) {
    int written = ksprintf(ks, "BYTE_ARRAY_STOP(stop=%d,id=%d)",
                           c->u.byte_array_stop.stop,
                           c->u.byte_array_stop.content_id);

    if (written < 0)
        return -1;

    return 0;
}
3660
3661
/*
 * Parses a BYTE_ARRAY_STOP codec header and builds the matching decoder.
 *
 * The header is one stop byte followed by the content id: a 4 byte
 * little-endian value for CRAM major version 1, a varint otherwise.
 * The header must be exactly size bytes long.
 *
 * option selects the decode function: E_BYTE_ARRAY_BLOCK or E_BYTE_ARRAY.
 * Any other option is rejected.
 *
 * Returns the new codec on success;
 *         NULL on failure.
 */
cram_codec *cram_byte_array_stop_decode_init(cram_block_compression_hdr *hdr,
                                             char *data, int size,
                                             enum cram_encoding codec,
                                             enum cram_external_type option,
                                             int version, varint_vec *vv) {
    const int is_v1 = (CRAM_MAJOR_VERS(version) == 1);
    unsigned char *p = (unsigned char *)data;
    cram_codec *dec = NULL;
    int err = 0;

    // Smallest possible header: stop byte plus either a fixed 4 byte
    // id (v1) or a 1 byte varint.
    if (size < (is_v1 ? 5 : 2))
        goto malformed;

    dec = malloc(sizeof(*dec));
    if (dec == NULL)
        return NULL;

    dec->codec = E_BYTE_ARRAY_STOP;

    if (option == E_BYTE_ARRAY_BLOCK) {
        dec->decode = cram_byte_array_stop_decode_block;
    } else if (option == E_BYTE_ARRAY) {
        dec->decode = cram_byte_array_stop_decode_char;
    } else {
        hts_log_error("The byte_array_stop codec only supports BYTE_ARRAYs");
        free(dec);
        return NULL;
    }

    dec->free     = cram_byte_array_stop_decode_free;
    dec->describe = cram_byte_array_stop_describe;

    dec->u.byte_array_stop.stop = *p++;

    if (is_v1) {
        dec->u.byte_array_stop.content_id = p[0] + (p[1]<<8) + (p[2]<<16)
            + ((unsigned int) p[3]<<24);
        p += 4;
    } else {
        dec->u.byte_array_stop.content_id =
            vv->varint_get32((char **)&p, data+size, &err);
    }

    // The header must be consumed exactly, with no varint errors.
    if (err || (char *)p - data != size)
        goto malformed;

    return dec;

 malformed:
    hts_log_error("Malformed byte_array_stop header stream");
    free(dec);
    return NULL;
}
3711
3712
/*
 * Appends in_size bytes from in to the codec's output block, followed by
 * the codec's stop byte.
 *
 * Returns 0 on success;
 *        -1 on failure (via the BLOCK_* macros jumping to block_err).
 */
int cram_byte_array_stop_encode(cram_slice *slice, cram_codec *c,
                                char *in, int in_size) {
    cram_block *dst = c->out;

    // Payload first, then the terminator.
    BLOCK_APPEND(dst, in, in_size);
    BLOCK_APPEND_CHAR(dst, c->u.e_byte_array_stop.stop);

    return 0;

 block_err:
    return -1;
}
3721
3722
254k
/*
 * Releases a BYTE_ARRAY_STOP encoder.  A NULL codec is accepted.
 */
void cram_byte_array_stop_encode_free(cram_codec *c) {
    // free(NULL) is a no-op, so no explicit NULL test is required.
    free(c);
}
3727
3728
/*
 * Serialises a BYTE_ARRAY_STOP encoder description to block b, preceded
 * by prefix when one is given.
 *
 * The output is: codec id (varint), parameter length (varint), the stop
 * byte and then the content id.  For CRAM major version 1 the content
 * id is a fixed 4 byte little-endian value; otherwise it is a varint.
 *
 * Returns the number of bytes appended on success;
 *        -1 on failure.
 */
int cram_byte_array_stop_encode_store(cram_codec *c, cram_block *b,
                                      char *prefix, int version) {
    char hdr[20];
    char *wp = hdr, *hdr_end = hdr + 20;
    int total = 0;

    if (prefix) {
        size_t plen = strlen(prefix);
        BLOCK_APPEND(b, prefix, plen);
        total += plen;
    }

    wp += c->vv->varint_put32(wp, hdr_end, c->codec);

    if (CRAM_MAJOR_VERS(version) == 1) {
        int shift;

        // Fixed size payload: 1 stop byte + 4 byte content id.
        wp += c->vv->varint_put32(wp, hdr_end, 5);
        *wp++ = c->u.e_byte_array_stop.stop;

        // Content id, least significant byte first.
        for (shift = 0; shift < 32; shift += 8)
            *wp++ = (c->u.e_byte_array_stop.content_id >> shift) & 0xff;
    } else {
        // Payload: 1 stop byte + however long the varint id is.
        wp += c->vv->varint_put32(wp, hdr_end, 1 +
                                  c->vv->varint_size(c->u.e_byte_array_stop.content_id));
        *wp++ = c->u.e_byte_array_stop.stop;
        wp += c->vv->varint_put32(wp, hdr_end,
                                  c->u.e_byte_array_stop.content_id);
    }

    BLOCK_APPEND(b, hdr, wp-hdr);
    total += wp-hdr;

    return total;

 block_err:
    return -1;
}
3763
3764
/*
 * Creates a BYTE_ARRAY_STOP encoder.
 *
 * dat is read as an array of two ints: the stop byte value followed by
 * the content id.  The st, codec, option, version and vv arguments are
 * not consulted here.
 *
 * Returns the new codec on success;
 *         NULL on memory allocation failure.
 */
cram_codec *cram_byte_array_stop_encode_init(cram_stats *st,
                                             enum cram_encoding codec,
                                             enum cram_external_type option,
                                             void *dat,
                                             int version, varint_vec *vv) {
    int *params = (int *)dat;
    cram_codec *enc = malloc(sizeof(*enc));

    if (enc == NULL)
        return NULL;

    enc->codec  = E_BYTE_ARRAY_STOP;
    enc->free   = cram_byte_array_stop_encode_free;
    enc->encode = cram_byte_array_stop_encode;
    enc->store  = cram_byte_array_stop_encode_store;
    enc->flush  = NULL;

    enc->u.e_byte_array_stop.stop       = params[0];
    enc->u.e_byte_array_stop.content_id = params[1];

    return enc;
}
3785
3786
/*
3787
 * ---------------------------------------------------------------------------
3788
 */
3789
3790
107
const char *cram_encoding2str(enum cram_encoding t) {
3791
107
    switch (t) {
3792
11
    case E_NULL:            return "NULL";
3793
0
    case E_EXTERNAL:        return "EXTERNAL";
3794
6
    case E_GOLOMB:          return "GOLOMB";
3795
0
    case E_HUFFMAN:         return "HUFFMAN";
3796
0
    case E_BYTE_ARRAY_LEN:  return "BYTE_ARRAY_LEN";
3797
0
    case E_BYTE_ARRAY_STOP: return "BYTE_ARRAY_STOP";
3798
37
    case E_BETA:            return "BETA";
3799
0
    case E_SUBEXP:          return "SUBEXP";
3800
0
    case E_GOLOMB_RICE:     return "GOLOMB_RICE";
3801
0
    case E_GAMMA:           return "GAMMA";
3802
3803
0
    case E_VARINT_UNSIGNED: return "VARINT_UNSIGNED";
3804
0
    case E_VARINT_SIGNED:   return "VARINT_SIGNED";
3805
0
    case E_CONST_BYTE:      return "CONST_BYTE";
3806
0
    case E_CONST_INT:       return "CONST_INT";
3807
3808
0
    case E_NUM_CODECS:
3809
53
    default:                return "?";
3810
107
    }
3811
107
}
3812
3813
/*
 * Table of decoder constructors, indexed by enum cram_encoding value
 * (see cram_decoder_init).  A NULL entry means no decoder is available
 * for that codec number.
 */
static cram_codec *(*decode_init[])(cram_block_compression_hdr *hdr,
3814
                                    char *data,
3815
                                    int size,
3816
                                    enum cram_encoding codec,
3817
                                    enum cram_external_type option,
3818
                                    int version, varint_vec *vv) = {
3819
    // CRAM 3.0 valid codecs; entries 0 to 9
3820
    NULL, // null codec
3821
    cram_external_decode_init,
3822
    NULL, // golomb
3823
    cram_huffman_decode_init,
3824
    cram_byte_array_len_decode_init,
3825
    cram_byte_array_stop_decode_init,
3826
    cram_beta_decode_init,
3827
    cram_subexp_decode_init,
3828
    NULL, // golomb rice
3829
    cram_gamma_decode_init,
3830
3831
    // Gap between CRAM 3 and CRAM 4; 10 to 39 inclusive
3832
    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
3833
    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
3834
    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
3835
3836
    NULL,                      // was xbyte
3837
    cram_varint_decode_init,   // varint unsigned
3838
    cram_varint_decode_init,   // varint signed
3839
    cram_const_decode_init,    // const byte
3840
    cram_const_decode_init,    // const int
3841
3842
    // Gap to CRAM 4 transformations; 45 to 49 inclusive
3843
    NULL, NULL, NULL, NULL, NULL,
3844
3845
    NULL, // xhuffman
3846
    cram_xpack_decode_init,
3847
    cram_xrle_decode_init,
3848
    cram_xdelta_decode_init,
3849
};
3850
3851
/*
 * Creates a decoder for the given codec by dispatching to the matching
 * constructor in the decode_init table.
 *
 * On success the new codec records vv and is given the next codec_id
 * from hdr->ncodecs, which is incremented.
 *
 * Returns the new codec on success;
 *         NULL if the codec is out of range, has no decoder, or its
 *         constructor fails.
 */
cram_codec *cram_decoder_init(cram_block_compression_hdr *hdr,
                              enum cram_encoding codec,
                              char *data, int size,
                              enum cram_external_type option,
                              int version, varint_vec *vv) {
    cram_codec *dec;

    // Range check comes first so the table is never indexed out of bounds.
    if (codec < E_NULL || codec >= E_NUM_CODECS || !decode_init[codec]) {
        hts_log_error("Unimplemented codec of type %s", cram_encoding2str(codec));
        return NULL;
    }

    dec = decode_init[codec](hdr, data, size, codec, option, version, vv);
    if (dec == NULL)
        return NULL;

    dec->vv = vv;
    dec->codec_id = hdr->ncodecs++;

    return dec;
}
3869
3870
/*
 * Table of encoder constructors, indexed by enum cram_encoding value
 * (see cram_encoder_init).  A NULL entry means no encoder is available
 * for that codec number.
 */
static cram_codec *(*encode_init[])(cram_stats *stx,
3871
                                    enum cram_encoding codec,
3872
                                    enum cram_external_type option,
3873
                                    void *opt,
3874
                                    int version, varint_vec *vv) = {
3875
    // CRAM 3.0 valid codecs; entries 0 to 9
3876
    NULL, // null codec
3877
    cram_external_encode_init, // int/bytes in cram 3, byte only in cram 4
3878
    NULL, // golomb
3879
    cram_huffman_encode_init,
3880
    cram_byte_array_len_encode_init,
3881
    cram_byte_array_stop_encode_init,
3882
    cram_beta_encode_init,
3883
    NULL, // subexponential (we support decode only)
3884
    NULL, // golomb rice
3885
    NULL, // gamma (we support decode only)
3886
3887
    // Gap between CRAM 3 and CRAM 4; 10 to 39 inclusive
3888
    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
3889
    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
3890
    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
3891
3892
    NULL, // was xbyte
3893
    cram_varint_encode_init, // varint unsigned
3894
    cram_varint_encode_init, // varint signed
3895
    cram_const_encode_init,  // const byte
3896
    cram_const_encode_init,  // const int
3897
3898
    // Gap to CRAM 4 transformations; 45 to 49 inclusive
3899
    NULL, NULL, NULL, NULL, NULL,
3900
3901
    NULL, // xhuffman
3902
    cram_xpack_encode_init,
3903
    cram_xrle_encode_init,
3904
    cram_xdelta_encode_init,
3905
};
3906
3907
cram_codec *cram_encoder_init(enum cram_encoding codec,
3908
                              cram_stats *st,
3909
                              enum cram_external_type option,
3910
                              void *dat,
3911
2.24M
                              int version, varint_vec *vv) {
3912
2.24M
    if (st && !st->nvals)
3913
536k
        return NULL;
3914
3915
    // cram_stats_encoding assumes integer data, but if option
3916
    // is E_BYTE then tweak the requested encoding.  This ought
3917
    // to be fixed in cram_stats_encoding instead.
3918
1.71M
    if (option == E_BYTE || option == E_BYTE_ARRAY ||
3919
1.71M
       option == E_BYTE_ARRAY_BLOCK) {
3920
697k
       if (codec == E_VARINT_SIGNED || codec == E_VARINT_UNSIGNED)
3921
0
           codec = E_EXTERNAL;
3922
697k
       else if (codec == E_CONST_INT)
3923
0
           codec = E_CONST_BYTE;
3924
697k
    }
3925
3926
1.71M
    if (encode_init[codec]) {
3927
1.71M
        cram_codec *r;
3928
1.71M
        if ((r = encode_init[codec](st, codec, option, dat, version, vv)))
3929
1.71M
            r->out = NULL;
3930
1.71M
        if (!r) {
3931
37
            hts_log_error("Unable to initialise codec of type %s", cram_encoding2str(codec));
3932
37
            return NULL;
3933
37
        }
3934
1.71M
        r->vv = vv;
3935
1.71M
        return r;
3936
1.71M
    } else {
3937
0
        hts_log_error("Unimplemented codec of type %s", cram_encoding2str(codec));
3938
0
        abort();
3939
0
    }
3940
1.71M
}
3941
3942
/*
3943
 * Returns the content_id used by this codec, also in id2 if byte_array_len.
3944
 * Returns -1 for the CORE block and -2 for unneeded.
3945
 * id2 is only filled out for BYTE_ARRAY_LEN which uses 2 codecs.
3946
 */
3947
0
/*
 * Reports which block(s) a codec reads from or writes to.
 *
 * The return value is the content id of the first block: -1 for the
 * CORE block and -2 when no block is needed.  When id2 is non-NULL it
 * receives the second content id, which is only meaningful for
 * BYTE_ARRAY_LEN (length codec in the return value, value codec in
 * *id2) and is -2 otherwise.
 *
 * Unknown codec types are logged and reported as -1.
 */
int cram_codec_to_id(cram_codec *c, int *id2) {
    int first = -2, second = -2;   // default: no blocks used

    switch (c->codec) {
    case E_CONST_INT:
    case E_CONST_BYTE:
    case E_NULL:
        // no blocks used
        break;

    case E_HUFFMAN:
        // A single code needs no stored bits at all
        if (c->u.huffman.ncodes != 1)
            first = -1;
        break;

    case E_GOLOMB:
    case E_BETA:
    case E_SUBEXP:
    case E_GOLOMB_RICE:
    case E_GAMMA:
        // CORE block
        first = -1;
        break;

    case E_EXTERNAL:
    case E_VARINT_UNSIGNED:
    case E_VARINT_SIGNED:
        first = c->u.external.content_id;
        break;

    case E_BYTE_ARRAY_LEN:
        first  = cram_codec_to_id(c->u.byte_array_len.len_codec, NULL);
        second = cram_codec_to_id(c->u.byte_array_len.val_codec, NULL);
        break;

    case E_BYTE_ARRAY_STOP:
        first = c->u.byte_array_stop.content_id;
        break;

    default:
        hts_log_error("Unknown codec type %d", c->codec);
        first = -1;
        break;
    }

    if (id2 != NULL)
        *id2 = second;

    return first;
}
3997
3998
3999
/*
4000
 * cram_codec structures are specialised for decoding or encoding.
4001
 * Unfortunately this makes turning a decoder into an encoder (such as
4002
 * when transcoding files) problematic.
4003
 *
4004
 * This function converts a cram decoder codec into an encoder version
4005
 * in-place (ie it modifies the codec itself).
4006
 *
4007
 * Returns 0 on success;
4008
 *        -1 on failure.
4009
 */
4010
0
/*
 * Converts the decoder codec c into the equivalent encoder, in place.
 *
 * Codecs whose encoder shares the decoder's struct only have their
 * function pointers replaced.  HUFFMAN and BYTE_ARRAY_LEN use a
 * different struct for encoding, so a temporary codec is filled in and
 * then copied over *c.  XPACK and BYTE_ARRAY_LEN convert their
 * sub-codecs recursively.
 *
 * Returns 0 on success;
 *        -1 on failure (unsupported codec, unrecognised decode function
 *           or memory allocation failure).
 */
int cram_codec_decoder2encoder(cram_fd *fd, cram_codec *c) {
    int j;

    switch (c->codec) {
    case E_CONST_INT:
    case E_CONST_BYTE:
        // shares struct with decode
        c->store = cram_const_encode_store;
        break;

    case E_EXTERNAL:
        // shares struct with decode
        c->free = cram_external_encode_free;
        c->store = cram_external_encode_store;
        if (c->decode == cram_external_decode_int)
            c->encode = cram_external_encode_int;
        else if (c->decode == cram_external_decode_long)
            c->encode = cram_external_encode_long;
        else if (c->decode == cram_external_decode_char)
            c->encode = cram_external_encode_char;
        else if (c->decode == cram_external_decode_block)
            c->encode = cram_external_encode_char;
        else
            return -1;
        break;

    case E_VARINT_SIGNED:
    case E_VARINT_UNSIGNED:
        // shares struct with decode
        c->free = cram_varint_encode_free;
        c->store = cram_varint_encode_store;
        if (c->decode == cram_varint_decode_int)
            c->encode = cram_varint_encode_int;
        else if (c->decode == cram_varint_decode_sint)
            c->encode = cram_varint_encode_sint;
        else if (c->decode == cram_varint_decode_long)
            c->encode = cram_varint_encode_long;
        else if (c->decode == cram_varint_decode_slong)
            c->encode = cram_varint_encode_slong;
        else
            return -1;
        break;

    case E_HUFFMAN: {
        // New structure, so switch.
        // FIXME: with the huffman and e_huffman structs amended, we
        // could unify this.
        cram_codec *t = malloc(sizeof(*t));
        if (!t) return -1;
        t->vv     = c->vv;
        t->codec = E_HUFFMAN;
        t->free = cram_huffman_encode_free;
        t->store = cram_huffman_encode_store;
        t->u.e_huffman.codes = c->u.huffman.codes;
        t->u.e_huffman.nvals = c->u.huffman.ncodes;
        t->u.e_huffman.option = c->u.huffman.option;

        // Build the symbol -> code index lookup.  Slot 0 holds symbol -1,
        // hence the +1 offset; symbols outside the table are skipped.
        for (j = 0; j < t->u.e_huffman.nvals; j++) {
            int32_t sym = t->u.e_huffman.codes[j].symbol;
            if (sym >= -1 && sym < MAX_HUFF)
                t->u.e_huffman.val2code[sym+1] = j;
        }

        if (c->decode == cram_huffman_decode_char0)
            t->encode = cram_huffman_encode_char0;
        else if (c->decode == cram_huffman_decode_char)
            t->encode = cram_huffman_encode_char;
        else if (c->decode == cram_huffman_decode_int0)
            t->encode = cram_huffman_encode_int0;
        else if (c->decode == cram_huffman_decode_int)
            t->encode = cram_huffman_encode_int;
        else if (c->decode == cram_huffman_decode_long0)
            t->encode = cram_huffman_encode_long0;
        else if (c->decode == cram_huffman_decode_long)
            t->encode = cram_huffman_encode_long;
        else {
            // t only borrows c's code table, so release just the shell.
            free(t);
            return -1;
        }
        *c = *t;
        free(t);
        break;
    }

    case E_BETA:
        // shares struct with decode
        c->free = cram_beta_encode_free;
        c->store = cram_beta_encode_store;
        if (c->decode == cram_beta_decode_int)
            c->encode = cram_beta_encode_int;
        else if (c->decode == cram_beta_decode_long)
            c->encode = cram_beta_encode_long;
        else if (c->decode == cram_beta_decode_char)
            c->encode = cram_beta_encode_char;
        else
            return -1;
        break;

    case E_XPACK: {
        // shares struct with decode
        cram_codec t = *c;
        t.free = cram_xpack_encode_free;
        t.store = cram_xpack_encode_store;
        if (t.decode == cram_xpack_decode_long)
            t.encode = cram_xpack_encode_long;
        else if (t.decode == cram_xpack_decode_int)
            t.encode = cram_xpack_encode_int;
        else if (t.decode == cram_xpack_decode_char)
            t.encode = cram_xpack_encode_char;
        else
            return -1;
        t.u.e_xpack.sub_codec = t.u.xpack.sub_codec;
        if (cram_codec_decoder2encoder(fd, t.u.e_xpack.sub_codec) == -1)
            return -1;
        *c = t;
        break;
    }

    case E_BYTE_ARRAY_LEN: {
        cram_codec *t = malloc(sizeof(*t));
        if (!t) return -1;
        t->vv     = c->vv;
        t->codec  = E_BYTE_ARRAY_LEN;
        t->free   = cram_byte_array_len_encode_free;
        t->store  = cram_byte_array_len_encode_store;
        t->encode = cram_byte_array_len_encode;
        t->u.e_byte_array_len.len_codec = c->u.byte_array_len.len_codec;
        t->u.e_byte_array_len.val_codec = c->u.byte_array_len.val_codec;
        if (cram_codec_decoder2encoder(fd, t->u.e_byte_array_len.len_codec) == -1 ||
            cram_codec_decoder2encoder(fd, t->u.e_byte_array_len.val_codec) == -1) {
            // The sub-codecs are only borrowed from c, which has not
            // been modified and still references them.  Release just
            // the temporary shell: t->free() is expected to destroy the
            // sub-codecs as well, which would leave c holding dangling
            // pointers.
            free(t);
            return -1;
        }

        // {len,val}_{encoding,dat} are undefined, but unused.
        // Leaving them unset here means we can test that assertion.
        *c = *t;
        free(t);
        break;
    }

    case E_BYTE_ARRAY_STOP:
        // shares struct with decode
        c->free   = cram_byte_array_stop_encode_free;
        c->store  = cram_byte_array_stop_encode_store;
        c->encode = cram_byte_array_stop_encode;
        break;

    default:
        return -1;
    }

    return 0;
}
4163
4164
0
/*
 * Appends a textual description of codec c to ks, using the codec's own
 * describe callback.  When c is NULL or has no callback, "?" is
 * appended instead.
 *
 * Returns the callback's result, or the ksprintf result for the "?"
 * fallback.
 */
int cram_codec_describe(cram_codec *c, kstring_t *ks) {
    if (!c || !c->describe)
        return ksprintf(ks, "?");

    return c->describe(c, ks);
}