Coverage Report

Created: 2026-05-16 07:02

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/htslib/cram/cram_codecs.c
Line
Count
Source
1
/*
2
Copyright (c) 2012-2021,2023, 2025, 2026 Genome Research Ltd.
3
Author: James Bonfield <jkb@sanger.ac.uk>
4
5
Redistribution and use in source and binary forms, with or without
6
modification, are permitted provided that the following conditions are met:
7
8
   1. Redistributions of source code must retain the above copyright notice,
9
this list of conditions and the following disclaimer.
10
11
   2. Redistributions in binary form must reproduce the above copyright notice,
12
this list of conditions and the following disclaimer in the documentation
13
and/or other materials provided with the distribution.
14
15
   3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
16
Institute nor the names of its contributors may be used to endorse or promote
17
products derived from this software without specific prior written permission.
18
19
THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
20
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
23
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
*/
30
31
/*
32
 * FIXME: add checking of cram_external_type to return NULL on unsupported
33
 * {codec,type} tuples.
34
 */
35
36
#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
37
#include <config.h>
38
39
#include <stdlib.h>
40
#include <string.h>
41
#include <assert.h>
42
#include <limits.h>
43
#include <stdint.h>
44
#include <errno.h>
45
#include <stddef.h>
46
47
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
48
#include "../fuzz_settings.h"
49
#endif
50
51
#include "../htslib/hts_endian.h"
52
53
#if defined(HAVE_EXTERNAL_LIBHTSCODECS)
54
#include <htscodecs/varint.h>
55
#include <htscodecs/pack.h>
56
#include <htscodecs/rle.h>
57
#else
58
#include "../htscodecs/htscodecs/varint.h"
59
#include "../htscodecs/htscodecs/pack.h"
60
#include "../htscodecs/htscodecs/rle.h"
61
#endif
62
63
#include "cram.h"
64
65
/*
66
 * ---------------------------------------------------------------------------
67
 * Block bit-level I/O functions.
68
 * All defined static here to promote easy inlining by the compiler.
69
 */
70
71
#if 0
72
/* Get a single bit, MSB first */
73
static signed int get_bit_MSB(cram_block *block) {
74
    unsigned int val;
75
76
    if (block->byte > block->alloc)
77
        return -1;
78
79
    val = block->data[block->byte] >> block->bit;
80
    if (--block->bit == -1) {
81
        block->bit = 7;
82
        block->byte++;
83
        //printf("(%02X)", block->data[block->byte]);
84
    }
85
86
    //printf("-B%d-", val&1);
87
88
    return val & 1;
89
}
90
#endif
91
92
/*
93
 * Count number of successive 0 and 1 bits
94
 */
95
0
static int get_one_bits_MSB(cram_block *block) {
96
0
    int n = 0, b;
97
0
    if (block->byte >= block->uncomp_size)
98
0
        return -1;
99
0
    do {
100
0
        b = block->data[block->byte] >> block->bit;
101
0
        if (--block->bit == -1) {
102
0
            block->bit = 7;
103
0
            block->byte++;
104
0
            if (block->byte == block->uncomp_size && (b&1))
105
0
                return -1;
106
0
        }
107
0
        n++;
108
0
    } while (b&1);
109
110
0
    return n-1;
111
0
}
112
113
0
static int get_zero_bits_MSB(cram_block *block) {
114
0
    int n = 0, b;
115
0
    if (block->byte >= block->uncomp_size)
116
0
        return -1;
117
0
    do {
118
0
        b = block->data[block->byte] >> block->bit;
119
0
        if (--block->bit == -1) {
120
0
            block->bit = 7;
121
0
            block->byte++;
122
0
            if (block->byte == block->uncomp_size && !(b&1))
123
0
                return -1;
124
0
        }
125
0
        n++;
126
0
    } while (!(b&1));
127
128
0
    return n-1;
129
0
}
130
131
#if 0
132
/* Stores a single bit */
133
static void store_bit_MSB(cram_block *block, unsigned int bit) {
134
    if (block->byte >= block->alloc) {
135
        block->alloc = block->alloc ? block->alloc*2 : 1024;
136
        block->data = realloc(block->data, block->alloc);
137
    }
138
139
    if (bit)
140
        block->data[block->byte] |= (1 << block->bit);
141
142
    if (--block->bit == -1) {
143
        block->bit = 7;
144
        block->byte++;
145
        block->data[block->byte] = 0;
146
    }
147
}
148
#endif
149
150
#if 0
151
/* Rounds to the next whole byte boundary first */
152
static void store_bytes_MSB(cram_block *block, char *bytes, int len) {
153
    if (block->bit != 7) {
154
        block->bit = 7;
155
        block->byte++;
156
    }
157
158
    while (block->byte + len >= block->alloc) {
159
        block->alloc = block->alloc ? block->alloc*2 : 1024;
160
        block->data = realloc(block->data, block->alloc);
161
    }
162
163
    memcpy(&block->data[block->byte], bytes, len);
164
    block->byte += len;
165
}
166
#endif
167
168
/* Local optimised copy for inlining */
169
0
static inline int64_t get_bits_MSB(cram_block *block, int nbits) {
170
0
    uint64_t val = 0;
171
0
    int i;
172
173
#if 0
174
    // Fits within the current byte */
175
    if (nbits <= block->bit+1) {
176
        val = (block->data[block->byte]>>(block->bit-(nbits-1))) & ((1<<nbits)-1);
177
        if ((block->bit -= nbits) == -1) {
178
            block->bit = 7;
179
            block->byte++;
180
        }
181
        return val;
182
    }
183
184
    // partial first byte
185
    val = block->data[block->byte] & ((1<<(block->bit+1))-1);
186
    nbits -= block->bit+1;
187
    block->bit = 7;
188
    block->byte++;
189
190
    // whole middle bytes
191
    while (nbits >= 8) {
192
        val = (val << 8) | block->data[block->byte++];
193
        nbits -= 8;
194
    }
195
196
    val <<= nbits;
197
    val |= (block->data[block->byte]>>(block->bit-(nbits-1))) & ((1<<nbits)-1);
198
    block->bit -= nbits;
199
    return val;
200
#endif
201
202
#if 0
203
    /* Inefficient implementation! */
204
    //printf("{");
205
    for (i = 0; i < nbits; i++)
206
        //val = (val << 1) | get_bit_MSB(block);
207
        GET_BIT_MSB(block, val);
208
#endif
209
210
0
#if 1
211
    /* Combination of 1st two methods */
212
0
    if (nbits <= block->bit+1) {
213
0
        val = (block->data[block->byte]>>(block->bit-(nbits-1))) & ((1<<nbits)-1);
214
0
        if ((block->bit -= nbits) == -1) {
215
0
            block->bit = 7;
216
0
            block->byte++;
217
0
        }
218
0
        return val;
219
0
    }
220
221
0
    switch(nbits) {
222
//  case 15: GET_BIT_MSB(block, val); // fall through
223
//  case 14: GET_BIT_MSB(block, val); // fall through
224
//  case 13: GET_BIT_MSB(block, val); // fall through
225
//  case 12: GET_BIT_MSB(block, val); // fall through
226
//  case 11: GET_BIT_MSB(block, val); // fall through
227
//  case 10: GET_BIT_MSB(block, val); // fall through
228
//  case  9: GET_BIT_MSB(block, val); // fall through
229
0
    case  8: GET_BIT_MSB(block, val); // fall through
230
0
    case  7: GET_BIT_MSB(block, val); // fall through
231
0
    case  6: GET_BIT_MSB(block, val); // fall through
232
0
    case  5: GET_BIT_MSB(block, val); // fall through
233
0
    case  4: GET_BIT_MSB(block, val); // fall through
234
0
    case  3: GET_BIT_MSB(block, val); // fall through
235
0
    case  2: GET_BIT_MSB(block, val); // fall through
236
0
    case  1: GET_BIT_MSB(block, val);
237
0
        break;
238
239
0
    default:
240
0
        for (i = 0; i < nbits; i++)
241
            //val = (val << 1) | get_bit_MSB(block);
242
0
            GET_BIT_MSB(block, val);
243
0
    }
244
0
#endif
245
246
    //printf("=0x%x}", val);
247
248
0
    return val;
249
0
}
250
251
/*
252
 * Can store up to 24-bits worth of data encoded in an integer value
253
 * Possibly we'd want to have a less optimal store_bits function when dealing
254
 * with nbits > 24, but for now we assume the codes generated are never
255
 * that big. (Given this is only possible with 121392 or more
256
 * characters with exactly the correct frequency distribution we check
257
 * for it elsewhere.)
258
 */
259
92
static int store_bits_MSB(cram_block *block, uint64_t val, int nbits) {
260
    //fprintf(stderr, " store_bits: %02x %d\n", val, nbits);
261
262
    /*
263
     * Use slow mode until we tweak the huffman generator to never generate
264
     * codes longer than 24-bits.
265
     */
266
92
    unsigned int mask;
267
268
92
    if (block->byte+8 >= block->alloc) {
269
35
        if (block->byte) {
270
0
            block->alloc *= 2;
271
0
            block->data = realloc(block->data, block->alloc + 8);
272
0
            if (!block->data)
273
0
                return -1;
274
35
        } else {
275
35
            block->alloc = 1024;
276
35
            block->data = realloc(block->data, block->alloc + 8);
277
35
            if (!block->data)
278
0
                return -1;
279
35
            block->data[0] = 0; // initialise first byte of buffer
280
35
        }
281
35
    }
282
283
    /* fits in current bit-field */
284
92
    if (nbits <= block->bit+1) {
285
15
        block->data[block->byte] |= (val << (block->bit+1-nbits));
286
15
        if ((block->bit-=nbits) == -1) {
287
0
            block->bit = 7;
288
0
            block->byte++;
289
0
            block->data[block->byte] = 0;
290
0
        }
291
15
        return 0;
292
15
    }
293
294
77
    block->data[block->byte] |= (val >> (nbits -= block->bit+1));
295
77
    block->bit = 7;
296
77
    block->byte++;
297
77
    block->data[block->byte] = 0;
298
299
77
    mask = 1<<(nbits-1);
300
1.85k
    do {
301
1.85k
        if (val & mask)
302
776
            block->data[block->byte] |= (1 << block->bit);
303
1.85k
        if (--block->bit == -1) {
304
191
            block->bit = 7;
305
191
            block->byte++;
306
191
            block->data[block->byte] = 0;
307
191
        }
308
1.85k
        mask >>= 1;
309
1.85k
    } while(--nbits);
310
311
77
    return 0;
312
92
}
313
314
/*
315
 * Returns the next 'size' bytes from a block, or NULL if insufficient
316
 * data left.This is just a pointer into the block data and not an
317
 * allocated object, so do not free the result.
318
 */
319
0
static char *cram_extract_block(cram_block *b, int size) {
320
0
    char *cp = (char *)b->data + b->idx;
321
0
    b->idx += size;
322
0
    if (b->idx > b->uncomp_size)
323
0
        return NULL;
324
325
0
    return cp;
326
0
}
327
328
/*
329
 * ---------------------------------------------------------------------------
330
 * EXTERNAL
331
 *
332
 * In CRAM 3.0 and earlier, E_EXTERNAL use the data type to determine the
333
 * size of the object being returned.  This type is hard coded in the
334
 * spec document (changing from uint32 to uint64 requires a spec change)
335
 * and there is no data format introspection so implementations have
336
 * to determine which size to use based on version numbers.   It also
337
 * doesn't support signed data.
338
 *
339
 * With CRAM 4.0 onwards the size and sign of the data is no longer stated
340
 * explicitly in the specification.  Instead EXTERNAL is replaced by three
341
 * new encodings, for bytes and signed / unsigned integers which used a
342
 * variable sized encoding.
343
 *
344
 * For simplicity we use the same encode and decode functions for
345
 * bytes (CRAM4) and external (CRAM3). Given we already had code to
346
 * replace codec + type into a function pointer it makes little
347
 * difference how we ended up at that function.  However we disallow
348
 * this codec to operate on integer data for CRAM4 onwards.
349
 */
350
int cram_external_decode_int(cram_slice *slice, cram_codec *c,
351
0
                             cram_block *in, char *out, int *out_size) {
352
0
    char *cp;
353
0
    cram_block *b;
354
355
    /* Find the external block */
356
0
    b = cram_get_block_by_id(slice, c->u.external.content_id);
357
0
    if (!b)
358
0
        return *out_size?-1:0;
359
360
0
    cp = (char *)b->data + b->idx;
361
    // E_INT and E_LONG are guaranteed single item queries
362
0
    int err = 0;
363
0
    *(int32_t *)out = c->vv->varint_get32(&cp, (char *)b->data + b->uncomp_size, &err);
364
0
    b->idx = cp - (char *)b->data;
365
0
    *out_size = 1;
366
367
0
    return err ? -1 : 0;
368
0
}
369
370
int cram_external_decode_long(cram_slice *slice, cram_codec *c,
371
0
                              cram_block *in, char *out, int *out_size) {
372
0
    char *cp;
373
0
    cram_block *b;
374
375
    /* Find the external block */
376
0
    b = cram_get_block_by_id(slice, c->u.external.content_id);
377
0
    if (!b)
378
0
        return *out_size?-1:0;
379
380
0
    cp = (char *)b->data + b->idx;
381
    // E_INT and E_LONG are guaranteed single item queries
382
0
    int err = 0;
383
0
    *(int64_t *)out = c->vv->varint_get64(&cp, (char *)b->data + b->uncomp_size, &err);
384
0
    b->idx = cp - (char *)b->data;
385
0
    *out_size = 1;
386
387
0
    return err ? -1 : 0;
388
0
}
389
390
int cram_external_decode_char(cram_slice *slice, cram_codec *c,
391
                              cram_block *in, char *out,
392
0
                              int *out_size) {
393
0
    char *cp;
394
0
    cram_block *b;
395
396
    /* Find the external block */
397
0
    b = cram_get_block_by_id(slice, c->u.external.content_id);
398
0
    if (!b)
399
0
        return *out_size?-1:0;
400
401
0
    cp = cram_extract_block(b, *out_size);
402
0
    if (!cp)
403
0
        return -1;
404
405
0
    if (out)
406
0
        memcpy(out, cp, *out_size);
407
0
    return 0;
408
0
}
409
410
static int cram_external_decode_block(cram_slice *slice, cram_codec *c,
411
                                      cram_block *in, char *out_,
412
0
                                      int *out_size) {
413
0
    char *cp;
414
0
    cram_block *out = (cram_block *)out_;
415
0
    cram_block *b = NULL;
416
417
    /* Find the external block */
418
0
    b = cram_get_block_by_id(slice, c->u.external.content_id);
419
0
    if (!b)
420
0
        return *out_size?-1:0;
421
422
0
    cp = cram_extract_block(b, *out_size);
423
0
    if (!cp)
424
0
        return -1;
425
426
0
    BLOCK_APPEND(out, cp, *out_size);
427
0
    return 0;
428
429
0
 block_err:
430
0
    return -1;
431
0
}
432
433
1.59k
void cram_external_decode_free(cram_codec *c) {
434
1.59k
    if (c)
435
1.59k
        free(c);
436
1.59k
}
437
438
439
0
int cram_external_decode_size(cram_slice *slice, cram_codec *c) {
440
0
    cram_block *b;
441
442
    /* Find the external block */
443
0
    b = cram_get_block_by_id(slice, c->u.external.content_id);
444
0
    if (!b)
445
0
        return -1;
446
447
0
    return b->uncomp_size;
448
0
}
449
450
0
cram_block *cram_external_get_block(cram_slice *slice, cram_codec *c) {
451
0
    return cram_get_block_by_id(slice, c->u.external.content_id);
452
0
}
453
454
0
int cram_external_describe(cram_codec *c, kstring_t *ks) {
455
0
    return ksprintf(ks, "EXTERNAL(id=%d)",
456
0
                    c->u.external.content_id) < 0 ? -1 : 0;
457
0
}
458
459
cram_codec *cram_external_decode_init(cram_block_compression_hdr *hdr,
460
                                      char *data, int size,
461
                                      enum cram_encoding codec,
462
                                      enum cram_external_type option,
463
1.64k
                                      int version, varint_vec *vv) {
464
1.64k
    cram_codec *c = NULL;
465
1.64k
    char *cp = data;
466
467
1.64k
    if (size < 1)
468
12
        goto malformed;
469
470
1.63k
    if (!(c = malloc(sizeof(*c))))
471
0
        return NULL;
472
473
1.63k
    c->codec  = E_EXTERNAL;
474
1.63k
    if (CRAM_MAJOR_VERS(version) >= 4) {
475
        // Version 4 does not permit integer data to be encoded as a
476
        // series of bytes.  This is used purely for bytes, either
477
        // singular or declared as arrays
478
132
        switch (codec) {
479
132
        case E_EXTERNAL:
480
132
            if (option == E_BYTE_ARRAY_BLOCK)
481
120
                c->decode = cram_external_decode_block;
482
12
            else if (option == E_BYTE || option == E_BYTE_ARRAY)
483
6
                c->decode = cram_external_decode_char;
484
6
            else
485
6
                goto malformed;
486
126
            break;
487
126
        default:
488
0
            goto malformed;
489
132
        }
490
1.50k
    } else {
491
        // CRAM 3 and earlier encodes integers as EXTERNAL.  We need
492
        // use the option field to indicate the input data format so
493
        // we know which serialisation format to use.
494
1.50k
        if (option == E_INT)
495
1.14k
            c->decode = cram_external_decode_int;
496
354
        else if (option == E_LONG)
497
0
            c->decode = cram_external_decode_long;
498
354
        else if (option == E_BYTE_ARRAY || option == E_BYTE)
499
48
            c->decode = cram_external_decode_char;
500
306
        else
501
306
            c->decode = cram_external_decode_block;
502
1.50k
    }
503
1.62k
    c->free   = cram_external_decode_free;
504
1.62k
    c->size   = cram_external_decode_size;
505
1.62k
    c->get_block = cram_external_get_block;
506
1.62k
    c->describe = cram_external_describe;
507
508
1.62k
    c->u.external.content_id = vv->varint_get32(&cp, data+size, NULL);
509
510
1.62k
    if (cp - data != size)
511
27
        goto malformed;
512
513
1.59k
    c->u.external.type = option;
514
515
1.59k
    return c;
516
517
45
 malformed:
518
45
    hts_log_error("Malformed external header stream");
519
45
    free(c);
520
45
    return NULL;
521
1.62k
}
522
523
int cram_external_encode_int(cram_slice *slice, cram_codec *c,
524
11.1M
                             char *in, int in_size) {
525
11.1M
    uint32_t *i32 = (uint32_t *)in;
526
11.1M
    return c->vv->varint_put32_blk(c->out, *i32) >= 0 ? 0 : -1;
527
11.1M
}
528
529
int cram_external_encode_sint(cram_slice *slice, cram_codec *c,
530
0
                             char *in, int in_size) {
531
0
    int32_t *i32 = (int32_t *)in;
532
0
    return c->vv->varint_put32s_blk(c->out, *i32) >= 0 ? 0 : -1;
533
0
}
534
535
int cram_external_encode_long(cram_slice *slice, cram_codec *c,
536
0
                             char *in, int in_size) {
537
0
    uint64_t *i64 = (uint64_t *)in;
538
0
    return c->vv->varint_put64_blk(c->out, *i64) >= 0 ? 0 : -1;
539
0
}
540
541
int cram_external_encode_slong(cram_slice *slice, cram_codec *c,
542
0
                               char *in, int in_size) {
543
0
    int64_t *i64 = (int64_t *)in;
544
0
    return c->vv->varint_put64s_blk(c->out, *i64) >= 0 ? 0 : -1;
545
0
}
546
547
int cram_external_encode_char(cram_slice *slice, cram_codec *c,
548
155k
                              char *in, int in_size) {
549
155k
    BLOCK_APPEND(c->out, in, in_size);
550
155k
    return 0;
551
552
0
 block_err:
553
0
    return -1;
554
155k
}
555
556
15.3k
void cram_external_encode_free(cram_codec *c) {
557
15.3k
    if (!c)
558
0
        return;
559
15.3k
    free(c);
560
15.3k
}
561
562
int cram_external_encode_store(cram_codec *c, cram_block *b, char *prefix,
563
14.7k
                               int version) {
564
14.7k
    char tmp[99], *tp = tmp, *tpend = tmp+99;
565
14.7k
    int len = 0, r = 0, n;
566
567
14.7k
    if (prefix) {
568
7.31k
        size_t l = strlen(prefix);
569
7.31k
        BLOCK_APPEND(b, prefix, l);
570
7.31k
        len += l;
571
7.31k
    }
572
573
14.7k
    tp += c->vv->varint_put32(tp, tpend, c->u.e_external.content_id);
574
14.7k
    len += (n = c->vv->varint_put32_blk(b, c->codec)); r |= n;
575
14.7k
    len += (n = c->vv->varint_put32_blk(b, tp-tmp));   r |= n;
576
14.7k
    BLOCK_APPEND(b, tmp, tp-tmp);
577
14.7k
    len += tp-tmp;
578
579
14.7k
    if (r > 0)
580
14.7k
        return len;
581
582
0
 block_err:
583
0
    return -1;
584
14.7k
}
585
586
cram_codec *cram_external_encode_init(cram_stats *st,
587
                                      enum cram_encoding codec,
588
                                      enum cram_external_type option,
589
                                      void *dat,
590
15.3k
                                      int version, varint_vec *vv) {
591
15.3k
    cram_codec *c;
592
593
15.3k
    c = malloc(sizeof(*c));
594
15.3k
    if (!c)
595
0
        return NULL;
596
15.3k
    c->codec = E_EXTERNAL;
597
15.3k
    c->free = cram_external_encode_free;
598
15.3k
    if (CRAM_MAJOR_VERS(version) >= 4) {
599
        // Version 4 does not permit integer data to be encoded as a
600
        // series of bytes.  This is used purely for bytes, either
601
        // singular or declared as arrays
602
0
        switch (codec) {
603
0
        case E_EXTERNAL:
604
0
            if (option != E_BYTE && option != E_BYTE_ARRAY)
605
0
                return NULL;
606
0
            c->encode = cram_external_encode_char;
607
0
            break;
608
0
        default:
609
0
            return NULL;
610
0
        }
611
15.3k
    } else {
612
        // CRAM 3 and earlier encodes integers as EXTERNAL.  We need
613
        // use the option field to indicate the input data format so
614
        // we know which serialisation format to use.
615
15.3k
        if (option == E_INT)
616
6.38k
            c->encode = cram_external_encode_int;
617
8.94k
        else if (option == E_LONG)
618
0
            c->encode = cram_external_encode_long;
619
8.94k
        else if (option == E_BYTE_ARRAY || option == E_BYTE)
620
8.94k
            c->encode = cram_external_encode_char;
621
0
        else
622
0
            abort();
623
15.3k
    }
624
15.3k
    c->store = cram_external_encode_store;
625
15.3k
    c->flush = NULL;
626
627
15.3k
    c->u.e_external.content_id = (size_t)dat;
628
629
15.3k
    return c;
630
15.3k
}
631
632
/*
633
 * ---------------------------------------------------------------------------
634
 * VARINT
635
 *
636
 * In CRAM 3.0 and earlier, E_EXTERNAL stored both integers in ITF8
637
 * format as well as bytes.  In CRAM 4 EXTERNAL is only for bytes and
638
 * byte arrays, with two dedicated encodings for integers:
639
 * VARINT_SIGNED and VARINT_UNSIGNED.  These also differ a little to
640
 * EXTERNAL with the addition of an offset field, meaning we can store
641
 * values in, say, the range -2 to 1 million without needing to use
642
 * a signed zig-zag transformation.
643
 */
644
int cram_varint_decode_int(cram_slice *slice, cram_codec *c,
645
0
                           cram_block *in, char *out, int *out_size) {
646
0
    char *cp;
647
0
    cram_block *b;
648
649
    /* Find the data block */
650
0
    b = cram_get_block_by_id(slice, c->u.varint.content_id);
651
0
    if (!b)
652
0
        return *out_size?-1:0;
653
654
0
    cp = (char *)b->data + b->idx;
655
    // E_INT and E_LONG are guaranteed single item queries
656
0
    int err = 0;
657
0
    *(int32_t *)out = c->vv->varint_get32(&cp,
658
0
                                          (char *)b->data + b->uncomp_size,
659
0
                                          &err) + c->u.varint.offset;
660
0
    b->idx = cp - (char *)b->data;
661
0
    *out_size = 1;
662
663
0
    return err ? -1 : 0;
664
0
}
665
666
int cram_varint_decode_sint(cram_slice *slice, cram_codec *c,
667
0
                            cram_block *in, char *out, int *out_size) {
668
0
    char *cp;
669
0
    cram_block *b;
670
671
    /* Find the data block */
672
0
    b = cram_get_block_by_id(slice, c->u.varint.content_id);
673
0
    if (!b)
674
0
        return *out_size?-1:0;
675
676
0
    cp = (char *)b->data + b->idx;
677
    // E_INT and E_LONG are guaranteed single item queries
678
0
    int err = 0;
679
0
    *(int32_t *)out = c->vv->varint_get32s(&cp,
680
0
                                           (char *)b->data + b->uncomp_size,
681
0
                                           &err) + c->u.varint.offset;
682
0
    b->idx = cp - (char *)b->data;
683
0
    *out_size = 1;
684
685
0
    return err ? -1 : 0;
686
0
}
687
688
int cram_varint_decode_long(cram_slice *slice, cram_codec *c,
689
0
                            cram_block *in, char *out, int *out_size) {
690
0
    char *cp;
691
0
    cram_block *b;
692
693
    /* Find the data block */
694
0
    b = cram_get_block_by_id(slice, c->u.varint.content_id);
695
0
    if (!b)
696
0
        return *out_size?-1:0;
697
698
0
    cp = (char *)b->data + b->idx;
699
    // E_INT and E_LONG are guaranteed single item queries
700
0
    int err = 0;
701
0
    *(int64_t *)out = c->vv->varint_get64(&cp,
702
0
                                          (char *)b->data + b->uncomp_size,
703
0
                                          &err) + c->u.varint.offset;
704
0
    b->idx = cp - (char *)b->data;
705
0
    *out_size = 1;
706
707
0
    return err ? -1 : 0;
708
0
}
709
710
int cram_varint_decode_slong(cram_slice *slice, cram_codec *c,
711
0
                             cram_block *in, char *out, int *out_size) {
712
0
    char *cp;
713
0
    cram_block *b;
714
715
    /* Find the data block */
716
0
    b = cram_get_block_by_id(slice, c->u.varint.content_id);
717
0
    if (!b)
718
0
        return *out_size?-1:0;
719
720
0
    cp = (char *)b->data + b->idx;
721
    // E_INT and E_LONG are guaranteed single item queries
722
0
    int err = 0;
723
0
    *(int64_t *)out = c->vv->varint_get64s(&cp,
724
0
                                           (char *)b->data + b->uncomp_size,
725
0
                                           &err) + c->u.varint.offset;
726
0
    b->idx = cp - (char *)b->data;
727
0
    *out_size = 1;
728
729
0
    return err ? -1 : 0;
730
0
}
731
732
426
void cram_varint_decode_free(cram_codec *c) {
733
426
    if (c)
734
426
        free(c);
735
426
}
736
737
0
int cram_varint_decode_size(cram_slice *slice, cram_codec *c) {
738
0
    cram_block *b;
739
740
    /* Find the data block */
741
0
    b = cram_get_block_by_id(slice, c->u.varint.content_id);
742
0
    if (!b)
743
0
        return -1;
744
745
0
    return b->uncomp_size;
746
0
}
747
748
0
cram_block *cram_varint_get_block(cram_slice *slice, cram_codec *c) {
749
0
    return cram_get_block_by_id(slice, c->u.varint.content_id);
750
0
}
751
752
0
int cram_varint_describe(cram_codec *c, kstring_t *ks) {
753
0
    return ksprintf(ks, "VARINT(id=%d,offset=%"PRId64",type=%d)",
754
0
                    c->u.varint.content_id,
755
0
                    c->u.varint.offset,
756
0
                    c->u.varint.type)
757
0
        < 0 ? -1 : 0;
758
0
}
759
760
cram_codec *cram_varint_decode_init(cram_block_compression_hdr *hdr,
761
                                    char *data, int size,
762
                                    enum cram_encoding codec,
763
                                    enum cram_external_type option,
764
492
                                    int version, varint_vec *vv) {
765
492
    cram_codec *c;
766
492
    char *cp = data, *cp_end = data+size;
767
768
492
    if (!(c = malloc(sizeof(*c))))
769
0
        return NULL;
770
771
492
    c->codec  = codec;
772
773
    // Function pointer choice is theoretically by codec type.
774
    // Given we have some vars as int32 and some as int64 we
775
    // use option too for sizing, although on disk format
776
    // does not change.
777
492
    switch(codec) {
778
321
    case E_VARINT_UNSIGNED:
779
321
        if (option == E_INT || option == E_SINT)
780
249
            c->decode = cram_varint_decode_int;
781
72
        else if (option == E_LONG || option == E_SLONG)
782
63
            c->decode = cram_varint_decode_long;
783
9
        else
784
9
            goto malformed;
785
312
        break;
786
312
    case E_VARINT_SIGNED:
787
171
        if (option == E_INT || option == E_SINT)
788
156
            c->decode = cram_varint_decode_sint;
789
15
        else if (option == E_LONG || option == E_SLONG)
790
3
            c->decode = cram_varint_decode_slong;
791
12
        else
792
12
            goto malformed;
793
159
        break;
794
159
    default:
795
0
        goto malformed;
796
492
    }
797
798
471
    c->free   = cram_varint_decode_free;
799
471
    c->size   = cram_varint_decode_size;
800
471
    c->get_block = cram_varint_get_block;
801
471
    c->describe = cram_varint_describe;
802
803
471
    c->u.varint.content_id = vv->varint_get32 (&cp, cp_end, NULL);
804
471
    c->u.varint.offset     = vv->varint_get64s(&cp, cp_end, NULL);
805
806
471
    if (cp - data != size) {
807
45
        goto malformed;
808
45
    }
809
810
426
    c->u.varint.type = option;
811
812
426
    return c;
813
814
66
 malformed:
815
66
    hts_log_error("Malformed varint header stream");
816
66
    free(c);
817
66
    return NULL;
818
471
}
819
820
int cram_varint_encode_int(cram_slice *slice, cram_codec *c,
821
0
                           char *in, int in_size) {
822
0
    uint32_t *i32 = (uint32_t *)in;
823
0
    return c->vv->varint_put32_blk(c->out, *i32 - c->u.varint.offset) >= 0
824
0
        ? 0 : -1;
825
0
}
826
827
int cram_varint_encode_sint(cram_slice *slice, cram_codec *c,
828
0
                            char *in, int in_size) {
829
0
    int32_t *i32 = (int32_t *)in;
830
0
    return c->vv->varint_put32s_blk(c->out, *i32 - c->u.varint.offset) >= 0
831
0
        ? 0 : -1;
832
0
}
833
834
int cram_varint_encode_long(cram_slice *slice, cram_codec *c,
835
0
                            char *in, int in_size) {
836
0
    uint64_t *i64 = (uint64_t *)in;
837
0
    return c->vv->varint_put64_blk(c->out, *i64 - c->u.varint.offset) >= 0
838
0
        ? 0 : -1;
839
0
}
840
841
int cram_varint_encode_slong(cram_slice *slice, cram_codec *c,
842
0
                             char *in, int in_size) {
843
0
    int64_t *i64 = (int64_t *)in;
844
0
    return c->vv->varint_put64s_blk(c->out, *i64 - c->u.varint.offset) >= 0
845
0
        ? 0 : -1;
846
0
}
847
848
0
void cram_varint_encode_free(cram_codec *c) {
849
0
    if (!c)
850
0
        return;
851
0
    free(c);
852
0
}
853
854
int cram_varint_encode_store(cram_codec *c, cram_block *b, char *prefix,
855
0
                             int version) {
856
0
    char tmp[99], *tp = tmp;
857
0
    int len = 0;
858
859
0
    if (prefix) {
860
0
        size_t l = strlen(prefix);
861
0
        BLOCK_APPEND(b, prefix, l);
862
0
        len += l;
863
0
    }
864
865
0
    tp += c->vv->varint_put32 (tp, NULL, c->u.e_varint.content_id);
866
0
    tp += c->vv->varint_put64s(tp, NULL, c->u.e_varint.offset);
867
0
    len += c->vv->varint_put32_blk(b, c->codec);
868
0
    len += c->vv->varint_put32_blk(b, tp-tmp);
869
0
    BLOCK_APPEND(b, tmp, tp-tmp);
870
0
    len += tp-tmp;
871
872
0
    return len;
873
874
0
 block_err:
875
0
    return -1;
876
0
}
877
878
cram_codec *cram_varint_encode_init(cram_stats *st,
879
                                    enum cram_encoding codec,
880
                                    enum cram_external_type option,
881
                                    void *dat,
882
0
                                    int version, varint_vec *vv) {
883
0
    cram_codec *c;
884
885
0
    if (!(c = malloc(sizeof(*c))))
886
0
        return NULL;
887
888
0
    c->u.e_varint.offset = 0;
889
0
    if (st) {
890
        // Marginal difference so far! Not worth the hassle?
891
0
        if (st->min_val < 0 && st->min_val >= -127
892
0
            && st->max_val / -st->min_val > 100) {
893
0
            c->u.e_varint.offset = -st->min_val;
894
0
            codec = E_VARINT_UNSIGNED;
895
0
        } else if (st->min_val > 0) {
896
0
            c->u.e_varint.offset = -st->min_val;
897
0
        }
898
0
    }
899
900
0
    c->codec = codec;
901
0
    c->free = cram_varint_encode_free;
902
903
    // Function pointer choice is theoretically by codec type.
904
    // Given we have some vars as int32 and some as int64 we
905
    // use option too for sizing, although on disk format
906
    // does not change.
907
0
    switch (codec) {
908
0
    case E_VARINT_UNSIGNED:
909
0
        c->encode = (option == E_INT)
910
0
            ? cram_varint_encode_int
911
0
            : cram_varint_encode_long;
912
0
        break;
913
0
    case E_VARINT_SIGNED:
914
0
        c->encode = (option == E_INT)
915
0
            ? cram_varint_encode_sint
916
0
            : cram_varint_encode_slong;
917
0
        break;
918
0
    default:
919
0
        return NULL;
920
0
    }
921
0
    c->store = cram_varint_encode_store;
922
0
    c->flush = NULL;
923
924
0
    c->u.e_varint.content_id = (size_t)dat;
925
926
0
    return c;
927
0
}
928
/*
929
 * ---------------------------------------------------------------------------
930
 * CONST_BYTE and CONST_INT
931
 */
932
int cram_const_decode_byte(cram_slice *slice, cram_codec *c,
933
0
                           cram_block *in, char *out, int *out_size) {
934
0
    int i, n;
935
936
0
    if (!out)
937
0
        return 0;
938
939
0
    for (i = 0, n = *out_size; i < n; i++)
940
0
        out[i] = c->u.xconst.val;
941
942
0
    return 0;
943
0
}
944
945
int cram_const_decode_int(cram_slice *slice, cram_codec *c,
946
0
                          cram_block *in, char *out, int *out_size) {
947
0
    int32_t *out_i = (int32_t *)out;
948
0
    int i, n;
949
950
0
    for (i = 0, n = *out_size; i < n; i++)
951
0
        out_i[i] = c->u.xconst.val;
952
953
0
    return 0;
954
0
}
955
956
int cram_const_decode_long(cram_slice *slice, cram_codec *c,
957
0
                           cram_block *in, char *out, int *out_size) {
958
0
    int64_t *out_i = (int64_t *)out;
959
0
    int i, n;
960
961
0
    for (i = 0, n = *out_size; i < n; i++)
962
0
        out_i[i] = c->u.xconst.val;
963
964
0
    return 0;
965
0
}
966
967
129
void cram_const_decode_free(cram_codec *c) {
968
129
    if (c)
969
129
        free(c);
970
129
}
971
972
0
int cram_const_decode_size(cram_slice *slice, cram_codec *c) {
973
0
    return 0;
974
0
}
975
976
0
int cram_const_describe(cram_codec *c, kstring_t *ks) {
977
0
    return ksprintf(ks, "CONST(val=%"PRId64")",
978
0
                    c->u.xconst.val) < 0 ? -1 : 0;
979
0
}
980
981
cram_codec *cram_const_decode_init(cram_block_compression_hdr *hdr,
982
                                   char *data, int size,
983
                                   enum cram_encoding codec,
984
                                   enum cram_external_type option,
985
147
                                   int version, varint_vec *vv) {
986
147
    cram_codec *c;
987
147
    char *cp = data;
988
989
147
    if (!(c = malloc(sizeof(*c))))
990
0
        return NULL;
991
992
147
    c->codec  = codec;
993
147
    if (codec == E_CONST_BYTE && option == E_BYTE)
994
3
        c->decode = cram_const_decode_byte;
995
144
    else if (codec == E_CONST_INT && (option == E_INT || option == E_SINT))
996
129
        c->decode = cram_const_decode_int;
997
15
    else if (codec == E_CONST_INT && (option == E_LONG || option == E_SLONG))
998
0
        c->decode = cram_const_decode_long;
999
15
    else {
1000
15
        hts_log_error("Malformed const header stream");
1001
15
        free(c);
1002
15
        return NULL;
1003
15
    }
1004
132
    c->free   = cram_const_decode_free;
1005
132
    c->size   = cram_const_decode_size;
1006
132
    c->get_block = NULL;
1007
132
    c->describe = cram_const_describe;
1008
1009
132
    c->u.xconst.val = vv->varint_get64s(&cp, data+size, NULL);
1010
1011
132
    if (cp - data != size) {
1012
3
        fprintf(stderr, "Malformed const header stream\n");
1013
3
        free(c);
1014
3
        return NULL;
1015
3
    }
1016
1017
129
    return c;
1018
132
}
1019
1020
int cram_const_encode(cram_slice *slice, cram_codec *c,
1021
0
                      char *in, int in_size) {
1022
0
    return 0;
1023
0
}
1024
1025
int cram_const_encode_store(cram_codec *c, cram_block *b, char *prefix,
1026
0
                            int version) {
1027
0
    char tmp[99], *tp = tmp;
1028
0
    int len = 0;
1029
1030
0
    if (prefix) {
1031
0
        size_t l = strlen(prefix);
1032
0
        BLOCK_APPEND(b, prefix, l);
1033
0
        len += l;
1034
0
    }
1035
1036
0
    tp += c->vv->varint_put64s(tp, NULL, c->u.xconst.val);
1037
0
    len += c->vv->varint_put32_blk(b, c->codec);
1038
0
    len += c->vv->varint_put32_blk(b, tp-tmp);
1039
0
    BLOCK_APPEND(b, tmp, tp-tmp);
1040
0
    len += tp-tmp;
1041
1042
0
    return len;
1043
1044
0
 block_err:
1045
0
    return -1;
1046
0
}
1047
1048
cram_codec *cram_const_encode_init(cram_stats *st,
1049
                                   enum cram_encoding codec,
1050
                                   enum cram_external_type option,
1051
                                   void *dat,
1052
0
                                   int version, varint_vec *vv) {
1053
0
    cram_codec *c;
1054
1055
0
    if (!(c = malloc(sizeof(*c))))
1056
0
        return NULL;
1057
1058
0
    c->codec = codec;
1059
0
    c->free = cram_const_decode_free; // as as decode
1060
0
    c->encode = cram_const_encode; // a nop
1061
0
    c->store = cram_const_encode_store;
1062
0
    c->flush = NULL;
1063
0
    c->u.e_xconst.val = st->min_val;
1064
1065
0
    return c;
1066
0
}
1067
1068
/*
1069
 * ---------------------------------------------------------------------------
1070
 * BETA
1071
 */
1072
0
int cram_beta_decode_long(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
1073
0
    int64_t *out_i = (int64_t *)out;
1074
0
    int i, n = *out_size;
1075
1076
0
    if (c->u.beta.nbits) {
1077
0
        if (cram_not_enough_bits(in, c->u.beta.nbits * n))
1078
0
            return -1;
1079
1080
0
        for (i = 0; i < n; i++)
1081
0
            out_i[i] = get_bits_MSB(in, c->u.beta.nbits) - c->u.beta.offset;
1082
0
    } else {
1083
0
        for (i = 0; i < n; i++)
1084
0
            out_i[i] = -c->u.beta.offset;
1085
0
    }
1086
1087
0
    return 0;
1088
0
}
1089
1090
0
int cram_beta_decode_int(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
1091
0
    int32_t *out_i = (int32_t *)out;
1092
0
    int i, n = *out_size;
1093
1094
0
    if (c->u.beta.nbits) {
1095
0
        if (cram_not_enough_bits(in, c->u.beta.nbits * n))
1096
0
            return -1;
1097
1098
0
        for (i = 0; i < n; i++)
1099
0
            out_i[i] = get_bits_MSB(in, c->u.beta.nbits) - c->u.beta.offset;
1100
0
    } else {
1101
0
        for (i = 0; i < n; i++)
1102
0
            out_i[i] = -c->u.beta.offset;
1103
0
    }
1104
1105
0
    return 0;
1106
0
}
1107
1108
0
int cram_beta_decode_char(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
1109
0
    int i, n = *out_size;
1110
1111
1112
0
    if (c->u.beta.nbits) {
1113
0
        if (cram_not_enough_bits(in, c->u.beta.nbits * n))
1114
0
            return -1;
1115
1116
0
        if (out)
1117
0
            for (i = 0; i < n; i++)
1118
0
                out[i] = get_bits_MSB(in, c->u.beta.nbits) - c->u.beta.offset;
1119
0
        else
1120
0
            for (i = 0; i < n; i++)
1121
0
                get_bits_MSB(in, c->u.beta.nbits);
1122
0
    } else {
1123
0
        if (out)
1124
0
            for (i = 0; i < n; i++)
1125
0
                out[i] = -c->u.beta.offset;
1126
0
    }
1127
1128
0
    return 0;
1129
0
}
1130
1131
297
void cram_beta_decode_free(cram_codec *c) {
1132
297
    if (c)
1133
297
        free(c);
1134
297
}
1135
1136
0
int cram_beta_describe(cram_codec *c, kstring_t *ks) {
1137
0
    return ksprintf(ks, "BETA(offset=%d, nbits=%d)",
1138
0
                    c->u.beta.offset, c->u.beta.nbits)
1139
0
        < 0 ? -1 : 0;
1140
0
}
1141
1142
cram_codec *cram_beta_decode_init(cram_block_compression_hdr *hdr,
1143
                                  char *data, int size,
1144
                                  enum cram_encoding codec,
1145
                                  enum cram_external_type option,
1146
318
                                  int version, varint_vec *vv) {
1147
318
    cram_codec *c;
1148
318
    char *cp = data;
1149
1150
318
    if (!(c = malloc(sizeof(*c))))
1151
0
        return NULL;
1152
1153
318
    c->codec  = E_BETA;
1154
318
    if (option == E_INT || option == E_SINT)
1155
99
        c->decode = cram_beta_decode_int;
1156
219
    else if (option == E_LONG || option == E_SLONG)
1157
3
        c->decode = cram_beta_decode_long;
1158
216
    else if (option == E_BYTE_ARRAY || option == E_BYTE)
1159
213
        c->decode = cram_beta_decode_char;
1160
3
    else {
1161
3
        hts_log_error("BYTE_ARRAYs not supported by this codec");
1162
3
        free(c);
1163
3
        return NULL;
1164
3
    }
1165
315
    c->free   = cram_beta_decode_free;
1166
315
    c->describe = cram_beta_describe;
1167
1168
315
    c->u.beta.nbits = -1;
1169
315
    c->u.beta.offset = vv->varint_get32(&cp, data + size, NULL);
1170
315
    if (cp < data + size) // Ensure test below works
1171
312
        c->u.beta.nbits  = vv->varint_get32(&cp, data + size, NULL);
1172
1173
315
    if (cp - data != size
1174
309
        || c->u.beta.nbits < 0 || c->u.beta.nbits > 8 * sizeof(int)) {
1175
18
        hts_log_error("Malformed beta header stream");
1176
18
        free(c);
1177
18
        return NULL;
1178
18
    }
1179
1180
297
    return c;
1181
315
}
1182
1183
int cram_beta_encode_store(cram_codec *c, cram_block *b,
1184
35
                           char *prefix, int version) {
1185
35
    int len = 0, r = 0, n;
1186
1187
35
    if (prefix) {
1188
35
        size_t l = strlen(prefix);
1189
35
        BLOCK_APPEND(b, prefix, l);
1190
35
        len += l;
1191
35
    }
1192
1193
35
    len += (n = c->vv->varint_put32_blk(b, c->codec)); r |= n;
1194
    // codec length
1195
35
    len += (n = c->vv->varint_put32_blk(b, c->vv->varint_size(c->u.e_beta.offset)
1196
35
                                         + c->vv->varint_size(c->u.e_beta.nbits)));
1197
35
    r |= n;
1198
35
    len += (n = c->vv->varint_put32_blk(b, c->u.e_beta.offset)); r |= n;
1199
35
    len += (n = c->vv->varint_put32_blk(b, c->u.e_beta.nbits));  r |= n;
1200
1201
35
    if (r > 0) return len;
1202
1203
0
 block_err:
1204
0
    return -1;
1205
35
}
1206
1207
int cram_beta_encode_long(cram_slice *slice, cram_codec *c,
1208
0
                          char *in, int in_size) {
1209
0
    int64_t *syms = (int64_t *)in;
1210
0
    int i, r = 0;
1211
1212
0
    for (i = 0; i < in_size; i++)
1213
0
        r |= store_bits_MSB(c->out, syms[i] + c->u.e_beta.offset,
1214
0
                            c->u.e_beta.nbits);
1215
1216
0
    return r;
1217
0
}
1218
1219
int cram_beta_encode_int(cram_slice *slice, cram_codec *c,
1220
92
                         char *in, int in_size) {
1221
92
    int *syms = (int *)in;
1222
92
    int i, r = 0;
1223
1224
184
    for (i = 0; i < in_size; i++)
1225
92
        r |= store_bits_MSB(c->out, syms[i] + c->u.e_beta.offset,
1226
92
                            c->u.e_beta.nbits);
1227
1228
92
    return r;
1229
92
}
1230
1231
int cram_beta_encode_char(cram_slice *slice, cram_codec *c,
1232
0
                          char *in, int in_size) {
1233
0
    unsigned char *syms = (unsigned char *)in;
1234
0
    int i, r = 0;
1235
1236
0
    for (i = 0; i < in_size; i++)
1237
0
        r |= store_bits_MSB(c->out, syms[i] + c->u.e_beta.offset,
1238
0
                            c->u.e_beta.nbits);
1239
1240
0
    return r;
1241
0
}
1242
1243
35
void cram_beta_encode_free(cram_codec *c) {
1244
35
    if (c) free(c);
1245
35
}
1246
1247
cram_codec *cram_beta_encode_init(cram_stats *st,
1248
                                  enum cram_encoding codec,
1249
                                  enum cram_external_type option,
1250
                                  void *dat,
1251
41
                                  int version, varint_vec *vv) {
1252
41
    cram_codec *c;
1253
41
    hts_pos_t min_val, max_val;
1254
41
    int len = 0;
1255
41
    int64_t range;
1256
1257
41
    c = malloc(sizeof(*c));
1258
41
    if (!c)
1259
0
        return NULL;
1260
41
    c->codec  = E_BETA;
1261
41
    c->free   = cram_beta_encode_free;
1262
41
    if (option == E_INT || option == E_SINT)
1263
41
        c->encode = cram_beta_encode_int;
1264
0
    else if (option == E_LONG || option == E_SLONG)
1265
0
        c->encode = cram_beta_encode_long;
1266
0
    else
1267
0
        c->encode = cram_beta_encode_char;
1268
41
    c->store  = cram_beta_encode_store;
1269
41
    c->flush = NULL;
1270
1271
41
    if (dat) {
1272
41
        min_val = ((hts_pos_t *)dat)[0];
1273
41
        max_val = ((hts_pos_t *)dat)[1];
1274
41
    } else {
1275
0
        min_val = INT_MAX;
1276
0
        max_val = INT_MIN;
1277
0
        int i;
1278
0
        for (i = 0; i < MAX_STAT_VAL; i++) {
1279
0
            if (!st->freqs[i])
1280
0
                continue;
1281
0
            if (min_val > i)
1282
0
                min_val = i;
1283
0
            max_val = i;
1284
0
        }
1285
0
        if (st->h) {
1286
0
            khint_t k;
1287
1288
0
            for (k = kh_begin(st->h); k != kh_end(st->h); k++) {
1289
0
                if (!kh_exist(st->h, k))
1290
0
                    continue;
1291
1292
0
                i = kh_key(st->h, k);
1293
0
                if (min_val > i)
1294
0
                    min_val = i;
1295
0
                if (max_val < i)
1296
0
                    max_val = i;
1297
0
            }
1298
0
        }
1299
0
    }
1300
1301
41
    if (max_val < min_val)
1302
0
        goto err;
1303
1304
41
    range = (int64_t) max_val - min_val;
1305
41
    switch (option) {
1306
0
    case E_SINT:
1307
0
        if (min_val < INT_MIN || range > INT_MAX)
1308
0
            goto err;
1309
0
        break;
1310
1311
41
    case E_INT:
1312
41
        if (max_val > UINT_MAX || range > UINT_MAX)
1313
6
            goto err;
1314
35
        break;
1315
1316
35
    default:
1317
0
        break;
1318
41
    }
1319
1320
35
    c->u.e_beta.offset = -min_val;
1321
771
    while (range) {
1322
736
        len++;
1323
736
        range >>= 1;
1324
736
    }
1325
35
    c->u.e_beta.nbits = len;
1326
1327
35
    return c;
1328
1329
6
 err:
1330
6
    free(c);
1331
6
    return NULL;
1332
41
}
1333
1334
/*
1335
 * ---------------------------------------------------------------------------
1336
 * XPACK: Packing multiple values into a single byte.  A fast transform that
1337
 * reduces time taken by entropy encoder and may also improve compression.
1338
 *
1339
 * This also has the additional requirement that the data series is not
1340
 * interleaved with another, permitting efficient encoding and decoding
1341
 * of all elements enmasse instead of needing to only extract the bits
1342
 * necessary per item.
1343
 */
1344
0
int cram_xpack_decode_long(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
1345
0
    int64_t *out_i = (int64_t *)out;
1346
0
    int i, n = *out_size;
1347
1348
0
    if (c->u.xpack.nbits) {
1349
0
        for (i = 0; i < n; i++)
1350
0
            out_i[i] = c->u.xpack.rmap[get_bits_MSB(in, c->u.xpack.nbits)];
1351
0
    } else {
1352
0
        for (i = 0; i < n; i++)
1353
0
            out_i[i] = c->u.xpack.rmap[0];
1354
0
    }
1355
1356
0
    return 0;
1357
0
}
1358
1359
0
int cram_xpack_decode_int(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
1360
0
    int32_t *out_i = (int32_t *)out;
1361
0
    int i, n = *out_size;
1362
1363
0
    if (c->u.xpack.nbits) {
1364
0
        if (cram_not_enough_bits(in, c->u.xpack.nbits * n))
1365
0
            return -1;
1366
1367
0
        for (i = 0; i < n; i++)
1368
0
            out_i[i] = c->u.xpack.rmap[get_bits_MSB(in, c->u.xpack.nbits)];
1369
0
    } else {
1370
0
        for (i = 0; i < n; i++)
1371
0
            out_i[i] = c->u.xpack.rmap[0];
1372
0
    }
1373
1374
0
    return 0;
1375
0
}
1376
1377
0
static int cram_xpack_decode_expand_char(cram_slice *slice, cram_codec *c) {
1378
0
    cram_block *b = slice->block_by_id[512 + c->codec_id];
1379
0
    if (b)
1380
0
        return 0;
1381
1382
    // get sub-codec data.
1383
0
    cram_block *sub_b = c->u.xpack.sub_codec->get_block(slice, c->u.xpack.sub_codec);
1384
0
    if (!sub_b)
1385
0
        return -1;
1386
1387
    // Allocate local block to expand into
1388
0
    b = slice->block_by_id[512 + c->codec_id] = cram_new_block(0, 0);
1389
0
    if (!b)
1390
0
        return -1;
1391
0
    int n = sub_b->uncomp_size * 8/c->u.xpack.nbits;
1392
0
    BLOCK_GROW(b, n);
1393
0
    b->uncomp_size = n;
1394
1395
0
    uint8_t p[256];
1396
0
    int z;
1397
0
    for (z = 0; z < 256; z++)
1398
0
        p[z] = c->u.xpack.rmap[z];
1399
0
    hts_unpack(sub_b->data, sub_b->uncomp_size, b->data, b->uncomp_size,
1400
0
               8 / c->u.xpack.nbits, p);
1401
1402
0
    return 0;
1403
1404
0
 block_err:
1405
0
    return -1;
1406
0
}
1407
1408
0
int cram_xpack_decode_char(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
1409
    // FIXME: we need to ban data-series interleaving in the spec for this to work.
1410
1411
    // Remember this may be called when threaded and multi-slice per container.
1412
    // Hence one cram_codec instance, multiple slices, multiple blocks.
1413
    // We therefore have to cache appropriate block info in slice and not codec.
1414
    //    b = cram_get_block_by_id(slice, c->external.content_id);
1415
0
    if (c->u.xpack.nval > 1) {
1416
0
        cram_xpack_decode_expand_char(slice, c);
1417
0
        cram_block *b = slice->block_by_id[512 + c->codec_id];
1418
0
        if (!b)
1419
0
            return -1;
1420
1421
0
        if (out)
1422
0
            memcpy(out, b->data + b->byte, *out_size);
1423
0
        b->byte += *out_size;
1424
0
    } else if (out) {
1425
0
        memset(out, c->u.xpack.rmap[0], *out_size);
1426
0
    }
1427
1428
0
    return 0;
1429
0
}
1430
1431
306
void cram_xpack_decode_free(cram_codec *c) {
1432
306
    if (!c) return;
1433
1434
306
    if (c->u.xpack.sub_codec)
1435
213
        c->u.xpack.sub_codec->free(c->u.xpack.sub_codec);
1436
1437
    //free(slice->block_by_id[512 + c->codec_id]);
1438
    //slice->block_by_id[512 + c->codec_id] = 0;
1439
1440
306
    free(c);
1441
306
}
1442
1443
0
int cram_xpack_decode_size(cram_slice *slice, cram_codec *c) {
1444
0
    cram_xpack_decode_expand_char(slice, c);
1445
0
    return slice->block_by_id[512 + c->codec_id]->uncomp_size;
1446
0
}
1447
1448
0
cram_block *cram_xpack_get_block(cram_slice *slice, cram_codec *c) {
1449
0
    cram_xpack_decode_expand_char(slice, c);
1450
0
    return slice->block_by_id[512 + c->codec_id];
1451
0
}
1452
1453
cram_codec *cram_xpack_decode_init(cram_block_compression_hdr *hdr,
1454
                                   char *data, int size,
1455
                                   enum cram_encoding codec,
1456
                                   enum cram_external_type option,
1457
306
                                   int version, varint_vec *vv) {
1458
306
    cram_codec *c;
1459
306
    char *cp = data;
1460
306
    char *endp = data+size;
1461
1462
306
    if (!(c = calloc(1, sizeof(*c))))
1463
0
        return NULL;
1464
1465
306
    c->codec  = E_XPACK;
1466
306
    if (option == E_LONG)
1467
3
        c->decode = cram_xpack_decode_long;
1468
303
    else if (option == E_INT)
1469
231
        c->decode = cram_xpack_decode_int;
1470
72
    else if (option == E_BYTE_ARRAY || option == E_BYTE)
1471
69
        c->decode = cram_xpack_decode_char;
1472
3
    else {
1473
3
        fprintf(stderr, "BYTE_ARRAYs not supported by this codec\n");
1474
3
        goto malformed;
1475
3
    }
1476
303
    c->free = cram_xpack_decode_free;
1477
303
    c->size = cram_xpack_decode_size;
1478
303
    c->get_block = cram_xpack_get_block;
1479
303
    c->describe = NULL;
1480
1481
303
    c->u.xpack.nbits = vv->varint_get32(&cp, endp, NULL);
1482
303
    c->u.xpack.nval  = vv->varint_get32(&cp, endp, NULL);
1483
303
    if (c->u.xpack.nbits >= 8  || c->u.xpack.nbits < 0 ||
1484
285
        c->u.xpack.nval  > 256 || c->u.xpack.nval < 0)
1485
30
        goto malformed;
1486
273
    int i;
1487
1.90k
    for (i = 0; i < c->u.xpack.nval; i++) {
1488
1.64k
        uint32_t v = vv->varint_get32(&cp, endp, NULL);
1489
1.64k
        if (v >= 256)
1490
15
            goto malformed;
1491
1.63k
        c->u.xpack.rmap[i] = v; // reverse map: e.g 0-3 to P,A,C,K
1492
1.63k
    }
1493
1494
258
    int encoding = vv->varint_get32(&cp, endp, NULL);
1495
258
    int sub_size = vv->varint_get32(&cp, endp, NULL);
1496
258
    if (sub_size < 0 || endp - cp < sub_size)
1497
12
        goto malformed;
1498
246
    c->u.xpack.sub_codec = cram_decoder_init(hdr, encoding, cp, sub_size,
1499
246
                                             option, version, vv);
1500
246
    if (c->u.xpack.sub_codec == NULL)
1501
33
        goto malformed;
1502
213
    cp += sub_size;
1503
1504
213
    if (cp - data != size
1505
198
        || c->u.xpack.nbits < 0 || c->u.xpack.nbits > 8 * sizeof(int64_t)) {
1506
108
    malformed:
1507
108
        fprintf(stderr, "Malformed xpack header stream\n");
1508
108
        cram_xpack_decode_free(c);
1509
108
        return NULL;
1510
15
    }
1511
1512
198
    return c;
1513
213
}
1514
1515
0
int cram_xpack_encode_flush(cram_codec *c) {
1516
    // Pack the buffered up data
1517
0
    int meta_len;
1518
0
    uint64_t out_len;
1519
0
    uint8_t out_meta[1024];
1520
0
    uint8_t *out = hts_pack(BLOCK_DATA(c->out), BLOCK_SIZE(c->out),
1521
0
                            out_meta, &meta_len, &out_len);
1522
1523
    // We now need to pass this through the next layer of transform
1524
0
    if (c->u.e_xpack.sub_codec->encode(NULL, // also indicates flush incoming
1525
0
                                     c->u.e_xpack.sub_codec,
1526
0
                                     (char *)out, out_len))
1527
0
        return -1;
1528
1529
0
    int r = 0;
1530
0
    if (c->u.e_xpack.sub_codec->flush)
1531
0
        r = c->u.e_xpack.sub_codec->flush(c->u.e_xpack.sub_codec);
1532
1533
0
    free(out);
1534
0
    return r;
1535
0
}
1536
1537
int cram_xpack_encode_store(cram_codec *c, cram_block *b,
1538
0
                            char *prefix, int version) {
1539
0
    int len = 0, r = 0, n;
1540
1541
0
    if (prefix) {
1542
0
        size_t l = strlen(prefix);
1543
0
        BLOCK_APPEND(b, prefix, l);
1544
0
        len += l;
1545
0
    }
1546
1547
    // Store sub-codec
1548
0
    cram_codec *tc = c->u.e_xpack.sub_codec;
1549
0
    cram_block *tb = cram_new_block(0, 0);
1550
0
    if (!tb)
1551
0
        return -1;
1552
0
    int len2 = tc->store(tc, tb, NULL, version);
1553
1554
0
    len += (n = c->vv->varint_put32_blk(b, c->codec)); r |= n;
1555
1556
    // codec length
1557
0
    int len1 = 0, i;
1558
0
    for (i = 0; i < c->u.e_xpack.nval; i++)
1559
0
        len1 += (n = c->vv->varint_size(c->u.e_xpack.rmap[i])), r |= n;
1560
0
    len += (n = c->vv->varint_put32_blk(b, c->vv->varint_size(c->u.e_xpack.nbits)
1561
0
                                        +  c->vv->varint_size(c->u.e_xpack.nval)
1562
0
                                        + len1 + len2)); r |= n;
1563
1564
    // The map and sub-codec
1565
0
    len += (n = c->vv->varint_put32_blk(b, c->u.e_xpack.nbits)); r |= n;
1566
0
    len += (n = c->vv->varint_put32_blk(b, c->u.e_xpack.nval));  r |= n;
1567
0
    for (i = 0; i < c->u.e_xpack.nval; i++)
1568
0
        len += (n = c->vv->varint_put32_blk(b, c->u.e_xpack.rmap[i])), r |= n;
1569
1570
0
    BLOCK_APPEND(b, BLOCK_DATA(tb), BLOCK_SIZE(tb));
1571
1572
0
    cram_free_block(tb);
1573
1574
0
    return r > 0 ? len + len2 : -1;
1575
1576
0
 block_err:
1577
0
    return -1;
1578
0
}
1579
1580
// Same as cram_beta_encode_long
1581
int cram_xpack_encode_long(cram_slice *slice, cram_codec *c,
1582
0
                           char *in, int in_size) {
1583
0
    int64_t *syms = (int64_t *)in;
1584
0
    int i, r = 0;
1585
1586
0
    for (i = 0; i < in_size; i++)
1587
0
        r |= store_bits_MSB(c->out, c->u.e_xpack.map[syms[i]], c->u.e_xpack.nbits);
1588
1589
0
    return r;
1590
0
}
1591
1592
int cram_xpack_encode_int(cram_slice *slice, cram_codec *c,
1593
0
                          char *in, int in_size) {
1594
0
    int *syms = (int *)in;
1595
0
    int i, r = 0;
1596
1597
0
    for (i = 0; i < in_size; i++)
1598
0
        r |= store_bits_MSB(c->out, c->u.e_xpack.map[syms[i]], c->u.e_xpack.nbits);
1599
1600
0
    return r;
1601
0
}
1602
1603
int cram_xpack_encode_char(cram_slice *slice, cram_codec *c,
1604
0
                           char *in, int in_size) {
1605
0
    BLOCK_APPEND(c->out, in, in_size);
1606
0
    return 0;
1607
1608
0
 block_err:
1609
0
    return -1;
1610
0
}
1611
1612
0
void cram_xpack_encode_free(cram_codec *c) {
1613
0
    if (!c) return;
1614
1615
0
    if (c->u.e_xpack.sub_codec)
1616
0
        c->u.e_xpack.sub_codec->free(c->u.e_xpack.sub_codec);
1617
1618
0
    cram_free_block(c->out);
1619
1620
0
    free(c);
1621
0
}
1622
1623
cram_codec *cram_xpack_encode_init(cram_stats *st,
1624
                                   enum cram_encoding codec,
1625
                                   enum cram_external_type option,
1626
                                   void *dat,
1627
0
                                   int version, varint_vec *vv) {
1628
0
    cram_codec *c;
1629
1630
0
    if (!(c = malloc(sizeof(*c))))
1631
0
        return NULL;
1632
1633
0
    c->codec  = E_XPACK;
1634
0
    c->free   = cram_xpack_encode_free;
1635
0
    if (option == E_LONG)
1636
0
        c->encode = cram_xpack_encode_long;
1637
0
    else if (option == E_INT)
1638
0
        c->encode = cram_xpack_encode_int;
1639
0
    else
1640
0
        c->encode = cram_xpack_encode_char;
1641
0
    c->store  = cram_xpack_encode_store;
1642
0
    c->flush  = cram_xpack_encode_flush;
1643
1644
0
    cram_xpack_encoder *e = (cram_xpack_encoder *)dat;
1645
0
    c->u.e_xpack.nbits = e->nbits;
1646
0
    c->u.e_xpack.nval = e->nval;
1647
0
    c->u.e_xpack.sub_codec = cram_encoder_init(e->sub_encoding, NULL,
1648
0
                                               E_BYTE_ARRAY, e->sub_codec_dat,
1649
0
                                               version, vv);
1650
1651
    // Initialise fwd and rev maps
1652
0
    memcpy(c->u.e_xpack.map, e->map, sizeof(e->map)); // P,A,C,K to 0,1,2,3
1653
0
    int i, n;
1654
0
    for (i = n = 0; i < 256; i++)
1655
0
        if (e->map[i] != -1)
1656
0
            c->u.e_xpack.rmap[n++] = i;               // 0,1,2,3 to P,A,C,K
1657
0
    if (n != e->nval) {
1658
0
        fprintf(stderr, "Incorrectly specified number of map items in PACK\n");
1659
0
        return NULL;
1660
0
    }
1661
1662
0
    return c;
1663
0
}
1664
1665
/*
1666
 * ---------------------------------------------------------------------------
1667
 * XDELTA: subtract successive values, zig-zag to turn +/- to + only,
1668
 * and then var-int encode the result.
1669
 *
1670
 * This also has the additional requirement that the data series is not
1671
 * interleaved with another, permitting efficient encoding and decoding
1672
 * of all elements enmasse instead of needing to only extract the bits
1673
 * necessary per item.
1674
 */
1675
1676
0
static uint8_t  zigzag8 (int8_t  x) { return (x << 1) ^ (x >>  7); }
1677
0
static uint16_t zigzag16(int16_t x) { return (x << 1) ^ (x >> 15); }
1678
0
static uint32_t zigzag32(int32_t x) { return (x << 1) ^ (x >> 31); }
1679
1680
//static int8_t  unzigzag8 (uint8_t  x) { return (x >> 1) ^ -(x & 1); }
1681
0
static int16_t unzigzag16(uint16_t x) { return (x >> 1) ^ -(x & 1); }
1682
0
static int32_t unzigzag32(uint32_t x) { return (x >> 1) ^ -(x & 1); }
1683
1684
0
int cram_xdelta_decode_long(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
1685
0
    return -1;
1686
0
}
1687
1688
0
int cram_xdelta_decode_int(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
1689
    // Slow value-by-value method for now
1690
0
    uint32_t *out32 = (uint32_t *)out;
1691
0
    int i;
1692
0
    for (i = 0; i < *out_size; i++) {
1693
0
        uint32_t v;
1694
0
        int one = 1;
1695
0
        if (c->u.e_xdelta.sub_codec->decode(slice, c->u.e_xdelta.sub_codec, in,
1696
0
                                          (char *)&v, &one) < 0)
1697
0
            return -1;
1698
0
        uint32_t d = unzigzag32(v);
1699
0
        c->u.xdelta.last = out32[i] = d + c->u.xdelta.last;
1700
0
    }
1701
1702
0
    return 0;
1703
0
}
1704
1705
0
static int cram_xdelta_decode_expand_char(cram_slice *slice, cram_codec *c) {
1706
0
    return -1;
1707
0
}
1708
1709
0
int cram_xdelta_decode_char(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
1710
0
    return -1;
1711
0
}
1712
1713
0
static inline int16_t le_int2(int16_t i) {
1714
0
    int16_t s;
1715
0
    i16_to_le(i, (uint8_t *)&s);
1716
0
    return s;
1717
0
}
1718
1719
int cram_xdelta_decode_block(cram_slice *slice, cram_codec *c, cram_block *in,
1720
0
                             char *out_, int *out_size) {
1721
0
    cram_block *out = (cram_block *)out_;
1722
0
    cram_block *b = c->u.e_xdelta.sub_codec->get_block(slice, c->u.e_xdelta.sub_codec);
1723
0
    int i = 0;
1724
1725
0
    const int w = c->u.xdelta.word_size;
1726
0
    uint32_t npad = (w - *out_size%w)%w;
1727
0
    uint32_t out_sz = *out_size + npad;
1728
0
    c->u.xdelta.last = 0;  // reset for each new array
1729
1730
0
    for (i = 0; i < out_sz; i += w) {
1731
0
        uint16_t v;
1732
        // Need better interface
1733
0
        char *cp = (char *)b->data + b->byte;
1734
0
        char *cp_end = (char *)b->data + b->uncomp_size;
1735
0
        int err = 0;
1736
0
        v = c->vv->varint_get32(&cp, cp_end, &err);
1737
0
        if (err)
1738
0
            return -1;
1739
0
        b->byte = cp - (char *)b->data;
1740
1741
0
        switch(w) {
1742
0
        case 2: {
1743
0
            int16_t d = unzigzag16(v), z;
1744
0
            c->u.xdelta.last = d + c->u.xdelta.last;
1745
0
            z = le_int2(c->u.xdelta.last);
1746
0
            BLOCK_APPEND(out, &z, 2-npad);
1747
0
            npad = 0;
1748
0
            break;
1749
0
        }
1750
0
        default:
1751
0
            fprintf(stderr, "Unsupported word size by XDELTA\n");
1752
0
            return -1;
1753
0
        }
1754
0
    }
1755
1756
0
    return 0;
1757
1758
0
 block_err:
1759
0
    return -1;
1760
0
}
1761
1762
210
void cram_xdelta_decode_free(cram_codec *c) {
1763
210
    if (!c) return;
1764
1765
210
    if (c->u.xdelta.sub_codec)
1766
165
        c->u.xdelta.sub_codec->free(c->u.xdelta.sub_codec);
1767
1768
210
    free(c);
1769
210
}
1770
1771
0
int cram_xdelta_decode_size(cram_slice *slice, cram_codec *c) {
1772
0
    cram_xdelta_decode_expand_char(slice, c);
1773
0
    return slice->block_by_id[512 + c->codec_id]->uncomp_size;
1774
0
}
1775
1776
0
cram_block *cram_xdelta_get_block(cram_slice *slice, cram_codec *c) {
1777
0
    cram_xdelta_decode_expand_char(slice, c);
1778
0
    return slice->block_by_id[512 + c->codec_id];
1779
0
}
1780
1781
cram_codec *cram_xdelta_decode_init(cram_block_compression_hdr *hdr,
1782
                                    char *data, int size,
1783
                                    enum cram_encoding codec,
1784
                                    enum cram_external_type option,
1785
210
                                    int version, varint_vec *vv) {
1786
210
    cram_codec *c;
1787
210
    char *cp = data;
1788
210
    char *endp = data+size;
1789
1790
210
    if (!(c = calloc(1, sizeof(*c))))
1791
0
        return NULL;
1792
1793
210
    c->codec  = E_XDELTA;
1794
210
    if (option == E_LONG)
1795
6
        c->decode = cram_xdelta_decode_long;
1796
204
    else if (option == E_INT)
1797
78
        c->decode = cram_xdelta_decode_int;
1798
126
    else if (option == E_BYTE_ARRAY || option == E_BYTE)
1799
78
        c->decode = cram_xdelta_decode_char;
1800
48
    else if (option == E_BYTE_ARRAY_BLOCK) {
1801
48
        option = E_BYTE_ARRAY;
1802
48
        c->decode = cram_xdelta_decode_block;
1803
48
    } else {
1804
0
        free(c);
1805
0
        return NULL;
1806
0
    }
1807
210
    c->free = cram_xdelta_decode_free;
1808
210
    c->size = cram_xdelta_decode_size;
1809
210
    c->get_block = cram_xdelta_get_block;
1810
210
    c->describe = NULL;
1811
1812
210
    c->u.xdelta.word_size = vv->varint_get32(&cp, endp, NULL);
1813
210
    c->u.xdelta.last = 0;
1814
1815
210
    int encoding = vv->varint_get32(&cp, endp, NULL);
1816
210
    int sub_size = vv->varint_get32(&cp, endp, NULL);
1817
210
    if (sub_size < 0 || endp - cp < sub_size)
1818
6
        goto malformed;
1819
204
    c->u.xdelta.sub_codec = cram_decoder_init(hdr, encoding, cp, sub_size,
1820
204
                                              option, version, vv);
1821
204
    if (c->u.xdelta.sub_codec == NULL)
1822
39
        goto malformed;
1823
165
    cp += sub_size;
1824
1825
165
    if (cp - data != size) {
1826
72
    malformed:
1827
72
        fprintf(stderr, "Malformed xdelta header stream\n");
1828
72
        cram_xdelta_decode_free(c);
1829
72
        return NULL;
1830
27
    }
1831
1832
138
    return c;
1833
165
}
1834
1835
0
int cram_xdelta_encode_flush(cram_codec *c) {
1836
0
    int r = -1;
1837
0
    cram_block *b = cram_new_block(0, 0);
1838
0
    if (!b)
1839
0
        return -1;
1840
1841
0
    switch (c->u.e_xdelta.word_size) {
1842
0
    case 2: {
1843
        // Delta + zigzag transform.
1844
        // Subtracting two 8-bit values has a 9-bit result (-255 to 255).
1845
        // However think of it as turning a wheel clockwise or anti-clockwise.
1846
        // If it has 256 gradations then a -ve rotation followed by a +ve
1847
        // rotation of the same amount reverses it regardless.
1848
        //
1849
        // Similarly the zig-zag transformation doesn't invent any extra bits,
1850
        // so the entire thing can be done in-situ.  This may permit faster
1851
        // SIMD loops if we break apart the steps.
1852
1853
        // uint16_t last = 0, d;
1854
        // for (i = 0; i < n; i++) {
1855
        //     d = io[i] - last;
1856
        //     last = io[i];
1857
        //     io[i] = zigzag16(vd);
1858
        // }
1859
1860
        // --- vs ---
1861
1862
        // for (i = n-1; i >= 1; i--)
1863
        //     io[i] -= io[i-1];
1864
        // for (i = 0; i < n; i++)
1865
        //     io[i] = zigzag16(io[i]);
1866
1867
        // varint: need array variant for speed here.
1868
        // With zig-zag
1869
0
        int i, n = BLOCK_SIZE(c->out)/2;;
1870
0
        uint16_t *dat = (uint16_t *)BLOCK_DATA(c->out), last = 0;
1871
1872
0
        if (n*2 < BLOCK_SIZE(c->out)) {
1873
            // half word
1874
0
            last = *(uint8_t *)dat;
1875
0
            c->vv->varint_put32_blk(b, zigzag16(last));
1876
0
            dat = (uint16_t *)(((uint8_t *)dat)+1);
1877
0
        }
1878
1879
0
        for (i = 0; i < n; i++) {
1880
0
            uint16_t d = dat[i] - last; // possibly unaligned
1881
0
            last = dat[i];
1882
0
            c->vv->varint_put32_blk(b, zigzag16(d));
1883
0
        }
1884
1885
0
        break;
1886
0
    }
1887
1888
0
    case 4: {
1889
0
        int i, n = BLOCK_SIZE(c->out)/4;;
1890
0
        uint32_t *dat = (uint32_t *)BLOCK_DATA(c->out), last = 0;
1891
1892
0
        for (i = 0; i < n; i++) {
1893
0
            uint32_t d = dat[i] - last;
1894
0
            last = dat[i];
1895
0
            c->vv->varint_put32_blk(b, zigzag32(d));
1896
0
        }
1897
1898
0
        break;
1899
0
    }
1900
1901
0
    case 1: {
1902
0
        int i, n = BLOCK_SIZE(c->out);;
1903
0
        uint8_t *dat = (uint8_t *)BLOCK_DATA(c->out), last = 0;
1904
1905
0
        for (i = 0; i < n; i++) {
1906
0
            uint32_t d = dat[i] - last;
1907
0
            last = dat[i];
1908
0
            c->vv->varint_put32_blk(b, zigzag8(d));
1909
0
        }
1910
1911
0
        break;
1912
0
    }
1913
1914
0
    default:
1915
0
        goto err;
1916
0
    }
1917
1918
0
    if (c->u.e_xdelta.sub_codec->encode(NULL, c->u.e_xdelta.sub_codec,
1919
0
                                      (char *)b->data, b->byte))
1920
0
        goto err;
1921
1922
0
    r = 0;
1923
1924
0
 err:
1925
0
    cram_free_block(b);
1926
0
    return r;
1927
1928
0
}
1929
1930
int cram_xdelta_encode_store(cram_codec *c, cram_block *b,
1931
0
                            char *prefix, int version) {
1932
0
    int len = 0, r = 0, n;
1933
1934
0
    if (prefix) {
1935
0
        size_t l = strlen(prefix);
1936
0
        BLOCK_APPEND(b, prefix, l);
1937
0
        len += l;
1938
0
    }
1939
1940
    // Store sub-codec
1941
0
    cram_codec *tc = c->u.e_xdelta.sub_codec;
1942
0
    cram_block *tb = cram_new_block(0, 0);
1943
0
    if (!tb)
1944
0
        return -1;
1945
0
    int len2 = tc->store(tc, tb, NULL, version);
1946
1947
0
    len += (n = c->vv->varint_put32_blk(b, c->codec)); r |= n;
1948
1949
    // codec length
1950
0
    len += (n = c->vv->varint_put32_blk(b, c->vv->varint_size(c->u.e_xdelta.word_size)
1951
0
                                        + len2)); r |= n;
1952
1953
    // This and sub-codec
1954
0
    len += (n = c->vv->varint_put32_blk(b, c->u.e_xdelta.word_size)); r |= n;
1955
0
    BLOCK_APPEND(b, BLOCK_DATA(tb), BLOCK_SIZE(tb));
1956
1957
0
    cram_free_block(tb);
1958
1959
0
    return r > 0 ? len + len2 : -1;
1960
1961
0
 block_err:
1962
0
    return -1;
1963
0
}
1964
1965
// Same as cram_beta_encode_long
1966
int cram_xdelta_encode_long(cram_slice *slice, cram_codec *c,
1967
0
                           char *in, int in_size) {
1968
0
    return -1;
1969
0
}
1970
1971
int cram_xdelta_encode_int(cram_slice *slice, cram_codec *c,
1972
0
                          char *in, int in_size) {
1973
0
    return -1;
1974
0
}
1975
1976
int cram_xdelta_encode_char(cram_slice *slice, cram_codec *c,
1977
0
                            char *in, int in_size) {
1978
0
    char *dat = malloc(in_size*5);
1979
0
    if (!dat)
1980
0
        return -1;
1981
0
    char *cp = dat, *cp_end = dat + in_size*5;
1982
1983
0
    c->u.e_xdelta.last = 0; // reset for each new array
1984
0
    if (c->u.e_xdelta.word_size == 2) {
1985
0
        int i, part;
1986
1987
0
        part = in_size%2;
1988
0
        if (part) {
1989
0
            uint16_t z = in[0];
1990
0
            c->u.e_xdelta.last = le_int2(z);
1991
0
            cp += c->vv->varint_put32(cp, cp_end, zigzag16(c->u.e_xdelta.last));
1992
0
        }
1993
1994
0
        uint16_t *in16 = (uint16_t *)(in+part);
1995
0
        for (i = 0; i < in_size/2; i++) {
1996
0
            uint16_t d = le_int2(in16[i]) - c->u.e_xdelta.last;
1997
0
            c->u.e_xdelta.last = le_int2(in16[i]);
1998
0
            cp += c->vv->varint_put32(cp, cp_end, zigzag16(d));
1999
0
        }
2000
0
    }
2001
0
    if (c->u.e_xdelta.sub_codec->encode(slice, c->u.e_xdelta.sub_codec,
2002
0
                                      (char *)dat, cp-dat)) {
2003
0
        free(dat);
2004
0
        return -1;
2005
0
    }
2006
2007
0
    free(dat);
2008
0
    return 0;
2009
0
}
2010
2011
0
void cram_xdelta_encode_free(cram_codec *c) {
2012
0
    if (!c) return;
2013
2014
0
    if (c->u.e_xdelta.sub_codec)
2015
0
        c->u.e_xdelta.sub_codec->free(c->u.e_xdelta.sub_codec);
2016
2017
0
    cram_free_block(c->out);
2018
2019
0
    free(c);
2020
0
}
2021
2022
cram_codec *cram_xdelta_encode_init(cram_stats *st,
2023
                                    enum cram_encoding codec,
2024
                                    enum cram_external_type option,
2025
                                    void *dat,
2026
0
                                    int version, varint_vec *vv) {
2027
0
    cram_codec *c;
2028
2029
0
    if (!(c = malloc(sizeof(*c))))
2030
0
        return NULL;
2031
2032
0
    c->codec  = E_XDELTA;
2033
0
    c->free   = cram_xdelta_encode_free;
2034
0
    if (option == E_LONG)
2035
0
        c->encode = cram_xdelta_encode_long;
2036
0
    else if (option == E_INT)
2037
0
        c->encode = cram_xdelta_encode_int;
2038
0
    else
2039
0
        c->encode = cram_xdelta_encode_char;
2040
0
    c->store  = cram_xdelta_encode_store;
2041
0
    c->flush  = cram_xdelta_encode_flush;
2042
2043
0
    cram_xdelta_encoder *e = (cram_xdelta_encoder *)dat;
2044
0
    c->u.e_xdelta.word_size = e->word_size;
2045
0
    c->u.e_xdelta.last = 0;
2046
0
    c->u.e_xdelta.sub_codec = cram_encoder_init(e->sub_encoding, NULL,
2047
0
                                                E_BYTE_ARRAY,
2048
0
                                                e->sub_codec_dat,
2049
0
                                                version, vv);
2050
2051
0
    return c;
2052
0
}
2053
2054
/*
2055
 * ---------------------------------------------------------------------------
2056
 * XRLE
2057
 *
2058
 * This also has the additional requirement that the data series is not
2059
 * interleaved with another, permitting efficient encoding and decoding
2060
 * of all elements enmasse instead of needing to only extract the bits
2061
 * necessary per item.
2062
 */
2063
0
int cram_xrle_decode_long(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
2064
    // TODO if and when needed
2065
0
    return -1;
2066
0
}
2067
2068
0
int cram_xrle_decode_int(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
2069
    // TODO if and when needed
2070
0
    return -1;
2071
0
}
2072
2073
// Expands an XRLE transform and caches result in slice->block_by_id[]
2074
0
static int cram_xrle_decode_expand_char(cram_slice *slice, cram_codec *c) {
2075
0
    cram_block *b = slice->block_by_id[512 + c->codec_id];
2076
0
    if (b)
2077
0
        return 0;
2078
2079
0
    b = slice->block_by_id[512 + c->codec_id] = cram_new_block(0, 0);
2080
0
    if (!b)
2081
0
        return -1;
2082
0
    cram_block *lit_b = c->u.xrle.lit_codec->get_block(slice, c->u.xrle.lit_codec);
2083
0
    if (!lit_b)
2084
0
        return -1;
2085
0
    unsigned char *lit_dat = lit_b->data;
2086
0
    unsigned int lit_sz = lit_b->uncomp_size;
2087
0
    unsigned int len_sz = c->u.xrle.len_codec->size(slice, c->u.xrle.len_codec);
2088
2089
0
    cram_block *len_b = c->u.xrle.len_codec->get_block(slice, c->u.xrle.len_codec);
2090
0
    if (!len_b)
2091
0
        return -1;
2092
0
    unsigned char *len_dat = len_b->data;
2093
2094
0
    uint8_t rle_syms[256];
2095
0
    int rle_nsyms = 0;
2096
0
    int i;
2097
0
    for (i = 0; i < 256; i++) {
2098
0
        if (c->u.xrle.rep_score[i] > 0)
2099
0
            rle_syms[rle_nsyms++] = i;
2100
0
    }
2101
2102
0
    uint64_t out_sz;
2103
0
    int nb = var_get_u64(len_dat, len_dat+len_sz, &out_sz);
2104
0
    if (!(b->data = malloc(out_sz)))
2105
0
        return -1;
2106
0
    hts_rle_decode(lit_dat, lit_sz,
2107
0
                   len_dat+nb, len_sz-nb,
2108
0
                   rle_syms, rle_nsyms,
2109
0
                   b->data, &out_sz);
2110
0
    b->uncomp_size = out_sz;
2111
2112
0
    return 0;
2113
0
}
2114
2115
0
int cram_xrle_decode_size(cram_slice *slice, cram_codec *c) {
2116
0
    cram_xrle_decode_expand_char(slice, c);
2117
0
    return slice->block_by_id[512 + c->codec_id]->uncomp_size;
2118
0
}
2119
2120
0
cram_block *cram_xrle_get_block(cram_slice *slice, cram_codec *c) {
2121
0
    cram_xrle_decode_expand_char(slice, c);
2122
0
    return slice->block_by_id[512 + c->codec_id];
2123
0
}
2124
2125
0
int cram_xrle_decode_char(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
2126
0
    int n = *out_size;
2127
2128
0
    cram_xrle_decode_expand_char(slice, c);
2129
0
    cram_block *b = slice->block_by_id[512 + c->codec_id];
2130
2131
0
    if (out)
2132
0
        memcpy(out, b->data + b->idx, n);
2133
0
    b->idx += n;
2134
0
    return 0;
2135
2136
    // Old code when not cached
2137
0
    while (n > 0) {
2138
0
        if (c->u.xrle.cur_len == 0) {
2139
0
            unsigned char lit;
2140
0
            int one = 1;
2141
0
            if (c->u.xrle.lit_codec->decode(slice, c->u.xrle.lit_codec, in,
2142
0
                                          (char *)&lit, &one) < 0)
2143
0
                return -1;
2144
0
            c->u.xrle.cur_lit = lit;
2145
2146
0
            if (c->u.xrle.rep_score[lit] > 0) {
2147
0
                if (c->u.xrle.len_codec->decode(slice, c->u.xrle.len_codec, in,
2148
0
                                              (char *)&c->u.xrle.cur_len, &one) < 0)
2149
0
                    return -1;
2150
0
            } // else cur_len still zero
2151
            //else fprintf(stderr, "%d\n", lit);
2152
2153
0
            c->u.xrle.cur_len++;
2154
0
        }
2155
2156
0
        if (n >= c->u.xrle.cur_len) {
2157
0
            memset(out, c->u.xrle.cur_lit, c->u.xrle.cur_len);
2158
0
            out += c->u.xrle.cur_len;
2159
0
            n -= c->u.xrle.cur_len;
2160
0
            c->u.xrle.cur_len = 0;
2161
0
        } else {
2162
0
            memset(out, c->u.xrle.cur_lit, n);
2163
0
            out += n;
2164
0
            c->u.xrle.cur_len -= n;
2165
0
            n = 0;
2166
0
        }
2167
0
    }
2168
2169
0
    return 0;
2170
0
}
2171
2172
168
void cram_xrle_decode_free(cram_codec *c) {
2173
168
    if (!c) return;
2174
2175
168
    if (c->u.xrle.len_codec)
2176
60
        c->u.xrle.len_codec->free(c->u.xrle.len_codec);
2177
2178
168
    if (c->u.xrle.lit_codec)
2179
24
        c->u.xrle.lit_codec->free(c->u.xrle.lit_codec);
2180
2181
168
    free(c);
2182
168
}
2183
2184
cram_codec *cram_xrle_decode_init(cram_block_compression_hdr *hdr,
2185
                                  char *data, int size,
2186
                                  enum cram_encoding codec,
2187
                                  enum cram_external_type option,
2188
174
                                  int version, varint_vec *vv) {
2189
174
    cram_codec *c;
2190
174
    char *cp = data;
2191
174
    char *endp = data+size;
2192
174
    int err = 0;
2193
2194
174
    if (!(c = calloc(1, sizeof(*c))))
2195
0
        return NULL;
2196
2197
174
    c->codec  = E_XRLE;
2198
174
    if (option == E_LONG)
2199
6
        c->decode = cram_xrle_decode_long;
2200
168
    else if (option == E_INT)
2201
69
        c->decode = cram_xrle_decode_int;
2202
99
    else if (option == E_BYTE_ARRAY || option == E_BYTE)
2203
93
        c->decode = cram_xrle_decode_char;
2204
6
    else {
2205
6
        fprintf(stderr, "BYTE_ARRAYs not supported by this codec\n");
2206
6
        free(c);
2207
6
        return NULL;
2208
6
    }
2209
168
    c->free   = cram_xrle_decode_free;
2210
168
    c->size   = cram_xrle_decode_size;
2211
168
    c->get_block = cram_xrle_get_block;
2212
168
    c->describe = NULL;
2213
168
    c->u.xrle.cur_len = 0;
2214
168
    c->u.xrle.cur_lit = -1;
2215
2216
    // RLE map
2217
168
    int i, j, nrle = vv->varint_get32(&cp, endp, &err);
2218
168
    memset(c->u.xrle.rep_score, 0, 256*sizeof(*c->u.xrle.rep_score));
2219
10.8k
    for (i = 0; i < nrle && i < 256; i++) {
2220
10.6k
        j = vv->varint_get32(&cp, endp, &err);
2221
10.6k
        if (j >= 0 && j < 256)
2222
10.2k
            c->u.xrle.rep_score[j] = 1;
2223
10.6k
    }
2224
2225
    // Length and literal sub encodings
2226
168
    c->u.xrle.len_encoding = vv->varint_get32(&cp, endp, &err);
2227
168
    int sub_size = vv->varint_get32(&cp, endp, &err);
2228
168
    if (sub_size < 0 || endp - cp < sub_size)
2229
18
        goto malformed;
2230
150
    c->u.xrle.len_codec = cram_decoder_init(hdr, c->u.xrle.len_encoding,
2231
150
                                            cp, sub_size, E_INT, version, vv);
2232
150
    if (c->u.xrle.len_codec == NULL)
2233
90
        goto malformed;
2234
60
    cp += sub_size;
2235
2236
60
    c->u.xrle.lit_encoding = vv->varint_get32(&cp, endp, &err);
2237
60
    sub_size = vv->varint_get32(&cp, endp, &err);
2238
60
    if (sub_size < 0 || endp - cp < sub_size)
2239
12
        goto malformed;
2240
48
    c->u.xrle.lit_codec = cram_decoder_init(hdr, c->u.xrle.lit_encoding,
2241
48
                                            cp, sub_size, option, version, vv);
2242
48
    if (c->u.xrle.lit_codec == NULL)
2243
24
        goto malformed;
2244
24
    cp += sub_size;
2245
2246
24
    if (err)
2247
0
        goto malformed;
2248
2249
24
    return c;
2250
2251
144
 malformed:
2252
144
    fprintf(stderr, "Malformed xrle header stream\n");
2253
144
    cram_xrle_decode_free(c);
2254
144
    return NULL;
2255
24
}
2256
2257
0
int cram_xrle_encode_flush(cram_codec *c) {
2258
0
    uint8_t *out_lit, *out_len;
2259
0
    uint64_t out_lit_size, out_len_size;
2260
0
    uint8_t rle_syms[256];
2261
0
    int rle_nsyms = 0, i;
2262
2263
0
    for (i = 0; i < 256; i++)
2264
0
        if (c->u.e_xrle.rep_score[i] > 0)
2265
0
            rle_syms[rle_nsyms++] = i;
2266
2267
0
    if (!c->u.e_xrle.to_flush) {
2268
0
        c->u.e_xrle.to_flush = (char *)BLOCK_DATA(c->out);
2269
0
        c->u.e_xrle.to_flush_size = BLOCK_SIZE(c->out);
2270
0
    }
2271
2272
0
    out_len = malloc(c->u.e_xrle.to_flush_size+8);
2273
0
    if (!out_len)
2274
0
        return -1;
2275
2276
0
    int nb = var_put_u64(out_len, NULL, c->u.e_xrle.to_flush_size);
2277
2278
0
    out_lit = hts_rle_encode((uint8_t *)c->u.e_xrle.to_flush, c->u.e_xrle.to_flush_size,
2279
0
                             out_len+nb, &out_len_size,
2280
0
                             rle_syms, &rle_nsyms,
2281
0
                             NULL, &out_lit_size);
2282
0
    out_len_size += nb;
2283
2284
2285
    // TODO: can maybe "gift" the sub codec the data block, to remove
2286
    // one level of memcpy.
2287
0
    if (c->u.e_xrle.len_codec->encode(NULL,
2288
0
                                      c->u.e_xrle.len_codec,
2289
0
                                      (char *)out_len, out_len_size))
2290
0
        return -1;
2291
2292
0
    if (c->u.e_xrle.lit_codec->encode(NULL,
2293
0
                                      c->u.e_xrle.lit_codec,
2294
0
                                      (char *)out_lit, out_lit_size))
2295
0
        return -1;
2296
2297
0
    free(out_len);
2298
0
    free(out_lit);
2299
2300
0
    return 0;
2301
0
}
2302
2303
int cram_xrle_encode_store(cram_codec *c, cram_block *b,
2304
0
                            char *prefix, int version) {
2305
0
    int len = 0, r = 0, n;
2306
0
    cram_codec *tc;
2307
0
    cram_block *b_rle, *b_len, *b_lit;
2308
2309
0
    if (prefix) {
2310
0
        size_t l = strlen(prefix);
2311
0
        BLOCK_APPEND(b, prefix, l);
2312
0
        len += l;
2313
0
    }
2314
2315
    // List of symbols to RLE
2316
0
    b_rle = cram_new_block(0, 0);
2317
0
    if (!b_rle)
2318
0
        return -1;
2319
0
    int i, nrle = 0, len1 = 0;
2320
0
    for (i = 0; i < 256; i++) {
2321
0
        if (c->u.e_xrle.rep_score[i] > 0) {
2322
0
            nrle++;
2323
0
            len1 += (n = c->vv->varint_put32_blk(b_rle,i)); r |= n;
2324
0
        }
2325
0
    }
2326
2327
    // Store length and literal sub-codecs to get encoded length
2328
0
    tc = c->u.e_xrle.len_codec;
2329
0
    b_len = cram_new_block(0, 0);
2330
0
    if (!b_len)
2331
0
        return -1;
2332
0
    int len2 = tc->store(tc, b_len, NULL, version);
2333
2334
0
    tc = c->u.e_xrle.lit_codec;
2335
0
    b_lit = cram_new_block(0, 0);
2336
0
    if (!b_lit)
2337
0
        return -1;
2338
0
    int len3 = tc->store(tc, b_lit, NULL, version);
2339
2340
0
    len += (n = c->vv->varint_put32_blk(b, c->codec)); r |= n;
2341
0
    len += (n = c->vv->varint_put32_blk(b, len1 + len2 + len3
2342
0
                                        + c->vv->varint_size(nrle))); r |= n;
2343
0
    len += (n = c->vv->varint_put32_blk(b, nrle)); r |= n;
2344
0
    BLOCK_APPEND(b, BLOCK_DATA(b_rle), BLOCK_SIZE(b_rle));
2345
0
    BLOCK_APPEND(b, BLOCK_DATA(b_len), BLOCK_SIZE(b_len));
2346
0
    BLOCK_APPEND(b, BLOCK_DATA(b_lit), BLOCK_SIZE(b_lit));
2347
2348
0
    cram_free_block(b_rle);
2349
0
    cram_free_block(b_len);
2350
0
    cram_free_block(b_lit);
2351
2352
0
    if (r > 0)
2353
0
        return len + len1 + len2 + len3;
2354
2355
0
 block_err:
2356
0
    return -1;
2357
0
}
2358
2359
int cram_xrle_encode_long(cram_slice *slice, cram_codec *c,
2360
0
                           char *in, int in_size) {
2361
    // TODO if and when needed
2362
0
    return -1;
2363
0
}
2364
2365
int cram_xrle_encode_int(cram_slice *slice, cram_codec *c,
2366
0
                          char *in, int in_size) {
2367
    // TODO if and when needed
2368
0
    return -1;
2369
0
}
2370
2371
int cram_xrle_encode_char(cram_slice *slice, cram_codec *c,
2372
0
                          char *in, int in_size) {
2373
0
    if (c->u.e_xrle.to_flush) {
2374
0
        if (!c->out && !(c->out = cram_new_block(0, 0)))
2375
0
            return -1;
2376
0
        BLOCK_APPEND(c->out, c->u.e_xrle.to_flush, c->u.e_xrle.to_flush_size);
2377
0
        c->u.e_xrle.to_flush = NULL;
2378
0
        c->u.e_xrle.to_flush_size = 0;
2379
0
    }
2380
2381
0
    if (c->out && BLOCK_SIZE(c->out) > 0) {
2382
        // Gathering data
2383
0
        BLOCK_APPEND(c->out, in, in_size);
2384
0
        return 0;
2385
0
    }
2386
2387
    // else cache copy of the data we're about to send to flush instead.
2388
0
    c->u.e_xrle.to_flush = in;
2389
0
    c->u.e_xrle.to_flush_size = in_size;
2390
0
    return 0;
2391
2392
0
 block_err:
2393
0
    return -1;
2394
0
}
2395
2396
0
void cram_xrle_encode_free(cram_codec *c) {
2397
0
    if (!c) return;
2398
2399
0
    if (c->u.e_xrle.len_codec)
2400
0
        c->u.e_xrle.len_codec->free(c->u.e_xrle.len_codec);
2401
0
    if (c->u.e_xrle.lit_codec)
2402
0
        c->u.e_xrle.lit_codec->free(c->u.e_xrle.lit_codec);
2403
2404
0
    cram_free_block(c->out);
2405
2406
0
    free(c);
2407
0
}
2408
2409
cram_codec *cram_xrle_encode_init(cram_stats *st,
2410
                                  enum cram_encoding codec,
2411
                                  enum cram_external_type option,
2412
                                  void *dat,
2413
0
                                  int version, varint_vec *vv) {
2414
0
    cram_codec *c;
2415
2416
0
    if (!(c = malloc(sizeof(*c))))
2417
0
        return NULL;
2418
2419
0
    c->codec  = E_XRLE;
2420
0
    c->free   = cram_xrle_encode_free;
2421
0
    if (option == E_LONG)
2422
0
        c->encode = cram_xrle_encode_long;
2423
0
    else if (option == E_INT)
2424
0
        c->encode = cram_xrle_encode_int;
2425
0
    else
2426
0
        c->encode = cram_xrle_encode_char;
2427
0
    c->store  = cram_xrle_encode_store;
2428
0
    c->flush  = cram_xrle_encode_flush;
2429
2430
0
    cram_xrle_encoder *e = (cram_xrle_encoder *)dat;
2431
2432
0
    c->u.e_xrle.len_codec = cram_encoder_init(e->len_encoding, NULL,
2433
0
                                              E_BYTE, e->len_dat,
2434
0
                                              version, vv);
2435
0
    c->u.e_xrle.lit_codec = cram_encoder_init(e->lit_encoding, NULL,
2436
0
                                              E_BYTE, e->lit_dat,
2437
0
                                              version, vv);
2438
0
    c->u.e_xrle.cur_lit = -1;
2439
0
    c->u.e_xrle.cur_len = -1;
2440
0
    c->u.e_xrle.to_flush = NULL;
2441
0
    c->u.e_xrle.to_flush_size = 0;
2442
2443
0
    memcpy(c->u.e_xrle.rep_score, e->rep_score, 256*sizeof(*c->u.e_xrle.rep_score));
2444
2445
0
    return c;
2446
0
}
2447
2448
/*
2449
 * ---------------------------------------------------------------------------
2450
 * SUBEXP
2451
 */
2452
0
int cram_subexp_decode(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
2453
0
    int32_t *out_i = (int32_t *)out;
2454
0
    int n, count;
2455
0
    int k = c->u.subexp.k;
2456
2457
0
    for (count = 0, n = *out_size; count < n; count++) {
2458
0
        int i = 0, tail;
2459
0
        int val;
2460
2461
        /* Get number of 1s */
2462
        //while (get_bit_MSB(in) == 1) i++;
2463
0
        i = get_one_bits_MSB(in);
2464
0
        if (i < 0 || cram_not_enough_bits(in, i > 0 ? i + k - 1 : k))
2465
0
            return -1;
2466
        /*
2467
         * Val is
2468
         * i > 0:  2^(k+i-1) + k+i-1 bits
2469
         * i = 0:  k bits
2470
         */
2471
0
        if (i) {
2472
0
            tail = i + k-1;
2473
0
            val = 0;
2474
0
            while (tail) {
2475
                //val = val<<1; val |= get_bit_MSB(in);
2476
0
                GET_BIT_MSB(in, val);
2477
0
                tail--;
2478
0
            }
2479
0
            val += 1 << (i + k-1);
2480
0
        } else {
2481
0
            tail = k;
2482
0
            val = 0;
2483
0
            while (tail) {
2484
                //val = val<<1; val |= get_bit_MSB(in);
2485
0
                GET_BIT_MSB(in, val);
2486
0
                tail--;
2487
0
            }
2488
0
        }
2489
2490
0
        out_i[count] = val - c->u.subexp.offset;
2491
0
    }
2492
2493
0
    return 0;
2494
0
}
2495
2496
1.71k
void cram_subexp_decode_free(cram_codec *c) {
2497
1.71k
    if (c)
2498
1.71k
        free(c);
2499
1.71k
}
2500
2501
0
int cram_subexp_describe(cram_codec *c, kstring_t *ks) {
2502
0
    return ksprintf(ks, "SUBEXP(offset=%d,k=%d)",
2503
0
                    c->u.subexp.offset,
2504
0
                    c->u.subexp.k)
2505
0
        < 0 ? -1 : 0;
2506
0
}
2507
2508
cram_codec *cram_subexp_decode_init(cram_block_compression_hdr *hdr,
2509
                                    char *data, int size,
2510
                                    enum cram_encoding codec,
2511
                                    enum cram_external_type option,
2512
1.71k
                                    int version, varint_vec *vv) {
2513
1.71k
    cram_codec *c;
2514
1.71k
    char *cp = data;
2515
2516
1.71k
    if (option != E_INT) {
2517
3
        hts_log_error("This codec only supports INT encodings");
2518
3
        return NULL;
2519
3
    }
2520
2521
1.71k
    if (!(c = malloc(sizeof(*c))))
2522
0
        return NULL;
2523
2524
1.71k
    c->codec  = E_SUBEXP;
2525
1.71k
    c->decode = cram_subexp_decode;
2526
1.71k
    c->free   = cram_subexp_decode_free;
2527
1.71k
    c->describe = cram_subexp_describe;
2528
1.71k
    c->u.subexp.k = -1;
2529
2530
1.71k
    c->u.subexp.offset = vv->varint_get32(&cp, data + size, NULL);
2531
1.71k
    c->u.subexp.k      = vv->varint_get32(&cp, data + size, NULL);
2532
2533
1.71k
    if (cp - data != size || c->u.subexp.k < 0) {
2534
6
        hts_log_error("Malformed subexp header stream");
2535
6
        free(c);
2536
6
        return NULL;
2537
6
    }
2538
2539
1.71k
    return c;
2540
1.71k
}
2541
2542
/*
2543
 * ---------------------------------------------------------------------------
2544
 * GAMMA
2545
 */
2546
0
int cram_gamma_decode(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
2547
0
    int32_t *out_i = (int32_t *)out;
2548
0
    int i, n;
2549
2550
0
    for (i = 0, n = *out_size; i < n; i++) {
2551
0
        int nz = 0;
2552
0
        int val;
2553
        //while (get_bit_MSB(in) == 0) nz++;
2554
0
        nz = get_zero_bits_MSB(in);
2555
0
        if (cram_not_enough_bits(in, nz))
2556
0
            return -1;
2557
0
        val = 1;
2558
0
        while (nz > 0) {
2559
            //val <<= 1; val |= get_bit_MSB(in);
2560
0
            GET_BIT_MSB(in, val);
2561
0
            nz--;
2562
0
        }
2563
2564
0
        out_i[i] = val - c->u.gamma.offset;
2565
0
    }
2566
2567
0
    return 0;
2568
0
}
2569
2570
1.76k
void cram_gamma_decode_free(cram_codec *c) {
2571
1.76k
    if (c)
2572
1.76k
        free(c);
2573
1.76k
}
2574
2575
0
int cram_gamma_describe(cram_codec *c, kstring_t *ks) {
2576
0
    return ksprintf(ks, "GAMMA(offset=%d)", c->u.subexp.offset)
2577
0
        < 0 ? -1 : 0;
2578
0
}
2579
2580
cram_codec *cram_gamma_decode_init(cram_block_compression_hdr *hdr,
2581
                                   char *data, int size,
2582
                                   enum cram_encoding codec,
2583
                                   enum cram_external_type option,
2584
1.77k
                                   int version, varint_vec *vv) {
2585
1.77k
    cram_codec *c = NULL;
2586
1.77k
    char *cp = data;
2587
2588
1.77k
    if (option != E_INT) {
2589
6
        hts_log_error("This codec only supports INT encodings");
2590
6
        return NULL;
2591
6
    }
2592
2593
1.77k
    if (size < 1)
2594
3
        goto malformed;
2595
2596
1.77k
    if (!(c = malloc(sizeof(*c))))
2597
0
        return NULL;
2598
2599
1.77k
    c->codec  = E_GAMMA;
2600
1.77k
    c->decode = cram_gamma_decode;
2601
1.77k
    c->free   = cram_gamma_decode_free;
2602
1.77k
    c->describe = cram_gamma_describe;
2603
2604
1.77k
    c->u.gamma.offset = vv->varint_get32(&cp, data+size, NULL);
2605
2606
1.77k
    if (cp - data != size)
2607
6
        goto malformed;
2608
2609
1.76k
    return c;
2610
2611
9
 malformed:
2612
9
    hts_log_error("Malformed gamma header stream");
2613
9
    free(c);
2614
9
    return NULL;
2615
1.77k
}
2616
2617
/*
2618
 * ---------------------------------------------------------------------------
2619
 * HUFFMAN
2620
 */
2621
2622
2.26k
static int code_sort(const void *vp1, const void *vp2) {
2623
2.26k
    const cram_huffman_code *c1 = (const cram_huffman_code *)vp1;
2624
2.26k
    const cram_huffman_code *c2 = (const cram_huffman_code *)vp2;
2625
2626
2.26k
    if (c1->len != c2->len)
2627
633
        return c1->len - c2->len;
2628
1.62k
    else
2629
1.62k
        return c1->symbol < c2->symbol ? -1 : (c1->symbol > c2->symbol ? 1 : 0);
2630
2.26k
}
2631
2632
852
void cram_huffman_decode_free(cram_codec *c) {
2633
852
    if (!c)
2634
0
        return;
2635
2636
852
    if (c->u.huffman.codes)
2637
639
        free(c->u.huffman.codes);
2638
852
    free(c);
2639
852
}
2640
2641
int cram_huffman_decode_null(cram_slice *slice, cram_codec *c,
2642
0
                             cram_block *in, char *out, int *out_size) {
2643
0
    return -1;
2644
0
}
2645
2646
int cram_huffman_decode_char0(cram_slice *slice, cram_codec *c,
2647
0
                              cram_block *in, char *out, int *out_size) {
2648
0
    int i, n;
2649
2650
0
    if (!out)
2651
0
        return 0;
2652
2653
    /* Special case of 0 length codes */
2654
0
    for (i = 0, n = *out_size; i < n; i++) {
2655
0
        out[i] = c->u.huffman.codes[0].symbol;
2656
0
    }
2657
0
    return 0;
2658
0
}
2659
2660
int cram_huffman_decode_char(cram_slice *slice, cram_codec *c,
2661
0
                             cram_block *in, char *out, int *out_size) {
2662
0
    int i, n, ncodes = c->u.huffman.ncodes;
2663
0
    const cram_huffman_code * const codes = c->u.huffman.codes;
2664
2665
0
    for (i = 0, n = *out_size; i < n; i++) {
2666
0
        int idx = 0;
2667
0
        int val = 0, len = 0, last_len = 0;
2668
2669
0
        for (;;) {
2670
0
            int dlen = codes[idx].len - last_len;
2671
0
            if (cram_not_enough_bits(in, dlen))
2672
0
                return -1;
2673
2674
            //val <<= dlen;
2675
            //val  |= get_bits_MSB(in, dlen);
2676
            //last_len = (len += dlen);
2677
2678
0
            last_len = (len += dlen);
2679
0
            for (; dlen; dlen--) GET_BIT_MSB(in, val);
2680
2681
0
            idx = val - codes[idx].p;
2682
0
            if (idx >= ncodes || idx < 0)
2683
0
                return -1;
2684
2685
0
            if (codes[idx].code == val && codes[idx].len == len) {
2686
0
                if (out) out[i] = codes[idx].symbol;
2687
0
                break;
2688
0
            }
2689
0
        }
2690
0
    }
2691
2692
0
    return 0;
2693
0
}
2694
2695
int cram_huffman_decode_int0(cram_slice *slice, cram_codec *c,
2696
0
                             cram_block *in, char *out, int *out_size) {
2697
0
    int32_t *out_i = (int32_t *)out;
2698
0
    int i, n;
2699
0
    const cram_huffman_code * const codes = c->u.huffman.codes;
2700
2701
    /* Special case of 0 length codes */
2702
0
    for (i = 0, n = *out_size; i < n; i++) {
2703
0
        out_i[i] = codes[0].symbol;
2704
0
    }
2705
0
    return 0;
2706
0
}
2707
2708
int cram_huffman_decode_int(cram_slice *slice, cram_codec *c,
2709
0
                            cram_block *in, char *out, int *out_size) {
2710
0
    int32_t *out_i = (int32_t *)out;
2711
0
    int i, n, ncodes = c->u.huffman.ncodes;
2712
0
    const cram_huffman_code * const codes = c->u.huffman.codes;
2713
2714
0
    for (i = 0, n = *out_size; i < n; i++) {
2715
0
        int idx = 0;
2716
0
        int val = 0, len = 0, last_len = 0;
2717
2718
        // Now one bit at a time for remaining checks
2719
0
        for (;;) {
2720
0
            int dlen = codes[idx].len - last_len;
2721
0
            if (cram_not_enough_bits(in, dlen))
2722
0
                return -1;
2723
2724
            //val <<= dlen;
2725
            //val  |= get_bits_MSB(in, dlen);
2726
            //last_len = (len += dlen);
2727
2728
0
            last_len = (len += dlen);
2729
0
            for (; dlen; dlen--) GET_BIT_MSB(in, val);
2730
2731
0
            idx = val - codes[idx].p;
2732
0
            if (idx >= ncodes || idx < 0)
2733
0
                return -1;
2734
2735
0
            if (codes[idx].code == val && codes[idx].len == len) {
2736
0
                out_i[i] = codes[idx].symbol;
2737
0
                break;
2738
0
            }
2739
0
        }
2740
0
    }
2741
2742
0
    return 0;
2743
0
}
2744
2745
int cram_huffman_decode_long0(cram_slice *slice, cram_codec *c,
2746
0
                              cram_block *in, char *out, int *out_size) {
2747
0
    int64_t *out_i = (int64_t *)out;
2748
0
    int i, n;
2749
0
    const cram_huffman_code * const codes = c->u.huffman.codes;
2750
2751
    /* Special case of 0 length codes */
2752
0
    for (i = 0, n = *out_size; i < n; i++) {
2753
0
        out_i[i] = codes[0].symbol;
2754
0
    }
2755
0
    return 0;
2756
0
}
2757
2758
int cram_huffman_decode_long(cram_slice *slice, cram_codec *c,
2759
0
                             cram_block *in, char *out, int *out_size) {
2760
0
    int64_t *out_i = (int64_t *)out;
2761
0
    int i, n, ncodes = c->u.huffman.ncodes;
2762
0
    const cram_huffman_code * const codes = c->u.huffman.codes;
2763
2764
0
    for (i = 0, n = *out_size; i < n; i++) {
2765
0
        int idx = 0;
2766
0
        int val = 0, len = 0, last_len = 0;
2767
2768
        // Now one bit at a time for remaining checks
2769
0
        for (;;) {
2770
0
            int dlen = codes[idx].len - last_len;
2771
0
            if (cram_not_enough_bits(in, dlen))
2772
0
                return -1;
2773
2774
            //val <<= dlen;
2775
            //val  |= get_bits_MSB(in, dlen);
2776
            //last_len = (len += dlen);
2777
2778
0
            last_len = (len += dlen);
2779
0
            for (; dlen; dlen--) GET_BIT_MSB(in, val);
2780
2781
0
            idx = val - codes[idx].p;
2782
0
            if (idx >= ncodes || idx < 0)
2783
0
                return -1;
2784
2785
0
            if (codes[idx].code == val && codes[idx].len == len) {
2786
0
                out_i[i] = codes[idx].symbol;
2787
0
                break;
2788
0
            }
2789
0
        }
2790
0
    }
2791
2792
0
    return 0;
2793
0
}
2794
2795
0
int cram_huffman_describe(cram_codec *c, kstring_t *ks) {
2796
0
    int r = 0, n;
2797
0
    r |= ksprintf(ks, "HUFFMAN(codes={") < 0;
2798
0
    for (n = 0; n < c->u.huffman.ncodes; n++) {
2799
0
        r |= ksprintf(ks, "%s%"PRId64, n?",":"",
2800
0
                      c->u.huffman.codes[n].symbol);
2801
0
    }
2802
0
    r |= ksprintf(ks, "},lengths={") < 0;
2803
0
    for (n = 0; n < c->u.huffman.ncodes; n++) {
2804
0
        r |= ksprintf(ks, "%s%d", n?",":"",
2805
0
                      c->u.huffman.codes[n].len);
2806
0
    }
2807
0
    r |= ksprintf(ks, "})") < 0;
2808
0
    return r;
2809
0
}
2810
2811
/*
2812
 * Initialises a huffman decoder from an encoding data stream.
2813
 */
2814
cram_codec *cram_huffman_decode_init(cram_block_compression_hdr *hdr,
2815
                                     char *data, int size,
2816
                                     enum cram_encoding codec,
2817
                                     enum cram_external_type option,
2818
1.03k
                                     int version, varint_vec *vv) {
2819
1.03k
    int32_t ncodes = 0, i, j;
2820
1.03k
    char *cp = data, *data_end = &data[size];
2821
1.03k
    cram_codec *h;
2822
1.03k
    cram_huffman_code *codes = NULL;
2823
1.03k
    int32_t val, last_len, max_len = 0;
2824
1.03k
    uint32_t max_val; // needs one more bit than val
2825
1.03k
    const int max_code_bits = sizeof(val) * 8 - 1;
2826
1.03k
    int err = 0;
2827
2828
1.03k
    if (option == E_BYTE_ARRAY_BLOCK) {
2829
3
        hts_log_error("BYTE_ARRAYs not supported by this codec");
2830
3
        return NULL;
2831
3
    }
2832
2833
1.02k
    ncodes = vv->varint_get32(&cp, data_end, &err);
2834
1.02k
    if (ncodes < 0) {
2835
6
        hts_log_error("Invalid number of symbols in huffman stream");
2836
6
        return NULL;
2837
6
    }
2838
1.02k
    if (ncodes >= SIZE_MAX / sizeof(*codes)) {
2839
0
        errno = ENOMEM;
2840
0
        return NULL;
2841
0
    }
2842
1.02k
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
2843
1.02k
    if (ncodes > FUZZ_ALLOC_LIMIT / sizeof(*codes)) {
2844
9
        errno = ENOMEM;
2845
9
        return NULL;
2846
9
    }
2847
1.01k
#endif
2848
1.01k
    h = calloc(1, sizeof(*h));
2849
1.01k
    if (!h)
2850
0
        return NULL;
2851
2852
1.01k
    h->codec  = E_HUFFMAN;
2853
1.01k
    h->free   = cram_huffman_decode_free;
2854
2855
1.01k
    h->u.huffman.ncodes = ncodes;
2856
1.01k
    h->u.huffman.option = option;
2857
1.01k
    if (ncodes) {
2858
792
        codes = h->u.huffman.codes = malloc(ncodes * sizeof(*codes));
2859
792
        if (!codes) {
2860
0
            free(h);
2861
0
            return NULL;
2862
0
        }
2863
792
    } else {
2864
222
        codes = h->u.huffman.codes = NULL;
2865
222
    }
2866
2867
    /* Read symbols and bit-lengths */
2868
1.01k
    if (option == E_LONG) {
2869
6.42M
        for (i = 0; i < ncodes; i++)
2870
6.42M
            codes[i].symbol = vv->varint_get64(&cp, data_end, &err);
2871
960
    } else if (option == E_INT || option == E_BYTE) {
2872
1.27M
        for (i = 0; i < ncodes; i++)
2873
1.27M
            codes[i].symbol = vv->varint_get32(&cp, data_end, &err);
2874
951
    } else {
2875
9
        goto malformed;
2876
9
    }
2877
2878
1.00k
    if (err)
2879
57
        goto malformed;
2880
2881
948
    i = vv->varint_get32(&cp, data_end, &err);
2882
948
    if (i != ncodes)
2883
36
        goto malformed;
2884
2885
912
    if (ncodes == 0) {
2886
        /* NULL huffman stream.  Ensure it returns an error if
2887
           anything tries to use it. */
2888
213
        h->decode = cram_huffman_decode_null;
2889
213
        return h;
2890
213
    }
2891
2892
2.85k
    for (i = 0; i < ncodes; i++) {
2893
2.18k
        codes[i].len = vv->varint_get32(&cp, data_end, &err);
2894
2.18k
        if (err)
2895
15
            break;
2896
2.16k
        if (codes[i].len < 0) {
2897
12
            hts_log_error("Huffman code length (%d) is negative", codes[i].len);
2898
12
            goto malformed;
2899
12
        }
2900
2.15k
        if (max_len < codes[i].len)
2901
516
            max_len = codes[i].len;
2902
2.15k
    }
2903
687
    if (err || cp - data != size || max_len >= ncodes)
2904
24
        goto malformed;
2905
2906
    /* 31 is max. bits available in val */
2907
663
    if (max_len > max_code_bits) {
2908
3
        hts_log_error("Huffman code length (%d) is greater "
2909
3
                      "than maximum supported (%d)", max_len, max_code_bits);
2910
3
        goto malformed;
2911
3
    }
2912
2913
    /* Sort by bit length and then by symbol value */
2914
660
    qsort(codes, ncodes, sizeof(*codes), code_sort);
2915
2916
    /* Assign canonical codes */
2917
660
    val = -1, last_len = 0, max_val = 0;
2918
1.91k
    for (i = 0; i < ncodes; i++) {
2919
1.27k
        val++;
2920
1.27k
        if (val > max_val)
2921
21
            goto malformed;
2922
2923
1.25k
        if (codes[i].len > last_len) {
2924
405
            val <<= (codes[i].len - last_len);
2925
405
            last_len = codes[i].len;
2926
405
            max_val = (1U << codes[i].len) - 1;
2927
405
        }
2928
1.25k
        codes[i].code = val;
2929
1.25k
    }
2930
2931
    /*
2932
     * Compute the next starting point, offset by the i'th value.
2933
     * For example if codes 10, 11, 12, 13 are 30, 31, 32, 33 then
2934
     * codes[10..13].p = 30 - 10.
2935
     */
2936
639
    last_len = 0;
2937
1.87k
    for (i = j = 0; i < ncodes; i++) {
2938
1.23k
        if (codes[i].len > last_len) {
2939
405
            j = codes[i].code - i;
2940
405
            last_len = codes[i].len;
2941
405
        }
2942
1.23k
        codes[i].p = j;
2943
1.23k
    }
2944
2945
    // puts("==HUFF LEN==");
2946
    // for (i = 0; i <= last_len+1; i++) {
2947
    //     printf("len %d=%d prefix %d\n", i, h->u.huffman.lengths[i], h->u.huffman.prefix[i]);
2948
    // }
2949
    // puts("===HUFFMAN CODES===");
2950
    // for (i = 0; i < ncodes; i++) {
2951
    //     int j;
2952
    //     printf("%d: %d %d %d ", i, codes[i].symbol, codes[i].len, codes[i].code);
2953
    //     j = codes[i].len;
2954
    //     while (j) {
2955
    //         putchar(codes[i].code & (1 << --j) ? '1' : '0');
2956
    //     }
2957
    //     printf(" %d\n", codes[i].code);
2958
    // }
2959
2960
639
    if (option == E_BYTE || option == E_BYTE_ARRAY) {
2961
297
        if (h->u.huffman.codes[0].len == 0)
2962
144
            h->decode = cram_huffman_decode_char0;
2963
153
        else
2964
153
            h->decode = cram_huffman_decode_char;
2965
342
    } else if (option == E_LONG || option == E_SLONG) {
2966
0
        if (h->u.huffman.codes[0].len == 0)
2967
0
            h->decode = cram_huffman_decode_long0;
2968
0
        else
2969
0
            h->decode = cram_huffman_decode_long;
2970
342
    } else if (option == E_INT || option == E_SINT || option == E_BYTE) {
2971
342
        if (h->u.huffman.codes[0].len == 0)
2972
120
            h->decode = cram_huffman_decode_int0;
2973
222
        else
2974
222
            h->decode = cram_huffman_decode_int;
2975
342
    } else {
2976
0
        return NULL;
2977
0
    }
2978
639
    h->describe = cram_huffman_describe;
2979
2980
639
    return (cram_codec *)h;
2981
2982
162
 malformed:
2983
162
    hts_log_error("Malformed huffman header stream");
2984
162
    free(codes);
2985
162
    free(h);
2986
162
    return NULL;
2987
639
}
2988
2989
int cram_huffman_encode_char0(cram_slice *slice, cram_codec *c,
2990
9.41k
                              char *in, int in_size) {
2991
9.41k
    return 0;
2992
9.41k
}
2993
2994
int cram_huffman_encode_char(cram_slice *slice, cram_codec *c,
2995
0
                             char *in, int in_size) {
2996
0
    int i, code, len, r = 0;
2997
0
    unsigned char *syms = (unsigned char *)in;
2998
2999
0
    while (in_size--) {
3000
0
        int sym = *syms++;
3001
0
        if (sym >= -1 && sym < MAX_HUFF) {
3002
0
            i = c->u.e_huffman.val2code[sym+1];
3003
0
            assert(c->u.e_huffman.codes[i].symbol == sym);
3004
0
            code = c->u.e_huffman.codes[i].code;
3005
0
            len  = c->u.e_huffman.codes[i].len;
3006
0
        } else {
3007
            /* Slow - use a lookup table for when sym < MAX_HUFF? */
3008
0
            for (i = 0; i < c->u.e_huffman.nvals; i++) {
3009
0
                if (c->u.e_huffman.codes[i].symbol == sym)
3010
0
                    break;
3011
0
            }
3012
0
            if (i == c->u.e_huffman.nvals)
3013
0
                return -1;
3014
3015
0
            code = c->u.e_huffman.codes[i].code;
3016
0
            len  = c->u.e_huffman.codes[i].len;
3017
0
        }
3018
3019
0
        r |= store_bits_MSB(c->out, code, len);
3020
0
    }
3021
3022
0
    return r;
3023
0
}
3024
3025
int cram_huffman_encode_int0(cram_slice *slice, cram_codec *c,
3026
43.9M
                             char *in, int in_size) {
3027
43.9M
    return 0;
3028
43.9M
}
3029
3030
int cram_huffman_encode_int(cram_slice *slice, cram_codec *c,
3031
0
                            char *in, int in_size) {
3032
0
    int i, code, len, r = 0;
3033
0
    int *syms = (int *)in;
3034
3035
0
    while (in_size--) {
3036
0
        int sym = *syms++;
3037
3038
0
        if (sym >= -1 && sym < MAX_HUFF) {
3039
0
            i = c->u.e_huffman.val2code[sym+1];
3040
0
            assert(c->u.e_huffman.codes[i].symbol == sym);
3041
0
            code = c->u.e_huffman.codes[i].code;
3042
0
            len  = c->u.e_huffman.codes[i].len;
3043
0
        } else {
3044
            /* Slow - use a lookup table for when sym < MAX_HUFFMAN_SYM? */
3045
0
            for (i = 0; i < c->u.e_huffman.nvals; i++) {
3046
0
                if (c->u.e_huffman.codes[i].symbol == sym)
3047
0
                    break;
3048
0
            }
3049
0
            if (i == c->u.e_huffman.nvals)
3050
0
                return -1;
3051
3052
0
            code = c->u.e_huffman.codes[i].code;
3053
0
            len  = c->u.e_huffman.codes[i].len;
3054
0
        }
3055
3056
0
        r |= store_bits_MSB(c->out, code, len);
3057
0
    }
3058
3059
0
    return r;
3060
0
}
3061
3062
int cram_huffman_encode_long0(cram_slice *slice, cram_codec *c,
3063
0
                              char *in, int in_size) {
3064
0
    return 0;
3065
0
}
3066
3067
int cram_huffman_encode_long(cram_slice *slice, cram_codec *c,
3068
0
                             char *in, int in_size) {
3069
0
    int i, code, len, r = 0;
3070
0
    int64_t *syms = (int64_t *)in;
3071
3072
0
    while (in_size--) {
3073
0
        int sym = *syms++;
3074
3075
0
        if (sym >= -1 && sym < MAX_HUFF) {
3076
0
            i = c->u.e_huffman.val2code[sym+1];
3077
0
            assert(c->u.e_huffman.codes[i].symbol == sym);
3078
0
            code = c->u.e_huffman.codes[i].code;
3079
0
            len  = c->u.e_huffman.codes[i].len;
3080
0
        } else {
3081
            /* Slow - use a lookup table for when sym < MAX_HUFFMAN_SYM? */
3082
0
            for (i = 0; i < c->u.e_huffman.nvals; i++) {
3083
0
                if (c->u.e_huffman.codes[i].symbol == sym)
3084
0
                    break;
3085
0
            }
3086
0
            if (i == c->u.e_huffman.nvals)
3087
0
                return -1;
3088
3089
0
            code = c->u.e_huffman.codes[i].code;
3090
0
            len  = c->u.e_huffman.codes[i].len;
3091
0
        }
3092
3093
0
        r |= store_bits_MSB(c->out, code, len);
3094
0
    }
3095
3096
0
    return r;
3097
0
}
3098
3099
33.8k
void cram_huffman_encode_free(cram_codec *c) {
3100
33.8k
    if (!c)
3101
0
        return;
3102
3103
33.8k
    if (c->u.e_huffman.codes)
3104
33.8k
        free(c->u.e_huffman.codes);
3105
33.8k
    free(c);
3106
33.8k
}
3107
3108
/*
3109
 * Encodes a huffman tree.
3110
 * Returns number of bytes written.
3111
 */
3112
int cram_huffman_encode_store(cram_codec *c, cram_block *b, char *prefix,
3113
33.2k
                              int version) {
3114
33.2k
    int i, len = 0, r = 0, n;
3115
33.2k
    cram_huffman_code *codes = c->u.e_huffman.codes;
3116
    /*
3117
     * Up to code length 127 means 2.5e+26 bytes of data required (worst
3118
     * case huffman tree needs symbols with freqs matching the Fibonacci
3119
     * series). So guaranteed 1 byte per code.
3120
     *
3121
     * Symbols themselves could be 5 bytes (eg -1 is 5 bytes in itf8).
3122
     *
3123
     * Therefore 6*ncodes + 5 + 5 + 1 + 5 is max memory
3124
     */
3125
33.2k
    char *tmp = malloc(6*c->u.e_huffman.nvals+16);
3126
33.2k
    char *tp = tmp, *tpend = tmp+6*c->u.e_huffman.nvals+16;
3127
3128
33.2k
    if (!tmp)
3129
0
        return -1;
3130
3131
33.2k
    if (prefix) {
3132
32.4k
        size_t l = strlen(prefix);
3133
32.4k
        BLOCK_APPEND(b, prefix, l);
3134
32.4k
        len += l;
3135
32.4k
    }
3136
3137
33.2k
    tp += c->vv->varint_put32(tp, tpend, c->u.e_huffman.nvals);
3138
33.2k
    if (c->u.e_huffman.option == E_LONG) {
3139
0
        for (i = 0; i < c->u.e_huffman.nvals; i++) {
3140
0
            tp += c->vv->varint_put64(tp, tpend, codes[i].symbol);
3141
0
        }
3142
33.2k
    } else if (c->u.e_huffman.option == E_SLONG) {
3143
0
        for (i = 0; i < c->u.e_huffman.nvals; i++) {
3144
0
            tp += c->vv->varint_put64s(tp, tpend, codes[i].symbol);
3145
0
        }
3146
33.2k
    } else if (c->u.e_huffman.option == E_INT || c->u.e_huffman.option == E_BYTE) {
3147
66.4k
        for (i = 0; i < c->u.e_huffman.nvals; i++) {
3148
33.2k
            tp += c->vv->varint_put32(tp, tpend, codes[i].symbol);
3149
33.2k
        }
3150
33.2k
    } else if (c->u.e_huffman.option == E_SINT) {
3151
0
        for (i = 0; i < c->u.e_huffman.nvals; i++) {
3152
0
            tp += c->vv->varint_put32s(tp, tpend, codes[i].symbol);
3153
0
        }
3154
0
    } else {
3155
0
        return -1;
3156
0
    }
3157
3158
33.2k
    tp += c->vv->varint_put32(tp, tpend, c->u.e_huffman.nvals);
3159
66.4k
    for (i = 0; i < c->u.e_huffman.nvals; i++)
3160
33.2k
        tp += c->vv->varint_put32(tp, tpend, codes[i].len);
3161
3162
33.2k
    len += (n = c->vv->varint_put32_blk(b, c->codec)); r |= n;
3163
33.2k
    len += (n = c->vv->varint_put32_blk(b, tp-tmp));   r |= n;
3164
33.2k
    BLOCK_APPEND(b, tmp, tp-tmp);
3165
33.2k
    len += tp-tmp;
3166
3167
33.2k
    free(tmp);
3168
3169
33.2k
    if (r > 0)
3170
33.2k
        return len;
3171
3172
0
 block_err:
3173
0
    return -1;
3174
33.2k
}
3175
3176
cram_codec *cram_huffman_encode_init(cram_stats *st,
3177
                                     enum cram_encoding codec,
3178
                                     enum cram_external_type option,
3179
                                     void *dat,
3180
33.8k
                                     int version, varint_vec *vv) {
3181
33.8k
    int *vals = NULL, *freqs = NULL, *lens = NULL, code, len;
3182
33.8k
    int *new_vals, *new_freqs;
3183
33.8k
    int i, max_val = 0, min_val = INT_MAX, k;
3184
33.8k
    size_t nvals, vals_alloc = 0;
3185
33.8k
    cram_codec *c;
3186
33.8k
    cram_huffman_code *codes;
3187
3188
33.8k
    c = malloc(sizeof(*c));
3189
33.8k
    if (!c)
3190
0
        return NULL;
3191
33.8k
    c->codec = E_HUFFMAN;
3192
3193
    /* Count number of unique symbols */
3194
34.6M
    for (nvals = i = 0; i < MAX_STAT_VAL; i++) {
3195
34.6M
        if (!st->freqs[i])
3196
34.5M
            continue;
3197
24.8k
        if (nvals >= vals_alloc) {
3198
24.8k
            vals_alloc = vals_alloc ? vals_alloc*2 : 1024;
3199
24.8k
            new_vals  = realloc(vals,  vals_alloc * sizeof(int));
3200
24.8k
            if (!new_vals) goto nomem;
3201
24.8k
            vals = new_vals;
3202
24.8k
            new_freqs = realloc(freqs, vals_alloc * sizeof(int));
3203
24.8k
            if (!new_freqs) goto nomem;
3204
24.8k
            freqs = new_freqs;
3205
24.8k
        }
3206
24.8k
        vals[nvals] = i;
3207
24.8k
        freqs[nvals] = st->freqs[i];
3208
24.8k
        assert(st->freqs[i] > 0);
3209
24.8k
        if (max_val < i) max_val = i;
3210
24.8k
        if (min_val > i) min_val = i;
3211
24.8k
        nvals++;
3212
24.8k
    }
3213
33.8k
    if (st->h) {
3214
8.93k
        khint_t k;
3215
3216
44.6k
        for (k = kh_begin(st->h); k != kh_end(st->h); k++) {
3217
35.7k
            if (!kh_exist(st->h, k))
3218
26.8k
                continue;
3219
8.93k
            if (nvals >= vals_alloc) {
3220
8.93k
                vals_alloc = vals_alloc ? vals_alloc*2 : 1024;
3221
8.93k
                new_vals  = realloc(vals,  vals_alloc * sizeof(int));
3222
8.93k
                if (!new_vals) goto nomem;
3223
8.93k
                vals = new_vals;
3224
8.93k
                new_freqs = realloc(freqs, vals_alloc * sizeof(int));
3225
8.93k
                if (!new_freqs) goto nomem;
3226
8.93k
                freqs = new_freqs;
3227
8.93k
            }
3228
8.93k
            vals[nvals]= kh_key(st->h, k);
3229
8.93k
            freqs[nvals] = kh_val(st->h, k);
3230
8.93k
            assert(freqs[nvals] > 0);
3231
8.93k
            if (max_val < i) max_val = i;
3232
8.93k
            if (min_val > i) min_val = i;
3233
8.93k
            nvals++;
3234
8.93k
        }
3235
8.93k
    }
3236
3237
33.8k
    assert(nvals > 0);
3238
3239
33.8k
    new_freqs = realloc(freqs, 2*nvals*sizeof(*freqs));
3240
33.8k
    if (!new_freqs) goto nomem;
3241
33.8k
    freqs = new_freqs;
3242
33.8k
    lens = calloc(2*nvals, sizeof(*lens));
3243
33.8k
    if (!lens) goto nomem;
3244
3245
    /* Inefficient, use pointers to form chain so we can insert and maintain
3246
     * a sorted list? This is currently O(nvals^2) complexity.
3247
     */
3248
33.8k
    for (;;) {
3249
33.8k
        int low1 = INT_MAX, low2 = INT_MAX;
3250
33.8k
        int ind1 = 0, ind2 = 0;
3251
67.6k
        for (i = 0; i < nvals; i++) {
3252
33.8k
            if (freqs[i] < 0)
3253
0
                continue;
3254
33.8k
            if (low1 > freqs[i])
3255
33.8k
                low2 = low1, ind2 = ind1, low1 = freqs[i], ind1 = i;
3256
0
            else if (low2 > freqs[i])
3257
0
                low2 = freqs[i], ind2 = i;
3258
33.8k
        }
3259
33.8k
        if (low2 == INT_MAX)
3260
33.8k
            break;
3261
3262
0
        freqs[nvals] = low1 + low2;
3263
0
        lens[ind1] = nvals;
3264
0
        lens[ind2] = nvals;
3265
0
        freqs[ind1] *= -1;
3266
0
        freqs[ind2] *= -1;
3267
0
        nvals++;
3268
0
    }
3269
33.8k
    nvals = nvals/2+1;
3270
3271
    /* Assign lengths */
3272
67.6k
    for (i = 0; i < nvals; i++) {
3273
33.8k
        int code_len = 0;
3274
33.8k
        for (k = lens[i]; k; k = lens[k])
3275
0
            code_len++;
3276
33.8k
        lens[i] = code_len;
3277
33.8k
        freqs[i] *= -1;
3278
        //fprintf(stderr, "%d / %d => %d\n", vals[i], freqs[i], lens[i]);
3279
33.8k
    }
3280
3281
3282
    /* Sort, need in a struct */
3283
33.8k
    if (!(codes = malloc(nvals * sizeof(*codes))))
3284
0
        goto nomem;
3285
67.6k
    for (i = 0; i < nvals; i++) {
3286
33.8k
        codes[i].symbol = vals[i];
3287
33.8k
        codes[i].len = lens[i];
3288
33.8k
    }
3289
33.8k
    qsort(codes, nvals, sizeof(*codes), code_sort);
3290
3291
    /*
3292
     * Generate canonical codes from lengths.
3293
     * Sort by length.
3294
     * Start with 0.
3295
     * Every new code of same length is +1.
3296
     * Every new code of new length is +1 then <<1 per extra length.
3297
     *
3298
     * /\
3299
     * a/\
3300
     * /\/\
3301
     * bcd/\
3302
     *    ef
3303
     *
3304
     * a 1  0
3305
     * b 3  4 (0+1)<<2
3306
     * c 3  5
3307
     * d 3  6
3308
     * e 4  14  (6+1)<<1
3309
     * f 5  15
3310
     */
3311
33.8k
    code = 0; len = codes[0].len;
3312
67.6k
    for (i = 0; i < nvals; i++) {
3313
33.8k
        while (len != codes[i].len) {
3314
0
            code<<=1;
3315
0
            len++;
3316
0
        }
3317
33.8k
        codes[i].code = code++;
3318
3319
33.8k
        if (codes[i].symbol >= -1 && codes[i].symbol < MAX_HUFF)
3320
33.2k
            c->u.e_huffman.val2code[codes[i].symbol+1] = i;
3321
3322
        //fprintf(stderr, "sym %d, code %d, len %d\n",
3323
        //      codes[i].symbol, codes[i].code, codes[i].len);
3324
33.8k
    }
3325
3326
33.8k
    free(lens);
3327
33.8k
    free(vals);
3328
33.8k
    free(freqs);
3329
3330
33.8k
    c->u.e_huffman.codes = codes;
3331
33.8k
    c->u.e_huffman.nvals = nvals;
3332
33.8k
    c->u.e_huffman.option = option;
3333
3334
33.8k
    c->free = cram_huffman_encode_free;
3335
33.8k
    if (option == E_BYTE || option == E_BYTE_ARRAY) {
3336
277
        if (c->u.e_huffman.codes[0].len == 0)
3337
277
            c->encode = cram_huffman_encode_char0;
3338
0
        else
3339
0
            c->encode = cram_huffman_encode_char;
3340
33.5k
    } else if (option == E_INT || option == E_SINT) {
3341
33.5k
        if (c->u.e_huffman.codes[0].len == 0)
3342
33.5k
            c->encode = cram_huffman_encode_int0;
3343
0
        else
3344
0
            c->encode = cram_huffman_encode_int;
3345
33.5k
    } else if (option == E_LONG || option == E_SLONG) {
3346
0
        if (c->u.e_huffman.codes[0].len == 0)
3347
0
            c->encode = cram_huffman_encode_long0;
3348
0
        else
3349
0
            c->encode = cram_huffman_encode_long;
3350
0
    } else {
3351
0
        return NULL;
3352
0
    }
3353
33.8k
    c->store = cram_huffman_encode_store;
3354
33.8k
    c->flush = NULL;
3355
3356
33.8k
    return c;
3357
3358
0
 nomem:
3359
0
    hts_log_error("Out of memory");
3360
0
    free(vals);
3361
0
    free(freqs);
3362
0
    free(lens);
3363
0
    free(c);
3364
0
    return NULL;
3365
33.8k
}
3366
3367
/*
3368
 * ---------------------------------------------------------------------------
3369
 * BYTE_ARRAY_LEN
3370
 */
3371
int cram_byte_array_len_decode(cram_slice *slice, cram_codec *c,
3372
                               cram_block *in, char *out,
3373
0
                               int *out_size) {
3374
    /* Fetch length */
3375
0
    int32_t len = 0, one = 1;
3376
0
    int r;
3377
3378
0
    cram_codec *len_codec = c->u.byte_array_len.len_codec;
3379
0
    cram_codec *val_codec = c->u.byte_array_len.val_codec;
3380
3381
0
    r = len_codec->decode(slice, len_codec, in, (char *)&len, &one);
3382
0
    if (len < 0 || (len > *out_size &&
3383
0
                    !(val_codec->codec == E_EXTERNAL &&
3384
0
                      val_codec->u.external.type == E_BYTE_ARRAY_BLOCK))) {
3385
0
        fprintf(stderr, "Error: overflow in cram_byte_array_len_decode.\n");
3386
0
        return -1;
3387
0
    }
3388
3389
0
    if (!r && val_codec) {
3390
0
        r = val_codec->decode(slice, val_codec, in, out, &len);
3391
0
    } else {
3392
0
        return -1;
3393
0
    }
3394
3395
0
    *out_size = len;
3396
3397
0
    return r;
3398
0
}
3399
3400
1.29k
void cram_byte_array_len_decode_free(cram_codec *c) {
3401
1.29k
    if (!c) return;
3402
3403
1.29k
    if (c->u.byte_array_len.len_codec)
3404
1.24k
        c->u.byte_array_len.len_codec->free(c->u.byte_array_len.len_codec);
3405
3406
1.29k
    if (c->u.byte_array_len.val_codec)
3407
1.23k
        c->u.byte_array_len.val_codec->free(c->u.byte_array_len.val_codec);
3408
3409
1.29k
    free(c);
3410
1.29k
}
3411
3412
0
int cram_byte_array_len_describe(cram_codec *c, kstring_t *ks) {
3413
0
    int r = 0;
3414
0
    r |= ksprintf(ks, "BYTE_ARRAY_LEN(len_codec={") < 0;
3415
0
    cram_byte_array_len_decoder *l = &c->u.byte_array_len;
3416
0
    r |=  l->len_codec->describe
3417
0
        ? l->len_codec->describe(l->len_codec, ks)
3418
0
        : (ksprintf(ks, "?")<0);
3419
0
    r |= ksprintf(ks, "},val_codec={") < 0;
3420
0
    r |=  l->val_codec->describe
3421
0
        ? l->val_codec->describe(l->val_codec, ks)
3422
0
        : (ksprintf(ks, "?")<0);
3423
0
    r |= ksprintf(ks, "}") < 0;
3424
3425
0
    return r;
3426
0
}
3427
3428
cram_codec *cram_byte_array_len_decode_init(cram_block_compression_hdr *hdr,
3429
                                            char *data, int size,
3430
                                            enum cram_encoding codec,
3431
                                            enum cram_external_type option,
3432
1.29k
                                            int version, varint_vec *vv) {
3433
1.29k
    cram_codec *c;
3434
1.29k
    char *cp   = data;
3435
1.29k
    char *endp = data + size;
3436
3437
1.29k
    if (!(c = malloc(sizeof(*c))))
3438
0
        return NULL;
3439
3440
1.29k
    c->codec  = E_BYTE_ARRAY_LEN;
3441
1.29k
    c->decode = cram_byte_array_len_decode;
3442
1.29k
    c->free   = cram_byte_array_len_decode_free;
3443
1.29k
    c->describe = cram_byte_array_len_describe;
3444
1.29k
    c->u.byte_array_len.len_codec = NULL;
3445
1.29k
    c->u.byte_array_len.val_codec = NULL;
3446
3447
1.29k
    int encoding = vv->varint_get32(&cp, endp, NULL);
3448
1.29k
    int sub_size = vv->varint_get32(&cp, endp, NULL);
3449
1.29k
    if (sub_size < 0 || endp - cp < sub_size)
3450
15
        goto malformed;
3451
1.28k
    c->u.byte_array_len.len_codec = cram_decoder_init(hdr, encoding, cp, sub_size,
3452
1.28k
                                                      E_INT, version, vv);
3453
1.28k
    if (c->u.byte_array_len.len_codec == NULL)
3454
36
        goto no_codec;
3455
1.24k
    cp += sub_size;
3456
3457
1.24k
    encoding = vv->varint_get32(&cp, endp, NULL);
3458
1.24k
    sub_size = vv->varint_get32(&cp, endp, NULL);
3459
1.24k
    if (sub_size < 0 || endp - cp < sub_size)
3460
9
        goto malformed;
3461
1.23k
    c->u.byte_array_len.val_codec = cram_decoder_init(hdr, encoding, cp, sub_size,
3462
1.23k
                                                      option, version, vv);
3463
1.23k
    if (c->u.byte_array_len.val_codec == NULL)
3464
9
        goto no_codec;
3465
1.23k
    cp += sub_size;
3466
3467
1.23k
    if (cp - data != size)
3468
9
        goto malformed;
3469
3470
1.22k
    return c;
3471
3472
33
 malformed:
3473
33
    hts_log_error("Malformed byte_array_len header stream");
3474
78
 no_codec:
3475
78
    cram_byte_array_len_decode_free(c);
3476
78
    return NULL;
3477
33
}
3478
3479
int cram_byte_array_len_encode(cram_slice *slice, cram_codec *c,
3480
7.02k
                               char *in, int in_size) {
3481
7.02k
    int32_t i32 = in_size;
3482
7.02k
    int r = 0;
3483
3484
7.02k
    r |= c->u.e_byte_array_len.len_codec->encode(slice,
3485
7.02k
                                                 c->u.e_byte_array_len.len_codec,
3486
7.02k
                                                 (char *)&i32, 1);
3487
7.02k
    r |= c->u.e_byte_array_len.val_codec->encode(slice,
3488
7.02k
                                                 c->u.e_byte_array_len.val_codec,
3489
7.02k
                                                 in, in_size);
3490
7.02k
    return r;
3491
7.02k
}
3492
3493
4.59k
void cram_byte_array_len_encode_free(cram_codec *c) {
3494
4.59k
    if (!c)
3495
0
        return;
3496
3497
4.59k
    if (c->u.e_byte_array_len.len_codec)
3498
4.59k
        c->u.e_byte_array_len.len_codec->free(c->u.e_byte_array_len.len_codec);
3499
3500
4.59k
    if (c->u.e_byte_array_len.val_codec)
3501
4.59k
        c->u.e_byte_array_len.val_codec->free(c->u.e_byte_array_len.val_codec);
3502
3503
4.59k
    free(c);
3504
4.59k
}
3505
3506
int cram_byte_array_len_encode_store(cram_codec *c, cram_block *b,
3507
4.08k
                                     char *prefix, int version) {
3508
4.08k
    int len = 0, len2, len3, r = 0, n;
3509
4.08k
    cram_codec *tc;
3510
4.08k
    cram_block *b_len = NULL, *b_val = NULL;
3511
3512
4.08k
    if (prefix) {
3513
2.89k
        size_t l = strlen(prefix);
3514
2.89k
        BLOCK_APPEND(b, prefix, l);
3515
2.89k
        len += l;
3516
2.89k
    }
3517
3518
4.08k
    tc = c->u.e_byte_array_len.len_codec;
3519
4.08k
    b_len = cram_new_block(0, 0);
3520
4.08k
    if (!b_len) goto block_err;
3521
4.08k
    len2 = tc->store(tc, b_len, NULL, version);
3522
4.08k
    if (len2 < 0) goto block_err;
3523
3524
4.08k
    tc = c->u.e_byte_array_len.val_codec;
3525
4.08k
    b_val = cram_new_block(0, 0);
3526
4.08k
    if (!b_val) goto block_err;
3527
4.08k
    len3 = tc->store(tc, b_val, NULL, version);
3528
4.08k
    if (len3 < 0) goto block_err;
3529
3530
4.08k
    len += (n = c->vv->varint_put32_blk(b, c->codec));  r |= n;
3531
4.08k
    len += (n = c->vv->varint_put32_blk(b, len2+len3)); r |= n;
3532
4.08k
    BLOCK_APPEND(b, BLOCK_DATA(b_len), BLOCK_SIZE(b_len));
3533
4.08k
    BLOCK_APPEND(b, BLOCK_DATA(b_val), BLOCK_SIZE(b_val));
3534
3535
4.08k
    cram_free_block(b_len);
3536
4.08k
    cram_free_block(b_val);
3537
3538
4.08k
    if (r > 0)
3539
4.08k
        return len + len2 + len3;
3540
3541
0
 block_err:
3542
0
    if (b_len) cram_free_block(b_len);
3543
0
    if (b_val) cram_free_block(b_val);
3544
0
    return -1;
3545
4.08k
}
3546
3547
cram_codec *cram_byte_array_len_encode_init(cram_stats *st,
3548
                                            enum cram_encoding codec,
3549
                                            enum cram_external_type option,
3550
                                            void *dat,
3551
4.59k
                                            int version, varint_vec *vv) {
3552
4.59k
    cram_codec *c;
3553
4.59k
    cram_byte_array_len_encoder *e = (cram_byte_array_len_encoder *)dat;
3554
3555
4.59k
    c = malloc(sizeof(*c));
3556
4.59k
    if (!c)
3557
0
        return NULL;
3558
4.59k
    c->codec = E_BYTE_ARRAY_LEN;
3559
4.59k
    c->free = cram_byte_array_len_encode_free;
3560
4.59k
    c->encode = cram_byte_array_len_encode;
3561
4.59k
    c->store = cram_byte_array_len_encode_store;
3562
4.59k
    c->flush = NULL;
3563
3564
4.59k
    c->u.e_byte_array_len.len_codec = cram_encoder_init(e->len_encoding,
3565
4.59k
                                                        st, E_INT,
3566
4.59k
                                                        e->len_dat,
3567
4.59k
                                                        version, vv);
3568
4.59k
    c->u.e_byte_array_len.val_codec = cram_encoder_init(e->val_encoding,
3569
4.59k
                                                        NULL, E_BYTE_ARRAY,
3570
4.59k
                                                        e->val_dat,
3571
4.59k
                                                        version, vv);
3572
3573
4.59k
    if (!c->u.e_byte_array_len.len_codec ||
3574
4.59k
        !c->u.e_byte_array_len.val_codec) {
3575
0
        cram_byte_array_len_encode_free(c);
3576
0
        return NULL;
3577
0
    }
3578
3579
4.59k
    return c;
3580
4.59k
}
3581
3582
/*
3583
 * ---------------------------------------------------------------------------
3584
 * BYTE_ARRAY_STOP
3585
 */
3586
static int cram_byte_array_stop_decode_char(cram_slice *slice, cram_codec *c,
3587
                                            cram_block *in, char *out,
3588
0
                                            int *out_size) {
3589
0
    uint8_t *cp;
3590
0
    cram_block *b = NULL;
3591
3592
0
    b = cram_get_block_by_id(slice, c->u.byte_array_stop.content_id);
3593
0
    if (!b)
3594
0
        return *out_size?-1:0;
3595
3596
0
    if (b->idx >= b->uncomp_size)
3597
0
        return -1;
3598
3599
0
    ssize_t term = b->uncomp_size - b->idx;
3600
0
    cp = b->data + b->idx;
3601
0
    if (out) {
3602
       // memccpy equivalent but without copying the terminating byte
3603
0
        if (term > *out_size)
3604
0
            term = *out_size;
3605
0
        while (--term >= 0 && *cp != c->u.byte_array_stop.stop) {
3606
0
            *out++ = *cp++;
3607
0
        }
3608
3609
0
    } else {
3610
        // Consume input, but produce no output
3611
0
        while (--term >= 0 && *cp != c->u.byte_array_stop.stop) {
3612
0
            cp++;
3613
0
        }
3614
0
    }
3615
3616
    // Attempted overrun on input or output
3617
0
    if (cp >= b->data + b->uncomp_size || *cp != c->u.byte_array_stop.stop)
3618
0
        return -1;
3619
3620
0
    *out_size = cp - (b->data + b->idx);
3621
0
    b->idx = cp - b->data + 1;
3622
3623
0
    return 0;
3624
0
}
3625
3626
int cram_byte_array_stop_decode_block(cram_slice *slice, cram_codec *c,
3627
                                      cram_block *in, char *out_,
3628
0
                                      int *out_size) {
3629
0
    cram_block *b;
3630
0
    cram_block *out = (cram_block *)out_;
3631
0
    unsigned char *cp, *cp_end;
3632
0
    unsigned char stop;
3633
3634
0
    b = cram_get_block_by_id(slice, c->u.byte_array_stop.content_id);
3635
0
    if (!b)
3636
0
        return *out_size?-1:0;
3637
3638
0
    if (b->idx >= b->uncomp_size)
3639
0
        return -1;
3640
0
    cp = b->data + b->idx;
3641
0
    cp_end = b->data + b->uncomp_size;
3642
3643
    // STOP byte is hard-coded as zero by our name tokeniser decoder
3644
    // implementation, so we may ignore what was requested.
3645
0
    stop = b->orig_method == TOK3 ? 0 : c->u.byte_array_stop.stop;
3646
3647
0
    if (cp_end - cp < out->alloc - out->byte) {
3648
0
        unsigned char *out_cp = BLOCK_END(out);
3649
0
        while (cp != cp_end && *cp != stop)
3650
0
            *out_cp++ = *cp++;
3651
0
        BLOCK_SIZE(out) = out_cp - BLOCK_DATA(out);
3652
0
    } else {
3653
0
        unsigned char *cp_start;
3654
0
        for (cp_start = cp; cp != cp_end && *cp != stop; cp++)
3655
0
            ;
3656
0
        BLOCK_APPEND(out, cp_start, cp - cp_start);
3657
0
        BLOCK_GROW(out, cp - cp_start);
3658
0
    }
3659
3660
0
    *out_size = cp - (b->data + b->idx);
3661
0
    b->idx = cp - b->data + 1;
3662
3663
0
    return 0;
3664
3665
0
 block_err:
3666
0
    return -1;
3667
0
}
3668
3669
474
void cram_byte_array_stop_decode_free(cram_codec *c) {
3670
474
    if (!c) return;
3671
3672
474
    free(c);
3673
474
}
3674
3675
0
int cram_byte_array_stop_describe(cram_codec *c, kstring_t *ks) {
3676
0
    return ksprintf(ks, "BYTE_ARRAY_STOP(stop=%d,id=%d)",
3677
0
                    c->u.byte_array_stop.stop,
3678
0
                    c->u.byte_array_stop.content_id)
3679
0
        < 0 ? -1 : 0;
3680
0
}
3681
3682
cram_codec *cram_byte_array_stop_decode_init(cram_block_compression_hdr *hdr,
3683
                                             char *data, int size,
3684
                                             enum cram_encoding codec,
3685
                                             enum cram_external_type option,
3686
486
                                             int version, varint_vec *vv) {
3687
486
    cram_codec *c = NULL;
3688
486
    unsigned char *cp = (unsigned char *)data;
3689
486
    int err = 0;
3690
3691
486
    if (size < (CRAM_MAJOR_VERS(version) == 1 ? 5 : 2))
3692
3
        goto malformed;
3693
3694
483
    if (!(c = malloc(sizeof(*c))))
3695
0
        return NULL;
3696
3697
483
    c->codec  = E_BYTE_ARRAY_STOP;
3698
483
    switch (option) {
3699
435
    case E_BYTE_ARRAY_BLOCK:
3700
435
        c->decode = cram_byte_array_stop_decode_block;
3701
435
        break;
3702
45
    case E_BYTE_ARRAY:
3703
45
        c->decode = cram_byte_array_stop_decode_char;
3704
45
        break;
3705
3
    default:
3706
3
        hts_log_error("The byte_array_stop codec only supports BYTE_ARRAYs");
3707
3
        free(c);
3708
3
        return NULL;
3709
483
    }
3710
480
    c->free   = cram_byte_array_stop_decode_free;
3711
480
    c->describe = cram_byte_array_stop_describe;
3712
3713
480
    c->u.byte_array_stop.stop = *cp++;
3714
480
    if (CRAM_MAJOR_VERS(version) == 1) {
3715
468
        c->u.byte_array_stop.content_id = cp[0] + (cp[1]<<8) + (cp[2]<<16)
3716
468
            + ((unsigned int) cp[3]<<24);
3717
468
        cp += 4;
3718
468
    } else {
3719
12
        c->u.byte_array_stop.content_id = vv->varint_get32((char **)&cp, data+size, &err);
3720
12
    }
3721
3722
480
    if ((char *)cp - data != size || err)
3723
6
        goto malformed;
3724
3725
474
    return c;
3726
3727
9
 malformed:
3728
9
    hts_log_error("Malformed byte_array_stop header stream");
3729
9
    free(c);
3730
9
    return NULL;
3731
480
}
3732
3733
int cram_byte_array_stop_encode(cram_slice *slice, cram_codec *c,
3734
2.46k
                                char *in, int in_size) {
3735
2.46k
    BLOCK_APPEND(c->out, in, in_size);
3736
2.46k
    BLOCK_APPEND_CHAR(c->out, c->u.e_byte_array_stop.stop);
3737
2.46k
    return 0;
3738
3739
0
 block_err:
3740
0
    return -1;
3741
2.46k
}
3742
3743
8.92k
void cram_byte_array_stop_encode_free(cram_codec *c) {
3744
8.92k
    if (!c)
3745
0
        return;
3746
8.92k
    free(c);
3747
8.92k
}
3748
3749
int cram_byte_array_stop_encode_store(cram_codec *c, cram_block *b,
3750
8.83k
                                      char *prefix, int version) {
3751
8.83k
    int len = 0;
3752
8.83k
    char buf[20], *cp = buf;
3753
3754
8.83k
    if (prefix) {
3755
8.68k
        size_t l = strlen(prefix);
3756
8.68k
        BLOCK_APPEND(b, prefix, l);
3757
8.68k
        len += l;
3758
8.68k
    }
3759
3760
8.83k
    cp += c->vv->varint_put32(cp, buf+20, c->codec);
3761
3762
8.83k
    if (CRAM_MAJOR_VERS(version) == 1) {
3763
0
        cp += c->vv->varint_put32(cp, buf+20, 5);
3764
0
        *cp++ = c->u.e_byte_array_stop.stop;
3765
0
        *cp++ = (c->u.e_byte_array_stop.content_id >>  0) & 0xff;
3766
0
        *cp++ = (c->u.e_byte_array_stop.content_id >>  8) & 0xff;
3767
0
        *cp++ = (c->u.e_byte_array_stop.content_id >> 16) & 0xff;
3768
0
        *cp++ = (c->u.e_byte_array_stop.content_id >> 24) & 0xff;
3769
8.83k
    } else {
3770
8.83k
        cp += c->vv->varint_put32(cp, buf+20, 1 +
3771
8.83k
                                  c->vv->varint_size(c->u.e_byte_array_stop.content_id));
3772
8.83k
        *cp++ = c->u.e_byte_array_stop.stop;
3773
8.83k
        cp += c->vv->varint_put32(cp, buf+20, c->u.e_byte_array_stop.content_id);
3774
8.83k
    }
3775
3776
8.83k
    BLOCK_APPEND(b, buf, cp-buf);
3777
8.83k
    len += cp-buf;
3778
3779
8.83k
    return len;
3780
3781
0
 block_err:
3782
0
    return -1;
3783
8.83k
}
3784
3785
cram_codec *cram_byte_array_stop_encode_init(cram_stats *st,
3786
                                             enum cram_encoding codec,
3787
                                             enum cram_external_type option,
3788
                                             void *dat,
3789
8.92k
                                             int version, varint_vec *vv) {
3790
8.92k
    cram_codec *c;
3791
3792
8.92k
    c = malloc(sizeof(*c));
3793
8.92k
    if (!c)
3794
0
        return NULL;
3795
8.92k
    c->codec = E_BYTE_ARRAY_STOP;
3796
8.92k
    c->free = cram_byte_array_stop_encode_free;
3797
8.92k
    c->encode = cram_byte_array_stop_encode;
3798
8.92k
    c->store = cram_byte_array_stop_encode_store;
3799
8.92k
    c->flush = NULL;
3800
3801
8.92k
    c->u.e_byte_array_stop.stop = ((int *)dat)[0];
3802
8.92k
    c->u.e_byte_array_stop.content_id = ((int *)dat)[1];
3803
3804
8.92k
    return c;
3805
8.92k
}
3806
3807
/*
3808
 * ---------------------------------------------------------------------------
3809
 */
3810
3811
210
const char *cram_encoding2str(enum cram_encoding t) {
3812
210
    switch (t) {
3813
105
    case E_NULL:            return "NULL";
3814
0
    case E_EXTERNAL:        return "EXTERNAL";
3815
3
    case E_GOLOMB:          return "GOLOMB";
3816
0
    case E_HUFFMAN:         return "HUFFMAN";
3817
0
    case E_BYTE_ARRAY_LEN:  return "BYTE_ARRAY_LEN";
3818
0
    case E_BYTE_ARRAY_STOP: return "BYTE_ARRAY_STOP";
3819
6
    case E_BETA:            return "BETA";
3820
0
    case E_SUBEXP:          return "SUBEXP";
3821
3
    case E_GOLOMB_RICE:     return "GOLOMB_RICE";
3822
0
    case E_GAMMA:           return "GAMMA";
3823
3824
0
    case E_VARINT_UNSIGNED: return "VARINT_UNSIGNED";
3825
0
    case E_VARINT_SIGNED:   return "VARINT_SIGNED";
3826
0
    case E_CONST_BYTE:      return "CONST_BYTE";
3827
0
    case E_CONST_INT:       return "CONST_INT";
3828
3829
9
    case E_NUM_CODECS:
3830
93
    default:                return "?";
3831
210
    }
3832
210
}
3833
3834
static cram_codec *(*decode_init[])(cram_block_compression_hdr *hdr,
3835
                                    char *data,
3836
                                    int size,
3837
                                    enum cram_encoding codec,
3838
                                    enum cram_external_type option,
3839
                                    int version, varint_vec *vv) = {
3840
    // CRAM 3.0 valid codecs
3841
    NULL, // null codec
3842
    cram_external_decode_init,
3843
    NULL, // golomb
3844
    cram_huffman_decode_init,
3845
    cram_byte_array_len_decode_init,
3846
    cram_byte_array_stop_decode_init,
3847
    cram_beta_decode_init,
3848
    cram_subexp_decode_init,
3849
    NULL, // golomb rice
3850
    cram_gamma_decode_init,
3851
3852
    // Gap between CRAM 3 and CRAM 4; 9 to 39 inclusive
3853
    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
3854
    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
3855
    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
3856
3857
    NULL,                      // was xbyte
3858
    cram_varint_decode_init,   // varint unsigned
3859
    cram_varint_decode_init,   // varint signed
3860
    cram_const_decode_init,    // const byte
3861
    cram_const_decode_init,    // const int
3862
3863
    // Gap to CRAM 4 transfomrations; 45 to 49 inclusive
3864
    NULL, NULL, NULL, NULL, NULL,
3865
3866
    NULL, // xhuffman
3867
    cram_xpack_decode_init,
3868
    cram_xrle_decode_init,
3869
    cram_xdelta_decode_init,
3870
};
3871
3872
cram_codec *cram_decoder_init(cram_block_compression_hdr *hdr,
3873
                              enum cram_encoding codec,
3874
                              char *data, int size,
3875
                              enum cram_external_type option,
3876
9.81k
                              int version, varint_vec *vv) {
3877
9.81k
    if (codec >= E_NULL && codec < E_NUM_CODECS && decode_init[codec]) {
3878
9.60k
        cram_codec *r = decode_init[codec](hdr, data, size, codec,
3879
9.60k
                                           option, version, vv);
3880
9.60k
        if (r) {
3881
8.83k
            r->vv = vv;
3882
8.83k
            r->codec_id = hdr->ncodecs++;
3883
8.83k
        }
3884
9.60k
        return r;
3885
9.60k
    } else {
3886
204
        hts_log_error("Unimplemented codec of type %s", cram_encoding2str(codec));
3887
204
        return NULL;
3888
204
    }
3889
9.81k
}
3890
3891
static cram_codec *(*encode_init[])(cram_stats *stx,
3892
                                    enum cram_encoding codec,
3893
                                    enum cram_external_type option,
3894
                                    void *opt,
3895
                                    int version, varint_vec *vv) = {
3896
    // CRAM 3.0 valid codecs
3897
    NULL, // null codec
3898
    cram_external_encode_init, // int/bytes in cram 3, byte only in cram 4
3899
    NULL, // golomb
3900
    cram_huffman_encode_init,
3901
    cram_byte_array_len_encode_init,
3902
    cram_byte_array_stop_encode_init,
3903
    cram_beta_encode_init,
3904
    NULL, // subexponential (we support decode only)
3905
    NULL, // golomb rice
3906
    NULL, // gamma (we support decode only)
3907
3908
    // Gap between CRAM 3 and CRAM 4; 9 to 39 inclusive
3909
    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
3910
    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
3911
    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
3912
3913
    NULL, // was xbyte
3914
    cram_varint_encode_init, // varint unsigned
3915
    cram_varint_encode_init, // varint signed
3916
    cram_const_encode_init,  // const byte
3917
    cram_const_encode_init,  // const int
3918
3919
    // Gap to CRAM 4 transfomrations; 45 to 49 inclusive
3920
    NULL, NULL, NULL, NULL, NULL,
3921
3922
    NULL, // xhuffman
3923
    cram_xpack_encode_init,
3924
    cram_xrle_encode_init,
3925
    cram_xdelta_encode_init,
3926
};
3927
3928
cram_codec *cram_encoder_init(enum cram_encoding codec,
3929
                              cram_stats *st,
3930
                              enum cram_external_type option,
3931
                              void *dat,
3932
89.5k
                              int version, varint_vec *vv) {
3933
89.5k
    if (st && !st->nvals)
3934
26.8k
        return NULL;
3935
3936
    // cram_stats_encoding assumes integer data, but if option
3937
    // is E_BYTE then tweak the requested encoding.  This ought
3938
    // to be fixed in cram_stats_encoding instead.
3939
62.6k
    if (option == E_BYTE || option == E_BYTE_ARRAY ||
3940
39.9k
       option == E_BYTE_ARRAY_BLOCK) {
3941
22.7k
       if (codec == E_VARINT_SIGNED || codec == E_VARINT_UNSIGNED)
3942
0
           codec = E_EXTERNAL;
3943
22.7k
       else if (codec == E_CONST_INT)
3944
0
           codec = E_CONST_BYTE;
3945
22.7k
    }
3946
3947
62.6k
    if (encode_init[codec]) {
3948
62.6k
        cram_codec *r;
3949
62.6k
        if ((r = encode_init[codec](st, codec, option, dat, version, vv)))
3950
62.6k
            r->out = NULL;
3951
62.6k
        if (!r) {
3952
6
            hts_log_error("Unable to initialise codec of type %s", cram_encoding2str(codec));
3953
6
            return NULL;
3954
6
        }
3955
62.6k
        r->vv = vv;
3956
62.6k
        return r;
3957
62.6k
    } else {
3958
0
        hts_log_error("Unimplemented codec of type %s", cram_encoding2str(codec));
3959
0
        abort();
3960
0
    }
3961
62.6k
}
3962
3963
/*
3964
 * Returns the content_id used by this codec, also in id2 if byte_array_len.
3965
 * Returns -1 for the CORE block and -2 for unneeded.
3966
 * id2 is only filled out for BYTE_ARRAY_LEN which uses 2 codecs.
3967
 */
3968
0
int cram_codec_to_id(cram_codec *c, int *id2) {
3969
0
    int bnum1, bnum2 = -2;
3970
3971
0
    switch (c->codec) {
3972
0
    case E_CONST_INT:
3973
0
    case E_CONST_BYTE:
3974
0
        bnum1 = -2; // no blocks used
3975
0
        break;
3976
3977
0
    case E_HUFFMAN:
3978
0
        bnum1 = c->u.huffman.ncodes == 1 ? -2 : -1;
3979
0
        break;
3980
3981
0
    case E_GOLOMB:
3982
0
    case E_BETA:
3983
0
    case E_SUBEXP:
3984
0
    case E_GOLOMB_RICE:
3985
0
    case E_GAMMA:
3986
        // CORE block
3987
0
        bnum1 = -1;
3988
0
        break;
3989
3990
0
    case E_EXTERNAL:
3991
0
    case E_VARINT_UNSIGNED:
3992
0
    case E_VARINT_SIGNED:
3993
0
        bnum1 = c->u.external.content_id;
3994
0
        break;
3995
3996
0
    case E_BYTE_ARRAY_LEN:
3997
0
        bnum1 = cram_codec_to_id(c->u.byte_array_len.len_codec, NULL);
3998
0
        bnum2 = cram_codec_to_id(c->u.byte_array_len.val_codec, NULL);
3999
0
        break;
4000
4001
0
    case E_BYTE_ARRAY_STOP:
4002
0
        bnum1 = c->u.byte_array_stop.content_id;
4003
0
        break;
4004
4005
0
    case E_NULL:
4006
0
        bnum1 = -2;
4007
0
        break;
4008
4009
0
    default:
4010
0
        hts_log_error("Unknown codec type %d", c->codec);
4011
0
        bnum1 = -1;
4012
0
    }
4013
4014
0
    if (id2)
4015
0
        *id2 = bnum2;
4016
0
    return bnum1;
4017
0
}
4018
4019
4020
/*
4021
 * cram_codec structures are specialised for decoding or encoding.
4022
 * Unfortunately this makes turning a decoder into an encoder (such as
4023
 * when transcoding files) problematic.
4024
 *
4025
 * This function converts a cram decoder codec into an encoder version
4026
 * in-place (ie it modifiers the codec itself).
4027
 *
4028
 * Returns 0 on success;
4029
 *        -1 on failure.
4030
 */
4031
0
int cram_codec_decoder2encoder(cram_fd *fd, cram_codec *c) {
4032
0
    int j;
4033
4034
0
    switch (c->codec) {
4035
0
    case E_CONST_INT:
4036
0
    case E_CONST_BYTE:
4037
        // shares struct with decode
4038
0
        c->store = cram_const_encode_store;
4039
0
        break;
4040
4041
0
    case E_EXTERNAL:
4042
        // shares struct with decode
4043
0
        c->free = cram_external_encode_free;
4044
0
        c->store = cram_external_encode_store;
4045
0
        if (c->decode == cram_external_decode_int)
4046
0
            c->encode = cram_external_encode_int;
4047
0
        else if (c->decode == cram_external_decode_long)
4048
0
            c->encode = cram_external_encode_long;
4049
0
        else if (c->decode == cram_external_decode_char)
4050
0
            c->encode = cram_external_encode_char;
4051
0
        else if (c->decode == cram_external_decode_block)
4052
0
            c->encode = cram_external_encode_char;
4053
0
        else
4054
0
            return -1;
4055
0
        break;
4056
4057
0
    case E_VARINT_SIGNED:
4058
0
    case E_VARINT_UNSIGNED:
4059
        // shares struct with decode
4060
0
        c->free = cram_varint_encode_free;
4061
0
        c->store = cram_varint_encode_store;
4062
0
        if (c->decode == cram_varint_decode_int)
4063
0
            c->encode = cram_varint_encode_int;
4064
0
        else if (c->decode == cram_varint_decode_sint)
4065
0
            c->encode = cram_varint_encode_sint;
4066
0
        else if (c->decode == cram_varint_decode_long)
4067
0
            c->encode = cram_varint_encode_long;
4068
0
        else if (c->decode == cram_varint_decode_slong)
4069
0
            c->encode = cram_varint_encode_slong;
4070
0
        else
4071
0
            return -1;
4072
0
        break;
4073
4074
0
    case E_HUFFMAN: {
4075
        // New structure, so switch.
4076
        // FIXME: we huffman and e_huffman structs amended, we could
4077
        // unify this.
4078
0
        cram_codec *t = malloc(sizeof(*t));
4079
0
        if (!t) return -1;
4080
0
        t->vv     = c->vv;
4081
0
        t->codec = E_HUFFMAN;
4082
0
        t->free = cram_huffman_encode_free;
4083
0
        t->store = cram_huffman_encode_store;
4084
0
        t->u.e_huffman.codes = c->u.huffman.codes;
4085
0
        t->u.e_huffman.nvals = c->u.huffman.ncodes;
4086
0
        t->u.e_huffman.option = c->u.huffman.option;
4087
0
        for (j = 0; j < t->u.e_huffman.nvals; j++) {
4088
0
            int32_t sym = t->u.e_huffman.codes[j].symbol;
4089
0
            if (sym >= -1 && sym < MAX_HUFF)
4090
0
                t->u.e_huffman.val2code[sym+1] = j;
4091
0
        }
4092
4093
0
        if (c->decode == cram_huffman_decode_char0)
4094
0
            t->encode = cram_huffman_encode_char0;
4095
0
        else if (c->decode == cram_huffman_decode_char)
4096
0
            t->encode = cram_huffman_encode_char;
4097
0
        else if (c->decode == cram_huffman_decode_int0)
4098
0
            t->encode = cram_huffman_encode_int0;
4099
0
        else if (c->decode == cram_huffman_decode_int)
4100
0
            t->encode = cram_huffman_encode_int;
4101
0
        else if (c->decode == cram_huffman_decode_long0)
4102
0
            t->encode = cram_huffman_encode_long0;
4103
0
        else if (c->decode == cram_huffman_decode_long)
4104
0
            t->encode = cram_huffman_encode_long;
4105
0
        else {
4106
0
            free(t);
4107
0
            return -1;
4108
0
        }
4109
0
        *c = *t;
4110
0
        free(t);
4111
0
        break;
4112
0
    }
4113
4114
0
    case E_BETA:
4115
        // shares struct with decode
4116
0
        c->free = cram_beta_encode_free;
4117
0
        c->store = cram_beta_encode_store;
4118
0
        if (c->decode == cram_beta_decode_int)
4119
0
            c->encode = cram_beta_encode_int;
4120
0
        else if (c->decode == cram_beta_decode_long)
4121
0
            c->encode = cram_beta_encode_long;
4122
0
        else if (c->decode == cram_beta_decode_char)
4123
0
            c->encode = cram_beta_encode_char;
4124
0
        else
4125
0
            return -1;
4126
0
        break;
4127
4128
0
    case E_XPACK: {
4129
        // shares struct with decode
4130
0
        cram_codec t = *c;
4131
0
        t.free = cram_xpack_encode_free;
4132
0
        t.store = cram_xpack_encode_store;
4133
0
        if (t.decode == cram_xpack_decode_long)
4134
0
            t.encode = cram_xpack_encode_long;
4135
0
        else if (t.decode == cram_xpack_decode_int)
4136
0
            t.encode = cram_xpack_encode_int;
4137
0
        else if (t.decode == cram_xpack_decode_char)
4138
0
            t.encode = cram_xpack_encode_char;
4139
0
        else
4140
0
            return -1;
4141
0
        t.u.e_xpack.sub_codec = t.u.xpack.sub_codec;
4142
0
        if (cram_codec_decoder2encoder(fd, t.u.e_xpack.sub_codec) == -1)
4143
0
            return -1;
4144
0
        *c = t;
4145
0
        break;
4146
0
    }
4147
4148
0
    case E_BYTE_ARRAY_LEN: {
4149
0
        cram_codec *t = malloc(sizeof(*t));
4150
0
        if (!t) return -1;
4151
0
        t->vv     = c->vv;
4152
0
        t->codec  = E_BYTE_ARRAY_LEN;
4153
0
        t->free   = cram_byte_array_len_encode_free;
4154
0
        t->store  = cram_byte_array_len_encode_store;
4155
0
        t->encode = cram_byte_array_len_encode;
4156
0
        t->u.e_byte_array_len.len_codec = c->u.byte_array_len.len_codec;
4157
0
        t->u.e_byte_array_len.val_codec = c->u.byte_array_len.val_codec;
4158
0
        if (cram_codec_decoder2encoder(fd, t->u.e_byte_array_len.len_codec) == -1 ||
4159
0
            cram_codec_decoder2encoder(fd, t->u.e_byte_array_len.val_codec) == -1) {
4160
0
            t->free(t);
4161
0
            return -1;
4162
0
        }
4163
4164
        // {len,val}_{encoding,dat} are undefined, but unused.
4165
        // Leaving them unset here means we can test that assertion.
4166
0
        *c = *t;
4167
0
        free(t);
4168
0
        break;
4169
0
    }
4170
4171
0
    case E_BYTE_ARRAY_STOP:
4172
        // shares struct with decode
4173
0
        c->free   = cram_byte_array_stop_encode_free;
4174
0
        c->store  = cram_byte_array_stop_encode_store;
4175
0
        c->encode = cram_byte_array_stop_encode;
4176
0
        break;
4177
4178
0
    default:
4179
0
        return -1;
4180
0
    }
4181
4182
0
    return 0;
4183
0
}
4184
4185
0
int cram_codec_describe(cram_codec *c, kstring_t *ks) {
4186
0
    if (c && c->describe)
4187
0
        return c->describe(c, ks);
4188
0
    else
4189
0
        return ksprintf(ks, "?");
4190
0
}