Coverage Report

Created: 2025-11-24 06:38

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/htslib/cram/cram_codecs.c
Line
Count
Source
1
/*
2
Copyright (c) 2012-2021,2023 Genome Research Ltd.
3
Author: James Bonfield <jkb@sanger.ac.uk>
4
5
Redistribution and use in source and binary forms, with or without
6
modification, are permitted provided that the following conditions are met:
7
8
   1. Redistributions of source code must retain the above copyright notice,
9
this list of conditions and the following disclaimer.
10
11
   2. Redistributions in binary form must reproduce the above copyright notice,
12
this list of conditions and the following disclaimer in the documentation
13
and/or other materials provided with the distribution.
14
15
   3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
16
Institute nor the names of its contributors may be used to endorse or promote
17
products derived from this software without specific prior written permission.
18
19
THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
20
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
23
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
*/
30
31
/*
32
 * FIXME: add checking of cram_external_type to return NULL on unsupported
33
 * {codec,type} tuples.
34
 */
35
36
#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
37
#include <config.h>
38
39
#include <stdlib.h>
40
#include <string.h>
41
#include <assert.h>
42
#include <limits.h>
43
#include <stdint.h>
44
#include <errno.h>
45
#include <stddef.h>
46
47
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
48
#include "../fuzz_settings.h"
49
#endif
50
51
#include "../htslib/hts_endian.h"
52
53
#if defined(HAVE_EXTERNAL_LIBHTSCODECS)
54
#include <htscodecs/varint.h>
55
#include <htscodecs/pack.h>
56
#include <htscodecs/rle.h>
57
#else
58
#include "../htscodecs/htscodecs/varint.h"
59
#include "../htscodecs/htscodecs/pack.h"
60
#include "../htscodecs/htscodecs/rle.h"
61
#endif
62
63
#include "cram.h"
64
65
/*
66
 * ---------------------------------------------------------------------------
67
 * Block bit-level I/O functions.
68
 * All defined static here to promote easy inlining by the compiler.
69
 */
70
71
#if 0
72
/* Get a single bit, MSB first */
73
static signed int get_bit_MSB(cram_block *block) {
74
    unsigned int val;
75
76
    if (block->byte > block->alloc)
77
        return -1;
78
79
    val = block->data[block->byte] >> block->bit;
80
    if (--block->bit == -1) {
81
        block->bit = 7;
82
        block->byte++;
83
        //printf("(%02X)", block->data[block->byte]);
84
    }
85
86
    //printf("-B%d-", val&1);
87
88
    return val & 1;
89
}
90
#endif
91
92
/*
93
 * Count number of successive 0 and 1 bits
94
 */
95
0
static int get_one_bits_MSB(cram_block *block) {
96
0
    int n = 0, b;
97
0
    if (block->byte >= block->uncomp_size)
98
0
        return -1;
99
0
    do {
100
0
        b = block->data[block->byte] >> block->bit;
101
0
        if (--block->bit == -1) {
102
0
            block->bit = 7;
103
0
            block->byte++;
104
0
            if (block->byte == block->uncomp_size && (b&1))
105
0
                return -1;
106
0
        }
107
0
        n++;
108
0
    } while (b&1);
109
110
0
    return n-1;
111
0
}
112
113
0
static int get_zero_bits_MSB(cram_block *block) {
114
0
    int n = 0, b;
115
0
    if (block->byte >= block->uncomp_size)
116
0
        return -1;
117
0
    do {
118
0
        b = block->data[block->byte] >> block->bit;
119
0
        if (--block->bit == -1) {
120
0
            block->bit = 7;
121
0
            block->byte++;
122
0
            if (block->byte == block->uncomp_size && !(b&1))
123
0
                return -1;
124
0
        }
125
0
        n++;
126
0
    } while (!(b&1));
127
128
0
    return n-1;
129
0
}
130
131
#if 0
132
/* Stores a single bit */
133
static void store_bit_MSB(cram_block *block, unsigned int bit) {
134
    if (block->byte >= block->alloc) {
135
        block->alloc = block->alloc ? block->alloc*2 : 1024;
136
        block->data = realloc(block->data, block->alloc);
137
    }
138
139
    if (bit)
140
        block->data[block->byte] |= (1 << block->bit);
141
142
    if (--block->bit == -1) {
143
        block->bit = 7;
144
        block->byte++;
145
        block->data[block->byte] = 0;
146
    }
147
}
148
#endif
149
150
#if 0
151
/* Rounds to the next whole byte boundary first */
152
static void store_bytes_MSB(cram_block *block, char *bytes, int len) {
153
    if (block->bit != 7) {
154
        block->bit = 7;
155
        block->byte++;
156
    }
157
158
    while (block->byte + len >= block->alloc) {
159
        block->alloc = block->alloc ? block->alloc*2 : 1024;
160
        block->data = realloc(block->data, block->alloc);
161
    }
162
163
    memcpy(&block->data[block->byte], bytes, len);
164
    block->byte += len;
165
}
166
#endif
167
168
/* Local optimised copy for inlining */
169
0
static inline int64_t get_bits_MSB(cram_block *block, int nbits) {
170
0
    uint64_t val = 0;
171
0
    int i;
172
173
#if 0
174
    // Fits within the current byte */
175
    if (nbits <= block->bit+1) {
176
        val = (block->data[block->byte]>>(block->bit-(nbits-1))) & ((1<<nbits)-1);
177
        if ((block->bit -= nbits) == -1) {
178
            block->bit = 7;
179
            block->byte++;
180
        }
181
        return val;
182
    }
183
184
    // partial first byte
185
    val = block->data[block->byte] & ((1<<(block->bit+1))-1);
186
    nbits -= block->bit+1;
187
    block->bit = 7;
188
    block->byte++;
189
190
    // whole middle bytes
191
    while (nbits >= 8) {
192
        val = (val << 8) | block->data[block->byte++];
193
        nbits -= 8;
194
    }
195
196
    val <<= nbits;
197
    val |= (block->data[block->byte]>>(block->bit-(nbits-1))) & ((1<<nbits)-1);
198
    block->bit -= nbits;
199
    return val;
200
#endif
201
202
#if 0
203
    /* Inefficient implementation! */
204
    //printf("{");
205
    for (i = 0; i < nbits; i++)
206
        //val = (val << 1) | get_bit_MSB(block);
207
        GET_BIT_MSB(block, val);
208
#endif
209
210
0
#if 1
211
    /* Combination of 1st two methods */
212
0
    if (nbits <= block->bit+1) {
213
0
        val = (block->data[block->byte]>>(block->bit-(nbits-1))) & ((1<<nbits)-1);
214
0
        if ((block->bit -= nbits) == -1) {
215
0
            block->bit = 7;
216
0
            block->byte++;
217
0
        }
218
0
        return val;
219
0
    }
220
221
0
    switch(nbits) {
222
//  case 15: GET_BIT_MSB(block, val); // fall through
223
//  case 14: GET_BIT_MSB(block, val); // fall through
224
//  case 13: GET_BIT_MSB(block, val); // fall through
225
//  case 12: GET_BIT_MSB(block, val); // fall through
226
//  case 11: GET_BIT_MSB(block, val); // fall through
227
//  case 10: GET_BIT_MSB(block, val); // fall through
228
//  case  9: GET_BIT_MSB(block, val); // fall through
229
0
    case  8: GET_BIT_MSB(block, val); // fall through
230
0
    case  7: GET_BIT_MSB(block, val); // fall through
231
0
    case  6: GET_BIT_MSB(block, val); // fall through
232
0
    case  5: GET_BIT_MSB(block, val); // fall through
233
0
    case  4: GET_BIT_MSB(block, val); // fall through
234
0
    case  3: GET_BIT_MSB(block, val); // fall through
235
0
    case  2: GET_BIT_MSB(block, val); // fall through
236
0
    case  1: GET_BIT_MSB(block, val);
237
0
        break;
238
239
0
    default:
240
0
        for (i = 0; i < nbits; i++)
241
            //val = (val << 1) | get_bit_MSB(block);
242
0
            GET_BIT_MSB(block, val);
243
0
    }
244
0
#endif
245
246
    //printf("=0x%x}", val);
247
248
0
    return val;
249
0
}
250
251
/*
252
 * Can store up to 24-bits worth of data encoded in an integer value
253
 * Possibly we'd want to have a less optimal store_bits function when dealing
254
 * with nbits > 24, but for now we assume the codes generated are never
255
 * that big. (Given this is only possible with 121392 or more
256
 * characters with exactly the correct frequency distribution we check
257
 * for it elsewhere.)
258
 */
259
12.8k
static int store_bits_MSB(cram_block *block, uint64_t val, int nbits) {
260
    //fprintf(stderr, " store_bits: %02x %d\n", val, nbits);
261
262
    /*
263
     * Use slow mode until we tweak the huffman generator to never generate
264
     * codes longer than 24-bits.
265
     */
266
12.8k
    unsigned int mask;
267
268
12.8k
    if (block->byte+8 >= block->alloc) {
269
685
        if (block->byte) {
270
1
            block->alloc *= 2;
271
1
            block->data = realloc(block->data, block->alloc + 8);
272
1
            if (!block->data)
273
0
                return -1;
274
684
        } else {
275
684
            block->alloc = 1024;
276
684
            block->data = realloc(block->data, block->alloc + 8);
277
684
            if (!block->data)
278
0
                return -1;
279
684
            block->data[0] = 0; // initialise first byte of buffer
280
684
        }
281
685
    }
282
283
    /* fits in current bit-field */
284
12.8k
    if (nbits <= block->bit+1) {
285
6.09k
        block->data[block->byte] |= (val << (block->bit+1-nbits));
286
6.09k
        if ((block->bit-=nbits) == -1) {
287
1.96k
            block->bit = 7;
288
1.96k
            block->byte++;
289
1.96k
            block->data[block->byte] = 0;
290
1.96k
        }
291
6.09k
        return 0;
292
6.09k
    }
293
294
6.74k
    block->data[block->byte] |= (val >> (nbits -= block->bit+1));
295
6.74k
    block->bit = 7;
296
6.74k
    block->byte++;
297
6.74k
    block->data[block->byte] = 0;
298
299
6.74k
    mask = 1<<(nbits-1);
300
42.5k
    do {
301
42.5k
        if (val & mask)
302
7.01k
            block->data[block->byte] |= (1 << block->bit);
303
42.5k
        if (--block->bit == -1) {
304
2.56k
            block->bit = 7;
305
2.56k
            block->byte++;
306
2.56k
            block->data[block->byte] = 0;
307
2.56k
        }
308
42.5k
        mask >>= 1;
309
42.5k
    } while(--nbits);
310
311
6.74k
    return 0;
312
12.8k
}
313
314
/*
315
 * Returns the next 'size' bytes from a block, or NULL if insufficient
316
 * data left.This is just a pointer into the block data and not an
317
 * allocated object, so do not free the result.
318
 */
319
0
static char *cram_extract_block(cram_block *b, int size) {
320
0
    char *cp = (char *)b->data + b->idx;
321
0
    b->idx += size;
322
0
    if (b->idx > b->uncomp_size)
323
0
        return NULL;
324
325
0
    return cp;
326
0
}
327
328
/*
329
 * ---------------------------------------------------------------------------
330
 * EXTERNAL
331
 *
332
 * In CRAM 3.0 and earlier, E_EXTERNAL use the data type to determine the
333
 * size of the object being returned.  This type is hard coded in the
334
 * spec document (changing from uint32 to uint64 requires a spec change)
335
 * and there is no data format introspection so implementations have
336
 * to determine which size to use based on version numbers.   It also
337
 * doesn't support signed data.
338
 *
339
 * With CRAM 4.0 onwards the size and sign of the data is no longer stated
340
 * explicitly in the specification.  Instead EXTERNAL is replaced by three
341
 * new encodings, for bytes and signed / unsigned integers which used a
342
 * variable sized encoding.
343
 *
344
 * For simplicity we use the same encode and decode functions for
345
 * bytes (CRAM4) and external (CRAM3). Given we already had code to
346
 * replace codec + type into a function pointer it makes little
347
 * difference how we ended up at that function.  However we disallow
348
 * this codec to operate on integer data for CRAM4 onwards.
349
 */
350
int cram_external_decode_int(cram_slice *slice, cram_codec *c,
351
0
                             cram_block *in, char *out, int *out_size) {
352
0
    char *cp;
353
0
    cram_block *b;
354
355
    /* Find the external block */
356
0
    b = cram_get_block_by_id(slice, c->u.external.content_id);
357
0
    if (!b)
358
0
        return *out_size?-1:0;
359
360
0
    cp = (char *)b->data + b->idx;
361
    // E_INT and E_LONG are guaranteed single item queries
362
0
    int err = 0;
363
0
    *(int32_t *)out = c->vv->varint_get32(&cp, (char *)b->data + b->uncomp_size, &err);
364
0
    b->idx = cp - (char *)b->data;
365
0
    *out_size = 1;
366
367
0
    return err ? -1 : 0;
368
0
}
369
370
int cram_external_decode_long(cram_slice *slice, cram_codec *c,
371
0
                              cram_block *in, char *out, int *out_size) {
372
0
    char *cp;
373
0
    cram_block *b;
374
375
    /* Find the external block */
376
0
    b = cram_get_block_by_id(slice, c->u.external.content_id);
377
0
    if (!b)
378
0
        return *out_size?-1:0;
379
380
0
    cp = (char *)b->data + b->idx;
381
    // E_INT and E_LONG are guaranteed single item queries
382
0
    int err = 0;
383
0
    *(int64_t *)out = c->vv->varint_get64(&cp, (char *)b->data + b->uncomp_size, &err);
384
0
    b->idx = cp - (char *)b->data;
385
0
    *out_size = 1;
386
387
0
    return err ? -1 : 0;
388
0
}
389
390
int cram_external_decode_char(cram_slice *slice, cram_codec *c,
391
                              cram_block *in, char *out,
392
0
                              int *out_size) {
393
0
    char *cp;
394
0
    cram_block *b;
395
396
    /* Find the external block */
397
0
    b = cram_get_block_by_id(slice, c->u.external.content_id);
398
0
    if (!b)
399
0
        return *out_size?-1:0;
400
401
0
    cp = cram_extract_block(b, *out_size);
402
0
    if (!cp)
403
0
        return -1;
404
405
0
    if (out)
406
0
        memcpy(out, cp, *out_size);
407
0
    return 0;
408
0
}
409
410
static int cram_external_decode_block(cram_slice *slice, cram_codec *c,
411
                                      cram_block *in, char *out_,
412
0
                                      int *out_size) {
413
0
    char *cp;
414
0
    cram_block *out = (cram_block *)out_;
415
0
    cram_block *b = NULL;
416
417
    /* Find the external block */
418
0
    b = cram_get_block_by_id(slice, c->u.external.content_id);
419
0
    if (!b)
420
0
        return *out_size?-1:0;
421
422
0
    cp = cram_extract_block(b, *out_size);
423
0
    if (!cp)
424
0
        return -1;
425
426
0
    BLOCK_APPEND(out, cp, *out_size);
427
0
    return 0;
428
429
0
 block_err:
430
0
    return -1;
431
0
}
432
433
1.22k
void cram_external_decode_free(cram_codec *c) {
434
1.22k
    if (c)
435
1.22k
        free(c);
436
1.22k
}
437
438
439
0
int cram_external_decode_size(cram_slice *slice, cram_codec *c) {
440
0
    cram_block *b;
441
442
    /* Find the external block */
443
0
    b = cram_get_block_by_id(slice, c->u.external.content_id);
444
0
    if (!b)
445
0
        return -1;
446
447
0
    return b->uncomp_size;
448
0
}
449
450
0
cram_block *cram_external_get_block(cram_slice *slice, cram_codec *c) {
451
0
    return cram_get_block_by_id(slice, c->u.external.content_id);
452
0
}
453
454
0
int cram_external_describe(cram_codec *c, kstring_t *ks) {
455
0
    return ksprintf(ks, "EXTERNAL(id=%d)",
456
0
                    c->u.external.content_id) < 0 ? -1 : 0;
457
0
}
458
459
cram_codec *cram_external_decode_init(cram_block_compression_hdr *hdr,
460
                                      char *data, int size,
461
                                      enum cram_encoding codec,
462
                                      enum cram_external_type option,
463
1.23k
                                      int version, varint_vec *vv) {
464
1.23k
    cram_codec *c = NULL;
465
1.23k
    char *cp = data;
466
467
1.23k
    if (size < 1)
468
0
        goto malformed;
469
470
1.23k
    if (!(c = malloc(sizeof(*c))))
471
0
        return NULL;
472
473
1.23k
    c->codec  = E_EXTERNAL;
474
1.23k
    if (CRAM_MAJOR_VERS(version) >= 4) {
475
        // Version 4 does not permit integer data to be encoded as a
476
        // series of bytes.  This is used purely for bytes, either
477
        // singular or declared as arrays
478
0
        switch (codec) {
479
0
        case E_EXTERNAL:
480
0
            if (option == E_BYTE_ARRAY_BLOCK)
481
0
                c->decode = cram_external_decode_block;
482
0
            else if (option == E_BYTE || option == E_BYTE_ARRAY)
483
0
                c->decode = cram_external_decode_char;
484
0
            else
485
0
                goto malformed;
486
0
            break;
487
0
        default:
488
0
            goto malformed;
489
0
        }
490
1.23k
    } else {
491
        // CRAM 3 and earlier encodes integers as EXTERNAL.  We need
492
        // use the option field to indicate the input data format so
493
        // we know which serialisation format to use.
494
1.23k
        if (option == E_INT)
495
896
            c->decode = cram_external_decode_int;
496
342
        else if (option == E_LONG)
497
0
            c->decode = cram_external_decode_long;
498
342
        else if (option == E_BYTE_ARRAY || option == E_BYTE)
499
15
            c->decode = cram_external_decode_char;
500
327
        else
501
327
            c->decode = cram_external_decode_block;
502
1.23k
    }
503
1.23k
    c->free   = cram_external_decode_free;
504
1.23k
    c->size   = cram_external_decode_size;
505
1.23k
    c->get_block = cram_external_get_block;
506
1.23k
    c->describe = cram_external_describe;
507
508
1.23k
    c->u.external.content_id = vv->varint_get32(&cp, data+size, NULL);
509
510
1.23k
    if (cp - data != size)
511
11
        goto malformed;
512
513
1.22k
    c->u.external.type = option;
514
515
1.22k
    return c;
516
517
11
 malformed:
518
11
    hts_log_error("Malformed external header stream");
519
11
    free(c);
520
11
    return NULL;
521
1.23k
}
522
523
int cram_external_encode_int(cram_slice *slice, cram_codec *c,
524
9.61M
                             char *in, int in_size) {
525
9.61M
    uint32_t *i32 = (uint32_t *)in;
526
9.61M
    return c->vv->varint_put32_blk(c->out, *i32) >= 0 ? 0 : -1;
527
9.61M
}
528
529
int cram_external_encode_sint(cram_slice *slice, cram_codec *c,
530
0
                             char *in, int in_size) {
531
0
    int32_t *i32 = (int32_t *)in;
532
0
    return c->vv->varint_put32s_blk(c->out, *i32) >= 0 ? 0 : -1;
533
0
}
534
535
int cram_external_encode_long(cram_slice *slice, cram_codec *c,
536
0
                             char *in, int in_size) {
537
0
    uint64_t *i64 = (uint64_t *)in;
538
0
    return c->vv->varint_put64_blk(c->out, *i64) >= 0 ? 0 : -1;
539
0
}
540
541
int cram_external_encode_slong(cram_slice *slice, cram_codec *c,
542
0
                               char *in, int in_size) {
543
0
    int64_t *i64 = (int64_t *)in;
544
0
    return c->vv->varint_put64s_blk(c->out, *i64) >= 0 ? 0 : -1;
545
0
}
546
547
int cram_external_encode_char(cram_slice *slice, cram_codec *c,
548
236k
                              char *in, int in_size) {
549
236k
    BLOCK_APPEND(c->out, in, in_size);
550
236k
    return 0;
551
552
0
 block_err:
553
0
    return -1;
554
236k
}
555
556
173k
void cram_external_encode_free(cram_codec *c) {
557
173k
    if (!c)
558
0
        return;
559
173k
    free(c);
560
173k
}
561
562
int cram_external_encode_store(cram_codec *c, cram_block *b, char *prefix,
563
173k
                               int version) {
564
173k
    char tmp[99], *tp = tmp, *tpend = tmp+99;
565
173k
    int len = 0, r = 0, n;
566
567
173k
    if (prefix) {
568
57.7k
        size_t l = strlen(prefix);
569
57.7k
        BLOCK_APPEND(b, prefix, l);
570
57.7k
        len += l;
571
57.7k
    }
572
573
173k
    tp += c->vv->varint_put32(tp, tpend, c->u.e_external.content_id);
574
173k
    len += (n = c->vv->varint_put32_blk(b, c->codec)); r |= n;
575
173k
    len += (n = c->vv->varint_put32_blk(b, tp-tmp));   r |= n;
576
173k
    BLOCK_APPEND(b, tmp, tp-tmp);
577
173k
    len += tp-tmp;
578
579
173k
    if (r > 0)
580
173k
        return len;
581
582
0
 block_err:
583
0
    return -1;
584
173k
}
585
586
cram_codec *cram_external_encode_init(cram_stats *st,
587
                                      enum cram_encoding codec,
588
                                      enum cram_external_type option,
589
                                      void *dat,
590
173k
                                      int version, varint_vec *vv) {
591
173k
    cram_codec *c;
592
593
173k
    c = malloc(sizeof(*c));
594
173k
    if (!c)
595
0
        return NULL;
596
173k
    c->codec = E_EXTERNAL;
597
173k
    c->free = cram_external_encode_free;
598
173k
    if (CRAM_MAJOR_VERS(version) >= 4) {
599
        // Version 4 does not permit integer data to be encoded as a
600
        // series of bytes.  This is used purely for bytes, either
601
        // singular or declared as arrays
602
0
        switch (codec) {
603
0
        case E_EXTERNAL:
604
0
            if (option != E_BYTE && option != E_BYTE_ARRAY)
605
0
                return NULL;
606
0
            c->encode = cram_external_encode_char;
607
0
            break;
608
0
        default:
609
0
            return NULL;
610
0
        }
611
173k
    } else {
612
        // CRAM 3 and earlier encodes integers as EXTERNAL.  We need
613
        // use the option field to indicate the input data format so
614
        // we know which serialisation format to use.
615
173k
        if (option == E_INT)
616
54.4k
            c->encode = cram_external_encode_int;
617
119k
        else if (option == E_LONG)
618
0
            c->encode = cram_external_encode_long;
619
119k
        else if (option == E_BYTE_ARRAY || option == E_BYTE)
620
119k
            c->encode = cram_external_encode_char;
621
0
        else
622
0
            abort();
623
173k
    }
624
173k
    c->store = cram_external_encode_store;
625
173k
    c->flush = NULL;
626
627
173k
    c->u.e_external.content_id = (size_t)dat;
628
629
173k
    return c;
630
173k
}
631
632
/*
633
 * ---------------------------------------------------------------------------
634
 * VARINT
635
 *
636
 * In CRAM 3.0 and earlier, E_EXTERNAL stored both integers in ITF8
637
 * format as well as bytes.  In CRAM 4 EXTERNAL is only for bytes and
638
 * byte arrays, with two dedicated encodings for integers:
639
 * VARINT_SIGNED and VARINT_UNSIGNED.  These also differ a little to
640
 * EXTERNAL with the addition of an offset field, meaning we can store
641
 * values in, say, the range -2 to 1 million without needing to use
642
 * a signed zig-zag transformation.
643
 */
644
int cram_varint_decode_int(cram_slice *slice, cram_codec *c,
645
0
                           cram_block *in, char *out, int *out_size) {
646
0
    char *cp;
647
0
    cram_block *b;
648
649
    /* Find the data block */
650
0
    b = cram_get_block_by_id(slice, c->u.varint.content_id);
651
0
    if (!b)
652
0
        return *out_size?-1:0;
653
654
0
    cp = (char *)b->data + b->idx;
655
    // E_INT and E_LONG are guaranteed single item queries
656
0
    int err = 0;
657
0
    *(int32_t *)out = c->vv->varint_get32(&cp,
658
0
                                          (char *)b->data + b->uncomp_size,
659
0
                                          &err) + c->u.varint.offset;
660
0
    b->idx = cp - (char *)b->data;
661
0
    *out_size = 1;
662
663
0
    return err ? -1 : 0;
664
0
}
665
666
int cram_varint_decode_sint(cram_slice *slice, cram_codec *c,
667
0
                            cram_block *in, char *out, int *out_size) {
668
0
    char *cp;
669
0
    cram_block *b;
670
671
    /* Find the data block */
672
0
    b = cram_get_block_by_id(slice, c->u.varint.content_id);
673
0
    if (!b)
674
0
        return *out_size?-1:0;
675
676
0
    cp = (char *)b->data + b->idx;
677
    // E_INT and E_LONG are guaranteed single item queries
678
0
    int err = 0;
679
0
    *(int32_t *)out = c->vv->varint_get32s(&cp,
680
0
                                           (char *)b->data + b->uncomp_size,
681
0
                                           &err) + c->u.varint.offset;
682
0
    b->idx = cp - (char *)b->data;
683
0
    *out_size = 1;
684
685
0
    return err ? -1 : 0;
686
0
}
687
688
int cram_varint_decode_long(cram_slice *slice, cram_codec *c,
689
0
                            cram_block *in, char *out, int *out_size) {
690
0
    char *cp;
691
0
    cram_block *b;
692
693
    /* Find the data block */
694
0
    b = cram_get_block_by_id(slice, c->u.varint.content_id);
695
0
    if (!b)
696
0
        return *out_size?-1:0;
697
698
0
    cp = (char *)b->data + b->idx;
699
    // E_INT and E_LONG are guaranteed single item queries
700
0
    int err = 0;
701
0
    *(int64_t *)out = c->vv->varint_get64(&cp,
702
0
                                          (char *)b->data + b->uncomp_size,
703
0
                                          &err) + c->u.varint.offset;
704
0
    b->idx = cp - (char *)b->data;
705
0
    *out_size = 1;
706
707
0
    return err ? -1 : 0;
708
0
}
709
710
int cram_varint_decode_slong(cram_slice *slice, cram_codec *c,
711
0
                             cram_block *in, char *out, int *out_size) {
712
0
    char *cp;
713
0
    cram_block *b;
714
715
    /* Find the data block */
716
0
    b = cram_get_block_by_id(slice, c->u.varint.content_id);
717
0
    if (!b)
718
0
        return *out_size?-1:0;
719
720
0
    cp = (char *)b->data + b->idx;
721
    // E_INT and E_LONG are guaranteed single item queries
722
0
    int err = 0;
723
0
    *(int64_t *)out = c->vv->varint_get64s(&cp,
724
0
                                           (char *)b->data + b->uncomp_size,
725
0
                                           &err) + c->u.varint.offset;
726
0
    b->idx = cp - (char *)b->data;
727
0
    *out_size = 1;
728
729
0
    return err ? -1 : 0;
730
0
}
731
732
542
void cram_varint_decode_free(cram_codec *c) {
733
542
    if (c)
734
542
        free(c);
735
542
}
736
737
0
int cram_varint_decode_size(cram_slice *slice, cram_codec *c) {
738
0
    cram_block *b;
739
740
    /* Find the data block */
741
0
    b = cram_get_block_by_id(slice, c->u.varint.content_id);
742
0
    if (!b)
743
0
        return -1;
744
745
0
    return b->uncomp_size;
746
0
}
747
748
0
cram_block *cram_varint_get_block(cram_slice *slice, cram_codec *c) {
749
0
    return cram_get_block_by_id(slice, c->u.varint.content_id);
750
0
}
751
752
0
int cram_varint_describe(cram_codec *c, kstring_t *ks) {
753
0
    return ksprintf(ks, "VARINT(id=%d,offset=%"PRId64",type=%d)",
754
0
                    c->u.varint.content_id,
755
0
                    c->u.varint.offset,
756
0
                    c->u.varint.type)
757
0
        < 0 ? -1 : 0;
758
0
}
759
760
cram_codec *cram_varint_decode_init(cram_block_compression_hdr *hdr,
761
                                    char *data, int size,
762
                                    enum cram_encoding codec,
763
                                    enum cram_external_type option,
764
551
                                    int version, varint_vec *vv) {
765
551
    cram_codec *c;
766
551
    char *cp = data, *cp_end = data+size;
767
768
551
    if (!(c = malloc(sizeof(*c))))
769
0
        return NULL;
770
771
551
    c->codec  = codec;
772
773
    // Function pointer choice is theoretically by codec type.
774
    // Given we have some vars as int32 and some as int64 we
775
    // use option too for sizing, although on disk format
776
    // does not change.
777
551
    switch(codec) {
778
393
    case E_VARINT_UNSIGNED:
779
393
        c->decode = (option == E_INT)
780
393
            ? cram_varint_decode_int
781
393
            : cram_varint_decode_long;
782
393
        break;
783
158
    case E_VARINT_SIGNED:
784
158
        c->decode = (option == E_INT)
785
158
            ? cram_varint_decode_sint
786
158
            : cram_varint_decode_slong;
787
158
        break;
788
0
    default:
789
0
        return NULL;
790
551
    }
791
792
551
    c->free   = cram_varint_decode_free;
793
551
    c->size   = cram_varint_decode_size;
794
551
    c->get_block = cram_varint_get_block;
795
551
    c->describe = cram_varint_describe;
796
797
551
    c->u.varint.content_id = vv->varint_get32 (&cp, cp_end, NULL);
798
551
    c->u.varint.offset     = vv->varint_get64s(&cp, cp_end, NULL);
799
800
551
    if (cp - data != size) {
801
9
        fprintf(stderr, "Malformed varint header stream\n");
802
9
        free(c);
803
9
        return NULL;
804
9
    }
805
806
542
    c->u.varint.type = option;
807
808
542
    return c;
809
551
}
810
811
int cram_varint_encode_int(cram_slice *slice, cram_codec *c,
812
0
                           char *in, int in_size) {
813
0
    uint32_t *i32 = (uint32_t *)in;
814
0
    return c->vv->varint_put32_blk(c->out, *i32 - c->u.varint.offset) >= 0
815
0
        ? 0 : -1;
816
0
}
817
818
int cram_varint_encode_sint(cram_slice *slice, cram_codec *c,
819
0
                            char *in, int in_size) {
820
0
    int32_t *i32 = (int32_t *)in;
821
0
    return c->vv->varint_put32s_blk(c->out, *i32 - c->u.varint.offset) >= 0
822
0
        ? 0 : -1;
823
0
}
824
825
int cram_varint_encode_long(cram_slice *slice, cram_codec *c,
826
0
                            char *in, int in_size) {
827
0
    uint64_t *i64 = (uint64_t *)in;
828
0
    return c->vv->varint_put64_blk(c->out, *i64 - c->u.varint.offset) >= 0
829
0
        ? 0 : -1;
830
0
}
831
832
int cram_varint_encode_slong(cram_slice *slice, cram_codec *c,
833
0
                             char *in, int in_size) {
834
0
    int64_t *i64 = (int64_t *)in;
835
0
    return c->vv->varint_put64s_blk(c->out, *i64 - c->u.varint.offset) >= 0
836
0
        ? 0 : -1;
837
0
}
838
839
0
void cram_varint_encode_free(cram_codec *c) {
840
0
    if (!c)
841
0
        return;
842
0
    free(c);
843
0
}
844
845
int cram_varint_encode_store(cram_codec *c, cram_block *b, char *prefix,
846
0
                             int version) {
847
0
    char tmp[99], *tp = tmp;
848
0
    int len = 0;
849
850
0
    if (prefix) {
851
0
        size_t l = strlen(prefix);
852
0
        BLOCK_APPEND(b, prefix, l);
853
0
        len += l;
854
0
    }
855
856
0
    tp += c->vv->varint_put32 (tp, NULL, c->u.e_varint.content_id);
857
0
    tp += c->vv->varint_put64s(tp, NULL, c->u.e_varint.offset);
858
0
    len += c->vv->varint_put32_blk(b, c->codec);
859
0
    len += c->vv->varint_put32_blk(b, tp-tmp);
860
0
    BLOCK_APPEND(b, tmp, tp-tmp);
861
0
    len += tp-tmp;
862
863
0
    return len;
864
865
0
 block_err:
866
0
    return -1;
867
0
}
868
869
cram_codec *cram_varint_encode_init(cram_stats *st,
870
                                    enum cram_encoding codec,
871
                                    enum cram_external_type option,
872
                                    void *dat,
873
0
                                    int version, varint_vec *vv) {
874
0
    cram_codec *c;
875
876
0
    if (!(c = malloc(sizeof(*c))))
877
0
        return NULL;
878
879
0
    c->u.e_varint.offset = 0;
880
0
    if (st) {
881
        // Marginal difference so far! Not worth the hassle?
882
0
        if (st->min_val < 0 && st->min_val >= -127
883
0
            && st->max_val / -st->min_val > 100) {
884
0
            c->u.e_varint.offset = -st->min_val;
885
0
            codec = E_VARINT_UNSIGNED;
886
0
        } else if (st->min_val > 0) {
887
0
            c->u.e_varint.offset = -st->min_val;
888
0
        }
889
0
    }
890
891
0
    c->codec = codec;
892
0
    c->free = cram_varint_encode_free;
893
894
    // Function pointer choice is theoretically by codec type.
895
    // Given we have some vars as int32 and some as int64 we
896
    // use option too for sizing, although on disk format
897
    // does not change.
898
0
    switch (codec) {
899
0
    case E_VARINT_UNSIGNED:
900
0
        c->encode = (option == E_INT)
901
0
            ? cram_varint_encode_int
902
0
            : cram_varint_encode_long;
903
0
        break;
904
0
    case E_VARINT_SIGNED:
905
0
        c->encode = (option == E_INT)
906
0
            ? cram_varint_encode_sint
907
0
            : cram_varint_encode_slong;
908
0
        break;
909
0
    default:
910
0
        return NULL;
911
0
    }
912
0
    c->store = cram_varint_encode_store;
913
0
    c->flush = NULL;
914
915
0
    c->u.e_varint.content_id = (size_t)dat;
916
917
0
    return c;
918
0
}
919
/*
920
 * ---------------------------------------------------------------------------
921
 * CONST_BYTE and CONST_INT
922
 */
923
int cram_const_decode_byte(cram_slice *slice, cram_codec *c,
924
0
                           cram_block *in, char *out, int *out_size) {
925
0
    int i, n;
926
927
0
    for (i = 0, n = *out_size; i < n; i++)
928
0
        out[i] = c->u.xconst.val;
929
930
0
    return 0;
931
0
}
932
933
int cram_const_decode_int(cram_slice *slice, cram_codec *c,
934
0
                          cram_block *in, char *out, int *out_size) {
935
0
    int32_t *out_i = (int32_t *)out;
936
0
    int i, n;
937
938
0
    for (i = 0, n = *out_size; i < n; i++)
939
0
        out_i[i] = c->u.xconst.val;
940
941
0
    return 0;
942
0
}
943
944
int cram_const_decode_long(cram_slice *slice, cram_codec *c,
945
0
                           cram_block *in, char *out, int *out_size) {
946
0
    int64_t *out_i = (int64_t *)out;
947
0
    int i, n;
948
949
0
    for (i = 0, n = *out_size; i < n; i++)
950
0
        out_i[i] = c->u.xconst.val;
951
952
0
    return 0;
953
0
}
954
955
243
void cram_const_decode_free(cram_codec *c) {
956
243
    if (c)
957
243
        free(c);
958
243
}
959
960
0
int cram_const_decode_size(cram_slice *slice, cram_codec *c) {
961
0
    return 0;
962
0
}
963
964
0
int cram_const_describe(cram_codec *c, kstring_t *ks) {
965
0
    return ksprintf(ks, "CONST(val=%"PRId64")",
966
0
                    c->u.xconst.val) < 0 ? -1 : 0;
967
0
}
968
969
cram_codec *cram_const_decode_init(cram_block_compression_hdr *hdr,
970
                                   char *data, int size,
971
                                   enum cram_encoding codec,
972
                                   enum cram_external_type option,
973
245
                                   int version, varint_vec *vv) {
974
245
    cram_codec *c;
975
245
    char *cp = data;
976
977
245
    if (!(c = malloc(sizeof(*c))))
978
0
        return NULL;
979
980
245
    c->codec  = codec;
981
245
    if (codec == E_CONST_BYTE)
982
2
        c->decode = cram_const_decode_byte;
983
243
    else if (option == E_INT)
984
141
        c->decode = cram_const_decode_int;
985
102
    else
986
102
        c->decode = cram_const_decode_long;
987
245
    c->free   = cram_const_decode_free;
988
245
    c->size   = cram_const_decode_size;
989
245
    c->get_block = NULL;
990
245
    c->describe = cram_const_describe;
991
992
245
    c->u.xconst.val = vv->varint_get64s(&cp, data+size, NULL);
993
994
245
    if (cp - data != size) {
995
2
        fprintf(stderr, "Malformed const header stream\n");
996
2
        free(c);
997
2
        return NULL;
998
2
    }
999
1000
243
    return c;
1001
245
}
1002
1003
int cram_const_encode(cram_slice *slice, cram_codec *c,
1004
0
                      char *in, int in_size) {
1005
0
    return 0;
1006
0
}
1007
1008
int cram_const_encode_store(cram_codec *c, cram_block *b, char *prefix,
1009
0
                            int version) {
1010
0
    char tmp[99], *tp = tmp;
1011
0
    int len = 0;
1012
1013
0
    if (prefix) {
1014
0
        size_t l = strlen(prefix);
1015
0
        BLOCK_APPEND(b, prefix, l);
1016
0
        len += l;
1017
0
    }
1018
1019
0
    tp += c->vv->varint_put64s(tp, NULL, c->u.xconst.val);
1020
0
    len += c->vv->varint_put32_blk(b, c->codec);
1021
0
    len += c->vv->varint_put32_blk(b, tp-tmp);
1022
0
    BLOCK_APPEND(b, tmp, tp-tmp);
1023
0
    len += tp-tmp;
1024
1025
0
    return len;
1026
1027
0
 block_err:
1028
0
    return -1;
1029
0
}
1030
1031
cram_codec *cram_const_encode_init(cram_stats *st,
1032
                                   enum cram_encoding codec,
1033
                                   enum cram_external_type option,
1034
                                   void *dat,
1035
0
                                   int version, varint_vec *vv) {
1036
0
    cram_codec *c;
1037
1038
0
    if (!(c = malloc(sizeof(*c))))
1039
0
        return NULL;
1040
1041
0
    c->codec = codec;
1042
0
    c->free = cram_const_decode_free; // as as decode
1043
0
    c->encode = cram_const_encode; // a nop
1044
0
    c->store = cram_const_encode_store;
1045
0
    c->flush = NULL;
1046
0
    c->u.e_xconst.val = st->min_val;
1047
1048
0
    return c;
1049
0
}
1050
1051
/*
1052
 * ---------------------------------------------------------------------------
1053
 * BETA
1054
 */
1055
0
int cram_beta_decode_long(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
1056
0
    int64_t *out_i = (int64_t *)out;
1057
0
    int i, n = *out_size;
1058
1059
0
    if (c->u.beta.nbits) {
1060
0
        if (cram_not_enough_bits(in, c->u.beta.nbits * n))
1061
0
            return -1;
1062
1063
0
        for (i = 0; i < n; i++)
1064
0
            out_i[i] = get_bits_MSB(in, c->u.beta.nbits) - c->u.beta.offset;
1065
0
    } else {
1066
0
        for (i = 0; i < n; i++)
1067
0
            out_i[i] = -c->u.beta.offset;
1068
0
    }
1069
1070
0
    return 0;
1071
0
}
1072
1073
0
int cram_beta_decode_int(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
1074
0
    int32_t *out_i = (int32_t *)out;
1075
0
    int i, n = *out_size;
1076
1077
0
    if (c->u.beta.nbits) {
1078
0
        if (cram_not_enough_bits(in, c->u.beta.nbits * n))
1079
0
            return -1;
1080
1081
0
        for (i = 0; i < n; i++)
1082
0
            out_i[i] = get_bits_MSB(in, c->u.beta.nbits) - c->u.beta.offset;
1083
0
    } else {
1084
0
        for (i = 0; i < n; i++)
1085
0
            out_i[i] = -c->u.beta.offset;
1086
0
    }
1087
1088
0
    return 0;
1089
0
}
1090
1091
0
int cram_beta_decode_char(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
1092
0
    int i, n = *out_size;
1093
1094
1095
0
    if (c->u.beta.nbits) {
1096
0
        if (cram_not_enough_bits(in, c->u.beta.nbits * n))
1097
0
            return -1;
1098
1099
0
        if (out)
1100
0
            for (i = 0; i < n; i++)
1101
0
                out[i] = get_bits_MSB(in, c->u.beta.nbits) - c->u.beta.offset;
1102
0
        else
1103
0
            for (i = 0; i < n; i++)
1104
0
                get_bits_MSB(in, c->u.beta.nbits);
1105
0
    } else {
1106
0
        if (out)
1107
0
            for (i = 0; i < n; i++)
1108
0
                out[i] = -c->u.beta.offset;
1109
0
    }
1110
1111
0
    return 0;
1112
0
}
1113
1114
84
void cram_beta_decode_free(cram_codec *c) {
1115
84
    if (c)
1116
84
        free(c);
1117
84
}
1118
1119
0
int cram_beta_describe(cram_codec *c, kstring_t *ks) {
1120
0
    return ksprintf(ks, "BETA(offset=%d, nbits=%d)",
1121
0
                    c->u.beta.offset, c->u.beta.nbits)
1122
0
        < 0 ? -1 : 0;
1123
0
}
1124
1125
cram_codec *cram_beta_decode_init(cram_block_compression_hdr *hdr,
1126
                                  char *data, int size,
1127
                                  enum cram_encoding codec,
1128
                                  enum cram_external_type option,
1129
84
                                  int version, varint_vec *vv) {
1130
84
    cram_codec *c;
1131
84
    char *cp = data;
1132
1133
84
    if (!(c = malloc(sizeof(*c))))
1134
0
        return NULL;
1135
1136
84
    c->codec  = E_BETA;
1137
84
    if (option == E_INT || option == E_SINT)
1138
33
        c->decode = cram_beta_decode_int;
1139
51
    else if (option == E_LONG || option == E_SLONG)
1140
0
        c->decode = cram_beta_decode_long;
1141
51
    else if (option == E_BYTE_ARRAY || option == E_BYTE)
1142
51
        c->decode = cram_beta_decode_char;
1143
0
    else {
1144
0
        hts_log_error("BYTE_ARRAYs not supported by this codec");
1145
0
        free(c);
1146
0
        return NULL;
1147
0
    }
1148
84
    c->free   = cram_beta_decode_free;
1149
84
    c->describe = cram_beta_describe;
1150
1151
84
    c->u.beta.nbits = -1;
1152
84
    c->u.beta.offset = vv->varint_get32(&cp, data + size, NULL);
1153
84
    if (cp < data + size) // Ensure test below works
1154
84
        c->u.beta.nbits  = vv->varint_get32(&cp, data + size, NULL);
1155
1156
84
    if (cp - data != size
1157
84
        || c->u.beta.nbits < 0 || c->u.beta.nbits > 8 * sizeof(int)) {
1158
0
        hts_log_error("Malformed beta header stream");
1159
0
        free(c);
1160
0
        return NULL;
1161
0
    }
1162
1163
84
    return c;
1164
84
}
1165
1166
int cram_beta_encode_store(cram_codec *c, cram_block *b,
1167
684
                           char *prefix, int version) {
1168
684
    int len = 0, r = 0, n;
1169
1170
684
    if (prefix) {
1171
684
        size_t l = strlen(prefix);
1172
684
        BLOCK_APPEND(b, prefix, l);
1173
684
        len += l;
1174
684
    }
1175
1176
684
    len += (n = c->vv->varint_put32_blk(b, c->codec)); r |= n;
1177
    // codec length
1178
684
    len += (n = c->vv->varint_put32_blk(b, c->vv->varint_size(c->u.e_beta.offset)
1179
684
                                         + c->vv->varint_size(c->u.e_beta.nbits)));
1180
684
    r |= n;
1181
684
    len += (n = c->vv->varint_put32_blk(b, c->u.e_beta.offset)); r |= n;
1182
684
    len += (n = c->vv->varint_put32_blk(b, c->u.e_beta.nbits));  r |= n;
1183
1184
684
    if (r > 0) return len;
1185
1186
0
 block_err:
1187
0
    return -1;
1188
684
}
1189
1190
int cram_beta_encode_long(cram_slice *slice, cram_codec *c,
1191
0
                          char *in, int in_size) {
1192
0
    int64_t *syms = (int64_t *)in;
1193
0
    int i, r = 0;
1194
1195
0
    for (i = 0; i < in_size; i++)
1196
0
        r |= store_bits_MSB(c->out, syms[i] + c->u.e_beta.offset,
1197
0
                            c->u.e_beta.nbits);
1198
1199
0
    return r;
1200
0
}
1201
1202
int cram_beta_encode_int(cram_slice *slice, cram_codec *c,
1203
12.8k
                         char *in, int in_size) {
1204
12.8k
    int *syms = (int *)in;
1205
12.8k
    int i, r = 0;
1206
1207
25.6k
    for (i = 0; i < in_size; i++)
1208
12.8k
        r |= store_bits_MSB(c->out, syms[i] + c->u.e_beta.offset,
1209
12.8k
                            c->u.e_beta.nbits);
1210
1211
12.8k
    return r;
1212
12.8k
}
1213
1214
int cram_beta_encode_char(cram_slice *slice, cram_codec *c,
1215
0
                          char *in, int in_size) {
1216
0
    unsigned char *syms = (unsigned char *)in;
1217
0
    int i, r = 0;
1218
1219
0
    for (i = 0; i < in_size; i++)
1220
0
        r |= store_bits_MSB(c->out, syms[i] + c->u.e_beta.offset,
1221
0
                            c->u.e_beta.nbits);
1222
1223
0
    return r;
1224
0
}
1225
1226
684
void cram_beta_encode_free(cram_codec *c) {
1227
684
    if (c) free(c);
1228
684
}
1229
1230
cram_codec *cram_beta_encode_init(cram_stats *st,
1231
                                  enum cram_encoding codec,
1232
                                  enum cram_external_type option,
1233
                                  void *dat,
1234
699
                                  int version, varint_vec *vv) {
1235
699
    cram_codec *c;
1236
699
    hts_pos_t min_val, max_val;
1237
699
    int len = 0;
1238
699
    int64_t range;
1239
1240
699
    c = malloc(sizeof(*c));
1241
699
    if (!c)
1242
0
        return NULL;
1243
699
    c->codec  = E_BETA;
1244
699
    c->free   = cram_beta_encode_free;
1245
699
    if (option == E_INT || option == E_SINT)
1246
699
        c->encode = cram_beta_encode_int;
1247
0
    else if (option == E_LONG || option == E_SLONG)
1248
0
        c->encode = cram_beta_encode_long;
1249
0
    else
1250
0
        c->encode = cram_beta_encode_char;
1251
699
    c->store  = cram_beta_encode_store;
1252
699
    c->flush = NULL;
1253
1254
699
    if (dat) {
1255
699
        min_val = ((hts_pos_t *)dat)[0];
1256
699
        max_val = ((hts_pos_t *)dat)[1];
1257
699
    } else {
1258
0
        min_val = INT_MAX;
1259
0
        max_val = INT_MIN;
1260
0
        int i;
1261
0
        for (i = 0; i < MAX_STAT_VAL; i++) {
1262
0
            if (!st->freqs[i])
1263
0
                continue;
1264
0
            if (min_val > i)
1265
0
                min_val = i;
1266
0
            max_val = i;
1267
0
        }
1268
0
        if (st->h) {
1269
0
            khint_t k;
1270
1271
0
            for (k = kh_begin(st->h); k != kh_end(st->h); k++) {
1272
0
                if (!kh_exist(st->h, k))
1273
0
                    continue;
1274
1275
0
                i = kh_key(st->h, k);
1276
0
                if (min_val > i)
1277
0
                    min_val = i;
1278
0
                if (max_val < i)
1279
0
                    max_val = i;
1280
0
            }
1281
0
        }
1282
0
    }
1283
1284
699
    if (max_val < min_val)
1285
0
        goto err;
1286
1287
699
    range = (int64_t) max_val - min_val;
1288
699
    switch (option) {
1289
0
    case E_SINT:
1290
0
        if (min_val < INT_MIN || range > INT_MAX)
1291
0
            goto err;
1292
0
        break;
1293
1294
699
    case E_INT:
1295
699
        if (max_val > UINT_MAX || range > UINT_MAX)
1296
15
            goto err;
1297
684
        break;
1298
1299
684
    default:
1300
0
        break;
1301
699
    }
1302
1303
684
    c->u.e_beta.offset = -min_val;
1304
3.02k
    while (range) {
1305
2.33k
        len++;
1306
2.33k
        range >>= 1;
1307
2.33k
    }
1308
684
    c->u.e_beta.nbits = len;
1309
1310
684
    return c;
1311
1312
15
 err:
1313
15
    free(c);
1314
15
    return NULL;
1315
699
}
1316
1317
/*
1318
 * ---------------------------------------------------------------------------
1319
 * XPACK: Packing multiple values into a single byte.  A fast transform that
1320
 * reduces time taken by entropy encoder and may also improve compression.
1321
 *
1322
 * This also has the additional requirement that the data series is not
1323
 * interleaved with another, permitting efficient encoding and decoding
1324
 * of all elements enmasse instead of needing to only extract the bits
1325
 * necessary per item.
1326
 */
1327
0
int cram_xpack_decode_long(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
1328
0
    int64_t *out_i = (int64_t *)out;
1329
0
    int i, n = *out_size;
1330
1331
0
    if (c->u.xpack.nbits) {
1332
0
        for (i = 0; i < n; i++)
1333
0
            out_i[i] = c->u.xpack.rmap[get_bits_MSB(in, c->u.xpack.nbits)];
1334
0
    } else {
1335
0
        for (i = 0; i < n; i++)
1336
0
            out_i[i] = c->u.xpack.rmap[0];
1337
0
    }
1338
1339
0
    return 0;
1340
0
}
1341
1342
0
int cram_xpack_decode_int(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
1343
0
    int32_t *out_i = (int32_t *)out;
1344
0
    int i, n = *out_size;
1345
1346
0
    if (c->u.xpack.nbits) {
1347
0
        if (cram_not_enough_bits(in, c->u.xpack.nbits * n))
1348
0
            return -1;
1349
1350
0
        for (i = 0; i < n; i++)
1351
0
            out_i[i] = c->u.xpack.rmap[get_bits_MSB(in, c->u.xpack.nbits)];
1352
0
    } else {
1353
0
        for (i = 0; i < n; i++)
1354
0
            out_i[i] = c->u.xpack.rmap[0];
1355
0
    }
1356
1357
0
    return 0;
1358
0
}
1359
1360
0
static int cram_xpack_decode_expand_char(cram_slice *slice, cram_codec *c) {
1361
0
    cram_block *b = slice->block_by_id[512 + c->codec_id];
1362
0
    if (b)
1363
0
        return 0;
1364
1365
    // get sub-codec data.
1366
0
    cram_block *sub_b = c->u.xpack.sub_codec->get_block(slice, c->u.xpack.sub_codec);
1367
0
    if (!sub_b)
1368
0
        return -1;
1369
1370
    // Allocate local block to expand into
1371
0
    b = slice->block_by_id[512 + c->codec_id] = cram_new_block(0, 0);
1372
0
    if (!b)
1373
0
        return -1;
1374
0
    int n = sub_b->uncomp_size * 8/c->u.xpack.nbits;
1375
0
    BLOCK_GROW(b, n);
1376
0
    b->uncomp_size = n;
1377
1378
0
    uint8_t p[256];
1379
0
    int z;
1380
0
    for (z = 0; z < 256; z++)
1381
0
        p[z] = c->u.xpack.rmap[z];
1382
0
    hts_unpack(sub_b->data, sub_b->uncomp_size, b->data, b->uncomp_size,
1383
0
               8 / c->u.xpack.nbits, p);
1384
1385
0
    return 0;
1386
1387
0
 block_err:
1388
0
    return -1;
1389
0
}
1390
1391
0
int cram_xpack_decode_char(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
1392
    // FIXME: we need to ban data-series interleaving in the spec for this to work.
1393
1394
    // Remember this may be called when threaded and multi-slice per container.
1395
    // Hence one cram_codec instance, multiple slices, multiple blocks.
1396
    // We therefore have to cache appropriate block info in slice and not codec.
1397
    //    b = cram_get_block_by_id(slice, c->external.content_id);
1398
0
    if (c->u.xpack.nval > 1) {
1399
0
        cram_xpack_decode_expand_char(slice, c);
1400
0
        cram_block *b = slice->block_by_id[512 + c->codec_id];
1401
0
        if (!b)
1402
0
            return -1;
1403
1404
0
        if (out)
1405
0
            memcpy(out, b->data + b->byte, *out_size);
1406
0
        b->byte += *out_size;
1407
0
    } else {
1408
0
        memset(out, c->u.xpack.rmap[0], *out_size);
1409
0
    }
1410
1411
0
    return 0;
1412
0
}
1413
1414
492
void cram_xpack_decode_free(cram_codec *c) {
1415
492
    if (!c) return;
1416
1417
492
    if (c->u.xpack.sub_codec)
1418
480
        c->u.xpack.sub_codec->free(c->u.xpack.sub_codec);
1419
1420
    //free(slice->block_by_id[512 + c->codec_id]);
1421
    //slice->block_by_id[512 + c->codec_id] = 0;
1422
1423
492
    free(c);
1424
492
}
1425
1426
0
int cram_xpack_decode_size(cram_slice *slice, cram_codec *c) {
1427
0
    cram_xpack_decode_expand_char(slice, c);
1428
0
    return slice->block_by_id[512 + c->codec_id]->uncomp_size;
1429
0
}
1430
1431
0
cram_block *cram_xpack_get_block(cram_slice *slice, cram_codec *c) {
1432
0
    cram_xpack_decode_expand_char(slice, c);
1433
0
    return slice->block_by_id[512 + c->codec_id];
1434
0
}
1435
1436
cram_codec *cram_xpack_decode_init(cram_block_compression_hdr *hdr,
1437
                                   char *data, int size,
1438
                                   enum cram_encoding codec,
1439
                                   enum cram_external_type option,
1440
492
                                   int version, varint_vec *vv) {
1441
492
    cram_codec *c;
1442
492
    char *cp = data;
1443
492
    char *endp = data+size;
1444
1445
492
    if (!(c = calloc(1, sizeof(*c))))
1446
0
        return NULL;
1447
1448
492
    c->codec  = E_XPACK;
1449
492
    if (option == E_LONG)
1450
0
        c->decode = cram_xpack_decode_long;
1451
492
    else if (option == E_INT)
1452
279
        c->decode = cram_xpack_decode_int;
1453
213
    else if (option == E_BYTE_ARRAY || option == E_BYTE)
1454
213
        c->decode = cram_xpack_decode_char;
1455
0
    else {
1456
0
        fprintf(stderr, "BYTE_ARRAYs not supported by this codec\n");
1457
0
        goto malformed;
1458
0
    }
1459
492
    c->free = cram_xpack_decode_free;
1460
492
    c->size = cram_xpack_decode_size;
1461
492
    c->get_block = cram_xpack_get_block;
1462
492
    c->describe = NULL;
1463
1464
492
    c->u.xpack.nbits = vv->varint_get32(&cp, endp, NULL);
1465
492
    c->u.xpack.nval  = vv->varint_get32(&cp, endp, NULL);
1466
492
    if (c->u.xpack.nbits >= 8  || c->u.xpack.nbits < 0 ||
1467
492
        c->u.xpack.nval  > 256 || c->u.xpack.nval < 0)
1468
0
        goto malformed;
1469
492
    int i;
1470
2.15k
    for (i = 0; i < c->u.xpack.nval; i++) {
1471
1.66k
        uint32_t v = vv->varint_get32(&cp, endp, NULL);
1472
1.66k
        if (v >= 256)
1473
0
            goto malformed;
1474
1.66k
        c->u.xpack.rmap[i] = v; // reverse map: e.g 0-3 to P,A,C,K
1475
1.66k
    }
1476
1477
492
    int encoding = vv->varint_get32(&cp, endp, NULL);
1478
492
    int sub_size = vv->varint_get32(&cp, endp, NULL);
1479
492
    if (sub_size < 0 || endp - cp < sub_size)
1480
3
        goto malformed;
1481
489
    c->u.xpack.sub_codec = cram_decoder_init(hdr, encoding, cp, sub_size,
1482
489
                                             option, version, vv);
1483
489
    if (c->u.xpack.sub_codec == NULL)
1484
9
        goto malformed;
1485
480
    cp += sub_size;
1486
1487
480
    if (cp - data != size
1488
477
        || c->u.xpack.nbits < 0 || c->u.xpack.nbits > 8 * sizeof(int64_t)) {
1489
15
    malformed:
1490
15
        fprintf(stderr, "Malformed xpack header stream\n");
1491
15
        cram_xpack_decode_free(c);
1492
15
        return NULL;
1493
3
    }
1494
1495
477
    return c;
1496
480
}
1497
1498
0
int cram_xpack_encode_flush(cram_codec *c) {
1499
    // Pack the buffered up data
1500
0
    int meta_len;
1501
0
    uint64_t out_len;
1502
0
    uint8_t out_meta[1024];
1503
0
    uint8_t *out = hts_pack(BLOCK_DATA(c->out), BLOCK_SIZE(c->out),
1504
0
                            out_meta, &meta_len, &out_len);
1505
1506
    // We now need to pass this through the next layer of transform
1507
0
    if (c->u.e_xpack.sub_codec->encode(NULL, // also indicates flush incoming
1508
0
                                     c->u.e_xpack.sub_codec,
1509
0
                                     (char *)out, out_len))
1510
0
        return -1;
1511
1512
0
    int r = 0;
1513
0
    if (c->u.e_xpack.sub_codec->flush)
1514
0
        r = c->u.e_xpack.sub_codec->flush(c->u.e_xpack.sub_codec);
1515
1516
0
    free(out);
1517
0
    return r;
1518
0
}
1519
1520
int cram_xpack_encode_store(cram_codec *c, cram_block *b,
1521
0
                            char *prefix, int version) {
1522
0
    int len = 0, r = 0, n;
1523
1524
0
    if (prefix) {
1525
0
        size_t l = strlen(prefix);
1526
0
        BLOCK_APPEND(b, prefix, l);
1527
0
        len += l;
1528
0
    }
1529
1530
    // Store sub-codec
1531
0
    cram_codec *tc = c->u.e_xpack.sub_codec;
1532
0
    cram_block *tb = cram_new_block(0, 0);
1533
0
    if (!tb)
1534
0
        return -1;
1535
0
    int len2 = tc->store(tc, tb, NULL, version);
1536
1537
0
    len += (n = c->vv->varint_put32_blk(b, c->codec)); r |= n;
1538
1539
    // codec length
1540
0
    int len1 = 0, i;
1541
0
    for (i = 0; i < c->u.e_xpack.nval; i++)
1542
0
        len1 += (n = c->vv->varint_size(c->u.e_xpack.rmap[i])), r |= n;
1543
0
    len += (n = c->vv->varint_put32_blk(b, c->vv->varint_size(c->u.e_xpack.nbits)
1544
0
                                        +  c->vv->varint_size(c->u.e_xpack.nval)
1545
0
                                        + len1 + len2)); r |= n;
1546
1547
    // The map and sub-codec
1548
0
    len += (n = c->vv->varint_put32_blk(b, c->u.e_xpack.nbits)); r |= n;
1549
0
    len += (n = c->vv->varint_put32_blk(b, c->u.e_xpack.nval));  r |= n;
1550
0
    for (i = 0; i < c->u.e_xpack.nval; i++)
1551
0
        len += (n = c->vv->varint_put32_blk(b, c->u.e_xpack.rmap[i])), r |= n;
1552
1553
0
    BLOCK_APPEND(b, BLOCK_DATA(tb), BLOCK_SIZE(tb));
1554
1555
0
    cram_free_block(tb);
1556
1557
0
    return r > 0 ? len + len2 : -1;
1558
1559
0
 block_err:
1560
0
    return -1;
1561
0
}
1562
1563
// Same as cram_beta_encode_long
1564
int cram_xpack_encode_long(cram_slice *slice, cram_codec *c,
1565
0
                           char *in, int in_size) {
1566
0
    int64_t *syms = (int64_t *)in;
1567
0
    int i, r = 0;
1568
1569
0
    for (i = 0; i < in_size; i++)
1570
0
        r |= store_bits_MSB(c->out, c->u.e_xpack.map[syms[i]], c->u.e_xpack.nbits);
1571
1572
0
    return r;
1573
0
}
1574
1575
int cram_xpack_encode_int(cram_slice *slice, cram_codec *c,
1576
0
                          char *in, int in_size) {
1577
0
    int *syms = (int *)in;
1578
0
    int i, r = 0;
1579
1580
0
    for (i = 0; i < in_size; i++)
1581
0
        r |= store_bits_MSB(c->out, c->u.e_xpack.map[syms[i]], c->u.e_xpack.nbits);
1582
1583
0
    return r;
1584
0
}
1585
1586
int cram_xpack_encode_char(cram_slice *slice, cram_codec *c,
1587
0
                           char *in, int in_size) {
1588
0
    BLOCK_APPEND(c->out, in, in_size);
1589
0
    return 0;
1590
1591
0
 block_err:
1592
0
    return -1;
1593
0
}
1594
1595
0
void cram_xpack_encode_free(cram_codec *c) {
1596
0
    if (!c) return;
1597
1598
0
    if (c->u.e_xpack.sub_codec)
1599
0
        c->u.e_xpack.sub_codec->free(c->u.e_xpack.sub_codec);
1600
1601
0
    cram_free_block(c->out);
1602
1603
0
    free(c);
1604
0
}
1605
1606
cram_codec *cram_xpack_encode_init(cram_stats *st,
1607
                                   enum cram_encoding codec,
1608
                                   enum cram_external_type option,
1609
                                   void *dat,
1610
0
                                   int version, varint_vec *vv) {
1611
0
    cram_codec *c;
1612
1613
0
    if (!(c = malloc(sizeof(*c))))
1614
0
        return NULL;
1615
1616
0
    c->codec  = E_XPACK;
1617
0
    c->free   = cram_xpack_encode_free;
1618
0
    if (option == E_LONG)
1619
0
        c->encode = cram_xpack_encode_long;
1620
0
    else if (option == E_INT)
1621
0
        c->encode = cram_xpack_encode_int;
1622
0
    else
1623
0
        c->encode = cram_xpack_encode_char;
1624
0
    c->store  = cram_xpack_encode_store;
1625
0
    c->flush  = cram_xpack_encode_flush;
1626
1627
0
    cram_xpack_encoder *e = (cram_xpack_encoder *)dat;
1628
0
    c->u.e_xpack.nbits = e->nbits;
1629
0
    c->u.e_xpack.nval = e->nval;
1630
0
    c->u.e_xpack.sub_codec = cram_encoder_init(e->sub_encoding, NULL,
1631
0
                                               E_BYTE_ARRAY, e->sub_codec_dat,
1632
0
                                               version, vv);
1633
1634
    // Initialise fwd and rev maps
1635
0
    memcpy(c->u.e_xpack.map, e->map, sizeof(e->map)); // P,A,C,K to 0,1,2,3
1636
0
    int i, n;
1637
0
    for (i = n = 0; i < 256; i++)
1638
0
        if (e->map[i] != -1)
1639
0
            c->u.e_xpack.rmap[n++] = i;               // 0,1,2,3 to P,A,C,K
1640
0
    if (n != e->nval) {
1641
0
        fprintf(stderr, "Incorrectly specified number of map items in PACK\n");
1642
0
        return NULL;
1643
0
    }
1644
1645
0
    return c;
1646
0
}
1647
1648
/*
1649
 * ---------------------------------------------------------------------------
1650
 * XDELTA: subtract successive values, zig-zag to turn +/- to + only,
1651
 * and then var-int encode the result.
1652
 *
1653
 * This also has the additional requirement that the data series is not
1654
 * interleaved with another, permitting efficient encoding and decoding
1655
 * of all elements enmasse instead of needing to only extract the bits
1656
 * necessary per item.
1657
 */
1658
1659
0
static uint8_t  zigzag8 (int8_t  x) { return (x << 1) ^ (x >>  7); }
1660
0
static uint16_t zigzag16(int16_t x) { return (x << 1) ^ (x >> 15); }
1661
0
static uint32_t zigzag32(int32_t x) { return (x << 1) ^ (x >> 31); }
1662
1663
//static int8_t  unzigzag8 (uint8_t  x) { return (x >> 1) ^ -(x & 1); }
1664
0
static int16_t unzigzag16(uint16_t x) { return (x >> 1) ^ -(x & 1); }
1665
0
static int32_t unzigzag32(uint32_t x) { return (x >> 1) ^ -(x & 1); }
1666
1667
0
int cram_xdelta_decode_long(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
1668
0
    return -1;
1669
0
}
1670
1671
0
int cram_xdelta_decode_int(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
1672
    // Slow value-by-value method for now
1673
0
    uint32_t *out32 = (uint32_t *)out;
1674
0
    int i;
1675
0
    for (i = 0; i < *out_size; i++) {
1676
0
        uint32_t v;
1677
0
        int one = 1;
1678
0
        if (c->u.e_xdelta.sub_codec->decode(slice, c->u.e_xdelta.sub_codec, in,
1679
0
                                          (char *)&v, &one) < 0)
1680
0
            return -1;
1681
0
        uint32_t d = unzigzag32(v);
1682
0
        c->u.xdelta.last = out32[i] = d + c->u.xdelta.last;
1683
0
    }
1684
1685
0
    return 0;
1686
0
}
1687
1688
0
static int cram_xdelta_decode_expand_char(cram_slice *slice, cram_codec *c) {
1689
0
    return -1;
1690
0
}
1691
1692
0
int cram_xdelta_decode_char(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
1693
0
    return -1;
1694
0
}
1695
1696
0
static inline int16_t le_int2(int16_t i) {
1697
0
    int16_t s;
1698
0
    i16_to_le(i, (uint8_t *)&s);
1699
0
    return s;
1700
0
}
1701
1702
int cram_xdelta_decode_block(cram_slice *slice, cram_codec *c, cram_block *in,
1703
0
                             char *out_, int *out_size) {
1704
0
    cram_block *out = (cram_block *)out_;
1705
0
    cram_block *b = c->u.e_xdelta.sub_codec->get_block(slice, c->u.e_xdelta.sub_codec);
1706
0
    int i = 0;
1707
1708
0
    const int w = c->u.xdelta.word_size;
1709
0
    uint32_t npad = (w - *out_size%w)%w;
1710
0
    uint32_t out_sz = *out_size + npad;
1711
0
    c->u.xdelta.last = 0;  // reset for each new array
1712
1713
0
    for (i = 0; i < out_sz; i += w) {
1714
0
        uint16_t v;
1715
        // Need better interface
1716
0
        char *cp = (char *)b->data + b->byte;
1717
0
        char *cp_end = (char *)b->data + b->uncomp_size;
1718
0
        int err = 0;
1719
0
        v = c->vv->varint_get32(&cp, cp_end, &err);
1720
0
        if (err)
1721
0
            return -1;
1722
0
        b->byte = cp - (char *)b->data;
1723
1724
0
        switch(w) {
1725
0
        case 2: {
1726
0
            int16_t d = unzigzag16(v), z;
1727
0
            c->u.xdelta.last = d + c->u.xdelta.last;
1728
0
            z = le_int2(c->u.xdelta.last);
1729
0
            BLOCK_APPEND(out, &z, 2-npad);
1730
0
            npad = 0;
1731
0
            break;
1732
0
        }
1733
0
        default:
1734
0
            fprintf(stderr, "Unsupported word size by XDELTA\n");
1735
0
            return -1;
1736
0
        }
1737
0
    }
1738
1739
0
    return 0;
1740
1741
0
 block_err:
1742
0
    return -1;
1743
0
}
1744
1745
84
void cram_xdelta_decode_free(cram_codec *c) {
1746
84
    if (!c) return;
1747
1748
84
    if (c->u.xdelta.sub_codec)
1749
78
        c->u.xdelta.sub_codec->free(c->u.xdelta.sub_codec);
1750
1751
84
    free(c);
1752
84
}
1753
1754
0
int cram_xdelta_decode_size(cram_slice *slice, cram_codec *c) {
1755
0
    cram_xdelta_decode_expand_char(slice, c);
1756
0
    return slice->block_by_id[512 + c->codec_id]->uncomp_size;
1757
0
}
1758
1759
0
cram_block *cram_xdelta_get_block(cram_slice *slice, cram_codec *c) {
1760
0
    cram_xdelta_decode_expand_char(slice, c);
1761
0
    return slice->block_by_id[512 + c->codec_id];
1762
0
}
1763
1764
cram_codec *cram_xdelta_decode_init(cram_block_compression_hdr *hdr,
1765
                                    char *data, int size,
1766
                                    enum cram_encoding codec,
1767
                                    enum cram_external_type option,
1768
84
                                    int version, varint_vec *vv) {
1769
84
    cram_codec *c;
1770
84
    char *cp = data;
1771
84
    char *endp = data+size;
1772
1773
84
    if (!(c = calloc(1, sizeof(*c))))
1774
0
        return NULL;
1775
1776
84
    c->codec  = E_XDELTA;
1777
84
    if (option == E_LONG)
1778
0
        c->decode = cram_xdelta_decode_long;
1779
84
    else if (option == E_INT)
1780
30
        c->decode = cram_xdelta_decode_int;
1781
54
    else if (option == E_BYTE_ARRAY || option == E_BYTE)
1782
48
        c->decode = cram_xdelta_decode_char;
1783
6
    else if (option == E_BYTE_ARRAY_BLOCK) {
1784
6
        option = E_BYTE_ARRAY;
1785
6
        c->decode = cram_xdelta_decode_block;
1786
6
    } else {
1787
0
        free(c);
1788
0
        return NULL;
1789
0
    }
1790
84
    c->free = cram_xdelta_decode_free;
1791
84
    c->size = cram_xdelta_decode_size;
1792
84
    c->get_block = cram_xdelta_get_block;
1793
84
    c->describe = NULL;
1794
1795
84
    c->u.xdelta.word_size = vv->varint_get32(&cp, endp, NULL);
1796
84
    c->u.xdelta.last = 0;
1797
1798
84
    int encoding = vv->varint_get32(&cp, endp, NULL);
1799
84
    int sub_size = vv->varint_get32(&cp, endp, NULL);
1800
84
    if (sub_size < 0 || endp - cp < sub_size)
1801
0
        goto malformed;
1802
84
    c->u.xdelta.sub_codec = cram_decoder_init(hdr, encoding, cp, sub_size,
1803
84
                                              option, version, vv);
1804
84
    if (c->u.xdelta.sub_codec == NULL)
1805
6
        goto malformed;
1806
78
    cp += sub_size;
1807
1808
78
    if (cp - data != size) {
1809
12
    malformed:
1810
12
        fprintf(stderr, "Malformed xdelta header stream\n");
1811
12
        cram_xdelta_decode_free(c);
1812
12
        return NULL;
1813
6
    }
1814
1815
72
    return c;
1816
78
}
1817
1818
0
int cram_xdelta_encode_flush(cram_codec *c) {
1819
0
    int r = -1;
1820
0
    cram_block *b = cram_new_block(0, 0);
1821
0
    if (!b)
1822
0
        return -1;
1823
1824
0
    switch (c->u.e_xdelta.word_size) {
1825
0
    case 2: {
1826
        // Delta + zigzag transform.
1827
        // Subtracting two 8-bit values has a 9-bit result (-255 to 255).
1828
        // However think of it as turning a wheel clockwise or anti-clockwise.
1829
        // If it has 256 gradations then a -ve rotation followed by a +ve
1830
        // rotation of the same amount reverses it regardless.
1831
        //
1832
        // Similarly the zig-zag transformation doesn't invent any extra bits,
1833
        // so the entire thing can be done in-situ.  This may permit faster
1834
        // SIMD loops if we break apart the steps.
1835
1836
        // uint16_t last = 0, d;
1837
        // for (i = 0; i < n; i++) {
1838
        //     d = io[i] - last;
1839
        //     last = io[i];
1840
        //     io[i] = zigzag16(vd);
1841
        // }
1842
1843
        // --- vs ---
1844
1845
        // for (i = n-1; i >= 1; i--)
1846
        //     io[i] -= io[i-1];
1847
        // for (i = 0; i < n; i++)
1848
        //     io[i] = zigzag16(io[i]);
1849
1850
        // varint: need array variant for speed here.
1851
        // With zig-zag
1852
0
        int i, n = BLOCK_SIZE(c->out)/2;;
1853
0
        uint16_t *dat = (uint16_t *)BLOCK_DATA(c->out), last = 0;
1854
1855
0
        if (n*2 < BLOCK_SIZE(c->out)) {
1856
            // half word
1857
0
            last = *(uint8_t *)dat;
1858
0
            c->vv->varint_put32_blk(b, zigzag16(last));
1859
0
            dat = (uint16_t *)(((uint8_t *)dat)+1);
1860
0
        }
1861
1862
0
        for (i = 0; i < n; i++) {
1863
0
            uint16_t d = dat[i] - last; // possibly unaligned
1864
0
            last = dat[i];
1865
0
            c->vv->varint_put32_blk(b, zigzag16(d));
1866
0
        }
1867
1868
0
        break;
1869
0
    }
1870
1871
0
    case 4: {
1872
0
        int i, n = BLOCK_SIZE(c->out)/4;;
1873
0
        uint32_t *dat = (uint32_t *)BLOCK_DATA(c->out), last = 0;
1874
1875
0
        for (i = 0; i < n; i++) {
1876
0
            uint32_t d = dat[i] - last;
1877
0
            last = dat[i];
1878
0
            c->vv->varint_put32_blk(b, zigzag32(d));
1879
0
        }
1880
1881
0
        break;
1882
0
    }
1883
1884
0
    case 1: {
1885
0
        int i, n = BLOCK_SIZE(c->out);;
1886
0
        uint8_t *dat = (uint8_t *)BLOCK_DATA(c->out), last = 0;
1887
1888
0
        for (i = 0; i < n; i++) {
1889
0
            uint32_t d = dat[i] - last;
1890
0
            last = dat[i];
1891
0
            c->vv->varint_put32_blk(b, zigzag8(d));
1892
0
        }
1893
1894
0
        break;
1895
0
    }
1896
1897
0
    default:
1898
0
        goto err;
1899
0
    }
1900
1901
0
    if (c->u.e_xdelta.sub_codec->encode(NULL, c->u.e_xdelta.sub_codec,
1902
0
                                      (char *)b->data, b->byte))
1903
0
        goto err;
1904
1905
0
    r = 0;
1906
1907
0
 err:
1908
0
    cram_free_block(b);
1909
0
    return r;
1910
1911
0
}
1912
1913
int cram_xdelta_encode_store(cram_codec *c, cram_block *b,
1914
0
                            char *prefix, int version) {
1915
0
    int len = 0, r = 0, n;
1916
1917
0
    if (prefix) {
1918
0
        size_t l = strlen(prefix);
1919
0
        BLOCK_APPEND(b, prefix, l);
1920
0
        len += l;
1921
0
    }
1922
1923
    // Store sub-codec
1924
0
    cram_codec *tc = c->u.e_xdelta.sub_codec;
1925
0
    cram_block *tb = cram_new_block(0, 0);
1926
0
    if (!tb)
1927
0
        return -1;
1928
0
    int len2 = tc->store(tc, tb, NULL, version);
1929
1930
0
    len += (n = c->vv->varint_put32_blk(b, c->codec)); r |= n;
1931
1932
    // codec length
1933
0
    len += (n = c->vv->varint_put32_blk(b, c->vv->varint_size(c->u.e_xdelta.word_size)
1934
0
                                        + len2)); r |= n;
1935
1936
    // This and sub-codec
1937
0
    len += (n = c->vv->varint_put32_blk(b, c->u.e_xdelta.word_size)); r |= n;
1938
0
    BLOCK_APPEND(b, BLOCK_DATA(tb), BLOCK_SIZE(tb));
1939
1940
0
    cram_free_block(tb);
1941
1942
0
    return r > 0 ? len + len2 : -1;
1943
1944
0
 block_err:
1945
0
    return -1;
1946
0
}
1947
1948
// Same as cram_beta_encode_long
1949
int cram_xdelta_encode_long(cram_slice *slice, cram_codec *c,
1950
0
                           char *in, int in_size) {
1951
0
    return -1;
1952
0
}
1953
1954
int cram_xdelta_encode_int(cram_slice *slice, cram_codec *c,
1955
0
                          char *in, int in_size) {
1956
0
    return -1;
1957
0
}
1958
1959
int cram_xdelta_encode_char(cram_slice *slice, cram_codec *c,
1960
0
                            char *in, int in_size) {
1961
0
    char *dat = malloc(in_size*5);
1962
0
    if (!dat)
1963
0
        return -1;
1964
0
    char *cp = dat, *cp_end = dat + in_size*5;
1965
1966
0
    c->u.e_xdelta.last = 0; // reset for each new array
1967
0
    if (c->u.e_xdelta.word_size == 2) {
1968
0
        int i, part;
1969
1970
0
        part = in_size%2;
1971
0
        if (part) {
1972
0
            uint16_t z = in[0];
1973
0
            c->u.e_xdelta.last = le_int2(z);
1974
0
            cp += c->vv->varint_put32(cp, cp_end, zigzag16(c->u.e_xdelta.last));
1975
0
        }
1976
1977
0
        uint16_t *in16 = (uint16_t *)(in+part);
1978
0
        for (i = 0; i < in_size/2; i++) {
1979
0
            uint16_t d = le_int2(in16[i]) - c->u.e_xdelta.last;
1980
0
            c->u.e_xdelta.last = le_int2(in16[i]);
1981
0
            cp += c->vv->varint_put32(cp, cp_end, zigzag16(d));
1982
0
        }
1983
0
    }
1984
0
    if (c->u.e_xdelta.sub_codec->encode(slice, c->u.e_xdelta.sub_codec,
1985
0
                                      (char *)dat, cp-dat)) {
1986
0
        free(dat);
1987
0
        return -1;
1988
0
    }
1989
1990
0
    free(dat);
1991
0
    return 0;
1992
0
}
1993
1994
0
void cram_xdelta_encode_free(cram_codec *c) {
1995
0
    if (!c) return;
1996
1997
0
    if (c->u.e_xdelta.sub_codec)
1998
0
        c->u.e_xdelta.sub_codec->free(c->u.e_xdelta.sub_codec);
1999
2000
0
    cram_free_block(c->out);
2001
2002
0
    free(c);
2003
0
}
2004
2005
cram_codec *cram_xdelta_encode_init(cram_stats *st,
2006
                                    enum cram_encoding codec,
2007
                                    enum cram_external_type option,
2008
                                    void *dat,
2009
0
                                    int version, varint_vec *vv) {
2010
0
    cram_codec *c;
2011
2012
0
    if (!(c = malloc(sizeof(*c))))
2013
0
        return NULL;
2014
2015
0
    c->codec  = E_XDELTA;
2016
0
    c->free   = cram_xdelta_encode_free;
2017
0
    if (option == E_LONG)
2018
0
        c->encode = cram_xdelta_encode_long;
2019
0
    else if (option == E_INT)
2020
0
        c->encode = cram_xdelta_encode_int;
2021
0
    else
2022
0
        c->encode = cram_xdelta_encode_char;
2023
0
    c->store  = cram_xdelta_encode_store;
2024
0
    c->flush  = cram_xdelta_encode_flush;
2025
2026
0
    cram_xdelta_encoder *e = (cram_xdelta_encoder *)dat;
2027
0
    c->u.e_xdelta.word_size = e->word_size;
2028
0
    c->u.e_xdelta.last = 0;
2029
0
    c->u.e_xdelta.sub_codec = cram_encoder_init(e->sub_encoding, NULL,
2030
0
                                                E_BYTE_ARRAY,
2031
0
                                                e->sub_codec_dat,
2032
0
                                                version, vv);
2033
2034
0
    return c;
2035
0
}
2036
2037
/*
2038
 * ---------------------------------------------------------------------------
2039
 * XRLE
2040
 *
2041
 * This also has the additional requirement that the data series is not
2042
 * interleaved with another, permitting efficient encoding and decoding
2043
 * of all elements enmasse instead of needing to only extract the bits
2044
 * necessary per item.
2045
 */
2046
0
int cram_xrle_decode_long(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
2047
    // TODO if and when needed
2048
0
    return -1;
2049
0
}
2050
2051
0
int cram_xrle_decode_int(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
2052
    // TODO if and when needed
2053
0
    return -1;
2054
0
}
2055
2056
// Expands an XRLE transform and caches result in slice->block_by_id[]
2057
0
static int cram_xrle_decode_expand_char(cram_slice *slice, cram_codec *c) {
2058
0
    cram_block *b = slice->block_by_id[512 + c->codec_id];
2059
0
    if (b)
2060
0
        return 0;
2061
2062
0
    b = slice->block_by_id[512 + c->codec_id] = cram_new_block(0, 0);
2063
0
    if (!b)
2064
0
        return -1;
2065
0
    cram_block *lit_b = c->u.xrle.lit_codec->get_block(slice, c->u.xrle.lit_codec);
2066
0
    if (!lit_b)
2067
0
        return -1;
2068
0
    unsigned char *lit_dat = lit_b->data;
2069
0
    unsigned int lit_sz = lit_b->uncomp_size;
2070
0
    unsigned int len_sz = c->u.xrle.len_codec->size(slice, c->u.xrle.len_codec);
2071
2072
0
    cram_block *len_b = c->u.xrle.len_codec->get_block(slice, c->u.xrle.len_codec);
2073
0
    if (!len_b)
2074
0
        return -1;
2075
0
    unsigned char *len_dat = len_b->data;
2076
2077
0
    uint8_t rle_syms[256];
2078
0
    int rle_nsyms = 0;
2079
0
    int i;
2080
0
    for (i = 0; i < 256; i++) {
2081
0
        if (c->u.xrle.rep_score[i] > 0)
2082
0
            rle_syms[rle_nsyms++] = i;
2083
0
    }
2084
2085
0
    uint64_t out_sz;
2086
0
    int nb = var_get_u64(len_dat, len_dat+len_sz, &out_sz);
2087
0
    if (!(b->data = malloc(out_sz)))
2088
0
        return -1;
2089
0
    hts_rle_decode(lit_dat, lit_sz,
2090
0
                   len_dat+nb, len_sz-nb,
2091
0
                   rle_syms, rle_nsyms,
2092
0
                   b->data, &out_sz);
2093
0
    b->uncomp_size = out_sz;
2094
2095
0
    return 0;
2096
0
}
2097
2098
0
int cram_xrle_decode_size(cram_slice *slice, cram_codec *c) {
2099
0
    cram_xrle_decode_expand_char(slice, c);
2100
0
    return slice->block_by_id[512 + c->codec_id]->uncomp_size;
2101
0
}
2102
2103
0
cram_block *cram_xrle_get_block(cram_slice *slice, cram_codec *c) {
2104
0
    cram_xrle_decode_expand_char(slice, c);
2105
0
    return slice->block_by_id[512 + c->codec_id];
2106
0
}
2107
2108
0
int cram_xrle_decode_char(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
2109
0
    int n = *out_size;
2110
2111
0
    cram_xrle_decode_expand_char(slice, c);
2112
0
    cram_block *b = slice->block_by_id[512 + c->codec_id];
2113
2114
0
    memcpy(out, b->data + b->idx, n);
2115
0
    b->idx += n;
2116
0
    return 0;
2117
2118
    // Old code when not cached
2119
0
    while (n > 0) {
2120
0
        if (c->u.xrle.cur_len == 0) {
2121
0
            unsigned char lit;
2122
0
            int one = 1;
2123
0
            if (c->u.xrle.lit_codec->decode(slice, c->u.xrle.lit_codec, in,
2124
0
                                          (char *)&lit, &one) < 0)
2125
0
                return -1;
2126
0
            c->u.xrle.cur_lit = lit;
2127
2128
0
            if (c->u.xrle.rep_score[lit] > 0) {
2129
0
                if (c->u.xrle.len_codec->decode(slice, c->u.xrle.len_codec, in,
2130
0
                                              (char *)&c->u.xrle.cur_len, &one) < 0)
2131
0
                    return -1;
2132
0
            } // else cur_len still zero
2133
            //else fprintf(stderr, "%d\n", lit);
2134
2135
0
            c->u.xrle.cur_len++;
2136
0
        }
2137
2138
0
        if (n >= c->u.xrle.cur_len) {
2139
0
            memset(out, c->u.xrle.cur_lit, c->u.xrle.cur_len);
2140
0
            out += c->u.xrle.cur_len;
2141
0
            n -= c->u.xrle.cur_len;
2142
0
            c->u.xrle.cur_len = 0;
2143
0
        } else {
2144
0
            memset(out, c->u.xrle.cur_lit, n);
2145
0
            out += n;
2146
0
            c->u.xrle.cur_len -= n;
2147
0
            n = 0;
2148
0
        }
2149
0
    }
2150
2151
0
    return 0;
2152
0
}
2153
2154
3
void cram_xrle_decode_free(cram_codec *c) {
2155
3
    if (!c) return;
2156
2157
3
    if (c->u.xrle.len_codec)
2158
0
        c->u.xrle.len_codec->free(c->u.xrle.len_codec);
2159
2160
3
    if (c->u.xrle.lit_codec)
2161
0
        c->u.xrle.lit_codec->free(c->u.xrle.lit_codec);
2162
2163
3
    free(c);
2164
3
}
2165
2166
cram_codec *cram_xrle_decode_init(cram_block_compression_hdr *hdr,
2167
                                  char *data, int size,
2168
                                  enum cram_encoding codec,
2169
                                  enum cram_external_type option,
2170
9
                                  int version, varint_vec *vv) {
2171
9
    cram_codec *c;
2172
9
    char *cp = data;
2173
9
    char *endp = data+size;
2174
9
    int err = 0;
2175
2176
9
    if (!(c = calloc(1, sizeof(*c))))
2177
0
        return NULL;
2178
2179
9
    c->codec  = E_XRLE;
2180
9
    if (option == E_LONG)
2181
0
        c->decode = cram_xrle_decode_long;
2182
9
    else if (option == E_INT)
2183
0
        c->decode = cram_xrle_decode_int;
2184
9
    else if (option == E_BYTE_ARRAY || option == E_BYTE)
2185
3
        c->decode = cram_xrle_decode_char;
2186
6
    else {
2187
6
        fprintf(stderr, "BYTE_ARRAYs not supported by this codec\n");
2188
6
        free(c);
2189
6
        return NULL;
2190
6
    }
2191
3
    c->free   = cram_xrle_decode_free;
2192
3
    c->size   = cram_xrle_decode_size;
2193
3
    c->get_block = cram_xrle_get_block;
2194
3
    c->describe = NULL;
2195
3
    c->u.xrle.cur_len = 0;
2196
3
    c->u.xrle.cur_lit = -1;
2197
2198
    // RLE map
2199
3
    int i, j, nrle = vv->varint_get32(&cp, endp, &err);
2200
3
    memset(c->u.xrle.rep_score, 0, 256*sizeof(*c->u.xrle.rep_score));
2201
18
    for (i = 0; i < nrle && i < 256; i++) {
2202
15
        j = vv->varint_get32(&cp, endp, &err);
2203
15
        if (j >= 0 && j < 256)
2204
15
            c->u.xrle.rep_score[j] = 1;
2205
15
    }
2206
2207
    // Length and literal sub encodings
2208
3
    c->u.xrle.len_encoding = vv->varint_get32(&cp, endp, &err);
2209
3
    int sub_size = vv->varint_get32(&cp, endp, &err);
2210
3
    if (sub_size < 0 || endp - cp < sub_size)
2211
3
        goto malformed;
2212
0
    c->u.xrle.len_codec = cram_decoder_init(hdr, c->u.xrle.len_encoding,
2213
0
                                            cp, sub_size, E_INT, version, vv);
2214
0
    if (c->u.xrle.len_codec == NULL)
2215
0
        goto malformed;
2216
0
    cp += sub_size;
2217
2218
0
    c->u.xrle.lit_encoding = vv->varint_get32(&cp, endp, &err);
2219
0
    sub_size = vv->varint_get32(&cp, endp, &err);
2220
0
    if (sub_size < 0 || endp - cp < sub_size)
2221
0
        goto malformed;
2222
0
    c->u.xrle.lit_codec = cram_decoder_init(hdr, c->u.xrle.lit_encoding,
2223
0
                                            cp, sub_size, option, version, vv);
2224
0
    if (c->u.xrle.lit_codec == NULL)
2225
0
        goto malformed;
2226
0
    cp += sub_size;
2227
2228
0
    if (err)
2229
0
        goto malformed;
2230
2231
0
    return c;
2232
2233
3
 malformed:
2234
3
    fprintf(stderr, "Malformed xrle header stream\n");
2235
3
    cram_xrle_decode_free(c);
2236
3
    return NULL;
2237
0
}
2238
2239
0
int cram_xrle_encode_flush(cram_codec *c) {
2240
0
    uint8_t *out_lit, *out_len;
2241
0
    uint64_t out_lit_size, out_len_size;
2242
0
    uint8_t rle_syms[256];
2243
0
    int rle_nsyms = 0, i;
2244
2245
0
    for (i = 0; i < 256; i++)
2246
0
        if (c->u.e_xrle.rep_score[i] > 0)
2247
0
            rle_syms[rle_nsyms++] = i;
2248
2249
0
    if (!c->u.e_xrle.to_flush) {
2250
0
        c->u.e_xrle.to_flush = (char *)BLOCK_DATA(c->out);
2251
0
        c->u.e_xrle.to_flush_size = BLOCK_SIZE(c->out);
2252
0
    }
2253
2254
0
    out_len = malloc(c->u.e_xrle.to_flush_size+8);
2255
0
    if (!out_len)
2256
0
        return -1;
2257
2258
0
    int nb = var_put_u64(out_len, NULL, c->u.e_xrle.to_flush_size);
2259
2260
0
    out_lit = hts_rle_encode((uint8_t *)c->u.e_xrle.to_flush, c->u.e_xrle.to_flush_size,
2261
0
                             out_len+nb, &out_len_size,
2262
0
                             rle_syms, &rle_nsyms,
2263
0
                             NULL, &out_lit_size);
2264
0
    out_len_size += nb;
2265
2266
2267
    // TODO: can maybe "gift" the sub codec the data block, to remove
2268
    // one level of memcpy.
2269
0
    if (c->u.e_xrle.len_codec->encode(NULL,
2270
0
                                      c->u.e_xrle.len_codec,
2271
0
                                      (char *)out_len, out_len_size))
2272
0
        return -1;
2273
2274
0
    if (c->u.e_xrle.lit_codec->encode(NULL,
2275
0
                                      c->u.e_xrle.lit_codec,
2276
0
                                      (char *)out_lit, out_lit_size))
2277
0
        return -1;
2278
2279
0
    free(out_len);
2280
0
    free(out_lit);
2281
2282
0
    return 0;
2283
0
}
2284
2285
int cram_xrle_encode_store(cram_codec *c, cram_block *b,
2286
0
                            char *prefix, int version) {
2287
0
    int len = 0, r = 0, n;
2288
0
    cram_codec *tc;
2289
0
    cram_block *b_rle, *b_len, *b_lit;
2290
2291
0
    if (prefix) {
2292
0
        size_t l = strlen(prefix);
2293
0
        BLOCK_APPEND(b, prefix, l);
2294
0
        len += l;
2295
0
    }
2296
2297
    // List of symbols to RLE
2298
0
    b_rle = cram_new_block(0, 0);
2299
0
    if (!b_rle)
2300
0
        return -1;
2301
0
    int i, nrle = 0, len1 = 0;
2302
0
    for (i = 0; i < 256; i++) {
2303
0
        if (c->u.e_xrle.rep_score[i] > 0) {
2304
0
            nrle++;
2305
0
            len1 += (n = c->vv->varint_put32_blk(b_rle,i)); r |= n;
2306
0
        }
2307
0
    }
2308
2309
    // Store length and literal sub-codecs to get encoded length
2310
0
    tc = c->u.e_xrle.len_codec;
2311
0
    b_len = cram_new_block(0, 0);
2312
0
    if (!b_len)
2313
0
        return -1;
2314
0
    int len2 = tc->store(tc, b_len, NULL, version);
2315
2316
0
    tc = c->u.e_xrle.lit_codec;
2317
0
    b_lit = cram_new_block(0, 0);
2318
0
    if (!b_lit)
2319
0
        return -1;
2320
0
    int len3 = tc->store(tc, b_lit, NULL, version);
2321
2322
0
    len += (n = c->vv->varint_put32_blk(b, c->codec)); r |= n;
2323
0
    len += (n = c->vv->varint_put32_blk(b, len1 + len2 + len3
2324
0
                                        + c->vv->varint_size(nrle))); r |= n;
2325
0
    len += (n = c->vv->varint_put32_blk(b, nrle)); r |= n;
2326
0
    BLOCK_APPEND(b, BLOCK_DATA(b_rle), BLOCK_SIZE(b_rle));
2327
0
    BLOCK_APPEND(b, BLOCK_DATA(b_len), BLOCK_SIZE(b_len));
2328
0
    BLOCK_APPEND(b, BLOCK_DATA(b_lit), BLOCK_SIZE(b_lit));
2329
2330
0
    cram_free_block(b_rle);
2331
0
    cram_free_block(b_len);
2332
0
    cram_free_block(b_lit);
2333
2334
0
    if (r > 0)
2335
0
        return len + len1 + len2 + len3;
2336
2337
0
 block_err:
2338
0
    return -1;
2339
0
}
2340
2341
int cram_xrle_encode_long(cram_slice *slice, cram_codec *c,
2342
0
                           char *in, int in_size) {
2343
    // TODO if and when needed
2344
0
    return -1;
2345
0
}
2346
2347
int cram_xrle_encode_int(cram_slice *slice, cram_codec *c,
2348
0
                          char *in, int in_size) {
2349
    // TODO if and when needed
2350
0
    return -1;
2351
0
}
2352
2353
int cram_xrle_encode_char(cram_slice *slice, cram_codec *c,
2354
0
                          char *in, int in_size) {
2355
0
    if (c->u.e_xrle.to_flush) {
2356
0
        if (!c->out && !(c->out = cram_new_block(0, 0)))
2357
0
            return -1;
2358
0
        BLOCK_APPEND(c->out, c->u.e_xrle.to_flush, c->u.e_xrle.to_flush_size);
2359
0
        c->u.e_xrle.to_flush = NULL;
2360
0
        c->u.e_xrle.to_flush_size = 0;
2361
0
    }
2362
2363
0
    if (c->out && BLOCK_SIZE(c->out) > 0) {
2364
        // Gathering data
2365
0
        BLOCK_APPEND(c->out, in, in_size);
2366
0
        return 0;
2367
0
    }
2368
2369
    // else cache copy of the data we're about to send to flush instead.
2370
0
    c->u.e_xrle.to_flush = in;
2371
0
    c->u.e_xrle.to_flush_size = in_size;
2372
0
    return 0;
2373
2374
0
 block_err:
2375
0
    return -1;
2376
0
}
2377
2378
0
void cram_xrle_encode_free(cram_codec *c) {
2379
0
    if (!c) return;
2380
2381
0
    if (c->u.e_xrle.len_codec)
2382
0
        c->u.e_xrle.len_codec->free(c->u.e_xrle.len_codec);
2383
0
    if (c->u.e_xrle.lit_codec)
2384
0
        c->u.e_xrle.lit_codec->free(c->u.e_xrle.lit_codec);
2385
2386
0
    cram_free_block(c->out);
2387
2388
0
    free(c);
2389
0
}
2390
2391
cram_codec *cram_xrle_encode_init(cram_stats *st,
2392
                                  enum cram_encoding codec,
2393
                                  enum cram_external_type option,
2394
                                  void *dat,
2395
0
                                  int version, varint_vec *vv) {
2396
0
    cram_codec *c;
2397
2398
0
    if (!(c = malloc(sizeof(*c))))
2399
0
        return NULL;
2400
2401
0
    c->codec  = E_XRLE;
2402
0
    c->free   = cram_xrle_encode_free;
2403
0
    if (option == E_LONG)
2404
0
        c->encode = cram_xrle_encode_long;
2405
0
    else if (option == E_INT)
2406
0
        c->encode = cram_xrle_encode_int;
2407
0
    else
2408
0
        c->encode = cram_xrle_encode_char;
2409
0
    c->store  = cram_xrle_encode_store;
2410
0
    c->flush  = cram_xrle_encode_flush;
2411
2412
0
    cram_xrle_encoder *e = (cram_xrle_encoder *)dat;
2413
2414
0
    c->u.e_xrle.len_codec = cram_encoder_init(e->len_encoding, NULL,
2415
0
                                              E_BYTE, e->len_dat,
2416
0
                                              version, vv);
2417
0
    c->u.e_xrle.lit_codec = cram_encoder_init(e->lit_encoding, NULL,
2418
0
                                              E_BYTE, e->lit_dat,
2419
0
                                              version, vv);
2420
0
    c->u.e_xrle.cur_lit = -1;
2421
0
    c->u.e_xrle.cur_len = -1;
2422
0
    c->u.e_xrle.to_flush = NULL;
2423
0
    c->u.e_xrle.to_flush_size = 0;
2424
2425
0
    memcpy(c->u.e_xrle.rep_score, e->rep_score, 256*sizeof(*c->u.e_xrle.rep_score));
2426
2427
0
    return c;
2428
0
}
2429
2430
/*
2431
 * ---------------------------------------------------------------------------
2432
 * SUBEXP
2433
 */
2434
0
int cram_subexp_decode(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
2435
0
    int32_t *out_i = (int32_t *)out;
2436
0
    int n, count;
2437
0
    int k = c->u.subexp.k;
2438
2439
0
    for (count = 0, n = *out_size; count < n; count++) {
2440
0
        int i = 0, tail;
2441
0
        int val;
2442
2443
        /* Get number of 1s */
2444
        //while (get_bit_MSB(in) == 1) i++;
2445
0
        i = get_one_bits_MSB(in);
2446
0
        if (i < 0 || cram_not_enough_bits(in, i > 0 ? i + k - 1 : k))
2447
0
            return -1;
2448
        /*
2449
         * Val is
2450
         * i > 0:  2^(k+i-1) + k+i-1 bits
2451
         * i = 0:  k bits
2452
         */
2453
0
        if (i) {
2454
0
            tail = i + k-1;
2455
0
            val = 0;
2456
0
            while (tail) {
2457
                //val = val<<1; val |= get_bit_MSB(in);
2458
0
                GET_BIT_MSB(in, val);
2459
0
                tail--;
2460
0
            }
2461
0
            val += 1 << (i + k-1);
2462
0
        } else {
2463
0
            tail = k;
2464
0
            val = 0;
2465
0
            while (tail) {
2466
                //val = val<<1; val |= get_bit_MSB(in);
2467
0
                GET_BIT_MSB(in, val);
2468
0
                tail--;
2469
0
            }
2470
0
        }
2471
2472
0
        out_i[count] = val - c->u.subexp.offset;
2473
0
    }
2474
2475
0
    return 0;
2476
0
}
2477
2478
1.15k
void cram_subexp_decode_free(cram_codec *c) {
2479
1.15k
    if (c)
2480
1.15k
        free(c);
2481
1.15k
}
2482
2483
0
int cram_subexp_describe(cram_codec *c, kstring_t *ks) {
2484
0
    return ksprintf(ks, "SUBEXP(offset=%d,k=%d)",
2485
0
                    c->u.subexp.offset,
2486
0
                    c->u.subexp.k)
2487
0
        < 0 ? -1 : 0;
2488
0
}
2489
2490
cram_codec *cram_subexp_decode_init(cram_block_compression_hdr *hdr,
2491
                                    char *data, int size,
2492
                                    enum cram_encoding codec,
2493
                                    enum cram_external_type option,
2494
1.15k
                                    int version, varint_vec *vv) {
2495
1.15k
    cram_codec *c;
2496
1.15k
    char *cp = data;
2497
2498
1.15k
    if (option != E_INT) {
2499
0
        hts_log_error("This codec only supports INT encodings");
2500
0
        return NULL;
2501
0
    }
2502
2503
1.15k
    if (!(c = malloc(sizeof(*c))))
2504
0
        return NULL;
2505
2506
1.15k
    c->codec  = E_SUBEXP;
2507
1.15k
    c->decode = cram_subexp_decode;
2508
1.15k
    c->free   = cram_subexp_decode_free;
2509
1.15k
    c->describe = cram_subexp_describe;
2510
1.15k
    c->u.subexp.k = -1;
2511
2512
1.15k
    c->u.subexp.offset = vv->varint_get32(&cp, data + size, NULL);
2513
1.15k
    c->u.subexp.k      = vv->varint_get32(&cp, data + size, NULL);
2514
2515
1.15k
    if (cp - data != size || c->u.subexp.k < 0) {
2516
0
        hts_log_error("Malformed subexp header stream");
2517
0
        free(c);
2518
0
        return NULL;
2519
0
    }
2520
2521
1.15k
    return c;
2522
1.15k
}
2523
2524
/*
2525
 * ---------------------------------------------------------------------------
2526
 * GAMMA
2527
 */
2528
0
int cram_gamma_decode(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
2529
0
    int32_t *out_i = (int32_t *)out;
2530
0
    int i, n;
2531
2532
0
    for (i = 0, n = *out_size; i < n; i++) {
2533
0
        int nz = 0;
2534
0
        int val;
2535
        //while (get_bit_MSB(in) == 0) nz++;
2536
0
        nz = get_zero_bits_MSB(in);
2537
0
        if (cram_not_enough_bits(in, nz))
2538
0
            return -1;
2539
0
        val = 1;
2540
0
        while (nz > 0) {
2541
            //val <<= 1; val |= get_bit_MSB(in);
2542
0
            GET_BIT_MSB(in, val);
2543
0
            nz--;
2544
0
        }
2545
2546
0
        out_i[i] = val - c->u.gamma.offset;
2547
0
    }
2548
2549
0
    return 0;
2550
0
}
2551
2552
1.28k
void cram_gamma_decode_free(cram_codec *c) {
2553
1.28k
    if (c)
2554
1.28k
        free(c);
2555
1.28k
}
2556
2557
0
int cram_gamma_describe(cram_codec *c, kstring_t *ks) {
2558
0
    return ksprintf(ks, "GAMMA(offset=%d)", c->u.subexp.offset)
2559
0
        < 0 ? -1 : 0;
2560
0
}
2561
2562
cram_codec *cram_gamma_decode_init(cram_block_compression_hdr *hdr,
2563
                                   char *data, int size,
2564
                                   enum cram_encoding codec,
2565
                                   enum cram_external_type option,
2566
1.28k
                                   int version, varint_vec *vv) {
2567
1.28k
    cram_codec *c = NULL;
2568
1.28k
    char *cp = data;
2569
2570
1.28k
    if (option != E_INT) {
2571
0
        hts_log_error("This codec only supports INT encodings");
2572
0
        return NULL;
2573
0
    }
2574
2575
1.28k
    if (size < 1)
2576
3
        goto malformed;
2577
2578
1.28k
    if (!(c = malloc(sizeof(*c))))
2579
0
        return NULL;
2580
2581
1.28k
    c->codec  = E_GAMMA;
2582
1.28k
    c->decode = cram_gamma_decode;
2583
1.28k
    c->free   = cram_gamma_decode_free;
2584
1.28k
    c->describe = cram_gamma_describe;
2585
2586
1.28k
    c->u.gamma.offset = vv->varint_get32(&cp, data+size, NULL);
2587
2588
1.28k
    if (cp - data != size)
2589
2
        goto malformed;
2590
2591
1.28k
    return c;
2592
2593
5
 malformed:
2594
5
    hts_log_error("Malformed gamma header stream");
2595
5
    free(c);
2596
5
    return NULL;
2597
1.28k
}
2598
2599
/*
2600
 * ---------------------------------------------------------------------------
2601
 * HUFFMAN
2602
 */
2603
2604
267
static int code_sort(const void *vp1, const void *vp2) {
2605
267
    const cram_huffman_code *c1 = (const cram_huffman_code *)vp1;
2606
267
    const cram_huffman_code *c2 = (const cram_huffman_code *)vp2;
2607
2608
267
    if (c1->len != c2->len)
2609
3
        return c1->len - c2->len;
2610
264
    else
2611
264
        return c1->symbol < c2->symbol ? -1 : (c1->symbol > c2->symbol ? 1 : 0);
2612
267
}
2613
2614
615
void cram_huffman_decode_free(cram_codec *c) {
2615
615
    if (!c)
2616
0
        return;
2617
2618
615
    if (c->u.huffman.codes)
2619
450
        free(c->u.huffman.codes);
2620
615
    free(c);
2621
615
}
2622
2623
int cram_huffman_decode_null(cram_slice *slice, cram_codec *c,
2624
0
                             cram_block *in, char *out, int *out_size) {
2625
0
    return -1;
2626
0
}
2627
2628
int cram_huffman_decode_char0(cram_slice *slice, cram_codec *c,
2629
0
                              cram_block *in, char *out, int *out_size) {
2630
0
    int i, n;
2631
2632
0
    if (!out)
2633
0
        return 0;
2634
2635
    /* Special case of 0 length codes */
2636
0
    for (i = 0, n = *out_size; i < n; i++) {
2637
0
        out[i] = c->u.huffman.codes[0].symbol;
2638
0
    }
2639
0
    return 0;
2640
0
}
2641
2642
int cram_huffman_decode_char(cram_slice *slice, cram_codec *c,
2643
0
                             cram_block *in, char *out, int *out_size) {
2644
0
    int i, n, ncodes = c->u.huffman.ncodes;
2645
0
    const cram_huffman_code * const codes = c->u.huffman.codes;
2646
2647
0
    for (i = 0, n = *out_size; i < n; i++) {
2648
0
        int idx = 0;
2649
0
        int val = 0, len = 0, last_len = 0;
2650
2651
0
        for (;;) {
2652
0
            int dlen = codes[idx].len - last_len;
2653
0
            if (cram_not_enough_bits(in, dlen))
2654
0
                return -1;
2655
2656
            //val <<= dlen;
2657
            //val  |= get_bits_MSB(in, dlen);
2658
            //last_len = (len += dlen);
2659
2660
0
            last_len = (len += dlen);
2661
0
            for (; dlen; dlen--) GET_BIT_MSB(in, val);
2662
2663
0
            idx = val - codes[idx].p;
2664
0
            if (idx >= ncodes || idx < 0)
2665
0
                return -1;
2666
2667
0
            if (codes[idx].code == val && codes[idx].len == len) {
2668
0
                if (out) out[i] = codes[idx].symbol;
2669
0
                break;
2670
0
            }
2671
0
        }
2672
0
    }
2673
2674
0
    return 0;
2675
0
}
2676
2677
int cram_huffman_decode_int0(cram_slice *slice, cram_codec *c,
2678
0
                             cram_block *in, char *out, int *out_size) {
2679
0
    int32_t *out_i = (int32_t *)out;
2680
0
    int i, n;
2681
0
    const cram_huffman_code * const codes = c->u.huffman.codes;
2682
2683
    /* Special case of 0 length codes */
2684
0
    for (i = 0, n = *out_size; i < n; i++) {
2685
0
        out_i[i] = codes[0].symbol;
2686
0
    }
2687
0
    return 0;
2688
0
}
2689
2690
int cram_huffman_decode_int(cram_slice *slice, cram_codec *c,
2691
0
                            cram_block *in, char *out, int *out_size) {
2692
0
    int32_t *out_i = (int32_t *)out;
2693
0
    int i, n, ncodes = c->u.huffman.ncodes;
2694
0
    const cram_huffman_code * const codes = c->u.huffman.codes;
2695
2696
0
    for (i = 0, n = *out_size; i < n; i++) {
2697
0
        int idx = 0;
2698
0
        int val = 0, len = 0, last_len = 0;
2699
2700
        // Now one bit at a time for remaining checks
2701
0
        for (;;) {
2702
0
            int dlen = codes[idx].len - last_len;
2703
0
            if (cram_not_enough_bits(in, dlen))
2704
0
                return -1;
2705
2706
            //val <<= dlen;
2707
            //val  |= get_bits_MSB(in, dlen);
2708
            //last_len = (len += dlen);
2709
2710
0
            last_len = (len += dlen);
2711
0
            for (; dlen; dlen--) GET_BIT_MSB(in, val);
2712
2713
0
            idx = val - codes[idx].p;
2714
0
            if (idx >= ncodes || idx < 0)
2715
0
                return -1;
2716
2717
0
            if (codes[idx].code == val && codes[idx].len == len) {
2718
0
                out_i[i] = codes[idx].symbol;
2719
0
                break;
2720
0
            }
2721
0
        }
2722
0
    }
2723
2724
0
    return 0;
2725
0
}
2726
2727
int cram_huffman_decode_long0(cram_slice *slice, cram_codec *c,
2728
0
                              cram_block *in, char *out, int *out_size) {
2729
0
    int64_t *out_i = (int64_t *)out;
2730
0
    int i, n;
2731
0
    const cram_huffman_code * const codes = c->u.huffman.codes;
2732
2733
    /* Special case of 0 length codes */
2734
0
    for (i = 0, n = *out_size; i < n; i++) {
2735
0
        out_i[i] = codes[0].symbol;
2736
0
    }
2737
0
    return 0;
2738
0
}
2739
2740
int cram_huffman_decode_long(cram_slice *slice, cram_codec *c,
2741
0
                             cram_block *in, char *out, int *out_size) {
2742
0
    int64_t *out_i = (int64_t *)out;
2743
0
    int i, n, ncodes = c->u.huffman.ncodes;
2744
0
    const cram_huffman_code * const codes = c->u.huffman.codes;
2745
2746
0
    for (i = 0, n = *out_size; i < n; i++) {
2747
0
        int idx = 0;
2748
0
        int val = 0, len = 0, last_len = 0;
2749
2750
        // Now one bit at a time for remaining checks
2751
0
        for (;;) {
2752
0
            int dlen = codes[idx].len - last_len;
2753
0
            if (cram_not_enough_bits(in, dlen))
2754
0
                return -1;
2755
2756
            //val <<= dlen;
2757
            //val  |= get_bits_MSB(in, dlen);
2758
            //last_len = (len += dlen);
2759
2760
0
            last_len = (len += dlen);
2761
0
            for (; dlen; dlen--) GET_BIT_MSB(in, val);
2762
2763
0
            idx = val - codes[idx].p;
2764
0
            if (idx >= ncodes || idx < 0)
2765
0
                return -1;
2766
2767
0
            if (codes[idx].code == val && codes[idx].len == len) {
2768
0
                out_i[i] = codes[idx].symbol;
2769
0
                break;
2770
0
            }
2771
0
        }
2772
0
    }
2773
2774
0
    return 0;
2775
0
}
2776
2777
0
int cram_huffman_describe(cram_codec *c, kstring_t *ks) {
2778
0
    int r = 0, n;
2779
0
    r |= ksprintf(ks, "HUFFMAN(codes={") < 0;
2780
0
    for (n = 0; n < c->u.huffman.ncodes; n++) {
2781
0
        r |= ksprintf(ks, "%s%"PRId64, n?",":"",
2782
0
                      c->u.huffman.codes[n].symbol);
2783
0
    }
2784
0
    r |= ksprintf(ks, "},lengths={") < 0;
2785
0
    for (n = 0; n < c->u.huffman.ncodes; n++) {
2786
0
        r |= ksprintf(ks, "%s%d", n?",":"",
2787
0
                      c->u.huffman.codes[n].len);
2788
0
    }
2789
0
    r |= ksprintf(ks, "})") < 0;
2790
0
    return r;
2791
0
}
2792
2793
/*
2794
 * Initialises a huffman decoder from an encoding data stream.
2795
 */
2796
cram_codec *cram_huffman_decode_init(cram_block_compression_hdr *hdr,
2797
                                     char *data, int size,
2798
                                     enum cram_encoding codec,
2799
                                     enum cram_external_type option,
2800
644
                                     int version, varint_vec *vv) {
2801
644
    int32_t ncodes = 0, i, j;
2802
644
    char *cp = data, *data_end = &data[size];
2803
644
    cram_codec *h;
2804
644
    cram_huffman_code *codes = NULL;
2805
644
    int32_t val, last_len, max_len = 0;
2806
644
    uint32_t max_val; // needs one more bit than val
2807
644
    const int max_code_bits = sizeof(val) * 8 - 1;
2808
644
    int err = 0;
2809
2810
644
    if (option == E_BYTE_ARRAY_BLOCK) {
2811
0
        hts_log_error("BYTE_ARRAYs not supported by this codec");
2812
0
        return NULL;
2813
0
    }
2814
2815
644
    ncodes = vv->varint_get32(&cp, data_end, &err);
2816
644
    if (ncodes < 0) {
2817
0
        hts_log_error("Invalid number of symbols in huffman stream");
2818
0
        return NULL;
2819
0
    }
2820
644
    if (ncodes >= SIZE_MAX / sizeof(*codes)) {
2821
0
        errno = ENOMEM;
2822
0
        return NULL;
2823
0
    }
2824
644
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
2825
644
    if (ncodes > FUZZ_ALLOC_LIMIT / sizeof(*codes)) {
2826
6
        errno = ENOMEM;
2827
6
        return NULL;
2828
6
    }
2829
638
#endif
2830
638
    h = calloc(1, sizeof(*h));
2831
638
    if (!h)
2832
0
        return NULL;
2833
2834
638
    h->codec  = E_HUFFMAN;
2835
638
    h->free   = cram_huffman_decode_free;
2836
2837
638
    h->u.huffman.ncodes = ncodes;
2838
638
    h->u.huffman.option = option;
2839
638
    if (ncodes) {
2840
470
        codes = h->u.huffman.codes = malloc(ncodes * sizeof(*codes));
2841
470
        if (!codes) {
2842
0
            free(h);
2843
0
            return NULL;
2844
0
        }
2845
470
    } else {
2846
168
        codes = h->u.huffman.codes = NULL;
2847
168
    }
2848
2849
    /* Read symbols and bit-lengths */
2850
638
    if (option == E_LONG) {
2851
0
        for (i = 0; i < ncodes; i++)
2852
0
            codes[i].symbol = vv->varint_get64(&cp, data_end, &err);
2853
638
    } else if (option == E_INT || option == E_BYTE) {
2854
36.5k
        for (i = 0; i < ncodes; i++)
2855
35.9k
            codes[i].symbol = vv->varint_get32(&cp, data_end, &err);
2856
638
    } else {
2857
0
        goto malformed;
2858
0
    }
2859
2860
638
    if (err)
2861
11
        goto malformed;
2862
2863
627
    i = vv->varint_get32(&cp, data_end, &err);
2864
627
    if (i != ncodes)
2865
6
        goto malformed;
2866
2867
621
    if (ncodes == 0) {
2868
        /* NULL huffman stream.  Ensure it returns an error if
2869
           anything tries to use it. */
2870
165
        h->decode = cram_huffman_decode_null;
2871
165
        return h;
2872
165
    }
2873
2874
1.18k
    for (i = 0; i < ncodes; i++) {
2875
726
        codes[i].len = vv->varint_get32(&cp, data_end, &err);
2876
726
        if (err)
2877
0
            break;
2878
726
        if (codes[i].len < 0) {
2879
0
            hts_log_error("Huffman code length (%d) is negative", codes[i].len);
2880
0
            goto malformed;
2881
0
        }
2882
726
        if (max_len < codes[i].len)
2883
270
            max_len = codes[i].len;
2884
726
    }
2885
456
    if (err || cp - data != size || max_len >= ncodes)
2886
3
        goto malformed;
2887
2888
    /* 31 is max. bits available in val */
2889
453
    if (max_len > max_code_bits) {
2890
0
        hts_log_error("Huffman code length (%d) is greater "
2891
0
                      "than maximum supported (%d)", max_len, max_code_bits);
2892
0
        goto malformed;
2893
0
    }
2894
2895
    /* Sort by bit length and then by symbol value */
2896
453
    qsort(codes, ncodes, sizeof(*codes), code_sort);
2897
2898
    /* Assign canonical codes */
2899
453
    val = -1, last_len = 0, max_val = 0;
2900
1.17k
    for (i = 0; i < ncodes; i++) {
2901
720
        val++;
2902
720
        if (val > max_val)
2903
3
            goto malformed;
2904
2905
717
        if (codes[i].len > last_len) {
2906
264
            val <<= (codes[i].len - last_len);
2907
264
            last_len = codes[i].len;
2908
264
            max_val = (1U << codes[i].len) - 1;
2909
264
        }
2910
717
        codes[i].code = val;
2911
717
    }
2912
2913
    /*
2914
     * Compute the next starting point, offset by the i'th value.
2915
     * For example if codes 10, 11, 12, 13 are 30, 31, 32, 33 then
2916
     * codes[10..13].p = 30 - 10.
2917
     */
2918
450
    last_len = 0;
2919
1.16k
    for (i = j = 0; i < ncodes; i++) {
2920
714
        if (codes[i].len > last_len) {
2921
264
            j = codes[i].code - i;
2922
264
            last_len = codes[i].len;
2923
264
        }
2924
714
        codes[i].p = j;
2925
714
    }
2926
2927
    // puts("==HUFF LEN==");
2928
    // for (i = 0; i <= last_len+1; i++) {
2929
    //     printf("len %d=%d prefix %d\n", i, h->u.huffman.lengths[i], h->u.huffman.prefix[i]);
2930
    // }
2931
    // puts("===HUFFMAN CODES===");
2932
    // for (i = 0; i < ncodes; i++) {
2933
    //     int j;
2934
    //     printf("%d: %d %d %d ", i, codes[i].symbol, codes[i].len, codes[i].code);
2935
    //     j = codes[i].len;
2936
    //     while (j) {
2937
    //         putchar(codes[i].code & (1 << --j) ? '1' : '0');
2938
    //     }
2939
    //     printf(" %d\n", codes[i].code);
2940
    // }
2941
2942
450
    if (option == E_BYTE || option == E_BYTE_ARRAY) {
2943
228
        if (h->u.huffman.codes[0].len == 0)
2944
102
            h->decode = cram_huffman_decode_char0;
2945
126
        else
2946
126
            h->decode = cram_huffman_decode_char;
2947
228
    } else if (option == E_LONG || option == E_SLONG) {
2948
0
        if (h->u.huffman.codes[0].len == 0)
2949
0
            h->decode = cram_huffman_decode_long0;
2950
0
        else
2951
0
            h->decode = cram_huffman_decode_long;
2952
222
    } else if (option == E_INT || option == E_SINT || option == E_BYTE) {
2953
222
        if (h->u.huffman.codes[0].len == 0)
2954
84
            h->decode = cram_huffman_decode_int0;
2955
138
        else
2956
138
            h->decode = cram_huffman_decode_int;
2957
222
    } else {
2958
0
        return NULL;
2959
0
    }
2960
450
    h->describe = cram_huffman_describe;
2961
2962
450
    return (cram_codec *)h;
2963
2964
23
 malformed:
2965
23
    hts_log_error("Malformed huffman header stream");
2966
23
    free(codes);
2967
23
    free(h);
2968
23
    return NULL;
2969
450
}
2970
2971
int cram_huffman_encode_char0(cram_slice *slice, cram_codec *c,
2972
31.5k
                              char *in, int in_size) {
2973
31.5k
    return 0;
2974
31.5k
}
2975
2976
int cram_huffman_encode_char(cram_slice *slice, cram_codec *c,
2977
0
                             char *in, int in_size) {
2978
0
    int i, code, len, r = 0;
2979
0
    unsigned char *syms = (unsigned char *)in;
2980
2981
0
    while (in_size--) {
2982
0
        int sym = *syms++;
2983
0
        if (sym >= -1 && sym < MAX_HUFF) {
2984
0
            i = c->u.e_huffman.val2code[sym+1];
2985
0
            assert(c->u.e_huffman.codes[i].symbol == sym);
2986
0
            code = c->u.e_huffman.codes[i].code;
2987
0
            len  = c->u.e_huffman.codes[i].len;
2988
0
        } else {
2989
            /* Slow - use a lookup table for when sym < MAX_HUFF? */
2990
0
            for (i = 0; i < c->u.e_huffman.nvals; i++) {
2991
0
                if (c->u.e_huffman.codes[i].symbol == sym)
2992
0
                    break;
2993
0
            }
2994
0
            if (i == c->u.e_huffman.nvals)
2995
0
                return -1;
2996
2997
0
            code = c->u.e_huffman.codes[i].code;
2998
0
            len  = c->u.e_huffman.codes[i].len;
2999
0
        }
3000
3001
0
        r |= store_bits_MSB(c->out, code, len);
3002
0
    }
3003
3004
0
    return r;
3005
0
}
3006
3007
int cram_huffman_encode_int0(cram_slice *slice, cram_codec *c,
3008
36.9M
                             char *in, int in_size) {
3009
36.9M
    return 0;
3010
36.9M
}
3011
3012
int cram_huffman_encode_int(cram_slice *slice, cram_codec *c,
3013
0
                            char *in, int in_size) {
3014
0
    int i, code, len, r = 0;
3015
0
    int *syms = (int *)in;
3016
3017
0
    while (in_size--) {
3018
0
        int sym = *syms++;
3019
3020
0
        if (sym >= -1 && sym < MAX_HUFF) {
3021
0
            i = c->u.e_huffman.val2code[sym+1];
3022
0
            assert(c->u.e_huffman.codes[i].symbol == sym);
3023
0
            code = c->u.e_huffman.codes[i].code;
3024
0
            len  = c->u.e_huffman.codes[i].len;
3025
0
        } else {
3026
            /* Slow - use a lookup table for when sym < MAX_HUFFMAN_SYM? */
3027
0
            for (i = 0; i < c->u.e_huffman.nvals; i++) {
3028
0
                if (c->u.e_huffman.codes[i].symbol == sym)
3029
0
                    break;
3030
0
            }
3031
0
            if (i == c->u.e_huffman.nvals)
3032
0
                return -1;
3033
3034
0
            code = c->u.e_huffman.codes[i].code;
3035
0
            len  = c->u.e_huffman.codes[i].len;
3036
0
        }
3037
3038
0
        r |= store_bits_MSB(c->out, code, len);
3039
0
    }
3040
3041
0
    return r;
3042
0
}
3043
3044
int cram_huffman_encode_long0(cram_slice *slice, cram_codec *c,
3045
0
                              char *in, int in_size) {
3046
0
    return 0;
3047
0
}
3048
3049
int cram_huffman_encode_long(cram_slice *slice, cram_codec *c,
3050
0
                             char *in, int in_size) {
3051
0
    int i, code, len, r = 0;
3052
0
    int64_t *syms = (int64_t *)in;
3053
3054
0
    while (in_size--) {
3055
0
        int sym = *syms++;
3056
3057
0
        if (sym >= -1 && sym < MAX_HUFF) {
3058
0
            i = c->u.e_huffman.val2code[sym+1];
3059
0
            assert(c->u.e_huffman.codes[i].symbol == sym);
3060
0
            code = c->u.e_huffman.codes[i].code;
3061
0
            len  = c->u.e_huffman.codes[i].len;
3062
0
        } else {
3063
            /* Slow - use a lookup table for when sym < MAX_HUFFMAN_SYM? */
3064
0
            for (i = 0; i < c->u.e_huffman.nvals; i++) {
3065
0
                if (c->u.e_huffman.codes[i].symbol == sym)
3066
0
                    break;
3067
0
            }
3068
0
            if (i == c->u.e_huffman.nvals)
3069
0
                return -1;
3070
3071
0
            code = c->u.e_huffman.codes[i].code;
3072
0
            len  = c->u.e_huffman.codes[i].len;
3073
0
        }
3074
3075
0
        r |= store_bits_MSB(c->out, code, len);
3076
0
    }
3077
3078
0
    return r;
3079
0
}
3080
3081
410k
void cram_huffman_encode_free(cram_codec *c) {
3082
410k
    if (!c)
3083
0
        return;
3084
3085
410k
    if (c->u.e_huffman.codes)
3086
410k
        free(c->u.e_huffman.codes);
3087
410k
    free(c);
3088
410k
}
3089
3090
/*
3091
 * Encodes a huffman tree.
3092
 * Returns number of bytes written.
3093
 */
3094
int cram_huffman_encode_store(cram_codec *c, cram_block *b, char *prefix,
3095
409k
                              int version) {
3096
409k
    int i, len = 0, r = 0, n;
3097
409k
    cram_huffman_code *codes = c->u.e_huffman.codes;
3098
    /*
3099
     * Up to code length 127 means 2.5e+26 bytes of data required (worst
3100
     * case huffman tree needs symbols with freqs matching the Fibonacci
3101
     * series). So guaranteed 1 byte per code.
3102
     *
3103
     * Symbols themselves could be 5 bytes (eg -1 is 5 bytes in itf8).
3104
     *
3105
     * Therefore 6*ncodes + 5 + 5 + 1 + 5 is max memory
3106
     */
3107
409k
    char *tmp = malloc(6*c->u.e_huffman.nvals+16);
3108
409k
    char *tp = tmp, *tpend = tmp+6*c->u.e_huffman.nvals+16;
3109
3110
409k
    if (!tmp)
3111
0
        return -1;
3112
3113
409k
    if (prefix) {
3114
350k
        size_t l = strlen(prefix);
3115
350k
        BLOCK_APPEND(b, prefix, l);
3116
350k
        len += l;
3117
350k
    }
3118
3119
409k
    tp += c->vv->varint_put32(tp, tpend, c->u.e_huffman.nvals);
3120
409k
    if (c->u.e_huffman.option == E_LONG) {
3121
0
        for (i = 0; i < c->u.e_huffman.nvals; i++) {
3122
0
            tp += c->vv->varint_put64(tp, tpend, codes[i].symbol);
3123
0
        }
3124
409k
    } else if (c->u.e_huffman.option == E_SLONG) {
3125
0
        for (i = 0; i < c->u.e_huffman.nvals; i++) {
3126
0
            tp += c->vv->varint_put64s(tp, tpend, codes[i].symbol);
3127
0
        }
3128
409k
    } else if (c->u.e_huffman.option == E_INT || c->u.e_huffman.option == E_BYTE) {
3129
819k
        for (i = 0; i < c->u.e_huffman.nvals; i++) {
3130
409k
            tp += c->vv->varint_put32(tp, tpend, codes[i].symbol);
3131
409k
        }
3132
409k
    } else if (c->u.e_huffman.option == E_SINT) {
3133
0
        for (i = 0; i < c->u.e_huffman.nvals; i++) {
3134
0
            tp += c->vv->varint_put32s(tp, tpend, codes[i].symbol);
3135
0
        }
3136
0
    } else {
3137
0
        return -1;
3138
0
    }
3139
3140
409k
    tp += c->vv->varint_put32(tp, tpend, c->u.e_huffman.nvals);
3141
819k
    for (i = 0; i < c->u.e_huffman.nvals; i++)
3142
409k
        tp += c->vv->varint_put32(tp, tpend, codes[i].len);
3143
3144
409k
    len += (n = c->vv->varint_put32_blk(b, c->codec)); r |= n;
3145
409k
    len += (n = c->vv->varint_put32_blk(b, tp-tmp));   r |= n;
3146
409k
    BLOCK_APPEND(b, tmp, tp-tmp);
3147
409k
    len += tp-tmp;
3148
3149
409k
    free(tmp);
3150
3151
409k
    if (r > 0)
3152
409k
        return len;
3153
3154
0
 block_err:
3155
0
    return -1;
3156
409k
}
3157
3158
cram_codec *cram_huffman_encode_init(cram_stats *st,
3159
                                     enum cram_encoding codec,
3160
                                     enum cram_external_type option,
3161
                                     void *dat,
3162
410k
                                     int version, varint_vec *vv) {
3163
410k
    int *vals = NULL, *freqs = NULL, *lens = NULL, code, len;
3164
410k
    int *new_vals, *new_freqs;
3165
410k
    int i, max_val = 0, min_val = INT_MAX, k;
3166
410k
    size_t nvals, vals_alloc = 0;
3167
410k
    cram_codec *c;
3168
410k
    cram_huffman_code *codes;
3169
3170
410k
    c = malloc(sizeof(*c));
3171
410k
    if (!c)
3172
0
        return NULL;
3173
410k
    c->codec = E_HUFFMAN;
3174
3175
    /* Count number of unique symbols */
3176
420M
    for (nvals = i = 0; i < MAX_STAT_VAL; i++) {
3177
420M
        if (!st->freqs[i])
3178
419M
            continue;
3179
338k
        if (nvals >= vals_alloc) {
3180
338k
            vals_alloc = vals_alloc ? vals_alloc*2 : 1024;
3181
338k
            new_vals  = realloc(vals,  vals_alloc * sizeof(int));
3182
338k
            if (!new_vals) goto nomem;
3183
338k
            vals = new_vals;
3184
338k
            new_freqs = realloc(freqs, vals_alloc * sizeof(int));
3185
338k
            if (!new_freqs) goto nomem;
3186
338k
            freqs = new_freqs;
3187
338k
        }
3188
338k
        vals[nvals] = i;
3189
338k
        freqs[nvals] = st->freqs[i];
3190
338k
        assert(st->freqs[i] > 0);
3191
338k
        if (max_val < i) max_val = i;
3192
338k
        if (min_val > i) min_val = i;
3193
338k
        nvals++;
3194
338k
    }
3195
410k
    if (st->h) {
3196
71.4k
        khint_t k;
3197
3198
357k
        for (k = kh_begin(st->h); k != kh_end(st->h); k++) {
3199
285k
            if (!kh_exist(st->h, k))
3200
214k
                continue;
3201
71.4k
            if (nvals >= vals_alloc) {
3202
71.4k
                vals_alloc = vals_alloc ? vals_alloc*2 : 1024;
3203
71.4k
                new_vals  = realloc(vals,  vals_alloc * sizeof(int));
3204
71.4k
                if (!new_vals) goto nomem;
3205
71.4k
                vals = new_vals;
3206
71.4k
                new_freqs = realloc(freqs, vals_alloc * sizeof(int));
3207
71.4k
                if (!new_freqs) goto nomem;
3208
71.4k
                freqs = new_freqs;
3209
71.4k
            }
3210
71.4k
            vals[nvals]= kh_key(st->h, k);
3211
71.4k
            freqs[nvals] = kh_val(st->h, k);
3212
71.4k
            assert(freqs[nvals] > 0);
3213
71.4k
            if (max_val < i) max_val = i;
3214
71.4k
            if (min_val > i) min_val = i;
3215
71.4k
            nvals++;
3216
71.4k
        }
3217
71.4k
    }
3218
3219
410k
    assert(nvals > 0);
3220
3221
410k
    new_freqs = realloc(freqs, 2*nvals*sizeof(*freqs));
3222
410k
    if (!new_freqs) goto nomem;
3223
410k
    freqs = new_freqs;
3224
410k
    lens = calloc(2*nvals, sizeof(*lens));
3225
410k
    if (!lens) goto nomem;
3226
3227
    /* Inefficient, use pointers to form chain so we can insert and maintain
3228
     * a sorted list? This is currently O(nvals^2) complexity.
3229
     */
3230
410k
    for (;;) {
3231
410k
        int low1 = INT_MAX, low2 = INT_MAX;
3232
410k
        int ind1 = 0, ind2 = 0;
3233
820k
        for (i = 0; i < nvals; i++) {
3234
410k
            if (freqs[i] < 0)
3235
0
                continue;
3236
410k
            if (low1 > freqs[i])
3237
410k
                low2 = low1, ind2 = ind1, low1 = freqs[i], ind1 = i;
3238
0
            else if (low2 > freqs[i])
3239
0
                low2 = freqs[i], ind2 = i;
3240
410k
        }
3241
410k
        if (low2 == INT_MAX)
3242
410k
            break;
3243
3244
0
        freqs[nvals] = low1 + low2;
3245
0
        lens[ind1] = nvals;
3246
0
        lens[ind2] = nvals;
3247
0
        freqs[ind1] *= -1;
3248
0
        freqs[ind2] *= -1;
3249
0
        nvals++;
3250
0
    }
3251
410k
    nvals = nvals/2+1;
3252
3253
    /* Assign lengths */
3254
820k
    for (i = 0; i < nvals; i++) {
3255
410k
        int code_len = 0;
3256
410k
        for (k = lens[i]; k; k = lens[k])
3257
0
            code_len++;
3258
410k
        lens[i] = code_len;
3259
410k
        freqs[i] *= -1;
3260
        //fprintf(stderr, "%d / %d => %d\n", vals[i], freqs[i], lens[i]);
3261
410k
    }
3262
3263
3264
    /* Sort, need in a struct */
3265
410k
    if (!(codes = malloc(nvals * sizeof(*codes))))
3266
0
        goto nomem;
3267
820k
    for (i = 0; i < nvals; i++) {
3268
410k
        codes[i].symbol = vals[i];
3269
410k
        codes[i].len = lens[i];
3270
410k
    }
3271
410k
    qsort(codes, nvals, sizeof(*codes), code_sort);
3272
3273
    /*
3274
     * Generate canonical codes from lengths.
3275
     * Sort by length.
3276
     * Start with 0.
3277
     * Every new code of same length is +1.
3278
     * Every new code of new length is +1 then <<1 per extra length.
3279
     *
3280
     * /\
3281
     * a/\
3282
     * /\/\
3283
     * bcd/\
3284
     *    ef
3285
     *
3286
     * a 1  0
3287
     * b 3  4 (0+1)<<2
3288
     * c 3  5
3289
     * d 3  6
3290
     * e 4  14  (6+1)<<1
3291
     * f 5  15
3292
     */
3293
410k
    code = 0; len = codes[0].len;
3294
820k
    for (i = 0; i < nvals; i++) {
3295
410k
        while (len != codes[i].len) {
3296
0
            code<<=1;
3297
0
            len++;
3298
0
        }
3299
410k
        codes[i].code = code++;
3300
3301
410k
        if (codes[i].symbol >= -1 && codes[i].symbol < MAX_HUFF)
3302
406k
            c->u.e_huffman.val2code[codes[i].symbol+1] = i;
3303
3304
        //fprintf(stderr, "sym %d, code %d, len %d\n",
3305
        //      codes[i].symbol, codes[i].code, codes[i].len);
3306
410k
    }
3307
3308
410k
    free(lens);
3309
410k
    free(vals);
3310
410k
    free(freqs);
3311
3312
410k
    c->u.e_huffman.codes = codes;
3313
410k
    c->u.e_huffman.nvals = nvals;
3314
410k
    c->u.e_huffman.option = option;
3315
3316
410k
    c->free = cram_huffman_encode_free;
3317
410k
    if (option == E_BYTE || option == E_BYTE_ARRAY) {
3318
13.2k
        if (c->u.e_huffman.codes[0].len == 0)
3319
13.2k
            c->encode = cram_huffman_encode_char0;
3320
0
        else
3321
0
            c->encode = cram_huffman_encode_char;
3322
396k
    } else if (option == E_INT || option == E_SINT) {
3323
396k
        if (c->u.e_huffman.codes[0].len == 0)
3324
396k
            c->encode = cram_huffman_encode_int0;
3325
0
        else
3326
0
            c->encode = cram_huffman_encode_int;
3327
396k
    } else if (option == E_LONG || option == E_SLONG) {
3328
0
        if (c->u.e_huffman.codes[0].len == 0)
3329
0
            c->encode = cram_huffman_encode_long0;
3330
0
        else
3331
0
            c->encode = cram_huffman_encode_long;
3332
0
    } else {
3333
0
        return NULL;
3334
0
    }
3335
410k
    c->store = cram_huffman_encode_store;
3336
410k
    c->flush = NULL;
3337
3338
410k
    return c;
3339
3340
0
 nomem:
3341
0
    hts_log_error("Out of memory");
3342
0
    free(vals);
3343
0
    free(freqs);
3344
0
    free(lens);
3345
0
    free(c);
3346
0
    return NULL;
3347
410k
}
3348
3349
/*
3350
 * ---------------------------------------------------------------------------
3351
 * BYTE_ARRAY_LEN
3352
 */
3353
int cram_byte_array_len_decode(cram_slice *slice, cram_codec *c,
3354
                               cram_block *in, char *out,
3355
0
                               int *out_size) {
3356
    /* Fetch length */
3357
0
    int32_t len = 0, one = 1;
3358
0
    int r;
3359
3360
0
    r = c->u.byte_array_len.len_codec->decode(slice, c->u.byte_array_len.len_codec,
3361
0
                                              in, (char *)&len, &one);
3362
    //printf("ByteArray Len=%d\n", len);
3363
3364
0
    if (!r && c->u.byte_array_len.val_codec && len >= 0) {
3365
0
        r = c->u.byte_array_len.val_codec->decode(slice,
3366
0
                                                  c->u.byte_array_len.val_codec,
3367
0
                                                  in, out, &len);
3368
0
    } else {
3369
0
        return -1;
3370
0
    }
3371
3372
0
    *out_size = len;
3373
3374
0
    return r;
3375
0
}
3376
3377
969
void cram_byte_array_len_decode_free(cram_codec *c) {
3378
969
    if (!c) return;
3379
3380
969
    if (c->u.byte_array_len.len_codec)
3381
957
        c->u.byte_array_len.len_codec->free(c->u.byte_array_len.len_codec);
3382
3383
969
    if (c->u.byte_array_len.val_codec)
3384
954
        c->u.byte_array_len.val_codec->free(c->u.byte_array_len.val_codec);
3385
3386
969
    free(c);
3387
969
}
3388
3389
0
int cram_byte_array_len_describe(cram_codec *c, kstring_t *ks) {
3390
0
    int r = 0;
3391
0
    r |= ksprintf(ks, "BYTE_ARRAY_LEN(len_codec={") < 0;
3392
0
    cram_byte_array_len_decoder *l = &c->u.byte_array_len;
3393
0
    r |=  l->len_codec->describe
3394
0
        ? l->len_codec->describe(l->len_codec, ks)
3395
0
        : (ksprintf(ks, "?")<0);
3396
0
    r |= ksprintf(ks, "},val_codec={") < 0;
3397
0
    r |=  l->val_codec->describe
3398
0
        ? l->val_codec->describe(l->val_codec, ks)
3399
0
        : (ksprintf(ks, "?")<0);
3400
0
    r |= ksprintf(ks, "}") < 0;
3401
3402
0
    return r;
3403
0
}
3404
3405
cram_codec *cram_byte_array_len_decode_init(cram_block_compression_hdr *hdr,
3406
                                            char *data, int size,
3407
                                            enum cram_encoding codec,
3408
                                            enum cram_external_type option,
3409
969
                                            int version, varint_vec *vv) {
3410
969
    cram_codec *c;
3411
969
    char *cp   = data;
3412
969
    char *endp = data + size;
3413
3414
969
    if (!(c = malloc(sizeof(*c))))
3415
0
        return NULL;
3416
3417
969
    c->codec  = E_BYTE_ARRAY_LEN;
3418
969
    c->decode = cram_byte_array_len_decode;
3419
969
    c->free   = cram_byte_array_len_decode_free;
3420
969
    c->describe = cram_byte_array_len_describe;
3421
969
    c->u.byte_array_len.len_codec = NULL;
3422
969
    c->u.byte_array_len.val_codec = NULL;
3423
3424
969
    int encoding = vv->varint_get32(&cp, endp, NULL);
3425
969
    int sub_size = vv->varint_get32(&cp, endp, NULL);
3426
969
    if (sub_size < 0 || endp - cp < sub_size)
3427
3
        goto malformed;
3428
966
    c->u.byte_array_len.len_codec = cram_decoder_init(hdr, encoding, cp, sub_size,
3429
966
                                                      E_INT, version, vv);
3430
966
    if (c->u.byte_array_len.len_codec == NULL)
3431
9
        goto no_codec;
3432
957
    cp += sub_size;
3433
3434
957
    encoding = vv->varint_get32(&cp, endp, NULL);
3435
957
    sub_size = vv->varint_get32(&cp, endp, NULL);
3436
957
    if (sub_size < 0 || endp - cp < sub_size)
3437
0
        goto malformed;
3438
957
    c->u.byte_array_len.val_codec = cram_decoder_init(hdr, encoding, cp, sub_size,
3439
957
                                                      option, version, vv);
3440
957
    if (c->u.byte_array_len.val_codec == NULL)
3441
3
        goto no_codec;
3442
954
    cp += sub_size;
3443
3444
954
    if (cp - data != size)
3445
3
        goto malformed;
3446
3447
951
    return c;
3448
3449
6
 malformed:
3450
6
    hts_log_error("Malformed byte_array_len header stream");
3451
18
 no_codec:
3452
18
    cram_byte_array_len_decode_free(c);
3453
18
    return NULL;
3454
6
}
3455
3456
int cram_byte_array_len_encode(cram_slice *slice, cram_codec *c,
3457
20.0k
                               char *in, int in_size) {
3458
20.0k
    int32_t i32 = in_size;
3459
20.0k
    int r = 0;
3460
3461
20.0k
    r |= c->u.e_byte_array_len.len_codec->encode(slice,
3462
20.0k
                                                 c->u.e_byte_array_len.len_codec,
3463
20.0k
                                                 (char *)&i32, 1);
3464
20.0k
    r |= c->u.e_byte_array_len.val_codec->encode(slice,
3465
20.0k
                                                 c->u.e_byte_array_len.val_codec,
3466
20.0k
                                                 in, in_size);
3467
20.0k
    return r;
3468
20.0k
}
3469
3470
87.8k
void cram_byte_array_len_encode_free(cram_codec *c) {
3471
87.8k
    if (!c)
3472
0
        return;
3473
3474
87.8k
    if (c->u.e_byte_array_len.len_codec)
3475
87.8k
        c->u.e_byte_array_len.len_codec->free(c->u.e_byte_array_len.len_codec);
3476
3477
87.8k
    if (c->u.e_byte_array_len.val_codec)
3478
87.8k
        c->u.e_byte_array_len.val_codec->free(c->u.e_byte_array_len.val_codec);
3479
3480
87.8k
    free(c);
3481
87.8k
}
3482
3483
int cram_byte_array_len_encode_store(cram_codec *c, cram_block *b,
3484
87.3k
                                     char *prefix, int version) {
3485
87.3k
    int len = 0, len2, len3, r = 0, n;
3486
87.3k
    cram_codec *tc;
3487
87.3k
    cram_block *b_len = NULL, *b_val = NULL;
3488
3489
87.3k
    if (prefix) {
3490
27.6k
        size_t l = strlen(prefix);
3491
27.6k
        BLOCK_APPEND(b, prefix, l);
3492
27.6k
        len += l;
3493
27.6k
    }
3494
3495
87.3k
    tc = c->u.e_byte_array_len.len_codec;
3496
87.3k
    b_len = cram_new_block(0, 0);
3497
87.3k
    if (!b_len) goto block_err;
3498
87.3k
    len2 = tc->store(tc, b_len, NULL, version);
3499
87.3k
    if (len2 < 0) goto block_err;
3500
3501
87.3k
    tc = c->u.e_byte_array_len.val_codec;
3502
87.3k
    b_val = cram_new_block(0, 0);
3503
87.3k
    if (!b_val) goto block_err;
3504
87.3k
    len3 = tc->store(tc, b_val, NULL, version);
3505
87.3k
    if (len3 < 0) goto block_err;
3506
3507
87.3k
    len += (n = c->vv->varint_put32_blk(b, c->codec));  r |= n;
3508
87.3k
    len += (n = c->vv->varint_put32_blk(b, len2+len3)); r |= n;
3509
87.3k
    BLOCK_APPEND(b, BLOCK_DATA(b_len), BLOCK_SIZE(b_len));
3510
87.3k
    BLOCK_APPEND(b, BLOCK_DATA(b_val), BLOCK_SIZE(b_val));
3511
3512
87.3k
    cram_free_block(b_len);
3513
87.3k
    cram_free_block(b_val);
3514
3515
87.3k
    if (r > 0)
3516
87.3k
        return len + len2 + len3;
3517
3518
0
 block_err:
3519
0
    if (b_len) cram_free_block(b_len);
3520
0
    if (b_val) cram_free_block(b_val);
3521
0
    return -1;
3522
87.3k
}
3523
3524
cram_codec *cram_byte_array_len_encode_init(cram_stats *st,
3525
                                            enum cram_encoding codec,
3526
                                            enum cram_external_type option,
3527
                                            void *dat,
3528
87.8k
                                            int version, varint_vec *vv) {
3529
87.8k
    cram_codec *c;
3530
87.8k
    cram_byte_array_len_encoder *e = (cram_byte_array_len_encoder *)dat;
3531
3532
87.8k
    c = malloc(sizeof(*c));
3533
87.8k
    if (!c)
3534
0
        return NULL;
3535
87.8k
    c->codec = E_BYTE_ARRAY_LEN;
3536
87.8k
    c->free = cram_byte_array_len_encode_free;
3537
87.8k
    c->encode = cram_byte_array_len_encode;
3538
87.8k
    c->store = cram_byte_array_len_encode_store;
3539
87.8k
    c->flush = NULL;
3540
3541
87.8k
    c->u.e_byte_array_len.len_codec = cram_encoder_init(e->len_encoding,
3542
87.8k
                                                        st, E_INT,
3543
87.8k
                                                        e->len_dat,
3544
87.8k
                                                        version, vv);
3545
87.8k
    c->u.e_byte_array_len.val_codec = cram_encoder_init(e->val_encoding,
3546
87.8k
                                                        NULL, E_BYTE_ARRAY,
3547
87.8k
                                                        e->val_dat,
3548
87.8k
                                                        version, vv);
3549
3550
87.8k
    if (!c->u.e_byte_array_len.len_codec ||
3551
87.8k
        !c->u.e_byte_array_len.val_codec) {
3552
0
        cram_byte_array_len_encode_free(c);
3553
0
        return NULL;
3554
0
    }
3555
3556
87.8k
    return c;
3557
87.8k
}
3558
3559
/*
3560
 * ---------------------------------------------------------------------------
3561
 * BYTE_ARRAY_STOP
3562
 */
3563
static int cram_byte_array_stop_decode_char(cram_slice *slice, cram_codec *c,
3564
                                            cram_block *in, char *out,
3565
0
                                            int *out_size) {
3566
0
    char *cp, ch;
3567
0
    cram_block *b = NULL;
3568
3569
0
    b = cram_get_block_by_id(slice, c->u.byte_array_stop.content_id);
3570
0
    if (!b)
3571
0
        return *out_size?-1:0;
3572
3573
0
    if (b->idx >= b->uncomp_size)
3574
0
        return -1;
3575
3576
0
    cp = (char *)b->data + b->idx;
3577
0
    if (out) {
3578
       // memccpy equivalent but without copying the terminating byte
3579
0
        ssize_t term = MIN(*out_size, b->uncomp_size - b->idx);
3580
0
        while ((ch = *cp) != (char)c->u.byte_array_stop.stop) {
3581
0
            if (term-- < 0)
3582
0
                break;
3583
0
            *out++ = ch;
3584
0
            cp++;
3585
0
        }
3586
3587
        // Attempted overrun on input or output
3588
0
        if (ch != (char)c->u.byte_array_stop.stop)
3589
0
            return -1;
3590
0
    } else {
3591
        // Consume input, but produce no output
3592
0
        while ((ch = *cp) != (char)c->u.byte_array_stop.stop) {
3593
0
            if (cp - (char *)b->data >= b->uncomp_size)
3594
0
                return -1;
3595
0
            cp++;
3596
0
        }
3597
0
    }
3598
3599
0
    *out_size = cp - (char *)(b->data + b->idx);
3600
0
    b->idx = cp - (char *)b->data + 1;
3601
3602
0
    return 0;
3603
0
}
3604
3605
int cram_byte_array_stop_decode_block(cram_slice *slice, cram_codec *c,
3606
                                      cram_block *in, char *out_,
3607
0
                                      int *out_size) {
3608
0
    cram_block *b;
3609
0
    cram_block *out = (cram_block *)out_;
3610
0
    unsigned char *cp, *cp_end;
3611
0
    unsigned char stop;
3612
3613
0
    b = cram_get_block_by_id(slice, c->u.byte_array_stop.content_id);
3614
0
    if (!b)
3615
0
        return *out_size?-1:0;
3616
3617
0
    if (b->idx >= b->uncomp_size)
3618
0
        return -1;
3619
0
    cp = b->data + b->idx;
3620
0
    cp_end = b->data + b->uncomp_size;
3621
3622
    // STOP byte is hard-coded as zero by our name tokeniser decoder
3623
    // implementation, so we may ignore what was requested.
3624
0
    stop = b->orig_method == TOK3 ? 0 : c->u.byte_array_stop.stop;
3625
3626
0
    if (cp_end - cp < out->alloc - out->byte) {
3627
0
        unsigned char *out_cp = BLOCK_END(out);
3628
0
        while (cp != cp_end && *cp != stop)
3629
0
            *out_cp++ = *cp++;
3630
0
        BLOCK_SIZE(out) = out_cp - BLOCK_DATA(out);
3631
0
    } else {
3632
0
        unsigned char *cp_start;
3633
0
        for (cp_start = cp; cp != cp_end && *cp != stop; cp++)
3634
0
            ;
3635
0
        BLOCK_APPEND(out, cp_start, cp - cp_start);
3636
0
        BLOCK_GROW(out, cp - cp_start);
3637
0
    }
3638
3639
0
    *out_size = cp - (b->data + b->idx);
3640
0
    b->idx = cp - b->data + 1;
3641
3642
0
    return 0;
3643
3644
0
 block_err:
3645
0
    return -1;
3646
0
}
3647
3648
588
void cram_byte_array_stop_decode_free(cram_codec *c) {
3649
588
    if (!c) return;
3650
3651
588
    free(c);
3652
588
}
3653
3654
0
int cram_byte_array_stop_describe(cram_codec *c, kstring_t *ks) {
3655
0
    return ksprintf(ks, "BYTE_ARRAY_STOP(stop=%d,id=%d)",
3656
0
                    c->u.byte_array_stop.stop,
3657
0
                    c->u.byte_array_stop.content_id)
3658
0
        < 0 ? -1 : 0;
3659
0
}
3660
3661
cram_codec *cram_byte_array_stop_decode_init(cram_block_compression_hdr *hdr,
3662
                                             char *data, int size,
3663
                                             enum cram_encoding codec,
3664
                                             enum cram_external_type option,
3665
588
                                             int version, varint_vec *vv) {
3666
588
    cram_codec *c = NULL;
3667
588
    unsigned char *cp = (unsigned char *)data;
3668
588
    int err = 0;
3669
3670
588
    if (size < (CRAM_MAJOR_VERS(version) == 1 ? 5 : 2))
3671
0
        goto malformed;
3672
3673
588
    if (!(c = malloc(sizeof(*c))))
3674
0
        return NULL;
3675
3676
588
    c->codec  = E_BYTE_ARRAY_STOP;
3677
588
    switch (option) {
3678
582
    case E_BYTE_ARRAY_BLOCK:
3679
582
        c->decode = cram_byte_array_stop_decode_block;
3680
582
        break;
3681
6
    case E_BYTE_ARRAY:
3682
6
        c->decode = cram_byte_array_stop_decode_char;
3683
6
        break;
3684
0
    default:
3685
0
        hts_log_error("The byte_array_stop codec only supports BYTE_ARRAYs");
3686
0
        free(c);
3687
0
        return NULL;
3688
588
    }
3689
588
    c->free   = cram_byte_array_stop_decode_free;
3690
588
    c->describe = cram_byte_array_stop_describe;
3691
3692
588
    c->u.byte_array_stop.stop = *cp++;
3693
588
    if (CRAM_MAJOR_VERS(version) == 1) {
3694
588
        c->u.byte_array_stop.content_id = cp[0] + (cp[1]<<8) + (cp[2]<<16)
3695
588
            + ((unsigned int) cp[3]<<24);
3696
588
        cp += 4;
3697
588
    } else {
3698
0
        c->u.byte_array_stop.content_id = vv->varint_get32((char **)&cp, data+size, &err);
3699
0
    }
3700
3701
588
    if ((char *)cp - data != size || err)
3702
0
        goto malformed;
3703
3704
588
    return c;
3705
3706
0
 malformed:
3707
0
    hts_log_error("Malformed byte_array_stop header stream");
3708
0
    free(c);
3709
0
    return NULL;
3710
588
}
3711
3712
int cram_byte_array_stop_encode(cram_slice *slice, cram_codec *c,
3713
146k
                                char *in, int in_size) {
3714
146k
    BLOCK_APPEND(c->out, in, in_size);
3715
146k
    BLOCK_APPEND_CHAR(c->out, c->u.e_byte_array_stop.stop);
3716
146k
    return 0;
3717
3718
0
 block_err:
3719
0
    return -1;
3720
146k
}
3721
3722
139k
void cram_byte_array_stop_encode_free(cram_codec *c) {
3723
139k
    if (!c)
3724
0
        return;
3725
139k
    free(c);
3726
139k
}
3727
3728
int cram_byte_array_stop_encode_store(cram_codec *c, cram_block *b,
3729
139k
                                      char *prefix, int version) {
3730
139k
    int len = 0;
3731
139k
    char buf[20], *cp = buf;
3732
3733
139k
    if (prefix) {
3734
82.9k
        size_t l = strlen(prefix);
3735
82.9k
        BLOCK_APPEND(b, prefix, l);
3736
82.9k
        len += l;
3737
82.9k
    }
3738
3739
139k
    cp += c->vv->varint_put32(cp, buf+20, c->codec);
3740
3741
139k
    if (CRAM_MAJOR_VERS(version) == 1) {
3742
0
        cp += c->vv->varint_put32(cp, buf+20, 5);
3743
0
        *cp++ = c->u.e_byte_array_stop.stop;
3744
0
        *cp++ = (c->u.e_byte_array_stop.content_id >>  0) & 0xff;
3745
0
        *cp++ = (c->u.e_byte_array_stop.content_id >>  8) & 0xff;
3746
0
        *cp++ = (c->u.e_byte_array_stop.content_id >> 16) & 0xff;
3747
0
        *cp++ = (c->u.e_byte_array_stop.content_id >> 24) & 0xff;
3748
139k
    } else {
3749
139k
        cp += c->vv->varint_put32(cp, buf+20, 1 +
3750
139k
                                  c->vv->varint_size(c->u.e_byte_array_stop.content_id));
3751
139k
        *cp++ = c->u.e_byte_array_stop.stop;
3752
139k
        cp += c->vv->varint_put32(cp, buf+20, c->u.e_byte_array_stop.content_id);
3753
139k
    }
3754
3755
139k
    BLOCK_APPEND(b, buf, cp-buf);
3756
139k
    len += cp-buf;
3757
3758
139k
    return len;
3759
3760
0
 block_err:
3761
0
    return -1;
3762
139k
}
3763
3764
cram_codec *cram_byte_array_stop_encode_init(cram_stats *st,
3765
                                             enum cram_encoding codec,
3766
                                             enum cram_external_type option,
3767
                                             void *dat,
3768
139k
                                             int version, varint_vec *vv) {
3769
139k
    cram_codec *c;
3770
3771
139k
    c = malloc(sizeof(*c));
3772
139k
    if (!c)
3773
0
        return NULL;
3774
139k
    c->codec = E_BYTE_ARRAY_STOP;
3775
139k
    c->free = cram_byte_array_stop_encode_free;
3776
139k
    c->encode = cram_byte_array_stop_encode;
3777
139k
    c->store = cram_byte_array_stop_encode_store;
3778
139k
    c->flush = NULL;
3779
3780
139k
    c->u.e_byte_array_stop.stop = ((int *)dat)[0];
3781
139k
    c->u.e_byte_array_stop.content_id = ((int *)dat)[1];
3782
3783
139k
    return c;
3784
139k
}
3785
3786
/*
3787
 * ---------------------------------------------------------------------------
3788
 */
3789
3790
111
const char *cram_encoding2str(enum cram_encoding t) {
3791
111
    switch (t) {
3792
5
    case E_NULL:            return "NULL";
3793
0
    case E_EXTERNAL:        return "EXTERNAL";
3794
12
    case E_GOLOMB:          return "GOLOMB";
3795
0
    case E_HUFFMAN:         return "HUFFMAN";
3796
0
    case E_BYTE_ARRAY_LEN:  return "BYTE_ARRAY_LEN";
3797
0
    case E_BYTE_ARRAY_STOP: return "BYTE_ARRAY_STOP";
3798
15
    case E_BETA:            return "BETA";
3799
0
    case E_SUBEXP:          return "SUBEXP";
3800
2
    case E_GOLOMB_RICE:     return "GOLOMB_RICE";
3801
0
    case E_GAMMA:           return "GAMMA";
3802
3803
0
    case E_VARINT_UNSIGNED: return "VARINT_UNSIGNED";
3804
0
    case E_VARINT_SIGNED:   return "VARINT_SIGNED";
3805
0
    case E_CONST_BYTE:      return "CONST_BYTE";
3806
0
    case E_CONST_INT:       return "CONST_INT";
3807
3808
0
    case E_NUM_CODECS:
3809
77
    default:                return "?";
3810
111
    }
3811
111
}
3812
3813
static cram_codec *(*decode_init[])(cram_block_compression_hdr *hdr,
3814
                                    char *data,
3815
                                    int size,
3816
                                    enum cram_encoding codec,
3817
                                    enum cram_external_type option,
3818
                                    int version, varint_vec *vv) = {
3819
    // CRAM 3.0 valid codecs
3820
    NULL, // null codec
3821
    cram_external_decode_init,
3822
    NULL, // golomb
3823
    cram_huffman_decode_init,
3824
    cram_byte_array_len_decode_init,
3825
    cram_byte_array_stop_decode_init,
3826
    cram_beta_decode_init,
3827
    cram_subexp_decode_init,
3828
    NULL, // golomb rice
3829
    cram_gamma_decode_init,
3830
3831
    // Gap between CRAM 3 and CRAM 4; 9 to 39 inclusive
3832
    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
3833
    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
3834
    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
3835
3836
    NULL,                      // was xbyte
3837
    cram_varint_decode_init,   // varint unsigned
3838
    cram_varint_decode_init,   // varint signed
3839
    cram_const_decode_init,    // const byte
3840
    cram_const_decode_init,    // const int
3841
3842
    // Gap to CRAM 4 transfomrations; 45 to 49 inclusive
3843
    NULL, NULL, NULL, NULL, NULL,
3844
3845
    NULL, // xhuffman
3846
    cram_xpack_decode_init,
3847
    cram_xrle_decode_init,
3848
    cram_xdelta_decode_init,
3849
};
3850
3851
cram_codec *cram_decoder_init(cram_block_compression_hdr *hdr,
3852
                              enum cram_encoding codec,
3853
                              char *data, int size,
3854
                              enum cram_external_type option,
3855
7.44k
                              int version, varint_vec *vv) {
3856
7.44k
    if (codec >= E_NULL && codec < E_NUM_CODECS && decode_init[codec]) {
3857
7.35k
        cram_codec *r = decode_init[codec](hdr, data, size, codec,
3858
7.35k
                                           option, version, vv);
3859
7.35k
        if (r) {
3860
7.24k
            r->vv = vv;
3861
7.24k
            r->codec_id = hdr->ncodecs++;
3862
7.24k
        }
3863
7.35k
        return r;
3864
7.35k
    } else {
3865
96
        hts_log_error("Unimplemented codec of type %s", cram_encoding2str(codec));
3866
96
        return NULL;
3867
96
    }
3868
7.44k
}
3869
3870
static cram_codec *(*encode_init[])(cram_stats *stx,
3871
                                    enum cram_encoding codec,
3872
                                    enum cram_external_type option,
3873
                                    void *opt,
3874
                                    int version, varint_vec *vv) = {
3875
    // CRAM 3.0 valid codecs
3876
    NULL, // null codec
3877
    cram_external_encode_init, // int/bytes in cram 3, byte only in cram 4
3878
    NULL, // golomb
3879
    cram_huffman_encode_init,
3880
    cram_byte_array_len_encode_init,
3881
    cram_byte_array_stop_encode_init,
3882
    cram_beta_encode_init,
3883
    NULL, // subexponential (we support decode only)
3884
    NULL, // golomb rice
3885
    NULL, // gamma (we support decode only)
3886
3887
    // Gap between CRAM 3 and CRAM 4; 9 to 39 inclusive
3888
    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
3889
    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
3890
    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
3891
3892
    NULL, // was xbyte
3893
    cram_varint_encode_init, // varint unsigned
3894
    cram_varint_encode_init, // varint signed
3895
    cram_const_encode_init,  // const byte
3896
    cram_const_encode_init,  // const int
3897
3898
    // Gap to CRAM 4 transfomrations; 45 to 49 inclusive
3899
    NULL, NULL, NULL, NULL, NULL,
3900
3901
    NULL, // xhuffman
3902
    cram_xpack_encode_init,
3903
    cram_xrle_encode_init,
3904
    cram_xdelta_encode_init,
3905
};
3906
3907
cram_codec *cram_encoder_init(enum cram_encoding codec,
3908
                              cram_stats *st,
3909
                              enum cram_external_type option,
3910
                              void *dat,
3911
1.03M
                              int version, varint_vec *vv) {
3912
1.03M
    if (st && !st->nvals)
3913
227k
        return NULL;
3914
3915
    // cram_stats_encoding assumes integer data, but if option
3916
    // is E_BYTE then tweak the requested encoding.  This ought
3917
    // to be fixed in cram_stats_encoding instead.
3918
811k
    if (option == E_BYTE || option == E_BYTE_ARRAY ||
3919
452k
       option == E_BYTE_ARRAY_BLOCK) {
3920
359k
       if (codec == E_VARINT_SIGNED || codec == E_VARINT_UNSIGNED)
3921
0
           codec = E_EXTERNAL;
3922
359k
       else if (codec == E_CONST_INT)
3923
0
           codec = E_CONST_BYTE;
3924
359k
    }
3925
3926
811k
    if (encode_init[codec]) {
3927
811k
        cram_codec *r;
3928
811k
        if ((r = encode_init[codec](st, codec, option, dat, version, vv)))
3929
811k
            r->out = NULL;
3930
811k
        if (!r) {
3931
15
            hts_log_error("Unable to initialise codec of type %s", cram_encoding2str(codec));
3932
15
            return NULL;
3933
15
        }
3934
811k
        r->vv = vv;
3935
811k
        return r;
3936
811k
    } else {
3937
0
        hts_log_error("Unimplemented codec of type %s", cram_encoding2str(codec));
3938
0
        abort();
3939
0
    }
3940
811k
}
3941
3942
/*
3943
 * Returns the content_id used by this codec, also in id2 if byte_array_len.
3944
 * Returns -1 for the CORE block and -2 for unneeded.
3945
 * id2 is only filled out for BYTE_ARRAY_LEN which uses 2 codecs.
3946
 */
3947
0
int cram_codec_to_id(cram_codec *c, int *id2) {
3948
0
    int bnum1, bnum2 = -2;
3949
3950
0
    switch (c->codec) {
3951
0
    case E_CONST_INT:
3952
0
    case E_CONST_BYTE:
3953
0
        bnum1 = -2; // no blocks used
3954
0
        break;
3955
3956
0
    case E_HUFFMAN:
3957
0
        bnum1 = c->u.huffman.ncodes == 1 ? -2 : -1;
3958
0
        break;
3959
3960
0
    case E_GOLOMB:
3961
0
    case E_BETA:
3962
0
    case E_SUBEXP:
3963
0
    case E_GOLOMB_RICE:
3964
0
    case E_GAMMA:
3965
        // CORE block
3966
0
        bnum1 = -1;
3967
0
        break;
3968
3969
0
    case E_EXTERNAL:
3970
0
    case E_VARINT_UNSIGNED:
3971
0
    case E_VARINT_SIGNED:
3972
0
        bnum1 = c->u.external.content_id;
3973
0
        break;
3974
3975
0
    case E_BYTE_ARRAY_LEN:
3976
0
        bnum1 = cram_codec_to_id(c->u.byte_array_len.len_codec, NULL);
3977
0
        bnum2 = cram_codec_to_id(c->u.byte_array_len.val_codec, NULL);
3978
0
        break;
3979
3980
0
    case E_BYTE_ARRAY_STOP:
3981
0
        bnum1 = c->u.byte_array_stop.content_id;
3982
0
        break;
3983
3984
0
    case E_NULL:
3985
0
        bnum1 = -2;
3986
0
        break;
3987
3988
0
    default:
3989
0
        hts_log_error("Unknown codec type %d", c->codec);
3990
0
        bnum1 = -1;
3991
0
    }
3992
3993
0
    if (id2)
3994
0
        *id2 = bnum2;
3995
0
    return bnum1;
3996
0
}
3997
3998
3999
/*
4000
 * cram_codec structures are specialised for decoding or encoding.
4001
 * Unfortunately this makes turning a decoder into an encoder (such as
4002
 * when transcoding files) problematic.
4003
 *
4004
 * This function converts a cram decoder codec into an encoder version
4005
 * in-place (ie it modifiers the codec itself).
4006
 *
4007
 * Returns 0 on success;
4008
 *        -1 on failure.
4009
 */
4010
0
int cram_codec_decoder2encoder(cram_fd *fd, cram_codec *c) {
4011
0
    int j;
4012
4013
0
    switch (c->codec) {
4014
0
    case E_CONST_INT:
4015
0
    case E_CONST_BYTE:
4016
        // shares struct with decode
4017
0
        c->store = cram_const_encode_store;
4018
0
        break;
4019
4020
0
    case E_EXTERNAL:
4021
        // shares struct with decode
4022
0
        c->free = cram_external_encode_free;
4023
0
        c->store = cram_external_encode_store;
4024
0
        if (c->decode == cram_external_decode_int)
4025
0
            c->encode = cram_external_encode_int;
4026
0
        else if (c->decode == cram_external_decode_long)
4027
0
            c->encode = cram_external_encode_long;
4028
0
        else if (c->decode == cram_external_decode_char)
4029
0
            c->encode = cram_external_encode_char;
4030
0
        else if (c->decode == cram_external_decode_block)
4031
0
            c->encode = cram_external_encode_char;
4032
0
        else
4033
0
            return -1;
4034
0
        break;
4035
4036
0
    case E_VARINT_SIGNED:
4037
0
    case E_VARINT_UNSIGNED:
4038
        // shares struct with decode
4039
0
        c->free = cram_varint_encode_free;
4040
0
        c->store = cram_varint_encode_store;
4041
0
        if (c->decode == cram_varint_decode_int)
4042
0
            c->encode = cram_varint_encode_int;
4043
0
        else if (c->decode == cram_varint_decode_sint)
4044
0
            c->encode = cram_varint_encode_sint;
4045
0
        else if (c->decode == cram_varint_decode_long)
4046
0
            c->encode = cram_varint_encode_long;
4047
0
        else if (c->decode == cram_varint_decode_slong)
4048
0
            c->encode = cram_varint_encode_slong;
4049
0
        else
4050
0
            return -1;
4051
0
        break;
4052
4053
0
    case E_HUFFMAN: {
4054
        // New structure, so switch.
4055
        // FIXME: we huffman and e_huffman structs amended, we could
4056
        // unify this.
4057
0
        cram_codec *t = malloc(sizeof(*t));
4058
0
        if (!t) return -1;
4059
0
        t->vv     = c->vv;
4060
0
        t->codec = E_HUFFMAN;
4061
0
        t->free = cram_huffman_encode_free;
4062
0
        t->store = cram_huffman_encode_store;
4063
0
        t->u.e_huffman.codes = c->u.huffman.codes;
4064
0
        t->u.e_huffman.nvals = c->u.huffman.ncodes;
4065
0
        t->u.e_huffman.option = c->u.huffman.option;
4066
0
        for (j = 0; j < t->u.e_huffman.nvals; j++) {
4067
0
            int32_t sym = t->u.e_huffman.codes[j].symbol;
4068
0
            if (sym >= -1 && sym < MAX_HUFF)
4069
0
                t->u.e_huffman.val2code[sym+1] = j;
4070
0
        }
4071
4072
0
        if (c->decode == cram_huffman_decode_char0)
4073
0
            t->encode = cram_huffman_encode_char0;
4074
0
        else if (c->decode == cram_huffman_decode_char)
4075
0
            t->encode = cram_huffman_encode_char;
4076
0
        else if (c->decode == cram_huffman_decode_int0)
4077
0
            t->encode = cram_huffman_encode_int0;
4078
0
        else if (c->decode == cram_huffman_decode_int)
4079
0
            t->encode = cram_huffman_encode_int;
4080
0
        else if (c->decode == cram_huffman_decode_long0)
4081
0
            t->encode = cram_huffman_encode_long0;
4082
0
        else if (c->decode == cram_huffman_decode_long)
4083
0
            t->encode = cram_huffman_encode_long;
4084
0
        else {
4085
0
            free(t);
4086
0
            return -1;
4087
0
        }
4088
0
        *c = *t;
4089
0
        free(t);
4090
0
        break;
4091
0
    }
4092
4093
0
    case E_BETA:
4094
        // shares struct with decode
4095
0
        c->free = cram_beta_encode_free;
4096
0
        c->store = cram_beta_encode_store;
4097
0
        if (c->decode == cram_beta_decode_int)
4098
0
            c->encode = cram_beta_encode_int;
4099
0
        else if (c->decode == cram_beta_decode_long)
4100
0
            c->encode = cram_beta_encode_long;
4101
0
        else if (c->decode == cram_beta_decode_char)
4102
0
            c->encode = cram_beta_encode_char;
4103
0
        else
4104
0
            return -1;
4105
0
        break;
4106
4107
0
    case E_XPACK: {
4108
        // shares struct with decode
4109
0
        cram_codec t = *c;
4110
0
        t.free = cram_xpack_encode_free;
4111
0
        t.store = cram_xpack_encode_store;
4112
0
        if (t.decode == cram_xpack_decode_long)
4113
0
            t.encode = cram_xpack_encode_long;
4114
0
        else if (t.decode == cram_xpack_decode_int)
4115
0
            t.encode = cram_xpack_encode_int;
4116
0
        else if (t.decode == cram_xpack_decode_char)
4117
0
            t.encode = cram_xpack_encode_char;
4118
0
        else
4119
0
            return -1;
4120
0
        t.u.e_xpack.sub_codec = t.u.xpack.sub_codec;
4121
0
        if (cram_codec_decoder2encoder(fd, t.u.e_xpack.sub_codec) == -1)
4122
0
            return -1;
4123
0
        *c = t;
4124
0
        break;
4125
0
    }
4126
4127
0
    case E_BYTE_ARRAY_LEN: {
4128
0
        cram_codec *t = malloc(sizeof(*t));
4129
0
        if (!t) return -1;
4130
0
        t->vv     = c->vv;
4131
0
        t->codec  = E_BYTE_ARRAY_LEN;
4132
0
        t->free   = cram_byte_array_len_encode_free;
4133
0
        t->store  = cram_byte_array_len_encode_store;
4134
0
        t->encode = cram_byte_array_len_encode;
4135
0
        t->u.e_byte_array_len.len_codec = c->u.byte_array_len.len_codec;
4136
0
        t->u.e_byte_array_len.val_codec = c->u.byte_array_len.val_codec;
4137
0
        if (cram_codec_decoder2encoder(fd, t->u.e_byte_array_len.len_codec) == -1 ||
4138
0
            cram_codec_decoder2encoder(fd, t->u.e_byte_array_len.val_codec) == -1) {
4139
0
            t->free(t);
4140
0
            return -1;
4141
0
        }
4142
4143
        // {len,val}_{encoding,dat} are undefined, but unused.
4144
        // Leaving them unset here means we can test that assertion.
4145
0
        *c = *t;
4146
0
        free(t);
4147
0
        break;
4148
0
    }
4149
4150
0
    case E_BYTE_ARRAY_STOP:
4151
        // shares struct with decode
4152
0
        c->free   = cram_byte_array_stop_encode_free;
4153
0
        c->store  = cram_byte_array_stop_encode_store;
4154
0
        c->encode = cram_byte_array_stop_encode;
4155
0
        break;
4156
4157
0
    default:
4158
0
        return -1;
4159
0
    }
4160
4161
0
    return 0;
4162
0
}
4163
4164
0
int cram_codec_describe(cram_codec *c, kstring_t *ks) {
4165
0
    if (c && c->describe)
4166
0
        return c->describe(c, ks);
4167
0
    else
4168
0
        return ksprintf(ks, "?");
4169
0
}