Coverage Report

Created: 2025-11-16 06:31

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/htslib/cram/cram_codecs.c
Line
Count
Source
1
/*
2
Copyright (c) 2012-2021,2023 Genome Research Ltd.
3
Author: James Bonfield <jkb@sanger.ac.uk>
4
5
Redistribution and use in source and binary forms, with or without
6
modification, are permitted provided that the following conditions are met:
7
8
   1. Redistributions of source code must retain the above copyright notice,
9
this list of conditions and the following disclaimer.
10
11
   2. Redistributions in binary form must reproduce the above copyright notice,
12
this list of conditions and the following disclaimer in the documentation
13
and/or other materials provided with the distribution.
14
15
   3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
16
Institute nor the names of its contributors may be used to endorse or promote
17
products derived from this software without specific prior written permission.
18
19
THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
20
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
23
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
*/
30
31
/*
32
 * FIXME: add checking of cram_external_type to return NULL on unsupported
33
 * {codec,type} tuples.
34
 */
35
36
#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
37
#include <config.h>
38
39
#include <stdlib.h>
40
#include <string.h>
41
#include <assert.h>
42
#include <limits.h>
43
#include <stdint.h>
44
#include <errno.h>
45
#include <stddef.h>
46
47
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
48
#include "../fuzz_settings.h"
49
#endif
50
51
#include "../htslib/hts_endian.h"
52
53
#if defined(HAVE_EXTERNAL_LIBHTSCODECS)
54
#include <htscodecs/varint.h>
55
#include <htscodecs/pack.h>
56
#include <htscodecs/rle.h>
57
#else
58
#include "../htscodecs/htscodecs/varint.h"
59
#include "../htscodecs/htscodecs/pack.h"
60
#include "../htscodecs/htscodecs/rle.h"
61
#endif
62
63
#include "cram.h"
64
65
/*
66
 * ---------------------------------------------------------------------------
67
 * Block bit-level I/O functions.
68
 * All defined static here to promote easy inlining by the compiler.
69
 */
70
71
#if 0
72
/* Get a single bit, MSB first */
73
static signed int get_bit_MSB(cram_block *block) {
74
    unsigned int val;
75
76
    if (block->byte > block->alloc)
77
        return -1;
78
79
    val = block->data[block->byte] >> block->bit;
80
    if (--block->bit == -1) {
81
        block->bit = 7;
82
        block->byte++;
83
        //printf("(%02X)", block->data[block->byte]);
84
    }
85
86
    //printf("-B%d-", val&1);
87
88
    return val & 1;
89
}
90
#endif
91
92
/*
93
 * Count number of successive 0 and 1 bits
94
 */
95
0
static int get_one_bits_MSB(cram_block *block) {
96
0
    int n = 0, b;
97
0
    if (block->byte >= block->uncomp_size)
98
0
        return -1;
99
0
    do {
100
0
        b = block->data[block->byte] >> block->bit;
101
0
        if (--block->bit == -1) {
102
0
            block->bit = 7;
103
0
            block->byte++;
104
0
            if (block->byte == block->uncomp_size && (b&1))
105
0
                return -1;
106
0
        }
107
0
        n++;
108
0
    } while (b&1);
109
110
0
    return n-1;
111
0
}
112
113
0
static int get_zero_bits_MSB(cram_block *block) {
114
0
    int n = 0, b;
115
0
    if (block->byte >= block->uncomp_size)
116
0
        return -1;
117
0
    do {
118
0
        b = block->data[block->byte] >> block->bit;
119
0
        if (--block->bit == -1) {
120
0
            block->bit = 7;
121
0
            block->byte++;
122
0
            if (block->byte == block->uncomp_size && !(b&1))
123
0
                return -1;
124
0
        }
125
0
        n++;
126
0
    } while (!(b&1));
127
128
0
    return n-1;
129
0
}
130
131
#if 0
132
/* Stores a single bit */
133
static void store_bit_MSB(cram_block *block, unsigned int bit) {
134
    if (block->byte >= block->alloc) {
135
        block->alloc = block->alloc ? block->alloc*2 : 1024;
136
        block->data = realloc(block->data, block->alloc);
137
    }
138
139
    if (bit)
140
        block->data[block->byte] |= (1 << block->bit);
141
142
    if (--block->bit == -1) {
143
        block->bit = 7;
144
        block->byte++;
145
        block->data[block->byte] = 0;
146
    }
147
}
148
#endif
149
150
#if 0
151
/* Rounds to the next whole byte boundary first */
152
static void store_bytes_MSB(cram_block *block, char *bytes, int len) {
153
    if (block->bit != 7) {
154
        block->bit = 7;
155
        block->byte++;
156
    }
157
158
    while (block->byte + len >= block->alloc) {
159
        block->alloc = block->alloc ? block->alloc*2 : 1024;
160
        block->data = realloc(block->data, block->alloc);
161
    }
162
163
    memcpy(&block->data[block->byte], bytes, len);
164
    block->byte += len;
165
}
166
#endif
167
168
/* Local optimised copy for inlining */
169
0
static inline int64_t get_bits_MSB(cram_block *block, int nbits) {
170
0
    uint64_t val = 0;
171
0
    int i;
172
173
#if 0
174
    // Fits within the current byte */
175
    if (nbits <= block->bit+1) {
176
        val = (block->data[block->byte]>>(block->bit-(nbits-1))) & ((1<<nbits)-1);
177
        if ((block->bit -= nbits) == -1) {
178
            block->bit = 7;
179
            block->byte++;
180
        }
181
        return val;
182
    }
183
184
    // partial first byte
185
    val = block->data[block->byte] & ((1<<(block->bit+1))-1);
186
    nbits -= block->bit+1;
187
    block->bit = 7;
188
    block->byte++;
189
190
    // whole middle bytes
191
    while (nbits >= 8) {
192
        val = (val << 8) | block->data[block->byte++];
193
        nbits -= 8;
194
    }
195
196
    val <<= nbits;
197
    val |= (block->data[block->byte]>>(block->bit-(nbits-1))) & ((1<<nbits)-1);
198
    block->bit -= nbits;
199
    return val;
200
#endif
201
202
#if 0
203
    /* Inefficient implementation! */
204
    //printf("{");
205
    for (i = 0; i < nbits; i++)
206
        //val = (val << 1) | get_bit_MSB(block);
207
        GET_BIT_MSB(block, val);
208
#endif
209
210
0
#if 1
211
    /* Combination of 1st two methods */
212
0
    if (nbits <= block->bit+1) {
213
0
        val = (block->data[block->byte]>>(block->bit-(nbits-1))) & ((1<<nbits)-1);
214
0
        if ((block->bit -= nbits) == -1) {
215
0
            block->bit = 7;
216
0
            block->byte++;
217
0
        }
218
0
        return val;
219
0
    }
220
221
0
    switch(nbits) {
222
//  case 15: GET_BIT_MSB(block, val); // fall through
223
//  case 14: GET_BIT_MSB(block, val); // fall through
224
//  case 13: GET_BIT_MSB(block, val); // fall through
225
//  case 12: GET_BIT_MSB(block, val); // fall through
226
//  case 11: GET_BIT_MSB(block, val); // fall through
227
//  case 10: GET_BIT_MSB(block, val); // fall through
228
//  case  9: GET_BIT_MSB(block, val); // fall through
229
0
    case  8: GET_BIT_MSB(block, val); // fall through
230
0
    case  7: GET_BIT_MSB(block, val); // fall through
231
0
    case  6: GET_BIT_MSB(block, val); // fall through
232
0
    case  5: GET_BIT_MSB(block, val); // fall through
233
0
    case  4: GET_BIT_MSB(block, val); // fall through
234
0
    case  3: GET_BIT_MSB(block, val); // fall through
235
0
    case  2: GET_BIT_MSB(block, val); // fall through
236
0
    case  1: GET_BIT_MSB(block, val);
237
0
        break;
238
239
0
    default:
240
0
        for (i = 0; i < nbits; i++)
241
            //val = (val << 1) | get_bit_MSB(block);
242
0
            GET_BIT_MSB(block, val);
243
0
    }
244
0
#endif
245
246
    //printf("=0x%x}", val);
247
248
0
    return val;
249
0
}
250
251
/*
252
 * Can store up to 24-bits worth of data encoded in an integer value
253
 * Possibly we'd want to have a less optimal store_bits function when dealing
254
 * with nbits > 24, but for now we assume the codes generated are never
255
 * that big. (Given this is only possible with 121392 or more
256
 * characters with exactly the correct frequency distribution we check
257
 * for it elsewhere.)
258
 */
259
3.59k
static int store_bits_MSB(cram_block *block, uint64_t val, int nbits) {
260
    //fprintf(stderr, " store_bits: %02x %d\n", val, nbits);
261
262
    /*
263
     * Use slow mode until we tweak the huffman generator to never generate
264
     * codes longer than 24-bits.
265
     */
266
3.59k
    unsigned int mask;
267
268
3.59k
    if (block->byte+8 >= block->alloc) {
269
214
        if (block->byte) {
270
0
            block->alloc *= 2;
271
0
            block->data = realloc(block->data, block->alloc + 8);
272
0
            if (!block->data)
273
0
                return -1;
274
214
        } else {
275
214
            block->alloc = 1024;
276
214
            block->data = realloc(block->data, block->alloc + 8);
277
214
            if (!block->data)
278
0
                return -1;
279
214
            block->data[0] = 0; // initialise first byte of buffer
280
214
        }
281
214
    }
282
283
    /* fits in current bit-field */
284
3.59k
    if (nbits <= block->bit+1) {
285
1.83k
        block->data[block->byte] |= (val << (block->bit+1-nbits));
286
1.83k
        if ((block->bit-=nbits) == -1) {
287
495
            block->bit = 7;
288
495
            block->byte++;
289
495
            block->data[block->byte] = 0;
290
495
        }
291
1.83k
        return 0;
292
1.83k
    }
293
294
1.75k
    block->data[block->byte] |= (val >> (nbits -= block->bit+1));
295
1.75k
    block->bit = 7;
296
1.75k
    block->byte++;
297
1.75k
    block->data[block->byte] = 0;
298
299
1.75k
    mask = 1<<(nbits-1);
300
7.05k
    do {
301
7.05k
        if (val & mask)
302
1.95k
            block->data[block->byte] |= (1 << block->bit);
303
7.05k
        if (--block->bit == -1) {
304
141
            block->bit = 7;
305
141
            block->byte++;
306
141
            block->data[block->byte] = 0;
307
141
        }
308
7.05k
        mask >>= 1;
309
7.05k
    } while(--nbits);
310
311
1.75k
    return 0;
312
3.59k
}
313
314
/*
315
 * Returns the next 'size' bytes from a block, or NULL if insufficient
316
 * data left.This is just a pointer into the block data and not an
317
 * allocated object, so do not free the result.
318
 */
319
0
static char *cram_extract_block(cram_block *b, int size) {
320
0
    char *cp = (char *)b->data + b->idx;
321
0
    b->idx += size;
322
0
    if (b->idx > b->uncomp_size)
323
0
        return NULL;
324
325
0
    return cp;
326
0
}
327
328
/*
329
 * ---------------------------------------------------------------------------
330
 * EXTERNAL
331
 *
332
 * In CRAM 3.0 and earlier, E_EXTERNAL use the data type to determine the
333
 * size of the object being returned.  This type is hard coded in the
334
 * spec document (changing from uint32 to uint64 requires a spec change)
335
 * and there is no data format introspection so implementations have
336
 * to determine which size to use based on version numbers.   It also
337
 * doesn't support signed data.
338
 *
339
 * With CRAM 4.0 onwards the size and sign of the data is no longer stated
340
 * explicitly in the specification.  Instead EXTERNAL is replaced by three
341
 * new encodings, for bytes and signed / unsigned integers which used a
342
 * variable sized encoding.
343
 *
344
 * For simplicity we use the same encode and decode functions for
345
 * bytes (CRAM4) and external (CRAM3). Given we already had code to
346
 * replace codec + type into a function pointer it makes little
347
 * difference how we ended up at that function.  However we disallow
348
 * this codec to operate on integer data for CRAM4 onwards.
349
 */
350
int cram_external_decode_int(cram_slice *slice, cram_codec *c,
351
0
                             cram_block *in, char *out, int *out_size) {
352
0
    char *cp;
353
0
    cram_block *b;
354
355
    /* Find the external block */
356
0
    b = cram_get_block_by_id(slice, c->u.external.content_id);
357
0
    if (!b)
358
0
        return *out_size?-1:0;
359
360
0
    cp = (char *)b->data + b->idx;
361
    // E_INT and E_LONG are guaranteed single item queries
362
0
    int err = 0;
363
0
    *(int32_t *)out = c->vv->varint_get32(&cp, (char *)b->data + b->uncomp_size, &err);
364
0
    b->idx = cp - (char *)b->data;
365
0
    *out_size = 1;
366
367
0
    return err ? -1 : 0;
368
0
}
369
370
int cram_external_decode_long(cram_slice *slice, cram_codec *c,
371
0
                              cram_block *in, char *out, int *out_size) {
372
0
    char *cp;
373
0
    cram_block *b;
374
375
    /* Find the external block */
376
0
    b = cram_get_block_by_id(slice, c->u.external.content_id);
377
0
    if (!b)
378
0
        return *out_size?-1:0;
379
380
0
    cp = (char *)b->data + b->idx;
381
    // E_INT and E_LONG are guaranteed single item queries
382
0
    int err = 0;
383
0
    *(int64_t *)out = c->vv->varint_get64(&cp, (char *)b->data + b->uncomp_size, &err);
384
0
    b->idx = cp - (char *)b->data;
385
0
    *out_size = 1;
386
387
0
    return err ? -1 : 0;
388
0
}
389
390
int cram_external_decode_char(cram_slice *slice, cram_codec *c,
391
                              cram_block *in, char *out,
392
0
                              int *out_size) {
393
0
    char *cp;
394
0
    cram_block *b;
395
396
    /* Find the external block */
397
0
    b = cram_get_block_by_id(slice, c->u.external.content_id);
398
0
    if (!b)
399
0
        return *out_size?-1:0;
400
401
0
    cp = cram_extract_block(b, *out_size);
402
0
    if (!cp)
403
0
        return -1;
404
405
0
    if (out)
406
0
        memcpy(out, cp, *out_size);
407
0
    return 0;
408
0
}
409
410
static int cram_external_decode_block(cram_slice *slice, cram_codec *c,
411
                                      cram_block *in, char *out_,
412
0
                                      int *out_size) {
413
0
    char *cp;
414
0
    cram_block *out = (cram_block *)out_;
415
0
    cram_block *b = NULL;
416
417
    /* Find the external block */
418
0
    b = cram_get_block_by_id(slice, c->u.external.content_id);
419
0
    if (!b)
420
0
        return *out_size?-1:0;
421
422
0
    cp = cram_extract_block(b, *out_size);
423
0
    if (!cp)
424
0
        return -1;
425
426
0
    BLOCK_APPEND(out, cp, *out_size);
427
0
    return 0;
428
429
0
 block_err:
430
0
    return -1;
431
0
}
432
433
1.03k
void cram_external_decode_free(cram_codec *c) {
434
1.03k
    if (c)
435
1.03k
        free(c);
436
1.03k
}
437
438
439
0
int cram_external_decode_size(cram_slice *slice, cram_codec *c) {
440
0
    cram_block *b;
441
442
    /* Find the external block */
443
0
    b = cram_get_block_by_id(slice, c->u.external.content_id);
444
0
    if (!b)
445
0
        return -1;
446
447
0
    return b->uncomp_size;
448
0
}
449
450
0
cram_block *cram_external_get_block(cram_slice *slice, cram_codec *c) {
451
0
    return cram_get_block_by_id(slice, c->u.external.content_id);
452
0
}
453
454
0
int cram_external_describe(cram_codec *c, kstring_t *ks) {
455
0
    return ksprintf(ks, "EXTERNAL(id=%d)",
456
0
                    c->u.external.content_id) < 0 ? -1 : 0;
457
0
}
458
459
cram_codec *cram_external_decode_init(cram_block_compression_hdr *hdr,
460
                                      char *data, int size,
461
                                      enum cram_encoding codec,
462
                                      enum cram_external_type option,
463
1.04k
                                      int version, varint_vec *vv) {
464
1.04k
    cram_codec *c = NULL;
465
1.04k
    char *cp = data;
466
467
1.04k
    if (size < 1)
468
0
        goto malformed;
469
470
1.04k
    if (!(c = malloc(sizeof(*c))))
471
0
        return NULL;
472
473
1.04k
    c->codec  = E_EXTERNAL;
474
1.04k
    if (CRAM_MAJOR_VERS(version) >= 4) {
475
        // Version 4 does not permit integer data to be encoded as a
476
        // series of bytes.  This is used purely for bytes, either
477
        // singular or declared as arrays
478
0
        switch (codec) {
479
0
        case E_EXTERNAL:
480
0
            if (option == E_BYTE_ARRAY_BLOCK)
481
0
                c->decode = cram_external_decode_block;
482
0
            else if (option == E_BYTE || option == E_BYTE_ARRAY)
483
0
                c->decode = cram_external_decode_char;
484
0
            else
485
0
                goto malformed;
486
0
            break;
487
0
        default:
488
0
            goto malformed;
489
0
        }
490
1.04k
    } else {
491
        // CRAM 3 and earlier encodes integers as EXTERNAL.  We need
492
        // use the option field to indicate the input data format so
493
        // we know which serialisation format to use.
494
1.04k
        if (option == E_INT)
495
858
            c->decode = cram_external_decode_int;
496
183
        else if (option == E_LONG)
497
0
            c->decode = cram_external_decode_long;
498
183
        else if (option == E_BYTE_ARRAY || option == E_BYTE)
499
9
            c->decode = cram_external_decode_char;
500
174
        else
501
174
            c->decode = cram_external_decode_block;
502
1.04k
    }
503
1.04k
    c->free   = cram_external_decode_free;
504
1.04k
    c->size   = cram_external_decode_size;
505
1.04k
    c->get_block = cram_external_get_block;
506
1.04k
    c->describe = cram_external_describe;
507
508
1.04k
    c->u.external.content_id = vv->varint_get32(&cp, data+size, NULL);
509
510
1.04k
    if (cp - data != size)
511
6
        goto malformed;
512
513
1.03k
    c->u.external.type = option;
514
515
1.03k
    return c;
516
517
6
 malformed:
518
6
    hts_log_error("Malformed external header stream");
519
6
    free(c);
520
6
    return NULL;
521
1.04k
}
522
523
int cram_external_encode_int(cram_slice *slice, cram_codec *c,
524
7.77M
                             char *in, int in_size) {
525
7.77M
    uint32_t *i32 = (uint32_t *)in;
526
7.77M
    return c->vv->varint_put32_blk(c->out, *i32) >= 0 ? 0 : -1;
527
7.77M
}
528
529
int cram_external_encode_sint(cram_slice *slice, cram_codec *c,
530
0
                             char *in, int in_size) {
531
0
    int32_t *i32 = (int32_t *)in;
532
0
    return c->vv->varint_put32s_blk(c->out, *i32) >= 0 ? 0 : -1;
533
0
}
534
535
int cram_external_encode_long(cram_slice *slice, cram_codec *c,
536
0
                             char *in, int in_size) {
537
0
    uint64_t *i64 = (uint64_t *)in;
538
0
    return c->vv->varint_put64_blk(c->out, *i64) >= 0 ? 0 : -1;
539
0
}
540
541
int cram_external_encode_slong(cram_slice *slice, cram_codec *c,
542
0
                               char *in, int in_size) {
543
0
    int64_t *i64 = (int64_t *)in;
544
0
    return c->vv->varint_put64s_blk(c->out, *i64) >= 0 ? 0 : -1;
545
0
}
546
547
int cram_external_encode_char(cram_slice *slice, cram_codec *c,
548
138k
                              char *in, int in_size) {
549
138k
    BLOCK_APPEND(c->out, in, in_size);
550
138k
    return 0;
551
552
0
 block_err:
553
0
    return -1;
554
138k
}
555
556
90.1k
void cram_external_encode_free(cram_codec *c) {
557
90.1k
    if (!c)
558
0
        return;
559
90.1k
    free(c);
560
90.1k
}
561
562
int cram_external_encode_store(cram_codec *c, cram_block *b, char *prefix,
563
89.9k
                               int version) {
564
89.9k
    char tmp[99], *tp = tmp, *tpend = tmp+99;
565
89.9k
    int len = 0, r = 0, n;
566
567
89.9k
    if (prefix) {
568
25.1k
        size_t l = strlen(prefix);
569
25.1k
        BLOCK_APPEND(b, prefix, l);
570
25.1k
        len += l;
571
25.1k
    }
572
573
89.9k
    tp += c->vv->varint_put32(tp, tpend, c->u.e_external.content_id);
574
89.9k
    len += (n = c->vv->varint_put32_blk(b, c->codec)); r |= n;
575
89.9k
    len += (n = c->vv->varint_put32_blk(b, tp-tmp));   r |= n;
576
89.9k
    BLOCK_APPEND(b, tmp, tp-tmp);
577
89.9k
    len += tp-tmp;
578
579
89.9k
    if (r > 0)
580
89.9k
        return len;
581
582
0
 block_err:
583
0
    return -1;
584
89.9k
}
585
586
cram_codec *cram_external_encode_init(cram_stats *st,
587
                                      enum cram_encoding codec,
588
                                      enum cram_external_type option,
589
                                      void *dat,
590
90.1k
                                      int version, varint_vec *vv) {
591
90.1k
    cram_codec *c;
592
593
90.1k
    c = malloc(sizeof(*c));
594
90.1k
    if (!c)
595
0
        return NULL;
596
90.1k
    c->codec = E_EXTERNAL;
597
90.1k
    c->free = cram_external_encode_free;
598
90.1k
    if (CRAM_MAJOR_VERS(version) >= 4) {
599
        // Version 4 does not permit integer data to be encoded as a
600
        // series of bytes.  This is used purely for bytes, either
601
        // singular or declared as arrays
602
0
        switch (codec) {
603
0
        case E_EXTERNAL:
604
0
            if (option != E_BYTE && option != E_BYTE_ARRAY)
605
0
                return NULL;
606
0
            c->encode = cram_external_encode_char;
607
0
            break;
608
0
        default:
609
0
            return NULL;
610
0
        }
611
90.1k
    } else {
612
        // CRAM 3 and earlier encodes integers as EXTERNAL.  We need
613
        // use the option field to indicate the input data format so
614
        // we know which serialisation format to use.
615
90.1k
        if (option == E_INT)
616
23.6k
            c->encode = cram_external_encode_int;
617
66.4k
        else if (option == E_LONG)
618
0
            c->encode = cram_external_encode_long;
619
66.4k
        else if (option == E_BYTE_ARRAY || option == E_BYTE)
620
66.4k
            c->encode = cram_external_encode_char;
621
0
        else
622
0
            abort();
623
90.1k
    }
624
90.1k
    c->store = cram_external_encode_store;
625
90.1k
    c->flush = NULL;
626
627
90.1k
    c->u.e_external.content_id = (size_t)dat;
628
629
90.1k
    return c;
630
90.1k
}
631
632
/*
633
 * ---------------------------------------------------------------------------
634
 * VARINT
635
 *
636
 * In CRAM 3.0 and earlier, E_EXTERNAL stored both integers in ITF8
637
 * format as well as bytes.  In CRAM 4 EXTERNAL is only for bytes and
638
 * byte arrays, with two dedicated encodings for integers:
639
 * VARINT_SIGNED and VARINT_UNSIGNED.  These also differ a little to
640
 * EXTERNAL with the addition of an offset field, meaning we can store
641
 * values in, say, the range -2 to 1 million without needing to use
642
 * a signed zig-zag transformation.
643
 */
644
int cram_varint_decode_int(cram_slice *slice, cram_codec *c,
645
0
                           cram_block *in, char *out, int *out_size) {
646
0
    char *cp;
647
0
    cram_block *b;
648
649
    /* Find the data block */
650
0
    b = cram_get_block_by_id(slice, c->u.varint.content_id);
651
0
    if (!b)
652
0
        return *out_size?-1:0;
653
654
0
    cp = (char *)b->data + b->idx;
655
    // E_INT and E_LONG are guaranteed single item queries
656
0
    int err = 0;
657
0
    *(int32_t *)out = c->vv->varint_get32(&cp,
658
0
                                          (char *)b->data + b->uncomp_size,
659
0
                                          &err) + c->u.varint.offset;
660
0
    b->idx = cp - (char *)b->data;
661
0
    *out_size = 1;
662
663
0
    return err ? -1 : 0;
664
0
}
665
666
int cram_varint_decode_sint(cram_slice *slice, cram_codec *c,
667
0
                            cram_block *in, char *out, int *out_size) {
668
0
    char *cp;
669
0
    cram_block *b;
670
671
    /* Find the data block */
672
0
    b = cram_get_block_by_id(slice, c->u.varint.content_id);
673
0
    if (!b)
674
0
        return *out_size?-1:0;
675
676
0
    cp = (char *)b->data + b->idx;
677
    // E_INT and E_LONG are guaranteed single item queries
678
0
    int err = 0;
679
0
    *(int32_t *)out = c->vv->varint_get32s(&cp,
680
0
                                           (char *)b->data + b->uncomp_size,
681
0
                                           &err) + c->u.varint.offset;
682
0
    b->idx = cp - (char *)b->data;
683
0
    *out_size = 1;
684
685
0
    return err ? -1 : 0;
686
0
}
687
688
int cram_varint_decode_long(cram_slice *slice, cram_codec *c,
689
0
                            cram_block *in, char *out, int *out_size) {
690
0
    char *cp;
691
0
    cram_block *b;
692
693
    /* Find the data block */
694
0
    b = cram_get_block_by_id(slice, c->u.varint.content_id);
695
0
    if (!b)
696
0
        return *out_size?-1:0;
697
698
0
    cp = (char *)b->data + b->idx;
699
    // E_INT and E_LONG are guaranteed single item queries
700
0
    int err = 0;
701
0
    *(int64_t *)out = c->vv->varint_get64(&cp,
702
0
                                          (char *)b->data + b->uncomp_size,
703
0
                                          &err) + c->u.varint.offset;
704
0
    b->idx = cp - (char *)b->data;
705
0
    *out_size = 1;
706
707
0
    return err ? -1 : 0;
708
0
}
709
710
int cram_varint_decode_slong(cram_slice *slice, cram_codec *c,
711
0
                             cram_block *in, char *out, int *out_size) {
712
0
    char *cp;
713
0
    cram_block *b;
714
715
    /* Find the data block */
716
0
    b = cram_get_block_by_id(slice, c->u.varint.content_id);
717
0
    if (!b)
718
0
        return *out_size?-1:0;
719
720
0
    cp = (char *)b->data + b->idx;
721
    // E_INT and E_LONG are guaranteed single item queries
722
0
    int err = 0;
723
0
    *(int64_t *)out = c->vv->varint_get64s(&cp,
724
0
                                           (char *)b->data + b->uncomp_size,
725
0
                                           &err) + c->u.varint.offset;
726
0
    b->idx = cp - (char *)b->data;
727
0
    *out_size = 1;
728
729
0
    return err ? -1 : 0;
730
0
}
731
732
269
void cram_varint_decode_free(cram_codec *c) {
733
269
    if (c)
734
269
        free(c);
735
269
}
736
737
0
int cram_varint_decode_size(cram_slice *slice, cram_codec *c) {
738
0
    cram_block *b;
739
740
    /* Find the data block */
741
0
    b = cram_get_block_by_id(slice, c->u.varint.content_id);
742
0
    if (!b)
743
0
        return -1;
744
745
0
    return b->uncomp_size;
746
0
}
747
748
0
cram_block *cram_varint_get_block(cram_slice *slice, cram_codec *c) {
749
0
    return cram_get_block_by_id(slice, c->u.varint.content_id);
750
0
}
751
752
0
int cram_varint_describe(cram_codec *c, kstring_t *ks) {
753
0
    return ksprintf(ks, "VARINT(id=%d,offset=%"PRId64",type=%d)",
754
0
                    c->u.varint.content_id,
755
0
                    c->u.varint.offset,
756
0
                    c->u.varint.type)
757
0
        < 0 ? -1 : 0;
758
0
}
759
760
cram_codec *cram_varint_decode_init(cram_block_compression_hdr *hdr,
761
                                    char *data, int size,
762
                                    enum cram_encoding codec,
763
                                    enum cram_external_type option,
764
269
                                    int version, varint_vec *vv) {
765
269
    cram_codec *c;
766
269
    char *cp = data, *cp_end = data+size;
767
768
269
    if (!(c = malloc(sizeof(*c))))
769
0
        return NULL;
770
771
269
    c->codec  = codec;
772
773
    // Function pointer choice is theoretically by codec type.
774
    // Given we have some vars as int32 and some as int64 we
775
    // use option too for sizing, although on disk format
776
    // does not change.
777
269
    switch(codec) {
778
213
    case E_VARINT_UNSIGNED:
779
213
        c->decode = (option == E_INT)
780
213
            ? cram_varint_decode_int
781
213
            : cram_varint_decode_long;
782
213
        break;
783
56
    case E_VARINT_SIGNED:
784
56
        c->decode = (option == E_INT)
785
56
            ? cram_varint_decode_sint
786
56
            : cram_varint_decode_slong;
787
56
        break;
788
0
    default:
789
0
        return NULL;
790
269
    }
791
792
269
    c->free   = cram_varint_decode_free;
793
269
    c->size   = cram_varint_decode_size;
794
269
    c->get_block = cram_varint_get_block;
795
269
    c->describe = cram_varint_describe;
796
797
269
    c->u.varint.content_id = vv->varint_get32 (&cp, cp_end, NULL);
798
269
    c->u.varint.offset     = vv->varint_get64s(&cp, cp_end, NULL);
799
800
269
    if (cp - data != size) {
801
0
        fprintf(stderr, "Malformed varint header stream\n");
802
0
        free(c);
803
0
        return NULL;
804
0
    }
805
806
269
    c->u.varint.type = option;
807
808
269
    return c;
809
269
}
810
811
int cram_varint_encode_int(cram_slice *slice, cram_codec *c,
812
0
                           char *in, int in_size) {
813
0
    uint32_t *i32 = (uint32_t *)in;
814
0
    return c->vv->varint_put32_blk(c->out, *i32 - c->u.varint.offset) >= 0
815
0
        ? 0 : -1;
816
0
}
817
818
int cram_varint_encode_sint(cram_slice *slice, cram_codec *c,
819
0
                            char *in, int in_size) {
820
0
    int32_t *i32 = (int32_t *)in;
821
0
    return c->vv->varint_put32s_blk(c->out, *i32 - c->u.varint.offset) >= 0
822
0
        ? 0 : -1;
823
0
}
824
825
int cram_varint_encode_long(cram_slice *slice, cram_codec *c,
826
0
                            char *in, int in_size) {
827
0
    uint64_t *i64 = (uint64_t *)in;
828
0
    return c->vv->varint_put64_blk(c->out, *i64 - c->u.varint.offset) >= 0
829
0
        ? 0 : -1;
830
0
}
831
832
int cram_varint_encode_slong(cram_slice *slice, cram_codec *c,
833
0
                             char *in, int in_size) {
834
0
    int64_t *i64 = (int64_t *)in;
835
0
    return c->vv->varint_put64s_blk(c->out, *i64 - c->u.varint.offset) >= 0
836
0
        ? 0 : -1;
837
0
}
838
839
0
void cram_varint_encode_free(cram_codec *c) {
840
0
    if (!c)
841
0
        return;
842
0
    free(c);
843
0
}
844
845
int cram_varint_encode_store(cram_codec *c, cram_block *b, char *prefix,
846
0
                             int version) {
847
0
    char tmp[99], *tp = tmp;
848
0
    int len = 0;
849
850
0
    if (prefix) {
851
0
        size_t l = strlen(prefix);
852
0
        BLOCK_APPEND(b, prefix, l);
853
0
        len += l;
854
0
    }
855
856
0
    tp += c->vv->varint_put32 (tp, NULL, c->u.e_varint.content_id);
857
0
    tp += c->vv->varint_put64s(tp, NULL, c->u.e_varint.offset);
858
0
    len += c->vv->varint_put32_blk(b, c->codec);
859
0
    len += c->vv->varint_put32_blk(b, tp-tmp);
860
0
    BLOCK_APPEND(b, tmp, tp-tmp);
861
0
    len += tp-tmp;
862
863
0
    return len;
864
865
0
 block_err:
866
0
    return -1;
867
0
}
868
869
cram_codec *cram_varint_encode_init(cram_stats *st,
870
                                    enum cram_encoding codec,
871
                                    enum cram_external_type option,
872
                                    void *dat,
873
0
                                    int version, varint_vec *vv) {
874
0
    cram_codec *c;
875
876
0
    if (!(c = malloc(sizeof(*c))))
877
0
        return NULL;
878
879
0
    c->u.e_varint.offset = 0;
880
0
    if (st) {
881
        // Marginal difference so far! Not worth the hassle?
882
0
        if (st->min_val < 0 && st->min_val >= -127
883
0
            && st->max_val / -st->min_val > 100) {
884
0
            c->u.e_varint.offset = -st->min_val;
885
0
            codec = E_VARINT_UNSIGNED;
886
0
        } else if (st->min_val > 0) {
887
0
            c->u.e_varint.offset = -st->min_val;
888
0
        }
889
0
    }
890
891
0
    c->codec = codec;
892
0
    c->free = cram_varint_encode_free;
893
894
    // Function pointer choice is theoretically by codec type.
895
    // Given we have some vars as int32 and some as int64 we
896
    // use option too for sizing, although on disk format
897
    // does not change.
898
0
    switch (codec) {
899
0
    case E_VARINT_UNSIGNED:
900
0
        c->encode = (option == E_INT)
901
0
            ? cram_varint_encode_int
902
0
            : cram_varint_encode_long;
903
0
        break;
904
0
    case E_VARINT_SIGNED:
905
0
        c->encode = (option == E_INT)
906
0
            ? cram_varint_encode_sint
907
0
            : cram_varint_encode_slong;
908
0
        break;
909
0
    default:
910
0
        return NULL;
911
0
    }
912
0
    c->store = cram_varint_encode_store;
913
0
    c->flush = NULL;
914
915
0
    c->u.e_varint.content_id = (size_t)dat;
916
917
0
    return c;
918
0
}
919
/*
920
 * ---------------------------------------------------------------------------
921
 * CONST_BYTE and CONST_INT
922
 */
923
int cram_const_decode_byte(cram_slice *slice, cram_codec *c,
924
0
                           cram_block *in, char *out, int *out_size) {
925
0
    int i, n;
926
927
0
    for (i = 0, n = *out_size; i < n; i++)
928
0
        out[i] = c->u.xconst.val;
929
930
0
    return 0;
931
0
}
932
933
int cram_const_decode_int(cram_slice *slice, cram_codec *c,
934
0
                          cram_block *in, char *out, int *out_size) {
935
0
    int32_t *out_i = (int32_t *)out;
936
0
    int i, n;
937
938
0
    for (i = 0, n = *out_size; i < n; i++)
939
0
        out_i[i] = c->u.xconst.val;
940
941
0
    return 0;
942
0
}
943
944
int cram_const_decode_long(cram_slice *slice, cram_codec *c,
945
0
                           cram_block *in, char *out, int *out_size) {
946
0
    int64_t *out_i = (int64_t *)out;
947
0
    int i, n;
948
949
0
    for (i = 0, n = *out_size; i < n; i++)
950
0
        out_i[i] = c->u.xconst.val;
951
952
0
    return 0;
953
0
}
954
955
132
void cram_const_decode_free(cram_codec *c) {
956
132
    if (c)
957
132
        free(c);
958
132
}
959
960
0
int cram_const_decode_size(cram_slice *slice, cram_codec *c) {
961
0
    return 0;
962
0
}
963
964
0
int cram_const_describe(cram_codec *c, kstring_t *ks) {
965
0
    return ksprintf(ks, "CONST(val=%"PRId64")",
966
0
                    c->u.xconst.val) < 0 ? -1 : 0;
967
0
}
968
969
cram_codec *cram_const_decode_init(cram_block_compression_hdr *hdr,
970
                                   char *data, int size,
971
                                   enum cram_encoding codec,
972
                                   enum cram_external_type option,
973
134
                                   int version, varint_vec *vv) {
974
134
    cram_codec *c;
975
134
    char *cp = data;
976
977
134
    if (!(c = malloc(sizeof(*c))))
978
0
        return NULL;
979
980
134
    c->codec  = codec;
981
134
    if (codec == E_CONST_BYTE)
982
2
        c->decode = cram_const_decode_byte;
983
132
    else if (option == E_INT)
984
72
        c->decode = cram_const_decode_int;
985
60
    else
986
60
        c->decode = cram_const_decode_long;
987
134
    c->free   = cram_const_decode_free;
988
134
    c->size   = cram_const_decode_size;
989
134
    c->get_block = NULL;
990
134
    c->describe = cram_const_describe;
991
992
134
    c->u.xconst.val = vv->varint_get64s(&cp, data+size, NULL);
993
994
134
    if (cp - data != size) {
995
2
        fprintf(stderr, "Malformed const header stream\n");
996
2
        free(c);
997
2
        return NULL;
998
2
    }
999
1000
132
    return c;
1001
134
}
1002
1003
int cram_const_encode(cram_slice *slice, cram_codec *c,
1004
0
                      char *in, int in_size) {
1005
0
    return 0;
1006
0
}
1007
1008
int cram_const_encode_store(cram_codec *c, cram_block *b, char *prefix,
1009
0
                            int version) {
1010
0
    char tmp[99], *tp = tmp;
1011
0
    int len = 0;
1012
1013
0
    if (prefix) {
1014
0
        size_t l = strlen(prefix);
1015
0
        BLOCK_APPEND(b, prefix, l);
1016
0
        len += l;
1017
0
    }
1018
1019
0
    tp += c->vv->varint_put64s(tp, NULL, c->u.xconst.val);
1020
0
    len += c->vv->varint_put32_blk(b, c->codec);
1021
0
    len += c->vv->varint_put32_blk(b, tp-tmp);
1022
0
    BLOCK_APPEND(b, tmp, tp-tmp);
1023
0
    len += tp-tmp;
1024
1025
0
    return len;
1026
1027
0
 block_err:
1028
0
    return -1;
1029
0
}
1030
1031
cram_codec *cram_const_encode_init(cram_stats *st,
1032
                                   enum cram_encoding codec,
1033
                                   enum cram_external_type option,
1034
                                   void *dat,
1035
0
                                   int version, varint_vec *vv) {
1036
0
    cram_codec *c;
1037
1038
0
    if (!(c = malloc(sizeof(*c))))
1039
0
        return NULL;
1040
1041
0
    c->codec = codec;
1042
0
    c->free = cram_const_decode_free; // as as decode
1043
0
    c->encode = cram_const_encode; // a nop
1044
0
    c->store = cram_const_encode_store;
1045
0
    c->flush = NULL;
1046
0
    c->u.e_xconst.val = st->min_val;
1047
1048
0
    return c;
1049
0
}
1050
1051
/*
1052
 * ---------------------------------------------------------------------------
1053
 * BETA
1054
 */
1055
0
int cram_beta_decode_long(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
1056
0
    int64_t *out_i = (int64_t *)out;
1057
0
    int i, n = *out_size;
1058
1059
0
    if (c->u.beta.nbits) {
1060
0
        if (cram_not_enough_bits(in, c->u.beta.nbits * n))
1061
0
            return -1;
1062
1063
0
        for (i = 0; i < n; i++)
1064
0
            out_i[i] = get_bits_MSB(in, c->u.beta.nbits) - c->u.beta.offset;
1065
0
    } else {
1066
0
        for (i = 0; i < n; i++)
1067
0
            out_i[i] = -c->u.beta.offset;
1068
0
    }
1069
1070
0
    return 0;
1071
0
}
1072
1073
0
int cram_beta_decode_int(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
1074
0
    int32_t *out_i = (int32_t *)out;
1075
0
    int i, n = *out_size;
1076
1077
0
    if (c->u.beta.nbits) {
1078
0
        if (cram_not_enough_bits(in, c->u.beta.nbits * n))
1079
0
            return -1;
1080
1081
0
        for (i = 0; i < n; i++)
1082
0
            out_i[i] = get_bits_MSB(in, c->u.beta.nbits) - c->u.beta.offset;
1083
0
    } else {
1084
0
        for (i = 0; i < n; i++)
1085
0
            out_i[i] = -c->u.beta.offset;
1086
0
    }
1087
1088
0
    return 0;
1089
0
}
1090
1091
0
int cram_beta_decode_char(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
1092
0
    int i, n = *out_size;
1093
1094
1095
0
    if (c->u.beta.nbits) {
1096
0
        if (cram_not_enough_bits(in, c->u.beta.nbits * n))
1097
0
            return -1;
1098
1099
0
        if (out)
1100
0
            for (i = 0; i < n; i++)
1101
0
                out[i] = get_bits_MSB(in, c->u.beta.nbits) - c->u.beta.offset;
1102
0
        else
1103
0
            for (i = 0; i < n; i++)
1104
0
                get_bits_MSB(in, c->u.beta.nbits);
1105
0
    } else {
1106
0
        if (out)
1107
0
            for (i = 0; i < n; i++)
1108
0
                out[i] = -c->u.beta.offset;
1109
0
    }
1110
1111
0
    return 0;
1112
0
}
1113
1114
15
void cram_beta_decode_free(cram_codec *c) {
1115
15
    if (c)
1116
15
        free(c);
1117
15
}
1118
1119
0
int cram_beta_describe(cram_codec *c, kstring_t *ks) {
1120
0
    return ksprintf(ks, "BETA(offset=%d, nbits=%d)",
1121
0
                    c->u.beta.offset, c->u.beta.nbits)
1122
0
        < 0 ? -1 : 0;
1123
0
}
1124
1125
cram_codec *cram_beta_decode_init(cram_block_compression_hdr *hdr,
1126
                                  char *data, int size,
1127
                                  enum cram_encoding codec,
1128
                                  enum cram_external_type option,
1129
15
                                  int version, varint_vec *vv) {
1130
15
    cram_codec *c;
1131
15
    char *cp = data;
1132
1133
15
    if (!(c = malloc(sizeof(*c))))
1134
0
        return NULL;
1135
1136
15
    c->codec  = E_BETA;
1137
15
    if (option == E_INT || option == E_SINT)
1138
9
        c->decode = cram_beta_decode_int;
1139
6
    else if (option == E_LONG || option == E_SLONG)
1140
0
        c->decode = cram_beta_decode_long;
1141
6
    else if (option == E_BYTE_ARRAY || option == E_BYTE)
1142
6
        c->decode = cram_beta_decode_char;
1143
0
    else {
1144
0
        hts_log_error("BYTE_ARRAYs not supported by this codec");
1145
0
        free(c);
1146
0
        return NULL;
1147
0
    }
1148
15
    c->free   = cram_beta_decode_free;
1149
15
    c->describe = cram_beta_describe;
1150
1151
15
    c->u.beta.nbits = -1;
1152
15
    c->u.beta.offset = vv->varint_get32(&cp, data + size, NULL);
1153
15
    if (cp < data + size) // Ensure test below works
1154
15
        c->u.beta.nbits  = vv->varint_get32(&cp, data + size, NULL);
1155
1156
15
    if (cp - data != size
1157
15
        || c->u.beta.nbits < 0 || c->u.beta.nbits > 8 * sizeof(int)) {
1158
0
        hts_log_error("Malformed beta header stream");
1159
0
        free(c);
1160
0
        return NULL;
1161
0
    }
1162
1163
15
    return c;
1164
15
}
1165
1166
int cram_beta_encode_store(cram_codec *c, cram_block *b,
1167
214
                           char *prefix, int version) {
1168
214
    int len = 0, r = 0, n;
1169
1170
214
    if (prefix) {
1171
214
        size_t l = strlen(prefix);
1172
214
        BLOCK_APPEND(b, prefix, l);
1173
214
        len += l;
1174
214
    }
1175
1176
214
    len += (n = c->vv->varint_put32_blk(b, c->codec)); r |= n;
1177
    // codec length
1178
214
    len += (n = c->vv->varint_put32_blk(b, c->vv->varint_size(c->u.e_beta.offset)
1179
214
                                         + c->vv->varint_size(c->u.e_beta.nbits)));
1180
214
    r |= n;
1181
214
    len += (n = c->vv->varint_put32_blk(b, c->u.e_beta.offset)); r |= n;
1182
214
    len += (n = c->vv->varint_put32_blk(b, c->u.e_beta.nbits));  r |= n;
1183
1184
214
    if (r > 0) return len;
1185
1186
0
 block_err:
1187
0
    return -1;
1188
214
}
1189
1190
int cram_beta_encode_long(cram_slice *slice, cram_codec *c,
1191
0
                          char *in, int in_size) {
1192
0
    int64_t *syms = (int64_t *)in;
1193
0
    int i, r = 0;
1194
1195
0
    for (i = 0; i < in_size; i++)
1196
0
        r |= store_bits_MSB(c->out, syms[i] + c->u.e_beta.offset,
1197
0
                            c->u.e_beta.nbits);
1198
1199
0
    return r;
1200
0
}
1201
1202
int cram_beta_encode_int(cram_slice *slice, cram_codec *c,
1203
3.59k
                         char *in, int in_size) {
1204
3.59k
    int *syms = (int *)in;
1205
3.59k
    int i, r = 0;
1206
1207
7.18k
    for (i = 0; i < in_size; i++)
1208
3.59k
        r |= store_bits_MSB(c->out, syms[i] + c->u.e_beta.offset,
1209
3.59k
                            c->u.e_beta.nbits);
1210
1211
3.59k
    return r;
1212
3.59k
}
1213
1214
int cram_beta_encode_char(cram_slice *slice, cram_codec *c,
1215
0
                          char *in, int in_size) {
1216
0
    unsigned char *syms = (unsigned char *)in;
1217
0
    int i, r = 0;
1218
1219
0
    for (i = 0; i < in_size; i++)
1220
0
        r |= store_bits_MSB(c->out, syms[i] + c->u.e_beta.offset,
1221
0
                            c->u.e_beta.nbits);
1222
1223
0
    return r;
1224
0
}
1225
1226
214
void cram_beta_encode_free(cram_codec *c) {
1227
214
    if (c) free(c);
1228
214
}
1229
1230
cram_codec *cram_beta_encode_init(cram_stats *st,
1231
                                  enum cram_encoding codec,
1232
                                  enum cram_external_type option,
1233
                                  void *dat,
1234
223
                                  int version, varint_vec *vv) {
1235
223
    cram_codec *c;
1236
223
    hts_pos_t min_val, max_val;
1237
223
    int len = 0;
1238
223
    int64_t range;
1239
1240
223
    c = malloc(sizeof(*c));
1241
223
    if (!c)
1242
0
        return NULL;
1243
223
    c->codec  = E_BETA;
1244
223
    c->free   = cram_beta_encode_free;
1245
223
    if (option == E_INT || option == E_SINT)
1246
223
        c->encode = cram_beta_encode_int;
1247
0
    else if (option == E_LONG || option == E_SLONG)
1248
0
        c->encode = cram_beta_encode_long;
1249
0
    else
1250
0
        c->encode = cram_beta_encode_char;
1251
223
    c->store  = cram_beta_encode_store;
1252
223
    c->flush = NULL;
1253
1254
223
    if (dat) {
1255
223
        min_val = ((hts_pos_t *)dat)[0];
1256
223
        max_val = ((hts_pos_t *)dat)[1];
1257
223
    } else {
1258
0
        min_val = INT_MAX;
1259
0
        max_val = INT_MIN;
1260
0
        int i;
1261
0
        for (i = 0; i < MAX_STAT_VAL; i++) {
1262
0
            if (!st->freqs[i])
1263
0
                continue;
1264
0
            if (min_val > i)
1265
0
                min_val = i;
1266
0
            max_val = i;
1267
0
        }
1268
0
        if (st->h) {
1269
0
            khint_t k;
1270
1271
0
            for (k = kh_begin(st->h); k != kh_end(st->h); k++) {
1272
0
                if (!kh_exist(st->h, k))
1273
0
                    continue;
1274
1275
0
                i = kh_key(st->h, k);
1276
0
                if (min_val > i)
1277
0
                    min_val = i;
1278
0
                if (max_val < i)
1279
0
                    max_val = i;
1280
0
            }
1281
0
        }
1282
0
    }
1283
1284
223
    if (max_val < min_val)
1285
0
        goto err;
1286
1287
223
    range = (int64_t) max_val - min_val;
1288
223
    switch (option) {
1289
0
    case E_SINT:
1290
0
        if (min_val < INT_MIN || range > INT_MAX)
1291
0
            goto err;
1292
0
        break;
1293
1294
223
    case E_INT:
1295
223
        if (max_val > UINT_MAX || range > UINT_MAX)
1296
9
            goto err;
1297
214
        break;
1298
1299
214
    default:
1300
0
        break;
1301
223
    }
1302
1303
214
    c->u.e_beta.offset = -min_val;
1304
1.04k
    while (range) {
1305
828
        len++;
1306
828
        range >>= 1;
1307
828
    }
1308
214
    c->u.e_beta.nbits = len;
1309
1310
214
    return c;
1311
1312
9
 err:
1313
9
    free(c);
1314
9
    return NULL;
1315
223
}
1316
1317
/*
1318
 * ---------------------------------------------------------------------------
1319
 * XPACK: Packing multiple values into a single byte.  A fast transform that
1320
 * reduces time taken by entropy encoder and may also improve compression.
1321
 *
1322
 * This also has the additional requirement that the data series is not
1323
 * interleaved with another, permitting efficient encoding and decoding
1324
 * of all elements enmasse instead of needing to only extract the bits
1325
 * necessary per item.
1326
 */
1327
0
int cram_xpack_decode_long(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
1328
0
    int64_t *out_i = (int64_t *)out;
1329
0
    int i, n = *out_size;
1330
1331
0
    if (c->u.xpack.nbits) {
1332
0
        for (i = 0; i < n; i++)
1333
0
            out_i[i] = c->u.xpack.rmap[get_bits_MSB(in, c->u.xpack.nbits)];
1334
0
    } else {
1335
0
        for (i = 0; i < n; i++)
1336
0
            out_i[i] = c->u.xpack.rmap[0];
1337
0
    }
1338
1339
0
    return 0;
1340
0
}
1341
1342
0
int cram_xpack_decode_int(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
1343
0
    int32_t *out_i = (int32_t *)out;
1344
0
    int i, n = *out_size;
1345
1346
0
    if (c->u.xpack.nbits) {
1347
0
        if (cram_not_enough_bits(in, c->u.xpack.nbits * n))
1348
0
            return -1;
1349
1350
0
        for (i = 0; i < n; i++)
1351
0
            out_i[i] = c->u.xpack.rmap[get_bits_MSB(in, c->u.xpack.nbits)];
1352
0
    } else {
1353
0
        for (i = 0; i < n; i++)
1354
0
            out_i[i] = c->u.xpack.rmap[0];
1355
0
    }
1356
1357
0
    return 0;
1358
0
}
1359
1360
0
static int cram_xpack_decode_expand_char(cram_slice *slice, cram_codec *c) {
1361
0
    cram_block *b = slice->block_by_id[512 + c->codec_id];
1362
0
    if (b)
1363
0
        return 0;
1364
1365
    // get sub-codec data.
1366
0
    cram_block *sub_b = c->u.xpack.sub_codec->get_block(slice, c->u.xpack.sub_codec);
1367
0
    if (!sub_b)
1368
0
        return -1;
1369
1370
    // Allocate local block to expand into
1371
0
    b = slice->block_by_id[512 + c->codec_id] = cram_new_block(0, 0);
1372
0
    if (!b)
1373
0
        return -1;
1374
0
    int n = sub_b->uncomp_size * 8/c->u.xpack.nbits;
1375
0
    BLOCK_GROW(b, n);
1376
0
    b->uncomp_size = n;
1377
1378
0
    uint8_t p[256];
1379
0
    int z;
1380
0
    for (z = 0; z < 256; z++)
1381
0
        p[z] = c->u.xpack.rmap[z];
1382
0
    hts_unpack(sub_b->data, sub_b->uncomp_size, b->data, b->uncomp_size,
1383
0
               8 / c->u.xpack.nbits, p);
1384
1385
0
    return 0;
1386
1387
0
 block_err:
1388
0
    return -1;
1389
0
}
1390
1391
0
int cram_xpack_decode_char(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
1392
    // FIXME: we need to ban data-series interleaving in the spec for this to work.
1393
1394
    // Remember this may be called when threaded and multi-slice per container.
1395
    // Hence one cram_codec instance, multiple slices, multiple blocks.
1396
    // We therefore have to cache appropriate block info in slice and not codec.
1397
    //    b = cram_get_block_by_id(slice, c->external.content_id);
1398
0
    if (c->u.xpack.nval > 1) {
1399
0
        cram_xpack_decode_expand_char(slice, c);
1400
0
        cram_block *b = slice->block_by_id[512 + c->codec_id];
1401
0
        if (!b)
1402
0
            return -1;
1403
1404
0
        if (out)
1405
0
            memcpy(out, b->data + b->byte, *out_size);
1406
0
        b->byte += *out_size;
1407
0
    } else {
1408
0
        memset(out, c->u.xpack.rmap[0], *out_size);
1409
0
    }
1410
1411
0
    return 0;
1412
0
}
1413
1414
261
void cram_xpack_decode_free(cram_codec *c) {
1415
261
    if (!c) return;
1416
1417
261
    if (c->u.xpack.sub_codec)
1418
249
        c->u.xpack.sub_codec->free(c->u.xpack.sub_codec);
1419
1420
    //free(slice->block_by_id[512 + c->codec_id]);
1421
    //slice->block_by_id[512 + c->codec_id] = 0;
1422
1423
261
    free(c);
1424
261
}
1425
1426
0
int cram_xpack_decode_size(cram_slice *slice, cram_codec *c) {
1427
0
    cram_xpack_decode_expand_char(slice, c);
1428
0
    return slice->block_by_id[512 + c->codec_id]->uncomp_size;
1429
0
}
1430
1431
0
cram_block *cram_xpack_get_block(cram_slice *slice, cram_codec *c) {
1432
0
    cram_xpack_decode_expand_char(slice, c);
1433
0
    return slice->block_by_id[512 + c->codec_id];
1434
0
}
1435
1436
cram_codec *cram_xpack_decode_init(cram_block_compression_hdr *hdr,
1437
                                   char *data, int size,
1438
                                   enum cram_encoding codec,
1439
                                   enum cram_external_type option,
1440
261
                                   int version, varint_vec *vv) {
1441
261
    cram_codec *c;
1442
261
    char *cp = data;
1443
261
    char *endp = data+size;
1444
1445
261
    if (!(c = calloc(1, sizeof(*c))))
1446
0
        return NULL;
1447
1448
261
    c->codec  = E_XPACK;
1449
261
    if (option == E_LONG)
1450
0
        c->decode = cram_xpack_decode_long;
1451
261
    else if (option == E_INT)
1452
147
        c->decode = cram_xpack_decode_int;
1453
114
    else if (option == E_BYTE_ARRAY || option == E_BYTE)
1454
114
        c->decode = cram_xpack_decode_char;
1455
0
    else {
1456
0
        fprintf(stderr, "BYTE_ARRAYs not supported by this codec\n");
1457
0
        goto malformed;
1458
0
    }
1459
261
    c->free = cram_xpack_decode_free;
1460
261
    c->size = cram_xpack_decode_size;
1461
261
    c->get_block = cram_xpack_get_block;
1462
261
    c->describe = NULL;
1463
1464
261
    c->u.xpack.nbits = vv->varint_get32(&cp, endp, NULL);
1465
261
    c->u.xpack.nval  = vv->varint_get32(&cp, endp, NULL);
1466
261
    if (c->u.xpack.nbits >= 8  || c->u.xpack.nbits < 0 ||
1467
261
        c->u.xpack.nval  > 256 || c->u.xpack.nval < 0)
1468
0
        goto malformed;
1469
261
    int i;
1470
957
    for (i = 0; i < c->u.xpack.nval; i++) {
1471
702
        uint32_t v = vv->varint_get32(&cp, endp, NULL);
1472
702
        if (v >= 256)
1473
6
            goto malformed;
1474
696
        c->u.xpack.rmap[i] = v; // reverse map: e.g 0-3 to P,A,C,K
1475
696
    }
1476
1477
255
    int encoding = vv->varint_get32(&cp, endp, NULL);
1478
255
    int sub_size = vv->varint_get32(&cp, endp, NULL);
1479
255
    if (sub_size < 0 || endp - cp < sub_size)
1480
3
        goto malformed;
1481
252
    c->u.xpack.sub_codec = cram_decoder_init(hdr, encoding, cp, sub_size,
1482
252
                                             option, version, vv);
1483
252
    if (c->u.xpack.sub_codec == NULL)
1484
3
        goto malformed;
1485
249
    cp += sub_size;
1486
1487
249
    if (cp - data != size
1488
249
        || c->u.xpack.nbits < 0 || c->u.xpack.nbits > 8 * sizeof(int64_t)) {
1489
12
    malformed:
1490
12
        fprintf(stderr, "Malformed xpack header stream\n");
1491
12
        cram_xpack_decode_free(c);
1492
12
        return NULL;
1493
0
    }
1494
1495
249
    return c;
1496
249
}
1497
1498
0
int cram_xpack_encode_flush(cram_codec *c) {
1499
    // Pack the buffered up data
1500
0
    int meta_len;
1501
0
    uint64_t out_len;
1502
0
    uint8_t out_meta[1024];
1503
0
    uint8_t *out = hts_pack(BLOCK_DATA(c->out), BLOCK_SIZE(c->out),
1504
0
                            out_meta, &meta_len, &out_len);
1505
1506
    // We now need to pass this through the next layer of transform
1507
0
    if (c->u.e_xpack.sub_codec->encode(NULL, // also indicates flush incoming
1508
0
                                     c->u.e_xpack.sub_codec,
1509
0
                                     (char *)out, out_len))
1510
0
        return -1;
1511
1512
0
    int r = 0;
1513
0
    if (c->u.e_xpack.sub_codec->flush)
1514
0
        r = c->u.e_xpack.sub_codec->flush(c->u.e_xpack.sub_codec);
1515
1516
0
    free(out);
1517
0
    return r;
1518
0
}
1519
1520
int cram_xpack_encode_store(cram_codec *c, cram_block *b,
1521
0
                            char *prefix, int version) {
1522
0
    int len = 0, r = 0, n;
1523
1524
0
    if (prefix) {
1525
0
        size_t l = strlen(prefix);
1526
0
        BLOCK_APPEND(b, prefix, l);
1527
0
        len += l;
1528
0
    }
1529
1530
    // Store sub-codec
1531
0
    cram_codec *tc = c->u.e_xpack.sub_codec;
1532
0
    cram_block *tb = cram_new_block(0, 0);
1533
0
    if (!tb)
1534
0
        return -1;
1535
0
    int len2 = tc->store(tc, tb, NULL, version);
1536
1537
0
    len += (n = c->vv->varint_put32_blk(b, c->codec)); r |= n;
1538
1539
    // codec length
1540
0
    int len1 = 0, i;
1541
0
    for (i = 0; i < c->u.e_xpack.nval; i++)
1542
0
        len1 += (n = c->vv->varint_size(c->u.e_xpack.rmap[i])), r |= n;
1543
0
    len += (n = c->vv->varint_put32_blk(b, c->vv->varint_size(c->u.e_xpack.nbits)
1544
0
                                        +  c->vv->varint_size(c->u.e_xpack.nval)
1545
0
                                        + len1 + len2)); r |= n;
1546
1547
    // The map and sub-codec
1548
0
    len += (n = c->vv->varint_put32_blk(b, c->u.e_xpack.nbits)); r |= n;
1549
0
    len += (n = c->vv->varint_put32_blk(b, c->u.e_xpack.nval));  r |= n;
1550
0
    for (i = 0; i < c->u.e_xpack.nval; i++)
1551
0
        len += (n = c->vv->varint_put32_blk(b, c->u.e_xpack.rmap[i])), r |= n;
1552
1553
0
    BLOCK_APPEND(b, BLOCK_DATA(tb), BLOCK_SIZE(tb));
1554
1555
0
    cram_free_block(tb);
1556
1557
0
    return r > 0 ? len + len2 : -1;
1558
1559
0
 block_err:
1560
0
    return -1;
1561
0
}
1562
1563
// Same as cram_beta_encode_long
1564
int cram_xpack_encode_long(cram_slice *slice, cram_codec *c,
1565
0
                           char *in, int in_size) {
1566
0
    int64_t *syms = (int64_t *)in;
1567
0
    int i, r = 0;
1568
1569
0
    for (i = 0; i < in_size; i++)
1570
0
        r |= store_bits_MSB(c->out, c->u.e_xpack.map[syms[i]], c->u.e_xpack.nbits);
1571
1572
0
    return r;
1573
0
}
1574
1575
int cram_xpack_encode_int(cram_slice *slice, cram_codec *c,
1576
0
                          char *in, int in_size) {
1577
0
    int *syms = (int *)in;
1578
0
    int i, r = 0;
1579
1580
0
    for (i = 0; i < in_size; i++)
1581
0
        r |= store_bits_MSB(c->out, c->u.e_xpack.map[syms[i]], c->u.e_xpack.nbits);
1582
1583
0
    return r;
1584
0
}
1585
1586
int cram_xpack_encode_char(cram_slice *slice, cram_codec *c,
1587
0
                           char *in, int in_size) {
1588
0
    BLOCK_APPEND(c->out, in, in_size);
1589
0
    return 0;
1590
1591
0
 block_err:
1592
0
    return -1;
1593
0
}
1594
1595
0
void cram_xpack_encode_free(cram_codec *c) {
1596
0
    if (!c) return;
1597
1598
0
    if (c->u.e_xpack.sub_codec)
1599
0
        c->u.e_xpack.sub_codec->free(c->u.e_xpack.sub_codec);
1600
1601
0
    cram_free_block(c->out);
1602
1603
0
    free(c);
1604
0
}
1605
1606
cram_codec *cram_xpack_encode_init(cram_stats *st,
1607
                                   enum cram_encoding codec,
1608
                                   enum cram_external_type option,
1609
                                   void *dat,
1610
0
                                   int version, varint_vec *vv) {
1611
0
    cram_codec *c;
1612
1613
0
    if (!(c = malloc(sizeof(*c))))
1614
0
        return NULL;
1615
1616
0
    c->codec  = E_XPACK;
1617
0
    c->free   = cram_xpack_encode_free;
1618
0
    if (option == E_LONG)
1619
0
        c->encode = cram_xpack_encode_long;
1620
0
    else if (option == E_INT)
1621
0
        c->encode = cram_xpack_encode_int;
1622
0
    else
1623
0
        c->encode = cram_xpack_encode_char;
1624
0
    c->store  = cram_xpack_encode_store;
1625
0
    c->flush  = cram_xpack_encode_flush;
1626
1627
0
    cram_xpack_encoder *e = (cram_xpack_encoder *)dat;
1628
0
    c->u.e_xpack.nbits = e->nbits;
1629
0
    c->u.e_xpack.nval = e->nval;
1630
0
    c->u.e_xpack.sub_codec = cram_encoder_init(e->sub_encoding, NULL,
1631
0
                                               E_BYTE_ARRAY, e->sub_codec_dat,
1632
0
                                               version, vv);
1633
1634
    // Initialise fwd and rev maps
1635
0
    memcpy(c->u.e_xpack.map, e->map, sizeof(e->map)); // P,A,C,K to 0,1,2,3
1636
0
    int i, n;
1637
0
    for (i = n = 0; i < 256; i++)
1638
0
        if (e->map[i] != -1)
1639
0
            c->u.e_xpack.rmap[n++] = i;               // 0,1,2,3 to P,A,C,K
1640
0
    if (n != e->nval) {
1641
0
        fprintf(stderr, "Incorrectly specified number of map items in PACK\n");
1642
0
        return NULL;
1643
0
    }
1644
1645
0
    return c;
1646
0
}
1647
1648
/*
1649
 * ---------------------------------------------------------------------------
1650
 * XDELTA: subtract successive values, zig-zag to turn +/- to + only,
1651
 * and then var-int encode the result.
1652
 *
1653
 * This also has the additional requirement that the data series is not
1654
 * interleaved with another, permitting efficient encoding and decoding
1655
 * of all elements enmasse instead of needing to only extract the bits
1656
 * necessary per item.
1657
 */
1658
1659
0
static uint8_t  zigzag8 (int8_t  x) { return (x << 1) ^ (x >>  7); }
1660
0
static uint16_t zigzag16(int16_t x) { return (x << 1) ^ (x >> 15); }
1661
0
static uint32_t zigzag32(int32_t x) { return (x << 1) ^ (x >> 31); }
1662
1663
//static int8_t  unzigzag8 (uint8_t  x) { return (x >> 1) ^ -(x & 1); }
1664
0
static int16_t unzigzag16(uint16_t x) { return (x >> 1) ^ -(x & 1); }
1665
0
static int32_t unzigzag32(uint32_t x) { return (x >> 1) ^ -(x & 1); }
1666
1667
0
int cram_xdelta_decode_long(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
1668
0
    return -1;
1669
0
}
1670
1671
0
int cram_xdelta_decode_int(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
1672
    // Slow value-by-value method for now
1673
0
    uint32_t *out32 = (uint32_t *)out;
1674
0
    int i;
1675
0
    for (i = 0; i < *out_size; i++) {
1676
0
        uint32_t v;
1677
0
        int one = 1;
1678
0
        if (c->u.e_xdelta.sub_codec->decode(slice, c->u.e_xdelta.sub_codec, in,
1679
0
                                          (char *)&v, &one) < 0)
1680
0
            return -1;
1681
0
        uint32_t d = unzigzag32(v);
1682
0
        c->u.xdelta.last = out32[i] = d + c->u.xdelta.last;
1683
0
    }
1684
1685
0
    return 0;
1686
0
}
1687
1688
0
static int cram_xdelta_decode_expand_char(cram_slice *slice, cram_codec *c) {
1689
0
    return -1;
1690
0
}
1691
1692
0
int cram_xdelta_decode_char(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
1693
0
    return -1;
1694
0
}
1695
1696
0
static inline int16_t le_int2(int16_t i) {
1697
0
    int16_t s;
1698
0
    i16_to_le(i, (uint8_t *)&s);
1699
0
    return s;
1700
0
}
1701
1702
int cram_xdelta_decode_block(cram_slice *slice, cram_codec *c, cram_block *in,
1703
0
                             char *out_, int *out_size) {
1704
0
    cram_block *out = (cram_block *)out_;
1705
0
    cram_block *b = c->u.e_xdelta.sub_codec->get_block(slice, c->u.e_xdelta.sub_codec);
1706
0
    int i = 0;
1707
1708
0
    const int w = c->u.xdelta.word_size;
1709
0
    uint32_t npad = (w - *out_size%w)%w;
1710
0
    uint32_t out_sz = *out_size + npad;
1711
0
    c->u.xdelta.last = 0;  // reset for each new array
1712
1713
0
    for (i = 0; i < out_sz; i += w) {
1714
0
        uint16_t v;
1715
        // Need better interface
1716
0
        char *cp = (char *)b->data + b->byte;
1717
0
        char *cp_end = (char *)b->data + b->uncomp_size;
1718
0
        int err = 0;
1719
0
        v = c->vv->varint_get32(&cp, cp_end, &err);
1720
0
        if (err)
1721
0
            return -1;
1722
0
        b->byte = cp - (char *)b->data;
1723
1724
0
        switch(w) {
1725
0
        case 2: {
1726
0
            int16_t d = unzigzag16(v), z;
1727
0
            c->u.xdelta.last = d + c->u.xdelta.last;
1728
0
            z = le_int2(c->u.xdelta.last);
1729
0
            BLOCK_APPEND(out, &z, 2-npad);
1730
0
            npad = 0;
1731
0
            break;
1732
0
        }
1733
0
        default:
1734
0
            fprintf(stderr, "Unsupported word size by XDELTA\n");
1735
0
            return -1;
1736
0
        }
1737
0
    }
1738
1739
0
    return 0;
1740
1741
0
 block_err:
1742
0
    return -1;
1743
0
}
1744
1745
21
void cram_xdelta_decode_free(cram_codec *c) {
1746
21
    if (!c) return;
1747
1748
21
    if (c->u.xdelta.sub_codec)
1749
21
        c->u.xdelta.sub_codec->free(c->u.xdelta.sub_codec);
1750
1751
21
    free(c);
1752
21
}
1753
1754
0
int cram_xdelta_decode_size(cram_slice *slice, cram_codec *c) {
1755
0
    cram_xdelta_decode_expand_char(slice, c);
1756
0
    return slice->block_by_id[512 + c->codec_id]->uncomp_size;
1757
0
}
1758
1759
0
cram_block *cram_xdelta_get_block(cram_slice *slice, cram_codec *c) {
1760
0
    cram_xdelta_decode_expand_char(slice, c);
1761
0
    return slice->block_by_id[512 + c->codec_id];
1762
0
}
1763
1764
cram_codec *cram_xdelta_decode_init(cram_block_compression_hdr *hdr,
1765
                                    char *data, int size,
1766
                                    enum cram_encoding codec,
1767
                                    enum cram_external_type option,
1768
21
                                    int version, varint_vec *vv) {
1769
21
    cram_codec *c;
1770
21
    char *cp = data;
1771
21
    char *endp = data+size;
1772
1773
21
    if (!(c = calloc(1, sizeof(*c))))
1774
0
        return NULL;
1775
1776
21
    c->codec  = E_XDELTA;
1777
21
    if (option == E_LONG)
1778
0
        c->decode = cram_xdelta_decode_long;
1779
21
    else if (option == E_INT)
1780
9
        c->decode = cram_xdelta_decode_int;
1781
12
    else if (option == E_BYTE_ARRAY || option == E_BYTE)
1782
6
        c->decode = cram_xdelta_decode_char;
1783
6
    else if (option == E_BYTE_ARRAY_BLOCK) {
1784
6
        option = E_BYTE_ARRAY;
1785
6
        c->decode = cram_xdelta_decode_block;
1786
6
    } else {
1787
0
        free(c);
1788
0
        return NULL;
1789
0
    }
1790
21
    c->free = cram_xdelta_decode_free;
1791
21
    c->size = cram_xdelta_decode_size;
1792
21
    c->get_block = cram_xdelta_get_block;
1793
21
    c->describe = NULL;
1794
1795
21
    c->u.xdelta.word_size = vv->varint_get32(&cp, endp, NULL);
1796
21
    c->u.xdelta.last = 0;
1797
1798
21
    int encoding = vv->varint_get32(&cp, endp, NULL);
1799
21
    int sub_size = vv->varint_get32(&cp, endp, NULL);
1800
21
    if (sub_size < 0 || endp - cp < sub_size)
1801
0
        goto malformed;
1802
21
    c->u.xdelta.sub_codec = cram_decoder_init(hdr, encoding, cp, sub_size,
1803
21
                                              option, version, vv);
1804
21
    if (c->u.xdelta.sub_codec == NULL)
1805
0
        goto malformed;
1806
21
    cp += sub_size;
1807
1808
21
    if (cp - data != size) {
1809
6
    malformed:
1810
6
        fprintf(stderr, "Malformed xdelta header stream\n");
1811
6
        cram_xdelta_decode_free(c);
1812
6
        return NULL;
1813
6
    }
1814
1815
15
    return c;
1816
21
}
1817
1818
0
int cram_xdelta_encode_flush(cram_codec *c) {
1819
0
    int r = -1;
1820
0
    cram_block *b = cram_new_block(0, 0);
1821
0
    if (!b)
1822
0
        return -1;
1823
1824
0
    switch (c->u.e_xdelta.word_size) {
1825
0
    case 2: {
1826
        // Delta + zigzag transform.
1827
        // Subtracting two 8-bit values has a 9-bit result (-255 to 255).
1828
        // However think of it as turning a wheel clockwise or anti-clockwise.
1829
        // If it has 256 gradations then a -ve rotation followed by a +ve
1830
        // rotation of the same amount reverses it regardless.
1831
        //
1832
        // Similarly the zig-zag transformation doesn't invent any extra bits,
1833
        // so the entire thing can be done in-situ.  This may permit faster
1834
        // SIMD loops if we break apart the steps.
1835
1836
        // uint16_t last = 0, d;
1837
        // for (i = 0; i < n; i++) {
1838
        //     d = io[i] - last;
1839
        //     last = io[i];
1840
        //     io[i] = zigzag16(vd);
1841
        // }
1842
1843
        // --- vs ---
1844
1845
        // for (i = n-1; i >= 1; i--)
1846
        //     io[i] -= io[i-1];
1847
        // for (i = 0; i < n; i++)
1848
        //     io[i] = zigzag16(io[i]);
1849
1850
        // varint: need array variant for speed here.
1851
        // With zig-zag
1852
0
        int i, n = BLOCK_SIZE(c->out)/2;;
1853
0
        uint16_t *dat = (uint16_t *)BLOCK_DATA(c->out), last = 0;
1854
1855
0
        if (n*2 < BLOCK_SIZE(c->out)) {
1856
            // half word
1857
0
            last = *(uint8_t *)dat;
1858
0
            c->vv->varint_put32_blk(b, zigzag16(last));
1859
0
            dat = (uint16_t *)(((uint8_t *)dat)+1);
1860
0
        }
1861
1862
0
        for (i = 0; i < n; i++) {
1863
0
            uint16_t d = dat[i] - last; // possibly unaligned
1864
0
            last = dat[i];
1865
0
            c->vv->varint_put32_blk(b, zigzag16(d));
1866
0
        }
1867
1868
0
        break;
1869
0
    }
1870
1871
0
    case 4: {
1872
0
        int i, n = BLOCK_SIZE(c->out)/4;;
1873
0
        uint32_t *dat = (uint32_t *)BLOCK_DATA(c->out), last = 0;
1874
1875
0
        for (i = 0; i < n; i++) {
1876
0
            uint32_t d = dat[i] - last;
1877
0
            last = dat[i];
1878
0
            c->vv->varint_put32_blk(b, zigzag32(d));
1879
0
        }
1880
1881
0
        break;
1882
0
    }
1883
1884
0
    case 1: {
1885
0
        int i, n = BLOCK_SIZE(c->out);;
1886
0
        uint8_t *dat = (uint8_t *)BLOCK_DATA(c->out), last = 0;
1887
1888
0
        for (i = 0; i < n; i++) {
1889
0
            uint32_t d = dat[i] - last;
1890
0
            last = dat[i];
1891
0
            c->vv->varint_put32_blk(b, zigzag8(d));
1892
0
        }
1893
1894
0
        break;
1895
0
    }
1896
1897
0
    default:
1898
0
        goto err;
1899
0
    }
1900
1901
0
    if (c->u.e_xdelta.sub_codec->encode(NULL, c->u.e_xdelta.sub_codec,
1902
0
                                      (char *)b->data, b->byte))
1903
0
        goto err;
1904
1905
0
    r = 0;
1906
1907
0
 err:
1908
0
    cram_free_block(b);
1909
0
    return r;
1910
1911
0
}
1912
1913
int cram_xdelta_encode_store(cram_codec *c, cram_block *b,
1914
0
                            char *prefix, int version) {
1915
0
    int len = 0, r = 0, n;
1916
1917
0
    if (prefix) {
1918
0
        size_t l = strlen(prefix);
1919
0
        BLOCK_APPEND(b, prefix, l);
1920
0
        len += l;
1921
0
    }
1922
1923
    // Store sub-codec
1924
0
    cram_codec *tc = c->u.e_xdelta.sub_codec;
1925
0
    cram_block *tb = cram_new_block(0, 0);
1926
0
    if (!tb)
1927
0
        return -1;
1928
0
    int len2 = tc->store(tc, tb, NULL, version);
1929
1930
0
    len += (n = c->vv->varint_put32_blk(b, c->codec)); r |= n;
1931
1932
    // codec length
1933
0
    len += (n = c->vv->varint_put32_blk(b, c->vv->varint_size(c->u.e_xdelta.word_size)
1934
0
                                        + len2)); r |= n;
1935
1936
    // This and sub-codec
1937
0
    len += (n = c->vv->varint_put32_blk(b, c->u.e_xdelta.word_size)); r |= n;
1938
0
    BLOCK_APPEND(b, BLOCK_DATA(tb), BLOCK_SIZE(tb));
1939
1940
0
    cram_free_block(tb);
1941
1942
0
    return r > 0 ? len + len2 : -1;
1943
1944
0
 block_err:
1945
0
    return -1;
1946
0
}
1947
1948
// Same as cram_beta_encode_long
1949
int cram_xdelta_encode_long(cram_slice *slice, cram_codec *c,
1950
0
                           char *in, int in_size) {
1951
0
    return -1;
1952
0
}
1953
1954
int cram_xdelta_encode_int(cram_slice *slice, cram_codec *c,
1955
0
                          char *in, int in_size) {
1956
0
    return -1;
1957
0
}
1958
1959
int cram_xdelta_encode_char(cram_slice *slice, cram_codec *c,
1960
0
                            char *in, int in_size) {
1961
0
    char *dat = malloc(in_size*5);
1962
0
    if (!dat)
1963
0
        return -1;
1964
0
    char *cp = dat, *cp_end = dat + in_size*5;
1965
1966
0
    c->u.e_xdelta.last = 0; // reset for each new array
1967
0
    if (c->u.e_xdelta.word_size == 2) {
1968
0
        int i, part;
1969
1970
0
        part = in_size%2;
1971
0
        if (part) {
1972
0
            uint16_t z = in[0];
1973
0
            c->u.e_xdelta.last = le_int2(z);
1974
0
            cp += c->vv->varint_put32(cp, cp_end, zigzag16(c->u.e_xdelta.last));
1975
0
        }
1976
1977
0
        uint16_t *in16 = (uint16_t *)(in+part);
1978
0
        for (i = 0; i < in_size/2; i++) {
1979
0
            uint16_t d = le_int2(in16[i]) - c->u.e_xdelta.last;
1980
0
            c->u.e_xdelta.last = le_int2(in16[i]);
1981
0
            cp += c->vv->varint_put32(cp, cp_end, zigzag16(d));
1982
0
        }
1983
0
    }
1984
0
    if (c->u.e_xdelta.sub_codec->encode(slice, c->u.e_xdelta.sub_codec,
1985
0
                                      (char *)dat, cp-dat)) {
1986
0
        free(dat);
1987
0
        return -1;
1988
0
    }
1989
1990
0
    free(dat);
1991
0
    return 0;
1992
0
}
1993
1994
0
void cram_xdelta_encode_free(cram_codec *c) {
1995
0
    if (!c) return;
1996
1997
0
    if (c->u.e_xdelta.sub_codec)
1998
0
        c->u.e_xdelta.sub_codec->free(c->u.e_xdelta.sub_codec);
1999
2000
0
    cram_free_block(c->out);
2001
2002
0
    free(c);
2003
0
}
2004
2005
cram_codec *cram_xdelta_encode_init(cram_stats *st,
2006
                                    enum cram_encoding codec,
2007
                                    enum cram_external_type option,
2008
                                    void *dat,
2009
0
                                    int version, varint_vec *vv) {
2010
0
    cram_codec *c;
2011
2012
0
    if (!(c = malloc(sizeof(*c))))
2013
0
        return NULL;
2014
2015
0
    c->codec  = E_XDELTA;
2016
0
    c->free   = cram_xdelta_encode_free;
2017
0
    if (option == E_LONG)
2018
0
        c->encode = cram_xdelta_encode_long;
2019
0
    else if (option == E_INT)
2020
0
        c->encode = cram_xdelta_encode_int;
2021
0
    else
2022
0
        c->encode = cram_xdelta_encode_char;
2023
0
    c->store  = cram_xdelta_encode_store;
2024
0
    c->flush  = cram_xdelta_encode_flush;
2025
2026
0
    cram_xdelta_encoder *e = (cram_xdelta_encoder *)dat;
2027
0
    c->u.e_xdelta.word_size = e->word_size;
2028
0
    c->u.e_xdelta.last = 0;
2029
0
    c->u.e_xdelta.sub_codec = cram_encoder_init(e->sub_encoding, NULL,
2030
0
                                                E_BYTE_ARRAY,
2031
0
                                                e->sub_codec_dat,
2032
0
                                                version, vv);
2033
2034
0
    return c;
2035
0
}
2036
2037
/*
2038
 * ---------------------------------------------------------------------------
2039
 * XRLE
2040
 *
2041
 * This also has the additional requirement that the data series is not
2042
 * interleaved with another, permitting efficient encoding and decoding
2043
 * of all elements enmasse instead of needing to only extract the bits
2044
 * necessary per item.
2045
 */
2046
0
int cram_xrle_decode_long(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
2047
    // TODO if and when needed
2048
0
    return -1;
2049
0
}
2050
2051
0
int cram_xrle_decode_int(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
2052
    // TODO if and when needed
2053
0
    return -1;
2054
0
}
2055
2056
// Expands an XRLE transform and caches result in slice->block_by_id[]
2057
0
static int cram_xrle_decode_expand_char(cram_slice *slice, cram_codec *c) {
2058
0
    cram_block *b = slice->block_by_id[512 + c->codec_id];
2059
0
    if (b)
2060
0
        return 0;
2061
2062
0
    b = slice->block_by_id[512 + c->codec_id] = cram_new_block(0, 0);
2063
0
    if (!b)
2064
0
        return -1;
2065
0
    cram_block *lit_b = c->u.xrle.lit_codec->get_block(slice, c->u.xrle.lit_codec);
2066
0
    if (!lit_b)
2067
0
        return -1;
2068
0
    unsigned char *lit_dat = lit_b->data;
2069
0
    unsigned int lit_sz = lit_b->uncomp_size;
2070
0
    unsigned int len_sz = c->u.xrle.len_codec->size(slice, c->u.xrle.len_codec);
2071
2072
0
    cram_block *len_b = c->u.xrle.len_codec->get_block(slice, c->u.xrle.len_codec);
2073
0
    if (!len_b)
2074
0
        return -1;
2075
0
    unsigned char *len_dat = len_b->data;
2076
2077
0
    uint8_t rle_syms[256];
2078
0
    int rle_nsyms = 0;
2079
0
    int i;
2080
0
    for (i = 0; i < 256; i++) {
2081
0
        if (c->u.xrle.rep_score[i] > 0)
2082
0
            rle_syms[rle_nsyms++] = i;
2083
0
    }
2084
2085
0
    uint64_t out_sz;
2086
0
    int nb = var_get_u64(len_dat, len_dat+len_sz, &out_sz);
2087
0
    if (!(b->data = malloc(out_sz)))
2088
0
        return -1;
2089
0
    hts_rle_decode(lit_dat, lit_sz,
2090
0
                   len_dat+nb, len_sz-nb,
2091
0
                   rle_syms, rle_nsyms,
2092
0
                   b->data, &out_sz);
2093
0
    b->uncomp_size = out_sz;
2094
2095
0
    return 0;
2096
0
}
2097
2098
0
int cram_xrle_decode_size(cram_slice *slice, cram_codec *c) {
2099
0
    cram_xrle_decode_expand_char(slice, c);
2100
0
    return slice->block_by_id[512 + c->codec_id]->uncomp_size;
2101
0
}
2102
2103
0
cram_block *cram_xrle_get_block(cram_slice *slice, cram_codec *c) {
2104
0
    cram_xrle_decode_expand_char(slice, c);
2105
0
    return slice->block_by_id[512 + c->codec_id];
2106
0
}
2107
2108
0
int cram_xrle_decode_char(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
2109
0
    int n = *out_size;
2110
2111
0
    cram_xrle_decode_expand_char(slice, c);
2112
0
    cram_block *b = slice->block_by_id[512 + c->codec_id];
2113
2114
0
    memcpy(out, b->data + b->idx, n);
2115
0
    b->idx += n;
2116
0
    return 0;
2117
2118
    // Old code when not cached
2119
0
    while (n > 0) {
2120
0
        if (c->u.xrle.cur_len == 0) {
2121
0
            unsigned char lit;
2122
0
            int one = 1;
2123
0
            if (c->u.xrle.lit_codec->decode(slice, c->u.xrle.lit_codec, in,
2124
0
                                          (char *)&lit, &one) < 0)
2125
0
                return -1;
2126
0
            c->u.xrle.cur_lit = lit;
2127
2128
0
            if (c->u.xrle.rep_score[lit] > 0) {
2129
0
                if (c->u.xrle.len_codec->decode(slice, c->u.xrle.len_codec, in,
2130
0
                                              (char *)&c->u.xrle.cur_len, &one) < 0)
2131
0
                    return -1;
2132
0
            } // else cur_len still zero
2133
            //else fprintf(stderr, "%d\n", lit);
2134
2135
0
            c->u.xrle.cur_len++;
2136
0
        }
2137
2138
0
        if (n >= c->u.xrle.cur_len) {
2139
0
            memset(out, c->u.xrle.cur_lit, c->u.xrle.cur_len);
2140
0
            out += c->u.xrle.cur_len;
2141
0
            n -= c->u.xrle.cur_len;
2142
0
            c->u.xrle.cur_len = 0;
2143
0
        } else {
2144
0
            memset(out, c->u.xrle.cur_lit, n);
2145
0
            out += n;
2146
0
            c->u.xrle.cur_len -= n;
2147
0
            n = 0;
2148
0
        }
2149
0
    }
2150
2151
0
    return 0;
2152
0
}
2153
2154
0
void cram_xrle_decode_free(cram_codec *c) {
2155
0
    if (!c) return;
2156
2157
0
    if (c->u.xrle.len_codec)
2158
0
        c->u.xrle.len_codec->free(c->u.xrle.len_codec);
2159
2160
0
    if (c->u.xrle.lit_codec)
2161
0
        c->u.xrle.lit_codec->free(c->u.xrle.lit_codec);
2162
2163
0
    free(c);
2164
0
}
2165
2166
cram_codec *cram_xrle_decode_init(cram_block_compression_hdr *hdr,
2167
                                  char *data, int size,
2168
                                  enum cram_encoding codec,
2169
                                  enum cram_external_type option,
2170
6
                                  int version, varint_vec *vv) {
2171
6
    cram_codec *c;
2172
6
    char *cp = data;
2173
6
    char *endp = data+size;
2174
6
    int err = 0;
2175
2176
6
    if (!(c = calloc(1, sizeof(*c))))
2177
0
        return NULL;
2178
2179
6
    c->codec  = E_XRLE;
2180
6
    if (option == E_LONG)
2181
0
        c->decode = cram_xrle_decode_long;
2182
6
    else if (option == E_INT)
2183
0
        c->decode = cram_xrle_decode_int;
2184
6
    else if (option == E_BYTE_ARRAY || option == E_BYTE)
2185
0
        c->decode = cram_xrle_decode_char;
2186
6
    else {
2187
6
        fprintf(stderr, "BYTE_ARRAYs not supported by this codec\n");
2188
6
        free(c);
2189
6
        return NULL;
2190
6
    }
2191
0
    c->free   = cram_xrle_decode_free;
2192
0
    c->size   = cram_xrle_decode_size;
2193
0
    c->get_block = cram_xrle_get_block;
2194
0
    c->describe = NULL;
2195
0
    c->u.xrle.cur_len = 0;
2196
0
    c->u.xrle.cur_lit = -1;
2197
2198
    // RLE map
2199
0
    int i, j, nrle = vv->varint_get32(&cp, endp, &err);
2200
0
    memset(c->u.xrle.rep_score, 0, 256*sizeof(*c->u.xrle.rep_score));
2201
0
    for (i = 0; i < nrle && i < 256; i++) {
2202
0
        j = vv->varint_get32(&cp, endp, &err);
2203
0
        if (j >= 0 && j < 256)
2204
0
            c->u.xrle.rep_score[j] = 1;
2205
0
    }
2206
2207
    // Length and literal sub encodings
2208
0
    c->u.xrle.len_encoding = vv->varint_get32(&cp, endp, &err);
2209
0
    int sub_size = vv->varint_get32(&cp, endp, &err);
2210
0
    if (sub_size < 0 || endp - cp < sub_size)
2211
0
        goto malformed;
2212
0
    c->u.xrle.len_codec = cram_decoder_init(hdr, c->u.xrle.len_encoding,
2213
0
                                            cp, sub_size, E_INT, version, vv);
2214
0
    if (c->u.xrle.len_codec == NULL)
2215
0
        goto malformed;
2216
0
    cp += sub_size;
2217
2218
0
    c->u.xrle.lit_encoding = vv->varint_get32(&cp, endp, &err);
2219
0
    sub_size = vv->varint_get32(&cp, endp, &err);
2220
0
    if (sub_size < 0 || endp - cp < sub_size)
2221
0
        goto malformed;
2222
0
    c->u.xrle.lit_codec = cram_decoder_init(hdr, c->u.xrle.lit_encoding,
2223
0
                                            cp, sub_size, option, version, vv);
2224
0
    if (c->u.xrle.lit_codec == NULL)
2225
0
        goto malformed;
2226
0
    cp += sub_size;
2227
2228
0
    if (err)
2229
0
        goto malformed;
2230
2231
0
    return c;
2232
2233
0
 malformed:
2234
0
    fprintf(stderr, "Malformed xrle header stream\n");
2235
0
    cram_xrle_decode_free(c);
2236
0
    return NULL;
2237
0
}
2238
2239
0
int cram_xrle_encode_flush(cram_codec *c) {
2240
0
    uint8_t *out_lit, *out_len;
2241
0
    uint64_t out_lit_size, out_len_size;
2242
0
    uint8_t rle_syms[256];
2243
0
    int rle_nsyms = 0, i;
2244
2245
0
    for (i = 0; i < 256; i++)
2246
0
        if (c->u.e_xrle.rep_score[i] > 0)
2247
0
            rle_syms[rle_nsyms++] = i;
2248
2249
0
    if (!c->u.e_xrle.to_flush) {
2250
0
        c->u.e_xrle.to_flush = (char *)BLOCK_DATA(c->out);
2251
0
        c->u.e_xrle.to_flush_size = BLOCK_SIZE(c->out);
2252
0
    }
2253
2254
0
    out_len = malloc(c->u.e_xrle.to_flush_size+8);
2255
0
    if (!out_len)
2256
0
        return -1;
2257
2258
0
    int nb = var_put_u64(out_len, NULL, c->u.e_xrle.to_flush_size);
2259
2260
0
    out_lit = hts_rle_encode((uint8_t *)c->u.e_xrle.to_flush, c->u.e_xrle.to_flush_size,
2261
0
                             out_len+nb, &out_len_size,
2262
0
                             rle_syms, &rle_nsyms,
2263
0
                             NULL, &out_lit_size);
2264
0
    out_len_size += nb;
2265
2266
2267
    // TODO: can maybe "gift" the sub codec the data block, to remove
2268
    // one level of memcpy.
2269
0
    if (c->u.e_xrle.len_codec->encode(NULL,
2270
0
                                      c->u.e_xrle.len_codec,
2271
0
                                      (char *)out_len, out_len_size))
2272
0
        return -1;
2273
2274
0
    if (c->u.e_xrle.lit_codec->encode(NULL,
2275
0
                                      c->u.e_xrle.lit_codec,
2276
0
                                      (char *)out_lit, out_lit_size))
2277
0
        return -1;
2278
2279
0
    free(out_len);
2280
0
    free(out_lit);
2281
2282
0
    return 0;
2283
0
}
2284
2285
int cram_xrle_encode_store(cram_codec *c, cram_block *b,
2286
0
                            char *prefix, int version) {
2287
0
    int len = 0, r = 0, n;
2288
0
    cram_codec *tc;
2289
0
    cram_block *b_rle, *b_len, *b_lit;
2290
2291
0
    if (prefix) {
2292
0
        size_t l = strlen(prefix);
2293
0
        BLOCK_APPEND(b, prefix, l);
2294
0
        len += l;
2295
0
    }
2296
2297
    // List of symbols to RLE
2298
0
    b_rle = cram_new_block(0, 0);
2299
0
    if (!b_rle)
2300
0
        return -1;
2301
0
    int i, nrle = 0, len1 = 0;
2302
0
    for (i = 0; i < 256; i++) {
2303
0
        if (c->u.e_xrle.rep_score[i] > 0) {
2304
0
            nrle++;
2305
0
            len1 += (n = c->vv->varint_put32_blk(b_rle,i)); r |= n;
2306
0
        }
2307
0
    }
2308
2309
    // Store length and literal sub-codecs to get encoded length
2310
0
    tc = c->u.e_xrle.len_codec;
2311
0
    b_len = cram_new_block(0, 0);
2312
0
    if (!b_len)
2313
0
        return -1;
2314
0
    int len2 = tc->store(tc, b_len, NULL, version);
2315
2316
0
    tc = c->u.e_xrle.lit_codec;
2317
0
    b_lit = cram_new_block(0, 0);
2318
0
    if (!b_lit)
2319
0
        return -1;
2320
0
    int len3 = tc->store(tc, b_lit, NULL, version);
2321
2322
0
    len += (n = c->vv->varint_put32_blk(b, c->codec)); r |= n;
2323
0
    len += (n = c->vv->varint_put32_blk(b, len1 + len2 + len3
2324
0
                                        + c->vv->varint_size(nrle))); r |= n;
2325
0
    len += (n = c->vv->varint_put32_blk(b, nrle)); r |= n;
2326
0
    BLOCK_APPEND(b, BLOCK_DATA(b_rle), BLOCK_SIZE(b_rle));
2327
0
    BLOCK_APPEND(b, BLOCK_DATA(b_len), BLOCK_SIZE(b_len));
2328
0
    BLOCK_APPEND(b, BLOCK_DATA(b_lit), BLOCK_SIZE(b_lit));
2329
2330
0
    cram_free_block(b_rle);
2331
0
    cram_free_block(b_len);
2332
0
    cram_free_block(b_lit);
2333
2334
0
    if (r > 0)
2335
0
        return len + len1 + len2 + len3;
2336
2337
0
 block_err:
2338
0
    return -1;
2339
0
}
2340
2341
int cram_xrle_encode_long(cram_slice *slice, cram_codec *c,
2342
0
                           char *in, int in_size) {
2343
    // TODO if and when needed
2344
0
    return -1;
2345
0
}
2346
2347
int cram_xrle_encode_int(cram_slice *slice, cram_codec *c,
2348
0
                          char *in, int in_size) {
2349
    // TODO if and when needed
2350
0
    return -1;
2351
0
}
2352
2353
int cram_xrle_encode_char(cram_slice *slice, cram_codec *c,
2354
0
                          char *in, int in_size) {
2355
0
    if (c->u.e_xrle.to_flush) {
2356
0
        if (!c->out && !(c->out = cram_new_block(0, 0)))
2357
0
            return -1;
2358
0
        BLOCK_APPEND(c->out, c->u.e_xrle.to_flush, c->u.e_xrle.to_flush_size);
2359
0
        c->u.e_xrle.to_flush = NULL;
2360
0
        c->u.e_xrle.to_flush_size = 0;
2361
0
    }
2362
2363
0
    if (c->out && BLOCK_SIZE(c->out) > 0) {
2364
        // Gathering data
2365
0
        BLOCK_APPEND(c->out, in, in_size);
2366
0
        return 0;
2367
0
    }
2368
2369
    // else cache copy of the data we're about to send to flush instead.
2370
0
    c->u.e_xrle.to_flush = in;
2371
0
    c->u.e_xrle.to_flush_size = in_size;
2372
0
    return 0;
2373
2374
0
 block_err:
2375
0
    return -1;
2376
0
}
2377
2378
0
void cram_xrle_encode_free(cram_codec *c) {
2379
0
    if (!c) return;
2380
2381
0
    if (c->u.e_xrle.len_codec)
2382
0
        c->u.e_xrle.len_codec->free(c->u.e_xrle.len_codec);
2383
0
    if (c->u.e_xrle.lit_codec)
2384
0
        c->u.e_xrle.lit_codec->free(c->u.e_xrle.lit_codec);
2385
2386
0
    cram_free_block(c->out);
2387
2388
0
    free(c);
2389
0
}
2390
2391
cram_codec *cram_xrle_encode_init(cram_stats *st,
2392
                                  enum cram_encoding codec,
2393
                                  enum cram_external_type option,
2394
                                  void *dat,
2395
0
                                  int version, varint_vec *vv) {
2396
0
    cram_codec *c;
2397
2398
0
    if (!(c = malloc(sizeof(*c))))
2399
0
        return NULL;
2400
2401
0
    c->codec  = E_XRLE;
2402
0
    c->free   = cram_xrle_encode_free;
2403
0
    if (option == E_LONG)
2404
0
        c->encode = cram_xrle_encode_long;
2405
0
    else if (option == E_INT)
2406
0
        c->encode = cram_xrle_encode_int;
2407
0
    else
2408
0
        c->encode = cram_xrle_encode_char;
2409
0
    c->store  = cram_xrle_encode_store;
2410
0
    c->flush  = cram_xrle_encode_flush;
2411
2412
0
    cram_xrle_encoder *e = (cram_xrle_encoder *)dat;
2413
2414
0
    c->u.e_xrle.len_codec = cram_encoder_init(e->len_encoding, NULL,
2415
0
                                              E_BYTE, e->len_dat,
2416
0
                                              version, vv);
2417
0
    c->u.e_xrle.lit_codec = cram_encoder_init(e->lit_encoding, NULL,
2418
0
                                              E_BYTE, e->lit_dat,
2419
0
                                              version, vv);
2420
0
    c->u.e_xrle.cur_lit = -1;
2421
0
    c->u.e_xrle.cur_len = -1;
2422
0
    c->u.e_xrle.to_flush = NULL;
2423
0
    c->u.e_xrle.to_flush_size = 0;
2424
2425
0
    memcpy(c->u.e_xrle.rep_score, e->rep_score, 256*sizeof(*c->u.e_xrle.rep_score));
2426
2427
0
    return c;
2428
0
}
2429
2430
/*
2431
 * ---------------------------------------------------------------------------
2432
 * SUBEXP
2433
 */
2434
0
int cram_subexp_decode(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
2435
0
    int32_t *out_i = (int32_t *)out;
2436
0
    int n, count;
2437
0
    int k = c->u.subexp.k;
2438
2439
0
    for (count = 0, n = *out_size; count < n; count++) {
2440
0
        int i = 0, tail;
2441
0
        int val;
2442
2443
        /* Get number of 1s */
2444
        //while (get_bit_MSB(in) == 1) i++;
2445
0
        i = get_one_bits_MSB(in);
2446
0
        if (i < 0 || cram_not_enough_bits(in, i > 0 ? i + k - 1 : k))
2447
0
            return -1;
2448
        /*
2449
         * Val is
2450
         * i > 0:  2^(k+i-1) + k+i-1 bits
2451
         * i = 0:  k bits
2452
         */
2453
0
        if (i) {
2454
0
            tail = i + k-1;
2455
0
            val = 0;
2456
0
            while (tail) {
2457
                //val = val<<1; val |= get_bit_MSB(in);
2458
0
                GET_BIT_MSB(in, val);
2459
0
                tail--;
2460
0
            }
2461
0
            val += 1 << (i + k-1);
2462
0
        } else {
2463
0
            tail = k;
2464
0
            val = 0;
2465
0
            while (tail) {
2466
                //val = val<<1; val |= get_bit_MSB(in);
2467
0
                GET_BIT_MSB(in, val);
2468
0
                tail--;
2469
0
            }
2470
0
        }
2471
2472
0
        out_i[count] = val - c->u.subexp.offset;
2473
0
    }
2474
2475
0
    return 0;
2476
0
}
2477
2478
651
void cram_subexp_decode_free(cram_codec *c) {
2479
651
    if (c)
2480
651
        free(c);
2481
651
}
2482
2483
0
int cram_subexp_describe(cram_codec *c, kstring_t *ks) {
2484
0
    return ksprintf(ks, "SUBEXP(offset=%d,k=%d)",
2485
0
                    c->u.subexp.offset,
2486
0
                    c->u.subexp.k)
2487
0
        < 0 ? -1 : 0;
2488
0
}
2489
2490
cram_codec *cram_subexp_decode_init(cram_block_compression_hdr *hdr,
2491
                                    char *data, int size,
2492
                                    enum cram_encoding codec,
2493
                                    enum cram_external_type option,
2494
651
                                    int version, varint_vec *vv) {
2495
651
    cram_codec *c;
2496
651
    char *cp = data;
2497
2498
651
    if (option != E_INT) {
2499
0
        hts_log_error("This codec only supports INT encodings");
2500
0
        return NULL;
2501
0
    }
2502
2503
651
    if (!(c = malloc(sizeof(*c))))
2504
0
        return NULL;
2505
2506
651
    c->codec  = E_SUBEXP;
2507
651
    c->decode = cram_subexp_decode;
2508
651
    c->free   = cram_subexp_decode_free;
2509
651
    c->describe = cram_subexp_describe;
2510
651
    c->u.subexp.k = -1;
2511
2512
651
    c->u.subexp.offset = vv->varint_get32(&cp, data + size, NULL);
2513
651
    c->u.subexp.k      = vv->varint_get32(&cp, data + size, NULL);
2514
2515
651
    if (cp - data != size || c->u.subexp.k < 0) {
2516
0
        hts_log_error("Malformed subexp header stream");
2517
0
        free(c);
2518
0
        return NULL;
2519
0
    }
2520
2521
651
    return c;
2522
651
}
2523
2524
/*
2525
 * ---------------------------------------------------------------------------
2526
 * GAMMA
2527
 */
2528
0
int cram_gamma_decode(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) {
2529
0
    int32_t *out_i = (int32_t *)out;
2530
0
    int i, n;
2531
2532
0
    for (i = 0, n = *out_size; i < n; i++) {
2533
0
        int nz = 0;
2534
0
        int val;
2535
        //while (get_bit_MSB(in) == 0) nz++;
2536
0
        nz = get_zero_bits_MSB(in);
2537
0
        if (cram_not_enough_bits(in, nz))
2538
0
            return -1;
2539
0
        val = 1;
2540
0
        while (nz > 0) {
2541
            //val <<= 1; val |= get_bit_MSB(in);
2542
0
            GET_BIT_MSB(in, val);
2543
0
            nz--;
2544
0
        }
2545
2546
0
        out_i[i] = val - c->u.gamma.offset;
2547
0
    }
2548
2549
0
    return 0;
2550
0
}
2551
2552
1.44k
void cram_gamma_decode_free(cram_codec *c) {
2553
1.44k
    if (c)
2554
1.44k
        free(c);
2555
1.44k
}
2556
2557
0
int cram_gamma_describe(cram_codec *c, kstring_t *ks) {
2558
0
    return ksprintf(ks, "GAMMA(offset=%d)", c->u.subexp.offset)
2559
0
        < 0 ? -1 : 0;
2560
0
}
2561
2562
cram_codec *cram_gamma_decode_init(cram_block_compression_hdr *hdr,
2563
                                   char *data, int size,
2564
                                   enum cram_encoding codec,
2565
                                   enum cram_external_type option,
2566
1.45k
                                   int version, varint_vec *vv) {
2567
1.45k
    cram_codec *c = NULL;
2568
1.45k
    char *cp = data;
2569
2570
1.45k
    if (option != E_INT) {
2571
0
        hts_log_error("This codec only supports INT encodings");
2572
0
        return NULL;
2573
0
    }
2574
2575
1.45k
    if (size < 1)
2576
0
        goto malformed;
2577
2578
1.45k
    if (!(c = malloc(sizeof(*c))))
2579
0
        return NULL;
2580
2581
1.45k
    c->codec  = E_GAMMA;
2582
1.45k
    c->decode = cram_gamma_decode;
2583
1.45k
    c->free   = cram_gamma_decode_free;
2584
1.45k
    c->describe = cram_gamma_describe;
2585
2586
1.45k
    c->u.gamma.offset = vv->varint_get32(&cp, data+size, NULL);
2587
2588
1.45k
    if (cp - data != size)
2589
3
        goto malformed;
2590
2591
1.44k
    return c;
2592
2593
3
 malformed:
2594
3
    hts_log_error("Malformed gamma header stream");
2595
3
    free(c);
2596
3
    return NULL;
2597
1.45k
}
2598
2599
/*
2600
 * ---------------------------------------------------------------------------
2601
 * HUFFMAN
2602
 */
2603
2604
225
static int code_sort(const void *vp1, const void *vp2) {
2605
225
    const cram_huffman_code *c1 = (const cram_huffman_code *)vp1;
2606
225
    const cram_huffman_code *c2 = (const cram_huffman_code *)vp2;
2607
2608
225
    if (c1->len != c2->len)
2609
3
        return c1->len - c2->len;
2610
222
    else
2611
222
        return c1->symbol < c2->symbol ? -1 : (c1->symbol > c2->symbol ? 1 : 0);
2612
225
}
2613
2614
444
void cram_huffman_decode_free(cram_codec *c) {
2615
444
    if (!c)
2616
0
        return;
2617
2618
444
    if (c->u.huffman.codes)
2619
357
        free(c->u.huffman.codes);
2620
444
    free(c);
2621
444
}
2622
2623
int cram_huffman_decode_null(cram_slice *slice, cram_codec *c,
2624
0
                             cram_block *in, char *out, int *out_size) {
2625
0
    return -1;
2626
0
}
2627
2628
int cram_huffman_decode_char0(cram_slice *slice, cram_codec *c,
2629
0
                              cram_block *in, char *out, int *out_size) {
2630
0
    int i, n;
2631
2632
0
    if (!out)
2633
0
        return 0;
2634
2635
    /* Special case of 0 length codes */
2636
0
    for (i = 0, n = *out_size; i < n; i++) {
2637
0
        out[i] = c->u.huffman.codes[0].symbol;
2638
0
    }
2639
0
    return 0;
2640
0
}
2641
2642
int cram_huffman_decode_char(cram_slice *slice, cram_codec *c,
2643
0
                             cram_block *in, char *out, int *out_size) {
2644
0
    int i, n, ncodes = c->u.huffman.ncodes;
2645
0
    const cram_huffman_code * const codes = c->u.huffman.codes;
2646
2647
0
    for (i = 0, n = *out_size; i < n; i++) {
2648
0
        int idx = 0;
2649
0
        int val = 0, len = 0, last_len = 0;
2650
2651
0
        for (;;) {
2652
0
            int dlen = codes[idx].len - last_len;
2653
0
            if (cram_not_enough_bits(in, dlen))
2654
0
                return -1;
2655
2656
            //val <<= dlen;
2657
            //val  |= get_bits_MSB(in, dlen);
2658
            //last_len = (len += dlen);
2659
2660
0
            last_len = (len += dlen);
2661
0
            for (; dlen; dlen--) GET_BIT_MSB(in, val);
2662
2663
0
            idx = val - codes[idx].p;
2664
0
            if (idx >= ncodes || idx < 0)
2665
0
                return -1;
2666
2667
0
            if (codes[idx].code == val && codes[idx].len == len) {
2668
0
                if (out) out[i] = codes[idx].symbol;
2669
0
                break;
2670
0
            }
2671
0
        }
2672
0
    }
2673
2674
0
    return 0;
2675
0
}
2676
2677
int cram_huffman_decode_int0(cram_slice *slice, cram_codec *c,
2678
0
                             cram_block *in, char *out, int *out_size) {
2679
0
    int32_t *out_i = (int32_t *)out;
2680
0
    int i, n;
2681
0
    const cram_huffman_code * const codes = c->u.huffman.codes;
2682
2683
    /* Special case of 0 length codes */
2684
0
    for (i = 0, n = *out_size; i < n; i++) {
2685
0
        out_i[i] = codes[0].symbol;
2686
0
    }
2687
0
    return 0;
2688
0
}
2689
2690
int cram_huffman_decode_int(cram_slice *slice, cram_codec *c,
2691
0
                            cram_block *in, char *out, int *out_size) {
2692
0
    int32_t *out_i = (int32_t *)out;
2693
0
    int i, n, ncodes = c->u.huffman.ncodes;
2694
0
    const cram_huffman_code * const codes = c->u.huffman.codes;
2695
2696
0
    for (i = 0, n = *out_size; i < n; i++) {
2697
0
        int idx = 0;
2698
0
        int val = 0, len = 0, last_len = 0;
2699
2700
        // Now one bit at a time for remaining checks
2701
0
        for (;;) {
2702
0
            int dlen = codes[idx].len - last_len;
2703
0
            if (cram_not_enough_bits(in, dlen))
2704
0
                return -1;
2705
2706
            //val <<= dlen;
2707
            //val  |= get_bits_MSB(in, dlen);
2708
            //last_len = (len += dlen);
2709
2710
0
            last_len = (len += dlen);
2711
0
            for (; dlen; dlen--) GET_BIT_MSB(in, val);
2712
2713
0
            idx = val - codes[idx].p;
2714
0
            if (idx >= ncodes || idx < 0)
2715
0
                return -1;
2716
2717
0
            if (codes[idx].code == val && codes[idx].len == len) {
2718
0
                out_i[i] = codes[idx].symbol;
2719
0
                break;
2720
0
            }
2721
0
        }
2722
0
    }
2723
2724
0
    return 0;
2725
0
}
2726
2727
int cram_huffman_decode_long0(cram_slice *slice, cram_codec *c,
2728
0
                              cram_block *in, char *out, int *out_size) {
2729
0
    int64_t *out_i = (int64_t *)out;
2730
0
    int i, n;
2731
0
    const cram_huffman_code * const codes = c->u.huffman.codes;
2732
2733
    /* Special case of 0 length codes */
2734
0
    for (i = 0, n = *out_size; i < n; i++) {
2735
0
        out_i[i] = codes[0].symbol;
2736
0
    }
2737
0
    return 0;
2738
0
}
2739
2740
int cram_huffman_decode_long(cram_slice *slice, cram_codec *c,
2741
0
                             cram_block *in, char *out, int *out_size) {
2742
0
    int64_t *out_i = (int64_t *)out;
2743
0
    int i, n, ncodes = c->u.huffman.ncodes;
2744
0
    const cram_huffman_code * const codes = c->u.huffman.codes;
2745
2746
0
    for (i = 0, n = *out_size; i < n; i++) {
2747
0
        int idx = 0;
2748
0
        int val = 0, len = 0, last_len = 0;
2749
2750
        // Now one bit at a time for remaining checks
2751
0
        for (;;) {
2752
0
            int dlen = codes[idx].len - last_len;
2753
0
            if (cram_not_enough_bits(in, dlen))
2754
0
                return -1;
2755
2756
            //val <<= dlen;
2757
            //val  |= get_bits_MSB(in, dlen);
2758
            //last_len = (len += dlen);
2759
2760
0
            last_len = (len += dlen);
2761
0
            for (; dlen; dlen--) GET_BIT_MSB(in, val);
2762
2763
0
            idx = val - codes[idx].p;
2764
0
            if (idx >= ncodes || idx < 0)
2765
0
                return -1;
2766
2767
0
            if (codes[idx].code == val && codes[idx].len == len) {
2768
0
                out_i[i] = codes[idx].symbol;
2769
0
                break;
2770
0
            }
2771
0
        }
2772
0
    }
2773
2774
0
    return 0;
2775
0
}
2776
2777
0
int cram_huffman_describe(cram_codec *c, kstring_t *ks) {
2778
0
    int r = 0, n;
2779
0
    r |= ksprintf(ks, "HUFFMAN(codes={") < 0;
2780
0
    for (n = 0; n < c->u.huffman.ncodes; n++) {
2781
0
        r |= ksprintf(ks, "%s%"PRId64, n?",":"",
2782
0
                      c->u.huffman.codes[n].symbol);
2783
0
    }
2784
0
    r |= ksprintf(ks, "},lengths={") < 0;
2785
0
    for (n = 0; n < c->u.huffman.ncodes; n++) {
2786
0
        r |= ksprintf(ks, "%s%d", n?",":"",
2787
0
                      c->u.huffman.codes[n].len);
2788
0
    }
2789
0
    r |= ksprintf(ks, "})") < 0;
2790
0
    return r;
2791
0
}
2792
2793
/*
2794
 * Initialises a huffman decoder from an encoding data stream.
2795
 */
2796
cram_codec *cram_huffman_decode_init(cram_block_compression_hdr *hdr,
2797
                                     char *data, int size,
2798
                                     enum cram_encoding codec,
2799
                                     enum cram_external_type option,
2800
467
                                     int version, varint_vec *vv) {
2801
467
    int32_t ncodes = 0, i, j;
2802
467
    char *cp = data, *data_end = &data[size];
2803
467
    cram_codec *h;
2804
467
    cram_huffman_code *codes = NULL;
2805
467
    int32_t val, last_len, max_len = 0;
2806
467
    uint32_t max_val; // needs one more bit than val
2807
467
    const int max_code_bits = sizeof(val) * 8 - 1;
2808
467
    int err = 0;
2809
2810
467
    if (option == E_BYTE_ARRAY_BLOCK) {
2811
0
        hts_log_error("BYTE_ARRAYs not supported by this codec");
2812
0
        return NULL;
2813
0
    }
2814
2815
467
    ncodes = vv->varint_get32(&cp, data_end, &err);
2816
467
    if (ncodes < 0) {
2817
0
        hts_log_error("Invalid number of symbols in huffman stream");
2818
0
        return NULL;
2819
0
    }
2820
467
    if (ncodes >= SIZE_MAX / sizeof(*codes)) {
2821
0
        errno = ENOMEM;
2822
0
        return NULL;
2823
0
    }
2824
467
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
2825
467
    if (ncodes > FUZZ_ALLOC_LIMIT / sizeof(*codes)) {
2826
3
        errno = ENOMEM;
2827
3
        return NULL;
2828
3
    }
2829
464
#endif
2830
464
    h = calloc(1, sizeof(*h));
2831
464
    if (!h)
2832
0
        return NULL;
2833
2834
464
    h->codec  = E_HUFFMAN;
2835
464
    h->free   = cram_huffman_decode_free;
2836
2837
464
    h->u.huffman.ncodes = ncodes;
2838
464
    h->u.huffman.option = option;
2839
464
    if (ncodes) {
2840
374
        codes = h->u.huffman.codes = malloc(ncodes * sizeof(*codes));
2841
374
        if (!codes) {
2842
0
            free(h);
2843
0
            return NULL;
2844
0
        }
2845
374
    } else {
2846
90
        codes = h->u.huffman.codes = NULL;
2847
90
    }
2848
2849
    /* Read symbols and bit-lengths */
2850
464
    if (option == E_LONG) {
2851
0
        for (i = 0; i < ncodes; i++)
2852
0
            codes[i].symbol = vv->varint_get64(&cp, data_end, &err);
2853
464
    } else if (option == E_INT || option == E_BYTE) {
2854
36.1k
        for (i = 0; i < ncodes; i++)
2855
35.6k
            codes[i].symbol = vv->varint_get32(&cp, data_end, &err);
2856
461
    } else {
2857
3
        goto malformed;
2858
3
    }
2859
2860
461
    if (err)
2861
8
        goto malformed;
2862
2863
453
    i = vv->varint_get32(&cp, data_end, &err);
2864
453
    if (i != ncodes)
2865
3
        goto malformed;
2866
2867
450
    if (ncodes == 0) {
2868
        /* NULL huffman stream.  Ensure it returns an error if
2869
           anything tries to use it. */
2870
87
        h->decode = cram_huffman_decode_null;
2871
87
        return h;
2872
87
    }
2873
2874
954
    for (i = 0; i < ncodes; i++) {
2875
591
        codes[i].len = vv->varint_get32(&cp, data_end, &err);
2876
591
        if (err)
2877
0
            break;
2878
591
        if (codes[i].len < 0) {
2879
0
            hts_log_error("Huffman code length (%d) is negative", codes[i].len);
2880
0
            goto malformed;
2881
0
        }
2882
591
        if (max_len < codes[i].len)
2883
228
            max_len = codes[i].len;
2884
591
    }
2885
363
    if (err || cp - data != size || max_len >= ncodes)
2886
3
        goto malformed;
2887
2888
    /* 31 is max. bits available in val */
2889
360
    if (max_len > max_code_bits) {
2890
0
        hts_log_error("Huffman code length (%d) is greater "
2891
0
                      "than maximum supported (%d)", max_len, max_code_bits);
2892
0
        goto malformed;
2893
0
    }
2894
2895
    /* Sort by bit length and then by symbol value */
2896
360
    qsort(codes, ncodes, sizeof(*codes), code_sort);
2897
2898
    /* Assign canonical codes */
2899
360
    val = -1, last_len = 0, max_val = 0;
2900
942
    for (i = 0; i < ncodes; i++) {
2901
585
        val++;
2902
585
        if (val > max_val)
2903
3
            goto malformed;
2904
2905
582
        if (codes[i].len > last_len) {
2906
222
            val <<= (codes[i].len - last_len);
2907
222
            last_len = codes[i].len;
2908
222
            max_val = (1U << codes[i].len) - 1;
2909
222
        }
2910
582
        codes[i].code = val;
2911
582
    }
2912
2913
    /*
2914
     * Compute the next starting point, offset by the i'th value.
2915
     * For example if codes 10, 11, 12, 13 are 30, 31, 32, 33 then
2916
     * codes[10..13].p = 30 - 10.
2917
     */
2918
357
    last_len = 0;
2919
936
    for (i = j = 0; i < ncodes; i++) {
2920
579
        if (codes[i].len > last_len) {
2921
222
            j = codes[i].code - i;
2922
222
            last_len = codes[i].len;
2923
222
        }
2924
579
        codes[i].p = j;
2925
579
    }
2926
2927
    // puts("==HUFF LEN==");
2928
    // for (i = 0; i <= last_len+1; i++) {
2929
    //     printf("len %d=%d prefix %d\n", i, h->u.huffman.lengths[i], h->u.huffman.prefix[i]);
2930
    // }
2931
    // puts("===HUFFMAN CODES===");
2932
    // for (i = 0; i < ncodes; i++) {
2933
    //     int j;
2934
    //     printf("%d: %d %d %d ", i, codes[i].symbol, codes[i].len, codes[i].code);
2935
    //     j = codes[i].len;
2936
    //     while (j) {
2937
    //         putchar(codes[i].code & (1 << --j) ? '1' : '0');
2938
    //     }
2939
    //     printf(" %d\n", codes[i].code);
2940
    // }
2941
2942
357
    if (option == E_BYTE || option == E_BYTE_ARRAY) {
2943
153
        if (h->u.huffman.codes[0].len == 0)
2944
66
            h->decode = cram_huffman_decode_char0;
2945
87
        else
2946
87
            h->decode = cram_huffman_decode_char;
2947
204
    } else if (option == E_LONG || option == E_SLONG) {
2948
0
        if (h->u.huffman.codes[0].len == 0)
2949
0
            h->decode = cram_huffman_decode_long0;
2950
0
        else
2951
0
            h->decode = cram_huffman_decode_long;
2952
204
    } else if (option == E_INT || option == E_SINT || option == E_BYTE) {
2953
204
        if (h->u.huffman.codes[0].len == 0)
2954
69
            h->decode = cram_huffman_decode_int0;
2955
135
        else
2956
135
            h->decode = cram_huffman_decode_int;
2957
204
    } else {
2958
0
        return NULL;
2959
0
    }
2960
357
    h->describe = cram_huffman_describe;
2961
2962
357
    return (cram_codec *)h;
2963
2964
20
 malformed:
2965
20
    hts_log_error("Malformed huffman header stream");
2966
20
    free(codes);
2967
20
    free(h);
2968
20
    return NULL;
2969
357
}
2970
2971
int cram_huffman_encode_char0(cram_slice *slice, cram_codec *c,
2972
18.8k
                              char *in, int in_size) {
2973
18.8k
    return 0;
2974
18.8k
}
2975
2976
int cram_huffman_encode_char(cram_slice *slice, cram_codec *c,
2977
0
                             char *in, int in_size) {
2978
0
    int i, code, len, r = 0;
2979
0
    unsigned char *syms = (unsigned char *)in;
2980
2981
0
    while (in_size--) {
2982
0
        int sym = *syms++;
2983
0
        if (sym >= -1 && sym < MAX_HUFF) {
2984
0
            i = c->u.e_huffman.val2code[sym+1];
2985
0
            assert(c->u.e_huffman.codes[i].symbol == sym);
2986
0
            code = c->u.e_huffman.codes[i].code;
2987
0
            len  = c->u.e_huffman.codes[i].len;
2988
0
        } else {
2989
            /* Slow - use a lookup table for when sym < MAX_HUFF? */
2990
0
            for (i = 0; i < c->u.e_huffman.nvals; i++) {
2991
0
                if (c->u.e_huffman.codes[i].symbol == sym)
2992
0
                    break;
2993
0
            }
2994
0
            if (i == c->u.e_huffman.nvals)
2995
0
                return -1;
2996
2997
0
            code = c->u.e_huffman.codes[i].code;
2998
0
            len  = c->u.e_huffman.codes[i].len;
2999
0
        }
3000
3001
0
        r |= store_bits_MSB(c->out, code, len);
3002
0
    }
3003
3004
0
    return r;
3005
0
}
3006
3007
int cram_huffman_encode_int0(cram_slice *slice, cram_codec *c,
3008
30.3M
                             char *in, int in_size) {
3009
30.3M
    return 0;
3010
30.3M
}
3011
3012
int cram_huffman_encode_int(cram_slice *slice, cram_codec *c,
3013
0
                            char *in, int in_size) {
3014
0
    int i, code, len, r = 0;
3015
0
    int *syms = (int *)in;
3016
3017
0
    while (in_size--) {
3018
0
        int sym = *syms++;
3019
3020
0
        if (sym >= -1 && sym < MAX_HUFF) {
3021
0
            i = c->u.e_huffman.val2code[sym+1];
3022
0
            assert(c->u.e_huffman.codes[i].symbol == sym);
3023
0
            code = c->u.e_huffman.codes[i].code;
3024
0
            len  = c->u.e_huffman.codes[i].len;
3025
0
        } else {
3026
            /* Slow - use a lookup table for when sym < MAX_HUFFMAN_SYM? */
3027
0
            for (i = 0; i < c->u.e_huffman.nvals; i++) {
3028
0
                if (c->u.e_huffman.codes[i].symbol == sym)
3029
0
                    break;
3030
0
            }
3031
0
            if (i == c->u.e_huffman.nvals)
3032
0
                return -1;
3033
3034
0
            code = c->u.e_huffman.codes[i].code;
3035
0
            len  = c->u.e_huffman.codes[i].len;
3036
0
        }
3037
3038
0
        r |= store_bits_MSB(c->out, code, len);
3039
0
    }
3040
3041
0
    return r;
3042
0
}
3043
3044
int cram_huffman_encode_long0(cram_slice *slice, cram_codec *c,
3045
0
                              char *in, int in_size) {
3046
0
    return 0;
3047
0
}
3048
3049
int cram_huffman_encode_long(cram_slice *slice, cram_codec *c,
3050
0
                             char *in, int in_size) {
3051
0
    int i, code, len, r = 0;
3052
0
    int64_t *syms = (int64_t *)in;
3053
3054
0
    while (in_size--) {
3055
0
        int sym = *syms++;
3056
3057
0
        if (sym >= -1 && sym < MAX_HUFF) {
3058
0
            i = c->u.e_huffman.val2code[sym+1];
3059
0
            assert(c->u.e_huffman.codes[i].symbol == sym);
3060
0
            code = c->u.e_huffman.codes[i].code;
3061
0
            len  = c->u.e_huffman.codes[i].len;
3062
0
        } else {
3063
            /* Slow - use a lookup table for when sym < MAX_HUFFMAN_SYM? */
3064
0
            for (i = 0; i < c->u.e_huffman.nvals; i++) {
3065
0
                if (c->u.e_huffman.codes[i].symbol == sym)
3066
0
                    break;
3067
0
            }
3068
0
            if (i == c->u.e_huffman.nvals)
3069
0
                return -1;
3070
3071
0
            code = c->u.e_huffman.codes[i].code;
3072
0
            len  = c->u.e_huffman.codes[i].len;
3073
0
        }
3074
3075
0
        r |= store_bits_MSB(c->out, code, len);
3076
0
    }
3077
3078
0
    return r;
3079
0
}
3080
3081
207k
void cram_huffman_encode_free(cram_codec *c) {
3082
207k
    if (!c)
3083
0
        return;
3084
3085
207k
    if (c->u.e_huffman.codes)
3086
207k
        free(c->u.e_huffman.codes);
3087
207k
    free(c);
3088
207k
}
3089
3090
/*
3091
 * Encodes a huffman tree.
3092
 * Returns number of bytes written.
3093
 */
3094
int cram_huffman_encode_store(cram_codec *c, cram_block *b, char *prefix,
3095
206k
                              int version) {
3096
206k
    int i, len = 0, r = 0, n;
3097
206k
    cram_huffman_code *codes = c->u.e_huffman.codes;
3098
    /*
3099
     * Up to code length 127 means 2.5e+26 bytes of data required (worst
3100
     * case huffman tree needs symbols with freqs matching the Fibonacci
3101
     * series). So guaranteed 1 byte per code.
3102
     *
3103
     * Symbols themselves could be 5 bytes (eg -1 is 5 bytes in itf8).
3104
     *
3105
     * Therefore 6*ncodes + 5 + 5 + 1 + 5 is max memory
3106
     */
3107
206k
    char *tmp = malloc(6*c->u.e_huffman.nvals+16);
3108
206k
    char *tp = tmp, *tpend = tmp+6*c->u.e_huffman.nvals+16;
3109
3110
206k
    if (!tmp)
3111
0
        return -1;
3112
3113
206k
    if (prefix) {
3114
168k
        size_t l = strlen(prefix);
3115
168k
        BLOCK_APPEND(b, prefix, l);
3116
168k
        len += l;
3117
168k
    }
3118
3119
206k
    tp += c->vv->varint_put32(tp, tpend, c->u.e_huffman.nvals);
3120
206k
    if (c->u.e_huffman.option == E_LONG) {
3121
0
        for (i = 0; i < c->u.e_huffman.nvals; i++) {
3122
0
            tp += c->vv->varint_put64(tp, tpend, codes[i].symbol);
3123
0
        }
3124
206k
    } else if (c->u.e_huffman.option == E_SLONG) {
3125
0
        for (i = 0; i < c->u.e_huffman.nvals; i++) {
3126
0
            tp += c->vv->varint_put64s(tp, tpend, codes[i].symbol);
3127
0
        }
3128
206k
    } else if (c->u.e_huffman.option == E_INT || c->u.e_huffman.option == E_BYTE) {
3129
413k
        for (i = 0; i < c->u.e_huffman.nvals; i++) {
3130
206k
            tp += c->vv->varint_put32(tp, tpend, codes[i].symbol);
3131
206k
        }
3132
206k
    } else if (c->u.e_huffman.option == E_SINT) {
3133
0
        for (i = 0; i < c->u.e_huffman.nvals; i++) {
3134
0
            tp += c->vv->varint_put32s(tp, tpend, codes[i].symbol);
3135
0
        }
3136
0
    } else {
3137
0
        return -1;
3138
0
    }
3139
3140
206k
    tp += c->vv->varint_put32(tp, tpend, c->u.e_huffman.nvals);
3141
413k
    for (i = 0; i < c->u.e_huffman.nvals; i++)
3142
206k
        tp += c->vv->varint_put32(tp, tpend, codes[i].len);
3143
3144
206k
    len += (n = c->vv->varint_put32_blk(b, c->codec)); r |= n;
3145
206k
    len += (n = c->vv->varint_put32_blk(b, tp-tmp));   r |= n;
3146
206k
    BLOCK_APPEND(b, tmp, tp-tmp);
3147
206k
    len += tp-tmp;
3148
3149
206k
    free(tmp);
3150
3151
206k
    if (r > 0)
3152
206k
        return len;
3153
3154
0
 block_err:
3155
0
    return -1;
3156
206k
}
3157
3158
cram_codec *cram_huffman_encode_init(cram_stats *st,
3159
                                     enum cram_encoding codec,
3160
                                     enum cram_external_type option,
3161
                                     void *dat,
3162
207k
                                     int version, varint_vec *vv) {
3163
207k
    int *vals = NULL, *freqs = NULL, *lens = NULL, code, len;
3164
207k
    int *new_vals, *new_freqs;
3165
207k
    int i, max_val = 0, min_val = INT_MAX, k;
3166
207k
    size_t nvals, vals_alloc = 0;
3167
207k
    cram_codec *c;
3168
207k
    cram_huffman_code *codes;
3169
3170
207k
    c = malloc(sizeof(*c));
3171
207k
    if (!c)
3172
0
        return NULL;
3173
207k
    c->codec = E_HUFFMAN;
3174
3175
    /* Count number of unique symbols */
3176
212M
    for (nvals = i = 0; i < MAX_STAT_VAL; i++) {
3177
212M
        if (!st->freqs[i])
3178
211M
            continue;
3179
171k
        if (nvals >= vals_alloc) {
3180
171k
            vals_alloc = vals_alloc ? vals_alloc*2 : 1024;
3181
171k
            new_vals  = realloc(vals,  vals_alloc * sizeof(int));
3182
171k
            if (!new_vals) goto nomem;
3183
171k
            vals = new_vals;
3184
171k
            new_freqs = realloc(freqs, vals_alloc * sizeof(int));
3185
171k
            if (!new_freqs) goto nomem;
3186
171k
            freqs = new_freqs;
3187
171k
        }
3188
171k
        vals[nvals] = i;
3189
171k
        freqs[nvals] = st->freqs[i];
3190
171k
        assert(st->freqs[i] > 0);
3191
171k
        if (max_val < i) max_val = i;
3192
171k
        if (min_val > i) min_val = i;
3193
171k
        nvals++;
3194
171k
    }
3195
207k
    if (st->h) {
3196
35.9k
        khint_t k;
3197
3198
179k
        for (k = kh_begin(st->h); k != kh_end(st->h); k++) {
3199
143k
            if (!kh_exist(st->h, k))
3200
107k
                continue;
3201
35.9k
            if (nvals >= vals_alloc) {
3202
35.9k
                vals_alloc = vals_alloc ? vals_alloc*2 : 1024;
3203
35.9k
                new_vals  = realloc(vals,  vals_alloc * sizeof(int));
3204
35.9k
                if (!new_vals) goto nomem;
3205
35.9k
                vals = new_vals;
3206
35.9k
                new_freqs = realloc(freqs, vals_alloc * sizeof(int));
3207
35.9k
                if (!new_freqs) goto nomem;
3208
35.9k
                freqs = new_freqs;
3209
35.9k
            }
3210
35.9k
            vals[nvals]= kh_key(st->h, k);
3211
35.9k
            freqs[nvals] = kh_val(st->h, k);
3212
35.9k
            assert(freqs[nvals] > 0);
3213
35.9k
            if (max_val < i) max_val = i;
3214
35.9k
            if (min_val > i) min_val = i;
3215
35.9k
            nvals++;
3216
35.9k
        }
3217
35.9k
    }
3218
3219
207k
    assert(nvals > 0);
3220
3221
207k
    new_freqs = realloc(freqs, 2*nvals*sizeof(*freqs));
3222
207k
    if (!new_freqs) goto nomem;
3223
207k
    freqs = new_freqs;
3224
207k
    lens = calloc(2*nvals, sizeof(*lens));
3225
207k
    if (!lens) goto nomem;
3226
3227
    /* Inefficient, use pointers to form chain so we can insert and maintain
3228
     * a sorted list? This is currently O(nvals^2) complexity.
3229
     */
3230
207k
    for (;;) {
3231
207k
        int low1 = INT_MAX, low2 = INT_MAX;
3232
207k
        int ind1 = 0, ind2 = 0;
3233
414k
        for (i = 0; i < nvals; i++) {
3234
207k
            if (freqs[i] < 0)
3235
0
                continue;
3236
207k
            if (low1 > freqs[i])
3237
207k
                low2 = low1, ind2 = ind1, low1 = freqs[i], ind1 = i;
3238
0
            else if (low2 > freqs[i])
3239
0
                low2 = freqs[i], ind2 = i;
3240
207k
        }
3241
207k
        if (low2 == INT_MAX)
3242
207k
            break;
3243
3244
0
        freqs[nvals] = low1 + low2;
3245
0
        lens[ind1] = nvals;
3246
0
        lens[ind2] = nvals;
3247
0
        freqs[ind1] *= -1;
3248
0
        freqs[ind2] *= -1;
3249
0
        nvals++;
3250
0
    }
3251
207k
    nvals = nvals/2+1;
3252
3253
    /* Assign lengths */
3254
414k
    for (i = 0; i < nvals; i++) {
3255
207k
        int code_len = 0;
3256
207k
        for (k = lens[i]; k; k = lens[k])
3257
0
            code_len++;
3258
207k
        lens[i] = code_len;
3259
207k
        freqs[i] *= -1;
3260
        //fprintf(stderr, "%d / %d => %d\n", vals[i], freqs[i], lens[i]);
3261
207k
    }
3262
3263
3264
    /* Sort, need in a struct */
3265
207k
    if (!(codes = malloc(nvals * sizeof(*codes))))
3266
0
        goto nomem;
3267
414k
    for (i = 0; i < nvals; i++) {
3268
207k
        codes[i].symbol = vals[i];
3269
207k
        codes[i].len = lens[i];
3270
207k
    }
3271
207k
    qsort(codes, nvals, sizeof(*codes), code_sort);
3272
3273
    /*
3274
     * Generate canonical codes from lengths.
3275
     * Sort by length.
3276
     * Start with 0.
3277
     * Every new code of same length is +1.
3278
     * Every new code of new length is +1 then <<1 per extra length.
3279
     *
3280
     * /\
3281
     * a/\
3282
     * /\/\
3283
     * bcd/\
3284
     *    ef
3285
     *
3286
     * a 1  0
3287
     * b 3  4 (0+1)<<2
3288
     * c 3  5
3289
     * d 3  6
3290
     * e 4  14  (6+1)<<1
3291
     * f 5  15
3292
     */
3293
207k
    code = 0; len = codes[0].len;
3294
414k
    for (i = 0; i < nvals; i++) {
3295
207k
        while (len != codes[i].len) {
3296
0
            code<<=1;
3297
0
            len++;
3298
0
        }
3299
207k
        codes[i].code = code++;
3300
3301
207k
        if (codes[i].symbol >= -1 && codes[i].symbol < MAX_HUFF)
3302
204k
            c->u.e_huffman.val2code[codes[i].symbol+1] = i;
3303
3304
        //fprintf(stderr, "sym %d, code %d, len %d\n",
3305
        //      codes[i].symbol, codes[i].code, codes[i].len);
3306
207k
    }
3307
3308
207k
    free(lens);
3309
207k
    free(vals);
3310
207k
    free(freqs);
3311
3312
207k
    c->u.e_huffman.codes = codes;
3313
207k
    c->u.e_huffman.nvals = nvals;
3314
207k
    c->u.e_huffman.option = option;
3315
3316
207k
    c->free = cram_huffman_encode_free;
3317
207k
    if (option == E_BYTE || option == E_BYTE_ARRAY) {
3318
6.08k
        if (c->u.e_huffman.codes[0].len == 0)
3319
6.08k
            c->encode = cram_huffman_encode_char0;
3320
0
        else
3321
0
            c->encode = cram_huffman_encode_char;
3322
201k
    } else if (option == E_INT || option == E_SINT) {
3323
201k
        if (c->u.e_huffman.codes[0].len == 0)
3324
201k
            c->encode = cram_huffman_encode_int0;
3325
0
        else
3326
0
            c->encode = cram_huffman_encode_int;
3327
201k
    } else if (option == E_LONG || option == E_SLONG) {
3328
0
        if (c->u.e_huffman.codes[0].len == 0)
3329
0
            c->encode = cram_huffman_encode_long0;
3330
0
        else
3331
0
            c->encode = cram_huffman_encode_long;
3332
0
    } else {
3333
0
        return NULL;
3334
0
    }
3335
207k
    c->store = cram_huffman_encode_store;
3336
207k
    c->flush = NULL;
3337
3338
207k
    return c;
3339
3340
0
 nomem:
3341
0
    hts_log_error("Out of memory");
3342
0
    free(vals);
3343
0
    free(freqs);
3344
0
    free(lens);
3345
0
    free(c);
3346
0
    return NULL;
3347
207k
}
3348
3349
/*
3350
 * ---------------------------------------------------------------------------
3351
 * BYTE_ARRAY_LEN
3352
 */
3353
int cram_byte_array_len_decode(cram_slice *slice, cram_codec *c,
3354
                               cram_block *in, char *out,
3355
0
                               int *out_size) {
3356
    /* Fetch length */
3357
0
    int32_t len = 0, one = 1;
3358
0
    int r;
3359
3360
0
    r = c->u.byte_array_len.len_codec->decode(slice, c->u.byte_array_len.len_codec,
3361
0
                                              in, (char *)&len, &one);
3362
    //printf("ByteArray Len=%d\n", len);
3363
3364
0
    if (!r && c->u.byte_array_len.val_codec && len >= 0) {
3365
0
        r = c->u.byte_array_len.val_codec->decode(slice,
3366
0
                                                  c->u.byte_array_len.val_codec,
3367
0
                                                  in, out, &len);
3368
0
    } else {
3369
0
        return -1;
3370
0
    }
3371
3372
0
    *out_size = len;
3373
3374
0
    return r;
3375
0
}
3376
3377
1.03k
void cram_byte_array_len_decode_free(cram_codec *c) {
3378
1.03k
    if (!c) return;
3379
3380
1.03k
    if (c->u.byte_array_len.len_codec)
3381
1.01k
        c->u.byte_array_len.len_codec->free(c->u.byte_array_len.len_codec);
3382
3383
1.03k
    if (c->u.byte_array_len.val_codec)
3384
1.01k
        c->u.byte_array_len.val_codec->free(c->u.byte_array_len.val_codec);
3385
3386
1.03k
    free(c);
3387
1.03k
}
3388
3389
0
int cram_byte_array_len_describe(cram_codec *c, kstring_t *ks) {
3390
0
    int r = 0;
3391
0
    r |= ksprintf(ks, "BYTE_ARRAY_LEN(len_codec={") < 0;
3392
0
    cram_byte_array_len_decoder *l = &c->u.byte_array_len;
3393
0
    r |=  l->len_codec->describe
3394
0
        ? l->len_codec->describe(l->len_codec, ks)
3395
0
        : (ksprintf(ks, "?")<0);
3396
0
    r |= ksprintf(ks, "},val_codec={") < 0;
3397
0
    r |=  l->val_codec->describe
3398
0
        ? l->val_codec->describe(l->val_codec, ks)
3399
0
        : (ksprintf(ks, "?")<0);
3400
0
    r |= ksprintf(ks, "}") < 0;
3401
3402
0
    return r;
3403
0
}
3404
3405
cram_codec *cram_byte_array_len_decode_init(cram_block_compression_hdr *hdr,
3406
                                            char *data, int size,
3407
                                            enum cram_encoding codec,
3408
                                            enum cram_external_type option,
3409
1.03k
                                            int version, varint_vec *vv) {
3410
1.03k
    cram_codec *c;
3411
1.03k
    char *cp   = data;
3412
1.03k
    char *endp = data + size;
3413
3414
1.03k
    if (!(c = malloc(sizeof(*c))))
3415
0
        return NULL;
3416
3417
1.03k
    c->codec  = E_BYTE_ARRAY_LEN;
3418
1.03k
    c->decode = cram_byte_array_len_decode;
3419
1.03k
    c->free   = cram_byte_array_len_decode_free;
3420
1.03k
    c->describe = cram_byte_array_len_describe;
3421
1.03k
    c->u.byte_array_len.len_codec = NULL;
3422
1.03k
    c->u.byte_array_len.val_codec = NULL;
3423
3424
1.03k
    int encoding = vv->varint_get32(&cp, endp, NULL);
3425
1.03k
    int sub_size = vv->varint_get32(&cp, endp, NULL);
3426
1.03k
    if (sub_size < 0 || endp - cp < sub_size)
3427
6
        goto malformed;
3428
1.02k
    c->u.byte_array_len.len_codec = cram_decoder_init(hdr, encoding, cp, sub_size,
3429
1.02k
                                                      E_INT, version, vv);
3430
1.02k
    if (c->u.byte_array_len.len_codec == NULL)
3431
15
        goto no_codec;
3432
1.01k
    cp += sub_size;
3433
3434
1.01k
    encoding = vv->varint_get32(&cp, endp, NULL);
3435
1.01k
    sub_size = vv->varint_get32(&cp, endp, NULL);
3436
1.01k
    if (sub_size < 0 || endp - cp < sub_size)
3437
0
        goto malformed;
3438
1.01k
    c->u.byte_array_len.val_codec = cram_decoder_init(hdr, encoding, cp, sub_size,
3439
1.01k
                                                      option, version, vv);
3440
1.01k
    if (c->u.byte_array_len.val_codec == NULL)
3441
0
        goto no_codec;
3442
1.01k
    cp += sub_size;
3443
3444
1.01k
    if (cp - data != size)
3445
3
        goto malformed;
3446
3447
1.00k
    return c;
3448
3449
9
 malformed:
3450
9
    hts_log_error("Malformed byte_array_len header stream");
3451
24
 no_codec:
3452
24
    cram_byte_array_len_decode_free(c);
3453
24
    return NULL;
3454
9
}
3455
3456
int cram_byte_array_len_encode(cram_slice *slice, cram_codec *c,
3457
10.8k
                               char *in, int in_size) {
3458
10.8k
    int32_t i32 = in_size;
3459
10.8k
    int r = 0;
3460
3461
10.8k
    r |= c->u.e_byte_array_len.len_codec->encode(slice,
3462
10.8k
                                                 c->u.e_byte_array_len.len_codec,
3463
10.8k
                                                 (char *)&i32, 1);
3464
10.8k
    r |= c->u.e_byte_array_len.val_codec->encode(slice,
3465
10.8k
                                                 c->u.e_byte_array_len.val_codec,
3466
10.8k
                                                 in, in_size);
3467
10.8k
    return r;
3468
10.8k
}
3469
3470
51.6k
void cram_byte_array_len_encode_free(cram_codec *c) {
3471
51.6k
    if (!c)
3472
0
        return;
3473
3474
51.6k
    if (c->u.e_byte_array_len.len_codec)
3475
51.6k
        c->u.e_byte_array_len.len_codec->free(c->u.e_byte_array_len.len_codec);
3476
3477
51.6k
    if (c->u.e_byte_array_len.val_codec)
3478
51.6k
        c->u.e_byte_array_len.val_codec->free(c->u.e_byte_array_len.val_codec);
3479
3480
51.6k
    free(c);
3481
51.6k
}
3482
3483
int cram_byte_array_len_encode_store(cram_codec *c, cram_block *b,
3484
51.5k
                                     char *prefix, int version) {
3485
51.5k
    int len = 0, len2, len3, r = 0, n;
3486
51.5k
    cram_codec *tc;
3487
51.5k
    cram_block *b_len = NULL, *b_val = NULL;
3488
3489
51.5k
    if (prefix) {
3490
13.1k
        size_t l = strlen(prefix);
3491
13.1k
        BLOCK_APPEND(b, prefix, l);
3492
13.1k
        len += l;
3493
13.1k
    }
3494
3495
51.5k
    tc = c->u.e_byte_array_len.len_codec;
3496
51.5k
    b_len = cram_new_block(0, 0);
3497
51.5k
    if (!b_len) goto block_err;
3498
51.5k
    len2 = tc->store(tc, b_len, NULL, version);
3499
51.5k
    if (len2 < 0) goto block_err;
3500
3501
51.5k
    tc = c->u.e_byte_array_len.val_codec;
3502
51.5k
    b_val = cram_new_block(0, 0);
3503
51.5k
    if (!b_val) goto block_err;
3504
51.5k
    len3 = tc->store(tc, b_val, NULL, version);
3505
51.5k
    if (len3 < 0) goto block_err;
3506
3507
51.5k
    len += (n = c->vv->varint_put32_blk(b, c->codec));  r |= n;
3508
51.5k
    len += (n = c->vv->varint_put32_blk(b, len2+len3)); r |= n;
3509
51.5k
    BLOCK_APPEND(b, BLOCK_DATA(b_len), BLOCK_SIZE(b_len));
3510
51.5k
    BLOCK_APPEND(b, BLOCK_DATA(b_val), BLOCK_SIZE(b_val));
3511
3512
51.5k
    cram_free_block(b_len);
3513
51.5k
    cram_free_block(b_val);
3514
3515
51.5k
    if (r > 0)
3516
51.5k
        return len + len2 + len3;
3517
3518
0
 block_err:
3519
0
    if (b_len) cram_free_block(b_len);
3520
0
    if (b_val) cram_free_block(b_val);
3521
0
    return -1;
3522
51.5k
}
3523
3524
cram_codec *cram_byte_array_len_encode_init(cram_stats *st,
3525
                                            enum cram_encoding codec,
3526
                                            enum cram_external_type option,
3527
                                            void *dat,
3528
51.6k
                                            int version, varint_vec *vv) {
3529
51.6k
    cram_codec *c;
3530
51.6k
    cram_byte_array_len_encoder *e = (cram_byte_array_len_encoder *)dat;
3531
3532
51.6k
    c = malloc(sizeof(*c));
3533
51.6k
    if (!c)
3534
0
        return NULL;
3535
51.6k
    c->codec = E_BYTE_ARRAY_LEN;
3536
51.6k
    c->free = cram_byte_array_len_encode_free;
3537
51.6k
    c->encode = cram_byte_array_len_encode;
3538
51.6k
    c->store = cram_byte_array_len_encode_store;
3539
51.6k
    c->flush = NULL;
3540
3541
51.6k
    c->u.e_byte_array_len.len_codec = cram_encoder_init(e->len_encoding,
3542
51.6k
                                                        st, E_INT,
3543
51.6k
                                                        e->len_dat,
3544
51.6k
                                                        version, vv);
3545
51.6k
    c->u.e_byte_array_len.val_codec = cram_encoder_init(e->val_encoding,
3546
51.6k
                                                        NULL, E_BYTE_ARRAY,
3547
51.6k
                                                        e->val_dat,
3548
51.6k
                                                        version, vv);
3549
3550
51.6k
    if (!c->u.e_byte_array_len.len_codec ||
3551
51.6k
        !c->u.e_byte_array_len.val_codec) {
3552
0
        cram_byte_array_len_encode_free(c);
3553
0
        return NULL;
3554
0
    }
3555
3556
51.6k
    return c;
3557
51.6k
}
3558
3559
/*
3560
 * ---------------------------------------------------------------------------
3561
 * BYTE_ARRAY_STOP
3562
 */
3563
static int cram_byte_array_stop_decode_char(cram_slice *slice, cram_codec *c,
3564
                                            cram_block *in, char *out,
3565
0
                                            int *out_size) {
3566
0
    char *cp, ch;
3567
0
    cram_block *b = NULL;
3568
3569
0
    b = cram_get_block_by_id(slice, c->u.byte_array_stop.content_id);
3570
0
    if (!b)
3571
0
        return *out_size?-1:0;
3572
3573
0
    if (b->idx >= b->uncomp_size)
3574
0
        return -1;
3575
3576
0
    cp = (char *)b->data + b->idx;
3577
0
    if (out) {
3578
       // memccpy equivalent but without copying the terminating byte
3579
0
        ssize_t term = MIN(*out_size, b->uncomp_size - b->idx);
3580
0
        while ((ch = *cp) != (char)c->u.byte_array_stop.stop) {
3581
0
            if (term-- < 0)
3582
0
                break;
3583
0
            *out++ = ch;
3584
0
            cp++;
3585
0
        }
3586
3587
        // Attempted overrun on input or output
3588
0
        if (ch != (char)c->u.byte_array_stop.stop)
3589
0
            return -1;
3590
0
    } else {
3591
        // Consume input, but produce no output
3592
0
        while ((ch = *cp) != (char)c->u.byte_array_stop.stop) {
3593
0
            if (cp - (char *)b->data >= b->uncomp_size)
3594
0
                return -1;
3595
0
            cp++;
3596
0
        }
3597
0
    }
3598
3599
0
    *out_size = cp - (char *)(b->data + b->idx);
3600
0
    b->idx = cp - (char *)b->data + 1;
3601
3602
0
    return 0;
3603
0
}
3604
3605
int cram_byte_array_stop_decode_block(cram_slice *slice, cram_codec *c,
3606
                                      cram_block *in, char *out_,
3607
0
                                      int *out_size) {
3608
0
    cram_block *b;
3609
0
    cram_block *out = (cram_block *)out_;
3610
0
    unsigned char *cp, *cp_end;
3611
0
    unsigned char stop;
3612
3613
0
    b = cram_get_block_by_id(slice, c->u.byte_array_stop.content_id);
3614
0
    if (!b)
3615
0
        return *out_size?-1:0;
3616
3617
0
    if (b->idx >= b->uncomp_size)
3618
0
        return -1;
3619
0
    cp = b->data + b->idx;
3620
0
    cp_end = b->data + b->uncomp_size;
3621
3622
    // STOP byte is hard-coded as zero by our name tokeniser decoder
3623
    // implementation, so we may ignore what was requested.
3624
0
    stop = b->orig_method == TOK3 ? 0 : c->u.byte_array_stop.stop;
3625
3626
0
    if (cp_end - cp < out->alloc - out->byte) {
3627
0
        unsigned char *out_cp = BLOCK_END(out);
3628
0
        while (cp != cp_end && *cp != stop)
3629
0
            *out_cp++ = *cp++;
3630
0
        BLOCK_SIZE(out) = out_cp - BLOCK_DATA(out);
3631
0
    } else {
3632
0
        unsigned char *cp_start;
3633
0
        for (cp_start = cp; cp != cp_end && *cp != stop; cp++)
3634
0
            ;
3635
0
        BLOCK_APPEND(out, cp_start, cp - cp_start);
3636
0
        BLOCK_GROW(out, cp - cp_start);
3637
0
    }
3638
3639
0
    *out_size = cp - (b->data + b->idx);
3640
0
    b->idx = cp - b->data + 1;
3641
3642
0
    return 0;
3643
3644
0
 block_err:
3645
0
    return -1;
3646
0
}
3647
3648
348
void cram_byte_array_stop_decode_free(cram_codec *c) {
3649
348
    if (!c) return;
3650
3651
348
    free(c);
3652
348
}
3653
3654
0
int cram_byte_array_stop_describe(cram_codec *c, kstring_t *ks) {
3655
0
    return ksprintf(ks, "BYTE_ARRAY_STOP(stop=%d,id=%d)",
3656
0
                    c->u.byte_array_stop.stop,
3657
0
                    c->u.byte_array_stop.content_id)
3658
0
        < 0 ? -1 : 0;
3659
0
}
3660
3661
cram_codec *cram_byte_array_stop_decode_init(cram_block_compression_hdr *hdr,
3662
                                             char *data, int size,
3663
                                             enum cram_encoding codec,
3664
                                             enum cram_external_type option,
3665
348
                                             int version, varint_vec *vv) {
3666
348
    cram_codec *c = NULL;
3667
348
    unsigned char *cp = (unsigned char *)data;
3668
348
    int err = 0;
3669
3670
348
    if (size < (CRAM_MAJOR_VERS(version) == 1 ? 5 : 2))
3671
0
        goto malformed;
3672
3673
348
    if (!(c = malloc(sizeof(*c))))
3674
0
        return NULL;
3675
3676
348
    c->codec  = E_BYTE_ARRAY_STOP;
3677
348
    switch (option) {
3678
342
    case E_BYTE_ARRAY_BLOCK:
3679
342
        c->decode = cram_byte_array_stop_decode_block;
3680
342
        break;
3681
6
    case E_BYTE_ARRAY:
3682
6
        c->decode = cram_byte_array_stop_decode_char;
3683
6
        break;
3684
0
    default:
3685
0
        hts_log_error("The byte_array_stop codec only supports BYTE_ARRAYs");
3686
0
        free(c);
3687
0
        return NULL;
3688
348
    }
3689
348
    c->free   = cram_byte_array_stop_decode_free;
3690
348
    c->describe = cram_byte_array_stop_describe;
3691
3692
348
    c->u.byte_array_stop.stop = *cp++;
3693
348
    if (CRAM_MAJOR_VERS(version) == 1) {
3694
348
        c->u.byte_array_stop.content_id = cp[0] + (cp[1]<<8) + (cp[2]<<16)
3695
348
            + ((unsigned int) cp[3]<<24);
3696
348
        cp += 4;
3697
348
    } else {
3698
0
        c->u.byte_array_stop.content_id = vv->varint_get32((char **)&cp, data+size, &err);
3699
0
    }
3700
3701
348
    if ((char *)cp - data != size || err)
3702
0
        goto malformed;
3703
3704
348
    return c;
3705
3706
0
 malformed:
3707
0
    hts_log_error("Malformed byte_array_stop header stream");
3708
0
    free(c);
3709
0
    return NULL;
3710
348
}
3711
3712
int cram_byte_array_stop_encode(cram_slice *slice, cram_codec *c,
3713
103k
                                char *in, int in_size) {
3714
103k
    BLOCK_APPEND(c->out, in, in_size);
3715
103k
    BLOCK_APPEND_CHAR(c->out, c->u.e_byte_array_stop.stop);
3716
103k
    return 0;
3717
3718
0
 block_err:
3719
0
    return -1;
3720
103k
}
3721
3722
79.5k
void cram_byte_array_stop_encode_free(cram_codec *c) {
3723
79.5k
    if (!c)
3724
0
        return;
3725
79.5k
    free(c);
3726
79.5k
}
3727
3728
int cram_byte_array_stop_encode_store(cram_codec *c, cram_block *b,
3729
79.4k
                                      char *prefix, int version) {
3730
79.4k
    int len = 0;
3731
79.4k
    char buf[20], *cp = buf;
3732
3733
79.4k
    if (prefix) {
3734
39.5k
        size_t l = strlen(prefix);
3735
39.5k
        BLOCK_APPEND(b, prefix, l);
3736
39.5k
        len += l;
3737
39.5k
    }
3738
3739
79.4k
    cp += c->vv->varint_put32(cp, buf+20, c->codec);
3740
3741
79.4k
    if (CRAM_MAJOR_VERS(version) == 1) {
3742
0
        cp += c->vv->varint_put32(cp, buf+20, 5);
3743
0
        *cp++ = c->u.e_byte_array_stop.stop;
3744
0
        *cp++ = (c->u.e_byte_array_stop.content_id >>  0) & 0xff;
3745
0
        *cp++ = (c->u.e_byte_array_stop.content_id >>  8) & 0xff;
3746
0
        *cp++ = (c->u.e_byte_array_stop.content_id >> 16) & 0xff;
3747
0
        *cp++ = (c->u.e_byte_array_stop.content_id >> 24) & 0xff;
3748
79.4k
    } else {
3749
79.4k
        cp += c->vv->varint_put32(cp, buf+20, 1 +
3750
79.4k
                                  c->vv->varint_size(c->u.e_byte_array_stop.content_id));
3751
79.4k
        *cp++ = c->u.e_byte_array_stop.stop;
3752
79.4k
        cp += c->vv->varint_put32(cp, buf+20, c->u.e_byte_array_stop.content_id);
3753
79.4k
    }
3754
3755
79.4k
    BLOCK_APPEND(b, buf, cp-buf);
3756
79.4k
    len += cp-buf;
3757
3758
79.4k
    return len;
3759
3760
0
 block_err:
3761
0
    return -1;
3762
79.4k
}
3763
3764
cram_codec *cram_byte_array_stop_encode_init(cram_stats *st,
3765
                                             enum cram_encoding codec,
3766
                                             enum cram_external_type option,
3767
                                             void *dat,
3768
79.5k
                                             int version, varint_vec *vv) {
3769
79.5k
    cram_codec *c;
3770
3771
79.5k
    c = malloc(sizeof(*c));
3772
79.5k
    if (!c)
3773
0
        return NULL;
3774
79.5k
    c->codec = E_BYTE_ARRAY_STOP;
3775
79.5k
    c->free = cram_byte_array_stop_encode_free;
3776
79.5k
    c->encode = cram_byte_array_stop_encode;
3777
79.5k
    c->store = cram_byte_array_stop_encode_store;
3778
79.5k
    c->flush = NULL;
3779
3780
79.5k
    c->u.e_byte_array_stop.stop = ((int *)dat)[0];
3781
79.5k
    c->u.e_byte_array_stop.content_id = ((int *)dat)[1];
3782
3783
79.5k
    return c;
3784
79.5k
}
3785
3786
/*
3787
 * ---------------------------------------------------------------------------
3788
 */
3789
3790
61
const char *cram_encoding2str(enum cram_encoding t) {
3791
61
    switch (t) {
3792
2
    case E_NULL:            return "NULL";
3793
0
    case E_EXTERNAL:        return "EXTERNAL";
3794
3
    case E_GOLOMB:          return "GOLOMB";
3795
0
    case E_HUFFMAN:         return "HUFFMAN";
3796
0
    case E_BYTE_ARRAY_LEN:  return "BYTE_ARRAY_LEN";
3797
0
    case E_BYTE_ARRAY_STOP: return "BYTE_ARRAY_STOP";
3798
9
    case E_BETA:            return "BETA";
3799
0
    case E_SUBEXP:          return "SUBEXP";
3800
0
    case E_GOLOMB_RICE:     return "GOLOMB_RICE";
3801
0
    case E_GAMMA:           return "GAMMA";
3802
3803
0
    case E_VARINT_UNSIGNED: return "VARINT_UNSIGNED";
3804
0
    case E_VARINT_SIGNED:   return "VARINT_SIGNED";
3805
0
    case E_CONST_BYTE:      return "CONST_BYTE";
3806
0
    case E_CONST_INT:       return "CONST_INT";
3807
3808
0
    case E_NUM_CODECS:
3809
47
    default:                return "?";
3810
61
    }
3811
61
}
3812
3813
static cram_codec *(*decode_init[])(cram_block_compression_hdr *hdr,
3814
                                    char *data,
3815
                                    int size,
3816
                                    enum cram_encoding codec,
3817
                                    enum cram_external_type option,
3818
                                    int version, varint_vec *vv) = {
3819
    // CRAM 3.0 valid codecs
3820
    NULL, // null codec
3821
    cram_external_decode_init,
3822
    NULL, // golomb
3823
    cram_huffman_decode_init,
3824
    cram_byte_array_len_decode_init,
3825
    cram_byte_array_stop_decode_init,
3826
    cram_beta_decode_init,
3827
    cram_subexp_decode_init,
3828
    NULL, // golomb rice
3829
    cram_gamma_decode_init,
3830
3831
    // Gap between CRAM 3 and CRAM 4; 9 to 39 inclusive
3832
    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
3833
    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
3834
    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
3835
3836
    NULL,                      // was xbyte
3837
    cram_varint_decode_init,   // varint unsigned
3838
    cram_varint_decode_init,   // varint signed
3839
    cram_const_decode_init,    // const byte
3840
    cram_const_decode_init,    // const int
3841
3842
    // Gap to CRAM 4 transfomrations; 45 to 49 inclusive
3843
    NULL, NULL, NULL, NULL, NULL,
3844
3845
    NULL, // xhuffman
3846
    cram_xpack_decode_init,
3847
    cram_xrle_decode_init,
3848
    cram_xdelta_decode_init,
3849
};
3850
3851
cram_codec *cram_decoder_init(cram_block_compression_hdr *hdr,
3852
                              enum cram_encoding codec,
3853
                              char *data, int size,
3854
                              enum cram_external_type option,
3855
5.74k
                              int version, varint_vec *vv) {
3856
5.74k
    if (codec >= E_NULL && codec < E_NUM_CODECS && decode_init[codec]) {
3857
5.69k
        cram_codec *r = decode_init[codec](hdr, data, size, codec,
3858
5.69k
                                           option, version, vv);
3859
5.69k
        if (r) {
3860
5.61k
            r->vv = vv;
3861
5.61k
            r->codec_id = hdr->ncodecs++;
3862
5.61k
        }
3863
5.69k
        return r;
3864
5.69k
    } else {
3865
52
        hts_log_error("Unimplemented codec of type %s", cram_encoding2str(codec));
3866
52
        return NULL;
3867
52
    }
3868
5.74k
}
3869
3870
static cram_codec *(*encode_init[])(cram_stats *stx,
3871
                                    enum cram_encoding codec,
3872
                                    enum cram_external_type option,
3873
                                    void *opt,
3874
                                    int version, varint_vec *vv) = {
3875
    // CRAM 3.0 valid codecs
3876
    NULL, // null codec
3877
    cram_external_encode_init, // int/bytes in cram 3, byte only in cram 4
3878
    NULL, // golomb
3879
    cram_huffman_encode_init,
3880
    cram_byte_array_len_encode_init,
3881
    cram_byte_array_stop_encode_init,
3882
    cram_beta_encode_init,
3883
    NULL, // subexponential (we support decode only)
3884
    NULL, // golomb rice
3885
    NULL, // gamma (we support decode only)
3886
3887
    // Gap between CRAM 3 and CRAM 4; 9 to 39 inclusive
3888
    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
3889
    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
3890
    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
3891
3892
    NULL, // was xbyte
3893
    cram_varint_encode_init, // varint unsigned
3894
    cram_varint_encode_init, // varint signed
3895
    cram_const_encode_init,  // const byte
3896
    cram_const_encode_init,  // const int
3897
3898
    // Gap to CRAM 4 transfomrations; 45 to 49 inclusive
3899
    NULL, NULL, NULL, NULL, NULL,
3900
3901
    NULL, // xhuffman
3902
    cram_xpack_encode_init,
3903
    cram_xrle_encode_init,
3904
    cram_xdelta_encode_init,
3905
};
3906
3907
cram_codec *cram_encoder_init(enum cram_encoding codec,
3908
                              cram_stats *st,
3909
                              enum cram_external_type option,
3910
                              void *dat,
3911
537k
                              int version, varint_vec *vv) {
3912
537k
    if (st && !st->nvals)
3913
109k
        return NULL;
3914
3915
    // cram_stats_encoding assumes integer data, but if option
3916
    // is E_BYTE then tweak the requested encoding.  This ought
3917
    // to be fixed in cram_stats_encoding instead.
3918
428k
    if (option == E_BYTE || option == E_BYTE_ARRAY ||
3919
225k
       option == E_BYTE_ARRAY_BLOCK) {
3920
203k
       if (codec == E_VARINT_SIGNED || codec == E_VARINT_UNSIGNED)
3921
0
           codec = E_EXTERNAL;
3922
203k
       else if (codec == E_CONST_INT)
3923
0
           codec = E_CONST_BYTE;
3924
203k
    }
3925
3926
428k
    if (encode_init[codec]) {
3927
428k
        cram_codec *r;
3928
428k
        if ((r = encode_init[codec](st, codec, option, dat, version, vv)))
3929
428k
            r->out = NULL;
3930
428k
        if (!r) {
3931
9
            hts_log_error("Unable to initialise codec of type %s", cram_encoding2str(codec));
3932
9
            return NULL;
3933
9
        }
3934
428k
        r->vv = vv;
3935
428k
        return r;
3936
428k
    } else {
3937
0
        hts_log_error("Unimplemented codec of type %s", cram_encoding2str(codec));
3938
0
        abort();
3939
0
    }
3940
428k
}
3941
3942
/*
3943
 * Returns the content_id used by this codec, also in id2 if byte_array_len.
3944
 * Returns -1 for the CORE block and -2 for unneeded.
3945
 * id2 is only filled out for BYTE_ARRAY_LEN which uses 2 codecs.
3946
 */
3947
0
int cram_codec_to_id(cram_codec *c, int *id2) {
3948
0
    int bnum1, bnum2 = -2;
3949
3950
0
    switch (c->codec) {
3951
0
    case E_CONST_INT:
3952
0
    case E_CONST_BYTE:
3953
0
        bnum1 = -2; // no blocks used
3954
0
        break;
3955
3956
0
    case E_HUFFMAN:
3957
0
        bnum1 = c->u.huffman.ncodes == 1 ? -2 : -1;
3958
0
        break;
3959
3960
0
    case E_GOLOMB:
3961
0
    case E_BETA:
3962
0
    case E_SUBEXP:
3963
0
    case E_GOLOMB_RICE:
3964
0
    case E_GAMMA:
3965
        // CORE block
3966
0
        bnum1 = -1;
3967
0
        break;
3968
3969
0
    case E_EXTERNAL:
3970
0
    case E_VARINT_UNSIGNED:
3971
0
    case E_VARINT_SIGNED:
3972
0
        bnum1 = c->u.external.content_id;
3973
0
        break;
3974
3975
0
    case E_BYTE_ARRAY_LEN:
3976
0
        bnum1 = cram_codec_to_id(c->u.byte_array_len.len_codec, NULL);
3977
0
        bnum2 = cram_codec_to_id(c->u.byte_array_len.val_codec, NULL);
3978
0
        break;
3979
3980
0
    case E_BYTE_ARRAY_STOP:
3981
0
        bnum1 = c->u.byte_array_stop.content_id;
3982
0
        break;
3983
3984
0
    case E_NULL:
3985
0
        bnum1 = -2;
3986
0
        break;
3987
3988
0
    default:
3989
0
        hts_log_error("Unknown codec type %d", c->codec);
3990
0
        bnum1 = -1;
3991
0
    }
3992
3993
0
    if (id2)
3994
0
        *id2 = bnum2;
3995
0
    return bnum1;
3996
0
}
3997
3998
3999
/*
4000
 * cram_codec structures are specialised for decoding or encoding.
4001
 * Unfortunately this makes turning a decoder into an encoder (such as
4002
 * when transcoding files) problematic.
4003
 *
4004
 * This function converts a cram decoder codec into an encoder version
4005
 * in-place (ie it modifiers the codec itself).
4006
 *
4007
 * Returns 0 on success;
4008
 *        -1 on failure.
4009
 */
4010
0
int cram_codec_decoder2encoder(cram_fd *fd, cram_codec *c) {
4011
0
    int j;
4012
4013
0
    switch (c->codec) {
4014
0
    case E_CONST_INT:
4015
0
    case E_CONST_BYTE:
4016
        // shares struct with decode
4017
0
        c->store = cram_const_encode_store;
4018
0
        break;
4019
4020
0
    case E_EXTERNAL:
4021
        // shares struct with decode
4022
0
        c->free = cram_external_encode_free;
4023
0
        c->store = cram_external_encode_store;
4024
0
        if (c->decode == cram_external_decode_int)
4025
0
            c->encode = cram_external_encode_int;
4026
0
        else if (c->decode == cram_external_decode_long)
4027
0
            c->encode = cram_external_encode_long;
4028
0
        else if (c->decode == cram_external_decode_char)
4029
0
            c->encode = cram_external_encode_char;
4030
0
        else if (c->decode == cram_external_decode_block)
4031
0
            c->encode = cram_external_encode_char;
4032
0
        else
4033
0
            return -1;
4034
0
        break;
4035
4036
0
    case E_VARINT_SIGNED:
4037
0
    case E_VARINT_UNSIGNED:
4038
        // shares struct with decode
4039
0
        c->free = cram_varint_encode_free;
4040
0
        c->store = cram_varint_encode_store;
4041
0
        if (c->decode == cram_varint_decode_int)
4042
0
            c->encode = cram_varint_encode_int;
4043
0
        else if (c->decode == cram_varint_decode_sint)
4044
0
            c->encode = cram_varint_encode_sint;
4045
0
        else if (c->decode == cram_varint_decode_long)
4046
0
            c->encode = cram_varint_encode_long;
4047
0
        else if (c->decode == cram_varint_decode_slong)
4048
0
            c->encode = cram_varint_encode_slong;
4049
0
        else
4050
0
            return -1;
4051
0
        break;
4052
4053
0
    case E_HUFFMAN: {
4054
        // New structure, so switch.
4055
        // FIXME: we huffman and e_huffman structs amended, we could
4056
        // unify this.
4057
0
        cram_codec *t = malloc(sizeof(*t));
4058
0
        if (!t) return -1;
4059
0
        t->vv     = c->vv;
4060
0
        t->codec = E_HUFFMAN;
4061
0
        t->free = cram_huffman_encode_free;
4062
0
        t->store = cram_huffman_encode_store;
4063
0
        t->u.e_huffman.codes = c->u.huffman.codes;
4064
0
        t->u.e_huffman.nvals = c->u.huffman.ncodes;
4065
0
        t->u.e_huffman.option = c->u.huffman.option;
4066
0
        for (j = 0; j < t->u.e_huffman.nvals; j++) {
4067
0
            int32_t sym = t->u.e_huffman.codes[j].symbol;
4068
0
            if (sym >= -1 && sym < MAX_HUFF)
4069
0
                t->u.e_huffman.val2code[sym+1] = j;
4070
0
        }
4071
4072
0
        if (c->decode == cram_huffman_decode_char0)
4073
0
            t->encode = cram_huffman_encode_char0;
4074
0
        else if (c->decode == cram_huffman_decode_char)
4075
0
            t->encode = cram_huffman_encode_char;
4076
0
        else if (c->decode == cram_huffman_decode_int0)
4077
0
            t->encode = cram_huffman_encode_int0;
4078
0
        else if (c->decode == cram_huffman_decode_int)
4079
0
            t->encode = cram_huffman_encode_int;
4080
0
        else if (c->decode == cram_huffman_decode_long0)
4081
0
            t->encode = cram_huffman_encode_long0;
4082
0
        else if (c->decode == cram_huffman_decode_long)
4083
0
            t->encode = cram_huffman_encode_long;
4084
0
        else {
4085
0
            free(t);
4086
0
            return -1;
4087
0
        }
4088
0
        *c = *t;
4089
0
        free(t);
4090
0
        break;
4091
0
    }
4092
4093
0
    case E_BETA:
4094
        // shares struct with decode
4095
0
        c->free = cram_beta_encode_free;
4096
0
        c->store = cram_beta_encode_store;
4097
0
        if (c->decode == cram_beta_decode_int)
4098
0
            c->encode = cram_beta_encode_int;
4099
0
        else if (c->decode == cram_beta_decode_long)
4100
0
            c->encode = cram_beta_encode_long;
4101
0
        else if (c->decode == cram_beta_decode_char)
4102
0
            c->encode = cram_beta_encode_char;
4103
0
        else
4104
0
            return -1;
4105
0
        break;
4106
4107
0
    case E_XPACK: {
4108
        // shares struct with decode
4109
0
        cram_codec t = *c;
4110
0
        t.free = cram_xpack_encode_free;
4111
0
        t.store = cram_xpack_encode_store;
4112
0
        if (t.decode == cram_xpack_decode_long)
4113
0
            t.encode = cram_xpack_encode_long;
4114
0
        else if (t.decode == cram_xpack_decode_int)
4115
0
            t.encode = cram_xpack_encode_int;
4116
0
        else if (t.decode == cram_xpack_decode_char)
4117
0
            t.encode = cram_xpack_encode_char;
4118
0
        else
4119
0
            return -1;
4120
0
        t.u.e_xpack.sub_codec = t.u.xpack.sub_codec;
4121
0
        if (cram_codec_decoder2encoder(fd, t.u.e_xpack.sub_codec) == -1)
4122
0
            return -1;
4123
0
        *c = t;
4124
0
        break;
4125
0
    }
4126
4127
0
    case E_BYTE_ARRAY_LEN: {
4128
0
        cram_codec *t = malloc(sizeof(*t));
4129
0
        if (!t) return -1;
4130
0
        t->vv     = c->vv;
4131
0
        t->codec  = E_BYTE_ARRAY_LEN;
4132
0
        t->free   = cram_byte_array_len_encode_free;
4133
0
        t->store  = cram_byte_array_len_encode_store;
4134
0
        t->encode = cram_byte_array_len_encode;
4135
0
        t->u.e_byte_array_len.len_codec = c->u.byte_array_len.len_codec;
4136
0
        t->u.e_byte_array_len.val_codec = c->u.byte_array_len.val_codec;
4137
0
        if (cram_codec_decoder2encoder(fd, t->u.e_byte_array_len.len_codec) == -1 ||
4138
0
            cram_codec_decoder2encoder(fd, t->u.e_byte_array_len.val_codec) == -1) {
4139
0
            t->free(t);
4140
0
            return -1;
4141
0
        }
4142
4143
        // {len,val}_{encoding,dat} are undefined, but unused.
4144
        // Leaving them unset here means we can test that assertion.
4145
0
        *c = *t;
4146
0
        free(t);
4147
0
        break;
4148
0
    }
4149
4150
0
    case E_BYTE_ARRAY_STOP:
4151
        // shares struct with decode
4152
0
        c->free   = cram_byte_array_stop_encode_free;
4153
0
        c->store  = cram_byte_array_stop_encode_store;
4154
0
        c->encode = cram_byte_array_stop_encode;
4155
0
        break;
4156
4157
0
    default:
4158
0
        return -1;
4159
0
    }
4160
4161
0
    return 0;
4162
0
}
4163
4164
0
int cram_codec_describe(cram_codec *c, kstring_t *ks) {
4165
0
    if (c && c->describe)
4166
0
        return c->describe(c, ks);
4167
0
    else
4168
0
        return ksprintf(ks, "?");
4169
0
}