Coverage Report

Created: 2023-01-17 06:24

/src/htslib/htscodecs/htscodecs/tokenise_name3.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * Copyright (c) 2016-2021 Genome Research Ltd.
3
 * Author(s): James Bonfield
4
 * 
5
 * Redistribution and use in source and binary forms, with or without 
6
 * modification, are permitted provided that the following conditions are met:
7
 * 
8
 *    1. Redistributions of source code must retain the above copyright notice,
9
 *       this list of conditions and the following disclaimer.
10
 * 
11
 *    2. Redistributions in binary form must reproduce the above
12
 *       copyright notice, this list of conditions and the following
13
 *       disclaimer in the documentation and/or other materials provided
14
 *       with the distribution.
15
 * 
16
 *    3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
17
 *    Institute nor the names of its contributors may be used to endorse
18
 *    or promote products derived from this software without specific
19
 *    prior written permission.
20
 * 
21
 * THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS
22
 * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
23
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
24
 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH
25
 * LTD OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32
 */
33
34
// cc -O3 -g -DTEST_TOKENISER tokenise_name3.c arith_dynamic.c rANS_static4x16pr.c pooled_alloc.c -I.. -I. -lbz2 -pthread
35
36
// Name tokeniser.
37
// It generates a series of byte streams (per token) and compresses these
38
// either using static rANS or dynamic arithmetic coding.  Arith coding is
39
// typically 1-5% smaller, but around 50-100% slower.  We only envisage it
40
// being used at the higher compression levels.
41
42
// TODO
43
//
44
// - Is it better when encoding 1, 2, 3, 3, 4, 5, 5, 6, 7, 9, 9, 10 to encode
45
//   this as a mixture of MATCH and DELTA ops, or as entirely as DELTA ops
46
//   with some delta values being zero?  I suspect the latter, but it is
47
//   not implemented here.  See "last_token_delta" comments in code.
48
//
49
// - Consider variable size string implementations.
50
//   Pascal style strings (length + str),
51
//   C style strings (nul terminated),
52
//   Or split blocks: length block and string contents block.
53
//
54
// - Is this one token-block or many serialised token-blocks?
55
//   A) Lots of different models but feeding one bit-buffer emitted to
56
//      by the entropy encoder => one block (fqzcomp).
57
//   B) Lots of different models each feeding their own bit-buffers
58
//      => many blocks.
59
//
60
// - multiple integer types depending on size; 1, 2, 4 byte long.
61
//
62
// - Consider token choice for isalnum instead of isalpha.  Sometimes better.
63
//
64
// - Consider token synchronisation (eg on matching chr symbols?) incase of
65
//   variable number.  Eg consider foo:0999, foo:1000, foo:1001 (the leading
66
//   zero adds an extra token).
67
//
68
// - Optimisation of tokens.  Eg:
69
//     HS25_09827:2:2102:11274:80442#49
70
//     HS25_09827:2:2109:12941:31311#49
71
//
72
//   We'll have tokens for HS 25 _ 09827 : 2 : that are entirely <MATCH>
73
//   after the initial token.  These 7 tokens could be one ALPHA instead
74
//   of 7 distinct tokens, with 1 MATCH instead of 7.  This is both a speed
75
//   improvement for decoding as well as a space saving (fewer token-blocks
76
//   and associated overhead).
77
//
78
// - XOR.  Like ALPHA, but used when previous symbol is ALPHA or XOR
79
//   and string lengths match.  Useful when names are similar, eg:
80
//   the sequence in 07.names:
81
//
82
//   @VP2-06:112:H7LNDMCVY:1:1105:26919:1172 1:N:0:ATTCAGAA+AGGAGAAG
83
//   @VP2-06:112:H7LNDMCVY:1:1105:27100:1172 1:N:0:ATTCAGAA+AGGCGAAG
84
//   @VP2-06:112:H7LNDMCVY:1:1105:27172:1172 1:N:0:ATTCAGAA+AGGCTAAG
85
86
#include "config.h"
87
88
#include <stdio.h>
89
#include <stdlib.h>
90
#include <string.h>
91
#include <sys/types.h>
92
#include <sys/stat.h>
93
#include <unistd.h>
94
#include <limits.h>
95
#include <ctype.h>
96
#include <assert.h>
97
#include <inttypes.h>
98
#include <math.h>
99
#include <fcntl.h>
100
#include <errno.h>
101
#include <time.h>
102
103
#include "pooled_alloc.h"
104
#include "arith_dynamic.h"
105
#include "rANS_static4x16.h"
106
#include "tokenise_name3.h"
107
#include "varint.h"
108
#include "utils.h"
109
110
// 128 is insufficient for SAM names (max 256 bytes) as
111
// we may alternate a0a0a0a0a0 etc.  However if we fail,
112
// we just give up and switch to another codec, so this
113
// isn't a serious limit.  Maybe up to 256 to permit all
114
// SAM names?
115
21.5k
#define MAX_TOKENS 128
116
17.6k
#define MAX_TBLOCKS (MAX_TOKENS<<4)
117
118
// Number of names per block
119
#define MAX_NAMES 1000000
120
121
enum name_type {N_ERR = -1, N_TYPE = 0, N_ALPHA, N_CHAR, N_DIGITS0, N_DZLEN, N_DUP, N_DIFF, 
122
                N_DIGITS, N_DDELTA, N_DDELTA0, N_MATCH, N_NOP, N_END, N_ALL};
123
124
typedef struct trie {
125
    struct trie *next, *sibling;
126
    int count;
127
    uint32_t c:8;
128
    uint32_t n:24; // Nth line
129
} trie_t;
130
131
typedef struct {
132
    enum name_type token_type;
133
    int token_int;
134
    int token_str;
135
} last_context_tok;
136
137
typedef struct {
138
    char *last_name;
139
    int last_ntok;
140
    last_context_tok *last; // [last_ntok]
141
} last_context;
142
143
typedef struct {
144
    uint8_t *buf;
145
    size_t buf_a, buf_l; // alloc and used length.
146
    int tnum, ttype;
147
    int dup_from;
148
} descriptor;
149
150
typedef struct {
151
    last_context *lc;
152
153
    // For finding entire line dups
154
    int counter;
155
156
    // Trie used in encoder only
157
    trie_t *t_head;
158
    pool_alloc_t *pool;
159
160
    // token blocks
161
    descriptor desc[MAX_TBLOCKS];
162
163
    // summary stats per token
164
    int token_dcount[MAX_TOKENS];
165
    int token_icount[MAX_TOKENS];
166
    //int token_zcount[MAX_TOKENS];
167
168
    int max_tok; // tracks which desc/[id]count elements have been initialised
169
    int max_names;
170
} name_context;
171
172
126
static name_context *create_context(int max_names) {
173
126
    if (max_names <= 0)
174
0
        return NULL;
175
176
126
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
177
126
    if (max_names > 100000)
178
0
        return NULL;
179
126
#endif
180
181
    // An arbitrary limit to prevent malformed data from consuming excessive
182
    // amounts of memory.  Consider upping this if we have genuine use cases
183
    // for larger blocks.
184
126
    if (max_names > 1e7) {
185
0
        fprintf(stderr, "Name codec currently has a max of 10 million rec.\n");
186
0
        return NULL;
187
0
    }
188
189
126
    name_context *ctx = htscodecs_tls_alloc(sizeof(*ctx) +
190
126
                                            ++max_names*sizeof(*ctx->lc));
191
126
    if (!ctx) return NULL;
192
126
    ctx->max_names = max_names;
193
194
126
    ctx->counter = 0;
195
126
    ctx->t_head = NULL;
196
197
126
    ctx->lc = (last_context *)(((char *)ctx) + sizeof(*ctx));
198
126
    ctx->pool = NULL;
199
200
126
     memset(&ctx->desc[0], 0, 2*16 * sizeof(ctx->desc[0]));
201
126
     memset(&ctx->token_dcount[0], 0, sizeof(int));
202
126
     memset(&ctx->token_icount[0], 0, sizeof(int));
203
126
     memset(&ctx->lc[0], 0, max_names*sizeof(ctx->lc[0]));
204
126
     ctx->max_tok = 1;
205
206
126
     ctx->lc[0].last_ntok = 0;
207
208
126
    return ctx;
209
126
}
210
211
126
static void free_context(name_context *ctx) {
212
126
    if (!ctx)
213
0
        return;
214
215
126
    if (ctx->t_head)
216
0
        free(ctx->t_head);
217
126
    if (ctx->pool)
218
0
        pool_destroy(ctx->pool);
219
220
126
    int i;
221
31.6k
    for (i = 0; i < ctx->max_tok*16; i++)
222
31.5k
        free(ctx->desc[i].buf);
223
224
2.84M
    for (i = 0; i < ctx->max_names; i++)
225
2.84M
        free(ctx->lc[i].last);
226
227
126
    htscodecs_tls_free(ctx);
228
126
}
229
230
//-----------------------------------------------------------------------------
231
// Fast unsigned integer printing code.
232
// Returns number of bytes written.
233
0
static int append_uint32_fixed(char *cp, uint32_t i, uint8_t l) {
234
0
    switch (l) {
235
0
    case 9:*cp++ = i / 100000000 + '0', i %= 100000000;
236
0
    case 8:*cp++ = i / 10000000  + '0', i %= 10000000;
237
0
    case 7:*cp++ = i / 1000000   + '0', i %= 1000000;
238
0
    case 6:*cp++ = i / 100000    + '0', i %= 100000;
239
0
    case 5:*cp++ = i / 10000     + '0', i %= 10000;
240
0
    case 4:*cp++ = i / 1000      + '0', i %= 1000;
241
0
    case 3:*cp++ = i / 100       + '0', i %= 100;
242
0
    case 2:*cp++ = i / 10        + '0', i %= 10;
243
0
    case 1:*cp++ = i             + '0';
244
0
    case 0:break;
245
0
    }
246
0
    return l;
247
0
}
248
249
2.12k
static int append_uint32_var(char *cp, uint32_t i) {
250
2.12k
    char *op = cp;
251
2.12k
    uint32_t j;
252
253
    //if (i < 10)         goto b0;
254
2.12k
    if (i < 100)        goto b1;
255
    //if (i < 1000)       goto b2;
256
207
    if (i < 10000)      goto b3;
257
    //if (i < 100000)     goto b4;
258
203
    if (i < 1000000)    goto b5;
259
    //if (i < 10000000)   goto b6;
260
201
    if (i < 100000000)  goto b7;
261
262
191
    if ((j = i / 1000000000)) {*cp++ = j + '0'; i -= j*1000000000; goto x8;}
263
98
    if ((j = i / 100000000))  {*cp++ = j + '0'; i -= j*100000000;  goto x7;}
264
10
 b7:if ((j = i / 10000000))   {*cp++ = j + '0'; i -= j*10000000;   goto x6;}
265
9
    if ((j = i / 1000000))    {*cp++ = j + '0', i -= j*1000000;    goto x5;}
266
2
 b5:if ((j = i / 100000))     {*cp++ = j + '0', i -= j*100000;     goto x4;}
267
1
    if ((j = i / 10000))      {*cp++ = j + '0', i -= j*10000;      goto x3;}
268
4
 b3:if ((j = i / 1000))       {*cp++ = j + '0', i -= j*1000;       goto x2;}
269
3
    if ((j = i / 100))        {*cp++ = j + '0', i -= j*100;        goto x1;}
270
1.91k
 b1:if ((j = i / 10))         {*cp++ = j + '0', i -= j*10;         goto x0;}
271
1.91k
    if (i)                     *cp++ = i + '0';
272
1.91k
    return cp-op;
273
274
93
 x8:*cp++ = i / 100000000 + '0', i %= 100000000;
275
191
 x7:*cp++ = i / 10000000  + '0', i %= 10000000;
276
192
 x6:*cp++ = i / 1000000   + '0', i %= 1000000;
277
201
 x5:*cp++ = i / 100000    + '0', i %= 100000;
278
202
 x4:*cp++ = i / 10000     + '0', i %= 10000;
279
203
 x3:*cp++ = i / 1000      + '0', i %= 1000;
280
204
 x2:*cp++ = i / 100       + '0', i %= 100;
281
207
 x1:*cp++ = i / 10        + '0', i %= 10;
282
209
 x0:*cp++ = i             + '0';
283
284
209
    return cp-op;
285
207
}
286
287
//-----------------------------------------------------------------------------
288
// Example descriptor encoding and IO.
289
//
290
// Here we just append to a buffer so we can dump out the results.
291
// These could then be passed through a static entropy encoder that
292
// encodes the entire buffer.
293
//
294
// Alternatively an adaptive entropy encoder could be place inline
295
// here to encode as it goes using additional knowledge from the
296
// supplied context.
297
298
// Ensure room for sz more bytes.
299
0
static int descriptor_grow(descriptor *fd, uint32_t sz) {
300
0
    while (fd->buf_l + sz > fd->buf_a) {
301
0
        size_t buf_a = fd->buf_a ? fd->buf_a*2 : 65536;
302
0
        unsigned char *buf = realloc(fd->buf, buf_a);
303
0
        if (!buf)
304
0
            return -1;
305
0
        fd->buf = buf;
306
0
        fd->buf_a = buf_a;
307
0
    }
308
309
0
    return 0;
310
0
}
311
312
static int encode_token_type(name_context *ctx, int ntok,
313
0
                             enum name_type type) {
314
0
    int id = ntok<<4;
315
316
0
    if (descriptor_grow(&ctx->desc[id], 1) < 0) return -1;
317
318
0
    ctx->desc[id].buf[ctx->desc[id].buf_l++] = type;
319
320
0
    return 0;
321
0
}
322
323
0
static int encode_token_match(name_context *ctx, int ntok) {
324
0
    return encode_token_type(ctx, ntok, N_MATCH);
325
0
}
326
327
0
static int encode_token_end(name_context *ctx, int ntok) {
328
0
    return encode_token_type(ctx, ntok, N_END);
329
0
}
330
331
6.44k
static enum name_type decode_token_type(name_context *ctx, int ntok) {
332
6.44k
    int id = ntok<<4;
333
6.44k
    if (ctx->desc[id].buf_l >= ctx->desc[id].buf_a) return -1;
334
4.32k
    return ctx->desc[id].buf[ctx->desc[id].buf_l++];
335
6.44k
}
336
337
// int stored as 32-bit quantities
338
static int encode_token_int(name_context *ctx, int ntok,
339
0
                            enum name_type type, uint32_t val) {
340
0
    int id = (ntok<<4) | type;
341
342
0
    if (encode_token_type(ctx, ntok, type) < 0) return -1;
343
0
    if (descriptor_grow(&ctx->desc[id], 4) < 0) return -1;
344
345
0
    uint8_t *cp = &ctx->desc[id].buf[ctx->desc[id].buf_l];
346
0
    cp[0] = (val >>  0) & 0xff;
347
0
    cp[1] = (val >>  8) & 0xff;
348
0
    cp[2] = (val >> 16) & 0xff;
349
0
    cp[3] = (val >> 24) & 0xff;
350
0
    ctx->desc[id].buf_l += 4;
351
352
0
    return 0;
353
0
}
354
355
// Return 0 on success, -1 on failure;
356
static int decode_token_int(name_context *ctx, int ntok,
357
4.28k
                            enum name_type type, uint32_t *val) {
358
4.28k
    int id = (ntok<<4) | type;
359
360
4.28k
    if (ctx->desc[id].buf_l + 4 > ctx->desc[id].buf_a)
361
4
        return -1;
362
363
4.28k
    uint8_t *cp = ctx->desc[id].buf + ctx->desc[id].buf_l;
364
4.28k
    *val = (cp[0]) + (cp[1]<<8) + (cp[2]<<16) + ((uint32_t)cp[3]<<24);
365
4.28k
    ctx->desc[id].buf_l += 4;
366
367
4.28k
    return 0;
368
4.28k
}
369
370
// 8 bit integer quantity
371
static int encode_token_int1(name_context *ctx, int ntok,
372
0
                             enum name_type type, uint32_t val) {
373
0
    int id = (ntok<<4) | type;
374
375
0
    if (encode_token_type(ctx, ntok, type) < 0) return -1;
376
0
    if (descriptor_grow(&ctx->desc[id], 1) < 0) return -1;
377
378
0
    ctx->desc[id].buf[ctx->desc[id].buf_l++] = val;
379
380
0
    return 0;
381
0
}
382
383
static int encode_token_int1_(name_context *ctx, int ntok,
384
0
                              enum name_type type, uint32_t val) {
385
0
    int id = (ntok<<4) | type;
386
387
0
    if (descriptor_grow(&ctx->desc[id], 1) < 0) return -1;
388
389
0
    ctx->desc[id].buf[ctx->desc[id].buf_l++] = val;
390
391
0
    return 0;
392
0
}
393
394
// Return 0 on success, -1 on failure;
395
static int decode_token_int1(name_context *ctx, int ntok,
396
0
                             enum name_type type, uint32_t *val) {
397
0
    int id = (ntok<<4) | type;
398
399
0
    if (ctx->desc[id].buf_l  >= ctx->desc[id].buf_a)
400
0
        return -1;
401
0
    *val = ctx->desc[id].buf[ctx->desc[id].buf_l++];
402
403
0
    return 0;
404
0
}
405
406
407
// Basic C-string style for now.
408
//
409
// Maybe XOR with previous string as context?
410
// This permits partial match to be encoded efficiently.
411
static int encode_token_alpha(name_context *ctx, int ntok,
412
0
                              char *str, int len) {
413
0
    int id = (ntok<<4) | N_ALPHA;
414
415
0
    if (encode_token_type(ctx, ntok, N_ALPHA) < 0)  return -1;
416
0
    if (descriptor_grow(&ctx->desc[id], len+1) < 0) return -1;
417
0
    memcpy(&ctx->desc[id].buf[ctx->desc[id].buf_l], str, len);
418
0
    ctx->desc[id].buf[ctx->desc[id].buf_l+len] = 0;
419
0
    ctx->desc[id].buf_l += len+1;
420
421
0
    return 0;
422
0
}
423
424
// FIXME: need limit on string length for security.
425
// Return length on success, -1 on failure;
426
0
static int decode_token_alpha(name_context *ctx, int ntok, char *str, int max_len) {
427
0
    int id = (ntok<<4) | N_ALPHA;
428
0
    char c;
429
0
    int len = 0;
430
0
    if (ctx->desc[id].buf_l  >= ctx->desc[id].buf_a)
431
0
        return -1;
432
0
    do {
433
0
        c = ctx->desc[id].buf[ctx->desc[id].buf_l++];
434
0
        str[len++] = c;
435
0
    } while(c && len < max_len && ctx->desc[id].buf_l < ctx->desc[id].buf_a);
436
437
0
    return len-1;
438
0
}
439
440
0
static int encode_token_char(name_context *ctx, int ntok, char c) {
441
0
    int id = (ntok<<4) | N_CHAR;
442
443
0
    if (encode_token_type(ctx, ntok, N_CHAR) < 0) return -1;
444
0
    if (descriptor_grow(&ctx->desc[id], 1) < 0)    return -1;
445
0
    ctx->desc[id].buf[ctx->desc[id].buf_l++] = c;
446
447
0
    return 0;
448
0
}
449
450
// FIXME: need limit on string length for security
451
// Return length on success, -1 on failure;
452
0
static int decode_token_char(name_context *ctx, int ntok, char *str) {
453
0
    int id = (ntok<<4) | N_CHAR;
454
455
0
    if (ctx->desc[id].buf_l  >= ctx->desc[id].buf_a)
456
0
        return -1;
457
0
    *str = ctx->desc[id].buf[ctx->desc[id].buf_l++];
458
459
0
    return 1;
460
0
}
461
462
463
// A duplicated name
464
0
static int encode_token_dup(name_context *ctx, uint32_t val) {
465
0
    return encode_token_int(ctx, 0, N_DUP, val);
466
0
}
467
468
// Which read to delta against
469
0
static int encode_token_diff(name_context *ctx, uint32_t val) {
470
0
    return encode_token_int(ctx, 0, N_DIFF, val);
471
0
}
472
473
474
//-----------------------------------------------------------------------------
475
// Trie implementation for tracking common name prefixes.
476
static
477
0
int build_trie(name_context *ctx, char *data, size_t len, int n) {
478
0
    int nlines = 0;
479
0
    size_t i;
480
0
    trie_t *t;
481
482
0
    if (!ctx->t_head) {
483
0
        ctx->t_head = calloc(1, sizeof(*ctx->t_head));
484
0
        if (!ctx->t_head)
485
0
            return -1;
486
0
    }
487
488
    // Build our trie, also counting input lines
489
0
    for (nlines = i = 0; i < len; i++, nlines++) {
490
0
        t = ctx->t_head;
491
0
        t->count++;
492
0
        while (i < len && data[i] > '\n') {
493
0
            unsigned char c = data[i++];
494
0
            if (c & 0x80)
495
                //fprintf(stderr, "8-bit ASCII is unsupported\n");
496
0
                abort();
497
0
            c &= 127;
498
499
500
0
            trie_t *x = t->next, *l = NULL;
501
0
            while (x && x->c != c) {
502
0
                l = x; x = x->sibling;
503
0
            }
504
0
            if (!x) {
505
0
                if (!ctx->pool)
506
0
                    ctx->pool = pool_create(sizeof(trie_t));
507
0
                if (!(x = (trie_t *)pool_alloc(ctx->pool)))
508
0
                    return -1;
509
0
                memset(x, 0, sizeof(*x));
510
0
                if (!l)
511
0
                    x = t->next    = x;
512
0
                else
513
0
                    x = l->sibling = x;
514
0
                x->n = n;
515
0
                x->c = c;
516
0
            }
517
0
            t = x;
518
0
            t->c = c;
519
0
            t->count++;
520
0
        }
521
0
    }
522
523
0
    return 0;
524
0
}
525
526
#if 0
527
void dump_trie(trie_t *t, int depth) {
528
    if (depth == 0) {
529
        printf("graph x_%p {\n    splines = ortho\n    ranksep=2\n", t);
530
        printf("    p_%p [label=\"\"];\n", t);
531
        dump_trie(t, 1);
532
        printf("}\n");
533
    } else {
534
        int j, k, count;//, cj;
535
        char label[100], *cp;
536
        trie_t *tp = t;
537
538
//    patricia:
539
//      for (count = j = 0; j < 128; j++)
540
//          if (t->next[j])
541
//              count++, cj=j;
542
//
543
//      if (count == 1) {
544
//          t = t->next[cj];
545
//          *cp++ = cj;
546
//          goto patricia;
547
//      }
548
549
        trie_t *x;
550
        for (x = t->next; x; x = x->sibling) {
551
            printf("    p_%p [label=\"%c\"];\n", x, x->c);
552
            printf("    p_%p -- p_%p [label=\"%d\", penwidth=\"%f\"];\n", tp, x, x->count, MAX((log(x->count)-3)*2,1));
553
            //if (depth <= 11)
554
                dump_trie(x, depth+1);
555
        }
556
557
#if 0       
558
        for (j = 0; j < 128; j++) {
559
            trie_t *tn;
560
561
            if (!t->next[j])
562
                continue;
563
564
            cp = label;
565
            tn = t->next[j];
566
            *cp++ = j;
567
//      patricia:
568
569
            for (count = k = 0; k < 128; k++)
570
                if (tn->next[k])
571
                    count++;//, cj=k;
572
573
//          if (count == 1) {
574
//              tn = tn->next[cj];
575
//              *cp++ = cj;
576
//              goto patricia;
577
//          }
578
            *cp++ = 0;
579
580
            printf("    p_%p [label=\"%s\"];\n", tn, label);
581
            printf("    p_%p -- p_%p [label=\"%d\", penwidth=\"%f\"];\n", tp, tn, tn->count, MAX((log(tn->count)-3)*2,1));
582
            if (depth <= 11)
583
                dump_trie(tn, depth+1);
584
        }
585
#endif
586
    }
587
}
588
#endif
589
590
static
591
0
int search_trie(name_context *ctx, char *data, size_t len, int n, int *exact, int *is_fixed, int *fixed_len) {
592
0
    int nlines = 0;
593
0
    size_t i;
594
0
    trie_t *t;
595
0
    int from = -1, p3 = -1;
596
0
    *exact = 0;
597
0
    *fixed_len = 0;
598
0
    *is_fixed = 0;
599
600
    // Horrid hack for the encoder only.
601
    // We optimise per known name format here.
602
0
    int prefix_len;
603
0
    char *d = *data == '@' ? data+1 : data;
604
0
    int l   = *data == '@' ? len-1  : len;
605
0
    int f = (*data == '>') ? 1 : 0;
606
0
    if (l > 70 && d[f+0] == 'm' && d[7] == '_' && d[f+14] == '_' && d[f+61] == '/') {
607
0
        prefix_len = 60; // PacBio
608
0
        *is_fixed = 0;
609
0
    } else if (l == 17 && d[f+5] == ':' && d[f+11] == ':') {
610
0
        prefix_len = 6;  // IonTorrent
611
0
        *fixed_len = 6;
612
0
        *is_fixed = 1;
613
0
    } else if (l > 37 && d[f+8] == '-' && d[f+13] == '-' && d[f+18] == '-' && d[f+23] == '-' &&
614
0
               ((d[f+0] >= '0' && d[f+0] <='9') || (d[f+0] >= 'a' && d[f+0] <= 'f')) &&
615
0
               ((d[f+35] >= '0' && d[f+35] <='9') || (d[f+35] >= 'a' && d[f+35] <= 'f'))) {
616
        // ONT: f33d30d5-6eb8-4115-8f46-154c2620a5da_Basecall_1D_template...
617
0
        prefix_len = 37;
618
0
        *fixed_len = 37;
619
0
        *is_fixed = 1;
620
0
    } else {
621
        // Check Illumina and trim back to lane:tile:x:y.
622
0
        int colons = 0;
623
0
        for (i = 0; i < len && data[i] > ' '; i++)
624
0
            ;
625
0
        while (i > 0 && colons < 4)
626
0
            if (data[--i] == ':')
627
0
                colons++;
628
629
0
        if (colons == 4) {
630
            // Constant illumina prefix
631
0
            *fixed_len = i+1;
632
0
            prefix_len = i+1;
633
0
            *is_fixed = 1;
634
0
        } else {
635
            // Unknown, don't use a fixed len, but still search
636
            // for any exact matches.
637
0
            prefix_len = INT_MAX;
638
0
            *is_fixed = 0;
639
0
        }
640
0
    }
641
    //prefix_len = INT_MAX;
642
643
0
    if (!ctx->t_head) {
644
0
        ctx->t_head = calloc(1, sizeof(*ctx->t_head));
645
0
        if (!ctx->t_head)
646
0
            return -1;
647
0
    }
648
649
    // Find an item in the trie
650
0
    for (nlines = i = 0; i < len; i++, nlines++) {
651
0
        t = ctx->t_head;
652
0
        while (i < len && data[i] > '\n') {
653
0
            unsigned char c = data[i++];
654
0
            if (c & 0x80)
655
                //fprintf(stderr, "8-bit ASCII is unsupported\n");
656
0
                abort();
657
0
            c &= 127;
658
659
0
            trie_t *x = t->next;
660
0
            while (x && x->c != c)
661
0
                x = x->sibling;
662
0
            t = x;
663
664
//          t = t->next[c];
665
666
//          if (!t)
667
//              return -1;
668
669
0
            from = t->n;
670
0
            if (i == prefix_len) p3 = t->n;
671
            //if (t->count >= .0035*ctx->t_head->count && t->n != n) p3 = t->n; // pacbio
672
            //if (i == 60) p3 = t->n; // pacbio
673
            //if (i == 7) p3 = t->n; // iontorrent
674
0
            t->n = n;
675
0
        }
676
0
    }
677
678
    //printf("Looked for %d, found %d, prefix %d\n", n, from, p3);
679
680
0
    *exact = (n != from) && len;
681
0
    return *exact ? from : p3;
682
0
}
683
684
685
//-----------------------------------------------------------------------------
686
// Name encoder
687
688
/*
689
 * Tokenises a read name using ctx as context as the previous
690
 * tokenisation.
691
 *
692
 * Parsed elements are then emitted for encoding by calling the
693
 * encode_token() function with the context, token number (Nth token
694
 * in line), token type and token value.
695
 *
696
 * Returns 0 on success;
697
 *        -1 on failure.
698
 */
699
0
static int encode_name(name_context *ctx, char *name, int len, int mode) {
700
0
    int i, is_fixed, fixed_len;
701
702
0
    int exact;
703
0
    int cnum = ctx->counter++;
704
0
    int pnum = search_trie(ctx, name, len, cnum, &exact, &is_fixed, &fixed_len);
705
0
    if (pnum < 0) pnum = cnum ? cnum-1 : 0;
706
    //pnum = pnum & (MAX_NAMES-1);
707
    //cnum = cnum & (MAX_NAMES-1);
708
    //if (pnum == cnum) {pnum = cnum ? cnum-1 : 0;}
709
#ifdef ENC_DEBUG
710
    fprintf(stderr, "%d: pnum=%d (%d), exact=%d\n%s\n%s\n",
711
            ctx->counter, pnum, cnum-pnum, exact, ctx->lc[pnum].last_name, name);
712
#endif
713
714
    // Return DUP or DIFF switch, plus the distance.
715
0
    if (exact && len == strlen(ctx->lc[pnum].last_name)) {
716
0
        encode_token_dup(ctx, cnum-pnum);
717
0
        ctx->lc[cnum].last_name = name;
718
0
        ctx->lc[cnum].last_ntok = ctx->lc[pnum].last_ntok;
719
0
        int nc = ctx->lc[cnum].last_ntok ? ctx->lc[cnum].last_ntok : MAX_TOKENS;
720
0
        ctx->lc[cnum].last = malloc(nc * sizeof(*ctx->lc[cnum].last));
721
0
        if (!ctx->lc[cnum].last)
722
0
            return -1;
723
0
        memcpy(ctx->lc[cnum].last, ctx->lc[pnum].last,
724
0
               ctx->lc[cnum].last_ntok * sizeof(*ctx->lc[cnum].last));
725
0
        return 0;
726
0
    }
727
728
0
    ctx->lc[cnum].last = malloc(MAX_TOKENS * sizeof(*ctx->lc[cnum].last));
729
0
    if (!ctx->lc[cnum].last)
730
0
        return -1;
731
0
    encode_token_diff(ctx, cnum-pnum);
732
733
0
    int ntok = 1;
734
0
    i = 0;
735
0
    if (is_fixed) {
736
0
        if (ntok >= ctx->max_tok) {
737
0
            memset(&ctx->desc[ctx->max_tok << 4], 0, 16*sizeof(ctx->desc[0]));
738
0
            memset(&ctx->token_dcount[ctx->max_tok], 0, sizeof(int));
739
0
            memset(&ctx->token_icount[ctx->max_tok], 0, sizeof(int));
740
0
            ctx->max_tok = ntok+1;
741
0
        }
742
0
        if (pnum < cnum && ntok < ctx->lc[pnum].last_ntok && ctx->lc[pnum].last[ntok].token_type == N_ALPHA) {
743
0
            if (ctx->lc[pnum].last[ntok].token_int == fixed_len && memcmp(name, ctx->lc[pnum].last_name, fixed_len) == 0) {
744
0
                encode_token_match(ctx, ntok);
745
0
            } else {
746
0
                encode_token_alpha(ctx, ntok, name, fixed_len);
747
0
            }
748
0
        } else {
749
0
            encode_token_alpha(ctx, ntok, name, fixed_len);
750
0
        }
751
0
        ctx->lc[cnum].last[ntok].token_int = fixed_len;
752
0
        ctx->lc[cnum].last[ntok].token_str = 0;
753
0
        ctx->lc[cnum].last[ntok++].token_type = N_ALPHA;
754
0
        i = fixed_len;
755
0
    }
756
757
0
    for (; i < len; i++) {
758
0
        if (ntok >= ctx->max_tok) {
759
0
            memset(&ctx->desc[ctx->max_tok << 4], 0, 16*sizeof(ctx->desc[0]));
760
0
            memset(&ctx->token_dcount[ctx->max_tok], 0, sizeof(int));
761
0
            memset(&ctx->token_icount[ctx->max_tok], 0, sizeof(int));
762
0
            ctx->max_tok = ntok+1;
763
0
        }
764
765
        /* Determine data type of this segment */
766
0
        if (isalpha(name[i])) {
767
0
            int s = i+1;
768
//          int S = i+1;
769
770
//          // FIXME: try which of these is best.  alnum is good sometimes.
771
//          while (s < len && isalpha(name[s]))
772
0
            while (s < len && (isalpha(name[s]) || ispunct(name[s])))
773
//          while (s < len && name[s] != ':')
774
//          while (s < len && !isdigit(name[s]) && name[s] != ':')
775
0
                s++;
776
777
//          if (!is_fixed) {
778
//              while (S < len && isalnum(name[S]))
779
//                  S++;
780
//              if (s < S)
781
//                  s = S;
782
//          }
783
784
            // Single byte strings are better encoded as chars.
785
0
            if (s-i == 1) goto n_char;
786
787
0
            if (pnum < cnum && ntok < ctx->lc[pnum].last_ntok && ctx->lc[pnum].last[ntok].token_type == N_ALPHA) {
788
0
                if (s-i == ctx->lc[pnum].last[ntok].token_int &&
789
0
                    memcmp(&name[i], 
790
0
                           &ctx->lc[pnum].last_name[ctx->lc[pnum].last[ntok].token_str],
791
0
                           s-i) == 0) {
792
#ifdef ENC_DEBUG
793
                    fprintf(stderr, "Tok %d (alpha-mat, %.*s)\n", N_MATCH, s-i, &name[i]);
794
#endif
795
0
                    if (encode_token_match(ctx, ntok) < 0) return -1;
796
0
                } else {
797
#ifdef ENC_DEBUG
798
                    fprintf(stderr, "Tok %d (alpha, %.*s / %.*s)\n", N_ALPHA,
799
                            s-i, &ctx->lc[pnum].last_name[ctx->lc[pnum].last[ntok].token_str], s-i, &name[i]);
800
#endif
801
                    // same token/length, but mismatches
802
0
                    if (encode_token_alpha(ctx, ntok, &name[i], s-i) < 0) return -1;
803
0
                }
804
0
            } else {
805
#ifdef ENC_DEBUG
806
                fprintf(stderr, "Tok %d (new alpha, %.*s)\n", N_ALPHA, s-i, &name[i]);
807
#endif
808
0
                if (encode_token_alpha(ctx, ntok, &name[i], s-i) < 0) return -1;
809
0
            }
810
811
0
            ctx->lc[cnum].last[ntok].token_int = s-i;
812
0
            ctx->lc[cnum].last[ntok].token_str = i;
813
0
            ctx->lc[cnum].last[ntok].token_type = N_ALPHA;
814
815
0
            i = s-1;
816
0
        } else if (name[i] == '0') digits0: {
817
            // Digits starting with zero; encode length + value
818
0
            uint32_t s = i;
819
0
            uint32_t v = 0;
820
0
            int d = 0;
821
822
0
            while (s < len && isdigit(name[s]) && s-i < 9) {
823
0
                v = v*10 + name[s] - '0';
824
                //putchar(name[s]);
825
0
                s++;
826
0
            }
827
828
            // TODO: optimise choice over whether to switch from DIGITS to DELTA
829
            // regularly vs all DIGITS, also MATCH vs DELTA 0.
830
0
            if (pnum < cnum && ntok < ctx->lc[pnum].last_ntok && ctx->lc[pnum].last[ntok].token_type == N_DIGITS0) {
831
0
                d = v - ctx->lc[pnum].last[ntok].token_int;
832
0
                if (d == 0 && ctx->lc[pnum].last[ntok].token_str == s-i) {
833
#ifdef ENC_DEBUG
834
                    fprintf(stderr, "Tok %d (dig-mat, %d)\n", N_MATCH, v);
835
#endif
836
0
                    if (encode_token_match(ctx, ntok) < 0) return -1;
837
                    //ctx->lc[pnum].last[ntok].token_delta=0;
838
0
                } else if (mode == 1 && d < 256 && d >= 0 && ctx->lc[pnum].last[ntok].token_str == s-i) {
839
#ifdef ENC_DEBUG
840
                    fprintf(stderr, "Tok %d (dig-delta, %d / %d)\n", N_DDELTA, ctx->lc[pnum].last[ntok].token_int, v);
841
#endif
842
                    //if (encode_token_int1_(ctx, ntok, N_DZLEN, s-i) < 0) return -1;
843
0
                    if (encode_token_int1(ctx, ntok, N_DDELTA0, d) < 0) return -1;
844
                    //ctx->lc[pnum].last[ntok].token_delta=1;
845
0
                } else {
846
#ifdef ENC_DEBUG
847
                    fprintf(stderr, "Tok %d (dig, %d / %d)\n", N_DIGITS, ctx->lc[pnum].last[ntok].token_int, v);
848
#endif
849
0
                    if (encode_token_int1_(ctx, ntok, N_DZLEN, s-i) < 0) return -1;
850
0
                    if (encode_token_int(ctx, ntok, N_DIGITS0, v) < 0) return -1;
851
                    //ctx->lc[pnum].last[ntok].token_delta=0;
852
0
                }
853
0
            } else {
854
#ifdef ENC_DEBUG
855
                fprintf(stderr, "Tok %d (new dig, %d)\n", N_DIGITS, v);
856
#endif
857
0
                if (encode_token_int1_(ctx, ntok, N_DZLEN, s-i) < 0) return -1;
858
0
                if (encode_token_int(ctx, ntok, N_DIGITS0, v) < 0) return -1;
859
                //ctx->lc[pnum].last[ntok].token_delta=0;
860
0
            }
861
862
0
            ctx->lc[cnum].last[ntok].token_str = s-i; // length
863
0
            ctx->lc[cnum].last[ntok].token_int = v;
864
0
            ctx->lc[cnum].last[ntok].token_type = N_DIGITS0;
865
866
0
            i = s-1;
867
0
        } else if (isdigit(name[i])) {
868
            // digits starting 1-9; encode value
869
0
            uint32_t s = i;
870
0
            uint32_t v = 0;
871
0
            int d = 0;
872
873
0
            while (s < len && isdigit(name[s]) && s-i < 9) {
874
0
                v = v*10 + name[s] - '0';
875
                //putchar(name[s]);
876
0
                s++;
877
0
            }
878
879
            // dataset/10/K562_cytosol_LID8465_TopHat_v2.names
880
            // col 4 is Illumina lane - we don't want match & delta in there
881
            // as it has multiple lanes (so not ALL match) and delta is just
882
            // random chance, increasing entropy instead.
883
//          if (ntok == 4  || ntok == 8 || ntok == 10) {
884
//              encode_token_int(ctx, ntok, N_DIGITS, v);
885
//          } else {
886
887
            // If the last token was DIGITS0 and we are the same length, then encode
888
            // using that method instead as it seems likely the entire column is fixed
889
            // width, sometimes with leading zeros.
890
0
            if (pnum < cnum && ntok < ctx->lc[pnum].last_ntok &&
891
0
                ctx->lc[pnum].last[ntok].token_type == N_DIGITS0 &&
892
0
                ctx->lc[pnum].last[ntok].token_str == s-i)
893
0
                goto digits0;
894
            
895
            // TODO: optimise choice over whether to switch from DIGITS to DELTA
896
            // regularly vs all DIGITS, also MATCH vs DELTA 0.
897
0
            if (pnum < cnum && ntok < ctx->lc[pnum].last_ntok && ctx->lc[pnum].last[ntok].token_type == N_DIGITS) {
898
0
                d = v - ctx->lc[pnum].last[ntok].token_int;
899
0
                if (d == 0) {
900
#ifdef ENC_DEBUG
901
                    fprintf(stderr, "Tok %d (dig-mat, %d)\n", N_MATCH, v);
902
#endif
903
0
                    if (encode_token_match(ctx, ntok) < 0) return -1;
904
                    //ctx->lc[pnum].last[ntok].token_delta=0;
905
                    //ctx->token_zcount[ntok]++;
906
0
                } else if (mode == 1 && d < 256 && d >= 0
907
                           //&& (10+ctx->token_dcount[ntok]) > (ctx->token_icount[ntok]+ctx->token_zcount[ntok])
908
0
                           && (5+ctx->token_dcount[ntok]) > ctx->token_icount[ntok]
909
0
                           ) {
910
#ifdef ENC_DEBUG
911
                    fprintf(stderr, "Tok %d (dig-delta, %d / %d)\n", N_DDELTA, ctx->lc[pnum].last[ntok].token_int, v);
912
#endif
913
0
                    if (encode_token_int1(ctx, ntok, N_DDELTA, d) < 0) return -1;
914
                    //ctx->lc[pnum].last[ntok].token_delta=1;
915
0
                    ctx->token_dcount[ntok]++;
916
0
                } else {
917
#ifdef ENC_DEBUG
918
                    fprintf(stderr, "Tok %d (dig, %d / %d)\n", N_DIGITS, ctx->lc[pnum].last[ntok].token_int, v);
919
#endif
920
0
                    if (encode_token_int(ctx, ntok, N_DIGITS, v) < 0) return -1;
921
                    //ctx->lc[pnum].last[ntok].token_delta=0;
922
0
                    ctx->token_icount[ntok]++;
923
0
                }
924
0
            } else {
925
#ifdef ENC_DEBUG
926
                fprintf(stderr, "Tok %d (new dig, %d)\n", N_DIGITS, v);
927
#endif
928
0
                if (encode_token_int(ctx, ntok, N_DIGITS, v) < 0) return -1;
929
                //ctx->lc[pnum].last[ntok].token_delta=0;
930
0
            }
931
//          }
932
933
0
            ctx->lc[cnum].last[ntok].token_int = v;
934
0
            ctx->lc[cnum].last[ntok].token_type = N_DIGITS;
935
936
0
            i = s-1;
937
0
        } else {
938
0
        n_char:
939
            //if (!isalpha(name[i])) putchar(name[i]);
940
0
            if (pnum < cnum && ntok < ctx->lc[pnum].last_ntok && ctx->lc[pnum].last[ntok].token_type == N_CHAR) {
941
0
                if (name[i] == ctx->lc[pnum].last[ntok].token_int) {
942
#ifdef ENC_DEBUG
943
                    fprintf(stderr, "Tok %d (chr-mat, %c)\n", N_MATCH, name[i]);
944
#endif
945
0
                    if (encode_token_match(ctx, ntok) < 0) return -1;
946
0
                } else {
947
#ifdef ENC_DEBUG
948
                    fprintf(stderr, "Tok %d (chr, %c / %c)\n", N_CHAR, ctx->lc[pnum].last[ntok].token_int, name[i]);
949
#endif
950
0
                    if (encode_token_char(ctx, ntok, name[i]) < 0) return -1;
951
0
                }
952
0
            } else {
953
#ifdef ENC_DEBUG
954
                fprintf(stderr, "Tok %d (new chr, %c)\n", N_CHAR, name[i]);
955
#endif
956
0
                if (encode_token_char(ctx, ntok, name[i]) < 0) return -1;
957
0
            }
958
959
0
            ctx->lc[cnum].last[ntok].token_int = name[i];
960
0
            ctx->lc[cnum].last[ntok].token_type = N_CHAR;
961
0
        }
962
963
0
        ntok++;
964
        //putchar(' ');
965
0
    }
966
967
#ifdef ENC_DEBUG
968
    fprintf(stderr, "Tok %d (end)\n", N_END);
969
#endif
970
0
    if (ntok >= ctx->max_tok) {
971
0
        memset(&ctx->desc[ctx->max_tok << 4], 0, 16*sizeof(ctx->desc[0]));
972
0
        memset(&ctx->token_dcount[ctx->max_tok], 0, sizeof(int));
973
0
        memset(&ctx->token_icount[ctx->max_tok], 0, sizeof(int));
974
0
        ctx->max_tok = ntok+1;
975
0
    }
976
0
    if (encode_token_end(ctx, ntok) < 0) return -1;
977
#ifdef ENC_DEBUG
978
    fprintf(stderr, "ntok=%d max_tok=%d\n", ntok, ctx->max_tok);
979
#endif
980
981
    //printf("Encoded %.*s with %d tokens\n", len, name, ntok);
982
    
983
0
    ctx->lc[cnum].last_name = name;
984
0
    ctx->lc[cnum].last_ntok = ntok;
985
0
    last_context_tok *shrunk = realloc(ctx->lc[cnum].last,
986
0
                                       (ntok+1) * sizeof(*ctx->lc[cnum].last));
987
0
    if (shrunk)
988
0
        ctx->lc[cnum].last = shrunk;
989
990
0
    if (!ctx->lc[cnum].last)
991
0
        return -1;
992
993
0
    return 0;
994
0
}
995
996
//-----------------------------------------------------------------------------
997
// Name decoder
998
999
2.16k
static int decode_name(name_context *ctx, char *name, int name_len) {
1000
2.16k
    int t0 = decode_token_type(ctx, 0);
1001
2.16k
    uint32_t dist;
1002
2.16k
    int pnum, cnum = ctx->counter++;
1003
1004
2.16k
    if (cnum >= ctx->max_names)
1005
0
        return -1;
1006
1007
2.16k
    if (t0 < 0 || t0 >= ctx->max_tok*16)
1008
2
        return 0;
1009
1010
2.16k
    if (decode_token_int(ctx, 0, t0, &dist) < 0 || dist > cnum)
1011
4
        return -1;
1012
2.15k
    if ((pnum = cnum - dist) < 0) pnum = 0;
1013
1014
    //fprintf(stderr, "t0=%d, dist=%d, pnum=%d, cnum=%d\n", t0, dist, pnum, cnum);
1015
1016
2.15k
    if (t0 == N_DUP) {
1017
0
        if (pnum == cnum)
1018
0
            return -1;
1019
1020
0
        if (strlen(ctx->lc[pnum].last_name) +1 >= name_len) return -1;
1021
0
        strcpy(name, ctx->lc[pnum].last_name);
1022
        // FIXME: optimise this
1023
0
        ctx->lc[cnum].last_name = name;
1024
0
        ctx->lc[cnum].last_ntok = ctx->lc[pnum].last_ntok;
1025
1026
0
        int nc = ctx->lc[cnum].last_ntok ? ctx->lc[cnum].last_ntok : MAX_TOKENS;
1027
0
        ctx->lc[cnum].last = malloc(nc * sizeof(*ctx->lc[cnum].last));
1028
0
        if (!ctx->lc[cnum].last)
1029
0
            return -1;
1030
0
        memcpy(ctx->lc[cnum].last, ctx->lc[pnum].last,
1031
0
               ctx->lc[cnum].last_ntok * sizeof(*ctx->lc[cnum].last));
1032
1033
0
        return strlen(name)+1;
1034
0
    }
1035
1036
2.15k
    *name = 0;
1037
2.15k
    int ntok, len = 0, len2;
1038
2.15k
    ctx->lc[cnum].last = malloc(MAX_TOKENS * sizeof(*ctx->lc[cnum].last));
1039
2.15k
    if (!ctx->lc[cnum].last)
1040
0
        return -1;
1041
1042
4.28k
    for (ntok = 1; ntok < MAX_TOKENS && ntok < ctx->max_tok; ntok++) {
1043
4.28k
        uint32_t v, vl;
1044
4.28k
        enum name_type tok;
1045
4.28k
        tok = decode_token_type(ctx, ntok);
1046
        //fprintf(stderr, "Tok %d = %d\n", ntok, tok);
1047
1048
4.28k
        ctx->lc[cnum].last_ntok = 0;
1049
1050
4.28k
        switch (tok) {
1051
0
        case N_CHAR:
1052
0
            if (len+1 >= name_len) return -1;
1053
0
            if (decode_token_char(ctx, ntok, &name[len]) < 0) return -1;
1054
            //fprintf(stderr, "Tok %d CHAR %c\n", ntok, name[len]);
1055
0
            ctx->lc[cnum].last[ntok].token_type = N_CHAR;
1056
0
            ctx->lc[cnum].last[ntok].token_int  = name[len++];
1057
0
            break;
1058
1059
0
        case N_ALPHA:
1060
0
            if ((len2 = decode_token_alpha(ctx, ntok, &name[len], name_len - len)) < 0)
1061
0
                return -1;
1062
            //fprintf(stderr, "Tok %d ALPHA %.*s\n", ntok, len2, &name[len]);
1063
0
            ctx->lc[cnum].last[ntok].token_type = N_ALPHA;
1064
0
            ctx->lc[cnum].last[ntok].token_str  = len;
1065
0
            ctx->lc[cnum].last[ntok].token_int  = len2;
1066
0
            len += len2;
1067
0
            break;
1068
1069
0
        case N_DIGITS0: // [0-9]*
1070
0
            if (decode_token_int1(ctx, ntok, N_DZLEN, &vl) < 0) return -1;
1071
0
            if (decode_token_int(ctx, ntok, N_DIGITS0, &v) < 0) return -1;
1072
0
            if (len+20+vl >= name_len) return -1;
1073
0
            len += append_uint32_fixed(&name[len], v, vl);
1074
            //fprintf(stderr, "Tok %d DIGITS0 %0*d\n", ntok, vl, v);
1075
0
            ctx->lc[cnum].last[ntok].token_type = N_DIGITS0;
1076
0
            ctx->lc[cnum].last[ntok].token_int = v;
1077
0
            ctx->lc[cnum].last[ntok].token_str = vl;
1078
0
            break;
1079
1080
0
        case N_DDELTA0:
1081
0
            if (ntok >= ctx->lc[pnum].last_ntok) return -1;
1082
0
            if (decode_token_int1(ctx, ntok, N_DDELTA0, &v) < 0) return -1;
1083
0
            v += ctx->lc[pnum].last[ntok].token_int;
1084
0
            if (len+ctx->lc[pnum].last[ntok].token_str+1 >= name_len) return -1;
1085
0
            len += append_uint32_fixed(&name[len], v, ctx->lc[pnum].last[ntok].token_str);
1086
            //fprintf(stderr, "Tok %d DELTA0 %0*d\n", ntok, ctx->lc[pnum].last[ntok].token_str, v);
1087
0
            ctx->lc[cnum].last[ntok].token_type = N_DIGITS0;
1088
0
            ctx->lc[cnum].last[ntok].token_int = v;
1089
0
            ctx->lc[cnum].last[ntok].token_str = ctx->lc[pnum].last[ntok].token_str;
1090
0
            break;
1091
1092
2.12k
        case N_DIGITS: // [1-9][0-9]*
1093
2.12k
            if (decode_token_int(ctx, ntok, N_DIGITS, &v) < 0) return -1;
1094
2.12k
            if (len+20 >= name_len) return -1;
1095
2.12k
            len += append_uint32_var(&name[len], v);
1096
            //fprintf(stderr, "Tok %d DIGITS %d\n", ntok, v);
1097
2.12k
            ctx->lc[cnum].last[ntok].token_type = N_DIGITS;
1098
2.12k
            ctx->lc[cnum].last[ntok].token_int = v;
1099
2.12k
            break;
1100
1101
0
        case N_DDELTA:
1102
0
            if (ntok >= ctx->lc[pnum].last_ntok) return -1;
1103
0
            if (decode_token_int1(ctx, ntok, N_DDELTA, &v) < 0) return -1;
1104
0
            v += ctx->lc[pnum].last[ntok].token_int;
1105
0
            if (len+20 >= name_len) return -1;
1106
0
            len += append_uint32_var(&name[len], v);
1107
            //fprintf(stderr, "Tok %d DELTA %d\n", ntok, v);
1108
0
            ctx->lc[cnum].last[ntok].token_type = N_DIGITS;
1109
0
            ctx->lc[cnum].last[ntok].token_int = v;
1110
0
            break;
1111
1112
0
        case N_NOP:
1113
0
            ctx->lc[cnum].last[ntok].token_type = N_NOP;
1114
0
            break;
1115
1116
0
        case N_MATCH:
1117
0
            if (ntok >= ctx->lc[pnum].last_ntok) return -1;
1118
0
            switch (ctx->lc[pnum].last[ntok].token_type) {
1119
0
            case N_CHAR:
1120
0
                if (len+1 >= name_len) return -1;
1121
0
                name[len++] = ctx->lc[pnum].last[ntok].token_int;
1122
                //fprintf(stderr, "Tok %d MATCH CHAR %c\n", ntok, ctx->lc[pnum].last[ntok].token_int);
1123
0
                ctx->lc[cnum].last[ntok].token_type = N_CHAR;
1124
0
                ctx->lc[cnum].last[ntok].token_int = ctx->lc[pnum].last[ntok].token_int;
1125
0
                break;
1126
1127
0
            case N_ALPHA:
1128
0
                if (ctx->lc[pnum].last[ntok].token_int < 0 ||
1129
0
                    len+ctx->lc[pnum].last[ntok].token_int >= name_len) return -1;
1130
0
                memcpy(&name[len],
1131
0
                       &ctx->lc[pnum].last_name[ctx->lc[pnum].last[ntok].token_str],
1132
0
                       ctx->lc[pnum].last[ntok].token_int);
1133
                //fprintf(stderr, "Tok %d MATCH ALPHA %.*s\n", ntok, ctx->lc[pnum].last[ntok].token_int, &name[len]);
1134
0
                ctx->lc[cnum].last[ntok].token_type = N_ALPHA;
1135
0
                ctx->lc[cnum].last[ntok].token_str = len;
1136
0
                ctx->lc[cnum].last[ntok].token_int = ctx->lc[pnum].last[ntok].token_int;
1137
0
                len += ctx->lc[pnum].last[ntok].token_int;
1138
0
                break;
1139
1140
0
            case N_DIGITS:
1141
0
                if (len+20 >= name_len) return -1;
1142
0
                len += append_uint32_var(&name[len], ctx->lc[pnum].last[ntok].token_int);
1143
                //fprintf(stderr, "Tok %d MATCH DIGITS %d\n", ntok, ctx->lc[pnum].last[ntok].token_int);
1144
0
                ctx->lc[cnum].last[ntok].token_type = N_DIGITS;
1145
0
                ctx->lc[cnum].last[ntok].token_int = ctx->lc[pnum].last[ntok].token_int;
1146
0
                break;
1147
1148
0
            case N_DIGITS0:
1149
0
                if (len+ctx->lc[pnum].last[ntok].token_str >= name_len) return -1;
1150
0
                len += append_uint32_fixed(&name[len], ctx->lc[pnum].last[ntok].token_int, ctx->lc[pnum].last[ntok].token_str);
1151
                //fprintf(stderr, "Tok %d MATCH DIGITS %0*d\n", ntok, ctx->lc[pnum].last[ntok].token_str, ctx->lc[pnum].last[ntok].token_int);
1152
0
                ctx->lc[cnum].last[ntok].token_type = N_DIGITS0;
1153
0
                ctx->lc[cnum].last[ntok].token_int = ctx->lc[pnum].last[ntok].token_int;
1154
0
                ctx->lc[cnum].last[ntok].token_str = ctx->lc[pnum].last[ntok].token_str;
1155
0
                break;
1156
1157
0
            default:
1158
0
                return -1;
1159
0
            }
1160
0
            break;
1161
1162
2.15k
        default: // an elided N_END
1163
2.15k
        case N_END:
1164
2.15k
            if (len+1 >= name_len) return -1;
1165
2.15k
            name[len++] = 0;
1166
2.15k
            ctx->lc[cnum].last[ntok].token_type = N_END;
1167
1168
2.15k
            ctx->lc[cnum].last_name = name;
1169
2.15k
            ctx->lc[cnum].last_ntok = ntok;
1170
1171
2.15k
            last_context_tok *shrunk
1172
2.15k
                = realloc(ctx->lc[cnum].last,
1173
2.15k
                          (ntok+1) * sizeof(*ctx->lc[cnum].last));
1174
2.15k
            if (shrunk)
1175
2.15k
                ctx->lc[cnum].last = shrunk;
1176
1177
2.15k
            if (!ctx->lc[cnum].last)
1178
0
                return -1;
1179
1180
2.15k
            return len;
1181
4.28k
        }
1182
4.28k
    }
1183
1184
1185
0
    return -1;
1186
2.15k
}
1187
1188
//-----------------------------------------------------------------------------
1189
// arith adaptive codec or static rANS 4x16pr codec
1190
0
static int arith_encode(uint8_t *in, uint64_t in_len, uint8_t *out, uint64_t *out_len, int method) {
1191
0
    unsigned int olen = *out_len-6, nb;
1192
0
    if (arith_compress_to(in, in_len, out+6, &olen, method) == NULL)
1193
0
        return -1;
1194
1195
0
    nb = var_put_u32(out, out + *out_len, olen);
1196
0
    memmove(out+nb, out+6, olen);
1197
0
    *out_len = olen+nb;
1198
1199
0
    return 0;
1200
0
}
1201
1202
// Returns number of bytes read from 'in' on success,
1203
//        -1 on failure.
1204
3.83k
static int64_t arith_decode(uint8_t *in, uint64_t in_len, uint8_t *out, uint64_t *out_len) {
1205
3.83k
    unsigned int olen = *out_len;
1206
1207
3.83k
    uint32_t clen;
1208
3.83k
    int nb = var_get_u32(in, in+in_len, &clen);
1209
    //fprintf(stderr, "Arith decode %x\n", in[nb]);
1210
3.83k
    if (arith_uncompress_to(in+nb, in_len-nb, out, &olen) == NULL)
1211
17
        return -1;
1212
    //fprintf(stderr, "    Stored clen=%d\n", (int)clen);
1213
3.81k
    *out_len = olen;
1214
3.81k
    return clen+nb;
1215
3.83k
}
1216
1217
0
static int rans_encode(uint8_t *in, uint64_t in_len, uint8_t *out, uint64_t *out_len, int method) {
1218
0
    unsigned int olen = *out_len-6, nb;
1219
0
    if (rans_compress_to_4x16(in, in_len, out+6, &olen, method) == NULL)
1220
0
        return -1;
1221
1222
0
    nb = var_put_u32(out, out + *out_len, olen);
1223
0
    memmove(out+nb, out+6, olen);
1224
0
    *out_len = olen+nb;
1225
1226
0
    return 0;
1227
0
}
1228
1229
// Returns number of bytes read from 'in' on success,
1230
//        -1 on failure.
1231
5.01k
static int64_t rans_decode(uint8_t *in, uint64_t in_len, uint8_t *out, uint64_t *out_len) {
1232
5.01k
    unsigned int olen = *out_len;
1233
1234
5.01k
    uint32_t clen;
1235
5.01k
    int nb = var_get_u32(in, in+in_len, &clen);
1236
    //fprintf(stderr, "Arith decode %x\n", in[nb]);
1237
5.01k
    if (rans_uncompress_to_4x16(in+nb, in_len-nb, out, &olen) == NULL)
1238
83
        return -1;
1239
    //fprintf(stderr, "    Stored clen=%d\n", (int)clen);
1240
4.93k
    *out_len = olen;
1241
4.93k
    return clen+nb;
1242
5.01k
}
1243
1244
static int compress(uint8_t *in, uint64_t in_len, int level, int use_arith,
1245
0
                    uint8_t *out, uint64_t *out_len) {
1246
0
    uint64_t best_sz = UINT64_MAX;
1247
0
    int best = 0;
1248
0
    uint64_t olen = *out_len;
1249
1250
    //fprintf(stderr, "=== try %d ===\n", (int)in_len);
1251
1252
0
    int m, rmethods[5][12] = {
1253
0
        {2,   0,      128},                                   // 1
1254
0
        {2,   0,                         192+8},              // 3
1255
0
        {3,   0,  128,                   193+8},              // 5
1256
0
        {6,   0,1,    129,   65,    193, 193+8},              // 7
1257
0
        {9,   0,1,128,129,64,65,192,193, 193+8},              // 9
1258
0
    };
1259
1260
    // 1-9 to 0-4
1261
0
    level = (level-1)/2;
1262
0
    if (level<0) level=0;
1263
0
    if (level>4) level=4;
1264
1265
0
    for (m = 1; m <= rmethods[level][0]; m++) {
1266
0
        *out_len = olen;
1267
1268
0
        if (in_len % 4 != 0 && (rmethods[level][m] & 8))
1269
0
            continue;
1270
1271
0
        if (use_arith) {
1272
0
            if (arith_encode(in, in_len, out, out_len, rmethods[level][m]) < 0)
1273
0
                return -1;
1274
0
        } else {
1275
0
            if (rans_encode(in, in_len, out, out_len, rmethods[level][m]) < 0)
1276
0
                return -1;
1277
0
        }
1278
1279
0
        if (best_sz > *out_len) {
1280
0
            best_sz = *out_len;
1281
0
            best = rmethods[level][m];
1282
0
        }
1283
0
    }
1284
1285
0
    *out_len = olen;
1286
0
    if (use_arith) {
1287
0
        if (arith_encode(in, in_len, out, out_len, best) < 0)
1288
0
            return -1;
1289
0
    } else {
1290
0
        if (rans_encode(in, in_len, out, out_len, best) < 0)
1291
0
            return -1;
1292
0
    }
1293
1294
//    uint64_t tmp;
1295
//    fprintf(stderr, "%d -> %d via method %x, %x\n", (int)in_len, (int)best_sz, best, out[i7get(out,&tmp)]);
1296
1297
0
    return 0;
1298
0
}
1299
1300
8.84k
static uint64_t uncompressed_size(uint8_t *in, uint64_t in_len) {
1301
8.84k
    uint32_t clen, ulen;
1302
1303
    // in[0] in part of buffer written by us
1304
8.84k
    int nb = var_get_u32(in, in+in_len, &clen);
1305
1306
    // in[nb] is part of buffer written to by arith_dynamic.
1307
8.84k
    var_get_u32(in+nb+1, in+in_len, &ulen);
1308
1309
8.84k
    return ulen;
1310
8.84k
}
1311
1312
static int uncompress(int use_arith, uint8_t *in, uint64_t in_len,
1313
8.84k
                      uint8_t *out, uint64_t *out_len) {
1314
8.84k
    uint32_t clen;
1315
8.84k
    var_get_u32(in, in+in_len, &clen);
1316
8.84k
    return use_arith
1317
8.84k
        ? arith_decode(in, in_len, out, out_len)
1318
8.84k
        : rans_decode(in, in_len, out, out_len);
1319
8.84k
}
1320
1321
//-----------------------------------------------------------------------------
1322
1323
/*
1324
 * Converts a line or \0 separated block of reading names to a compressed buffer.
1325
 * The code can only encode whole lines and will not attempt a partial line.
1326
 * Use the "last_start_p" return value to identify the partial line start
1327
 * offset, for continuation purposes.
1328
 *
1329
 * Returns a malloced buffer holding compressed data of size *out_len,
1330
 *         or NULL on failure
1331
 */
1332
uint8_t *tok3_encode_names(char *blk, int len, int level, int use_arith,
1333
0
                           int *out_len, int *last_start_p) {
1334
0
    int last_start = 0, i, j, nreads;
1335
1336
0
    if (len < 0) {
1337
0
        *out_len = 0;
1338
0
        return NULL;
1339
0
    }
1340
1341
    // Count lines
1342
0
    for (nreads = i = 0; i < len; i++)
1343
0
        if (blk[i] <= '\n') // \n or \0 separated entries
1344
0
            nreads++;
1345
1346
0
    name_context *ctx = create_context(nreads);
1347
0
    if (!ctx)
1348
0
        return NULL;
1349
1350
    // Construct trie
1351
0
    int ctr = 0;
1352
0
    for (i = j = 0; i < len; j=++i) {
1353
0
        while (i < len && blk[i] > '\n')
1354
0
            i++;
1355
0
        if (i >= len)
1356
0
            break;
1357
1358
        //blk[i] = '\0';
1359
0
        last_start = i+1;
1360
0
        if (build_trie(ctx, &blk[j], i-j, ctr++) < 0) {
1361
0
            free_context(ctx);
1362
0
            return NULL;
1363
0
        }
1364
0
    }
1365
0
    if (last_start_p)
1366
0
        *last_start_p = last_start;
1367
1368
    //fprintf(stderr, "Processed %d of %d in block, line %d\n", last_start, len, ctr);
1369
1370
    // Encode name
1371
0
    for (i = j = 0; i < len; j=++i) {
1372
0
        while (i < len && blk[i] > '\n')
1373
0
            i++;
1374
0
        if (i >= len)
1375
0
            break;
1376
1377
0
        blk[i] = '\0';
1378
        // try both 0 and 1 and pick best?
1379
0
        if (encode_name(ctx, &blk[j], i-j, 1) < 0) {
1380
0
            free_context(ctx);
1381
0
            return NULL;
1382
0
        }
1383
0
    }
1384
1385
#if 0
1386
    for (i = 0; i < ctx->max_tok*16; i++) {
1387
        char fn[1024];
1388
        if (!ctx->desc[i].buf_l) continue;
1389
        sprintf(fn, "_tok.%02d_%02d.%d", i>>4,i&15,i);
1390
        FILE *fp = fopen(fn, "w");
1391
        fwrite(ctx->desc[i].buf, 1, ctx->desc[i].buf_l, fp);
1392
        fclose(fp);
1393
    }
1394
#endif
1395
1396
    //dump_trie(t_head, 0);
1397
1398
    // FIXME: merge descriptors
1399
    //
1400
    // If we see foo7:1 foo7:12 foo7:7 etc then foo: is constant,
1401
    // but it's encoded as alpha<foo>+dig<7>+char<:> instead of alpha<foo7:>.
1402
    // Any time token type 0 is all match beyond the first location we have
1403
    // a candidate for merging in string form.
1404
    //
1405
    // This saves around .1 to 1.3 percent on varying data sets.
1406
    // Cruder hack is dedicated prefix/suffix matching to short-cut this.
1407
1408
1409
    // Drop N_TYPE blocks if they all contain matches bar the first item,
1410
    // as we can regenerate these from the subsequent blocks types during
1411
    // decode.
1412
0
    for (i = 0; i < ctx->max_tok*16; i+=16) {
1413
0
        if (!ctx->desc[i].buf_l) continue;
1414
1415
0
        int z;
1416
0
        for (z=1; z<ctx->desc[i].buf_l; z++) {
1417
0
            if (ctx->desc[i].buf[z] != N_MATCH)
1418
0
                break;
1419
0
        }
1420
0
        if (z == ctx->desc[i].buf_l) {
1421
0
            int k;
1422
0
            for (k=1; k<16; k++)
1423
0
                if (ctx->desc[i+k].buf_l)
1424
0
                    break;
1425
1426
0
            if (k < 16) {
1427
0
                ctx->desc[i].buf_l = 0;
1428
0
                free(ctx->desc[i].buf);
1429
0
                ctx->desc[i].buf = NULL;
1430
0
            }
1431
0
        }
1432
0
    }
1433
1434
    // Serialise descriptors
1435
0
    uint32_t tot_size = 9;
1436
0
    int ndesc = 0;
1437
0
    for (i = 0; i < ctx->max_tok*16; i++) {
1438
0
        if (!ctx->desc[i].buf_l) continue;
1439
1440
0
        ndesc++;
1441
1442
0
        int tnum = i>>4;
1443
0
        int ttype = i&15;
1444
1445
0
        uint64_t out_len = 1.5 * arith_compress_bound(ctx->desc[i].buf_l, 1); // guesswork
1446
0
        uint8_t *out = malloc(out_len);
1447
0
        if (!out) {
1448
0
            free_context(ctx);
1449
0
            return NULL;
1450
0
        }
1451
1452
0
        if (compress(ctx->desc[i].buf, ctx->desc[i].buf_l, level, use_arith,
1453
0
                     out, &out_len) < 0) {
1454
0
            free_context(ctx);
1455
0
            return NULL;
1456
0
        }
1457
1458
0
        free(ctx->desc[i].buf);
1459
0
        ctx->desc[i].buf = out;
1460
0
        ctx->desc[i].buf_l = out_len;
1461
0
        ctx->desc[i].tnum = tnum;
1462
0
        ctx->desc[i].ttype = ttype;
1463
1464
        // Find dups
1465
0
        int j;
1466
0
        for (j = 0; j < i; j++) {
1467
0
            if (!ctx->desc[j].buf)
1468
0
                continue;
1469
0
            if (ctx->desc[i].buf_l != ctx->desc[j].buf_l || ctx->desc[i].buf_l <= 4)
1470
0
                continue;
1471
0
            if (memcmp(ctx->desc[i].buf, ctx->desc[j].buf, ctx->desc[i].buf_l) == 0)
1472
0
                break;
1473
0
        }
1474
0
        if (j < i) {
1475
0
            ctx->desc[i].dup_from = j;
1476
0
            tot_size += 3; // flag, dup_from, ttype
1477
0
        } else {
1478
0
            ctx->desc[i].dup_from = 0;
1479
0
            tot_size += out_len + 1; // ttype
1480
0
        }
1481
0
    }
1482
1483
#if 0
1484
    for (i = 0; i < ctx->max_tok*16; i++) {
1485
        char fn[1024];
1486
        if (!ctx->desc[i].buf_l && !ctx->desc[i].dup_from) continue;
1487
        sprintf(fn, "_tok.%02d_%02d.%d.comp", i>>4,i&15,i);
1488
        FILE *fp = fopen(fn, "w");
1489
        fwrite(ctx->desc[i].buf, 1, ctx->desc[i].buf_l, fp);
1490
        fclose(fp);
1491
    }
1492
#endif
1493
1494
    // Write
1495
0
    uint8_t *out = malloc(tot_size+13);
1496
0
    if (!out) {
1497
0
        free_context(ctx);
1498
0
        return NULL;
1499
0
    }
1500
1501
0
    uint8_t *cp = out;
1502
1503
0
    *out_len = tot_size;
1504
//    *(uint32_t *)cp = last_start; cp += 4;
1505
//    *(uint32_t *)cp = nreads;     cp += 4;
1506
0
    *cp++ = (last_start >>  0) & 0xff;
1507
0
    *cp++ = (last_start >>  8) & 0xff;
1508
0
    *cp++ = (last_start >> 16) & 0xff;
1509
0
    *cp++ = (last_start >> 24) & 0xff;
1510
0
    *cp++ = (nreads     >>  0) & 0xff;
1511
0
    *cp++ = (nreads     >>  8) & 0xff;
1512
0
    *cp++ = (nreads     >> 16) & 0xff;
1513
0
    *cp++ = (nreads     >> 24) & 0xff;
1514
0
    *cp++ = use_arith;
1515
    //write(1, &nreads, 4);
1516
0
    int last_tnum = -1;
1517
0
    for (i = 0; i < ctx->max_tok*16; i++) {
1518
0
        if (!ctx->desc[i].buf_l) continue;
1519
0
        uint8_t ttype8 = ctx->desc[i].ttype;
1520
0
        if (ctx->desc[i].tnum != last_tnum) {
1521
0
            ttype8 |= 128;
1522
0
            last_tnum = ctx->desc[i].tnum;
1523
0
        }
1524
0
        if (ctx->desc[i].dup_from) {
1525
            //fprintf(stderr, "Dup %d from %d, sz %d\n", i, ctx->desc[i].dup_from, ctx->desc[i].buf_l);
1526
0
            *cp++ = ttype8 | 64;
1527
0
            *cp++ = ctx->desc[i].dup_from >> 4;
1528
0
            *cp++ = ctx->desc[i].dup_from & 15;
1529
0
        } else {
1530
0
            *cp++ = ttype8;
1531
0
            memcpy(cp, ctx->desc[i].buf, ctx->desc[i].buf_l);
1532
0
            cp += ctx->desc[i].buf_l;
1533
0
        }
1534
0
    }
1535
1536
    //assert(cp-out == tot_size);
1537
1538
0
    free_context(ctx);
1539
1540
0
    return out;
1541
0
}
1542
1543
// Deprecated interface; to remove when we next to an ABI breakage
1544
uint8_t *encode_names(char *blk, int len, int level, int use_arith,
1545
0
                      int *out_len, int *last_start_p) {
1546
0
    return tok3_encode_names(blk, len, level, use_arith, out_len,
1547
0
                             last_start_p);
1548
0
}
1549
1550
/*
1551
 * Decodes a compressed block of read names into \0 separated names.
1552
 * The size of the data returned (malloced) is in *out_len.
1553
 *
1554
 * Returns NULL on failure.
1555
 */
1556
126
uint8_t *tok3_decode_names(uint8_t *in, uint32_t sz, uint32_t *out_len) {
1557
126
    if (sz < 9)
1558
0
        return NULL;
1559
1560
126
    int i, o = 9;
1561
    //int ulen   = *(uint32_t *)in;
1562
126
    int ulen   = (in[0]<<0) | (in[1]<<8) | (in[2]<<16) |
1563
126
        (((uint32_t)in[3])<<24);
1564
1565
126
    if (ulen < 0 || ulen >= INT_MAX-1024)
1566
0
        return NULL;
1567
1568
126
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
1569
    // Speed up fuzzing by blocking excessive sizes
1570
126
    if (ulen > 100000)
1571
0
        return NULL;
1572
126
#endif
1573
1574
    //int nreads = *(uint32_t *)(in+4);
1575
126
    int nreads = (in[4]<<0) | (in[5]<<8) | (in[6]<<16) | (((uint32_t)in[7])<<24);
1576
126
    int use_arith = in[8];
1577
126
    name_context *ctx = create_context(nreads);
1578
126
    if (!ctx)
1579
0
        return NULL;
1580
1581
    // Unpack descriptors
1582
126
    int tnum = -1;
1583
9.21k
    while (o < sz) {
1584
9.21k
        uint8_t ttype = in[o++];
1585
9.21k
        if (ttype & 64) {
1586
364
            if (o+2 >= sz) goto err;
1587
364
            int j = in[o++]<<4;
1588
364
            j += in[o++];
1589
364
            if (ttype & 128) {
1590
120
                tnum++;
1591
120
                if (tnum >= MAX_TOKENS)
1592
0
                    goto err;
1593
120
                ctx->max_tok = tnum+1;
1594
120
                memset(&ctx->desc[tnum<<4], 0, 16*sizeof(ctx->desc[tnum]));
1595
120
            }
1596
1597
364
            if ((ttype & 15) != 0 && (ttype & 128)) {
1598
54
                if (tnum < 0) goto err;
1599
54
                ctx->desc[tnum<<4].buf = malloc(nreads);
1600
54
                if (!ctx->desc[tnum<<4].buf)
1601
0
                    goto err;
1602
1603
54
                ctx->desc[tnum<<4].buf_l = 0;
1604
54
                ctx->desc[tnum<<4].buf_a = nreads;
1605
54
                ctx->desc[tnum<<4].buf[0] = ttype&15;
1606
54
                memset(&ctx->desc[tnum<<4].buf[1], N_MATCH, nreads-1);
1607
54
            }
1608
1609
364
            if (tnum < 0) goto err;
1610
364
            i = (tnum<<4) | (ttype&15);
1611
364
            if (j >= i)
1612
14
                goto err;
1613
350
            if (!ctx->desc[j].buf)
1614
1
                goto err; // Attempt to copy a non-existent stream
1615
1616
349
            ctx->desc[i].buf_l = 0;
1617
349
            ctx->desc[i].buf_a = ctx->desc[j].buf_a;
1618
349
            if (ctx->desc[i].buf) free(ctx->desc[i].buf);
1619
349
            ctx->desc[i].buf = malloc(ctx->desc[i].buf_a);
1620
349
            if (!ctx->desc[i].buf)
1621
0
                goto err;
1622
1623
349
            memcpy(ctx->desc[i].buf, ctx->desc[j].buf, ctx->desc[i].buf_a);
1624
            //fprintf(stderr, "Copy ttype %d, i=%d,j=%d, size %d\n", ttype, i, j, (int)ctx->desc[i].buf_a);
1625
349
            continue;
1626
349
        }
1627
1628
        //if (ttype == 0)
1629
8.84k
        if (ttype & 128) {
1630
1.85k
            tnum++;
1631
1.85k
            if (tnum >= MAX_TOKENS)
1632
1
                goto err;
1633
1.85k
            ctx->max_tok = tnum+1;
1634
1.85k
            memset(&ctx->desc[tnum<<4], 0, 16*sizeof(ctx->desc[tnum]));
1635
1.85k
        }
1636
1637
8.84k
        if ((ttype & 15) != 0 && (ttype & 128)) {
1638
87
            if (tnum < 0) goto err;
1639
87
            if (ctx->desc[tnum<<4].buf) free(ctx->desc[tnum<<4].buf);
1640
87
            ctx->desc[tnum<<4].buf = malloc(nreads);
1641
87
            if (!ctx->desc[tnum<<4].buf)
1642
0
                goto err;
1643
87
            ctx->desc[tnum<<4].buf_l = 0;
1644
87
            ctx->desc[tnum<<4].buf_a = nreads;
1645
87
            ctx->desc[tnum<<4].buf[0] = ttype&15;
1646
87
            memset(&ctx->desc[tnum<<4].buf[1], N_MATCH, nreads-1);
1647
87
        }
1648
1649
        //fprintf(stderr, "Read %02x\n", c);
1650
1651
        // Load compressed block
1652
8.84k
        int64_t clen, ulen = uncompressed_size(&in[o], sz-o);
1653
8.84k
        if (ulen < 0 || ulen >= INT_MAX)
1654
0
            goto err;
1655
8.84k
        if (tnum < 0) goto err;
1656
8.84k
        i = (tnum<<4) | (ttype&15);
1657
1658
8.84k
        if (i >= MAX_TBLOCKS || i < 0)
1659
0
            goto err;
1660
1661
8.84k
        ctx->desc[i].buf_l = 0;
1662
8.84k
        if (ctx->desc[i].buf) free(ctx->desc[i].buf);
1663
8.84k
        ctx->desc[i].buf = malloc(ulen);
1664
8.84k
        if (!ctx->desc[i].buf)
1665
0
            goto err;
1666
1667
8.84k
        ctx->desc[i].buf_a = ulen;
1668
8.84k
        uint64_t usz = ctx->desc[i].buf_a; // convert from size_t for 32-bit sys
1669
8.84k
        clen = uncompress(use_arith, &in[o], sz-o, ctx->desc[i].buf, &usz);
1670
8.84k
        ctx->desc[i].buf_a = usz;
1671
8.84k
        if (clen < 0 || ctx->desc[i].buf_a != ulen)
1672
103
            goto err;
1673
1674
        // fprintf(stderr, "%d: Decode tnum %d type %d clen %d ulen %d via %d\n",
1675
        //      o, tnum, ttype, (int)clen, (int)ctx->desc[i].buf_a, ctx->desc[i].buf[0]);
1676
1677
8.74k
        o += clen;
1678
1679
        // Encode tnum 0 type 0 ulen 100000 clen 12530 via 2
1680
        // Encode tnum 0 type 6 ulen 196800 clen 43928 via 3
1681
        // Encode tnum 0 type 7 ulen 203200 clen 17531 via 3
1682
        // Encode tnum 1 type 0 ulen 50800 clen 10 via 1
1683
        // Encode tnum 1 type 1 ulen 3 clen 5 via 0
1684
        // Encode tnum 2 type 0 ulen 50800 clen 10 via 1
1685
        //      
1686
8.74k
    }
1687
1688
7
    int ret;
1689
7
    ulen += 1024; // for easy coding in decode_name.
1690
7
    uint8_t *out = malloc(ulen);
1691
7
    if (!out)
1692
0
        goto err;
1693
1694
7
    size_t out_sz = 0;
1695
2.16k
    while ((ret = decode_name(ctx, (char *)out+out_sz, ulen)) > 0) {
1696
2.15k
        out_sz += ret;
1697
2.15k
        ulen -= ret;
1698
2.15k
    }
1699
1700
7
    if (ret < 0)
1701
5
        free(out);
1702
1703
7
    free_context(ctx);
1704
1705
7
    *out_len = out_sz;
1706
7
    return ret == 0 ? out : NULL;
1707
1708
119
 err:
1709
119
    free_context(ctx);
1710
119
    return NULL;
1711
7
}
1712
1713
// Deprecated interface; to remove when we next to an ABI breakage
1714
0
uint8_t *decode_names(uint8_t *in, uint32_t sz, uint32_t *out_len) {
1715
0
    return tok3_decode_names(in, sz, out_len);
1716
0
}