/src/htslib/htscodecs/htscodecs/tokenise_name3.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright (c) 2016-2021 Genome Research Ltd. |
3 | | * Author(s): James Bonfield |
4 | | * |
5 | | * Redistribution and use in source and binary forms, with or without |
6 | | * modification, are permitted provided that the following conditions are met: |
7 | | * |
8 | | * 1. Redistributions of source code must retain the above copyright notice, |
9 | | * this list of conditions and the following disclaimer. |
10 | | * |
11 | | * 2. Redistributions in binary form must reproduce the above |
12 | | * copyright notice, this list of conditions and the following |
13 | | * disclaimer in the documentation and/or other materials provided |
14 | | * with the distribution. |
15 | | * |
16 | | * 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger |
17 | | * Institute nor the names of its contributors may be used to endorse |
18 | | * or promote products derived from this software without specific |
19 | | * prior written permission. |
20 | | * |
21 | | * THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS |
22 | | * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED |
23 | | * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A |
24 | | * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH |
25 | | * LTD OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
26 | | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
27 | | * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
28 | | * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
29 | | * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
30 | | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
31 | | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
32 | | */ |
33 | | |
34 | | // cc -O3 -g -DTEST_TOKENISER tokenise_name3.c arith_dynamic.c rANS_static4x16pr.c pooled_alloc.c -I.. -I. -lbz2 -pthread |
35 | | |
36 | | // Name tokeniser. |
37 | | // It generates a series of byte streams (per token) and compresses these |
38 | | // either using static rANS or dynamic arithmetic coding. Arith coding is |
39 | | // typically 1-5% smaller, but around 50-100% slower. We only envisage it |
40 | | // being used at the higher compression levels. |
41 | | |
42 | | // TODO |
43 | | // |
44 | | // - Is it better when encoding 1, 2, 3, 3, 4, 5, 5, 6, 7, 9, 9, 10 to encode |
45 | | // this as a mixture of MATCH and DELTA ops, or as entirely as DELTA ops |
46 | | // with some delta values being zero? I suspect the latter, but it is |
47 | | // not implemented here. See "last_token_delta" comments in code. |
48 | | // |
49 | | // - Consider variable size string implementations. |
50 | | // Pascal style strings (length + str), |
51 | | // C style strings (nul terminated), |
52 | | // Or split blocks: length block and string contents block. |
53 | | // |
54 | | // - Is this one token-block or many serialised token-blocks? |
55 | | // A) Lots of different models but feeding one bit-buffer emitted to |
56 | | // by the entropy encoder => one block (fqzcomp). |
57 | | // B) Lots of different models each feeding their own bit-buffers |
58 | | // => many blocks. |
59 | | // |
60 | | // - multiple integer types depending on size; 1, 2, 4 byte long. |
61 | | // |
62 | | // - Consider token choice for isalnum instead of isalpha. Sometimes better. |
63 | | // |
64 | | // - Consider token synchronisation (eg on matching chr symbols?) incase of |
65 | | // variable number. Eg consider foo:0999, foo:1000, foo:1001 (the leading |
66 | | // zero adds an extra token). |
67 | | // |
68 | | // - Optimisation of tokens. Eg: |
69 | | // HS25_09827:2:2102:11274:80442#49 |
70 | | // HS25_09827:2:2109:12941:31311#49 |
71 | | // |
72 | | // We'll have tokens for HS 25 _ 09827 : 2 : that are entirely <MATCH> |
73 | | // after the initial token. These 7 tokens could be one ALPHA instead |
74 | | // of 7 distinct tokens, with 1 MATCH instead of 7. This is both a speed |
75 | | // improvement for decoding as well as a space saving (fewer token-blocks |
76 | | // and associated overhead). |
77 | | // |
78 | | // - XOR. Like ALPHA, but used when previous symbol is ALPHA or XOR |
79 | | // and string lengths match. Useful when names are similar, eg: |
80 | | // the sequence in 07.names: |
81 | | // |
82 | | // @VP2-06:112:H7LNDMCVY:1:1105:26919:1172 1:N:0:ATTCAGAA+AGGAGAAG |
83 | | // @VP2-06:112:H7LNDMCVY:1:1105:27100:1172 1:N:0:ATTCAGAA+AGGCGAAG |
84 | | // @VP2-06:112:H7LNDMCVY:1:1105:27172:1172 1:N:0:ATTCAGAA+AGGCTAAG |
85 | | |
86 | | #include "config.h" |
87 | | |
88 | | #include <stdio.h> |
89 | | #include <stdlib.h> |
90 | | #include <string.h> |
91 | | #include <sys/types.h> |
92 | | #include <sys/stat.h> |
93 | | #include <unistd.h> |
94 | | #include <limits.h> |
95 | | #include <ctype.h> |
96 | | #include <assert.h> |
97 | | #include <inttypes.h> |
98 | | #include <math.h> |
99 | | #include <fcntl.h> |
100 | | #include <errno.h> |
101 | | #include <time.h> |
102 | | |
103 | | #include "pooled_alloc.h" |
104 | | #include "arith_dynamic.h" |
105 | | #include "rANS_static4x16.h" |
106 | | #include "tokenise_name3.h" |
107 | | #include "varint.h" |
108 | | #include "utils.h" |
109 | | |
110 | | // 128 is insufficient for SAM names (max 256 bytes) as |
111 | | // we may alternate a0a0a0a0a0 etc. However if we fail, |
112 | | // we just give up and switch to another codec, so this |
113 | | // isn't a serious limit. Maybe up to 256 to permit all |
114 | | // SAM names? |
115 | 21.5k | #define MAX_TOKENS 128 |
116 | 17.6k | #define MAX_TBLOCKS (MAX_TOKENS<<4) |
117 | | |
118 | | // Number of names per block |
119 | | #define MAX_NAMES 1000000 |
120 | | |
121 | | enum name_type {N_ERR = -1, N_TYPE = 0, N_ALPHA, N_CHAR, N_DIGITS0, N_DZLEN, N_DUP, N_DIFF, |
122 | | N_DIGITS, N_DDELTA, N_DDELTA0, N_MATCH, N_NOP, N_END, N_ALL}; |
123 | | |
124 | | typedef struct trie { |
125 | | struct trie *next, *sibling; |
126 | | int count; |
127 | | uint32_t c:8; |
128 | | uint32_t n:24; // Nth line |
129 | | } trie_t; |
130 | | |
131 | | typedef struct { |
132 | | enum name_type token_type; |
133 | | int token_int; |
134 | | int token_str; |
135 | | } last_context_tok; |
136 | | |
137 | | typedef struct { |
138 | | char *last_name; |
139 | | int last_ntok; |
140 | | last_context_tok *last; // [last_ntok] |
141 | | } last_context; |
142 | | |
143 | | typedef struct { |
144 | | uint8_t *buf; |
145 | | size_t buf_a, buf_l; // alloc and used length. |
146 | | int tnum, ttype; |
147 | | int dup_from; |
148 | | } descriptor; |
149 | | |
150 | | typedef struct { |
151 | | last_context *lc; |
152 | | |
153 | | // For finding entire line dups |
154 | | int counter; |
155 | | |
156 | | // Trie used in encoder only |
157 | | trie_t *t_head; |
158 | | pool_alloc_t *pool; |
159 | | |
160 | | // token blocks |
161 | | descriptor desc[MAX_TBLOCKS]; |
162 | | |
163 | | // summary stats per token |
164 | | int token_dcount[MAX_TOKENS]; |
165 | | int token_icount[MAX_TOKENS]; |
166 | | //int token_zcount[MAX_TOKENS]; |
167 | | |
168 | | int max_tok; // tracks which desc/[id]count elements have been initialised |
169 | | int max_names; |
170 | | } name_context; |
171 | | |
172 | 126 | static name_context *create_context(int max_names) { |
173 | 126 | if (max_names <= 0) |
174 | 0 | return NULL; |
175 | | |
176 | 126 | #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION |
177 | 126 | if (max_names > 100000) |
178 | 0 | return NULL; |
179 | 126 | #endif |
180 | | |
181 | | // An arbitrary limit to prevent malformed data from consuming excessive |
182 | | // amounts of memory. Consider upping this if we have genuine use cases |
183 | | // for larger blocks. |
184 | 126 | if (max_names > 1e7) { |
185 | 0 | fprintf(stderr, "Name codec currently has a max of 10 million rec.\n"); |
186 | 0 | return NULL; |
187 | 0 | } |
188 | | |
189 | 126 | name_context *ctx = htscodecs_tls_alloc(sizeof(*ctx) + |
190 | 126 | ++max_names*sizeof(*ctx->lc)); |
191 | 126 | if (!ctx) return NULL; |
192 | 126 | ctx->max_names = max_names; |
193 | | |
194 | 126 | ctx->counter = 0; |
195 | 126 | ctx->t_head = NULL; |
196 | | |
197 | 126 | ctx->lc = (last_context *)(((char *)ctx) + sizeof(*ctx)); |
198 | 126 | ctx->pool = NULL; |
199 | | |
200 | 126 | memset(&ctx->desc[0], 0, 2*16 * sizeof(ctx->desc[0])); |
201 | 126 | memset(&ctx->token_dcount[0], 0, sizeof(int)); |
202 | 126 | memset(&ctx->token_icount[0], 0, sizeof(int)); |
203 | 126 | memset(&ctx->lc[0], 0, max_names*sizeof(ctx->lc[0])); |
204 | 126 | ctx->max_tok = 1; |
205 | | |
206 | 126 | ctx->lc[0].last_ntok = 0; |
207 | | |
208 | 126 | return ctx; |
209 | 126 | } |
210 | | |
211 | 126 | static void free_context(name_context *ctx) { |
212 | 126 | if (!ctx) |
213 | 0 | return; |
214 | | |
215 | 126 | if (ctx->t_head) |
216 | 0 | free(ctx->t_head); |
217 | 126 | if (ctx->pool) |
218 | 0 | pool_destroy(ctx->pool); |
219 | | |
220 | 126 | int i; |
221 | 31.6k | for (i = 0; i < ctx->max_tok*16; i++) |
222 | 31.5k | free(ctx->desc[i].buf); |
223 | | |
224 | 2.84M | for (i = 0; i < ctx->max_names; i++) |
225 | 2.84M | free(ctx->lc[i].last); |
226 | | |
227 | 126 | htscodecs_tls_free(ctx); |
228 | 126 | } |
229 | | |
230 | | //----------------------------------------------------------------------------- |
231 | | // Fast unsigned integer printing code. |
232 | | // Returns number of bytes written. |
233 | 0 | static int append_uint32_fixed(char *cp, uint32_t i, uint8_t l) { |
234 | 0 | switch (l) { |
235 | 0 | case 9:*cp++ = i / 100000000 + '0', i %= 100000000; |
236 | 0 | case 8:*cp++ = i / 10000000 + '0', i %= 10000000; |
237 | 0 | case 7:*cp++ = i / 1000000 + '0', i %= 1000000; |
238 | 0 | case 6:*cp++ = i / 100000 + '0', i %= 100000; |
239 | 0 | case 5:*cp++ = i / 10000 + '0', i %= 10000; |
240 | 0 | case 4:*cp++ = i / 1000 + '0', i %= 1000; |
241 | 0 | case 3:*cp++ = i / 100 + '0', i %= 100; |
242 | 0 | case 2:*cp++ = i / 10 + '0', i %= 10; |
243 | 0 | case 1:*cp++ = i + '0'; |
244 | 0 | case 0:break; |
245 | 0 | } |
246 | 0 | return l; |
247 | 0 | } |
248 | | |
249 | 2.12k | static int append_uint32_var(char *cp, uint32_t i) { |
250 | 2.12k | char *op = cp; |
251 | 2.12k | uint32_t j; |
252 | | |
253 | | //if (i < 10) goto b0; |
254 | 2.12k | if (i < 100) goto b1; |
255 | | //if (i < 1000) goto b2; |
256 | 207 | if (i < 10000) goto b3; |
257 | | //if (i < 100000) goto b4; |
258 | 203 | if (i < 1000000) goto b5; |
259 | | //if (i < 10000000) goto b6; |
260 | 201 | if (i < 100000000) goto b7; |
261 | | |
262 | 191 | if ((j = i / 1000000000)) {*cp++ = j + '0'; i -= j*1000000000; goto x8;} |
263 | 98 | if ((j = i / 100000000)) {*cp++ = j + '0'; i -= j*100000000; goto x7;} |
264 | 10 | b7:if ((j = i / 10000000)) {*cp++ = j + '0'; i -= j*10000000; goto x6;} |
265 | 9 | if ((j = i / 1000000)) {*cp++ = j + '0', i -= j*1000000; goto x5;} |
266 | 2 | b5:if ((j = i / 100000)) {*cp++ = j + '0', i -= j*100000; goto x4;} |
267 | 1 | if ((j = i / 10000)) {*cp++ = j + '0', i -= j*10000; goto x3;} |
268 | 4 | b3:if ((j = i / 1000)) {*cp++ = j + '0', i -= j*1000; goto x2;} |
269 | 3 | if ((j = i / 100)) {*cp++ = j + '0', i -= j*100; goto x1;} |
270 | 1.91k | b1:if ((j = i / 10)) {*cp++ = j + '0', i -= j*10; goto x0;} |
271 | 1.91k | if (i) *cp++ = i + '0'; |
272 | 1.91k | return cp-op; |
273 | | |
274 | 93 | x8:*cp++ = i / 100000000 + '0', i %= 100000000; |
275 | 191 | x7:*cp++ = i / 10000000 + '0', i %= 10000000; |
276 | 192 | x6:*cp++ = i / 1000000 + '0', i %= 1000000; |
277 | 201 | x5:*cp++ = i / 100000 + '0', i %= 100000; |
278 | 202 | x4:*cp++ = i / 10000 + '0', i %= 10000; |
279 | 203 | x3:*cp++ = i / 1000 + '0', i %= 1000; |
280 | 204 | x2:*cp++ = i / 100 + '0', i %= 100; |
281 | 207 | x1:*cp++ = i / 10 + '0', i %= 10; |
282 | 209 | x0:*cp++ = i + '0'; |
283 | | |
284 | 209 | return cp-op; |
285 | 207 | } |
286 | | |
287 | | //----------------------------------------------------------------------------- |
288 | | // Example descriptor encoding and IO. |
289 | | // |
290 | | // Here we just append to a buffer so we can dump out the results. |
291 | | // These could then be passed through a static entropy encoder that |
292 | | // encodes the entire buffer. |
293 | | // |
294 | | // Alternatively an adaptive entropy encoder could be place inline |
295 | | // here to encode as it goes using additional knowledge from the |
296 | | // supplied context. |
297 | | |
298 | | // Ensure room for sz more bytes. |
299 | 0 | static int descriptor_grow(descriptor *fd, uint32_t sz) { |
300 | 0 | while (fd->buf_l + sz > fd->buf_a) { |
301 | 0 | size_t buf_a = fd->buf_a ? fd->buf_a*2 : 65536; |
302 | 0 | unsigned char *buf = realloc(fd->buf, buf_a); |
303 | 0 | if (!buf) |
304 | 0 | return -1; |
305 | 0 | fd->buf = buf; |
306 | 0 | fd->buf_a = buf_a; |
307 | 0 | } |
308 | | |
309 | 0 | return 0; |
310 | 0 | } |
311 | | |
312 | | static int encode_token_type(name_context *ctx, int ntok, |
313 | 0 | enum name_type type) { |
314 | 0 | int id = ntok<<4; |
315 | |
|
316 | 0 | if (descriptor_grow(&ctx->desc[id], 1) < 0) return -1; |
317 | | |
318 | 0 | ctx->desc[id].buf[ctx->desc[id].buf_l++] = type; |
319 | |
|
320 | 0 | return 0; |
321 | 0 | } |
322 | | |
323 | 0 | static int encode_token_match(name_context *ctx, int ntok) { |
324 | 0 | return encode_token_type(ctx, ntok, N_MATCH); |
325 | 0 | } |
326 | | |
327 | 0 | static int encode_token_end(name_context *ctx, int ntok) { |
328 | 0 | return encode_token_type(ctx, ntok, N_END); |
329 | 0 | } |
330 | | |
331 | 6.44k | static enum name_type decode_token_type(name_context *ctx, int ntok) { |
332 | 6.44k | int id = ntok<<4; |
333 | 6.44k | if (ctx->desc[id].buf_l >= ctx->desc[id].buf_a) return -1; |
334 | 4.32k | return ctx->desc[id].buf[ctx->desc[id].buf_l++]; |
335 | 6.44k | } |
336 | | |
337 | | // int stored as 32-bit quantities |
338 | | static int encode_token_int(name_context *ctx, int ntok, |
339 | 0 | enum name_type type, uint32_t val) { |
340 | 0 | int id = (ntok<<4) | type; |
341 | |
|
342 | 0 | if (encode_token_type(ctx, ntok, type) < 0) return -1; |
343 | 0 | if (descriptor_grow(&ctx->desc[id], 4) < 0) return -1; |
344 | | |
345 | 0 | uint8_t *cp = &ctx->desc[id].buf[ctx->desc[id].buf_l]; |
346 | 0 | cp[0] = (val >> 0) & 0xff; |
347 | 0 | cp[1] = (val >> 8) & 0xff; |
348 | 0 | cp[2] = (val >> 16) & 0xff; |
349 | 0 | cp[3] = (val >> 24) & 0xff; |
350 | 0 | ctx->desc[id].buf_l += 4; |
351 | |
|
352 | 0 | return 0; |
353 | 0 | } |
354 | | |
355 | | // Return 0 on success, -1 on failure; |
356 | | static int decode_token_int(name_context *ctx, int ntok, |
357 | 4.28k | enum name_type type, uint32_t *val) { |
358 | 4.28k | int id = (ntok<<4) | type; |
359 | | |
360 | 4.28k | if (ctx->desc[id].buf_l + 4 > ctx->desc[id].buf_a) |
361 | 4 | return -1; |
362 | | |
363 | 4.28k | uint8_t *cp = ctx->desc[id].buf + ctx->desc[id].buf_l; |
364 | 4.28k | *val = (cp[0]) + (cp[1]<<8) + (cp[2]<<16) + ((uint32_t)cp[3]<<24); |
365 | 4.28k | ctx->desc[id].buf_l += 4; |
366 | | |
367 | 4.28k | return 0; |
368 | 4.28k | } |
369 | | |
370 | | // 8 bit integer quantity |
371 | | static int encode_token_int1(name_context *ctx, int ntok, |
372 | 0 | enum name_type type, uint32_t val) { |
373 | 0 | int id = (ntok<<4) | type; |
374 | |
|
375 | 0 | if (encode_token_type(ctx, ntok, type) < 0) return -1; |
376 | 0 | if (descriptor_grow(&ctx->desc[id], 1) < 0) return -1; |
377 | | |
378 | 0 | ctx->desc[id].buf[ctx->desc[id].buf_l++] = val; |
379 | |
|
380 | 0 | return 0; |
381 | 0 | } |
382 | | |
383 | | static int encode_token_int1_(name_context *ctx, int ntok, |
384 | 0 | enum name_type type, uint32_t val) { |
385 | 0 | int id = (ntok<<4) | type; |
386 | |
|
387 | 0 | if (descriptor_grow(&ctx->desc[id], 1) < 0) return -1; |
388 | | |
389 | 0 | ctx->desc[id].buf[ctx->desc[id].buf_l++] = val; |
390 | |
|
391 | 0 | return 0; |
392 | 0 | } |
393 | | |
394 | | // Return 0 on success, -1 on failure; |
395 | | static int decode_token_int1(name_context *ctx, int ntok, |
396 | 0 | enum name_type type, uint32_t *val) { |
397 | 0 | int id = (ntok<<4) | type; |
398 | |
|
399 | 0 | if (ctx->desc[id].buf_l >= ctx->desc[id].buf_a) |
400 | 0 | return -1; |
401 | 0 | *val = ctx->desc[id].buf[ctx->desc[id].buf_l++]; |
402 | |
|
403 | 0 | return 0; |
404 | 0 | } |
405 | | |
406 | | |
407 | | // Basic C-string style for now. |
408 | | // |
409 | | // Maybe XOR with previous string as context? |
410 | | // This permits partial match to be encoded efficiently. |
411 | | static int encode_token_alpha(name_context *ctx, int ntok, |
412 | 0 | char *str, int len) { |
413 | 0 | int id = (ntok<<4) | N_ALPHA; |
414 | |
|
415 | 0 | if (encode_token_type(ctx, ntok, N_ALPHA) < 0) return -1; |
416 | 0 | if (descriptor_grow(&ctx->desc[id], len+1) < 0) return -1; |
417 | 0 | memcpy(&ctx->desc[id].buf[ctx->desc[id].buf_l], str, len); |
418 | 0 | ctx->desc[id].buf[ctx->desc[id].buf_l+len] = 0; |
419 | 0 | ctx->desc[id].buf_l += len+1; |
420 | |
|
421 | 0 | return 0; |
422 | 0 | } |
423 | | |
424 | | // FIXME: need limit on string length for security. |
425 | | // Return length on success, -1 on failure; |
426 | 0 | static int decode_token_alpha(name_context *ctx, int ntok, char *str, int max_len) { |
427 | 0 | int id = (ntok<<4) | N_ALPHA; |
428 | 0 | char c; |
429 | 0 | int len = 0; |
430 | 0 | if (ctx->desc[id].buf_l >= ctx->desc[id].buf_a) |
431 | 0 | return -1; |
432 | 0 | do { |
433 | 0 | c = ctx->desc[id].buf[ctx->desc[id].buf_l++]; |
434 | 0 | str[len++] = c; |
435 | 0 | } while(c && len < max_len && ctx->desc[id].buf_l < ctx->desc[id].buf_a); |
436 | |
|
437 | 0 | return len-1; |
438 | 0 | } |
439 | | |
440 | 0 | static int encode_token_char(name_context *ctx, int ntok, char c) { |
441 | 0 | int id = (ntok<<4) | N_CHAR; |
442 | |
|
443 | 0 | if (encode_token_type(ctx, ntok, N_CHAR) < 0) return -1; |
444 | 0 | if (descriptor_grow(&ctx->desc[id], 1) < 0) return -1; |
445 | 0 | ctx->desc[id].buf[ctx->desc[id].buf_l++] = c; |
446 | |
|
447 | 0 | return 0; |
448 | 0 | } |
449 | | |
450 | | // FIXME: need limit on string length for security |
451 | | // Return length on success, -1 on failure; |
452 | 0 | static int decode_token_char(name_context *ctx, int ntok, char *str) { |
453 | 0 | int id = (ntok<<4) | N_CHAR; |
454 | |
|
455 | 0 | if (ctx->desc[id].buf_l >= ctx->desc[id].buf_a) |
456 | 0 | return -1; |
457 | 0 | *str = ctx->desc[id].buf[ctx->desc[id].buf_l++]; |
458 | |
|
459 | 0 | return 1; |
460 | 0 | } |
461 | | |
462 | | |
463 | | // A duplicated name |
464 | 0 | static int encode_token_dup(name_context *ctx, uint32_t val) { |
465 | 0 | return encode_token_int(ctx, 0, N_DUP, val); |
466 | 0 | } |
467 | | |
468 | | // Which read to delta against |
469 | 0 | static int encode_token_diff(name_context *ctx, uint32_t val) { |
470 | 0 | return encode_token_int(ctx, 0, N_DIFF, val); |
471 | 0 | } |
472 | | |
473 | | |
474 | | //----------------------------------------------------------------------------- |
475 | | // Trie implementation for tracking common name prefixes. |
476 | | static |
477 | 0 | int build_trie(name_context *ctx, char *data, size_t len, int n) { |
478 | 0 | int nlines = 0; |
479 | 0 | size_t i; |
480 | 0 | trie_t *t; |
481 | |
|
482 | 0 | if (!ctx->t_head) { |
483 | 0 | ctx->t_head = calloc(1, sizeof(*ctx->t_head)); |
484 | 0 | if (!ctx->t_head) |
485 | 0 | return -1; |
486 | 0 | } |
487 | | |
488 | | // Build our trie, also counting input lines |
489 | 0 | for (nlines = i = 0; i < len; i++, nlines++) { |
490 | 0 | t = ctx->t_head; |
491 | 0 | t->count++; |
492 | 0 | while (i < len && data[i] > '\n') { |
493 | 0 | unsigned char c = data[i++]; |
494 | 0 | if (c & 0x80) |
495 | | //fprintf(stderr, "8-bit ASCII is unsupported\n"); |
496 | 0 | abort(); |
497 | 0 | c &= 127; |
498 | | |
499 | |
|
500 | 0 | trie_t *x = t->next, *l = NULL; |
501 | 0 | while (x && x->c != c) { |
502 | 0 | l = x; x = x->sibling; |
503 | 0 | } |
504 | 0 | if (!x) { |
505 | 0 | if (!ctx->pool) |
506 | 0 | ctx->pool = pool_create(sizeof(trie_t)); |
507 | 0 | if (!(x = (trie_t *)pool_alloc(ctx->pool))) |
508 | 0 | return -1; |
509 | 0 | memset(x, 0, sizeof(*x)); |
510 | 0 | if (!l) |
511 | 0 | x = t->next = x; |
512 | 0 | else |
513 | 0 | x = l->sibling = x; |
514 | 0 | x->n = n; |
515 | 0 | x->c = c; |
516 | 0 | } |
517 | 0 | t = x; |
518 | 0 | t->c = c; |
519 | 0 | t->count++; |
520 | 0 | } |
521 | 0 | } |
522 | | |
523 | 0 | return 0; |
524 | 0 | } |
525 | | |
526 | | #if 0 |
527 | | void dump_trie(trie_t *t, int depth) { |
528 | | if (depth == 0) { |
529 | | printf("graph x_%p {\n splines = ortho\n ranksep=2\n", t); |
530 | | printf(" p_%p [label=\"\"];\n", t); |
531 | | dump_trie(t, 1); |
532 | | printf("}\n"); |
533 | | } else { |
534 | | int j, k, count;//, cj; |
535 | | char label[100], *cp; |
536 | | trie_t *tp = t; |
537 | | |
538 | | // patricia: |
539 | | // for (count = j = 0; j < 128; j++) |
540 | | // if (t->next[j]) |
541 | | // count++, cj=j; |
542 | | // |
543 | | // if (count == 1) { |
544 | | // t = t->next[cj]; |
545 | | // *cp++ = cj; |
546 | | // goto patricia; |
547 | | // } |
548 | | |
549 | | trie_t *x; |
550 | | for (x = t->next; x; x = x->sibling) { |
551 | | printf(" p_%p [label=\"%c\"];\n", x, x->c); |
552 | | printf(" p_%p -- p_%p [label=\"%d\", penwidth=\"%f\"];\n", tp, x, x->count, MAX((log(x->count)-3)*2,1)); |
553 | | //if (depth <= 11) |
554 | | dump_trie(x, depth+1); |
555 | | } |
556 | | |
557 | | #if 0 |
558 | | for (j = 0; j < 128; j++) { |
559 | | trie_t *tn; |
560 | | |
561 | | if (!t->next[j]) |
562 | | continue; |
563 | | |
564 | | cp = label; |
565 | | tn = t->next[j]; |
566 | | *cp++ = j; |
567 | | // patricia: |
568 | | |
569 | | for (count = k = 0; k < 128; k++) |
570 | | if (tn->next[k]) |
571 | | count++;//, cj=k; |
572 | | |
573 | | // if (count == 1) { |
574 | | // tn = tn->next[cj]; |
575 | | // *cp++ = cj; |
576 | | // goto patricia; |
577 | | // } |
578 | | *cp++ = 0; |
579 | | |
580 | | printf(" p_%p [label=\"%s\"];\n", tn, label); |
581 | | printf(" p_%p -- p_%p [label=\"%d\", penwidth=\"%f\"];\n", tp, tn, tn->count, MAX((log(tn->count)-3)*2,1)); |
582 | | if (depth <= 11) |
583 | | dump_trie(tn, depth+1); |
584 | | } |
585 | | #endif |
586 | | } |
587 | | } |
588 | | #endif |
589 | | |
590 | | static |
591 | 0 | int search_trie(name_context *ctx, char *data, size_t len, int n, int *exact, int *is_fixed, int *fixed_len) { |
592 | 0 | int nlines = 0; |
593 | 0 | size_t i; |
594 | 0 | trie_t *t; |
595 | 0 | int from = -1, p3 = -1; |
596 | 0 | *exact = 0; |
597 | 0 | *fixed_len = 0; |
598 | 0 | *is_fixed = 0; |
599 | | |
600 | | // Horrid hack for the encoder only. |
601 | | // We optimise per known name format here. |
602 | 0 | int prefix_len; |
603 | 0 | char *d = *data == '@' ? data+1 : data; |
604 | 0 | int l = *data == '@' ? len-1 : len; |
605 | 0 | int f = (*data == '>') ? 1 : 0; |
606 | 0 | if (l > 70 && d[f+0] == 'm' && d[7] == '_' && d[f+14] == '_' && d[f+61] == '/') { |
607 | 0 | prefix_len = 60; // PacBio |
608 | 0 | *is_fixed = 0; |
609 | 0 | } else if (l == 17 && d[f+5] == ':' && d[f+11] == ':') { |
610 | 0 | prefix_len = 6; // IonTorrent |
611 | 0 | *fixed_len = 6; |
612 | 0 | *is_fixed = 1; |
613 | 0 | } else if (l > 37 && d[f+8] == '-' && d[f+13] == '-' && d[f+18] == '-' && d[f+23] == '-' && |
614 | 0 | ((d[f+0] >= '0' && d[f+0] <='9') || (d[f+0] >= 'a' && d[f+0] <= 'f')) && |
615 | 0 | ((d[f+35] >= '0' && d[f+35] <='9') || (d[f+35] >= 'a' && d[f+35] <= 'f'))) { |
616 | | // ONT: f33d30d5-6eb8-4115-8f46-154c2620a5da_Basecall_1D_template... |
617 | 0 | prefix_len = 37; |
618 | 0 | *fixed_len = 37; |
619 | 0 | *is_fixed = 1; |
620 | 0 | } else { |
621 | | // Check Illumina and trim back to lane:tile:x:y. |
622 | 0 | int colons = 0; |
623 | 0 | for (i = 0; i < len && data[i] > ' '; i++) |
624 | 0 | ; |
625 | 0 | while (i > 0 && colons < 4) |
626 | 0 | if (data[--i] == ':') |
627 | 0 | colons++; |
628 | |
|
629 | 0 | if (colons == 4) { |
630 | | // Constant illumina prefix |
631 | 0 | *fixed_len = i+1; |
632 | 0 | prefix_len = i+1; |
633 | 0 | *is_fixed = 1; |
634 | 0 | } else { |
635 | | // Unknown, don't use a fixed len, but still search |
636 | | // for any exact matches. |
637 | 0 | prefix_len = INT_MAX; |
638 | 0 | *is_fixed = 0; |
639 | 0 | } |
640 | 0 | } |
641 | | //prefix_len = INT_MAX; |
642 | |
|
643 | 0 | if (!ctx->t_head) { |
644 | 0 | ctx->t_head = calloc(1, sizeof(*ctx->t_head)); |
645 | 0 | if (!ctx->t_head) |
646 | 0 | return -1; |
647 | 0 | } |
648 | | |
649 | | // Find an item in the trie |
650 | 0 | for (nlines = i = 0; i < len; i++, nlines++) { |
651 | 0 | t = ctx->t_head; |
652 | 0 | while (i < len && data[i] > '\n') { |
653 | 0 | unsigned char c = data[i++]; |
654 | 0 | if (c & 0x80) |
655 | | //fprintf(stderr, "8-bit ASCII is unsupported\n"); |
656 | 0 | abort(); |
657 | 0 | c &= 127; |
658 | |
|
659 | 0 | trie_t *x = t->next; |
660 | 0 | while (x && x->c != c) |
661 | 0 | x = x->sibling; |
662 | 0 | t = x; |
663 | | |
664 | | // t = t->next[c]; |
665 | | |
666 | | // if (!t) |
667 | | // return -1; |
668 | |
|
669 | 0 | from = t->n; |
670 | 0 | if (i == prefix_len) p3 = t->n; |
671 | | //if (t->count >= .0035*ctx->t_head->count && t->n != n) p3 = t->n; // pacbio |
672 | | //if (i == 60) p3 = t->n; // pacbio |
673 | | //if (i == 7) p3 = t->n; // iontorrent |
674 | 0 | t->n = n; |
675 | 0 | } |
676 | 0 | } |
677 | | |
678 | | //printf("Looked for %d, found %d, prefix %d\n", n, from, p3); |
679 | | |
680 | 0 | *exact = (n != from) && len; |
681 | 0 | return *exact ? from : p3; |
682 | 0 | } |
683 | | |
684 | | |
685 | | //----------------------------------------------------------------------------- |
686 | | // Name encoder |
687 | | |
688 | | /* |
689 | | * Tokenises a read name using ctx as context as the previous |
690 | | * tokenisation. |
691 | | * |
692 | | * Parsed elements are then emitted for encoding by calling the |
693 | | * encode_token() function with the context, token number (Nth token |
694 | | * in line), token type and token value. |
695 | | * |
696 | | * Returns 0 on success; |
697 | | * -1 on failure. |
698 | | */ |
699 | 0 | static int encode_name(name_context *ctx, char *name, int len, int mode) { |
700 | 0 | int i, is_fixed, fixed_len; |
701 | |
|
702 | 0 | int exact; |
703 | 0 | int cnum = ctx->counter++; |
704 | 0 | int pnum = search_trie(ctx, name, len, cnum, &exact, &is_fixed, &fixed_len); |
705 | 0 | if (pnum < 0) pnum = cnum ? cnum-1 : 0; |
706 | | //pnum = pnum & (MAX_NAMES-1); |
707 | | //cnum = cnum & (MAX_NAMES-1); |
708 | | //if (pnum == cnum) {pnum = cnum ? cnum-1 : 0;} |
709 | | #ifdef ENC_DEBUG |
710 | | fprintf(stderr, "%d: pnum=%d (%d), exact=%d\n%s\n%s\n", |
711 | | ctx->counter, pnum, cnum-pnum, exact, ctx->lc[pnum].last_name, name); |
712 | | #endif |
713 | | |
714 | | // Return DUP or DIFF switch, plus the distance. |
715 | 0 | if (exact && len == strlen(ctx->lc[pnum].last_name)) { |
716 | 0 | encode_token_dup(ctx, cnum-pnum); |
717 | 0 | ctx->lc[cnum].last_name = name; |
718 | 0 | ctx->lc[cnum].last_ntok = ctx->lc[pnum].last_ntok; |
719 | 0 | int nc = ctx->lc[cnum].last_ntok ? ctx->lc[cnum].last_ntok : MAX_TOKENS; |
720 | 0 | ctx->lc[cnum].last = malloc(nc * sizeof(*ctx->lc[cnum].last)); |
721 | 0 | if (!ctx->lc[cnum].last) |
722 | 0 | return -1; |
723 | 0 | memcpy(ctx->lc[cnum].last, ctx->lc[pnum].last, |
724 | 0 | ctx->lc[cnum].last_ntok * sizeof(*ctx->lc[cnum].last)); |
725 | 0 | return 0; |
726 | 0 | } |
727 | | |
728 | 0 | ctx->lc[cnum].last = malloc(MAX_TOKENS * sizeof(*ctx->lc[cnum].last)); |
729 | 0 | if (!ctx->lc[cnum].last) |
730 | 0 | return -1; |
731 | 0 | encode_token_diff(ctx, cnum-pnum); |
732 | |
|
733 | 0 | int ntok = 1; |
734 | 0 | i = 0; |
735 | 0 | if (is_fixed) { |
736 | 0 | if (ntok >= ctx->max_tok) { |
737 | 0 | memset(&ctx->desc[ctx->max_tok << 4], 0, 16*sizeof(ctx->desc[0])); |
738 | 0 | memset(&ctx->token_dcount[ctx->max_tok], 0, sizeof(int)); |
739 | 0 | memset(&ctx->token_icount[ctx->max_tok], 0, sizeof(int)); |
740 | 0 | ctx->max_tok = ntok+1; |
741 | 0 | } |
742 | 0 | if (pnum < cnum && ntok < ctx->lc[pnum].last_ntok && ctx->lc[pnum].last[ntok].token_type == N_ALPHA) { |
743 | 0 | if (ctx->lc[pnum].last[ntok].token_int == fixed_len && memcmp(name, ctx->lc[pnum].last_name, fixed_len) == 0) { |
744 | 0 | encode_token_match(ctx, ntok); |
745 | 0 | } else { |
746 | 0 | encode_token_alpha(ctx, ntok, name, fixed_len); |
747 | 0 | } |
748 | 0 | } else { |
749 | 0 | encode_token_alpha(ctx, ntok, name, fixed_len); |
750 | 0 | } |
751 | 0 | ctx->lc[cnum].last[ntok].token_int = fixed_len; |
752 | 0 | ctx->lc[cnum].last[ntok].token_str = 0; |
753 | 0 | ctx->lc[cnum].last[ntok++].token_type = N_ALPHA; |
754 | 0 | i = fixed_len; |
755 | 0 | } |
756 | |
|
757 | 0 | for (; i < len; i++) { |
758 | 0 | if (ntok >= ctx->max_tok) { |
759 | 0 | memset(&ctx->desc[ctx->max_tok << 4], 0, 16*sizeof(ctx->desc[0])); |
760 | 0 | memset(&ctx->token_dcount[ctx->max_tok], 0, sizeof(int)); |
761 | 0 | memset(&ctx->token_icount[ctx->max_tok], 0, sizeof(int)); |
762 | 0 | ctx->max_tok = ntok+1; |
763 | 0 | } |
764 | | |
765 | | /* Determine data type of this segment */ |
766 | 0 | if (isalpha(name[i])) { |
767 | 0 | int s = i+1; |
768 | | // int S = i+1; |
769 | | |
770 | | // // FIXME: try which of these is best. alnum is good sometimes. |
771 | | // while (s < len && isalpha(name[s])) |
772 | 0 | while (s < len && (isalpha(name[s]) || ispunct(name[s]))) |
773 | | // while (s < len && name[s] != ':') |
774 | | // while (s < len && !isdigit(name[s]) && name[s] != ':') |
775 | 0 | s++; |
776 | | |
777 | | // if (!is_fixed) { |
778 | | // while (S < len && isalnum(name[S])) |
779 | | // S++; |
780 | | // if (s < S) |
781 | | // s = S; |
782 | | // } |
783 | | |
784 | | // Single byte strings are better encoded as chars. |
785 | 0 | if (s-i == 1) goto n_char; |
786 | | |
787 | 0 | if (pnum < cnum && ntok < ctx->lc[pnum].last_ntok && ctx->lc[pnum].last[ntok].token_type == N_ALPHA) { |
788 | 0 | if (s-i == ctx->lc[pnum].last[ntok].token_int && |
789 | 0 | memcmp(&name[i], |
790 | 0 | &ctx->lc[pnum].last_name[ctx->lc[pnum].last[ntok].token_str], |
791 | 0 | s-i) == 0) { |
792 | | #ifdef ENC_DEBUG |
793 | | fprintf(stderr, "Tok %d (alpha-mat, %.*s)\n", N_MATCH, s-i, &name[i]); |
794 | | #endif |
795 | 0 | if (encode_token_match(ctx, ntok) < 0) return -1; |
796 | 0 | } else { |
797 | | #ifdef ENC_DEBUG |
798 | | fprintf(stderr, "Tok %d (alpha, %.*s / %.*s)\n", N_ALPHA, |
799 | | s-i, &ctx->lc[pnum].last_name[ctx->lc[pnum].last[ntok].token_str], s-i, &name[i]); |
800 | | #endif |
801 | | // same token/length, but mismatches |
802 | 0 | if (encode_token_alpha(ctx, ntok, &name[i], s-i) < 0) return -1; |
803 | 0 | } |
804 | 0 | } else { |
805 | | #ifdef ENC_DEBUG |
806 | | fprintf(stderr, "Tok %d (new alpha, %.*s)\n", N_ALPHA, s-i, &name[i]); |
807 | | #endif |
808 | 0 | if (encode_token_alpha(ctx, ntok, &name[i], s-i) < 0) return -1; |
809 | 0 | } |
810 | | |
811 | 0 | ctx->lc[cnum].last[ntok].token_int = s-i; |
812 | 0 | ctx->lc[cnum].last[ntok].token_str = i; |
813 | 0 | ctx->lc[cnum].last[ntok].token_type = N_ALPHA; |
814 | |
|
815 | 0 | i = s-1; |
816 | 0 | } else if (name[i] == '0') digits0: { |
817 | | // Digits starting with zero; encode length + value |
818 | 0 | uint32_t s = i; |
819 | 0 | uint32_t v = 0; |
820 | 0 | int d = 0; |
821 | |
|
822 | 0 | while (s < len && isdigit(name[s]) && s-i < 9) { |
823 | 0 | v = v*10 + name[s] - '0'; |
824 | | //putchar(name[s]); |
825 | 0 | s++; |
826 | 0 | } |
827 | | |
828 | | // TODO: optimise choice over whether to switch from DIGITS to DELTA |
829 | | // regularly vs all DIGITS, also MATCH vs DELTA 0. |
830 | 0 | if (pnum < cnum && ntok < ctx->lc[pnum].last_ntok && ctx->lc[pnum].last[ntok].token_type == N_DIGITS0) { |
831 | 0 | d = v - ctx->lc[pnum].last[ntok].token_int; |
832 | 0 | if (d == 0 && ctx->lc[pnum].last[ntok].token_str == s-i) { |
833 | | #ifdef ENC_DEBUG |
834 | | fprintf(stderr, "Tok %d (dig-mat, %d)\n", N_MATCH, v); |
835 | | #endif |
836 | 0 | if (encode_token_match(ctx, ntok) < 0) return -1; |
837 | | //ctx->lc[pnum].last[ntok].token_delta=0; |
838 | 0 | } else if (mode == 1 && d < 256 && d >= 0 && ctx->lc[pnum].last[ntok].token_str == s-i) { |
839 | | #ifdef ENC_DEBUG |
840 | | fprintf(stderr, "Tok %d (dig-delta, %d / %d)\n", N_DDELTA, ctx->lc[pnum].last[ntok].token_int, v); |
841 | | #endif |
842 | | //if (encode_token_int1_(ctx, ntok, N_DZLEN, s-i) < 0) return -1; |
843 | 0 | if (encode_token_int1(ctx, ntok, N_DDELTA0, d) < 0) return -1; |
844 | | //ctx->lc[pnum].last[ntok].token_delta=1; |
845 | 0 | } else { |
846 | | #ifdef ENC_DEBUG |
847 | | fprintf(stderr, "Tok %d (dig, %d / %d)\n", N_DIGITS, ctx->lc[pnum].last[ntok].token_int, v); |
848 | | #endif |
849 | 0 | if (encode_token_int1_(ctx, ntok, N_DZLEN, s-i) < 0) return -1; |
850 | 0 | if (encode_token_int(ctx, ntok, N_DIGITS0, v) < 0) return -1; |
851 | | //ctx->lc[pnum].last[ntok].token_delta=0; |
852 | 0 | } |
853 | 0 | } else { |
854 | | #ifdef ENC_DEBUG |
855 | | fprintf(stderr, "Tok %d (new dig, %d)\n", N_DIGITS, v); |
856 | | #endif |
857 | 0 | if (encode_token_int1_(ctx, ntok, N_DZLEN, s-i) < 0) return -1; |
858 | 0 | if (encode_token_int(ctx, ntok, N_DIGITS0, v) < 0) return -1; |
859 | | //ctx->lc[pnum].last[ntok].token_delta=0; |
860 | 0 | } |
861 | | |
862 | 0 | ctx->lc[cnum].last[ntok].token_str = s-i; // length |
863 | 0 | ctx->lc[cnum].last[ntok].token_int = v; |
864 | 0 | ctx->lc[cnum].last[ntok].token_type = N_DIGITS0; |
865 | |
|
866 | 0 | i = s-1; |
867 | 0 | } else if (isdigit(name[i])) { |
868 | | // digits starting 1-9; encode value |
869 | 0 | uint32_t s = i; |
870 | 0 | uint32_t v = 0; |
871 | 0 | int d = 0; |
872 | |
|
873 | 0 | while (s < len && isdigit(name[s]) && s-i < 9) { |
874 | 0 | v = v*10 + name[s] - '0'; |
875 | | //putchar(name[s]); |
876 | 0 | s++; |
877 | 0 | } |
878 | | |
879 | | // dataset/10/K562_cytosol_LID8465_TopHat_v2.names |
880 | | // col 4 is Illumina lane - we don't want match & delta in there |
881 | | // as it has multiple lanes (so not ALL match) and delta is just |
882 | | // random chance, increasing entropy instead. |
883 | | // if (ntok == 4 || ntok == 8 || ntok == 10) { |
884 | | // encode_token_int(ctx, ntok, N_DIGITS, v); |
885 | | // } else { |
886 | | |
887 | | // If the last token was DIGITS0 and we are the same length, then encode |
888 | | // using that method instead as it seems likely the entire column is fixed |
889 | | // width, sometimes with leading zeros. |
890 | 0 | if (pnum < cnum && ntok < ctx->lc[pnum].last_ntok && |
891 | 0 | ctx->lc[pnum].last[ntok].token_type == N_DIGITS0 && |
892 | 0 | ctx->lc[pnum].last[ntok].token_str == s-i) |
893 | 0 | goto digits0; |
894 | | |
895 | | // TODO: optimise choice over whether to switch from DIGITS to DELTA |
896 | | // regularly vs all DIGITS, also MATCH vs DELTA 0. |
897 | 0 | if (pnum < cnum && ntok < ctx->lc[pnum].last_ntok && ctx->lc[pnum].last[ntok].token_type == N_DIGITS) { |
898 | 0 | d = v - ctx->lc[pnum].last[ntok].token_int; |
899 | 0 | if (d == 0) { |
900 | | #ifdef ENC_DEBUG |
901 | | fprintf(stderr, "Tok %d (dig-mat, %d)\n", N_MATCH, v); |
902 | | #endif |
903 | 0 | if (encode_token_match(ctx, ntok) < 0) return -1; |
904 | | //ctx->lc[pnum].last[ntok].token_delta=0; |
905 | | //ctx->token_zcount[ntok]++; |
906 | 0 | } else if (mode == 1 && d < 256 && d >= 0 |
907 | | //&& (10+ctx->token_dcount[ntok]) > (ctx->token_icount[ntok]+ctx->token_zcount[ntok]) |
908 | 0 | && (5+ctx->token_dcount[ntok]) > ctx->token_icount[ntok] |
909 | 0 | ) { |
910 | | #ifdef ENC_DEBUG |
911 | | fprintf(stderr, "Tok %d (dig-delta, %d / %d)\n", N_DDELTA, ctx->lc[pnum].last[ntok].token_int, v); |
912 | | #endif |
913 | 0 | if (encode_token_int1(ctx, ntok, N_DDELTA, d) < 0) return -1; |
914 | | //ctx->lc[pnum].last[ntok].token_delta=1; |
915 | 0 | ctx->token_dcount[ntok]++; |
916 | 0 | } else { |
917 | | #ifdef ENC_DEBUG |
918 | | fprintf(stderr, "Tok %d (dig, %d / %d)\n", N_DIGITS, ctx->lc[pnum].last[ntok].token_int, v); |
919 | | #endif |
920 | 0 | if (encode_token_int(ctx, ntok, N_DIGITS, v) < 0) return -1; |
921 | | //ctx->lc[pnum].last[ntok].token_delta=0; |
922 | 0 | ctx->token_icount[ntok]++; |
923 | 0 | } |
924 | 0 | } else { |
925 | | #ifdef ENC_DEBUG |
926 | | fprintf(stderr, "Tok %d (new dig, %d)\n", N_DIGITS, v); |
927 | | #endif |
928 | 0 | if (encode_token_int(ctx, ntok, N_DIGITS, v) < 0) return -1; |
929 | | //ctx->lc[pnum].last[ntok].token_delta=0; |
930 | 0 | } |
931 | | // } |
932 | | |
933 | 0 | ctx->lc[cnum].last[ntok].token_int = v; |
934 | 0 | ctx->lc[cnum].last[ntok].token_type = N_DIGITS; |
935 | |
|
936 | 0 | i = s-1; |
937 | 0 | } else { |
938 | 0 | n_char: |
939 | | //if (!isalpha(name[i])) putchar(name[i]); |
940 | 0 | if (pnum < cnum && ntok < ctx->lc[pnum].last_ntok && ctx->lc[pnum].last[ntok].token_type == N_CHAR) { |
941 | 0 | if (name[i] == ctx->lc[pnum].last[ntok].token_int) { |
942 | | #ifdef ENC_DEBUG |
943 | | fprintf(stderr, "Tok %d (chr-mat, %c)\n", N_MATCH, name[i]); |
944 | | #endif |
945 | 0 | if (encode_token_match(ctx, ntok) < 0) return -1; |
946 | 0 | } else { |
947 | | #ifdef ENC_DEBUG |
948 | | fprintf(stderr, "Tok %d (chr, %c / %c)\n", N_CHAR, ctx->lc[pnum].last[ntok].token_int, name[i]); |
949 | | #endif |
950 | 0 | if (encode_token_char(ctx, ntok, name[i]) < 0) return -1; |
951 | 0 | } |
952 | 0 | } else { |
953 | | #ifdef ENC_DEBUG |
954 | | fprintf(stderr, "Tok %d (new chr, %c)\n", N_CHAR, name[i]); |
955 | | #endif |
956 | 0 | if (encode_token_char(ctx, ntok, name[i]) < 0) return -1; |
957 | 0 | } |
958 | | |
959 | 0 | ctx->lc[cnum].last[ntok].token_int = name[i]; |
960 | 0 | ctx->lc[cnum].last[ntok].token_type = N_CHAR; |
961 | 0 | } |
962 | | |
963 | 0 | ntok++; |
964 | | //putchar(' '); |
965 | 0 | } |
966 | | |
967 | | #ifdef ENC_DEBUG |
968 | | fprintf(stderr, "Tok %d (end)\n", N_END); |
969 | | #endif |
970 | 0 | if (ntok >= ctx->max_tok) { |
971 | 0 | memset(&ctx->desc[ctx->max_tok << 4], 0, 16*sizeof(ctx->desc[0])); |
972 | 0 | memset(&ctx->token_dcount[ctx->max_tok], 0, sizeof(int)); |
973 | 0 | memset(&ctx->token_icount[ctx->max_tok], 0, sizeof(int)); |
974 | 0 | ctx->max_tok = ntok+1; |
975 | 0 | } |
976 | 0 | if (encode_token_end(ctx, ntok) < 0) return -1; |
977 | | #ifdef ENC_DEBUG |
978 | | fprintf(stderr, "ntok=%d max_tok=%d\n", ntok, ctx->max_tok); |
979 | | #endif |
980 | | |
981 | | //printf("Encoded %.*s with %d tokens\n", len, name, ntok); |
982 | | |
983 | 0 | ctx->lc[cnum].last_name = name; |
984 | 0 | ctx->lc[cnum].last_ntok = ntok; |
985 | 0 | last_context_tok *shrunk = realloc(ctx->lc[cnum].last, |
986 | 0 | (ntok+1) * sizeof(*ctx->lc[cnum].last)); |
987 | 0 | if (shrunk) |
988 | 0 | ctx->lc[cnum].last = shrunk; |
989 | |
|
990 | 0 | if (!ctx->lc[cnum].last) |
991 | 0 | return -1; |
992 | | |
993 | 0 | return 0; |
994 | 0 | } |
995 | | |
996 | | //----------------------------------------------------------------------------- |
997 | | // Name decoder |
998 | | |
999 | 2.16k | static int decode_name(name_context *ctx, char *name, int name_len) { |
1000 | 2.16k | int t0 = decode_token_type(ctx, 0); |
1001 | 2.16k | uint32_t dist; |
1002 | 2.16k | int pnum, cnum = ctx->counter++; |
1003 | | |
1004 | 2.16k | if (cnum >= ctx->max_names) |
1005 | 0 | return -1; |
1006 | | |
1007 | 2.16k | if (t0 < 0 || t0 >= ctx->max_tok*16) |
1008 | 2 | return 0; |
1009 | | |
1010 | 2.16k | if (decode_token_int(ctx, 0, t0, &dist) < 0 || dist > cnum) |
1011 | 4 | return -1; |
1012 | 2.15k | if ((pnum = cnum - dist) < 0) pnum = 0; |
1013 | | |
1014 | | //fprintf(stderr, "t0=%d, dist=%d, pnum=%d, cnum=%d\n", t0, dist, pnum, cnum); |
1015 | | |
1016 | 2.15k | if (t0 == N_DUP) { |
1017 | 0 | if (pnum == cnum) |
1018 | 0 | return -1; |
1019 | | |
1020 | 0 | if (strlen(ctx->lc[pnum].last_name) +1 >= name_len) return -1; |
1021 | 0 | strcpy(name, ctx->lc[pnum].last_name); |
1022 | | // FIXME: optimise this |
1023 | 0 | ctx->lc[cnum].last_name = name; |
1024 | 0 | ctx->lc[cnum].last_ntok = ctx->lc[pnum].last_ntok; |
1025 | |
|
1026 | 0 | int nc = ctx->lc[cnum].last_ntok ? ctx->lc[cnum].last_ntok : MAX_TOKENS; |
1027 | 0 | ctx->lc[cnum].last = malloc(nc * sizeof(*ctx->lc[cnum].last)); |
1028 | 0 | if (!ctx->lc[cnum].last) |
1029 | 0 | return -1; |
1030 | 0 | memcpy(ctx->lc[cnum].last, ctx->lc[pnum].last, |
1031 | 0 | ctx->lc[cnum].last_ntok * sizeof(*ctx->lc[cnum].last)); |
1032 | |
|
1033 | 0 | return strlen(name)+1; |
1034 | 0 | } |
1035 | | |
1036 | 2.15k | *name = 0; |
1037 | 2.15k | int ntok, len = 0, len2; |
1038 | 2.15k | ctx->lc[cnum].last = malloc(MAX_TOKENS * sizeof(*ctx->lc[cnum].last)); |
1039 | 2.15k | if (!ctx->lc[cnum].last) |
1040 | 0 | return -1; |
1041 | | |
1042 | 4.28k | for (ntok = 1; ntok < MAX_TOKENS && ntok < ctx->max_tok; ntok++) { |
1043 | 4.28k | uint32_t v, vl; |
1044 | 4.28k | enum name_type tok; |
1045 | 4.28k | tok = decode_token_type(ctx, ntok); |
1046 | | //fprintf(stderr, "Tok %d = %d\n", ntok, tok); |
1047 | | |
1048 | 4.28k | ctx->lc[cnum].last_ntok = 0; |
1049 | | |
1050 | 4.28k | switch (tok) { |
1051 | 0 | case N_CHAR: |
1052 | 0 | if (len+1 >= name_len) return -1; |
1053 | 0 | if (decode_token_char(ctx, ntok, &name[len]) < 0) return -1; |
1054 | | //fprintf(stderr, "Tok %d CHAR %c\n", ntok, name[len]); |
1055 | 0 | ctx->lc[cnum].last[ntok].token_type = N_CHAR; |
1056 | 0 | ctx->lc[cnum].last[ntok].token_int = name[len++]; |
1057 | 0 | break; |
1058 | | |
1059 | 0 | case N_ALPHA: |
1060 | 0 | if ((len2 = decode_token_alpha(ctx, ntok, &name[len], name_len - len)) < 0) |
1061 | 0 | return -1; |
1062 | | //fprintf(stderr, "Tok %d ALPHA %.*s\n", ntok, len2, &name[len]); |
1063 | 0 | ctx->lc[cnum].last[ntok].token_type = N_ALPHA; |
1064 | 0 | ctx->lc[cnum].last[ntok].token_str = len; |
1065 | 0 | ctx->lc[cnum].last[ntok].token_int = len2; |
1066 | 0 | len += len2; |
1067 | 0 | break; |
1068 | | |
1069 | 0 | case N_DIGITS0: // [0-9]* |
1070 | 0 | if (decode_token_int1(ctx, ntok, N_DZLEN, &vl) < 0) return -1; |
1071 | 0 | if (decode_token_int(ctx, ntok, N_DIGITS0, &v) < 0) return -1; |
1072 | 0 | if (len+20+vl >= name_len) return -1; |
1073 | 0 | len += append_uint32_fixed(&name[len], v, vl); |
1074 | | //fprintf(stderr, "Tok %d DIGITS0 %0*d\n", ntok, vl, v); |
1075 | 0 | ctx->lc[cnum].last[ntok].token_type = N_DIGITS0; |
1076 | 0 | ctx->lc[cnum].last[ntok].token_int = v; |
1077 | 0 | ctx->lc[cnum].last[ntok].token_str = vl; |
1078 | 0 | break; |
1079 | | |
1080 | 0 | case N_DDELTA0: |
1081 | 0 | if (ntok >= ctx->lc[pnum].last_ntok) return -1; |
1082 | 0 | if (decode_token_int1(ctx, ntok, N_DDELTA0, &v) < 0) return -1; |
1083 | 0 | v += ctx->lc[pnum].last[ntok].token_int; |
1084 | 0 | if (len+ctx->lc[pnum].last[ntok].token_str+1 >= name_len) return -1; |
1085 | 0 | len += append_uint32_fixed(&name[len], v, ctx->lc[pnum].last[ntok].token_str); |
1086 | | //fprintf(stderr, "Tok %d DELTA0 %0*d\n", ntok, ctx->lc[pnum].last[ntok].token_str, v); |
1087 | 0 | ctx->lc[cnum].last[ntok].token_type = N_DIGITS0; |
1088 | 0 | ctx->lc[cnum].last[ntok].token_int = v; |
1089 | 0 | ctx->lc[cnum].last[ntok].token_str = ctx->lc[pnum].last[ntok].token_str; |
1090 | 0 | break; |
1091 | | |
1092 | 2.12k | case N_DIGITS: // [1-9][0-9]* |
1093 | 2.12k | if (decode_token_int(ctx, ntok, N_DIGITS, &v) < 0) return -1; |
1094 | 2.12k | if (len+20 >= name_len) return -1; |
1095 | 2.12k | len += append_uint32_var(&name[len], v); |
1096 | | //fprintf(stderr, "Tok %d DIGITS %d\n", ntok, v); |
1097 | 2.12k | ctx->lc[cnum].last[ntok].token_type = N_DIGITS; |
1098 | 2.12k | ctx->lc[cnum].last[ntok].token_int = v; |
1099 | 2.12k | break; |
1100 | | |
1101 | 0 | case N_DDELTA: |
1102 | 0 | if (ntok >= ctx->lc[pnum].last_ntok) return -1; |
1103 | 0 | if (decode_token_int1(ctx, ntok, N_DDELTA, &v) < 0) return -1; |
1104 | 0 | v += ctx->lc[pnum].last[ntok].token_int; |
1105 | 0 | if (len+20 >= name_len) return -1; |
1106 | 0 | len += append_uint32_var(&name[len], v); |
1107 | | //fprintf(stderr, "Tok %d DELTA %d\n", ntok, v); |
1108 | 0 | ctx->lc[cnum].last[ntok].token_type = N_DIGITS; |
1109 | 0 | ctx->lc[cnum].last[ntok].token_int = v; |
1110 | 0 | break; |
1111 | | |
1112 | 0 | case N_NOP: |
1113 | 0 | ctx->lc[cnum].last[ntok].token_type = N_NOP; |
1114 | 0 | break; |
1115 | | |
1116 | 0 | case N_MATCH: |
1117 | 0 | if (ntok >= ctx->lc[pnum].last_ntok) return -1; |
1118 | 0 | switch (ctx->lc[pnum].last[ntok].token_type) { |
1119 | 0 | case N_CHAR: |
1120 | 0 | if (len+1 >= name_len) return -1; |
1121 | 0 | name[len++] = ctx->lc[pnum].last[ntok].token_int; |
1122 | | //fprintf(stderr, "Tok %d MATCH CHAR %c\n", ntok, ctx->lc[pnum].last[ntok].token_int); |
1123 | 0 | ctx->lc[cnum].last[ntok].token_type = N_CHAR; |
1124 | 0 | ctx->lc[cnum].last[ntok].token_int = ctx->lc[pnum].last[ntok].token_int; |
1125 | 0 | break; |
1126 | | |
1127 | 0 | case N_ALPHA: |
1128 | 0 | if (ctx->lc[pnum].last[ntok].token_int < 0 || |
1129 | 0 | len+ctx->lc[pnum].last[ntok].token_int >= name_len) return -1; |
1130 | 0 | memcpy(&name[len], |
1131 | 0 | &ctx->lc[pnum].last_name[ctx->lc[pnum].last[ntok].token_str], |
1132 | 0 | ctx->lc[pnum].last[ntok].token_int); |
1133 | | //fprintf(stderr, "Tok %d MATCH ALPHA %.*s\n", ntok, ctx->lc[pnum].last[ntok].token_int, &name[len]); |
1134 | 0 | ctx->lc[cnum].last[ntok].token_type = N_ALPHA; |
1135 | 0 | ctx->lc[cnum].last[ntok].token_str = len; |
1136 | 0 | ctx->lc[cnum].last[ntok].token_int = ctx->lc[pnum].last[ntok].token_int; |
1137 | 0 | len += ctx->lc[pnum].last[ntok].token_int; |
1138 | 0 | break; |
1139 | | |
1140 | 0 | case N_DIGITS: |
1141 | 0 | if (len+20 >= name_len) return -1; |
1142 | 0 | len += append_uint32_var(&name[len], ctx->lc[pnum].last[ntok].token_int); |
1143 | | //fprintf(stderr, "Tok %d MATCH DIGITS %d\n", ntok, ctx->lc[pnum].last[ntok].token_int); |
1144 | 0 | ctx->lc[cnum].last[ntok].token_type = N_DIGITS; |
1145 | 0 | ctx->lc[cnum].last[ntok].token_int = ctx->lc[pnum].last[ntok].token_int; |
1146 | 0 | break; |
1147 | | |
1148 | 0 | case N_DIGITS0: |
1149 | 0 | if (len+ctx->lc[pnum].last[ntok].token_str >= name_len) return -1; |
1150 | 0 | len += append_uint32_fixed(&name[len], ctx->lc[pnum].last[ntok].token_int, ctx->lc[pnum].last[ntok].token_str); |
1151 | | //fprintf(stderr, "Tok %d MATCH DIGITS %0*d\n", ntok, ctx->lc[pnum].last[ntok].token_str, ctx->lc[pnum].last[ntok].token_int); |
1152 | 0 | ctx->lc[cnum].last[ntok].token_type = N_DIGITS0; |
1153 | 0 | ctx->lc[cnum].last[ntok].token_int = ctx->lc[pnum].last[ntok].token_int; |
1154 | 0 | ctx->lc[cnum].last[ntok].token_str = ctx->lc[pnum].last[ntok].token_str; |
1155 | 0 | break; |
1156 | | |
1157 | 0 | default: |
1158 | 0 | return -1; |
1159 | 0 | } |
1160 | 0 | break; |
1161 | | |
1162 | 2.15k | default: // an elided N_END |
1163 | 2.15k | case N_END: |
1164 | 2.15k | if (len+1 >= name_len) return -1; |
1165 | 2.15k | name[len++] = 0; |
1166 | 2.15k | ctx->lc[cnum].last[ntok].token_type = N_END; |
1167 | | |
1168 | 2.15k | ctx->lc[cnum].last_name = name; |
1169 | 2.15k | ctx->lc[cnum].last_ntok = ntok; |
1170 | | |
1171 | 2.15k | last_context_tok *shrunk |
1172 | 2.15k | = realloc(ctx->lc[cnum].last, |
1173 | 2.15k | (ntok+1) * sizeof(*ctx->lc[cnum].last)); |
1174 | 2.15k | if (shrunk) |
1175 | 2.15k | ctx->lc[cnum].last = shrunk; |
1176 | | |
1177 | 2.15k | if (!ctx->lc[cnum].last) |
1178 | 0 | return -1; |
1179 | | |
1180 | 2.15k | return len; |
1181 | 4.28k | } |
1182 | 4.28k | } |
1183 | | |
1184 | | |
1185 | 0 | return -1; |
1186 | 2.15k | } |
1187 | | |
1188 | | //----------------------------------------------------------------------------- |
1189 | | // arith adaptive codec or static rANS 4x16pr codec |
1190 | 0 | static int arith_encode(uint8_t *in, uint64_t in_len, uint8_t *out, uint64_t *out_len, int method) { |
1191 | 0 | unsigned int olen = *out_len-6, nb; |
1192 | 0 | if (arith_compress_to(in, in_len, out+6, &olen, method) == NULL) |
1193 | 0 | return -1; |
1194 | | |
1195 | 0 | nb = var_put_u32(out, out + *out_len, olen); |
1196 | 0 | memmove(out+nb, out+6, olen); |
1197 | 0 | *out_len = olen+nb; |
1198 | |
|
1199 | 0 | return 0; |
1200 | 0 | } |
1201 | | |
1202 | | // Returns number of bytes read from 'in' on success, |
1203 | | // -1 on failure. |
1204 | 3.83k | static int64_t arith_decode(uint8_t *in, uint64_t in_len, uint8_t *out, uint64_t *out_len) { |
1205 | 3.83k | unsigned int olen = *out_len; |
1206 | | |
1207 | 3.83k | uint32_t clen; |
1208 | 3.83k | int nb = var_get_u32(in, in+in_len, &clen); |
1209 | | //fprintf(stderr, "Arith decode %x\n", in[nb]); |
1210 | 3.83k | if (arith_uncompress_to(in+nb, in_len-nb, out, &olen) == NULL) |
1211 | 17 | return -1; |
1212 | | //fprintf(stderr, " Stored clen=%d\n", (int)clen); |
1213 | 3.81k | *out_len = olen; |
1214 | 3.81k | return clen+nb; |
1215 | 3.83k | } |
1216 | | |
1217 | 0 | static int rans_encode(uint8_t *in, uint64_t in_len, uint8_t *out, uint64_t *out_len, int method) { |
1218 | 0 | unsigned int olen = *out_len-6, nb; |
1219 | 0 | if (rans_compress_to_4x16(in, in_len, out+6, &olen, method) == NULL) |
1220 | 0 | return -1; |
1221 | | |
1222 | 0 | nb = var_put_u32(out, out + *out_len, olen); |
1223 | 0 | memmove(out+nb, out+6, olen); |
1224 | 0 | *out_len = olen+nb; |
1225 | |
|
1226 | 0 | return 0; |
1227 | 0 | } |
1228 | | |
1229 | | // Returns number of bytes read from 'in' on success, |
1230 | | // -1 on failure. |
1231 | 5.01k | static int64_t rans_decode(uint8_t *in, uint64_t in_len, uint8_t *out, uint64_t *out_len) { |
1232 | 5.01k | unsigned int olen = *out_len; |
1233 | | |
1234 | 5.01k | uint32_t clen; |
1235 | 5.01k | int nb = var_get_u32(in, in+in_len, &clen); |
1236 | | //fprintf(stderr, "Arith decode %x\n", in[nb]); |
1237 | 5.01k | if (rans_uncompress_to_4x16(in+nb, in_len-nb, out, &olen) == NULL) |
1238 | 83 | return -1; |
1239 | | //fprintf(stderr, " Stored clen=%d\n", (int)clen); |
1240 | 4.93k | *out_len = olen; |
1241 | 4.93k | return clen+nb; |
1242 | 5.01k | } |
1243 | | |
1244 | | static int compress(uint8_t *in, uint64_t in_len, int level, int use_arith, |
1245 | 0 | uint8_t *out, uint64_t *out_len) { |
1246 | 0 | uint64_t best_sz = UINT64_MAX; |
1247 | 0 | int best = 0; |
1248 | 0 | uint64_t olen = *out_len; |
1249 | | |
1250 | | //fprintf(stderr, "=== try %d ===\n", (int)in_len); |
1251 | |
|
1252 | 0 | int m, rmethods[5][12] = { |
1253 | 0 | {2, 0, 128}, // 1 |
1254 | 0 | {2, 0, 192+8}, // 3 |
1255 | 0 | {3, 0, 128, 193+8}, // 5 |
1256 | 0 | {6, 0,1, 129, 65, 193, 193+8}, // 7 |
1257 | 0 | {9, 0,1,128,129,64,65,192,193, 193+8}, // 9 |
1258 | 0 | }; |
1259 | | |
1260 | | // 1-9 to 0-4 |
1261 | 0 | level = (level-1)/2; |
1262 | 0 | if (level<0) level=0; |
1263 | 0 | if (level>4) level=4; |
1264 | |
|
1265 | 0 | for (m = 1; m <= rmethods[level][0]; m++) { |
1266 | 0 | *out_len = olen; |
1267 | |
|
1268 | 0 | if (in_len % 4 != 0 && (rmethods[level][m] & 8)) |
1269 | 0 | continue; |
1270 | | |
1271 | 0 | if (use_arith) { |
1272 | 0 | if (arith_encode(in, in_len, out, out_len, rmethods[level][m]) < 0) |
1273 | 0 | return -1; |
1274 | 0 | } else { |
1275 | 0 | if (rans_encode(in, in_len, out, out_len, rmethods[level][m]) < 0) |
1276 | 0 | return -1; |
1277 | 0 | } |
1278 | | |
1279 | 0 | if (best_sz > *out_len) { |
1280 | 0 | best_sz = *out_len; |
1281 | 0 | best = rmethods[level][m]; |
1282 | 0 | } |
1283 | 0 | } |
1284 | | |
1285 | 0 | *out_len = olen; |
1286 | 0 | if (use_arith) { |
1287 | 0 | if (arith_encode(in, in_len, out, out_len, best) < 0) |
1288 | 0 | return -1; |
1289 | 0 | } else { |
1290 | 0 | if (rans_encode(in, in_len, out, out_len, best) < 0) |
1291 | 0 | return -1; |
1292 | 0 | } |
1293 | | |
1294 | | // uint64_t tmp; |
1295 | | // fprintf(stderr, "%d -> %d via method %x, %x\n", (int)in_len, (int)best_sz, best, out[i7get(out,&tmp)]); |
1296 | | |
1297 | 0 | return 0; |
1298 | 0 | } |
1299 | | |
1300 | 8.84k | static uint64_t uncompressed_size(uint8_t *in, uint64_t in_len) { |
1301 | 8.84k | uint32_t clen, ulen; |
1302 | | |
1303 | | // in[0] in part of buffer written by us |
1304 | 8.84k | int nb = var_get_u32(in, in+in_len, &clen); |
1305 | | |
1306 | | // in[nb] is part of buffer written to by arith_dynamic. |
1307 | 8.84k | var_get_u32(in+nb+1, in+in_len, &ulen); |
1308 | | |
1309 | 8.84k | return ulen; |
1310 | 8.84k | } |
1311 | | |
1312 | | static int uncompress(int use_arith, uint8_t *in, uint64_t in_len, |
1313 | 8.84k | uint8_t *out, uint64_t *out_len) { |
1314 | 8.84k | uint32_t clen; |
1315 | 8.84k | var_get_u32(in, in+in_len, &clen); |
1316 | 8.84k | return use_arith |
1317 | 8.84k | ? arith_decode(in, in_len, out, out_len) |
1318 | 8.84k | : rans_decode(in, in_len, out, out_len); |
1319 | 8.84k | } |
1320 | | |
1321 | | //----------------------------------------------------------------------------- |
1322 | | |
1323 | | /* |
1324 | | * Converts a line or \0 separated block of reading names to a compressed buffer. |
1325 | | * The code can only encode whole lines and will not attempt a partial line. |
1326 | | * Use the "last_start_p" return value to identify the partial line start |
1327 | | * offset, for continuation purposes. |
1328 | | * |
1329 | | * Returns a malloced buffer holding compressed data of size *out_len, |
1330 | | * or NULL on failure |
1331 | | */ |
1332 | | uint8_t *tok3_encode_names(char *blk, int len, int level, int use_arith, |
1333 | 0 | int *out_len, int *last_start_p) { |
1334 | 0 | int last_start = 0, i, j, nreads; |
1335 | |
|
1336 | 0 | if (len < 0) { |
1337 | 0 | *out_len = 0; |
1338 | 0 | return NULL; |
1339 | 0 | } |
1340 | | |
1341 | | // Count lines |
1342 | 0 | for (nreads = i = 0; i < len; i++) |
1343 | 0 | if (blk[i] <= '\n') // \n or \0 separated entries |
1344 | 0 | nreads++; |
1345 | |
|
1346 | 0 | name_context *ctx = create_context(nreads); |
1347 | 0 | if (!ctx) |
1348 | 0 | return NULL; |
1349 | | |
1350 | | // Construct trie |
1351 | 0 | int ctr = 0; |
1352 | 0 | for (i = j = 0; i < len; j=++i) { |
1353 | 0 | while (i < len && blk[i] > '\n') |
1354 | 0 | i++; |
1355 | 0 | if (i >= len) |
1356 | 0 | break; |
1357 | | |
1358 | | //blk[i] = '\0'; |
1359 | 0 | last_start = i+1; |
1360 | 0 | if (build_trie(ctx, &blk[j], i-j, ctr++) < 0) { |
1361 | 0 | free_context(ctx); |
1362 | 0 | return NULL; |
1363 | 0 | } |
1364 | 0 | } |
1365 | 0 | if (last_start_p) |
1366 | 0 | *last_start_p = last_start; |
1367 | | |
1368 | | //fprintf(stderr, "Processed %d of %d in block, line %d\n", last_start, len, ctr); |
1369 | | |
1370 | | // Encode name |
1371 | 0 | for (i = j = 0; i < len; j=++i) { |
1372 | 0 | while (i < len && blk[i] > '\n') |
1373 | 0 | i++; |
1374 | 0 | if (i >= len) |
1375 | 0 | break; |
1376 | | |
1377 | 0 | blk[i] = '\0'; |
1378 | | // try both 0 and 1 and pick best? |
1379 | 0 | if (encode_name(ctx, &blk[j], i-j, 1) < 0) { |
1380 | 0 | free_context(ctx); |
1381 | 0 | return NULL; |
1382 | 0 | } |
1383 | 0 | } |
1384 | | |
1385 | | #if 0 |
1386 | | for (i = 0; i < ctx->max_tok*16; i++) { |
1387 | | char fn[1024]; |
1388 | | if (!ctx->desc[i].buf_l) continue; |
1389 | | sprintf(fn, "_tok.%02d_%02d.%d", i>>4,i&15,i); |
1390 | | FILE *fp = fopen(fn, "w"); |
1391 | | fwrite(ctx->desc[i].buf, 1, ctx->desc[i].buf_l, fp); |
1392 | | fclose(fp); |
1393 | | } |
1394 | | #endif |
1395 | | |
1396 | | //dump_trie(t_head, 0); |
1397 | | |
1398 | | // FIXME: merge descriptors |
1399 | | // |
1400 | | // If we see foo7:1 foo7:12 foo7:7 etc then foo: is constant, |
1401 | | // but it's encoded as alpha<foo>+dig<7>+char<:> instead of alpha<foo7:>. |
1402 | | // Any time token type 0 is all match beyond the first location we have |
1403 | | // a candidate for merging in string form. |
1404 | | // |
1405 | | // This saves around .1 to 1.3 percent on varying data sets. |
1406 | | // Cruder hack is dedicated prefix/suffix matching to short-cut this. |
1407 | | |
1408 | | |
1409 | | // Drop N_TYPE blocks if they all contain matches bar the first item, |
1410 | | // as we can regenerate these from the subsequent blocks types during |
1411 | | // decode. |
1412 | 0 | for (i = 0; i < ctx->max_tok*16; i+=16) { |
1413 | 0 | if (!ctx->desc[i].buf_l) continue; |
1414 | | |
1415 | 0 | int z; |
1416 | 0 | for (z=1; z<ctx->desc[i].buf_l; z++) { |
1417 | 0 | if (ctx->desc[i].buf[z] != N_MATCH) |
1418 | 0 | break; |
1419 | 0 | } |
1420 | 0 | if (z == ctx->desc[i].buf_l) { |
1421 | 0 | int k; |
1422 | 0 | for (k=1; k<16; k++) |
1423 | 0 | if (ctx->desc[i+k].buf_l) |
1424 | 0 | break; |
1425 | |
|
1426 | 0 | if (k < 16) { |
1427 | 0 | ctx->desc[i].buf_l = 0; |
1428 | 0 | free(ctx->desc[i].buf); |
1429 | 0 | ctx->desc[i].buf = NULL; |
1430 | 0 | } |
1431 | 0 | } |
1432 | 0 | } |
1433 | | |
1434 | | // Serialise descriptors |
1435 | 0 | uint32_t tot_size = 9; |
1436 | 0 | int ndesc = 0; |
1437 | 0 | for (i = 0; i < ctx->max_tok*16; i++) { |
1438 | 0 | if (!ctx->desc[i].buf_l) continue; |
1439 | | |
1440 | 0 | ndesc++; |
1441 | |
|
1442 | 0 | int tnum = i>>4; |
1443 | 0 | int ttype = i&15; |
1444 | |
|
1445 | 0 | uint64_t out_len = 1.5 * arith_compress_bound(ctx->desc[i].buf_l, 1); // guesswork |
1446 | 0 | uint8_t *out = malloc(out_len); |
1447 | 0 | if (!out) { |
1448 | 0 | free_context(ctx); |
1449 | 0 | return NULL; |
1450 | 0 | } |
1451 | | |
1452 | 0 | if (compress(ctx->desc[i].buf, ctx->desc[i].buf_l, level, use_arith, |
1453 | 0 | out, &out_len) < 0) { |
1454 | 0 | free_context(ctx); |
1455 | 0 | return NULL; |
1456 | 0 | } |
1457 | | |
1458 | 0 | free(ctx->desc[i].buf); |
1459 | 0 | ctx->desc[i].buf = out; |
1460 | 0 | ctx->desc[i].buf_l = out_len; |
1461 | 0 | ctx->desc[i].tnum = tnum; |
1462 | 0 | ctx->desc[i].ttype = ttype; |
1463 | | |
1464 | | // Find dups |
1465 | 0 | int j; |
1466 | 0 | for (j = 0; j < i; j++) { |
1467 | 0 | if (!ctx->desc[j].buf) |
1468 | 0 | continue; |
1469 | 0 | if (ctx->desc[i].buf_l != ctx->desc[j].buf_l || ctx->desc[i].buf_l <= 4) |
1470 | 0 | continue; |
1471 | 0 | if (memcmp(ctx->desc[i].buf, ctx->desc[j].buf, ctx->desc[i].buf_l) == 0) |
1472 | 0 | break; |
1473 | 0 | } |
1474 | 0 | if (j < i) { |
1475 | 0 | ctx->desc[i].dup_from = j; |
1476 | 0 | tot_size += 3; // flag, dup_from, ttype |
1477 | 0 | } else { |
1478 | 0 | ctx->desc[i].dup_from = 0; |
1479 | 0 | tot_size += out_len + 1; // ttype |
1480 | 0 | } |
1481 | 0 | } |
1482 | | |
1483 | | #if 0 |
1484 | | for (i = 0; i < ctx->max_tok*16; i++) { |
1485 | | char fn[1024]; |
1486 | | if (!ctx->desc[i].buf_l && !ctx->desc[i].dup_from) continue; |
1487 | | sprintf(fn, "_tok.%02d_%02d.%d.comp", i>>4,i&15,i); |
1488 | | FILE *fp = fopen(fn, "w"); |
1489 | | fwrite(ctx->desc[i].buf, 1, ctx->desc[i].buf_l, fp); |
1490 | | fclose(fp); |
1491 | | } |
1492 | | #endif |
1493 | | |
1494 | | // Write |
1495 | 0 | uint8_t *out = malloc(tot_size+13); |
1496 | 0 | if (!out) { |
1497 | 0 | free_context(ctx); |
1498 | 0 | return NULL; |
1499 | 0 | } |
1500 | | |
1501 | 0 | uint8_t *cp = out; |
1502 | |
|
1503 | 0 | *out_len = tot_size; |
1504 | | // *(uint32_t *)cp = last_start; cp += 4; |
1505 | | // *(uint32_t *)cp = nreads; cp += 4; |
1506 | 0 | *cp++ = (last_start >> 0) & 0xff; |
1507 | 0 | *cp++ = (last_start >> 8) & 0xff; |
1508 | 0 | *cp++ = (last_start >> 16) & 0xff; |
1509 | 0 | *cp++ = (last_start >> 24) & 0xff; |
1510 | 0 | *cp++ = (nreads >> 0) & 0xff; |
1511 | 0 | *cp++ = (nreads >> 8) & 0xff; |
1512 | 0 | *cp++ = (nreads >> 16) & 0xff; |
1513 | 0 | *cp++ = (nreads >> 24) & 0xff; |
1514 | 0 | *cp++ = use_arith; |
1515 | | //write(1, &nreads, 4); |
1516 | 0 | int last_tnum = -1; |
1517 | 0 | for (i = 0; i < ctx->max_tok*16; i++) { |
1518 | 0 | if (!ctx->desc[i].buf_l) continue; |
1519 | 0 | uint8_t ttype8 = ctx->desc[i].ttype; |
1520 | 0 | if (ctx->desc[i].tnum != last_tnum) { |
1521 | 0 | ttype8 |= 128; |
1522 | 0 | last_tnum = ctx->desc[i].tnum; |
1523 | 0 | } |
1524 | 0 | if (ctx->desc[i].dup_from) { |
1525 | | //fprintf(stderr, "Dup %d from %d, sz %d\n", i, ctx->desc[i].dup_from, ctx->desc[i].buf_l); |
1526 | 0 | *cp++ = ttype8 | 64; |
1527 | 0 | *cp++ = ctx->desc[i].dup_from >> 4; |
1528 | 0 | *cp++ = ctx->desc[i].dup_from & 15; |
1529 | 0 | } else { |
1530 | 0 | *cp++ = ttype8; |
1531 | 0 | memcpy(cp, ctx->desc[i].buf, ctx->desc[i].buf_l); |
1532 | 0 | cp += ctx->desc[i].buf_l; |
1533 | 0 | } |
1534 | 0 | } |
1535 | | |
1536 | | //assert(cp-out == tot_size); |
1537 | |
|
1538 | 0 | free_context(ctx); |
1539 | |
|
1540 | 0 | return out; |
1541 | 0 | } |
1542 | | |
1543 | | // Deprecated interface; to remove when we next to an ABI breakage |
1544 | | uint8_t *encode_names(char *blk, int len, int level, int use_arith, |
1545 | 0 | int *out_len, int *last_start_p) { |
1546 | 0 | return tok3_encode_names(blk, len, level, use_arith, out_len, |
1547 | 0 | last_start_p); |
1548 | 0 | } |
1549 | | |
1550 | | /* |
1551 | | * Decodes a compressed block of read names into \0 separated names. |
1552 | | * The size of the data returned (malloced) is in *out_len. |
1553 | | * |
1554 | | * Returns NULL on failure. |
1555 | | */ |
1556 | 126 | uint8_t *tok3_decode_names(uint8_t *in, uint32_t sz, uint32_t *out_len) { |
1557 | 126 | if (sz < 9) |
1558 | 0 | return NULL; |
1559 | | |
1560 | 126 | int i, o = 9; |
1561 | | //int ulen = *(uint32_t *)in; |
1562 | 126 | int ulen = (in[0]<<0) | (in[1]<<8) | (in[2]<<16) | |
1563 | 126 | (((uint32_t)in[3])<<24); |
1564 | | |
1565 | 126 | if (ulen < 0 || ulen >= INT_MAX-1024) |
1566 | 0 | return NULL; |
1567 | | |
1568 | 126 | #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION |
1569 | | // Speed up fuzzing by blocking excessive sizes |
1570 | 126 | if (ulen > 100000) |
1571 | 0 | return NULL; |
1572 | 126 | #endif |
1573 | | |
1574 | | //int nreads = *(uint32_t *)(in+4); |
1575 | 126 | int nreads = (in[4]<<0) | (in[5]<<8) | (in[6]<<16) | (((uint32_t)in[7])<<24); |
1576 | 126 | int use_arith = in[8]; |
1577 | 126 | name_context *ctx = create_context(nreads); |
1578 | 126 | if (!ctx) |
1579 | 0 | return NULL; |
1580 | | |
1581 | | // Unpack descriptors |
1582 | 126 | int tnum = -1; |
1583 | 9.21k | while (o < sz) { |
1584 | 9.21k | uint8_t ttype = in[o++]; |
1585 | 9.21k | if (ttype & 64) { |
1586 | 364 | if (o+2 >= sz) goto err; |
1587 | 364 | int j = in[o++]<<4; |
1588 | 364 | j += in[o++]; |
1589 | 364 | if (ttype & 128) { |
1590 | 120 | tnum++; |
1591 | 120 | if (tnum >= MAX_TOKENS) |
1592 | 0 | goto err; |
1593 | 120 | ctx->max_tok = tnum+1; |
1594 | 120 | memset(&ctx->desc[tnum<<4], 0, 16*sizeof(ctx->desc[tnum])); |
1595 | 120 | } |
1596 | | |
1597 | 364 | if ((ttype & 15) != 0 && (ttype & 128)) { |
1598 | 54 | if (tnum < 0) goto err; |
1599 | 54 | ctx->desc[tnum<<4].buf = malloc(nreads); |
1600 | 54 | if (!ctx->desc[tnum<<4].buf) |
1601 | 0 | goto err; |
1602 | | |
1603 | 54 | ctx->desc[tnum<<4].buf_l = 0; |
1604 | 54 | ctx->desc[tnum<<4].buf_a = nreads; |
1605 | 54 | ctx->desc[tnum<<4].buf[0] = ttype&15; |
1606 | 54 | memset(&ctx->desc[tnum<<4].buf[1], N_MATCH, nreads-1); |
1607 | 54 | } |
1608 | | |
1609 | 364 | if (tnum < 0) goto err; |
1610 | 364 | i = (tnum<<4) | (ttype&15); |
1611 | 364 | if (j >= i) |
1612 | 14 | goto err; |
1613 | 350 | if (!ctx->desc[j].buf) |
1614 | 1 | goto err; // Attempt to copy a non-existent stream |
1615 | | |
1616 | 349 | ctx->desc[i].buf_l = 0; |
1617 | 349 | ctx->desc[i].buf_a = ctx->desc[j].buf_a; |
1618 | 349 | if (ctx->desc[i].buf) free(ctx->desc[i].buf); |
1619 | 349 | ctx->desc[i].buf = malloc(ctx->desc[i].buf_a); |
1620 | 349 | if (!ctx->desc[i].buf) |
1621 | 0 | goto err; |
1622 | | |
1623 | 349 | memcpy(ctx->desc[i].buf, ctx->desc[j].buf, ctx->desc[i].buf_a); |
1624 | | //fprintf(stderr, "Copy ttype %d, i=%d,j=%d, size %d\n", ttype, i, j, (int)ctx->desc[i].buf_a); |
1625 | 349 | continue; |
1626 | 349 | } |
1627 | | |
1628 | | //if (ttype == 0) |
1629 | 8.84k | if (ttype & 128) { |
1630 | 1.85k | tnum++; |
1631 | 1.85k | if (tnum >= MAX_TOKENS) |
1632 | 1 | goto err; |
1633 | 1.85k | ctx->max_tok = tnum+1; |
1634 | 1.85k | memset(&ctx->desc[tnum<<4], 0, 16*sizeof(ctx->desc[tnum])); |
1635 | 1.85k | } |
1636 | | |
1637 | 8.84k | if ((ttype & 15) != 0 && (ttype & 128)) { |
1638 | 87 | if (tnum < 0) goto err; |
1639 | 87 | if (ctx->desc[tnum<<4].buf) free(ctx->desc[tnum<<4].buf); |
1640 | 87 | ctx->desc[tnum<<4].buf = malloc(nreads); |
1641 | 87 | if (!ctx->desc[tnum<<4].buf) |
1642 | 0 | goto err; |
1643 | 87 | ctx->desc[tnum<<4].buf_l = 0; |
1644 | 87 | ctx->desc[tnum<<4].buf_a = nreads; |
1645 | 87 | ctx->desc[tnum<<4].buf[0] = ttype&15; |
1646 | 87 | memset(&ctx->desc[tnum<<4].buf[1], N_MATCH, nreads-1); |
1647 | 87 | } |
1648 | | |
1649 | | //fprintf(stderr, "Read %02x\n", c); |
1650 | | |
1651 | | // Load compressed block |
1652 | 8.84k | int64_t clen, ulen = uncompressed_size(&in[o], sz-o); |
1653 | 8.84k | if (ulen < 0 || ulen >= INT_MAX) |
1654 | 0 | goto err; |
1655 | 8.84k | if (tnum < 0) goto err; |
1656 | 8.84k | i = (tnum<<4) | (ttype&15); |
1657 | | |
1658 | 8.84k | if (i >= MAX_TBLOCKS || i < 0) |
1659 | 0 | goto err; |
1660 | | |
1661 | 8.84k | ctx->desc[i].buf_l = 0; |
1662 | 8.84k | if (ctx->desc[i].buf) free(ctx->desc[i].buf); |
1663 | 8.84k | ctx->desc[i].buf = malloc(ulen); |
1664 | 8.84k | if (!ctx->desc[i].buf) |
1665 | 0 | goto err; |
1666 | | |
1667 | 8.84k | ctx->desc[i].buf_a = ulen; |
1668 | 8.84k | uint64_t usz = ctx->desc[i].buf_a; // convert from size_t for 32-bit sys |
1669 | 8.84k | clen = uncompress(use_arith, &in[o], sz-o, ctx->desc[i].buf, &usz); |
1670 | 8.84k | ctx->desc[i].buf_a = usz; |
1671 | 8.84k | if (clen < 0 || ctx->desc[i].buf_a != ulen) |
1672 | 103 | goto err; |
1673 | | |
1674 | | // fprintf(stderr, "%d: Decode tnum %d type %d clen %d ulen %d via %d\n", |
1675 | | // o, tnum, ttype, (int)clen, (int)ctx->desc[i].buf_a, ctx->desc[i].buf[0]); |
1676 | | |
1677 | 8.74k | o += clen; |
1678 | | |
1679 | | // Encode tnum 0 type 0 ulen 100000 clen 12530 via 2 |
1680 | | // Encode tnum 0 type 6 ulen 196800 clen 43928 via 3 |
1681 | | // Encode tnum 0 type 7 ulen 203200 clen 17531 via 3 |
1682 | | // Encode tnum 1 type 0 ulen 50800 clen 10 via 1 |
1683 | | // Encode tnum 1 type 1 ulen 3 clen 5 via 0 |
1684 | | // Encode tnum 2 type 0 ulen 50800 clen 10 via 1 |
1685 | | // |
1686 | 8.74k | } |
1687 | | |
1688 | 7 | int ret; |
1689 | 7 | ulen += 1024; // for easy coding in decode_name. |
1690 | 7 | uint8_t *out = malloc(ulen); |
1691 | 7 | if (!out) |
1692 | 0 | goto err; |
1693 | | |
1694 | 7 | size_t out_sz = 0; |
1695 | 2.16k | while ((ret = decode_name(ctx, (char *)out+out_sz, ulen)) > 0) { |
1696 | 2.15k | out_sz += ret; |
1697 | 2.15k | ulen -= ret; |
1698 | 2.15k | } |
1699 | | |
1700 | 7 | if (ret < 0) |
1701 | 5 | free(out); |
1702 | | |
1703 | 7 | free_context(ctx); |
1704 | | |
1705 | 7 | *out_len = out_sz; |
1706 | 7 | return ret == 0 ? out : NULL; |
1707 | | |
1708 | 119 | err: |
1709 | 119 | free_context(ctx); |
1710 | 119 | return NULL; |
1711 | 7 | } |
1712 | | |
1713 | | // Deprecated interface; to remove when we next to an ABI breakage |
1714 | 0 | uint8_t *decode_names(uint8_t *in, uint32_t sz, uint32_t *out_len) { |
1715 | 0 | return tok3_decode_names(in, sz, out_len); |
1716 | 0 | } |