Coverage Report

Created: 2025-08-29 06:39

/src/htslib/hts_expr.c
Line
Count
Source (jump to first uncovered line)
1
/*  hts_expr.c -- filter expression parsing and processing.
2
3
    Copyright (C) 2020-2022, 2024 Genome Research Ltd.
4
5
    Author: James Bonfield <jkb@sanger.ac.uk>
6
7
Permission is hereby granted, free of charge, to any person obtaining a copy
8
of this software and associated documentation files (the "Software"), to deal
9
in the Software without restriction, including without limitation the rights
10
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11
copies of the Software, and to permit persons to whom the Software is
12
furnished to do so, subject to the following conditions:
13
14
The above copyright notices and this permission notice shall be included in
15
all copies or substantial portions of the Software.
16
17
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
23
DEALINGS IN THE SOFTWARE.  */
24
25
// TODO:
26
// - ?: operator for conditionals?
27
28
#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
29
#include <config.h>
30
31
#include <stdio.h>
32
#include <string.h>
33
#include <ctype.h>
34
#include <stdlib.h>
35
#include <stdint.h>
36
#include <float.h>
37
#include <regex.h>
38
#include <math.h>
39
40
#include "htslib/hts_expr.h"
41
#include "htslib/hts_log.h"
42
#include "textutils_internal.h"
43
44
// Could also cache hts_expr_val_t stack here for kstring reuse?
45
0
#define MAX_REGEX 10
46
struct hts_filter_t {
47
    char *str;
48
    int parsed;
49
    int curr_regex, max_regex;
50
    regex_t preg[MAX_REGEX];
51
};
52
53
/*
54
 * This is designed to be mostly C like with mostly same the precedence rules,
55
 * with the exception of bit operators (widely considered as a mistake in C).
56
 * It's not full C (eg no bit-shifting), but good enough for our purposes.
57
 *
58
 * Supported syntax, in order of precedence:
59
 *
60
 * Grouping:      (, ),   eg "(1+2)*3"
61
 * Values:        integers, floats, strings or variables
62
 * Unary ops:     +, -, !, ~  eg -10 +10, !10 (0), ~5 (bitwise not)
63
 * Math ops:      *, /, %  [TODO: add // for floor division?]
64
 * Math ops:      +, -
65
 * Bit-wise:      &, ^, |  [NB as 3 precedence levels, in that order]
66
 * Conditionals:  >, >=, <, <=,
67
 * Equality:      ==, !=, =~, !~
68
 * Boolean:       &&, ||
69
 */
70
71
// Skip to start of term
72
0
static char *ws(char *str) {
73
0
    while (*str && (*str == ' ' || *str == '\t'))
74
0
        str++;
75
0
    return str;
76
0
}
77
78
static int expression(hts_filter_t *filt, void *data, hts_expr_sym_func *fn,
79
                      char *str, char **end, hts_expr_val_t *res);
80
81
/*
82
 * Simple functions operating on strings only.
83
 * length, min, max, avg.
84
 *
85
 * All return 0 on success,
86
 *           -1 on failure
87
 */
88
0
static int expr_func_length(hts_expr_val_t *res) {
89
0
    if (!res->is_str)
90
0
        return -1;
91
92
0
    res->is_str = 0;
93
0
    res->d = res->s.l;
94
0
    return 0;
95
0
}
96
97
0
static int expr_func_min(hts_expr_val_t *res) {
98
0
    if (!res->is_str)
99
0
        return -1;
100
101
0
    size_t l = res->s.l;
102
0
    int v = INT_MAX;
103
0
    const uint8_t *x = (uint8_t *)res->s.s;
104
0
    for (l = 0; l < res->s.l; l++)
105
0
        if (v > x[l])
106
0
            v = x[l];
107
108
0
    res->is_str = 0;
109
0
    res->d = v == INT_MAX ? NAN : v;
110
111
0
    return 0;
112
0
}
113
114
0
static int expr_func_max(hts_expr_val_t *res) {
115
0
    if (!res->is_str)
116
0
        return -1;
117
118
0
    size_t l = res->s.l;
119
0
    int v = INT_MIN;
120
0
    const uint8_t *x = (uint8_t *)res->s.s;
121
0
    for (l = 0; l < res->s.l; l++)
122
0
        if (v < x[l])
123
0
            v = x[l];
124
125
0
    res->is_str = 0;
126
0
    res->d = v == INT_MIN ? NAN : v;
127
128
0
    return 0;
129
0
}
130
131
0
static int expr_func_avg(hts_expr_val_t *res) {
132
0
    if (!res->is_str)
133
0
        return -1;
134
135
0
    size_t l = res->s.l;
136
0
    double v = 0;
137
0
    const uint8_t *x = (uint8_t *)res->s.s;
138
0
    for (l = 0; l < res->s.l; l++)
139
0
        v += x[l];
140
0
    if (l)
141
0
        v /= l;
142
143
0
    res->is_str = 0;
144
0
    res->d = v;
145
146
0
    return 0;
147
0
}
148
149
/*
150
 * functions:  FUNC(expr).
151
 * Note for simplicity of parsing, the "(" must immediately follow FUNC,
152
 * so "FUNC (x)" is invalid.
153
 */
154
static int func_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn,
155
0
                     char *str, char **end, hts_expr_val_t *res) {
156
0
    int func_ok = -1;
157
0
    switch (*str) {
158
0
    case 'a':
159
0
        if (strncmp(str, "avg(", 4) == 0) {
160
0
            if (expression(filt, data, fn, str+4, end, res)) return -1;
161
0
            func_ok = expr_func_avg(res);
162
0
        }
163
0
        break;
164
165
0
    case 'd':
166
0
        if (strncmp(str, "default(", 8) == 0) {
167
0
            if (expression(filt, data, fn, str+8, end, res)) return -1;
168
0
            if (**end != ',')
169
0
                return -1;
170
0
            (*end)++;
171
0
            hts_expr_val_t val = HTS_EXPR_VAL_INIT;
172
0
            if (expression(filt, data, fn, ws(*end), end, &val)) return -1;
173
0
            func_ok = 1;
174
0
            if (!hts_expr_val_existsT(res)) {
175
0
                kstring_t swap = res->s;
176
0
                *res = val;
177
0
                val.s = swap;
178
0
                hts_expr_val_free(&val);
179
0
            }
180
0
        }
181
0
        break;
182
183
0
    case 'e':
184
0
        if (strncmp(str, "exists(", 7) == 0) {
185
0
            if (expression(filt, data, fn, str+7, end, res)) return -1;
186
0
            func_ok = 1;
187
0
            res->is_true = res->d = hts_expr_val_existsT(res);
188
0
            res->is_str = 0;
189
0
        } else if (strncmp(str, "exp(", 4) == 0) {
190
0
            if (expression(filt, data, fn, str+4, end, res)) return -1;
191
0
            func_ok = 1;
192
0
            res->d = exp(res->d);
193
0
            res->is_str = 0;
194
0
            if (isnan(res->d))
195
0
                hts_expr_val_undef(res);
196
0
        }
197
198
0
        break;
199
200
0
    case 'l':
201
0
        if (strncmp(str, "length(", 7) == 0) {
202
0
            if (expression(filt, data, fn, str+7, end, res)) return -1;
203
0
            func_ok = expr_func_length(res);
204
0
        } else if (strncmp(str, "log(", 4) == 0) {
205
0
            if (expression(filt, data, fn, str+4, end, res)) return -1;
206
0
            func_ok = 1;
207
0
            res->d = log(res->d);
208
0
            res->is_str = 0;
209
0
            if (isnan(res->d))
210
0
                hts_expr_val_undef(res);
211
0
        }
212
0
        break;
213
214
0
    case 'm':
215
0
        if (strncmp(str, "min(", 4) == 0) {
216
0
            if (expression(filt, data, fn, str+4, end, res)) return -1;
217
0
            func_ok = expr_func_min(res);
218
0
        } else if (strncmp(str, "max(", 4) == 0) {
219
0
            if (expression(filt, data, fn, str+4, end, res)) return -1;
220
0
            func_ok = expr_func_max(res);
221
0
        }
222
0
        break;
223
224
0
    case 'p':
225
0
        if (strncmp(str, "pow(", 4) == 0) {
226
0
            if (expression(filt, data, fn, str+4, end, res)) return -1;
227
0
            func_ok = 1;
228
229
0
            if (**end != ',')
230
0
                return -1;
231
0
            (*end)++;
232
0
            hts_expr_val_t val = HTS_EXPR_VAL_INIT;
233
0
            if (expression(filt, data, fn, ws(*end), end, &val)) return -1;
234
0
            if (!hts_expr_val_exists(res) || !hts_expr_val_exists(&val)) {
235
0
                hts_expr_val_undef(res);
236
0
            } else if (res->is_str || val.is_str) {
237
0
                hts_expr_val_free(&val); // arith on strings
238
0
                return -1;
239
0
            } else {
240
0
                func_ok = 1;
241
0
                res->d = pow(res->d, val.d);
242
0
                hts_expr_val_free(&val);
243
0
                res->is_str = 0;
244
0
            }
245
246
0
            if (isnan(res->d))
247
0
                hts_expr_val_undef(res);
248
0
        }
249
0
        break;
250
251
0
    case 's':
252
0
        if (strncmp(str, "sqrt(", 5) == 0) {
253
0
            if (expression(filt, data, fn, str+5, end, res)) return -1;
254
0
            func_ok = 1;
255
0
            res->d = sqrt(res->d);
256
0
            res->is_str = 0;
257
0
            if (isnan(res->d))
258
0
                hts_expr_val_undef(res);
259
0
        }
260
0
        break;
261
0
    }
262
263
0
    if (func_ok < 0)
264
0
        return -1;
265
266
0
    str = ws(*end);
267
0
    if (*str != ')') {
268
0
        fprintf(stderr, "Missing ')'\n");
269
0
        return -1;
270
0
    }
271
0
    *end = str+1;
272
273
0
    return 0;
274
0
}
275
276
/*
277
 * simple_expr
278
 *     : identifier
279
 *     | constant
280
 *     | string
281
 *     | func_expr
282
 *     | '(' expression ')'
283
*/
284
static int simple_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn,
285
0
                       char *str, char **end, hts_expr_val_t *res) {
286
    // Main recursion step
287
0
    str = ws(str);
288
0
    if (*str == '(') {
289
0
        if (expression(filt, data, fn, str+1, end, res)) return -1;
290
0
        str = ws(*end);
291
0
        if (*str != ')') {
292
0
            fprintf(stderr, "Missing ')'\n");
293
0
            return -1;
294
0
        }
295
0
        *end = str+1;
296
297
0
        return 0;
298
0
    }
299
300
    // Otherwise a basic element.
301
0
    int fail = 0;
302
0
    double d = hts_str2dbl(str, end, &fail);
303
0
    if (str != *end) {
304
0
        res->is_str = 0;
305
0
        res->d = d;
306
0
    } else {
307
        // Not valid floating point syntax.
308
        // TODO: add function call names in here; len(), sqrt(), pow(), etc
309
0
        if (*str == '"') {
310
0
            res->is_str = 1;
311
0
            char *e = str+1;
312
0
            int backslash = 0;
313
0
            while (*e && *e != '"') {
314
0
                if (*e == '\\')
315
0
                    backslash=1, e+=1+(e[1]!='\0');
316
0
                else
317
0
                    e++;
318
0
            }
319
320
0
            kputsn(str+1, e-(str+1), ks_clear(&res->s));
321
0
            if (backslash) {
322
0
                size_t i, j;
323
0
                for (i = j = 0; i < res->s.l; i++) {
324
0
                    res->s.s[j++] = res->s.s[i];
325
0
                    if (res->s.s[i] == '\\') {
326
0
                        switch (res->s.s[++i]) {
327
0
                        case '"': res->s.s[j-1] = '"'; break;
328
0
                        case '\\':res->s.s[j-1] = '\\'; break;
329
0
                        case 't': res->s.s[j-1] = '\t'; break;
330
0
                        case 'n': res->s.s[j-1] = '\n'; break;
331
0
                        case 'r': res->s.s[j-1] = '\r'; break;
332
0
                        default:  res->s.s[j++] = res->s.s[i];
333
0
                        }
334
0
                    }
335
0
                }
336
0
                res->s.s[j] = 0;
337
0
                res->s.l = j;
338
0
            }
339
0
            if (*e != '"')
340
0
                return -1;
341
0
            *end = e+1;
342
0
        } else if (fn) {
343
            // Try lookup as variable, if not as function
344
0
            if (fn(data, str, end, res) == 0)
345
0
                return 0;
346
0
            else
347
0
                return func_expr(filt, data, fn, str, end, res);
348
0
        } else {
349
0
            return -1;
350
0
        }
351
0
    }
352
353
0
    return 0;
354
0
}
355
356
/*
357
 * unary_expr
358
 *     : simple_expr
359
 *     | '+' simple_expr
360
 *     | '-' simple_expr
361
 *     | '!' unary_expr // higher precedence
362
 *     | '~' unary_expr // higher precedence
363
 */
364
static int unary_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn,
365
0
                      char *str, char **end, hts_expr_val_t *res) {
366
0
    int err;
367
0
    str = ws(str);
368
0
    if (*str == '+' || *str == '-') {
369
0
        err = simple_expr(filt, data, fn, str+1, end, res);
370
0
        if (!hts_expr_val_exists(res)) {
371
0
            hts_expr_val_undef(res);
372
0
        } else {
373
0
            err |= res->is_str;
374
0
            if (*str == '-')
375
0
                res->d = -res->d;
376
0
            res->is_true = res->d != 0;
377
0
        }
378
0
    } else if (*str == '!') {
379
0
        err = unary_expr(filt, data, fn, str+1, end, res);
380
0
        if (res->is_true) {
381
            // Any explicitly true value becomes false
382
0
            res->d = res->is_true = 0;
383
0
        } else if (!hts_expr_val_exists(res)) {
384
            // We can also still negate undef values by toggling the
385
            // is_true override value.
386
0
            res->d = res->is_true = !res->is_true;
387
0
        } else if (res->is_str) {
388
            // !null = true, !"foo" = false, NOTE: !"" = false also
389
0
            res->d = res->is_true = (res->s.s == NULL);
390
0
        } else {
391
0
            res->d = !(int64_t)res->d;
392
0
            res->is_true = res->d != 0;
393
0
        }
394
0
        res->is_str = 0;
395
0
    } else if (*str == '~') {
396
0
        err = unary_expr(filt, data, fn, str+1, end, res);
397
0
        if (!hts_expr_val_exists(res)) {
398
0
            hts_expr_val_undef(res);
399
0
        } else {
400
0
            err |= res->is_str;
401
0
            if (!hts_expr_val_exists(res)) {
402
0
                hts_expr_val_undef(res);
403
0
            } else {
404
0
                res->d = ~(int64_t)res->d;
405
0
                res->is_true = res->d != 0;
406
0
            }
407
0
        }
408
0
    } else {
409
0
        err = simple_expr(filt, data, fn, str, end, res);
410
0
    }
411
0
    return err ? -1 : 0;
412
0
}
413
414
415
/*
416
 * mul_expr
417
 *     : unary_expr (
418
 *           '*' unary_expr
419
 *         | '/' unary_expr
420
 *         | '%' unary_expr
421
 *       )*
422
 */
423
static int mul_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn,
424
0
                    char *str, char **end, hts_expr_val_t *res) {
425
0
    if (unary_expr(filt, data, fn, str, end, res))
426
0
        return -1;
427
428
0
    str = *end;
429
0
    hts_expr_val_t val = HTS_EXPR_VAL_INIT;
430
0
    while (*str) {
431
0
        str = ws(str);
432
0
        if (*str == '*' || *str == '/' || *str == '%') {
433
0
            if (unary_expr(filt, data, fn, str+1, end, &val)) return -1;
434
0
            if (!hts_expr_val_exists(&val) || !hts_expr_val_exists(res)) {
435
0
                hts_expr_val_undef(res);
436
0
            } else if (val.is_str || res->is_str) {
437
0
                hts_expr_val_free(&val);
438
0
                return -1; // arith on strings
439
0
            }
440
0
        }
441
442
0
        if (*str == '*')
443
0
            res->d *= val.d;
444
0
        else if (*str == '/')
445
0
            res->d /= val.d;
446
0
        else if (*str == '%') {
447
0
            if (val.d)
448
0
                res->d = (int64_t)res->d % (int64_t)val.d;
449
0
            else
450
0
                hts_expr_val_undef(res);
451
0
        } else
452
0
            break;
453
454
0
        res->is_true = hts_expr_val_exists(res) && (res->d != 0);
455
0
        str = *end;
456
0
    }
457
458
0
    hts_expr_val_free(&val);
459
460
0
    return 0;
461
0
}
462
463
/*
464
 * add_expr
465
 *     : mul_expr (
466
 *           '+' mul_expr
467
 *         | '-' mul_expr
468
 *       )*
469
 */
470
static int add_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn,
471
0
                    char *str, char **end, hts_expr_val_t *res) {
472
0
    if (mul_expr(filt, data, fn, str, end, res))
473
0
        return -1;
474
475
0
    str = *end;
476
0
    hts_expr_val_t val = HTS_EXPR_VAL_INIT;
477
0
    while (*str) {
478
0
        str = ws(str);
479
0
        int undef = 0;
480
0
        if (*str == '+' || *str == '-') {
481
0
            if (mul_expr(filt, data, fn, str+1, end, &val)) return -1;
482
0
            if (!hts_expr_val_exists(&val) || !hts_expr_val_exists(res)) {
483
0
                undef = 1;
484
0
            } else if (val.is_str || res->is_str) {
485
0
                hts_expr_val_free(&val);
486
0
                return -1; // arith on strings
487
0
            }
488
0
        }
489
490
0
        if (*str == '+')
491
0
            res->d += val.d;
492
0
        else if (*str == '-')
493
0
            res->d -= val.d;
494
0
        else
495
0
            break;
496
497
0
        if (undef)
498
0
            hts_expr_val_undef(res);
499
0
        else
500
0
            res->is_true = res->d != 0;
501
502
0
        str = *end;
503
0
    }
504
505
0
    hts_expr_val_free(&val);
506
507
0
    return 0;
508
0
}
509
510
/*
511
 * bitand_expr
512
 *     : add_expr
513
 *     | bitand_expr '&' add_expr
514
 */
515
static int bitand_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn,
516
0
                       char *str, char **end, hts_expr_val_t *res) {
517
0
    if (add_expr(filt, data, fn, str, end, res)) return -1;
518
519
0
    hts_expr_val_t val = HTS_EXPR_VAL_INIT;
520
0
    int undef = 0;
521
0
    for (;;) {
522
0
        str = ws(*end);
523
0
        if (*str == '&' && str[1] != '&') {
524
0
            if (add_expr(filt, data, fn, str+1, end, &val)) return -1;
525
0
            if (!hts_expr_val_exists(&val) || !hts_expr_val_exists(res)) {
526
0
                undef = 1;
527
0
            } else if (res->is_str || val.is_str) {
528
0
                hts_expr_val_free(&val);
529
0
                return -1;
530
0
            } else {
531
0
                res->is_true =
532
0
                    (res->d = ((int64_t)res->d & (int64_t)val.d)) != 0;
533
0
            }
534
0
        } else {
535
0
            break;
536
0
        }
537
0
    }
538
0
    hts_expr_val_free(&val);
539
0
    if (undef)
540
0
        hts_expr_val_undef(res);
541
542
0
    return 0;
543
0
}
544
545
/*
546
 * bitxor_expr
547
 *     : bitand_expr
548
 *     | bitxor_expr '^' bitand_expr
549
 */
550
static int bitxor_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn,
551
0
                       char *str, char **end, hts_expr_val_t *res) {
552
0
    if (bitand_expr(filt, data, fn, str, end, res)) return -1;
553
554
0
    hts_expr_val_t val = HTS_EXPR_VAL_INIT;
555
0
    int undef = 0;
556
0
    for (;;) {
557
0
        str = ws(*end);
558
0
        if (*str == '^') {
559
0
            if (bitand_expr(filt, data, fn, str+1, end, &val)) return -1;
560
0
            if (!hts_expr_val_exists(&val) || !hts_expr_val_exists(res)) {
561
0
                undef = 1;
562
0
            } else if (res->is_str || val.is_str) {
563
0
                hts_expr_val_free(&val);
564
0
                return -1;
565
0
            } else {
566
0
                res->is_true =
567
0
                    (res->d = ((int64_t)res->d ^ (int64_t)val.d)) != 0;
568
0
            }
569
0
        } else {
570
0
            break;
571
0
        }
572
0
    }
573
0
    hts_expr_val_free(&val);
574
0
    if (undef)
575
0
        hts_expr_val_undef(res);
576
577
0
    return 0;
578
0
}
579
580
/*
581
 * bitor_expr
582
 *     : bitxor_expr
583
 *     | bitor_expr '|' bitxor_expr
584
 */
585
static int bitor_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn,
586
0
                      char *str, char **end, hts_expr_val_t *res) {
587
0
    if (bitxor_expr(filt, data, fn, str, end, res)) return -1;
588
589
0
    hts_expr_val_t val = HTS_EXPR_VAL_INIT;
590
0
    int undef = 0;
591
0
    for (;;) {
592
0
        str = ws(*end);
593
0
        if (*str == '|' && str[1] != '|') {
594
0
            if (bitxor_expr(filt, data, fn, str+1, end, &val)) return -1;
595
0
            if (!hts_expr_val_exists(&val) || !hts_expr_val_exists(res)) {
596
0
                undef = 1;
597
0
            } else if (res->is_str || val.is_str) {
598
0
                hts_expr_val_free(&val);
599
0
                return -1;
600
0
            } else {
601
0
                res->is_true =
602
0
                    (res->d = ((int64_t)res->d | (int64_t)val.d)) != 0;
603
0
            }
604
0
        } else {
605
0
            break;
606
0
        }
607
0
    }
608
0
    hts_expr_val_free(&val);
609
0
    if (undef)
610
0
        hts_expr_val_undef(res);
611
612
0
    return 0;
613
0
}
614
615
/*
616
 * cmp_expr
617
 *     : bitor_expr
618
 *     | cmp_expr '<=' bitor_expr
619
 *     | cmp_expr '<'  bitor_expr
620
 *     | cmp_expr '>=' bitor_expr
621
 *     | cmp_expr '>'  bitor_expr
622
 */
623
static int cmp_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn,
624
0
                    char *str, char **end, hts_expr_val_t *res) {
625
0
    if (bitor_expr(filt, data, fn, str, end, res)) return -1;
626
627
0
    str = ws(*end);
628
0
    hts_expr_val_t val = HTS_EXPR_VAL_INIT;
629
0
    int err = 0, cmp_done = 0;
630
631
0
    if (*str == '>' && str[1] == '=') {
632
0
        cmp_done = 1;
633
0
        err = cmp_expr(filt, data, fn, str+2, end, &val);
634
0
        if (!hts_expr_val_exists(res) || !hts_expr_val_exists(&val)) {
635
0
            hts_expr_val_undef(res);
636
0
        } else {
637
0
            res->is_true=res->d
638
0
                = res->is_str && res->s.s && val.is_str && val.s.s
639
0
                ? strcmp(res->s.s, val.s.s) >= 0
640
0
                : !res->is_str && !val.is_str && res->d >= val.d;
641
0
            res->is_str = 0;
642
0
        }
643
0
    } else if (*str == '>') {
644
0
        cmp_done = 1;
645
0
        err = cmp_expr(filt, data, fn, str+1, end, &val);
646
0
        if (!hts_expr_val_exists(res) || !hts_expr_val_exists(&val)) {
647
0
            hts_expr_val_undef(res);
648
0
        } else {
649
0
            res->is_true=res->d
650
0
                = res->is_str && res->s.s && val.is_str && val.s.s
651
0
                ? strcmp(res->s.s, val.s.s) > 0
652
0
                : !res->is_str && !val.is_str && res->d > val.d;
653
0
            res->is_str = 0;
654
0
        }
655
0
    } else if (*str == '<' && str[1] == '=') {
656
0
        cmp_done = 1;
657
0
        err = cmp_expr(filt, data, fn, str+2, end, &val);
658
0
        if (!hts_expr_val_exists(res) || !hts_expr_val_exists(&val)) {
659
0
            hts_expr_val_undef(res);
660
0
        } else {
661
0
            res->is_true=res->d
662
0
                = res->is_str && res->s.s && val.is_str && val.s.s
663
0
                ? strcmp(res->s.s, val.s.s) <= 0
664
0
                : !res->is_str && !val.is_str && res->d <= val.d;
665
0
            res->is_str = 0;
666
0
        }
667
0
    } else if (*str == '<') {
668
0
        cmp_done = 1;
669
0
        err = cmp_expr(filt, data, fn, str+1, end, &val);
670
0
        if (!hts_expr_val_exists(res) || !hts_expr_val_exists(&val)) {
671
0
            hts_expr_val_undef(res);
672
0
        } else {
673
0
            res->is_true=res->d
674
0
                = res->is_str && res->s.s && val.is_str && val.s.s
675
0
                ? strcmp(res->s.s, val.s.s) < 0
676
0
                : !res->is_str && !val.is_str && res->d < val.d;
677
0
            res->is_str = 0;
678
0
        }
679
0
    }
680
681
0
    if (cmp_done && (!hts_expr_val_exists(&val) || !hts_expr_val_exists(res)))
682
0
        hts_expr_val_undef(res);
683
0
    hts_expr_val_free(&val);
684
685
0
    return err ? -1 : 0;
686
0
}
687
688
/*
689
 * eq_expr
690
 *     : cmp_expr
691
 *     | eq_expr '==' cmp_expr
692
 *     | eq_expr '!=' cmp_expr
693
 *     | eq_expr '=~' cmp_expr
694
 *     | eq_expr '!~' cmp_expr
695
 */
696
static int eq_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn,
697
0
                   char *str, char **end, hts_expr_val_t *res) {
698
0
    if (cmp_expr(filt, data, fn, str, end, res)) return -1;
699
700
0
    str = ws(*end);
701
702
0
    int err = 0, eq_done = 0;
703
0
    hts_expr_val_t val = HTS_EXPR_VAL_INIT;
704
705
    // numeric vs numeric comparison is as expected
706
    // string vs string comparison is as expected
707
    // numeric vs string is false
708
0
    if (str[0] == '=' && str[1] == '=') {
709
0
        eq_done = 1;
710
0
        if ((err = eq_expr(filt, data, fn, str+2, end, &val))) {
711
0
            res->is_true = res->d = 0;
712
0
        } else {
713
0
            if (!hts_expr_val_exists(res) || !hts_expr_val_exists(&val)) {
714
0
                hts_expr_val_undef(res);
715
0
            } else {
716
0
                res->is_true = res->d = res->is_str
717
0
                    ? (res->s.s && val.s.s ?strcmp(res->s.s, val.s.s)==0 :0)
718
0
                    : !res->is_str && !val.is_str && res->d == val.d;
719
0
            }
720
0
        }
721
0
        res->is_str = 0;
722
723
0
    } else if (str[0] == '!' && str[1] == '=') {
724
0
        eq_done = 1;
725
0
        if ((err = eq_expr(filt, data, fn, str+2, end, &val))) {
726
0
            res->is_true = res->d = 0;
727
0
        } else {
728
0
            if (!hts_expr_val_exists(res) || !hts_expr_val_exists(&val)) {
729
0
                hts_expr_val_undef(res);
730
0
            } else {
731
0
                res->is_true = res->d = res->is_str
732
0
                    ? (res->s.s && val.s.s ?strcmp(res->s.s, val.s.s) != 0 :1)
733
0
                    : res->is_str != val.is_str || res->d != val.d;
734
0
            }
735
0
        }
736
0
        res->is_str = 0;
737
738
0
    } else if ((str[0] == '=' && str[1] == '~') ||
739
0
               (str[0] == '!' && str[1] == '~')) {
740
0
        eq_done = 1;
741
0
        err = eq_expr(filt, data, fn, str+2, end, &val);
742
0
        if (!val.is_str || !res->is_str) {
743
0
            hts_expr_val_free(&val);
744
0
            return -1;
745
0
        }
746
0
        if (val.s.s && res->s.s && val.is_true >= 0 && res->is_true >= 0) {
747
0
            regex_t preg_, *preg;
748
0
            if (filt->curr_regex >= filt->max_regex) {
749
                // Compile regex if not seen before
750
0
                if (filt->curr_regex >= MAX_REGEX) {
751
0
                    preg = &preg_;
752
0
                } else {
753
0
                    preg = &filt->preg[filt->curr_regex];
754
0
                    filt->max_regex++;
755
0
                }
756
757
0
                int ec = regcomp(preg, val.s.s, REG_EXTENDED | REG_NOSUB);
758
0
                if (ec != 0) {
759
0
                    char errbuf[1024];
760
0
                    regerror(ec, preg, errbuf, 1024);
761
0
                    fprintf(stderr, "Failed regex: %.1024s\n", errbuf);
762
0
                    hts_expr_val_free(&val);
763
0
                    return -1;
764
0
                }
765
0
            } else {
766
0
                preg = &filt->preg[filt->curr_regex];
767
0
            }
768
0
            res->is_true = res->d = regexec(preg, res->s.s, 0, NULL, 0) == 0
769
0
                ? *str == '='  // matcn
770
0
                : *str == '!'; // no-match
771
0
            if (preg == &preg_)
772
0
                regfree(preg);
773
774
0
            filt->curr_regex++;
775
0
        } else {
776
            // nul regexp or input is considered false
777
0
            res->is_true = 0;
778
0
        }
779
0
        res->is_str = 0;
780
0
    }
781
782
0
    if (eq_done && ((!hts_expr_val_exists(&val)) || !hts_expr_val_exists(res)))
783
0
        hts_expr_val_undef(res);
784
0
    hts_expr_val_free(&val);
785
786
0
    return err ? -1 : 0;
787
0
}
788
789
/*
790
 * and_expr
791
 *     : eq_expr
792
 *     | and_expr 'and' eq_expr
793
 *     | and_expr 'or'  eq_expr
794
 */
795
static int and_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn,
796
0
                    char *str, char **end, hts_expr_val_t *res) {
797
0
    if (eq_expr(filt, data, fn, str, end, res)) return -1;
798
799
0
    for (;;) {
800
0
        hts_expr_val_t val = HTS_EXPR_VAL_INIT;
801
0
        str = ws(*end);
802
0
        if (str[0] == '&' && str[1] == '&') {
803
0
            if (eq_expr(filt, data, fn, str+2, end, &val)) return -1;
804
0
            if (!hts_expr_val_existsT(res) || !hts_expr_val_existsT(&val)) {
805
0
                hts_expr_val_undef(res);
806
0
                res->d = 0;
807
0
            } else {
808
0
                res->is_true = res->d =
809
0
                    (res->is_true || (res->is_str && res->s.s) || res->d) &&
810
0
                    (val.is_true  || (val.is_str && val.s.s) || val.d);
811
0
                res->is_str = 0;
812
0
            }
813
0
        } else if (str[0] == '|' && str[1] == '|') {
814
0
            if (eq_expr(filt, data, fn, str+2, end, &val)) return -1;
815
0
            if (!hts_expr_val_existsT(res) && !hts_expr_val_existsT(&val)) {
816
                // neither defined
817
0
                hts_expr_val_undef(res);
818
0
                res->d = 0;
819
0
            } else if (!hts_expr_val_existsT(res) &&
820
0
                       !(val.is_true  || (val.is_str  && val.s.s ) || val.d)) {
821
                // LHS undef and RHS false
822
0
                hts_expr_val_undef(res);
823
0
                res->d = 0;
824
0
            } else if (!hts_expr_val_existsT(&val) &&
825
0
                       !(res->is_true || (res->is_str && res->s.s) || res->d)){
826
                // RHS undef and LHS false
827
0
                hts_expr_val_undef(res);
828
0
                res->d = 0;
829
0
            } else {
830
0
                res->is_true = res->d =
831
0
                    res->is_true || (res->is_str && res->s.s) || res->d ||
832
0
                    val.is_true  || (val.is_str  && val.s.s ) || val.d;
833
0
                res->is_str = 0;
834
0
            }
835
0
        } else {
836
0
            break;
837
0
        }
838
0
        hts_expr_val_free(&val);
839
0
    }
840
841
0
    return 0;
842
0
}
843
844
static int expression(hts_filter_t *filt, void *data, hts_expr_sym_func *fn,
845
0
                      char *str, char **end, hts_expr_val_t *res) {
846
0
    return and_expr(filt, data, fn, str, end, res);
847
0
}
848
849
0
hts_filter_t *hts_filter_init(const char *str) {
850
0
    hts_filter_t *f = calloc(1, sizeof(*f));
851
0
    if (!f) return NULL;
852
853
    // Oversize to permit faster comparisons with memcmp over strcmp
854
0
    size_t len = strlen(str)+100;
855
0
    if (!(f->str = malloc(len))) {
856
0
        free(f);
857
0
        return NULL;
858
0
    }
859
0
    strcpy(f->str, str);
860
0
    return f;
861
0
}
862
863
9.36k
void hts_filter_free(hts_filter_t *filt) {
864
9.36k
    if (!filt)
865
9.36k
        return;
866
867
0
    int i;
868
0
    for (i = 0; i < filt->max_regex; i++)
869
0
        regfree(&filt->preg[i]);
870
871
0
    free(filt->str);
872
0
    free(filt);
873
0
}
874
875
static int hts_filter_eval_(hts_filter_t *filt,
876
                            void *data, hts_expr_sym_func *fn,
877
0
                            hts_expr_val_t *res) {
878
0
    char *end = NULL;
879
880
0
    filt->curr_regex = 0;
881
0
    if (expression(filt, data, fn, filt->str, &end, res))
882
0
        return -1;
883
884
0
    if (end && *ws(end)) {
885
0
        fprintf(stderr, "Unable to parse expression at %s\n", filt->str);
886
0
        return -1;
887
0
    }
888
889
    // Strings evaluate to true.  An empty string is also true, but an
890
    // absent (null) string is false, unless overriden by is_true.  An
891
    // empty string has kstring length of zero, but a pointer as it's
892
    // nul-terminated.
893
0
    if (res->is_str) {
894
0
        res->is_true |= res->s.s != NULL;
895
0
        res->d = res->is_true;
896
0
    } else if (hts_expr_val_exists(res)) {
897
0
        res->is_true |= res->d != 0;
898
0
    }
899
900
0
    return 0;
901
0
}
902
903
int hts_filter_eval(hts_filter_t *filt,
904
                    void *data, hts_expr_sym_func *fn,
905
0
                    hts_expr_val_t *res) {
906
0
    if (res->s.l != 0 || res->s.m != 0 || res->s.s != NULL) {
907
        // As *res is cleared below, it's not safe to call this function
908
        // with res->s.s set, as memory would be leaked.  It's also not
909
        // possible to know is res was initialised correctly, so in
910
        // either case we fail.
911
0
        hts_log_error("Results structure must be cleared before calling this function");
912
0
        return -1;
913
0
    }
914
915
0
    memset(res, 0, sizeof(*res));
916
917
0
    return hts_filter_eval_(filt, data, fn, res);
918
0
}
919
920
int hts_filter_eval2(hts_filter_t *filt,
921
                     void *data, hts_expr_sym_func *fn,
922
0
                     hts_expr_val_t *res) {
923
0
    ks_free(&res->s);
924
0
    memset(res, 0, sizeof(*res));
925
926
0
    return hts_filter_eval_(filt, data, fn, res);
927
0
}