Coverage Report

Created: 2025-07-07 10:01

/work/workdir/UnpackedTarball/rasqal/libsv/sv.c
Line
Count
Source (jump to first uncovered line)
1
/* -*- Mode: c; c-basic-offset: 2 -*-
2
 *
3
 * sv.c - Parse separated-values (CSV, TSV) files
4
 *
5
 * Copyright (C) 2009-2014, David Beckett http://www.dajobe.org/
6
 * 
7
 * This package is Free Software
8
 * 
9
 * It is licensed under the following three licenses as alternatives:
10
 *   1. GNU Lesser General Public License (LGPL) V2.1 or any newer version
11
 *   2. GNU General Public License (GPL) V2 or any newer version
12
 *   3. Apache License, V2.0 or any newer version
13
 * 
14
 * You may not use this file except in compliance with at least one of
15
 * the above three licenses.
16
 * 
17
 * See LICENSE.txt at the top of this package for the
18
 * complete terms and further detail along with the license texts for
19
 * the licenses in COPYING.LIB, COPYING and LICENSE-2.0.txt respectively.
20
 * 
21
 */
22
23
24
#ifdef SV_CONFIG
25
#include <sv_config.h>
26
#endif
27
28
#include <stdio.h>
29
#include <string.h>
30
#include <stdarg.h>
31
#include <ctype.h>
32
33
#ifdef HAVE_STDLIB_H
34
#include <stdlib.h>
35
#endif
36
37
#include <sv.h>
38
39
/* Bit masks stored in the 'flags' member of struct sv_s */

/* keep a copy of the first row as the header row */
#define SV_FLAGS_SAVE_HEADER       0x01
/* error out on bad data lines */
#define SV_FLAGS_BAD_DATA_ERROR    0x02
/* allow fields to be quoted */
#define SV_FLAGS_QUOTED_FIELDS     0x04
/* strip (non-separator) whitespace around fields */
#define SV_FLAGS_STRIP_WHITESPACE  0x08
47
48
49
struct sv_s {
50
  /* field separator: '\t' or ',' */
51
  char field_sep;
52
53
  int line;
54
  
55
  /* row callback */
56
  void *callback_user_data;
57
  sv_fields_callback header_callback;
58
  sv_fields_callback data_callback;
59
60
  /* current buffer */
61
  char *buffer;
62
  /* size allocated */
63
  size_t size;
64
  /* size used */
65
  size_t len;
66
67
  unsigned int fields_count;
68
  char **fields;
69
  size_t *fields_widths;
70
71
  /* memory buffer used for constructing fields for user;
72
   * array above 'fields' points into this 
73
   */
74
  char* fields_buffer;
75
  size_t fields_buffer_size;
76
77
  /* first row is saved as headers */
78
  char **headers;
79
  size_t *headers_widths;
80
81
  unsigned int flags;
82
83
  /* error state */
84
  sv_status_t status;
85
86
  int bad_records;
87
88
  char last_char;
89
90
  char quote_char;
91
92
  /* called with the line (before parsing) */
93
  sv_line_callback line_callback;
94
};
95
96
97
/**
98
 * sv_new:
99
 * @user_data: user data to use for callbacks
100
 * @header_callback: callback to receive headers (or NULL)
101
 * @data_callback: callback to receive data rows (or NULL)
102
 * @field_sep: field separator ',' or '\t'
103
 *
104
 * Constructor - create an SV object
105
 *
106
 * Return value: new SV object or NULL on failure.
107
 */
108
sv*
109
sv_new(void *user_data, sv_fields_callback header_callback,
110
       sv_fields_callback data_callback,
111
       char field_sep)
112
0
{
113
0
  sv *t;
114
115
0
  if(field_sep != '\t' && field_sep != ',')
116
0
    return NULL;
117
  
118
0
  t = (sv*)malloc(sizeof(*t));
119
0
  if(!t)
120
0
    return NULL;
121
  
122
0
  t->field_sep = field_sep;
123
124
0
  t->line = 1;
125
  
126
0
  t->callback_user_data = user_data;
127
0
  t->header_callback = header_callback;
128
0
  t->data_callback = data_callback;
129
130
0
  t->buffer = NULL;
131
0
  t->size = 0;
132
0
  t->len = 0;
133
134
0
  t->fields_count = 0;
135
0
  t->fields = NULL;
136
0
  t->fields_widths = NULL;
137
138
0
  t->fields_buffer = NULL;  
139
0
  t->fields_buffer_size = 0;
140
  
141
0
  t->headers = NULL;
142
0
  t->headers_widths = NULL;
143
144
  /* default flags */
145
0
  t->flags = SV_FLAGS_SAVE_HEADER | SV_FLAGS_QUOTED_FIELDS;
146
147
0
  t->status = SV_STATUS_OK;
148
149
0
  t->bad_records = 0;
150
151
0
  t->last_char = '\0';
152
153
0
  t->quote_char = '"';
154
155
0
  t->line_callback = NULL;
156
157
0
  return t;
158
0
}
159
160
161
static sv_status_t
162
sv_init_fields(sv *t) 
163
0
{
164
0
  t->fields = (char**)malloc(sizeof(char*) * (t->fields_count+1));
165
0
  if(!t->fields)
166
0
    goto failed;
167
    
168
0
  t->fields_widths = (size_t*)malloc(sizeof(size_t) * (t->fields_count+1));
169
0
  if(!t->fields_widths)
170
0
    goto failed;
171
172
0
  t->headers = (char**)malloc(sizeof(char*) * (t->fields_count+1));
173
0
  if(!t->headers)
174
0
    goto failed;
175
  
176
0
  t->headers_widths = (size_t*)malloc(sizeof(size_t) * (t->fields_count+1));
177
0
  if(!t->headers_widths)
178
0
    goto failed;
179
180
0
  return SV_STATUS_OK;
181
  
182
183
0
  failed:
184
0
  if(t->fields) {
185
0
    free(t->fields);
186
0
    t->fields = NULL;
187
0
  }
188
189
0
  if(t->fields_widths) {
190
0
    free(t->fields_widths);
191
0
    t->fields_widths = NULL;
192
0
  }
193
  
194
0
  if(t->headers) {
195
0
    free(t->headers);
196
0
    t->headers = NULL;
197
0
  }
198
199
0
  return SV_STATUS_NO_MEMORY;
200
0
}
201
202
203
/**
204
 * sv_free:
205
 * @t: SV object
206
 *
207
 * Destructor: destroy an SV object
208
 *
209
 */
210
void
211
sv_free(sv *t)
212
0
{
213
0
  if(!t)
214
0
    return;
215
216
0
  if(t->headers_widths)
217
0
    free(t->headers_widths);
218
0
  if(t->headers) {
219
0
    unsigned int i;
220
    
221
0
    for(i = 0; i < t->fields_count; i++)
222
0
      free(t->headers[i]);
223
0
    free(t->headers);
224
0
  }
225
  
226
227
0
  if(t->fields_buffer)
228
0
    free(t->fields_buffer);
229
230
0
  if(t->fields_widths)
231
0
    free(t->fields_widths);
232
0
  if(t->fields)
233
0
    free(t->fields);
234
0
  if(t->buffer)
235
0
    free(t->buffer);
236
  
237
0
  free(t);
238
0
}
239
240
241
242
/* Ensure fields buffer is big enough for len bytes total */
243
static sv_status_t
244
sv_ensure_fields_buffer_size(sv *t, size_t len)
245
0
{
246
0
  char *nbuffer;
247
0
  size_t nsize;
248
  
249
0
  if(len < t->fields_buffer_size)
250
0
    return SV_STATUS_OK;
251
  
252
0
  nsize = len + 8;
253
254
#if defined(SV_DEBUG) && SV_DEBUG > 1
255
  fprintf(stderr, "%d: Growing buffer from %d to %d bytes\n",
256
          t->line, (int)t->fields_buffer_size, (int)nsize);
257
#endif
258
  
259
0
  nbuffer = (char*)malloc(nsize + 1);
260
0
  if(!nbuffer)
261
0
    return SV_STATUS_NO_MEMORY;
262
263
0
  if(t->fields_buffer)
264
0
    free(t->fields_buffer);
265
  
266
0
  t->fields_buffer = nbuffer;
267
0
  t->fields_buffer_size = nsize;
268
269
0
  return SV_STATUS_OK;
270
0
}
271
272
273
274
/* Ensure internal buffer is big enough for len more bytes */
275
static sv_status_t
276
sv_ensure_line_buffer_size(sv *t, size_t len)
277
0
{
278
0
  char *nbuffer;
279
0
  size_t nsize;
280
  
281
0
  if(t->len + len < t->size)
282
0
    return SV_STATUS_OK;
283
  
284
0
  nsize = (len + t->len) << 1;
285
    
286
0
  nbuffer = (char*)malloc(nsize + 1);
287
0
  if(!nbuffer)
288
0
    return SV_STATUS_NO_MEMORY;
289
290
0
  if(t->len)
291
0
    memcpy(nbuffer, t->buffer, t->len);
292
0
  nbuffer[t->len] = '\0';
293
  
294
0
  if(t->buffer)
295
0
    free(t->buffer);
296
  
297
0
  t->buffer = nbuffer;
298
0
  t->size = nsize;
299
300
0
  return SV_STATUS_OK;
301
0
}
302
303
304
/**
305
 * sv_get_line:
306
 * @t: sv object
307
 *
308
 * Get current SV line number
309
 *
310
 * Return value: line number or <0 on failure
311
 */
312
int
313
sv_get_line(sv *t)
314
0
{
315
0
  if(!t)
316
0
    return -1;
317
318
0
  return t->line;
319
0
}
320
321
322
/**
323
 * sv_get_header:
324
 * @t: sv object
325
 * @i: header index 0
326
 * @width_p: pointer to store width (or NULL)
327
 *
328
 * Get an SV header with optional width
329
 *
330
 * Return value: shared pointer to header or NULL if out of range
331
 */
332
const char*
333
sv_get_header(sv *t, unsigned int i, size_t *width_p)
334
0
{
335
0
  if(!t || !t->headers || i > t->fields_count)
336
0
    return NULL;
337
338
0
  if(width_p)
339
0
    *width_p = t->headers_widths[i];
340
  
341
0
  return (const char*)t->headers[i];
342
0
}
343
344
345
#if defined(SV_DEBUG) && SV_DEBUG > 1
346
/* Debug helper: write "<label> (<len> bytes) >>>...<<<" to fh,
 * showing at most the first 100 bytes of buffer and an ellipsis
 * when the content was cut short.
 */
static void
sv_dump_buffer(FILE* fh, const char* label, const char* buffer, size_t len)
{
  const size_t max_shown = 100;
  size_t shown = (len > max_shown) ? max_shown : len;

  fprintf(fh, "%s (%zu bytes) >>>", label, len);
  fwrite(buffer, 1, shown, fh);
  if(shown != len)
    fputs("...", fh);
  fputs("<<<\n", fh);
}
359
#endif
360
361
362
static sv_status_t
363
sv_parse_line(sv *t, char *line, size_t len,  unsigned int* field_count_p)
364
0
{
365
0
  unsigned int column;
366
0
  int field_width = 0;
367
0
  int field_offset = 0;
368
0
  char* current_field = NULL;
369
0
  char* p = NULL;
370
0
  char** fields = t->fields;
371
0
  size_t* fields_widths = t->fields_widths;
372
0
  sv_status_t status;
373
0
  int field_is_quoted = 0;
374
375
#if defined(SV_DEBUG) && SV_DEBUG > 1
376
  if(fields)
377
    sv_dump_buffer(stderr, "(sv_parse_line): Parsing line", line, len);
378
#endif
379
  
380
0
  status = sv_ensure_fields_buffer_size(t, len);
381
0
  if(status)
382
0
    return status;
383
384
0
  if(fields) {
385
0
    current_field = t->fields_buffer;
386
0
    p = current_field;
387
388
0
    if(!p)
389
0
      return SV_STATUS_OK;
390
0
  }
391
392
0
  for(column = 0; 1; column++) {
393
0
    int c = -1;
394
0
    int field_ended = 0;
395
0
    int expect_sep = 0;
396
397
0
    if(column == len) {
398
0
      field_ended = 1;
399
0
      goto do_last;
400
0
    }
401
    
402
0
    c = line[column];
403
404
0
    if(t->flags & SV_FLAGS_QUOTED_FIELDS) {
405
0
      if(c == t->quote_char) {
406
0
        if(!field_width && !field_is_quoted) {
407
0
          field_is_quoted = 1;
408
  #if defined(SV_DEBUG) && SV_DEBUG > 1
409
          fprintf(stderr, "Field is quoted\n");
410
  #endif
411
0
          continue;
412
0
        } else if(column < len && line[column+1] == t->quote_char) {
413
  #if defined(SV_DEBUG) && SV_DEBUG > 1
414
          fprintf(stderr, "Doubled quote %c absorbed\n", t->quote_char);
415
  #endif
416
0
          column++;
417
          /* skip repeated quote - so it just replaces ""... with " */
418
0
          goto skip;
419
0
        } else if(column == len-1 || line[column+1] == t->field_sep) {
420
  #if defined(SV_DEBUG) && SV_DEBUG > 1
421
          fprintf(stderr, "Field ended on quote + sep\n");
422
  #endif
423
0
          field_ended = 1;
424
0
          expect_sep = 1;
425
0
          goto do_last;
426
0
        }
427
0
      }
428
0
    }
429
430
0
    if(!field_is_quoted && c == t->field_sep) {
431
#if defined(SV_DEBUG) && SV_DEBUG > 1
432
      fprintf(stderr, "Field ended on sep\n");
433
#endif
434
0
      field_ended = 1;
435
0
    }
436
437
0
    do_last:
438
0
    if(field_ended) {
439
0
      if(p)
440
0
        *p++ = '\0';
441
      
442
0
      if(fields) {
443
444
0
        if(t->flags & SV_FLAGS_STRIP_WHITESPACE) {
445
          /* Remove whitespace around a field */
446
0
          while(field_width > 0 && isspace(current_field[0])) {
447
0
            current_field++;
448
0
            field_width--;
449
0
          }
450
451
0
          while(field_width > 0 && isspace(current_field[field_width - 1]))
452
0
            field_width--;
453
454
0
          current_field[field_width] = '\0';
455
0
        }
456
457
0
        if(expect_sep)
458
0
          column++;
459
460
0
      }
461
462
#if defined(SV_DEBUG) && SV_DEBUG > 1
463
      if(fields) {
464
        fprintf(stderr, "  Field %d: %s (%d)\n", (int)field_offset, current_field, (int)field_width);
465
      }
466
#endif
467
0
      if(fields)
468
0
        fields[field_offset] = current_field;
469
0
      if(fields_widths)
470
0
        fields_widths[field_offset] = field_width;
471
472
      /* end loop when out of columns */
473
0
      if(column == len)
474
0
        break;
475
      
476
      /* otherwise got a tab so reset for next field */
477
0
      field_width = 0;
478
0
      field_is_quoted = 0;
479
480
0
      field_offset++;
481
0
      current_field = p;
482
483
0
      continue;
484
0
    }
485
486
0
    skip:
487
0
    if(fields)
488
0
      *p++ = c;
489
0
    field_width++;
490
0
  }
491
492
493
0
  if(field_count_p)
494
0
    *field_count_p = field_offset + 1;
495
496
0
  return SV_STATUS_OK;
497
0
}
498
499
500
static sv_status_t
501
sv_parse_chunk_line(sv* t, size_t line_len, int has_nl)
502
0
{
503
0
  size_t move_len = line_len;
504
0
  sv_status_t status = SV_STATUS_OK;
505
0
  unsigned int fields_count = 0;
506
507
0
  if(!line_len)
508
0
    goto skip_line;
509
510
0
  if(t->line_callback) {
511
0
    char c = t->buffer[line_len];
512
      
513
0
    t->buffer[line_len] = '\0';
514
0
    status = t->line_callback(t, t->callback_user_data, t->buffer, line_len);
515
0
    t->buffer[line_len] = c;
516
0
    if(status != SV_STATUS_OK)
517
0
      return status;
518
0
  }
519
520
0
  if(!t->fields_count) {
521
    /* First line in the file - calculate number of fields */
522
0
    status = sv_parse_line(t, t->buffer, line_len, &t->fields_count);
523
0
    if(status)
524
0
      return status;
525
526
    /* initialise arrays of size t->fields_count */
527
0
    status = sv_init_fields(t);
528
0
    if(status)
529
0
      return status;
530
0
  }
531
532
0
  status = sv_parse_line(t, t->buffer, line_len, &fields_count);
533
0
  if(status)
534
0
    return status;
535
536
0
  if(fields_count != t->fields_count) {
537
0
    t->bad_records++;
538
0
    if(t->flags & SV_FLAGS_BAD_DATA_ERROR) {
539
#if defined(SV_DEBUG) && SV_DEBUG > 1
540
      fprintf(stderr, "Error in line %d: saw %d fields expected %d\n",
541
              t->line, fields_count, t->fields_count);
542
#endif
543
0
      status = SV_STATUS_LINE_FIELDS;
544
0
      return status;
545
0
    }
546
#if defined(SV_DEBUG) && SV_DEBUG > 1
547
    fprintf(stderr, "Ignoring line %d: saw %d fields expected %d\n",
548
            t->line, fields_count, t->fields_count);
549
#endif
550
    /* Otherwise skip the line */
551
0
    goto skip_line;
552
0
  }
553
554
0
  if(t->line == 1 && (t->flags & SV_FLAGS_SAVE_HEADER)) {
555
    /* first line and header: turn fields into headers */
556
0
    unsigned int i;
557
      
558
0
    for(i = 0; i < t->fields_count; i++) {
559
0
      char *s = (char*)malloc(t->fields_widths[i]+1);
560
0
      if(!s) {
561
0
        status = SV_STATUS_NO_MEMORY;
562
0
        break;
563
0
      }
564
0
      memcpy(s, t->fields[i], t->fields_widths[i]+1);
565
0
      t->headers[i] = s;
566
0
      t->headers_widths[i] = t->fields_widths[i];
567
0
    }
568
569
0
    if(status == SV_STATUS_OK && t->header_callback) {
570
      /* got header fields - return them to user */
571
0
      status = t->header_callback(t, t->callback_user_data, t->headers, 
572
0
                                  t->headers_widths, t->fields_count);
573
0
    }
574
0
  } else {
575
    /* data */
576
577
0
    if(t->data_callback) {
578
      /* got data fields - return them to user */
579
0
      status = t->data_callback(t, t->callback_user_data, t->fields, 
580
0
                                t->fields_widths, t->fields_count);
581
0
    }
582
0
  }
583
584
0
  skip_line:
585
586
0
  if(has_nl)
587
0
    move_len++;
588
589
  /* adjust buffer - remove 'line_len+1' bytes from start of buffer */
590
0
  t->len -= move_len;
591
592
  /* this is an overlapping move */
593
0
  memmove(t->buffer, &t->buffer[move_len], t->len);
594
595
  /* This is not needed: guaranteed above */
596
  /* t->buffer[t->len] = '\0' */
597
598
0
  t->line++;
599
600
0
  return status;
601
0
}
602
603
604
/**
605
 * sv_parse_chunk:
606
 * @t: sv object
607
 * @buffer: buffer to parse (or NULL)
608
 * @len: length of @buffer (or 0)
609
 *
610
 * Parse a chunk of data
611
 *
612
 * Parsing ends if either @buffer is NULL or @len is 0
613
 *
614
 * Return value: #SV_STATUS_OK on success
615
 */
616
sv_status_t
617
sv_parse_chunk(sv *t, char *buffer, size_t len)
618
0
{
619
0
  size_t offset = 0;
620
0
  sv_status_t status = SV_STATUS_OK;
621
  /* End of input if either of these is NULL */
622
0
  int is_end = (!buffer || !len);
623
624
0
  if(!is_end) {
625
    /* add new data to existing buffer */
626
0
    status = sv_ensure_line_buffer_size(t, len);
627
0
    if(status)
628
0
      return status;
629
630
    /* add new buffer */
631
0
    memcpy(t->buffer + t->len, buffer, len);
632
633
    /* always ensure it is NUL terminated even if input chunk was not */
634
0
    t->len += len;
635
0
    t->buffer[t->len] = '\0';
636
0
  }
637
638
  /* look for an end of line to do some work */
639
0
  for(offset = 0; offset < t->len; offset++) {
640
0
    char c = t->buffer[offset];
641
642
    /* skip \n when just seen \r - i.e. \r\n or CR LF */
643
0
    if(t->last_char == '\r' && c == '\n') {
644
#if defined(SV_DEBUG) && SV_DEBUG > 1
645
      fprintf(stderr, "Skipping a \\n after \\r\n");
646
#endif
647
648
      /* adjust buffer */
649
0
      t->len -= 1;
650
651
      /* this is an overlapping move */
652
0
      memmove(t->buffer, &t->buffer[1], t->len);
653
654
0
      t->last_char = '\0';
655
0
      continue;
656
0
    }
657
658
0
    if(c != '\r' && c != '\n')
659
0
      continue;
660
661
0
    t->last_char = c;
662
663
#if defined(SV_DEBUG) && SV_DEBUG > 1
664
    sv_dump_buffer(stderr, "Starting buffer", t->buffer, t->len);
665
#endif
666
667
    /* found a line */
668
0
    status = sv_parse_chunk_line(t, offset, 1);
669
0
    if(status != SV_STATUS_OK)
670
0
      break;
671
672
0
    offset = -1; /* so for loop starts at 0 */
673
0
  }
674
675
0
  if(is_end && status == SV_STATUS_OK) {
676
    /* If end of input and there is a non-empty buffer left, try to
677
     * parse it all as the last line.  It will NOT contain newlines.
678
     */
679
0
    if(t->len)
680
0
      status = sv_parse_chunk_line(t, t->len, 0);
681
0
  }
682
683
0
  return status;
684
0
}
685
686
687
static sv_status_t
688
sv_set_option_vararg(sv* t, sv_option_t option, va_list arg)
689
0
{
690
0
  sv_status_t status = SV_STATUS_OK;
691
692
0
  switch(option) {
693
0
    case SV_OPTION_SAVE_HEADER:
694
0
      t->flags &= ~SV_FLAGS_SAVE_HEADER;
695
0
      if(va_arg(arg, long))
696
0
        t->flags |= SV_FLAGS_SAVE_HEADER;
697
0
      break;
698
699
0
    case SV_OPTION_BAD_DATA_ERROR:
700
0
      t->flags &= ~SV_FLAGS_BAD_DATA_ERROR;
701
0
      if(va_arg(arg, long))
702
0
        t->flags |= SV_FLAGS_BAD_DATA_ERROR;
703
0
      break;
704
705
0
    case SV_OPTION_QUOTED_FIELDS:
706
0
      t->flags &= ~SV_FLAGS_QUOTED_FIELDS;
707
0
      if(va_arg(arg, long))
708
0
        t->flags |= SV_FLAGS_QUOTED_FIELDS;
709
0
      break;
710
711
0
    case SV_OPTION_STRIP_WHITESPACE:
712
0
      t->flags &= ~SV_FLAGS_STRIP_WHITESPACE;
713
0
      if(va_arg(arg, long))
714
0
        t->flags |= SV_FLAGS_STRIP_WHITESPACE;
715
0
      break;
716
717
0
    case SV_OPTION_QUOTE_CHAR:
718
0
      if(1) {
719
0
        int c = va_arg(arg, int);
720
0
        if(c != t->field_sep)
721
0
          t->quote_char = c;
722
0
      }
723
0
      break;
724
725
0
    case SV_OPTION_LINE_CALLBACK:
726
0
      if(1) {
727
0
        sv_line_callback cb = (sv_line_callback)va_arg(arg, void*);
728
0
        t->line_callback = cb;
729
0
      }
730
731
0
    default:
732
0
    case SV_OPTION_NONE:
733
0
      status = SV_STATUS_FAILED;
734
0
      break;
735
736
0
  }
737
738
0
  return status;
739
0
}
740
  
741
742
/**
743
 * sv_set_option:
744
 * @t: sv object
745
 * @option: option name
746
 *
747
 * Set an option value.  The value varies in type dependent on the @option
748
 *
749
 * Return value: #SV_STATUS_FAILED if failed
750
 */
751
sv_status_t
752
sv_set_option(sv *t, sv_option_t option, ...)
753
0
{
754
0
  sv_status_t status;
755
0
  va_list arg;
756
757
0
  va_start(arg, option);
758
0
  status = sv_set_option_vararg(t, option, arg);
759
0
  va_end(arg);
760
761
0
  return status;
762
0
}