Coverage Report

Created: 2025-08-26 06:47

/src/file/src/is_csv.c
Line
Count
Source (jump to first uncovered line)
1
/*-
2
 * Copyright (c) 2019 Christos Zoulas
3
 * All rights reserved.
4
 *
5
 * Redistribution and use in source and binary forms, with or without
6
 * modification, are permitted provided that the following conditions
7
 * are met:
8
 * 1. Redistributions of source code must retain the above copyright
9
 *    notice, this list of conditions and the following disclaimer.
10
 * 2. Redistributions in binary form must reproduce the above copyright
11
 *    notice, this list of conditions and the following disclaimer in the
12
 *    documentation and/or other materials provided with the distribution.
13
 *
14
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
15
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
16
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
18
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
19
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
20
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
21
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
22
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
23
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
24
 * POSSIBILITY OF SUCH DAMAGE.
25
 */
26
27
/*
28
 * Parse CSV object serialization format (RFC-4180, RFC-7111)
29
 */
30
31
#ifndef TEST
32
#include "file.h"
33
34
#ifndef lint
35
FILE_RCSID("@(#)$File: is_csv.c,v 1.15 2024/05/18 15:16:13 christos Exp $")
36
#endif
37
38
#include <string.h>
39
#include "magic.h"
40
#else
41
#define CAST(a, b)  ((a)(b))
42
#include <sys/types.h>
43
#endif
44
45
46
#ifdef DEBUG
47
#include <stdio.h>
48
#define DPRINTF(fmt, ...) printf(fmt, __VA_ARGS__)
49
#else
50
#define DPRINTF(fmt, ...)
51
#endif
52
53
/*
54
 * if CSV_LINES == 0:
55
 *  check all the lines in the buffer
56
 * otherwise:
57
 *  check only up-to the number of lines specified
58
 *
59
 * the field count of the last line is always ignored if that line does
 * not end in a newline ('\n')
60
 */
61
#ifndef CSV_LINES
62
760
#define CSV_LINES 10
63
#endif
64
65
static int csv_parse(const unsigned char *, const unsigned char *);
66
67
/*
 * Skip the remainder of a quoted field.
 *
 * uc points just past the opening double quote; ue is one past the end of
 * the buffer.  Inside the field a pair of double quotes ("") is an escaped
 * quote and does not terminate it.
 *
 * Returns a pointer to the first byte following the closing quote, or ue
 * if the buffer ends first (unterminated field, or the closing quote is the
 * very last byte).
 */
static const unsigned char *
eatquote(const unsigned char *uc, const unsigned char *ue)
{
	const unsigned char *p;
	int pending = 0;	/* 1 while the previous byte was an unpaired '"' */

	for (p = uc; p < ue; p++) {
		if (*p == '"') {
			/*
			 * A first quote may close the field; a second one
			 * right after it turns the pair into an escape.
			 */
			pending = !pending;
		} else if (pending) {
			/* The lone quote before this byte closed the field. */
			return p;
		}
	}
	return ue;
}
91
92
/*
 * Decide whether the bytes in [uc, ue) look like CSV.
 *
 * Commas outside of double-quoted fields are counted per line.  The first
 * line fixes the expected count; any later line with a different count
 * rejects the input, as does a first line without any comma.  When
 * CSV_LINES is non-zero, the decision is made as soon as that many
 * newlines have been seen.  Bytes after the last newline are never
 * compared against the expected count.
 *
 * Returns 1 if the input looks like CSV, 0 otherwise.
 */
static int
csv_parse(const unsigned char *uc, const unsigned char *ue)
{
	size_t cur = 0;		/* commas seen on the current line */
	size_t expected = 0;	/* commas per line, fixed by the first line */
	size_t lines = 0;	/* newlines seen so far */

	while (uc < ue) {
		const unsigned char ch = *uc++;

		if (ch == '"') {
			/* Jump over the quoted field, embedded commas included. */
			uc = eatquote(uc, ue);
			continue;
		}
		if (ch == ',') {
			cur++;
			continue;
		}
		if (ch != '\n')
			continue;

		/* End of line: compare it against what the first line set. */
		DPRINTF("%zu %zu %zu\n", lines, cur, expected);
		lines++;
#if CSV_LINES
		if (lines == CSV_LINES)
			return expected > 1 && expected == cur;
#endif
		if (expected == 0) {
			/* First line without a single comma: give up. */
			if (cur == 0)
				return 0;
			expected = cur;
		} else if (expected != cur) {
			/* Separator count differs from the first line. */
			return 0;
		}
		cur = 0;
	}
	/* Buffer exhausted: need at least two commas per line and two lines. */
	return expected > 1 && lines >= 2;
}
131
132
#ifndef TEST
133
int
134
file_is_csv(struct magic_set *ms, const struct buffer *b, int looks_text,
135
    const char *code)
136
25.5k
{
137
25.5k
  const unsigned char *uc = CAST(const unsigned char *, b->fbuf);
138
25.5k
  const unsigned char *ue = uc + b->flen;
139
25.5k
  int mime = ms->flags & MAGIC_MIME;
140
141
25.5k
  if (!looks_text)
142
21.2k
    return 0;
143
144
4.31k
  if ((ms->flags & (MAGIC_APPLE|MAGIC_EXTENSION)) != 0)
145
0
    return 0;
146
147
4.31k
  if (!csv_parse(uc, ue))
148
4.30k
    return 0;
149
150
7
  if (mime == MAGIC_MIME_ENCODING)
151
0
    return 1;
152
153
7
  if (mime) {
154
0
    if (file_printf(ms, "text/csv") == -1)
155
0
      return -1;
156
0
    return 1;
157
0
  }
158
159
7
  if (file_printf(ms, "CSV %s%stext", code ? code : "",
160
7
      code ? " " : "") == -1)
161
0
    return -1;
162
163
7
  return 1;
164
7
}
165
166
#else
167
168
#include <sys/types.h>
169
#include <sys/stat.h>
170
#include <stdio.h>
171
#include <fcntl.h>
172
#include <unistd.h>
173
#include <stdlib.h>
174
#include <stdint.h>
175
#include <err.h>
176
177
int
178
main(int argc, char *argv[])
179
{
180
  int fd;
181
  struct stat st;
182
  unsigned char *p;
183
184
  if ((fd = open(argv[1], O_RDONLY)) == -1)
185
    err(EXIT_FAILURE, "Can't open `%s'", argv[1]);
186
187
  if (fstat(fd, &st) == -1)
188
    err(EXIT_FAILURE, "Can't stat `%s'", argv[1]);
189
190
  if ((p = CAST(unsigned char *, malloc(st.st_size))) == NULL)
191
    err(EXIT_FAILURE, "Can't allocate %jd bytes",
192
        (intmax_t)st.st_size);
193
  if (read(fd, p, st.st_size) != st.st_size)
194
    err(EXIT_FAILURE, "Can't read %jd bytes",
195
        (intmax_t)st.st_size);
196
  printf("is csv %d\n", csv_parse(p, p + st.st_size));
197
  return 0;
198
}
199
#endif