Coverage Report

Created: 2023-09-25 06:40

/src/file/src/is_csv.c
Line
Count
Source (jump to first uncovered line)
1
/*-
2
 * Copyright (c) 2019 Christos Zoulas
3
 * All rights reserved.
4
 *
5
 * Redistribution and use in source and binary forms, with or without
6
 * modification, are permitted provided that the following conditions
7
 * are met:
8
 * 1. Redistributions of source code must retain the above copyright
9
 *    notice, this list of conditions and the following disclaimer.
10
 * 2. Redistributions in binary form must reproduce the above copyright
11
 *    notice, this list of conditions and the following disclaimer in the
12
 *    documentation and/or other materials provided with the distribution.
13
 *
14
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
15
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
16
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
18
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
19
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
20
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
21
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
22
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
23
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
24
 * POSSIBILITY OF SUCH DAMAGE.
25
 */
26
27
/*
28
 * Parse CSV object serialization format (RFC-4180, RFC-7111)
29
 */
30
31
#ifndef TEST
32
#include "file.h"
33
34
#ifndef lint
35
FILE_RCSID("@(#)$File: is_csv.c,v 1.13 2023/07/17 16:08:17 christos Exp $")
36
#endif
37
38
#include <string.h>
39
#include "magic.h"
40
#else
41
#include <sys/types.h>
42
#endif
43
44
45
#ifdef DEBUG
46
#include <stdio.h>
47
#define DPRINTF(fmt, ...) printf(fmt, __VA_ARGS__)
48
#else
49
#define DPRINTF(fmt, ...)
50
#endif
51
52
/*
53
 * if CSV_LINES == 0:
54
 *  check all the lines in the buffer
55
 * otherwise:
56
 *  check only up-to the number of lines specified
57
 *
58
 * the last line is never counted if it does not end in a newline ('\n')
59
 */
60
#ifndef CSV_LINES
61
238
#define CSV_LINES 10
62
#endif
63
64
static int csv_parse(const unsigned char *, const unsigned char *);
65
66
/*
 * Skip the remainder of a quoted CSV field.
 *
 * The caller has already consumed the opening '"'; uc points at the first
 * byte inside the field and ue one past the last readable byte.  A '"'
 * immediately followed by another '"' is treated as an escaped quote and
 * stays part of the field.
 *
 * Returns a pointer to the first byte after the closing quote, or ue if
 * the buffer is exhausted first (unterminated field, or the closing quote
 * is the very last byte).
 */
static const unsigned char *
eatquote(const unsigned char *uc, const unsigned char *ue)
{
  /* Non-zero while the previous byte was a '"' that is not yet paired. */
  int quote = 0;

  while (uc < ue) {
    unsigned char c = *uc++;
    if (c != '"') {
      // We already got one, done.
      if (quote) {
        /* Step back so the caller sees the byte after the quote. */
        return --uc;
      }
      continue;
    }
    if (quote) {
      // quote-quote escapes
      quote = 0;
      continue;
    }
    // first quote
    quote = 1;
  }
  return ue;
}
90
91
/*
 * Decide whether the bytes in [uc, ue) look like CSV.
 *
 * Commas outside quoted fields are counted per line; quoted fields are
 * skipped with eatquote(), so commas and newlines inside them are not
 * counted.  The first line fixes the expected comma count and must
 * contain at least one comma; every following line must match it.
 *
 * When CSV_LINES is non-zero the scan stops at the CSV_LINES-th newline
 * and the verdict is whether that line matches the expected count.
 * Otherwise the whole buffer is scanned and at least two complete lines
 * are required.  Bytes after the final '\n' are never checked.
 *
 * Returns non-zero if the buffer looks like CSV, 0 otherwise.
 */
static int
csv_parse(const unsigned char *uc, const unsigned char *ue)
{
  /*
   * nf: commas seen on the current line
   * tf: commas expected per line (0 until the first line is complete)
   * nl: number of newlines seen so far
   */
  size_t nf = 0, tf = 0, nl = 0;

  while (uc < ue) {
    switch (*uc++) {
    case '"':
      // Eat until the matching quote
      uc = eatquote(uc, ue);
      break;
    case ',':
      nf++;
      break;
    case '\n':
      DPRINTF("%zu %zu %zu\n", nl, nf, tf);
      nl++;
#if CSV_LINES
      if (nl == CSV_LINES)
        return tf != 0 && tf == nf;
#endif
      if (tf == 0) {
        // First time and no fields, give up
        if (nf == 0)
          return 0;
        // First time, set the number of fields
        tf = nf;
      } else if (tf != nf) {
        // Field number mismatch, we are done.
        return 0;
      }
      nf = 0;
      break;
    default:
      break;
    }
  }
  return tf && nl >= 2;
}
130
131
#ifndef TEST
132
int
133
file_is_csv(struct magic_set *ms, const struct buffer *b, int looks_text,
134
    const char *code)
135
9.75k
{
136
9.75k
  const unsigned char *uc = CAST(const unsigned char *, b->fbuf);
137
9.75k
  const unsigned char *ue = uc + b->flen;
138
9.75k
  int mime = ms->flags & MAGIC_MIME;
139
140
9.75k
  if (!looks_text)
141
8.25k
    return 0;
142
143
1.50k
  if ((ms->flags & (MAGIC_APPLE|MAGIC_EXTENSION)) != 0)
144
0
    return 0;
145
146
1.50k
  if (!csv_parse(uc, ue))
147
1.50k
    return 0;
148
149
7
  if (mime == MAGIC_MIME_ENCODING)
150
0
    return 1;
151
152
7
  if (mime) {
153
0
    if (file_printf(ms, "text/csv") == -1)
154
0
      return -1;
155
0
    return 1;
156
0
  }
157
158
7
  if (file_printf(ms, "CSV %s%stext", code ? code : "",
159
7
      code ? " " : "") == -1)
160
0
    return -1;
161
162
7
  return 1;
163
7
}
164
165
#else
166
167
#include <sys/types.h>
168
#include <sys/stat.h>
169
#include <stdio.h>
170
#include <fcntl.h>
171
#include <unistd.h>
172
#include <stdlib.h>
173
#include <stdint.h>
174
#include <err.h>
175
176
/*
 * Stand-alone test driver (built with -DTEST): read the file named by
 * argv[1] into memory, run csv_parse() over it and print the verdict.
 */
int
main(int argc, char *argv[])
{
  int fd;
  struct stat st;
  unsigned char *p;
  size_t len;
  ssize_t nr;

  /* argv[1] is NULL when no file name is given; refuse early. */
  if (argc < 2)
    errx(EXIT_FAILURE, "Usage: is_csv <file>");

  if ((fd = open(argv[1], O_RDONLY)) == -1)
    err(EXIT_FAILURE, "Can't open `%s'", argv[1]);

  if (fstat(fd, &st) == -1)
    err(EXIT_FAILURE, "Can't stat `%s'", argv[1]);

  len = (size_t)st.st_size;

  /*
   * malloc(0) is allowed to return NULL, so always request at least one
   * byte to keep an empty file from looking like an allocation failure.
   */
  if ((p = malloc(len != 0 ? len : 1)) == NULL)
    err(EXIT_FAILURE, "Can't allocate %jd bytes",
        (intmax_t)st.st_size);

  nr = read(fd, p, len);
  if (nr == -1)
    err(EXIT_FAILURE, "Can't read %jd bytes",
        (intmax_t)st.st_size);
  /* A short read does not set errno, so report it without strerror. */
  if ((size_t)nr != len)
    errx(EXIT_FAILURE, "Can't read %jd bytes",
        (intmax_t)st.st_size);

  (void)close(fd);

  printf("is csv %d\n", csv_parse(p, p + len));

  free(p);
  return 0;
}
198
#endif