Line | Count | Source (jump to first uncovered line) |
1 | | /*- |
2 | | * Copyright (c) 2019 Christos Zoulas |
3 | | * All rights reserved. |
4 | | * |
5 | | * Redistribution and use in source and binary forms, with or without |
6 | | * modification, are permitted provided that the following conditions |
7 | | * are met: |
8 | | * 1. Redistributions of source code must retain the above copyright |
9 | | * notice, this list of conditions and the following disclaimer. |
10 | | * 2. Redistributions in binary form must reproduce the above copyright |
11 | | * notice, this list of conditions and the following disclaimer in the |
12 | | * documentation and/or other materials provided with the distribution. |
13 | | * |
14 | | * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS |
15 | | * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED |
16 | | * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
17 | | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS |
18 | | * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
19 | | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
20 | | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
21 | | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
22 | | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
23 | | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
24 | | * POSSIBILITY OF SUCH DAMAGE. |
25 | | */ |
26 | | |
27 | | /* |
28 | | * Parse CSV object serialization format (RFC-4180, RFC-7111) |
29 | | */ |
30 | | |
31 | | #ifndef TEST |
32 | | #include "file.h" |
33 | | |
34 | | #ifndef lint |
35 | | FILE_RCSID("@(#)$File: is_csv.c,v 1.15 2024/05/18 15:16:13 christos Exp $") |
36 | | #endif |
37 | | |
38 | | #include <string.h> |
39 | | #include "magic.h" |
40 | | #else |
41 | | #define CAST(a, b) ((a)(b)) |
42 | | #include <sys/types.h> |
43 | | #endif |
44 | | |
45 | | |
46 | | #ifdef DEBUG |
47 | | #include <stdio.h> |
48 | | #define DPRINTF(fmt, ...) printf(fmt, __VA_ARGS__) |
49 | | #else |
50 | | #define DPRINTF(fmt, ...) |
51 | | #endif |
52 | | |
53 | | /* |
54 | | * if CSV_LINES == 0: |
55 | | * check all the lines in the buffer |
56 | | * otherwise: |
57 | | * check only up-to the number of lines specified |
58 | | * |
59 | | * the field count of the last line is ignored if that line does not end in a newline ('\n') |
60 | | */ |
61 | | #ifndef CSV_LINES |
62 | 760 | #define CSV_LINES 10 |
63 | | #endif |
64 | | |
65 | | static int csv_parse(const unsigned char *, const unsigned char *); |
66 | | |
/*
 * Skip the remainder of a quoted field.
 *
 * uc points just past the opening double quote and ue is the end of the
 * buffer.  A pair of adjacent double quotes is treated as an escaped quote
 * and does not terminate the field.
 *
 * Returns a pointer to the first byte following the closing quote, or ue
 * if the buffer runs out first (including the case where the closing quote
 * is the very last byte).
 */
static const unsigned char *
eatquote(const unsigned char *uc, const unsigned char *ue)
{
	const unsigned char *p;
	// Non-zero while the previous byte was a quote that has not yet been
	// paired with a second quote (i.e. it may be the closing quote).
	int pending = 0;

	for (p = uc; p < ue; p++) {
		if (*p == '"') {
			// A second consecutive quote cancels the first
			// (quote-quote escape); a lone one arms the flag.
			pending = !pending;
		} else if (pending) {
			// The previous quote really was the closing one.
			return p;
		}
	}
	return ue;
}
91 | | |
/*
 * Heuristically decide whether the bytes in [uc, ue) look like CSV.
 *
 * Commas outside of quoted sections are counted per line (a line ends at
 * '\n').  The count of the first line becomes the expected count and every
 * following line must match it:
 *   - a first line without any comma      -> 0
 *   - a later line with a different count -> 0
 *   - CSV_LINES != 0 and that many lines have been seen
 *                                         -> 1 if the expected count is > 1
 *                                            and the current line matches it
 *   - end of buffer                       -> 1 if the expected count is > 1
 *                                            and at least two lines ended
 *                                            in '\n'
 * Bytes after the last '\n' are scanned but their comma count is never
 * compared against the expected count.
 */
static int
csv_parse(const unsigned char *uc, const unsigned char *ue)
{
	size_t commas = 0, expected = 0, lines = 0;
	const unsigned char *p = uc;

	while (p < ue) {
		unsigned char ch = *p++;

		if (ch == '"') {
			// Jump past the quoted section
			p = eatquote(p, ue);
			continue;
		}
		if (ch == ',') {
			commas++;
			continue;
		}
		if (ch != '\n')
			continue;

		// End of line: compare this line against the expectation
		DPRINTF("%zu %zu %zu\n", lines, commas, expected);
		lines++;
#if CSV_LINES
		if (lines == CSV_LINES)
			return expected > 1 && expected == commas;
#endif
		if (expected == 0) {
			// First line without separators, give up
			if (commas == 0)
				return 0;
			// First line fixes the expected separator count
			expected = commas;
		} else if (expected != commas) {
			// Separator count mismatch, not CSV
			return 0;
		}
		commas = 0;
	}
	return expected > 1 && lines >= 2;
}
131 | | |
132 | | #ifndef TEST |
133 | | int |
134 | | file_is_csv(struct magic_set *ms, const struct buffer *b, int looks_text, |
135 | | const char *code) |
136 | 25.5k | { |
137 | 25.5k | const unsigned char *uc = CAST(const unsigned char *, b->fbuf); |
138 | 25.5k | const unsigned char *ue = uc + b->flen; |
139 | 25.5k | int mime = ms->flags & MAGIC_MIME; |
140 | | |
141 | 25.5k | if (!looks_text) |
142 | 21.2k | return 0; |
143 | | |
144 | 4.31k | if ((ms->flags & (MAGIC_APPLE|MAGIC_EXTENSION)) != 0) |
145 | 0 | return 0; |
146 | | |
147 | 4.31k | if (!csv_parse(uc, ue)) |
148 | 4.30k | return 0; |
149 | | |
150 | 7 | if (mime == MAGIC_MIME_ENCODING) |
151 | 0 | return 1; |
152 | | |
153 | 7 | if (mime) { |
154 | 0 | if (file_printf(ms, "text/csv") == -1) |
155 | 0 | return -1; |
156 | 0 | return 1; |
157 | 0 | } |
158 | | |
159 | 7 | if (file_printf(ms, "CSV %s%stext", code ? code : "", |
160 | 7 | code ? " " : "") == -1) |
161 | 0 | return -1; |
162 | | |
163 | 7 | return 1; |
164 | 7 | } |
165 | | |
166 | | #else |
167 | | |
168 | | #include <sys/types.h> |
169 | | #include <sys/stat.h> |
170 | | #include <stdio.h> |
171 | | #include <fcntl.h> |
172 | | #include <unistd.h> |
173 | | #include <stdlib.h> |
174 | | #include <stdint.h> |
175 | | #include <err.h> |
176 | | |
177 | | int |
178 | | main(int argc, char *argv[]) |
179 | | { |
180 | | int fd; |
181 | | struct stat st; |
182 | | unsigned char *p; |
183 | | |
184 | | if ((fd = open(argv[1], O_RDONLY)) == -1) |
185 | | err(EXIT_FAILURE, "Can't open `%s'", argv[1]); |
186 | | |
187 | | if (fstat(fd, &st) == -1) |
188 | | err(EXIT_FAILURE, "Can't stat `%s'", argv[1]); |
189 | | |
190 | | if ((p = CAST(unsigned char *, malloc(st.st_size))) == NULL) |
191 | | err(EXIT_FAILURE, "Can't allocate %jd bytes", |
192 | | (intmax_t)st.st_size); |
193 | | if (read(fd, p, st.st_size) != st.st_size) |
194 | | err(EXIT_FAILURE, "Can't read %jd bytes", |
195 | | (intmax_t)st.st_size); |
196 | | printf("is csv %d\n", csv_parse(p, p + st.st_size)); |
197 | | return 0; |
198 | | } |
199 | | #endif |