/src/duckdb/src/function/table/sniff_csv.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | #include "duckdb/function/built_in_functions.hpp" |
2 | | #include "duckdb/execution/operator/csv_scanner/csv_reader_options.hpp" |
3 | | #include "duckdb/common/types/data_chunk.hpp" |
4 | | #include "duckdb/execution/operator/csv_scanner/sniffer/csv_sniffer.hpp" |
5 | | #include "duckdb/execution/operator/csv_scanner/csv_buffer_manager.hpp" |
6 | | #include "duckdb/function/table_function.hpp" |
7 | | #include "duckdb/main/client_context.hpp" |
8 | | #include "duckdb/function/table/range.hpp" |
9 | | #include "duckdb/execution/operator/csv_scanner/csv_file_handle.hpp" |
10 | | #include "duckdb/execution/operator/csv_scanner/csv_multi_file_info.hpp" |
11 | | #include "duckdb/function/table/read_csv.hpp" |
12 | | |
13 | | namespace duckdb { |
14 | | |
15 | | struct CSVSniffFunctionData : public TableFunctionData { |
16 | 0 | CSVSniffFunctionData() { |
17 | 0 | } |
18 | | string path; |
19 | | // The CSV reader options |
20 | | CSVReaderOptions options; |
21 | | // Return Types of CSV (If given by the user) |
22 | | vector<LogicalType> return_types_csv; |
23 | | // Column Names of CSV (If given by the user) |
24 | | vector<string> names_csv; |
25 | | // If we want to force the match of the sniffer types |
26 | | bool force_match = true; |
27 | | }; |
28 | | |
29 | | struct CSVSniffGlobalState : public GlobalTableFunctionState { |
30 | 0 | CSVSniffGlobalState() { |
31 | 0 | } |
32 | | bool done = false; |
33 | | }; |
34 | | |
35 | 0 | static unique_ptr<GlobalTableFunctionState> CSVSniffInitGlobal(ClientContext &context, TableFunctionInitInput &input) { |
36 | 0 | return make_uniq<CSVSniffGlobalState>(); |
37 | 0 | } |
38 | | |
39 | | static unique_ptr<FunctionData> CSVSniffBind(ClientContext &context, TableFunctionBindInput &input, |
40 | 0 | vector<LogicalType> &return_types, vector<string> &names) { |
41 | 0 | auto result = make_uniq<CSVSniffFunctionData>(); |
42 | 0 | if (input.inputs[0].IsNull()) { |
43 | 0 | throw BinderException("sniff_csv cannot take NULL as a file path parameter"); |
44 | 0 | } |
45 | 0 | result->path = input.inputs[0].ToString(); |
46 | 0 | auto it = input.named_parameters.find("auto_detect"); |
47 | 0 | if (it != input.named_parameters.end()) { |
48 | 0 | if (it->second.IsNull()) { |
49 | 0 | throw BinderException("\"%s\" expects a non-null boolean value (e.g. TRUE or 1)", it->first); |
50 | 0 | } |
51 | 0 | if (!it->second.GetValue<bool>()) { |
52 | 0 | throw InvalidInputException("sniff_csv function does not accept auto_detect variable set to false"); |
53 | 0 | } |
54 | | // otherwise remove it |
55 | 0 | input.named_parameters.erase("auto_detect"); |
56 | 0 | } |
57 | | |
58 | | // If we want to force the match of the sniffer |
59 | 0 | it = input.named_parameters.find("force_match"); |
60 | 0 | if (it != input.named_parameters.end()) { |
61 | 0 | result->force_match = it->second.GetValue<bool>(); |
62 | 0 | input.named_parameters.erase("force_match"); |
63 | 0 | } |
64 | 0 | MultiFileOptions file_options; |
65 | 0 | result->options.FromNamedParameters(input.named_parameters, context, file_options); |
66 | 0 | result->options.Verify(file_options); |
67 | | |
68 | | // We want to return the whole CSV Configuration |
69 | | // 1. Delimiter |
70 | 0 | return_types.emplace_back(LogicalType::VARCHAR); |
71 | 0 | names.emplace_back("Delimiter"); |
72 | | // 2. Quote |
73 | 0 | return_types.emplace_back(LogicalType::VARCHAR); |
74 | 0 | names.emplace_back("Quote"); |
75 | | // 3. Escape |
76 | 0 | return_types.emplace_back(LogicalType::VARCHAR); |
77 | 0 | names.emplace_back("Escape"); |
78 | | // 4. NewLine Delimiter |
79 | 0 | return_types.emplace_back(LogicalType::VARCHAR); |
80 | 0 | names.emplace_back("NewLineDelimiter"); |
81 | | // 5. Comment |
82 | 0 | return_types.emplace_back(LogicalType::VARCHAR); |
83 | 0 | names.emplace_back("Comment"); |
84 | | // 6. Skip Rows |
85 | 0 | return_types.emplace_back(LogicalType::UINTEGER); |
86 | 0 | names.emplace_back("SkipRows"); |
87 | | // 7. Has Header |
88 | 0 | return_types.emplace_back(LogicalType::BOOLEAN); |
89 | 0 | names.emplace_back("HasHeader"); |
90 | | // 8. List<Struct<Column-Name:Types>> |
91 | 0 | child_list_t<LogicalType> struct_children {{"name", LogicalType::VARCHAR}, {"type", LogicalType::VARCHAR}}; |
92 | 0 | auto list_child = LogicalType::STRUCT(struct_children); |
93 | 0 | return_types.emplace_back(LogicalType::LIST(list_child)); |
94 | 0 | names.emplace_back("Columns"); |
95 | | // 9. Date Format |
96 | 0 | return_types.emplace_back(LogicalType::VARCHAR); |
97 | 0 | names.emplace_back("DateFormat"); |
98 | | // 10. Timestamp Format |
99 | 0 | return_types.emplace_back(LogicalType::VARCHAR); |
100 | 0 | names.emplace_back("TimestampFormat"); |
101 | | // 11. CSV read function with all the options used |
102 | 0 | return_types.emplace_back(LogicalType::VARCHAR); |
103 | 0 | names.emplace_back("UserArguments"); |
104 | | // 12. CSV read function with all the options used |
105 | 0 | return_types.emplace_back(LogicalType::VARCHAR); |
106 | 0 | names.emplace_back("Prompt"); |
107 | 0 | return std::move(result); |
108 | 0 | } |
109 | | |
110 | 0 | string FormatOptions(char opt) { |
111 | 0 | if (opt == '\'') { |
112 | 0 | return "''"; |
113 | 0 | } |
114 | 0 | if (opt == '\0') { |
115 | 0 | return ""; |
116 | 0 | } |
117 | 0 | string result; |
118 | 0 | result += opt; |
119 | 0 | return result; |
120 | 0 | } |
121 | | |
122 | 0 | string FormatOptions(string opt) { |
123 | 0 | if (opt.size() == 1) { |
124 | 0 | return FormatOptions(opt[0]); |
125 | 0 | } |
126 | 0 | return opt; |
127 | 0 | } |
128 | | |
129 | 0 | static void CSVSniffFunction(ClientContext &context, TableFunctionInput &data_p, DataChunk &output) { |
130 | 0 | auto &global_state = data_p.global_state->Cast<CSVSniffGlobalState>(); |
131 | | // Are we done? |
132 | 0 | if (global_state.done) { |
133 | 0 | return; |
134 | 0 | } |
135 | 0 | const CSVSniffFunctionData &data = data_p.bind_data->Cast<CSVSniffFunctionData>(); |
136 | 0 | auto &fs = duckdb::FileSystem::GetFileSystem(context); |
137 | |
|
138 | 0 | auto files = fs.GlobFiles(data.path, context, FileGlobOptions::DISALLOW_EMPTY); |
139 | 0 | if (files.size() > 1) { |
140 | 0 | throw NotImplementedException("sniff_csv does not operate on more than one file yet"); |
141 | 0 | } |
142 | | |
143 | | // We must run the sniffer. |
144 | 0 | auto sniffer_options = data.options; |
145 | 0 | sniffer_options.file_path = files[0].path; |
146 | |
|
147 | 0 | auto buffer_manager = make_shared_ptr<CSVBufferManager>(context, sniffer_options, sniffer_options.file_path, 0); |
148 | 0 | if (sniffer_options.name_list.empty()) { |
149 | 0 | sniffer_options.name_list = data.names_csv; |
150 | 0 | } |
151 | |
|
152 | 0 | if (sniffer_options.sql_type_list.empty()) { |
153 | 0 | sniffer_options.sql_type_list = data.return_types_csv; |
154 | 0 | } |
155 | 0 | MultiFileOptions file_options; |
156 | 0 | CSVSniffer sniffer(sniffer_options, file_options, buffer_manager, CSVStateMachineCache::Get(context)); |
157 | 0 | auto sniffer_result = sniffer.SniffCSV(data.force_match); |
158 | 0 | if (sniffer.EmptyOrOnlyHeader()) { |
159 | 0 | for (idx_t i = 0; i < sniffer_result.return_types.size(); i++) { |
160 | 0 | if (!sniffer_options.sql_types_per_column.empty()) { |
161 | 0 | if (sniffer_options.sql_types_per_column.find(sniffer_result.names[i]) != |
162 | 0 | sniffer_options.sql_types_per_column.end()) { |
163 | 0 | continue; |
164 | 0 | } |
165 | 0 | } else if (i < sniffer_options.sql_type_list.size()) { |
166 | 0 | continue; |
167 | 0 | } |
168 | | // we default to varchar if all files are empty or only have a header after all the sniffing |
169 | 0 | sniffer_result.return_types[i] = LogicalType::VARCHAR; |
170 | 0 | } |
171 | 0 | } |
172 | 0 | for (auto &type : sniffer_result.return_types) { |
173 | 0 | if (type.id() == LogicalTypeId::SQLNULL) { |
174 | 0 | type = LogicalType::VARCHAR; |
175 | 0 | } |
176 | 0 | } |
177 | 0 | string str_opt; |
178 | 0 | string separator = ", "; |
179 | | // Set output |
180 | 0 | output.SetCardinality(1); |
181 | | |
182 | | // 1. Delimiter |
183 | 0 | str_opt = sniffer_options.dialect_options.state_machine_options.delimiter.FormatValue(); |
184 | 0 | output.SetValue(0, 0, str_opt); |
185 | | // 2. Quote |
186 | 0 | str_opt = sniffer_options.dialect_options.state_machine_options.quote.FormatValue(); |
187 | 0 | output.SetValue(1, 0, str_opt); |
188 | | // 3. Escape |
189 | 0 | str_opt = sniffer_options.dialect_options.state_machine_options.escape.FormatValue(); |
190 | 0 | output.SetValue(2, 0, str_opt); |
191 | | // 4. NewLine Delimiter |
192 | 0 | auto new_line_identifier = sniffer_options.NewLineIdentifierToString(); |
193 | 0 | output.SetValue(3, 0, new_line_identifier); |
194 | | // 5. Comment |
195 | 0 | str_opt = sniffer_options.dialect_options.state_machine_options.comment.FormatValue(); |
196 | 0 | output.SetValue(4, 0, str_opt); |
197 | | // 6. Skip Rows |
198 | 0 | output.SetValue(5, 0, Value::UINTEGER(NumericCast<uint32_t>(sniffer_options.dialect_options.skip_rows.GetValue()))); |
199 | | // 7. Has Header |
200 | 0 | auto has_header = Value::BOOLEAN(sniffer_options.dialect_options.header.GetValue()); |
201 | 0 | output.SetValue(6, 0, has_header); |
202 | | // 8. List<Struct<Column-Name:Types>> {'col1': 'INTEGER', 'col2': 'VARCHAR'} |
203 | 0 | vector<Value> values; |
204 | 0 | std::ostringstream columns; |
205 | 0 | columns << "{"; |
206 | 0 | for (idx_t i = 0; i < sniffer_result.return_types.size(); i++) { |
207 | 0 | child_list_t<Value> struct_children {{"name", sniffer_result.names[i]}, |
208 | 0 | {"type", {sniffer_result.return_types[i].ToString()}}}; |
209 | 0 | values.emplace_back(Value::STRUCT(struct_children)); |
210 | 0 | columns << "'" << sniffer_result.names[i] << "': '" << sniffer_result.return_types[i].ToString() << "'"; |
211 | 0 | if (i != sniffer_result.return_types.size() - 1) { |
212 | 0 | columns << separator; |
213 | 0 | } |
214 | 0 | } |
215 | 0 | columns << "}"; |
216 | 0 | output.SetValue(7, 0, Value::LIST(values)); |
217 | | // 9. Date Format |
218 | 0 | auto date_format = sniffer_options.dialect_options.date_format[LogicalType::DATE].GetValue(); |
219 | 0 | if (!date_format.Empty()) { |
220 | 0 | output.SetValue(8, 0, date_format.format_specifier); |
221 | 0 | } else { |
222 | 0 | bool has_date = false; |
223 | 0 | for (auto &c_type : sniffer_result.return_types) { |
224 | | // Must be ISO 8601 |
225 | 0 | if (c_type.id() == LogicalTypeId::DATE) { |
226 | 0 | output.SetValue(8, 0, Value("%Y-%m-%d")); |
227 | 0 | has_date = true; |
228 | 0 | } |
229 | 0 | } |
230 | 0 | if (!has_date) { |
231 | 0 | output.SetValue(8, 0, Value(nullptr)); |
232 | 0 | } |
233 | 0 | } |
234 | | |
235 | | // 10. Timestamp Format |
236 | 0 | auto timestamp_format = sniffer_options.dialect_options.date_format[LogicalType::TIMESTAMP].GetValue(); |
237 | 0 | if (!timestamp_format.Empty()) { |
238 | 0 | output.SetValue(9, 0, timestamp_format.format_specifier); |
239 | 0 | } else { |
240 | 0 | output.SetValue(9, 0, Value(nullptr)); |
241 | 0 | } |
242 | | |
243 | | // 11. The Extra User Arguments |
244 | 0 | if (data.options.user_defined_parameters.empty()) { |
245 | 0 | output.SetValue(10, 0, Value()); |
246 | 0 | } else { |
247 | 0 | output.SetValue(10, 0, Value(data.options.GetUserDefinedParameters())); |
248 | 0 | } |
249 | | |
250 | | // 12. csv_read string |
251 | 0 | std::ostringstream csv_read; |
252 | | |
253 | | // Base, Path and auto_detect=false |
254 | 0 | csv_read << "FROM read_csv('" << files[0].path << "'" << separator << "auto_detect=false" << separator; |
255 | | // 10.1. Delimiter |
256 | 0 | if (!sniffer_options.dialect_options.state_machine_options.delimiter.IsSetByUser()) { |
257 | 0 | csv_read << "delim=" |
258 | 0 | << "'" << FormatOptions(sniffer_options.dialect_options.state_machine_options.delimiter.GetValue()) |
259 | 0 | << "'" << separator; |
260 | 0 | } |
261 | | // 11.2. Quote |
262 | 0 | if (!sniffer_options.dialect_options.state_machine_options.quote.IsSetByUser()) { |
263 | 0 | csv_read << "quote=" |
264 | 0 | << "'" << FormatOptions(sniffer_options.dialect_options.state_machine_options.quote.GetValue()) << "'" |
265 | 0 | << separator; |
266 | 0 | } |
267 | | // 11.3. Escape |
268 | 0 | if (!sniffer_options.dialect_options.state_machine_options.escape.IsSetByUser()) { |
269 | 0 | csv_read << "escape=" |
270 | 0 | << "'" << FormatOptions(sniffer_options.dialect_options.state_machine_options.escape.GetValue()) << "'" |
271 | 0 | << separator; |
272 | 0 | } |
273 | | // 11.4. NewLine Delimiter |
274 | 0 | if (!sniffer_options.dialect_options.state_machine_options.new_line.IsSetByUser()) { |
275 | 0 | if (new_line_identifier != "mix") { |
276 | 0 | csv_read << "new_line=" |
277 | 0 | << "'" << new_line_identifier << "'" << separator; |
278 | 0 | } |
279 | 0 | } |
280 | | // 11.5. Skip Rows |
281 | 0 | if (!sniffer_options.dialect_options.skip_rows.IsSetByUser()) { |
282 | 0 | csv_read << "skip=" << sniffer_options.dialect_options.skip_rows.GetValue() << separator; |
283 | 0 | } |
284 | | |
285 | | // 11.6. Comment |
286 | 0 | if (!sniffer_options.dialect_options.state_machine_options.comment.IsSetByUser()) { |
287 | 0 | csv_read << "comment=" |
288 | 0 | << "'" << FormatOptions(sniffer_options.dialect_options.state_machine_options.comment.GetValue()) |
289 | 0 | << "'" << separator; |
290 | 0 | } |
291 | | |
292 | | // 11.7. Has Header |
293 | 0 | if (!sniffer_options.dialect_options.header.IsSetByUser()) { |
294 | 0 | csv_read << "header=" << has_header << separator; |
295 | 0 | } |
296 | | // 11.8. column={'col1': 'INTEGER', 'col2': 'VARCHAR'} |
297 | 0 | csv_read << "columns=" << columns.str(); |
298 | | // 11.9. Date Format |
299 | 0 | if (!sniffer_options.dialect_options.date_format[LogicalType::DATE].IsSetByUser()) { |
300 | 0 | if (!sniffer_options.dialect_options.date_format[LogicalType::DATE].GetValue().format_specifier.empty()) { |
301 | 0 | csv_read << separator << "dateformat=" |
302 | 0 | << "'" |
303 | 0 | << sniffer_options.dialect_options.date_format[LogicalType::DATE].GetValue().format_specifier |
304 | 0 | << "'"; |
305 | 0 | } else { |
306 | 0 | for (auto &c_type : sniffer_result.return_types) { |
307 | | // Must be ISO 8601 |
308 | 0 | if (c_type.id() == LogicalTypeId::DATE) { |
309 | 0 | csv_read << separator << "dateformat=" |
310 | 0 | << "'%Y-%m-%d'"; |
311 | 0 | break; |
312 | 0 | } |
313 | 0 | } |
314 | 0 | } |
315 | 0 | } |
316 | | // 11.10. Timestamp Format |
317 | 0 | if (!sniffer_options.dialect_options.date_format[LogicalType::TIMESTAMP].IsSetByUser()) { |
318 | 0 | if (!sniffer_options.dialect_options.date_format[LogicalType::TIMESTAMP].GetValue().format_specifier.empty()) { |
319 | 0 | csv_read << separator << "timestampformat=" |
320 | 0 | << "'" |
321 | 0 | << sniffer_options.dialect_options.date_format[LogicalType::TIMESTAMP].GetValue().format_specifier |
322 | 0 | << "'"; |
323 | 0 | } |
324 | 0 | } |
325 | | // 11.11 User Arguments |
326 | 0 | if (!data.options.user_defined_parameters.empty()) { |
327 | 0 | csv_read << separator << data.options.GetUserDefinedParameters(); |
328 | 0 | } |
329 | 0 | csv_read << ");"; |
330 | 0 | output.SetValue(11, 0, csv_read.str()); |
331 | 0 | global_state.done = true; |
332 | 0 | } |
333 | | |
334 | 9.24k | void CSVSnifferFunction::RegisterFunction(BuiltinFunctions &set) { |
335 | 9.24k | TableFunction csv_sniffer("sniff_csv", {LogicalType::VARCHAR}, CSVSniffFunction, CSVSniffBind, CSVSniffInitGlobal); |
336 | | // Accept same options as the actual csv reader |
337 | 9.24k | ReadCSVTableFunction::ReadCSVAddNamedParameters(csv_sniffer); |
338 | 9.24k | csv_sniffer.named_parameters["force_match"] = LogicalType::BOOLEAN; |
339 | 9.24k | set.AddFunction(csv_sniffer); |
340 | 9.24k | } |
341 | | } // namespace duckdb |