Coverage Report

Created: 2025-09-05 08:05

/src/duckdb/src/function/table/sniff_csv.cpp
Line
Count
Source (jump to first uncovered line)
1
#include "duckdb/function/built_in_functions.hpp"
2
#include "duckdb/execution/operator/csv_scanner/csv_reader_options.hpp"
3
#include "duckdb/common/types/data_chunk.hpp"
4
#include "duckdb/execution/operator/csv_scanner/sniffer/csv_sniffer.hpp"
5
#include "duckdb/execution/operator/csv_scanner/csv_buffer_manager.hpp"
6
#include "duckdb/function/table_function.hpp"
7
#include "duckdb/main/client_context.hpp"
8
#include "duckdb/function/table/range.hpp"
9
#include "duckdb/execution/operator/csv_scanner/csv_file_handle.hpp"
10
#include "duckdb/execution/operator/csv_scanner/csv_multi_file_info.hpp"
11
#include "duckdb/function/table/read_csv.hpp"
12
13
namespace duckdb {
14
15
struct CSVSniffFunctionData : public TableFunctionData {
16
0
  CSVSniffFunctionData() {
17
0
  }
18
  string path;
19
  // The CSV reader options
20
  CSVReaderOptions options;
21
  // Return Types of CSV (If given by the user)
22
  vector<LogicalType> return_types_csv;
23
  // Column Names of CSV (If given by the user)
24
  vector<string> names_csv;
25
  // If we want to force the match of the sniffer types
26
  bool force_match = true;
27
};
28
29
struct CSVSniffGlobalState : public GlobalTableFunctionState {
30
0
  CSVSniffGlobalState() {
31
0
  }
32
  bool done = false;
33
};
34
35
0
static unique_ptr<GlobalTableFunctionState> CSVSniffInitGlobal(ClientContext &context, TableFunctionInitInput &input) {
36
0
  return make_uniq<CSVSniffGlobalState>();
37
0
}
38
39
static unique_ptr<FunctionData> CSVSniffBind(ClientContext &context, TableFunctionBindInput &input,
40
0
                                             vector<LogicalType> &return_types, vector<string> &names) {
41
0
  auto result = make_uniq<CSVSniffFunctionData>();
42
0
  if (input.inputs[0].IsNull()) {
43
0
    throw BinderException("sniff_csv cannot take NULL as a file path parameter");
44
0
  }
45
0
  result->path = input.inputs[0].ToString();
46
0
  auto it = input.named_parameters.find("auto_detect");
47
0
  if (it != input.named_parameters.end()) {
48
0
    if (it->second.IsNull()) {
49
0
      throw BinderException("\"%s\" expects a non-null boolean value (e.g. TRUE or 1)", it->first);
50
0
    }
51
0
    if (!it->second.GetValue<bool>()) {
52
0
      throw InvalidInputException("sniff_csv function does not accept auto_detect variable set to false");
53
0
    }
54
    // otherwise remove it
55
0
    input.named_parameters.erase("auto_detect");
56
0
  }
57
58
  // If we want to force the match of the sniffer
59
0
  it = input.named_parameters.find("force_match");
60
0
  if (it != input.named_parameters.end()) {
61
0
    result->force_match = it->second.GetValue<bool>();
62
0
    input.named_parameters.erase("force_match");
63
0
  }
64
0
  MultiFileOptions file_options;
65
0
  result->options.FromNamedParameters(input.named_parameters, context, file_options);
66
0
  result->options.Verify(file_options);
67
68
  // We want to return the whole CSV Configuration
69
  // 1. Delimiter
70
0
  return_types.emplace_back(LogicalType::VARCHAR);
71
0
  names.emplace_back("Delimiter");
72
  // 2. Quote
73
0
  return_types.emplace_back(LogicalType::VARCHAR);
74
0
  names.emplace_back("Quote");
75
  // 3. Escape
76
0
  return_types.emplace_back(LogicalType::VARCHAR);
77
0
  names.emplace_back("Escape");
78
  // 4. NewLine Delimiter
79
0
  return_types.emplace_back(LogicalType::VARCHAR);
80
0
  names.emplace_back("NewLineDelimiter");
81
  // 5. Comment
82
0
  return_types.emplace_back(LogicalType::VARCHAR);
83
0
  names.emplace_back("Comment");
84
  // 6. Skip Rows
85
0
  return_types.emplace_back(LogicalType::UINTEGER);
86
0
  names.emplace_back("SkipRows");
87
  // 7. Has Header
88
0
  return_types.emplace_back(LogicalType::BOOLEAN);
89
0
  names.emplace_back("HasHeader");
90
  // 8. List<Struct<Column-Name:Types>>
91
0
  child_list_t<LogicalType> struct_children {{"name", LogicalType::VARCHAR}, {"type", LogicalType::VARCHAR}};
92
0
  auto list_child = LogicalType::STRUCT(struct_children);
93
0
  return_types.emplace_back(LogicalType::LIST(list_child));
94
0
  names.emplace_back("Columns");
95
  // 9. Date Format
96
0
  return_types.emplace_back(LogicalType::VARCHAR);
97
0
  names.emplace_back("DateFormat");
98
  // 10. Timestamp Format
99
0
  return_types.emplace_back(LogicalType::VARCHAR);
100
0
  names.emplace_back("TimestampFormat");
101
  // 11. CSV read function with all the options used
102
0
  return_types.emplace_back(LogicalType::VARCHAR);
103
0
  names.emplace_back("UserArguments");
104
  // 12. CSV read function with all the options used
105
0
  return_types.emplace_back(LogicalType::VARCHAR);
106
0
  names.emplace_back("Prompt");
107
0
  return std::move(result);
108
0
}
109
110
0
string FormatOptions(char opt) {
111
0
  if (opt == '\'') {
112
0
    return "''";
113
0
  }
114
0
  if (opt == '\0') {
115
0
    return "";
116
0
  }
117
0
  string result;
118
0
  result += opt;
119
0
  return result;
120
0
}
121
122
0
string FormatOptions(string opt) {
123
0
  if (opt.size() == 1) {
124
0
    return FormatOptions(opt[0]);
125
0
  }
126
0
  return opt;
127
0
}
128
129
0
static void CSVSniffFunction(ClientContext &context, TableFunctionInput &data_p, DataChunk &output) {
130
0
  auto &global_state = data_p.global_state->Cast<CSVSniffGlobalState>();
131
  // Are we done?
132
0
  if (global_state.done) {
133
0
    return;
134
0
  }
135
0
  const CSVSniffFunctionData &data = data_p.bind_data->Cast<CSVSniffFunctionData>();
136
0
  auto &fs = duckdb::FileSystem::GetFileSystem(context);
137
138
0
  auto files = fs.GlobFiles(data.path, context, FileGlobOptions::DISALLOW_EMPTY);
139
0
  if (files.size() > 1) {
140
0
    throw NotImplementedException("sniff_csv does not operate on more than one file yet");
141
0
  }
142
143
  // We must run the sniffer.
144
0
  auto sniffer_options = data.options;
145
0
  sniffer_options.file_path = files[0].path;
146
147
0
  auto buffer_manager = make_shared_ptr<CSVBufferManager>(context, sniffer_options, sniffer_options.file_path, 0);
148
0
  if (sniffer_options.name_list.empty()) {
149
0
    sniffer_options.name_list = data.names_csv;
150
0
  }
151
152
0
  if (sniffer_options.sql_type_list.empty()) {
153
0
    sniffer_options.sql_type_list = data.return_types_csv;
154
0
  }
155
0
  MultiFileOptions file_options;
156
0
  CSVSniffer sniffer(sniffer_options, file_options, buffer_manager, CSVStateMachineCache::Get(context));
157
0
  auto sniffer_result = sniffer.SniffCSV(data.force_match);
158
0
  if (sniffer.EmptyOrOnlyHeader()) {
159
0
    for (idx_t i = 0; i < sniffer_result.return_types.size(); i++) {
160
0
      if (!sniffer_options.sql_types_per_column.empty()) {
161
0
        if (sniffer_options.sql_types_per_column.find(sniffer_result.names[i]) !=
162
0
            sniffer_options.sql_types_per_column.end()) {
163
0
          continue;
164
0
        }
165
0
      } else if (i < sniffer_options.sql_type_list.size()) {
166
0
        continue;
167
0
      }
168
      // we default to varchar if all files are empty or only have a header after all the sniffing
169
0
      sniffer_result.return_types[i] = LogicalType::VARCHAR;
170
0
    }
171
0
  }
172
0
  for (auto &type : sniffer_result.return_types) {
173
0
    if (type.id() == LogicalTypeId::SQLNULL) {
174
0
      type = LogicalType::VARCHAR;
175
0
    }
176
0
  }
177
0
  string str_opt;
178
0
  string separator = ", ";
179
  // Set output
180
0
  output.SetCardinality(1);
181
182
  // 1. Delimiter
183
0
  str_opt = sniffer_options.dialect_options.state_machine_options.delimiter.FormatValue();
184
0
  output.SetValue(0, 0, str_opt);
185
  // 2. Quote
186
0
  str_opt = sniffer_options.dialect_options.state_machine_options.quote.FormatValue();
187
0
  output.SetValue(1, 0, str_opt);
188
  // 3. Escape
189
0
  str_opt = sniffer_options.dialect_options.state_machine_options.escape.FormatValue();
190
0
  output.SetValue(2, 0, str_opt);
191
  // 4. NewLine Delimiter
192
0
  auto new_line_identifier = sniffer_options.NewLineIdentifierToString();
193
0
  output.SetValue(3, 0, new_line_identifier);
194
  // 5. Comment
195
0
  str_opt = sniffer_options.dialect_options.state_machine_options.comment.FormatValue();
196
0
  output.SetValue(4, 0, str_opt);
197
  // 6. Skip Rows
198
0
  output.SetValue(5, 0, Value::UINTEGER(NumericCast<uint32_t>(sniffer_options.dialect_options.skip_rows.GetValue())));
199
  // 7. Has Header
200
0
  auto has_header = Value::BOOLEAN(sniffer_options.dialect_options.header.GetValue());
201
0
  output.SetValue(6, 0, has_header);
202
  // 8. List<Struct<Column-Name:Types>> {'col1': 'INTEGER', 'col2': 'VARCHAR'}
203
0
  vector<Value> values;
204
0
  std::ostringstream columns;
205
0
  columns << "{";
206
0
  for (idx_t i = 0; i < sniffer_result.return_types.size(); i++) {
207
0
    child_list_t<Value> struct_children {{"name", sniffer_result.names[i]},
208
0
                                         {"type", {sniffer_result.return_types[i].ToString()}}};
209
0
    values.emplace_back(Value::STRUCT(struct_children));
210
0
    columns << "'" << sniffer_result.names[i] << "': '" << sniffer_result.return_types[i].ToString() << "'";
211
0
    if (i != sniffer_result.return_types.size() - 1) {
212
0
      columns << separator;
213
0
    }
214
0
  }
215
0
  columns << "}";
216
0
  output.SetValue(7, 0, Value::LIST(values));
217
  // 9. Date Format
218
0
  auto date_format = sniffer_options.dialect_options.date_format[LogicalType::DATE].GetValue();
219
0
  if (!date_format.Empty()) {
220
0
    output.SetValue(8, 0, date_format.format_specifier);
221
0
  } else {
222
0
    bool has_date = false;
223
0
    for (auto &c_type : sniffer_result.return_types) {
224
      // Must be ISO 8601
225
0
      if (c_type.id() == LogicalTypeId::DATE) {
226
0
        output.SetValue(8, 0, Value("%Y-%m-%d"));
227
0
        has_date = true;
228
0
      }
229
0
    }
230
0
    if (!has_date) {
231
0
      output.SetValue(8, 0, Value(nullptr));
232
0
    }
233
0
  }
234
235
  // 10. Timestamp Format
236
0
  auto timestamp_format = sniffer_options.dialect_options.date_format[LogicalType::TIMESTAMP].GetValue();
237
0
  if (!timestamp_format.Empty()) {
238
0
    output.SetValue(9, 0, timestamp_format.format_specifier);
239
0
  } else {
240
0
    output.SetValue(9, 0, Value(nullptr));
241
0
  }
242
243
  // 11. The Extra User Arguments
244
0
  if (data.options.user_defined_parameters.empty()) {
245
0
    output.SetValue(10, 0, Value());
246
0
  } else {
247
0
    output.SetValue(10, 0, Value(data.options.GetUserDefinedParameters()));
248
0
  }
249
250
  // 12. csv_read string
251
0
  std::ostringstream csv_read;
252
253
  // Base, Path and auto_detect=false
254
0
  csv_read << "FROM read_csv('" << files[0].path << "'" << separator << "auto_detect=false" << separator;
255
  // 10.1. Delimiter
256
0
  if (!sniffer_options.dialect_options.state_machine_options.delimiter.IsSetByUser()) {
257
0
    csv_read << "delim="
258
0
             << "'" << FormatOptions(sniffer_options.dialect_options.state_machine_options.delimiter.GetValue())
259
0
             << "'" << separator;
260
0
  }
261
  // 11.2. Quote
262
0
  if (!sniffer_options.dialect_options.state_machine_options.quote.IsSetByUser()) {
263
0
    csv_read << "quote="
264
0
             << "'" << FormatOptions(sniffer_options.dialect_options.state_machine_options.quote.GetValue()) << "'"
265
0
             << separator;
266
0
  }
267
  // 11.3. Escape
268
0
  if (!sniffer_options.dialect_options.state_machine_options.escape.IsSetByUser()) {
269
0
    csv_read << "escape="
270
0
             << "'" << FormatOptions(sniffer_options.dialect_options.state_machine_options.escape.GetValue()) << "'"
271
0
             << separator;
272
0
  }
273
  // 11.4. NewLine Delimiter
274
0
  if (!sniffer_options.dialect_options.state_machine_options.new_line.IsSetByUser()) {
275
0
    if (new_line_identifier != "mix") {
276
0
      csv_read << "new_line="
277
0
               << "'" << new_line_identifier << "'" << separator;
278
0
    }
279
0
  }
280
  // 11.5. Skip Rows
281
0
  if (!sniffer_options.dialect_options.skip_rows.IsSetByUser()) {
282
0
    csv_read << "skip=" << sniffer_options.dialect_options.skip_rows.GetValue() << separator;
283
0
  }
284
285
  // 11.6. Comment
286
0
  if (!sniffer_options.dialect_options.state_machine_options.comment.IsSetByUser()) {
287
0
    csv_read << "comment="
288
0
             << "'" << FormatOptions(sniffer_options.dialect_options.state_machine_options.comment.GetValue())
289
0
             << "'" << separator;
290
0
  }
291
292
  // 11.7. Has Header
293
0
  if (!sniffer_options.dialect_options.header.IsSetByUser()) {
294
0
    csv_read << "header=" << has_header << separator;
295
0
  }
296
  // 11.8. column={'col1': 'INTEGER', 'col2': 'VARCHAR'}
297
0
  csv_read << "columns=" << columns.str();
298
  // 11.9. Date Format
299
0
  if (!sniffer_options.dialect_options.date_format[LogicalType::DATE].IsSetByUser()) {
300
0
    if (!sniffer_options.dialect_options.date_format[LogicalType::DATE].GetValue().format_specifier.empty()) {
301
0
      csv_read << separator << "dateformat="
302
0
               << "'"
303
0
               << sniffer_options.dialect_options.date_format[LogicalType::DATE].GetValue().format_specifier
304
0
               << "'";
305
0
    } else {
306
0
      for (auto &c_type : sniffer_result.return_types) {
307
        // Must be ISO 8601
308
0
        if (c_type.id() == LogicalTypeId::DATE) {
309
0
          csv_read << separator << "dateformat="
310
0
                   << "'%Y-%m-%d'";
311
0
          break;
312
0
        }
313
0
      }
314
0
    }
315
0
  }
316
  // 11.10. Timestamp Format
317
0
  if (!sniffer_options.dialect_options.date_format[LogicalType::TIMESTAMP].IsSetByUser()) {
318
0
    if (!sniffer_options.dialect_options.date_format[LogicalType::TIMESTAMP].GetValue().format_specifier.empty()) {
319
0
      csv_read << separator << "timestampformat="
320
0
               << "'"
321
0
               << sniffer_options.dialect_options.date_format[LogicalType::TIMESTAMP].GetValue().format_specifier
322
0
               << "'";
323
0
    }
324
0
  }
325
  // 11.11 User Arguments
326
0
  if (!data.options.user_defined_parameters.empty()) {
327
0
    csv_read << separator << data.options.GetUserDefinedParameters();
328
0
  }
329
0
  csv_read << ");";
330
0
  output.SetValue(11, 0, csv_read.str());
331
0
  global_state.done = true;
332
0
}
333
334
9.24k
void CSVSnifferFunction::RegisterFunction(BuiltinFunctions &set) {
335
9.24k
  TableFunction csv_sniffer("sniff_csv", {LogicalType::VARCHAR}, CSVSniffFunction, CSVSniffBind, CSVSniffInitGlobal);
336
  // Accept same options as the actual csv reader
337
9.24k
  ReadCSVTableFunction::ReadCSVAddNamedParameters(csv_sniffer);
338
9.24k
  csv_sniffer.named_parameters["force_match"] = LogicalType::BOOLEAN;
339
9.24k
  set.AddFunction(csv_sniffer);
340
9.24k
}
341
} // namespace duckdb