Coverage Report

Created: 2025-06-24 07:53

/src/duckdb/extension/parquet/reader/string_column_reader.cpp
Line
Count
Source (jump to first uncovered line)
1
#include "reader/string_column_reader.hpp"
2
#include "utf8proc_wrapper.hpp"
3
#include "parquet_reader.hpp"
4
#include "duckdb/common/types/blob.hpp"
5
6
namespace duckdb {
7
8
//===--------------------------------------------------------------------===//
9
// String Column Reader
10
//===--------------------------------------------------------------------===//
11
// Constructs a string column reader on top of the generic ColumnReader.
// fixed_width_string_length is set to schema.type_length when the column's physical type is
// FIXED_LEN_BYTE_ARRAY, and to 0 for every other physical type.
StringColumnReader::StringColumnReader(ParquetReader &reader, const ParquetColumnSchema &schema)
    : ColumnReader(reader, schema) {
  if (schema.parquet_type == Type::FIXED_LEN_BYTE_ARRAY) {
    fixed_width_string_length = schema.type_length;
  } else {
    fixed_width_string_length = 0;
  }
}
18
19
0
void StringColumnReader::VerifyString(const char *str_data, uint32_t str_len, const bool is_varchar) {
20
0
  if (!is_varchar) {
21
0
    return;
22
0
  }
23
  // verify if a string is actually UTF8, and if there are no null bytes in the middle of the string
24
  // technically Parquet should guarantee this, but reality is often disappointing
25
0
  UnicodeInvalidReason reason;
26
0
  size_t pos;
27
0
  auto utf_type = Utf8Proc::Analyze(str_data, str_len, &reason, &pos);
28
0
  if (utf_type == UnicodeType::INVALID) {
29
0
    throw InvalidInputException("Invalid string encoding found in Parquet file: value \"" +
30
0
                                Blob::ToString(string_t(str_data, str_len)) + "\" is not valid UTF8!");
31
0
  }
32
0
}
33
34
0
void StringColumnReader::VerifyString(const char *str_data, uint32_t str_len) {
35
0
  VerifyString(str_data, str_len, Type().id() == LogicalTypeId::VARCHAR);
36
0
}
37
38
class ParquetStringVectorBuffer : public VectorBuffer {
39
public:
40
  explicit ParquetStringVectorBuffer(shared_ptr<ResizeableBuffer> buffer_p)
41
0
      : VectorBuffer(VectorBufferType::OPAQUE_BUFFER), buffer(std::move(buffer_p)) {
42
0
  }
43
44
private:
45
  shared_ptr<ResizeableBuffer> buffer;
46
};
47
48
0
// Wraps `block` in a ParquetStringVectorBuffer and registers that buffer on `result`
// through StringVector::AddBuffer.
void StringColumnReader::ReferenceBlock(Vector &result, shared_ptr<ResizeableBuffer> &block) {
  auto block_holder = make_buffer<ParquetStringVectorBuffer>(block);
  StringVector::AddBuffer(result, std::move(block_holder));
}
51
52
// Reads plain-encoded string values into `result`.
// The data block is first registered on the result vector via ReferenceBlock, then the decoding is
// delegated to PlainTemplated with the string conversion.
void StringColumnReader::Plain(shared_ptr<ResizeableBuffer> &plain_data, uint8_t *defines, idx_t num_values,
                               idx_t result_offset, Vector &result) {
  ReferenceBlock(result, plain_data);

  auto &data_block = *plain_data;
  PlainTemplated<string_t, StringParquetValueConversion>(data_block, defines, num_values, result_offset, result);
}
57
58
0
// Skips over plain-encoded string values; no result vector is involved, the work is
// delegated entirely to PlainSkipTemplated with the string conversion.
void StringColumnReader::PlainSkip(ByteBuffer &plain_data, uint8_t *defines, idx_t num_values) {
  PlainSkipTemplated<StringParquetValueConversion>(
      plain_data, defines, num_values);
}
61
62
void StringColumnReader::PlainSelect(shared_ptr<ResizeableBuffer> &plain_data, uint8_t *defines, idx_t num_values,
63
0
                                     Vector &result, const SelectionVector &sel, idx_t count) {
64
0
  ReferenceBlock(result, plain_data);
65
0
  PlainSelectTemplated<string_t, StringParquetValueConversion>(*plain_data, defines, num_values, result, sel, count);
66
0
}
67
68
} // namespace duckdb