Coverage Report

Created: 2026-03-31 07:54

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/duckdb/extension/parquet/reader/string_column_reader.cpp
Line
Count
Source
1
#include "reader/string_column_reader.hpp"
2
#include "utf8proc_wrapper.hpp"
3
#include "parquet_reader.hpp"
4
#include "duckdb/common/types/blob.hpp"
5
6
namespace duckdb {
7
8
//===--------------------------------------------------------------------===//
9
// String Column Reader
10
//===--------------------------------------------------------------------===//
11
// Builds a string column reader on top of the generic ColumnReader.
// The string column type is derived from the reader's logical type, and the fixed-width length
// is taken from the schema only when the Parquet physical type is FIXED_LEN_BYTE_ARRAY;
// for every other physical type it is set to 0.
StringColumnReader::StringColumnReader(const ParquetReader &reader, const ParquetColumnSchema &schema)
    : ColumnReader(reader, schema), string_column_type(GetStringColumnType(Type())) {
  if (schema.parquet_type == Type::FIXED_LEN_BYTE_ARRAY) {
    fixed_width_string_length = schema.type_length;
  } else {
    fixed_width_string_length = 0;
  }
}
18
19
0
bool StringColumnReader::IsValid(const char *str_data, uint32_t str_len, const bool is_varchar) {
20
0
  if (!is_varchar) {
21
0
    return true;
22
0
  }
23
  // verify if a string is actually UTF8, and if there are no null bytes in the middle of the string
24
  // technically Parquet should guarantee this, but reality is often disappointing
25
0
  UnicodeInvalidReason reason;
26
0
  size_t pos;
27
0
  auto utf_type = Utf8Proc::Analyze(str_data, str_len, &reason, &pos);
28
0
  return utf_type != UnicodeType::INVALID;
29
0
}
30
31
0
bool StringColumnReader::IsValid(const string &str, bool is_varchar) {
32
0
  return IsValid(str.c_str(), str.size(), is_varchar);
33
0
}
34
0
void StringColumnReader::VerifyString(const char *str_data, uint32_t str_len, const bool is_varchar) const {
35
0
  if (!IsValid(str_data, str_len, is_varchar)) {
36
0
    throw InvalidInputException(
37
0
        "Invalid string encoding found in Parquet file \"%s\": value \"%s\" is not valid UTF8!",
38
0
        reader.GetFileName(), Blob::ToString(string_t(str_data, str_len)));
39
0
  }
40
0
}
41
42
0
void StringColumnReader::VerifyString(const char *str_data, uint32_t str_len) const {
43
0
  switch (string_column_type) {
44
0
  case StringColumnType::VARCHAR:
45
0
    VerifyString(str_data, str_len, true);
46
0
    break;
47
0
  case StringColumnType::JSON: {
48
0
    const auto error = StringUtil::ValidateJSON(str_data, str_len);
49
0
    if (!error.empty()) {
50
0
      throw InvalidInputException("Invalid JSON found in Parquet file: %s", error);
51
0
    }
52
0
    break;
53
0
  }
54
0
  default:
55
0
    break;
56
0
  }
57
0
}
58
59
class ParquetStringVectorBuffer : public VectorBuffer {
60
public:
61
  explicit ParquetStringVectorBuffer(shared_ptr<ResizeableBuffer> buffer_p)
62
0
      : VectorBuffer(VectorBufferType::OPAQUE_BUFFER), buffer(std::move(buffer_p)) {
63
0
  }
64
65
private:
66
  shared_ptr<ResizeableBuffer> buffer;
67
};
68
69
0
// Wraps `block` in a ParquetStringVectorBuffer and registers that wrapper with `result`
// through StringVector::AddBuffer, giving the vector a shared reference to the block.
// Presumably this lets strings in `result` point into the block's memory — confirm against callers.
void StringColumnReader::ReferenceBlock(Vector &result, shared_ptr<ResizeableBuffer> &block) {
  auto block_holder = make_buffer<ParquetStringVectorBuffer>(block);
  StringVector::AddBuffer(result, std::move(block_holder));
}
72
73
// Reads plain-encoded values from `plain_data` into `result`.
// The data buffer is first registered with the result vector via ReferenceBlock, then the values
// are read through PlainTemplated using the string value conversion.
void StringColumnReader::Plain(shared_ptr<ResizeableBuffer> &plain_data, uint8_t *defines, idx_t num_values,
                               idx_t result_offset, Vector &result) {
  using Conversion = StringParquetValueConversion;

  ReferenceBlock(result, plain_data);
  auto &page_buffer = *plain_data;
  PlainTemplated<string_t, Conversion>(page_buffer, defines, num_values, result_offset, result);
}
78
79
0
// Skips `num_values` plain-encoded values in `plain_data` by delegating to PlainSkipTemplated
// with the string value conversion; no output vector is involved.
void StringColumnReader::PlainSkip(ByteBuffer &plain_data, uint8_t *defines, idx_t num_values) {
  using Conversion = StringParquetValueConversion;

  PlainSkipTemplated<Conversion>(plain_data, defines, num_values);
}
82
83
void StringColumnReader::PlainSelect(shared_ptr<ResizeableBuffer> &plain_data, uint8_t *defines, idx_t num_values,
84
0
                                     Vector &result, const SelectionVector &sel, idx_t count) {
85
0
  ReferenceBlock(result, plain_data);
86
0
  PlainSelectTemplated<string_t, StringParquetValueConversion>(*plain_data, defines, num_values, result, sel, count);
87
0
}
88
89
} // namespace duckdb