/src/duckdb/extension/parquet/reader/struct_column_reader.cpp

Source (jump to first uncovered line)
#include "reader/struct_column_reader.hpp"

namespace duckdb {

//===--------------------------------------------------------------------===//
// Struct Column Reader
//===--------------------------------------------------------------------===//
// Constructs a struct reader: forwards `reader` and `schema` to the ColumnReader base and
// takes ownership of `child_readers_p` by moving it into `child_readers`.
// Other methods of this class skip null entries in `child_readers`, so a slot may be left unset.
StructColumnReader::StructColumnReader(ParquetReader &reader, const ParquetColumnSchema &schema,
                                       vector<unique_ptr<ColumnReader>> child_readers_p)
    : ColumnReader(reader, schema), child_readers(std::move(child_readers_p)) {
  // Sanity check: the type reported for this reader must be physically a STRUCT
  D_ASSERT(Type().InternalType() == PhysicalType::STRUCT);
}

// Returns the reader stored at position `child_idx` of `child_readers`.
// Throws InternalException when that slot holds no reader.
ColumnReader &StructColumnReader::GetChildReader(idx_t child_idx) {
  auto &slot = child_readers[child_idx];
  if (slot) {
    return *slot.get();
  }
  throw InternalException("StructColumnReader::GetChildReader(%d) - but this child reader is not set", child_idx);
}

// Forwards the InitializeRead call, with unchanged arguments, to every child reader that is set.
// Unset (null) child slots are ignored.
void StructColumnReader::InitializeRead(idx_t row_group_idx_p, const vector<ColumnChunk> &columns,
                                        TProtocol &protocol_p) {
  for (auto &reader_ptr : child_readers) {
    if (reader_ptr) {
      reader_ptr->InitializeRead(row_group_idx_p, columns, protocol_p);
    }
  }
}

// Reads up to `num_values` rows into the struct vector `result` by delegating to each child reader.
//  - Children without a reader are turned into constant NULL vectors.
//  - All children that do read must report the same row count, otherwise std::runtime_error is thrown.
//  - Rows whose define level is below MaxDefine() are marked invalid in the struct's validity mask.
// Returns the row count reported by the children, or `num_values` when no child reader is set.
// Throws InternalException if there are pending skips.
idx_t StructColumnReader::Read(uint64_t num_values, data_ptr_t define_out, data_ptr_t repeat_out, Vector &result) {
  auto &entries = StructVector::GetEntries(result);
  D_ASSERT(StructType::GetChildTypes(Type()).size() == entries.size());

  if (pending_skips > 0) {
    throw InternalException("StructColumnReader cannot have pending skips");
  }

  // A child whose values are all valid may leave "define_out" untouched,
  // so pre-fill every define level with the maximum (i.e. "valid") first
  std::fill_n(define_out, num_values, MaxDefine());

  optional_idx rows_read;
  for (idx_t child_pos = 0; child_pos < child_readers.size(); child_pos++) {
    auto &reader_ptr = child_readers[child_pos];
    auto &child_vector = *entries[child_pos];
    if (reader_ptr) {
      auto rows_from_child = reader_ptr->Read(num_values, define_out, repeat_out, child_vector);
      if (!rows_read.IsValid()) {
        // first child that actually reads determines the expected row count
        rows_read = rows_from_child;
      } else if (rows_read.GetIndex() != rows_from_child) {
        throw std::runtime_error("Struct child row count mismatch");
      }
    } else {
      // this child is not being scanned - emit a constant NULL for it
      child_vector.SetVectorType(VectorType::CONSTANT_VECTOR);
      ConstantVector::SetNull(child_vector, true);
    }
  }
  if (!rows_read.IsValid()) {
    // no child produced a count - fall back to the requested amount
    rows_read = num_values;
  }

  // derive the validity of the struct itself from the define levels
  auto &struct_validity = FlatVector::Validity(result);
  for (idx_t row = 0; row < rows_read.GetIndex(); row++) {
    if (define_out[row] < MaxDefine()) {
      struct_validity.SetInvalid(row);
    }
  }

  return rows_read.GetIndex();
}

// Asks every child reader that is set to skip `num_values` values; unset slots are ignored.
void StructColumnReader::Skip(idx_t num_values) {
  for (auto &reader_ptr : child_readers) {
    if (reader_ptr) {
      reader_ptr->Skip(num_values);
    }
  }
}

// Forwards the prefetch registration (same transport and merge flag) to every child reader that is set.
void StructColumnReader::RegisterPrefetch(ThriftFileTransport &transport, bool allow_merge) {
  for (auto &reader_ptr : child_readers) {
    if (reader_ptr) {
      reader_ptr->RegisterPrefetch(transport, allow_merge);
    }
  }
}

// Returns the sum of TotalCompressedSize() over all child readers that are set.
// Yields 0 when no child reader is set.
uint64_t StructColumnReader::TotalCompressedSize() {
  uint64_t total = 0;
  for (auto &reader_ptr : child_readers) {
    if (reader_ptr) {
      total += reader_ptr->TotalCompressedSize();
    }
  }
  return total;
}

// Classifies a logical type for use by GroupRowsAvailable():
//  - LIST and MAP yield false
//  - STRUCT yields true iff at least one of its child types (recursively) yields true
//  - every other type yields true
static bool TypeHasExactRowCount(const LogicalType &type) {
  const auto type_id = type.id();
  if (type_id == LogicalTypeId::LIST || type_id == LogicalTypeId::MAP) {
    return false;
  }
  if (type_id != LogicalTypeId::STRUCT) {
    return true;
  }
  // struct: look for any member that qualifies
  for (auto &member : StructType::GetChildTypes(type)) {
    if (TypeHasExactRowCount(member.second)) {
      return true;
    }
  }
  return false;
}

// Returns GroupRowsAvailable() of a child reader, preferring the first child whose type
// satisfies TypeHasExactRowCount(). If no child qualifies, the first child reader that is
// set is used instead. Throws InternalException when no child reader is set at all.
idx_t StructColumnReader::GroupRowsAvailable() {
  ColumnReader *first_available = nullptr;
  for (auto &reader_ptr : child_readers) {
    if (!reader_ptr) {
      continue;
    }
    if (TypeHasExactRowCount(reader_ptr->Type())) {
      return reader_ptr->GroupRowsAvailable();
    }
    if (!first_available) {
      // remember the first reader that is set, as a fallback
      first_available = reader_ptr.get();
    }
  }
  if (first_available) {
    return first_available->GroupRowsAvailable();
  }
  throw InternalException("No projected columns in struct?");
}

} // namespace duckdb

Line	Count	Source (jump to first uncovered line)
1		#include "reader/struct_column_reader.hpp"
2
3		namespace duckdb {
4
5		//===--------------------------------------------------------------------===//
6		// Struct Column Reader
7		//===--------------------------------------------------------------------===//
8		StructColumnReader::StructColumnReader(ParquetReader &reader, const ParquetColumnSchema &schema,
9		vector<unique_ptr<ColumnReader>> child_readers_p)
10	0	: ColumnReader(reader, schema), child_readers(std::move(child_readers_p)) {
11	0	D_ASSERT(Type().InternalType() == PhysicalType::STRUCT);
12	0	}
13
14	0	ColumnReader &StructColumnReader::GetChildReader(idx_t child_idx) {
15	0	if (!child_readers[child_idx]) {
16	0	throw InternalException("StructColumnReader::GetChildReader(%d) - but this child reader is not set", child_idx);
17	0	}
18	0	return *child_readers[child_idx].get();
19	0	}
20
21		void StructColumnReader::InitializeRead(idx_t row_group_idx_p, const vector<ColumnChunk> &columns,
22	0	TProtocol &protocol_p) {
23	0	for (auto &child : child_readers) {
24	0	if (!child) {
25	0	continue;
26	0	}
27	0	child->InitializeRead(row_group_idx_p, columns, protocol_p);
28	0	}
29	0	}
30
31	0	idx_t StructColumnReader::Read(uint64_t num_values, data_ptr_t define_out, data_ptr_t repeat_out, Vector &result) {
32	0	auto &struct_entries = StructVector::GetEntries(result);
33	0	D_ASSERT(StructType::GetChildTypes(Type()).size() == struct_entries.size());
34
35	0	if (pending_skips > 0) {
36	0	throw InternalException("StructColumnReader cannot have pending skips");
37	0	}
38
39		// If the child reader values are all valid, "define_out" may not be initialized at all
40		// So, we just initialize them to all be valid beforehand
41	0	std::fill_n(define_out, num_values, MaxDefine());
42
43	0	optional_idx read_count;
44	0	for (idx_t i = 0; i < child_readers.size(); i++) {
45	0	auto &child = child_readers[i];
46	0	auto &target_vector = *struct_entries[i];
47	0	if (!child) {
48		// if we are not scanning this vector - set it to NULL
49	0	target_vector.SetVectorType(VectorType::CONSTANT_VECTOR);
50	0	ConstantVector::SetNull(target_vector, true);
51	0	continue;
52	0	}
53	0	auto child_num_values = child->Read(num_values, define_out, repeat_out, target_vector);
54	0	if (!read_count.IsValid()) {
55	0	read_count = child_num_values;
56	0	} else if (read_count.GetIndex() != child_num_values) {
57	0	throw std::runtime_error("Struct child row count mismatch");
58	0	}
59	0	}
60	0	if (!read_count.IsValid()) {
61	0	read_count = num_values;
62	0	}
63		// set the validity mask for this level
64	0	auto &validity = FlatVector::Validity(result);
65	0	for (idx_t i = 0; i < read_count.GetIndex(); i++) {
66	0	if (define_out[i] < MaxDefine()) {
67	0	validity.SetInvalid(i);
68	0	}
69	0	}
70
71	0	return read_count.GetIndex();
72	0	}
73
74	0	void StructColumnReader::Skip(idx_t num_values) {
75	0	for (auto &child : child_readers) {
76	0	if (!child) {
77	0	continue;
78	0	}
79	0	child->Skip(num_values);
80	0	}
81	0	}
82
83	0	void StructColumnReader::RegisterPrefetch(ThriftFileTransport &transport, bool allow_merge) {
84	0	for (auto &child : child_readers) {
85	0	if (!child) {
86	0	continue;
87	0	}
88	0	child->RegisterPrefetch(transport, allow_merge);
89	0	}
90	0	}
91
92	0	uint64_t StructColumnReader::TotalCompressedSize() {
93	0	uint64_t size = 0;
94	0	for (auto &child : child_readers) {
95	0	if (!child) {
96	0	continue;
97	0	}
98	0	size += child->TotalCompressedSize();
99	0	}
100	0	return size;
101	0	}
102
103	0	static bool TypeHasExactRowCount(const LogicalType &type) {
104	0	switch (type.id()) {
105	0	case LogicalTypeId::LIST:
106	0	case LogicalTypeId::MAP:
107	0	return false;
108	0	case LogicalTypeId::STRUCT:
109	0	for (auto &kv : StructType::GetChildTypes(type)) {
110	0	if (TypeHasExactRowCount(kv.second)) {
111	0	return true;
112	0	}
113	0	}
114	0	return false;
115	0	default:
116	0	return true;
117	0	}
118	0	}
119
120	0	idx_t StructColumnReader::GroupRowsAvailable() {
121	0	for (auto &child : child_readers) {
122	0	if (!child) {
123	0	continue;
124	0	}
125	0	if (TypeHasExactRowCount(child->Type())) {
126	0	return child->GroupRowsAvailable();
127	0	}
128	0	}
129	0	for (auto &child : child_readers) {
130	0	if (!child) {
131	0	continue;
132	0	}
133	0	return child->GroupRowsAvailable();
134	0	}
135	0	throw InternalException("No projected columns in struct?");
136	0	}
137
138		} // namespace duckdb

Coverage Report

Created: 2025-08-28 07:58