/src/duckdb/extension/parquet/reader/string_column_reader.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | #include "reader/string_column_reader.hpp" |
2 | | #include "utf8proc_wrapper.hpp" |
3 | | #include "parquet_reader.hpp" |
4 | | #include "duckdb/common/types/blob.hpp" |
5 | | |
6 | | namespace duckdb { |
7 | | |
8 | | //===--------------------------------------------------------------------===// |
9 | | // String Column Reader |
10 | | //===--------------------------------------------------------------------===// |
11 | | StringColumnReader::StringColumnReader(ParquetReader &reader, const ParquetColumnSchema &schema) |
12 | 0 | : ColumnReader(reader, schema) { |
13 | 0 | fixed_width_string_length = 0; |
14 | 0 | if (schema.parquet_type == Type::FIXED_LEN_BYTE_ARRAY) { |
15 | 0 | fixed_width_string_length = schema.type_length; |
16 | 0 | } |
17 | 0 | } |
18 | | |
19 | 0 | void StringColumnReader::VerifyString(const char *str_data, uint32_t str_len, const bool is_varchar) { |
20 | 0 | if (!is_varchar) { |
21 | 0 | return; |
22 | 0 | } |
23 | | // verify if a string is actually UTF8, and if there are no null bytes in the middle of the string |
24 | | // technically Parquet should guarantee this, but reality is often disappointing |
25 | 0 | UnicodeInvalidReason reason; |
26 | 0 | size_t pos; |
27 | 0 | auto utf_type = Utf8Proc::Analyze(str_data, str_len, &reason, &pos); |
28 | 0 | if (utf_type == UnicodeType::INVALID) { |
29 | 0 | throw InvalidInputException("Invalid string encoding found in Parquet file: value \"" + |
30 | 0 | Blob::ToString(string_t(str_data, str_len)) + "\" is not valid UTF8!"); |
31 | 0 | } |
32 | 0 | } |
33 | | |
34 | 0 | void StringColumnReader::VerifyString(const char *str_data, uint32_t str_len) { |
35 | 0 | VerifyString(str_data, str_len, Type().id() == LogicalTypeId::VARCHAR); |
36 | 0 | } |
37 | | |
38 | | class ParquetStringVectorBuffer : public VectorBuffer { |
39 | | public: |
40 | | explicit ParquetStringVectorBuffer(shared_ptr<ResizeableBuffer> buffer_p) |
41 | 0 | : VectorBuffer(VectorBufferType::OPAQUE_BUFFER), buffer(std::move(buffer_p)) { |
42 | 0 | } |
43 | | |
44 | | private: |
45 | | shared_ptr<ResizeableBuffer> buffer; |
46 | | }; |
47 | | |
48 | 0 | void StringColumnReader::ReferenceBlock(Vector &result, shared_ptr<ResizeableBuffer> &block) { |
49 | 0 | StringVector::AddBuffer(result, make_buffer<ParquetStringVectorBuffer>(block)); |
50 | 0 | } |
51 | | |
52 | | void StringColumnReader::Plain(shared_ptr<ResizeableBuffer> &plain_data, uint8_t *defines, idx_t num_values, |
53 | 0 | idx_t result_offset, Vector &result) { |
54 | 0 | ReferenceBlock(result, plain_data); |
55 | 0 | PlainTemplated<string_t, StringParquetValueConversion>(*plain_data, defines, num_values, result_offset, result); |
56 | 0 | } |
57 | | |
58 | 0 | void StringColumnReader::PlainSkip(ByteBuffer &plain_data, uint8_t *defines, idx_t num_values) { |
59 | 0 | PlainSkipTemplated<StringParquetValueConversion>(plain_data, defines, num_values); |
60 | 0 | } |
61 | | |
62 | | void StringColumnReader::PlainSelect(shared_ptr<ResizeableBuffer> &plain_data, uint8_t *defines, idx_t num_values, |
63 | 0 | Vector &result, const SelectionVector &sel, idx_t count) { |
64 | 0 | ReferenceBlock(result, plain_data); |
65 | 0 | PlainSelectTemplated<string_t, StringParquetValueConversion>(*plain_data, defines, num_values, result, sel, count); |
66 | 0 | } |
67 | | |
68 | | } // namespace duckdb |