/src/duckdb/extension/parquet/reader/string_column_reader.cpp
Line | Count | Source |
1 | | #include "reader/string_column_reader.hpp" |
2 | | #include "utf8proc_wrapper.hpp" |
3 | | #include "parquet_reader.hpp" |
4 | | #include "duckdb/common/types/blob.hpp" |
5 | | |
6 | | namespace duckdb { |
7 | | |
8 | | //===--------------------------------------------------------------------===// |
9 | | // String Column Reader |
10 | | //===--------------------------------------------------------------------===// |
11 | | StringColumnReader::StringColumnReader(const ParquetReader &reader, const ParquetColumnSchema &schema) |
12 | 0 | : ColumnReader(reader, schema), string_column_type(GetStringColumnType(Type())) { |
13 | 0 | fixed_width_string_length = 0; |
14 | 0 | if (schema.parquet_type == Type::FIXED_LEN_BYTE_ARRAY) { |
15 | 0 | fixed_width_string_length = schema.type_length; |
16 | 0 | } |
17 | 0 | } |
18 | | |
19 | 0 | bool StringColumnReader::IsValid(const char *str_data, uint32_t str_len, const bool is_varchar) { |
20 | 0 | if (!is_varchar) { |
21 | 0 | return true; |
22 | 0 | } |
23 | | // verify if a string is actually UTF8, and if there are no null bytes in the middle of the string |
24 | | // technically Parquet should guarantee this, but reality is often disappointing |
25 | 0 | UnicodeInvalidReason reason; |
26 | 0 | size_t pos; |
27 | 0 | auto utf_type = Utf8Proc::Analyze(str_data, str_len, &reason, &pos); |
28 | 0 | return utf_type != UnicodeType::INVALID; |
29 | 0 | } |
30 | | |
31 | 0 | bool StringColumnReader::IsValid(const string &str, bool is_varchar) { |
32 | 0 | return IsValid(str.c_str(), str.size(), is_varchar); |
33 | 0 | } |
34 | 0 | void StringColumnReader::VerifyString(const char *str_data, uint32_t str_len, const bool is_varchar) const { |
35 | 0 | if (!IsValid(str_data, str_len, is_varchar)) { |
36 | 0 | throw InvalidInputException( |
37 | 0 | "Invalid string encoding found in Parquet file \"%s\": value \"%s\" is not valid UTF8!", |
38 | 0 | reader.GetFileName(), Blob::ToString(string_t(str_data, str_len))); |
39 | 0 | } |
40 | 0 | } |
41 | | |
42 | 0 | void StringColumnReader::VerifyString(const char *str_data, uint32_t str_len) const { |
43 | 0 | switch (string_column_type) { |
44 | 0 | case StringColumnType::VARCHAR: |
45 | 0 | VerifyString(str_data, str_len, true); |
46 | 0 | break; |
47 | 0 | case StringColumnType::JSON: { |
48 | 0 | const auto error = StringUtil::ValidateJSON(str_data, str_len); |
49 | 0 | if (!error.empty()) { |
50 | 0 | throw InvalidInputException("Invalid JSON found in Parquet file: %s", error); |
51 | 0 | } |
52 | 0 | break; |
53 | 0 | } |
54 | 0 | default: |
55 | 0 | break; |
56 | 0 | } |
57 | 0 | } |
58 | | |
59 | | class ParquetStringVectorBuffer : public VectorBuffer { |
60 | | public: |
61 | | explicit ParquetStringVectorBuffer(shared_ptr<ResizeableBuffer> buffer_p) |
62 | 0 | : VectorBuffer(VectorBufferType::OPAQUE_BUFFER), buffer(std::move(buffer_p)) { |
63 | 0 | } |
64 | | |
65 | | private: |
66 | | shared_ptr<ResizeableBuffer> buffer; |
67 | | }; |
68 | | |
69 | 0 | void StringColumnReader::ReferenceBlock(Vector &result, shared_ptr<ResizeableBuffer> &block) { |
70 | 0 | StringVector::AddBuffer(result, make_buffer<ParquetStringVectorBuffer>(block)); |
71 | 0 | } |
72 | | |
73 | | void StringColumnReader::Plain(shared_ptr<ResizeableBuffer> &plain_data, uint8_t *defines, idx_t num_values, |
74 | 0 | idx_t result_offset, Vector &result) { |
75 | 0 | ReferenceBlock(result, plain_data); |
76 | 0 | PlainTemplated<string_t, StringParquetValueConversion>(*plain_data, defines, num_values, result_offset, result); |
77 | 0 | } |
78 | | |
79 | 0 | void StringColumnReader::PlainSkip(ByteBuffer &plain_data, uint8_t *defines, idx_t num_values) { |
80 | 0 | PlainSkipTemplated<StringParquetValueConversion>(plain_data, defines, num_values); |
81 | 0 | } |
82 | | |
83 | | void StringColumnReader::PlainSelect(shared_ptr<ResizeableBuffer> &plain_data, uint8_t *defines, idx_t num_values, |
84 | 0 | Vector &result, const SelectionVector &sel, idx_t count) { |
85 | 0 | ReferenceBlock(result, plain_data); |
86 | 0 | PlainSelectTemplated<string_t, StringParquetValueConversion>(*plain_data, defines, num_values, result, sel, count); |
87 | 0 | } |
88 | | |
89 | | } // namespace duckdb |