/src/arrow/cpp/src/parquet/column_page.h
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | // This module defines an abstract interface for iterating through pages in a |
19 | | // Parquet column chunk within a row group. It could be extended in the future |
20 | | // to iterate through all data pages in all chunks in a file. |
21 | | |
22 | | #pragma once |
23 | | |
24 | | #include <cstdint> |
25 | | #include <memory> |
26 | | #include <optional> |
27 | | #include <string> |
28 | | |
29 | | #include "parquet/size_statistics.h" |
30 | | #include "parquet/statistics.h" |
31 | | #include "parquet/types.h" |
32 | | |
33 | | namespace parquet { |
34 | | |
35 | | // TODO: Parallel processing is not yet safe because of memory-ownership |
36 | | // semantics (the PageReader may or may not own the memory referenced by a |
37 | | // page) |
38 | | // |
39 | | // TODO(wesm): In the future Parquet implementations may store the crc code |
40 | | // in format::PageHeader. parquet-mr currently does not, so we also skip it |
41 | | // here, both on the read and write path |
42 | | class Page { |
43 | | public: |
44 | | Page(const std::shared_ptr<Buffer>& buffer, PageType::type type) |
45 | 1.67M | : buffer_(buffer), type_(type) {} |
46 | | |
47 | 1.67M | virtual ~Page() = default; |
48 | | |
49 | 3.27M | PageType::type type() const { return type_; } |
50 | | |
51 | 0 | std::shared_ptr<Buffer> buffer() const { return buffer_; } |
52 | | |
53 | | // @returns: a pointer to the page's data |
54 | 3.24M | const uint8_t* data() const { return buffer_->data(); } |
55 | | |
56 | | // @returns: the total size in bytes of the page's data buffer |
57 | 3.24M | int32_t size() const { return static_cast<int32_t>(buffer_->size()); } |
58 | | |
59 | | private: |
60 | | std::shared_ptr<Buffer> buffer_; |
61 | | PageType::type type_; |
62 | | }; |
63 | | |
64 | | /// \brief Base type for DataPageV1 and DataPageV2 including common attributes |
65 | | class DataPage : public Page { |
66 | | public: |
67 | 1.57M | int32_t num_values() const { return num_values_; } |
68 | 1.57M | Encoding::type encoding() const { return encoding_; } |
69 | 0 | int64_t uncompressed_size() const { return uncompressed_size_; } |
70 | 0 | const EncodedStatistics& statistics() const { return statistics_; } |
71 | | /// Return the row ordinal within the row group to the first row in the data page. |
72 | | /// Currently it is only present from data pages created by ColumnWriter in order |
73 | | /// to collect page index. |
74 | 0 | std::optional<int64_t> first_row_index() const { return first_row_index_; } |
75 | 0 | const SizeStatistics& size_statistics() const { return size_statistics_; } |
76 | | |
77 | 1.57M | virtual ~DataPage() = default; |
78 | | |
79 | | protected: |
80 | | DataPage(PageType::type type, const std::shared_ptr<Buffer>& buffer, int32_t num_values, |
81 | | Encoding::type encoding, int64_t uncompressed_size, |
82 | | EncodedStatistics statistics, std::optional<int64_t> first_row_index, |
83 | | SizeStatistics size_statistics) |
84 | 1.57M | : Page(buffer, type), |
85 | 1.57M | num_values_(num_values), |
86 | 1.57M | encoding_(encoding), |
87 | 1.57M | uncompressed_size_(uncompressed_size), |
88 | 1.57M | statistics_(std::move(statistics)), |
89 | 1.57M | first_row_index_(std::move(first_row_index)), |
90 | 1.57M | size_statistics_(std::move(size_statistics)) {} |
91 | | |
92 | | int32_t num_values_; |
93 | | Encoding::type encoding_; |
94 | | int64_t uncompressed_size_; |
95 | | EncodedStatistics statistics_; |
96 | | /// Row ordinal within the row group to the first row in the data page. |
97 | | std::optional<int64_t> first_row_index_; |
98 | | SizeStatistics size_statistics_; |
99 | | }; |
100 | | |
101 | | class DataPageV1 : public DataPage { |
102 | | public: |
103 | | DataPageV1(const std::shared_ptr<Buffer>& buffer, int32_t num_values, |
104 | | Encoding::type encoding, Encoding::type definition_level_encoding, |
105 | | Encoding::type repetition_level_encoding, int64_t uncompressed_size, |
106 | | EncodedStatistics statistics = EncodedStatistics(), |
107 | | std::optional<int64_t> first_row_index = std::nullopt, |
108 | | SizeStatistics size_statistics = SizeStatistics()) |
109 | 1.55M | : DataPage(PageType::DATA_PAGE, buffer, num_values, encoding, uncompressed_size, |
110 | 1.55M | std::move(statistics), std::move(first_row_index), |
111 | 1.55M | std::move(size_statistics)), |
112 | 1.55M | definition_level_encoding_(definition_level_encoding), |
113 | 1.55M | repetition_level_encoding_(repetition_level_encoding) {} |
114 | | |
115 | 1.55M | Encoding::type repetition_level_encoding() const { return repetition_level_encoding_; } |
116 | | |
117 | 1.55M | Encoding::type definition_level_encoding() const { return definition_level_encoding_; } |
118 | | |
119 | | private: |
120 | | Encoding::type definition_level_encoding_; |
121 | | Encoding::type repetition_level_encoding_; |
122 | | }; |
123 | | |
124 | | class DataPageV2 : public DataPage { |
125 | | public: |
126 | | DataPageV2(const std::shared_ptr<Buffer>& buffer, int32_t num_values, int32_t num_nulls, |
127 | | int32_t num_rows, Encoding::type encoding, |
128 | | int32_t definition_levels_byte_length, int32_t repetition_levels_byte_length, |
129 | | int64_t uncompressed_size, bool is_compressed = false, |
130 | | EncodedStatistics statistics = EncodedStatistics(), |
131 | | std::optional<int64_t> first_row_index = std::nullopt, |
132 | | SizeStatistics size_statistics = SizeStatistics()) |
133 | 21.7k | : DataPage(PageType::DATA_PAGE_V2, buffer, num_values, encoding, uncompressed_size, |
134 | 21.7k | std::move(statistics), std::move(first_row_index), |
135 | 21.7k | std::move(size_statistics)), |
136 | 21.7k | num_nulls_(num_nulls), |
137 | 21.7k | num_rows_(num_rows), |
138 | 21.7k | definition_levels_byte_length_(definition_levels_byte_length), |
139 | 21.7k | repetition_levels_byte_length_(repetition_levels_byte_length), |
140 | 21.7k | is_compressed_(is_compressed) {} |
141 | | |
142 | 0 | int32_t num_nulls() const { return num_nulls_; } |
143 | | |
144 | 0 | int32_t num_rows() const { return num_rows_; } |
145 | | |
146 | 39.2k | int32_t definition_levels_byte_length() const { return definition_levels_byte_length_; } |
147 | | |
148 | 45.5k | int32_t repetition_levels_byte_length() const { return repetition_levels_byte_length_; } |
149 | | |
150 | 0 | bool is_compressed() const { return is_compressed_; } |
151 | | |
152 | | private: |
153 | | int32_t num_nulls_; |
154 | | int32_t num_rows_; |
155 | | int32_t definition_levels_byte_length_; |
156 | | int32_t repetition_levels_byte_length_; |
157 | | bool is_compressed_; |
158 | | }; |
159 | | |
160 | | class DictionaryPage : public Page { |
161 | | public: |
162 | | DictionaryPage(const std::shared_ptr<Buffer>& buffer, int32_t num_values, |
163 | | Encoding::type encoding, bool is_sorted = false) |
164 | 102k | : Page(buffer, PageType::DICTIONARY_PAGE), |
165 | 102k | num_values_(num_values), |
166 | 102k | encoding_(encoding), |
167 | 102k | is_sorted_(is_sorted) {} |
168 | | |
169 | 101k | int32_t num_values() const { return num_values_; } |
170 | | |
171 | 390k | Encoding::type encoding() const { return encoding_; } |
172 | | |
173 | 0 | bool is_sorted() const { return is_sorted_; } |
174 | | |
175 | | private: |
176 | | int32_t num_values_; |
177 | | Encoding::type encoding_; |
178 | | bool is_sorted_; |
179 | | }; |
180 | | |
181 | | } // namespace parquet |