Coverage Report

Created: 2026-03-12 06:32

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/arrow/cpp/src/parquet/column_page.h
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
// This module defines an abstract interface for iterating through pages in a
19
// Parquet column chunk within a row group. It could be extended in the future
20
// to iterate through all data pages in all chunks in a file.
21
22
#pragma once
23
24
#include <cstdint>
25
#include <memory>
26
#include <optional>
27
#include <string>
28
29
#include "parquet/size_statistics.h"
30
#include "parquet/statistics.h"
31
#include "parquet/types.h"
32
33
namespace parquet {
34
35
// TODO: Parallel processing is not yet safe because of memory-ownership
36
// semantics (the PageReader may or may not own the memory referenced by a
37
// page)
38
//
39
// TODO(wesm): In the future Parquet implementations may store the crc code
40
// in format::PageHeader. parquet-mr currently does not, so we also skip it
41
// here, both on the read and write path
42
class Page {  // Base class of DataPage and DictionaryPage: a shared Buffer plus a PageType tag.
43
 public:
44
  Page(const std::shared_ptr<Buffer>& buffer, PageType::type type)
45
1.67M
      : buffer_(buffer), type_(type) {}  // copies the shared_ptr; buffer is not null-checked
46
47
1.67M
  virtual ~Page() = default;  // virtual so derived pages can be destroyed through a Page
48
49
3.27M
  PageType::type type() const { return type_; }  // the tag given at construction
50
51
0
  std::shared_ptr<Buffer> buffer() const { return buffer_; }  // returns a shared_ptr copy
52
53
  // @returns: a pointer to the page's data (dereferences buffer_ without a null check)
54
3.24M
  const uint8_t* data() const { return buffer_->data(); }
55
56
  // @returns: the total size in bytes of the page's data buffer, static_cast to int32_t
57
3.24M
  int32_t size() const { return static_cast<int32_t>(buffer_->size()); }
58
59
 private:
60
  std::shared_ptr<Buffer> buffer_;  // page payload; holds a shared reference to the Buffer
61
  PageType::type type_;  // fixed at construction; there is no setter
62
};
63
64
/// \brief Base type for DataPageV1 and DataPageV2 including common attributes
65
class DataPage : public Page {  // constructor is protected: instantiate via a derived class
66
 public:
67
1.57M
  int32_t num_values() const { return num_values_; }  // value passed at construction
68
1.57M
  Encoding::type encoding() const { return encoding_; }  // value passed at construction
69
0
  int64_t uncompressed_size() const { return uncompressed_size_; }  // value passed at construction
70
0
  const EncodedStatistics& statistics() const { return statistics_; }  // reference to the stored member
71
  /// Return the row ordinal, within the row group, of the first row in the data page.
72
  /// Currently it is only present on data pages created by ColumnWriter in order
73
  /// to collect the page index; otherwise the optional is empty.
74
0
  std::optional<int64_t> first_row_index() const { return first_row_index_; }
75
0
  const SizeStatistics& size_statistics() const { return size_statistics_; }  // reference to the stored member
76
77
1.57M
  virtual ~DataPage() = default;
78
79
 protected:
80
  DataPage(PageType::type type, const std::shared_ptr<Buffer>& buffer, int32_t num_values,
81
           Encoding::type encoding, int64_t uncompressed_size,
82
           EncodedStatistics statistics, std::optional<int64_t> first_row_index,
83
           SizeStatistics size_statistics)  // by-value parameters are moved into the members below
84
1.57M
      : Page(buffer, type),  // note the argument order: Page takes (buffer, type)
85
1.57M
        num_values_(num_values),
86
1.57M
        encoding_(encoding),
87
1.57M
        uncompressed_size_(uncompressed_size),
88
1.57M
        statistics_(std::move(statistics)),
89
1.57M
        first_row_index_(std::move(first_row_index)),
90
1.57M
        size_statistics_(std::move(size_statistics)) {}
91
92
  int32_t num_values_;  // data members are protected, so derived classes can access them directly
93
  Encoding::type encoding_;
94
  int64_t uncompressed_size_;
95
  EncodedStatistics statistics_;
96
  /// Row ordinal, within the row group, of the first row in the data page.
97
  std::optional<int64_t> first_row_index_;
98
  SizeStatistics size_statistics_;
99
};
100
101
class DataPageV1 : public DataPage {  // DataPage tagged PageType::DATA_PAGE, plus level encodings
102
 public:
103
  DataPageV1(const std::shared_ptr<Buffer>& buffer, int32_t num_values,
104
             Encoding::type encoding, Encoding::type definition_level_encoding,
105
             Encoding::type repetition_level_encoding, int64_t uncompressed_size,
106
             EncodedStatistics statistics = EncodedStatistics(),  // default-constructed if omitted
107
             std::optional<int64_t> first_row_index = std::nullopt,  // empty if omitted
108
             SizeStatistics size_statistics = SizeStatistics())  // default-constructed if omitted
109
1.55M
      : DataPage(PageType::DATA_PAGE, buffer, num_values, encoding, uncompressed_size,
110
1.55M
                 std::move(statistics), std::move(first_row_index),
111
1.55M
                 std::move(size_statistics)),
112
1.55M
        definition_level_encoding_(definition_level_encoding),
113
1.55M
        repetition_level_encoding_(repetition_level_encoding) {}
114
115
1.55M
  Encoding::type repetition_level_encoding() const { return repetition_level_encoding_; }  // as passed to the constructor
116
117
1.55M
  Encoding::type definition_level_encoding() const { return definition_level_encoding_; }  // as passed to the constructor
118
119
 private:
120
  Encoding::type definition_level_encoding_;  // declared, and therefore initialized, before repetition_level_encoding_
121
  Encoding::type repetition_level_encoding_;
122
};
123
124
class DataPageV2 : public DataPage {  // DataPage tagged PageType::DATA_PAGE_V2, plus V2-only counters
125
 public:
126
  DataPageV2(const std::shared_ptr<Buffer>& buffer, int32_t num_values, int32_t num_nulls,
127
             int32_t num_rows, Encoding::type encoding,
128
             int32_t definition_levels_byte_length, int32_t repetition_levels_byte_length,
129
             int64_t uncompressed_size, bool is_compressed = false,  // is_compressed is false if omitted
130
             EncodedStatistics statistics = EncodedStatistics(),  // default-constructed if omitted
131
             std::optional<int64_t> first_row_index = std::nullopt,  // empty if omitted
132
             SizeStatistics size_statistics = SizeStatistics())  // default-constructed if omitted
133
21.7k
      : DataPage(PageType::DATA_PAGE_V2, buffer, num_values, encoding, uncompressed_size,
134
21.7k
                 std::move(statistics), std::move(first_row_index),
135
21.7k
                 std::move(size_statistics)),
136
21.7k
        num_nulls_(num_nulls),
137
21.7k
        num_rows_(num_rows),
138
21.7k
        definition_levels_byte_length_(definition_levels_byte_length),
139
21.7k
        repetition_levels_byte_length_(repetition_levels_byte_length),
140
21.7k
        is_compressed_(is_compressed) {}
141
142
0
  int32_t num_nulls() const { return num_nulls_; }  // as passed to the constructor
143
144
0
  int32_t num_rows() const { return num_rows_; }  // as passed to the constructor
145
146
39.2k
  int32_t definition_levels_byte_length() const { return definition_levels_byte_length_; }  // as passed to the constructor
147
148
45.5k
  int32_t repetition_levels_byte_length() const { return repetition_levels_byte_length_; }  // as passed to the constructor
149
150
0
  bool is_compressed() const { return is_compressed_; }  // as passed to the constructor (default false)
151
152
 private:
153
  int32_t num_nulls_;
154
  int32_t num_rows_;
155
  // NOTE(review): presumably the byte sizes of the level sections inside the page buffer;
156
  // the class only stores these values and never interprets them. Confirm against the reader.
  int32_t definition_levels_byte_length_;
  int32_t repetition_levels_byte_length_;
157
  bool is_compressed_;
158
};
159
160
class DictionaryPage : public Page {  // Page tagged PageType::DICTIONARY_PAGE, plus dictionary metadata
161
 public:
162
  DictionaryPage(const std::shared_ptr<Buffer>& buffer, int32_t num_values,
163
                 Encoding::type encoding, bool is_sorted = false)  // is_sorted is false if omitted
164
102k
      : Page(buffer, PageType::DICTIONARY_PAGE),
165
102k
        num_values_(num_values),
166
102k
        encoding_(encoding),
167
102k
        is_sorted_(is_sorted) {}
168
169
101k
  int32_t num_values() const { return num_values_; }  // as passed to the constructor
170
171
390k
  Encoding::type encoding() const { return encoding_; }  // as passed to the constructor
172
173
0
  bool is_sorted() const { return is_sorted_; }  // as passed to the constructor (default false)
174
175
 private:
176
  int32_t num_values_;
177
  Encoding::type encoding_;
178
  bool is_sorted_;
179
};
180
181
}  // namespace parquet