/src/arrow/cpp/src/parquet/properties.h
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | #pragma once |
19 | | |
20 | | #include <memory> |
21 | | #include <string> |
22 | | #include <unordered_map> |
23 | | #include <unordered_set> |
24 | | #include <utility> |
25 | | |
26 | | #include "arrow/buffer.h" |
27 | | #include "arrow/io/caching.h" |
28 | | #include "arrow/type_fwd.h" |
29 | | #include "arrow/util/compression.h" |
30 | | #include "arrow/util/type_fwd.h" |
31 | | #include "parquet/encryption/encryption.h" |
32 | | #include "parquet/exception.h" |
33 | | #include "parquet/parquet_version.h" |
34 | | #include "parquet/platform.h" |
35 | | #include "parquet/schema.h" |
36 | | #include "parquet/type_fwd.h" |
37 | | #include "parquet/types.h" |
38 | | |
39 | | namespace parquet { |
40 | | |
41 | | /// Controls the serialization format of data pages. parquet-format v2.0.0 |
42 | | /// introduced a new data page metadata type, DataPageV2, and a new serialized |
43 | | /// page structure (for example, encoded levels are no longer compressed). Prior |
44 | | /// to the completion of PARQUET-457 in 2020, this library did not implement |
45 | | /// DataPageV2 correctly, so if you use the V2 data page format, you may have |
46 | | /// forward compatibility issues (older versions of the library will be unable |
47 | | /// to read the files). Note that some Parquet implementations do not implement |
48 | | /// DataPageV2 at all. |
49 | | enum class ParquetDataPageVersion { V1, V2 }; |
50 | | |
51 | | /// Controls the level of size statistics that are written to the file. |
52 | | enum class SizeStatisticsLevel : uint8_t { |
53 | | /// No size statistics are written. |
54 | | None = 0, |
55 | | /// Only column chunk size statistics are written. |
56 | | ColumnChunk, |
57 | | /// Size statistics are written to both the column chunk and the page index. |
58 | | PageAndColumnChunk |
59 | | }; |
60 | | |
61 | | /// Align the default buffer size to a small multiple of a page size (4 * 4096). |
62 | | constexpr int64_t kDefaultBufferSize = 4096 * 4; |
63 | | |
/// Default size limit on thrift strings (see
/// ReaderProperties::thrift_string_size_limit()).
64 | | constexpr int32_t kDefaultThriftStringSizeLimit = 100 * 1000 * 1000; |
65 | | /// Structs in the thrift definition are relatively large (at least 300 bytes). |
66 | | /// This limits total memory to the same order of magnitude as |
67 | | /// kDefaultThriftStringSizeLimit. |
68 | | constexpr int32_t kDefaultThriftContainerSizeLimit = 1000 * 1000; |
69 | | |
70 | | /// PARQUET-978: Minimize footer reads by reading 64 KB from the end of the file. |
71 | | constexpr int64_t kDefaultFooterReadSize = 64 * 1024; |
72 | | |
/// Reader-side configuration: memory pool, buffered-stream settings, thrift
/// size limits, decryption properties, page checksum verification and the
/// footer read size.
73 | | class PARQUET_EXPORT ReaderProperties { |
74 | | public: |
/// Construct with the given memory pool (::arrow::default_memory_pool() when
/// omitted). All other settings start at their in-class defaults.
75 | | explicit ReaderProperties(MemoryPool* pool = ::arrow::default_memory_pool()) |
76 | 21.0k | : pool_(pool) {} |
77 | | |
/// Return the memory pool given at construction.
78 | 588k | MemoryPool* memory_pool() const { return pool_; } |
79 | | |
// Declared here; the definition is not part of this header.
// NOTE(review): presumably returns a stream over `num_bytes` bytes of `source`
// starting at offset `start` — confirm against the definition.
80 | | std::shared_ptr<ArrowInputStream> GetStream(std::shared_ptr<ArrowInputFile> source, |
81 | | int64_t start, int64_t num_bytes); |
82 | | |
83 | | /// Buffered stream reading allows the user to control the memory usage of |
84 | | /// parquet readers. This ensures that all `RandomAccessFile::ReadAt` calls are |
85 | | /// wrapped in a buffered reader that uses a fixed-size buffer (of size |
86 | | /// `buffer_size()`) instead of the full size of the ReadAt. |
87 | | /// |
88 | | /// The primary reason for this control knob is resource control, not |
89 | | /// performance. Buffered stream reading is disabled by default. |
90 | 0 | bool is_buffered_stream_enabled() const { return buffered_stream_enabled_; } |
91 | | /// Enable buffered stream reading. |
92 | 0 | void enable_buffered_stream() { buffered_stream_enabled_ = true; } |
93 | | /// Disable buffered stream reading. |
94 | 0 | void disable_buffered_stream() { buffered_stream_enabled_ = false; } |
95 | | |
/// Return / set / clear the read_dense_for_nullable flag (default false).
/// Per the comment on the member, the flag is used with a RecordReader.
96 | 0 | bool read_dense_for_nullable() const { return read_dense_for_nullable_; } |
97 | 0 | void enable_read_dense_for_nullable() { read_dense_for_nullable_ = true; } |
98 | 0 | void disable_read_dense_for_nullable() { read_dense_for_nullable_ = false; } |
99 | | |
100 | | /// Return the size of the buffered stream buffer. Default kDefaultBufferSize. |
101 | 0 | int64_t buffer_size() const { return buffer_size_; } |
102 | | /// Set the size of the buffered stream buffer in bytes. The value is not validated. |
103 | 0 | void set_buffer_size(int64_t size) { buffer_size_ = size; } |
104 | | |
105 | | /// \brief Return the size limit on thrift strings. |
106 | | /// |
107 | | /// This limit helps prevent space and time bombs in files, but may need to |
108 | | /// be increased in order to read files with especially large headers. |
109 | 1.67M | int32_t thrift_string_size_limit() const { return thrift_string_size_limit_; } |
110 | | /// Set the size limit on thrift strings. |
111 | 0 | void set_thrift_string_size_limit(int32_t size) { thrift_string_size_limit_ = size; } |
112 | | |
113 | | /// \brief Return the size limit on thrift containers. |
114 | | /// |
115 | | /// This limit helps prevent space and time bombs in files, but may need to |
116 | | /// be increased in order to read files with especially large headers. |
117 | 1.67M | int32_t thrift_container_size_limit() const { return thrift_container_size_limit_; } |
118 | | /// Set the size limit on thrift containers. |
119 | 0 | void set_thrift_container_size_limit(int32_t size) { |
120 | 0 | thrift_container_size_limit_ = size; |
121 | 0 | } |
122 | | |
123 | | /// Set the decryption properties (the shared_ptr is moved into this object). |
124 | 21.0k | void file_decryption_properties(std::shared_ptr<FileDecryptionProperties> decryption) { |
125 | 21.0k | file_decryption_properties_ = std::move(decryption); |
126 | 21.0k | } |
127 | | /// Return the decryption properties; null if none have been set. |
128 | 18.6k | const std::shared_ptr<FileDecryptionProperties>& file_decryption_properties() const { |
129 | 18.6k | return file_decryption_properties_; |
130 | 18.6k | } |
131 | | |
/// Return whether page checksum verification is enabled (default false).
132 | 1.56M | bool page_checksum_verification() const { return page_checksum_verification_; } |
/// Enable or disable page checksum verification.
133 | 0 | void set_page_checksum_verification(bool check_crc) { |
134 | 0 | page_checksum_verification_ = check_crc; |
135 | 0 | } |
136 | | |
137 | | /// Set the default read size to read the footer from a file. For high latency |
138 | | /// file systems and files with large metadata (>64KB) this can increase performance |
139 | | /// by reducing the number of round-trips to retrieve the entire file metadata. |
140 | 0 | void set_footer_read_size(size_t size) { footer_read_size_ = size; } |
/// Return the footer read size. Default kDefaultFooterReadSize.
141 | 21.0k | size_t footer_read_size() const { return footer_read_size_; } |
142 | | |
143 | | private: |
144 | | MemoryPool* pool_; |
145 | | int64_t buffer_size_ = kDefaultBufferSize; |
146 | | int32_t thrift_string_size_limit_ = kDefaultThriftStringSizeLimit; |
147 | | int32_t thrift_container_size_limit_ = kDefaultThriftContainerSizeLimit; |
148 | | bool buffered_stream_enabled_ = false; |
149 | | bool page_checksum_verification_ = false; |
150 | | // Used with a RecordReader. |
151 | | bool read_dense_for_nullable_ = false; |
152 | | size_t footer_read_size_ = kDefaultFooterReadSize; |
153 | | std::shared_ptr<FileDecryptionProperties> file_decryption_properties_; |
154 | | }; |
155 | | |
/// Return a ReaderProperties object by value. Declared here; defined out of line.
156 | | ReaderProperties PARQUET_EXPORT default_reader_properties(); |
157 | | |
// Writer-side defaults. These seed the default arguments of the
// ColumnProperties constructor and the member initializers of the
// WriterProperties::Builder default constructor.
158 | | static constexpr int64_t kDefaultDataPageSize = 1024 * 1024; |
159 | | static constexpr int64_t kDefaultMaxRowsPerPage = 20'000; |
160 | | static constexpr bool DEFAULT_IS_DICTIONARY_ENABLED = true; |
161 | | static constexpr int64_t DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT = kDefaultDataPageSize; |
162 | | static constexpr int64_t DEFAULT_WRITE_BATCH_SIZE = 1024; |
163 | | static constexpr int64_t DEFAULT_MAX_ROW_GROUP_LENGTH = 1024 * 1024; |
164 | | static constexpr bool DEFAULT_ARE_STATISTICS_ENABLED = true; |
165 | | static constexpr int64_t DEFAULT_MAX_STATISTICS_SIZE = 4096; |
166 | | static constexpr Encoding::type DEFAULT_ENCODING = Encoding::UNKNOWN; |
167 | | static const char DEFAULT_CREATED_BY[] = CREATED_BY_VERSION; |
168 | | static constexpr Compression::type DEFAULT_COMPRESSION_TYPE = Compression::UNCOMPRESSED; |
169 | | static constexpr bool DEFAULT_IS_PAGE_INDEX_ENABLED = true; |
170 | | static constexpr SizeStatisticsLevel DEFAULT_SIZE_STATISTICS_LEVEL = |
171 | | SizeStatisticsLevel::PageAndColumnChunk; |
172 | | |
/// Bloom filter sizing parameters: the expected number of distinct values
/// (ndv) and the target false-positive probability (fpp).
173 | | struct PARQUET_EXPORT BloomFilterOptions { |
174 | | /// Expected number of distinct values (NDV) in the bloom filter. |
175 | | /// |
176 | | /// Bloom filters are most effective for high-cardinality columns. A good default |
177 | | /// is to set ndv equal to the number of rows. Lower values reduce disk usage but |
178 | | /// may not be worthwhile for very small NDVs. |
179 | | /// |
180 | | /// Increasing ndv (without increasing fpp) increases disk and memory usage. |
///
/// Default: 1 << 20 (1,048,576).
181 | | int32_t ndv = 1 << 20; |
182 | | |
183 | | /// False-positive probability (FPP) of the bloom filter. |
184 | | /// |
185 | | /// Lower FPP values require more disk and memory space. For a fixed ndv, the |
186 | | /// space requirement grows roughly proportional to log(1/fpp). Recommended |
187 | | /// values are 0.1, 0.05, or 0.01. Very small values are counterproductive as |
188 | | /// the bitset may exceed the size of the actual data. Set ndv appropriately |
189 | | /// to minimize space usage. |
190 | | /// |
191 | | /// Below is a table to demonstrate estimated size using common values. |
192 | | /// |
193 | | /// | ndv | fpp | bits/key | size | |
194 | | /// |:-----------|:------|:---------|:----------| |
195 | | /// | 100,000 | 0.10 | 10.5 | 128 KiB | |
196 | | /// | 100,000 | 0.05 | 10.5 | 128 KiB | |
197 | | /// | 100,000 | 0.01 | 10.5 | 128 KiB | |
198 | | /// | 1,000,000 | 0.10 | 8.4 | 1024 KiB | |
199 | | /// | 1,000,000 | 0.05 | 8.4 | 1024 KiB | |
200 | | /// | 1,000,000 | 0.01 | 16.8 | 2048 KiB | |
201 | | /// | 10,000,000 | 0.10 | 6.7 | 8192 KiB | |
202 | | /// | 10,000,000 | 0.05 | 13.4 | 16384 KiB | |
203 | | /// | 10,000,000 | 0.01 | 13.4 | 16384 KiB | |
///
/// Default: 0.05. ColumnProperties::set_bloom_filter_options() throws for
/// values <= 0.0 or >= 1.0.
204 | | double fpp = 0.05; |
205 | | }; |
206 | | |
/// Per-column writer settings: encoding, compression codec and codec options,
/// dictionary / statistics / page-index switches, maximum statistics size and
/// optional bloom filter options.
//
// NOTE(review): std::optional is used below, but <optional> is not among this
// header's direct #includes; presumably it arrives transitively — consider
// including it explicitly.
207 | | class PARQUET_EXPORT ColumnProperties { |
208 | | public: |
/// Construct with explicit settings; each argument defaults to the matching
/// DEFAULT_* constant. Codec options start null and bloom filter options
/// start unset.
209 | | ColumnProperties(Encoding::type encoding = DEFAULT_ENCODING, |
210 | | Compression::type codec = DEFAULT_COMPRESSION_TYPE, |
211 | | bool dictionary_enabled = DEFAULT_IS_DICTIONARY_ENABLED, |
212 | | bool statistics_enabled = DEFAULT_ARE_STATISTICS_ENABLED, |
213 | | size_t max_stats_size = DEFAULT_MAX_STATISTICS_SIZE, |
214 | | bool page_index_enabled = DEFAULT_IS_PAGE_INDEX_ENABLED) |
215 | | : encoding_(encoding), |
216 | | codec_(codec), |
217 | | dictionary_enabled_(dictionary_enabled), |
218 | | statistics_enabled_(statistics_enabled), |
219 | | max_stats_size_(max_stats_size), |
220 | 0 | page_index_enabled_(page_index_enabled) {} |
221 | | |
/// Set the encoding. No validation is performed here.
222 | 0 | void set_encoding(Encoding::type encoding) { encoding_ = encoding; } |
223 | | |
/// Set the compression codec.
224 | 0 | void set_compression(Compression::type codec) { codec_ = codec; } |
225 | | |
/// Enable or disable dictionary encoding.
226 | 0 | void set_dictionary_enabled(bool dictionary_enabled) { |
227 | 0 | dictionary_enabled_ = dictionary_enabled; |
228 | 0 | } |
229 | | |
/// Enable or disable statistics.
230 | 0 | void set_statistics_enabled(bool statistics_enabled) { |
231 | 0 | statistics_enabled_ = statistics_enabled; |
232 | 0 | } |
233 | | |
/// Set the maximum statistics size.
234 | 0 | void set_max_statistics_size(size_t max_stats_size) { |
235 | 0 | max_stats_size_ = max_stats_size; |
236 | 0 | } |
237 | | |
/// Set the compression level, creating a default CodecOptions object first
/// if none is attached yet.
///
/// The level is written through the shared_ptr, so any other holder of the
/// same CodecOptions object (a copy of this ColumnProperties, or the caller
/// that passed it to set_codec_options()) observes the change as well.
238 | 0 | void set_compression_level(int compression_level) { |
239 | 0 | if (!codec_options_) { |
240 | 0 | codec_options_ = std::make_shared<CodecOptions>(); |
241 | 0 | } |
242 | 0 | codec_options_->compression_level = compression_level; |
243 | 0 | } |
244 | | |
/// Replace the codec options; the pointer is shared, not deep-copied.
245 | 0 | void set_codec_options(const std::shared_ptr<CodecOptions>& codec_options) { |
246 | 0 | codec_options_ = codec_options; |
247 | 0 | } |
248 | | |
/// Enable or disable the page index.
249 | 0 | void set_page_index_enabled(bool page_index_enabled) { |
250 | 0 | page_index_enabled_ = page_index_enabled; |
251 | 0 | } |
252 | | |
/// Set the bloom filter options, which also makes bloom_filter_enabled()
/// return true.
///
/// \throws ParquetException if `fpp` is not strictly between 0.0 and 1.0
/// (NaN included).
253 | 0 | void set_bloom_filter_options(const BloomFilterOptions& bloom_filter_options) { |
// Written as a negated in-range test so that NaN, for which every ordered
// comparison is false, is rejected instead of slipping through.
254 | 0 | if (!(bloom_filter_options.fpp > 0.0 && bloom_filter_options.fpp < 1.0)) { |
255 | 0 | throw ParquetException( |
256 | 0 | "Bloom filter false positive probability must be in (0.0, 1.0), got " + |
257 | 0 | std::to_string(bloom_filter_options.fpp)); |
258 | 0 | } |
259 | 0 | bloom_filter_options_ = bloom_filter_options; |
260 | 0 | } |
261 | | |
/// Return the encoding.
262 | 0 | Encoding::type encoding() const { return encoding_; } |
263 | | |
/// Return the compression codec.
264 | 0 | Compression::type compression() const { return codec_; } |
265 | | |
/// Return whether dictionary encoding is enabled.
266 | 0 | bool dictionary_enabled() const { return dictionary_enabled_; } |
267 | | |
/// Return whether statistics are enabled.
268 | 0 | bool statistics_enabled() const { return statistics_enabled_; } |
269 | | |
/// Return the maximum statistics size.
270 | 0 | size_t max_statistics_size() const { return max_stats_size_; } |
271 | | |
/// Return the compression level, or
/// ::arrow::util::kUseDefaultCompressionLevel when no codec options are set.
272 | 0 | int compression_level() const { |
273 | 0 | if (!codec_options_) { |
274 | 0 | return ::arrow::util::kUseDefaultCompressionLevel; |
275 | 0 | } |
276 | 0 | return codec_options_->compression_level; |
277 | 0 | } |
278 | | |
/// Return the codec options; null if none have been set.
279 | 0 | const std::shared_ptr<CodecOptions>& codec_options() const { return codec_options_; } |
280 | | |
/// Return whether the page index is enabled.
281 | 0 | bool page_index_enabled() const { return page_index_enabled_; } |
282 | | |
/// Return a copy of the bloom filter options; empty if none have been set.
283 | 0 | std::optional<BloomFilterOptions> bloom_filter_options() const { |
284 | 0 | return bloom_filter_options_; |
285 | 0 | } |
286 | | |
/// Return true once bloom filter options have been set.
287 | 0 | bool bloom_filter_enabled() const { return bloom_filter_options_.has_value(); } |
288 | | |
289 | | private: |
290 | | Encoding::type encoding_; |
291 | | Compression::type codec_; |
292 | | bool dictionary_enabled_; |
293 | | bool statistics_enabled_; |
294 | | size_t max_stats_size_; |
295 | | std::shared_ptr<CodecOptions> codec_options_; |
296 | | bool page_index_enabled_; |
297 | | std::optional<BloomFilterOptions> bloom_filter_options_; |
298 | | }; |
299 | | |
300 | | /// EXPERIMENTAL: Options for content-defined chunking. |
301 | | /// |
302 | | /// Content-defined chunking is an experimental feature that optimizes parquet |
303 | | /// files for content addressable storage (CAS) systems by writing data pages |
304 | | /// according to content-defined chunk boundaries. This allows for more |
305 | | /// efficient deduplication of data across files, hence more efficient network |
306 | | /// transfers and storage. |
307 | | /// Each content-defined chunk is written as a separate parquet data page. The |
308 | | /// following options control the chunks' size and the chunking process. Note |
309 | | /// that the chunk size is calculated based on the logical value of the data, |
310 | | /// before any encoding or compression is applied. |
311 | | struct PARQUET_EXPORT CdcOptions { |
312 | | /// Minimum chunk size in bytes, default is 256 KiB. |
313 | | /// The rolling hash will not be updated until this size is reached for each chunk. |
314 | | /// Note that all data sent through the hash function is counted towards the chunk |
315 | | /// size, including definition and repetition levels if present. |
316 | | int64_t min_chunk_size = 256 * 1024; |
317 | | /// Maximum chunk size in bytes, default is 1024 KiB. |
318 | | /// The chunker will create a new chunk whenever the chunk size exceeds this value. |
319 | | /// Note that the parquet writer has a related `pagesize` property that controls |
320 | | /// the maximum size of a parquet data page after encoding. While setting |
321 | | /// `pagesize` to a smaller value than `max_chunk_size` doesn't affect the |
322 | | /// chunking effectiveness, it results in more small parquet data pages. |
323 | | int64_t max_chunk_size = 1024 * 1024; |
324 | | /// Number of bits by which to adjust the gearhash mask in order to center the chunk |
325 | | /// size around the average size more aggressively, default is 0. |
326 | | /// Increasing the normalization level increases the probability of finding a chunk, |
327 | | /// improving the deduplication ratio, but also increasing the number of small chunks |
328 | | /// resulting in many small parquet data pages. The default value provides a good |
329 | | /// balance between deduplication ratio and fragmentation. |
330 | | /// Use norm_level=1 or norm_level=2 to reach a higher deduplication ratio at the |
331 | | /// expense of fragmentation. Negative values can also be used to reduce the |
332 | | /// probability of finding a chunk, resulting in larger chunks and fewer data pages. |
333 | | /// Note that values outside [-3, 3] are not recommended; prefer using the default |
334 | | /// value of 0 for most use cases. |
335 | | int norm_level = 0; |
336 | | }; |
337 | | |
338 | | class PARQUET_EXPORT WriterProperties { |
339 | | public: |
340 | | class PARQUET_EXPORT Builder { |
341 | | public: |
/// Start from the library defaults: the Arrow default memory pool, the
/// DEFAULT_* / kDefault* size and length limits, file version PARQUET_2_6,
/// V1 data pages, DEFAULT_CREATED_BY, store_decimal_as_integer off, page
/// checksums off, DEFAULT_SIZE_STATISTICS_LEVEL, and content-defined chunking
/// off with value-initialized chunking options.
342 | | Builder() |
343 | | : pool_(::arrow::default_memory_pool()), |
344 | | dictionary_pagesize_limit_(DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT), |
345 | | write_batch_size_(DEFAULT_WRITE_BATCH_SIZE), |
346 | | max_row_group_length_(DEFAULT_MAX_ROW_GROUP_LENGTH), |
347 | | pagesize_(kDefaultDataPageSize), |
348 | | max_rows_per_page_(kDefaultMaxRowsPerPage), |
349 | | version_(ParquetVersion::PARQUET_2_6), |
350 | | data_page_version_(ParquetDataPageVersion::V1), |
351 | | created_by_(DEFAULT_CREATED_BY), |
352 | | store_decimal_as_integer_(false), |
353 | | page_checksum_enabled_(false), |
354 | | size_statistics_level_(DEFAULT_SIZE_STATISTICS_LEVEL), |
355 | | content_defined_chunking_enabled_(false), |
356 | 0 | content_defined_chunking_options_({}) {} |
357 | | |
/// Start from the settings of an existing WriterProperties. Each file-level
/// setting is read through the corresponding accessor on `properties`, and
/// CopyColumnSpecificProperties(properties) is then called from the body.
// NOTE(review): CopyColumnSpecificProperties is defined outside this view;
// presumably it copies the per-column overrides — confirm.
358 | | explicit Builder(const WriterProperties& properties) |
359 | | : pool_(properties.memory_pool()), |
360 | | dictionary_pagesize_limit_(properties.dictionary_pagesize_limit()), |
361 | | write_batch_size_(properties.write_batch_size()), |
362 | | max_row_group_length_(properties.max_row_group_length()), |
363 | | pagesize_(properties.data_pagesize()), |
364 | | max_rows_per_page_(properties.max_rows_per_page()), |
365 | | version_(properties.version()), |
366 | | data_page_version_(properties.data_page_version()), |
367 | | created_by_(properties.created_by()), |
368 | | store_decimal_as_integer_(properties.store_decimal_as_integer()), |
369 | | page_checksum_enabled_(properties.page_checksum_enabled()), |
370 | | size_statistics_level_(properties.size_statistics_level()), |
371 | | sorting_columns_(properties.sorting_columns()), |
372 | | default_column_properties_(properties.default_column_properties()), |
373 | | content_defined_chunking_enabled_( |
374 | | properties.content_defined_chunking_enabled()), |
375 | | content_defined_chunking_options_( |
376 | 0 | properties.content_defined_chunking_options()) { |
377 | 0 | CopyColumnSpecificProperties(properties); |
378 | 0 | } |
379 | | |
380 | | /// \brief EXPERIMENTAL: Use content-defined page chunking for all columns. |
381 | | /// |
382 | | /// Optimize parquet files for content addressable storage (CAS) systems by writing |
383 | | /// data pages according to content-defined chunk boundaries. This allows for more |
384 | | /// efficient deduplication of data across files, hence more efficient network |
385 | | /// transfers and storage. The chunking is based on a rolling hash algorithm that |
386 | | /// identifies chunk boundaries based on the actual content of the data. |
387 | | /// |
388 | | /// Note that only the WriteArrow() interface is supported at the moment. |
389 | 0 | Builder* enable_content_defined_chunking() { |
390 | 0 | content_defined_chunking_enabled_ = true; |
391 | 0 | return this; |
392 | 0 | } |
393 | | |
394 | | /// \brief EXPERIMENTAL: Disable content-defined page chunking for all columns. |
395 | 0 | Builder* disable_content_defined_chunking() { |
396 | 0 | content_defined_chunking_enabled_ = false; |
397 | 0 | return this; |
398 | 0 | } |
399 | | |
400 | | /// \brief EXPERIMENTAL: Specify content-defined chunking options, see CdcOptions. |
401 | 0 | Builder* content_defined_chunking_options(const CdcOptions& options) { |
402 | 0 | content_defined_chunking_options_ = options; |
403 | 0 | return this; |
404 | 0 | } |
405 | | |
406 | | /// Specify the memory pool for the writer. Default default_memory_pool. |
407 | 0 | Builder* memory_pool(MemoryPool* pool) { |
408 | 0 | pool_ = pool; |
409 | 0 | return this; |
410 | 0 | } |
411 | | |
412 | | /// Enable dictionary encoding in general for all columns. Default |
413 | | /// enabled. |
414 | 0 | Builder* enable_dictionary() { |
415 | 0 | default_column_properties_.set_dictionary_enabled(true); |
416 | 0 | return this; |
417 | 0 | } |
418 | | |
419 | | /// Disable dictionary encoding in general for all columns. Default |
420 | | /// enabled. |
421 | 0 | Builder* disable_dictionary() { |
422 | 0 | default_column_properties_.set_dictionary_enabled(false); |
423 | 0 | return this; |
424 | 0 | } |
425 | | |
426 | | /// Enable dictionary encoding for column specified by `path`. Default |
427 | | /// enabled. |
428 | 0 | Builder* enable_dictionary(const std::string& path) { |
429 | 0 | dictionary_enabled_[path] = true; |
430 | 0 | return this; |
431 | 0 | } |
432 | | |
433 | | /// Enable dictionary encoding for column specified by `path`. Default |
434 | | /// enabled. |
435 | 0 | Builder* enable_dictionary(const std::shared_ptr<schema::ColumnPath>& path) { |
436 | 0 | return this->enable_dictionary(path->ToDotString()); |
437 | 0 | } |
438 | | |
439 | | /// Disable dictionary encoding for column specified by `path`. Default |
440 | | /// enabled. |
441 | 0 | Builder* disable_dictionary(const std::string& path) { |
442 | 0 | dictionary_enabled_[path] = false; |
443 | 0 | return this; |
444 | 0 | } |
445 | | |
446 | | /// Disable dictionary encoding for column specified by `path`. Default |
447 | | /// enabled. |
448 | 0 | Builder* disable_dictionary(const std::shared_ptr<schema::ColumnPath>& path) { |
449 | 0 | return this->disable_dictionary(path->ToDotString()); |
450 | 0 | } |
451 | | |
452 | | /// Specify the dictionary page size limit per row group. Default 1MB. |
453 | 0 | Builder* dictionary_pagesize_limit(int64_t dictionary_psize_limit) { |
454 | 0 | dictionary_pagesize_limit_ = dictionary_psize_limit; |
455 | 0 | return this; |
456 | 0 | } |
457 | | |
458 | | /// Specify the write batch size while writing batches of Arrow values |
459 | | /// into Parquet. Default 1024. |
460 | 0 | Builder* write_batch_size(int64_t write_batch_size) { |
461 | 0 | write_batch_size_ = write_batch_size; |
462 | 0 | return this; |
463 | 0 | } |
464 | | |
465 | | /// Specify the max number of rows to put in a single row group. |
466 | | /// Default 1Mi rows. |
467 | 0 | Builder* max_row_group_length(int64_t max_row_group_length) { |
468 | 0 | max_row_group_length_ = max_row_group_length; |
469 | 0 | return this; |
470 | 0 | } |
471 | | |
472 | | /// Specify the data page size. |
473 | | /// Default 1MB. |
474 | 0 | Builder* data_pagesize(int64_t pg_size) { |
475 | 0 | pagesize_ = pg_size; |
476 | 0 | return this; |
477 | 0 | } |
478 | | |
479 | | /// Specify the maximum number of rows per data page. |
480 | | /// Default 20K rows. |
481 | 0 | Builder* max_rows_per_page(int64_t max_rows) { |
482 | 0 | max_rows_per_page_ = max_rows; |
483 | 0 | return this; |
484 | 0 | } |
485 | | |
486 | | /// Specify the data page version. |
487 | | /// Default V1. |
488 | 0 | Builder* data_page_version(ParquetDataPageVersion data_page_version) { |
489 | 0 | data_page_version_ = data_page_version; |
490 | 0 | return this; |
491 | 0 | } |
492 | | |
493 | | /// Specify the Parquet file version. |
494 | | /// Default PARQUET_2_6. |
495 | 0 | Builder* version(ParquetVersion::type version) { |
496 | 0 | version_ = version; |
497 | 0 | return this; |
498 | 0 | } |
499 | | |
/// Specify the created_by string.
/// Default DEFAULT_CREATED_BY.
500 | 0 | Builder* created_by(const std::string& created_by) { |
501 | 0 | created_by_ = created_by; |
502 | 0 | return this; |
503 | 0 | } |
504 | | |
/// Enable page checksums.
/// Default disabled.
505 | 0 | Builder* enable_page_checksum() { |
506 | 0 | page_checksum_enabled_ = true; |
507 | 0 | return this; |
508 | 0 | } |
509 | | |
/// Disable page checksums.
/// Default disabled.
510 | 0 | Builder* disable_page_checksum() { |
511 | 0 | page_checksum_enabled_ = false; |
512 | 0 | return this; |
513 | 0 | } |
514 | | |
515 | | /// \brief Define the encoding that is used when we don't utilise dictionary encoding. |
516 | | /// |
517 | | /// This is only applied if dictionary encoding is disabled. If the dictionary grows |
518 | | /// too large we always fall back to the PLAIN encoding. |
///
/// This overload sets the encoding on the default column properties.
///
/// \throws ParquetException if `encoding_type` is PLAIN_DICTIONARY or
/// RLE_DICTIONARY.
519 | 0 | Builder* encoding(Encoding::type encoding_type) { |
520 | 0 | if (encoding_type == Encoding::PLAIN_DICTIONARY || |
521 | 0 | encoding_type == Encoding::RLE_DICTIONARY) { |
522 | 0 | throw ParquetException("Can't use dictionary encoding as fallback encoding"); |
523 | 0 | } |
524 | 0 |

525 | 0 | default_column_properties_.set_encoding(encoding_type); |
526 | 0 | return this; |
527 | 0 | } |
528 | | |
529 | | /// \brief Define the encoding that is used when we don't utilise dictionary encoding. |
530 | | /// |
531 | | /// This is only applied if dictionary encoding is disabled. If the dictionary grows |
532 | | /// too large we always fall back to the PLAIN encoding. |
///
/// This overload records the encoding for the column specified by `path`.
///
/// \throws ParquetException if `encoding_type` is PLAIN_DICTIONARY or
/// RLE_DICTIONARY.
533 | 0 | Builder* encoding(const std::string& path, Encoding::type encoding_type) { |
534 | 0 | if (encoding_type == Encoding::PLAIN_DICTIONARY || |
535 | 0 | encoding_type == Encoding::RLE_DICTIONARY) { |
536 | 0 | throw ParquetException("Can't use dictionary encoding as fallback encoding"); |
537 | 0 | } |
538 | | |
539 | 0 | encodings_[path] = encoding_type; |
540 | 0 | return this; |
541 | 0 | } |
542 | | |
543 | | /// \brief Define the encoding that is used when we don't utilise dictionary encoding. |
544 | | /// |
545 | | /// This is only applied if dictionary encoding is disabled. If the dictionary grows |
546 | | /// too large we always fall back to the PLAIN encoding. |
///
/// Forwards to the string overload using `path->ToDotString()`; `path` is
/// dereferenced without a null check.
///
/// \throws ParquetException if `encoding_type` is PLAIN_DICTIONARY or
/// RLE_DICTIONARY.
547 | | Builder* encoding(const std::shared_ptr<schema::ColumnPath>& path, |
548 | 0 | Encoding::type encoding_type) { |
549 | 0 | return this->encoding(path->ToDotString(), encoding_type); |
550 | 0 | } |
551 | | |
552 | | /// Specify compression codec in general for all columns. |
553 | | /// Default UNCOMPRESSED. |
554 | 0 | Builder* compression(Compression::type codec) { |
555 | 0 | default_column_properties_.set_compression(codec); |
556 | 0 | return this; |
557 | 0 | } |
558 | | |
559 | | /// Specify max statistics size to store min max value. |
560 | | /// Default 4KB. |
561 | 0 | Builder* max_statistics_size(size_t max_stats_sz) { |
562 | 0 | default_column_properties_.set_max_statistics_size(max_stats_sz); |
563 | 0 | return this; |
564 | 0 | } |
565 | | |
566 | | /// Specify compression codec for the column specified by `path`. |
567 | | /// Default UNCOMPRESSED. |
568 | 0 | Builder* compression(const std::string& path, Compression::type codec) { |
569 | 0 | codecs_[path] = codec; |
570 | 0 | return this; |
571 | 0 | } |
572 | | |
573 | | /// Specify compression codec for the column specified by `path`. |
574 | | /// Default UNCOMPRESSED. |
575 | | Builder* compression(const std::shared_ptr<schema::ColumnPath>& path, |
576 | 0 | Compression::type codec) { |
577 | 0 | return this->compression(path->ToDotString(), codec); |
578 | 0 | } |
579 | | |
580 | | /// \brief Specify the default compression level for the compressor in |
581 | | /// every column. In case a column does not have an explicitly specified |
582 | | /// compression level, the default one would be used. |
583 | | /// |
584 | | /// The provided compression level is compressor specific. The user would |
585 | | /// have to familiarize oneself with the available levels for the selected |
586 | | /// compressor. If the compressor does not allow for selecting different |
587 | | /// compression levels, calling this function would not have any effect. |
588 | | /// Parquet and Arrow do not validate the passed compression level. If no |
589 | | /// level is selected by the user or if the special |
590 | | /// std::numeric_limits<int>::min() value is passed, then Arrow selects the |
591 | | /// compression level. |
592 | | /// |
593 | | /// If other compressor-specific options need to be set in addition to the compression |
594 | | /// level, use the codec_options method. |
595 | 0 | Builder* compression_level(int compression_level) { |
596 | 0 | default_column_properties_.set_compression_level(compression_level); |
597 | 0 | return this; |
598 | 0 | } |
599 | | |
600 | | /// \brief Specify a compression level for the compressor for the column |
601 | | /// described by path. |
602 | | /// |
603 | | /// The provided compression level is compressor specific. The user would |
604 | | /// have to familiarize oneself with the available levels for the selected |
605 | | /// compressor. If the compressor does not allow for selecting different |
606 | | /// compression levels, calling this function would not have any effect. |
607 | | /// Parquet and Arrow do not validate the passed compression level. If no |
608 | | /// level is selected by the user or if the special |
609 | | /// std::numeric_limits<int>::min() value is passed, then Arrow selects the |
610 | | /// compression level. |
///
/// If no codec options exist yet for `path`, a default CodecOptions object is
/// created first. If options were previously supplied through
/// codec_options(path, ...), the level is written into that same shared
/// object, so the caller's CodecOptions instance is modified too.
611 | 0 | Builder* compression_level(const std::string& path, int compression_level) { |
612 | 0 | if (!codec_options_[path]) { |
613 | 0 | codec_options_[path] = std::make_shared<CodecOptions>(); |
614 | 0 | } |
615 | 0 | codec_options_[path]->compression_level = compression_level; |
616 | 0 | return this; |
617 | 0 | } |
618 | | |
619 | | /// \brief Specify a compression level for the compressor for the column |
620 | | /// described by path. |
621 | | /// |
622 | | /// The provided compression level is compressor specific. The user would |
623 | | /// have to familiarize oneself with the available levels for the selected |
624 | | /// compressor. If the compressor does not allow for selecting different |
625 | | /// compression levels, calling this function would not have any effect. |
626 | | /// Parquet and Arrow do not validate the passed compression level. If no |
627 | | /// level is selected by the user or if the special |
628 | | /// std::numeric_limits<int>::min() value is passed, then Arrow selects the |
629 | | /// compression level. |
630 | | Builder* compression_level(const std::shared_ptr<schema::ColumnPath>& path, |
631 | 0 | int compression_level) { |
632 | 0 | return this->compression_level(path->ToDotString(), compression_level); |
633 | 0 | } |
634 | | |
635 | | /// \brief Specify the default codec options for the compressor in |
636 | | /// every column. |
637 | | /// |
638 | | /// The codec options allow configuring the compression level as well |
639 | | /// as other codec-specific options. |
640 | | Builder* codec_options( |
641 | 0 | const std::shared_ptr<::arrow::util::CodecOptions>& codec_options) { |
642 | 0 | default_column_properties_.set_codec_options(codec_options); |
643 | 0 | return this; |
644 | 0 | } |
645 | | |
646 | | /// \brief Specify the codec options for the compressor for the column |
647 | | /// described by path. |
648 | | Builder* codec_options( |
649 | | const std::string& path, |
650 | 0 | const std::shared_ptr<::arrow::util::CodecOptions>& codec_options) { |
651 | 0 | codec_options_[path] = codec_options; |
652 | 0 | return this; |
653 | 0 | } |
654 | | |
655 | | /// \brief Specify the codec options for the compressor for the column |
656 | | /// described by path. |
657 | | Builder* codec_options( |
658 | | const std::shared_ptr<schema::ColumnPath>& path, |
659 | 0 | const std::shared_ptr<::arrow::util::CodecOptions>& codec_options) { |
660 | 0 | return this->codec_options(path->ToDotString(), codec_options); |
661 | 0 | } |
662 | | |
663 | | /// Define the file encryption properties. |
664 | | /// Default NULL. |
665 | | Builder* encryption( |
666 | 0 | std::shared_ptr<FileEncryptionProperties> file_encryption_properties) { |
667 | 0 | file_encryption_properties_ = std::move(file_encryption_properties); |
668 | 0 | return this; |
669 | 0 | } |
670 | | |
671 | | /// Enable statistics in general. |
672 | | /// Default enabled. |
673 | 0 | Builder* enable_statistics() { |
674 | 0 | default_column_properties_.set_statistics_enabled(true); |
675 | 0 | return this; |
676 | 0 | } |
677 | | |
678 | | /// Disable statistics in general. |
679 | | /// Default enabled. |
680 | 0 | Builder* disable_statistics() { |
681 | 0 | default_column_properties_.set_statistics_enabled(false); |
682 | 0 | return this; |
683 | 0 | } |
684 | | |
685 | | /// Enable statistics for the column specified by `path`. |
686 | | /// Default enabled. |
687 | 0 | Builder* enable_statistics(const std::string& path) { |
688 | 0 | statistics_enabled_[path] = true; |
689 | 0 | return this; |
690 | 0 | } |
691 | | |
692 | | /// Enable statistics for the column specified by `path`. |
693 | | /// Default enabled. |
694 | 0 | Builder* enable_statistics(const std::shared_ptr<schema::ColumnPath>& path) { |
695 | 0 | return this->enable_statistics(path->ToDotString()); |
696 | 0 | } |
697 | | |
698 | | /// Define the sorting columns. |
699 | | /// Default empty. |
700 | | /// |
701 | | /// If sorting columns are set, user should ensure that records |
702 | | /// are sorted by sorting columns. Otherwise, the storing data |
703 | | /// will be inconsistent with sorting_columns metadata. |
704 | 0 | Builder* set_sorting_columns(std::vector<SortingColumn> sorting_columns) { |
705 | 0 | sorting_columns_ = std::move(sorting_columns); |
706 | 0 | return this; |
707 | 0 | } |
708 | | |
709 | | /// Disable statistics for the column specified by `path`. |
710 | | /// Default enabled. |
711 | 0 | Builder* disable_statistics(const std::string& path) { |
712 | 0 | statistics_enabled_[path] = false; |
713 | 0 | return this; |
714 | 0 | } |
715 | | |
716 | | /// Disable statistics for the column specified by `path`. |
717 | | /// Default enabled. |
718 | 0 | Builder* disable_statistics(const std::shared_ptr<schema::ColumnPath>& path) { |
719 | 0 | return this->disable_statistics(path->ToDotString()); |
720 | 0 | } |
721 | | |
722 | | /// Disable bloom filter for the column specified by `path`. |
723 | | /// Default disabled. |
724 | 0 | Builder* disable_bloom_filter(const std::string& path) { |
725 | 0 | bloom_filter_options_.erase(path); |
726 | 0 | return this; |
727 | 0 | } |
728 | | |
729 | | /// Disable bloom filter for the column specified by `path`. |
730 | | /// Default disabled. |
731 | 0 | Builder* disable_bloom_filter(const std::shared_ptr<schema::ColumnPath>& path) { |
732 | 0 | return this->disable_bloom_filter(path->ToDotString()); |
733 | 0 | } |
734 | | |
735 | | /// Enable bloom filter for the column specified by `path`. |
736 | | /// |
737 | | /// Default disabled. |
738 | | /// |
739 | | /// \note Bloom filter is not supported for boolean columns. ParquetException will |
740 | | /// be thrown during write if the column is of boolean type. |
741 | | Builder* enable_bloom_filter(const std::string& path, |
742 | 0 | const BloomFilterOptions& bloom_filter_options) { |
743 | 0 | bloom_filter_options_[path] = bloom_filter_options; |
744 | 0 | return this; |
745 | 0 | } |
746 | | |
747 | | /// Enable bloom filter for the column specified by `path`. |
748 | | /// |
749 | | /// Default disabled. |
750 | | /// |
751 | | /// \note Bloom filter is not supported for boolean columns. ParquetException will |
752 | | /// be thrown during write if the column is of boolean type. |
753 | | Builder* enable_bloom_filter(const std::shared_ptr<schema::ColumnPath>& path, |
754 | 0 | const BloomFilterOptions& bloom_filter_options) { |
755 | 0 | return this->enable_bloom_filter(path->ToDotString(), bloom_filter_options); |
756 | 0 | } |
757 | | |
758 | | /// Allow decimals with 1 <= precision <= 18 to be stored as integers. |
759 | | /// |
760 | | /// In Parquet, DECIMAL can be stored in any of the following physical types: |
761 | | /// - int32: for 1 <= precision <= 9. |
762 | | /// - int64: for 10 <= precision <= 18. |
763 | | /// - fixed_len_byte_array: precision is limited by the array size. |
764 | | /// Length n can store <= floor(log_10(2^(8*n - 1) - 1)) base-10 digits. |
765 | | /// - binary: precision is unlimited. The minimum number of bytes to store |
766 | | /// the unscaled value is used. |
767 | | /// |
768 | | /// By default, this is DISABLED and all decimal types annotate fixed_len_byte_array. |
769 | | /// |
770 | | /// When enabled, the C++ writer will use following physical types to store decimals: |
771 | | /// - int32: for 1 <= precision <= 9. |
772 | | /// - int64: for 10 <= precision <= 18. |
773 | | /// - fixed_len_byte_array: for precision > 18. |
774 | | /// |
775 | | /// As a consequence, decimal columns stored in integer types are more compact. |
776 | 0 | Builder* enable_store_decimal_as_integer() { |
777 | 0 | store_decimal_as_integer_ = true; |
778 | 0 | return this; |
779 | 0 | } |
780 | | |
781 | | /// Disable decimal logical type with 1 <= precision <= 18 to be stored |
782 | | /// as integer physical type. |
783 | | /// |
784 | | /// Default disabled. |
785 | 0 | Builder* disable_store_decimal_as_integer() { |
786 | 0 | store_decimal_as_integer_ = false; |
787 | 0 | return this; |
788 | 0 | } |
789 | | |
790 | | /// Enable writing page index in general for all columns. Default enabled. |
791 | | /// |
792 | | /// Writing statistics to the page index disables the old method of writing |
793 | | /// statistics to each data page header. |
794 | | /// The page index makes filtering more efficient than the page header, as |
795 | | /// it gathers all the statistics for a Parquet file in a single place, |
796 | | /// avoiding scattered I/O. |
797 | | /// |
798 | | /// Please check the link below for more details: |
799 | | /// https://github.com/apache/parquet-format/blob/master/PageIndex.md |
800 | 0 | Builder* enable_write_page_index() { |
801 | 0 | default_column_properties_.set_page_index_enabled(true); |
802 | 0 | return this; |
803 | 0 | } |
804 | | |
805 | | /// Disable writing page index in general for all columns. Default enabled. |
806 | 0 | Builder* disable_write_page_index() { |
807 | 0 | default_column_properties_.set_page_index_enabled(false); |
808 | 0 | return this; |
809 | 0 | } |
810 | | |
811 | | /// Enable writing page index for column specified by `path`. Default enabled. |
812 | 0 | Builder* enable_write_page_index(const std::string& path) { |
813 | 0 | page_index_enabled_[path] = true; |
814 | 0 | return this; |
815 | 0 | } |
816 | | |
817 | | /// Enable writing page index for column specified by `path`. Default enabled. |
818 | 0 | Builder* enable_write_page_index(const std::shared_ptr<schema::ColumnPath>& path) { |
819 | 0 | return this->enable_write_page_index(path->ToDotString()); |
820 | 0 | } |
821 | | |
822 | | /// Disable writing page index for column specified by `path`. Default enabled. |
823 | 0 | Builder* disable_write_page_index(const std::string& path) { |
824 | 0 | page_index_enabled_[path] = false; |
825 | 0 | return this; |
826 | 0 | } |
827 | | |
828 | | /// Disable writing page index for column specified by `path`. Default enabled. |
829 | 0 | Builder* disable_write_page_index(const std::shared_ptr<schema::ColumnPath>& path) { |
830 | 0 | return this->disable_write_page_index(path->ToDotString()); |
831 | 0 | } |
832 | | |
833 | | /// \brief Set the level to write size statistics for all columns. Default is |
834 | | /// PageAndColumnChunk. |
835 | | /// |
836 | | /// \param level The level to write size statistics. Note that if page index is not |
837 | | /// enabled, page level size statistics will not be written even if the level |
838 | | /// is set to PageAndColumnChunk. |
839 | 0 | Builder* set_size_statistics_level(SizeStatisticsLevel level) { |
840 | 0 | size_statistics_level_ = level; |
841 | 0 | return this; |
842 | 0 | } |
843 | | |
844 | | /// \brief Build the WriterProperties with the builder parameters. |
845 | | /// \return The WriterProperties defined by the builder. |
846 | 0 | std::shared_ptr<WriterProperties> build() { |
847 | 0 | std::unordered_map<std::string, ColumnProperties> column_properties; |
848 | 0 | auto get = [&](const std::string& key) -> ColumnProperties& { |
849 | 0 | auto it = column_properties.find(key); |
850 | 0 | if (it == column_properties.end()) |
851 | 0 | return column_properties[key] = default_column_properties_; |
852 | 0 | else |
853 | 0 | return it->second; |
854 | 0 | }; |
855 | 0 |
|
856 | 0 | for (const auto& item : encodings_) get(item.first).set_encoding(item.second); |
857 | 0 | for (const auto& item : codecs_) get(item.first).set_compression(item.second); |
858 | 0 | for (const auto& item : codec_options_) |
859 | 0 | get(item.first).set_codec_options(item.second); |
860 | 0 | for (const auto& item : dictionary_enabled_) |
861 | 0 | get(item.first).set_dictionary_enabled(item.second); |
862 | 0 | for (const auto& item : statistics_enabled_) |
863 | 0 | get(item.first).set_statistics_enabled(item.second); |
864 | 0 | for (const auto& item : page_index_enabled_) |
865 | 0 | get(item.first).set_page_index_enabled(item.second); |
866 | 0 | for (const auto& item : bloom_filter_options_) |
867 | 0 | get(item.first).set_bloom_filter_options(item.second); |
868 | 0 |
|
869 | 0 | return std::shared_ptr<WriterProperties>(new WriterProperties( |
870 | 0 | pool_, dictionary_pagesize_limit_, write_batch_size_, max_row_group_length_, |
871 | 0 | pagesize_, max_rows_per_page_, version_, created_by_, page_checksum_enabled_, |
872 | 0 | size_statistics_level_, std::move(file_encryption_properties_), |
873 | 0 | default_column_properties_, column_properties, data_page_version_, |
874 | 0 | store_decimal_as_integer_, std::move(sorting_columns_), |
875 | 0 | content_defined_chunking_enabled_, content_defined_chunking_options_)); |
876 | 0 | } |
877 | | |
878 | | private: |
879 | | void CopyColumnSpecificProperties(const WriterProperties& properties); |
880 | | |
881 | | MemoryPool* pool_; |
882 | | int64_t dictionary_pagesize_limit_; |
883 | | int64_t write_batch_size_; |
884 | | int64_t max_row_group_length_; |
885 | | int64_t pagesize_; |
886 | | int64_t max_rows_per_page_; |
887 | | ParquetVersion::type version_; |
888 | | ParquetDataPageVersion data_page_version_; |
889 | | std::string created_by_; |
890 | | bool store_decimal_as_integer_; |
891 | | bool page_checksum_enabled_; |
892 | | SizeStatisticsLevel size_statistics_level_; |
893 | | |
894 | | std::shared_ptr<FileEncryptionProperties> file_encryption_properties_; |
895 | | |
896 | | // If empty, there are no sorting columns. |
897 | | std::vector<SortingColumn> sorting_columns_; |
898 | | |
899 | | // Settings used for each column unless overridden in any of the maps below |
900 | | ColumnProperties default_column_properties_; |
901 | | std::unordered_map<std::string, Encoding::type> encodings_; |
902 | | std::unordered_map<std::string, Compression::type> codecs_; |
903 | | std::unordered_map<std::string, std::shared_ptr<CodecOptions>> codec_options_; |
904 | | std::unordered_map<std::string, bool> dictionary_enabled_; |
905 | | std::unordered_map<std::string, bool> statistics_enabled_; |
906 | | std::unordered_map<std::string, bool> page_index_enabled_; |
907 | | std::unordered_map<std::string, BloomFilterOptions> bloom_filter_options_; |
908 | | |
909 | | bool content_defined_chunking_enabled_; |
910 | | CdcOptions content_defined_chunking_options_; |
911 | | }; |
912 | | |
913 | 0 | inline MemoryPool* memory_pool() const { return pool_; } |
914 | | |
915 | 0 | inline int64_t dictionary_pagesize_limit() const { return dictionary_pagesize_limit_; } |
916 | | |
917 | 0 | inline int64_t write_batch_size() const { return write_batch_size_; } |
918 | | |
919 | 0 | inline int64_t max_row_group_length() const { return max_row_group_length_; } |
920 | | |
921 | 0 | inline int64_t data_pagesize() const { return pagesize_; } |
922 | | |
923 | 0 | inline int64_t max_rows_per_page() const { return max_rows_per_page_; } |
924 | | |
925 | 0 | inline ParquetDataPageVersion data_page_version() const { |
926 | 0 | return parquet_data_page_version_; |
927 | 0 | } |
928 | | |
929 | 0 | inline ParquetVersion::type version() const { return parquet_version_; } |
930 | | |
931 | 0 | inline std::string created_by() const { return parquet_created_by_; } |
932 | | |
933 | 0 | inline bool store_decimal_as_integer() const { return store_decimal_as_integer_; } |
934 | | |
935 | 0 | inline bool page_checksum_enabled() const { return page_checksum_enabled_; } |
936 | | |
937 | 0 | inline bool content_defined_chunking_enabled() const { |
938 | 0 | return content_defined_chunking_enabled_; |
939 | 0 | } |
940 | 0 | inline CdcOptions content_defined_chunking_options() const { |
941 | 0 | return content_defined_chunking_options_; |
942 | 0 | } |
943 | | |
944 | 0 | inline SizeStatisticsLevel size_statistics_level() const { |
945 | 0 | return size_statistics_level_; |
946 | 0 | } |
947 | | |
948 | 0 | inline Encoding::type dictionary_index_encoding() const { |
949 | 0 | if (parquet_version_ == ParquetVersion::PARQUET_1_0) { |
950 | 0 | return Encoding::PLAIN_DICTIONARY; |
951 | 0 | } else { |
952 | 0 | return Encoding::RLE_DICTIONARY; |
953 | 0 | } |
954 | 0 | } |
955 | | |
956 | 0 | inline Encoding::type dictionary_page_encoding() const { |
957 | 0 | if (parquet_version_ == ParquetVersion::PARQUET_1_0) { |
958 | 0 | return Encoding::PLAIN_DICTIONARY; |
959 | 0 | } else { |
960 | 0 | return Encoding::PLAIN; |
961 | 0 | } |
962 | 0 | } |
963 | | |
964 | | const ColumnProperties& column_properties( |
965 | 0 | const std::shared_ptr<schema::ColumnPath>& path) const { |
966 | 0 | auto it = column_properties_.find(path->ToDotString()); |
967 | 0 | if (it != column_properties_.end()) return it->second; |
968 | 0 | return default_column_properties_; |
969 | 0 | } |
970 | | |
971 | 0 | Encoding::type encoding(const std::shared_ptr<schema::ColumnPath>& path) const { |
972 | 0 | return column_properties(path).encoding(); |
973 | 0 | } |
974 | | |
975 | 0 | Compression::type compression(const std::shared_ptr<schema::ColumnPath>& path) const { |
976 | 0 | return column_properties(path).compression(); |
977 | 0 | } |
978 | | |
979 | 0 | int compression_level(const std::shared_ptr<schema::ColumnPath>& path) const { |
980 | 0 | return column_properties(path).compression_level(); |
981 | 0 | } |
982 | | |
983 | | const std::shared_ptr<CodecOptions> codec_options( |
984 | 0 | const std::shared_ptr<schema::ColumnPath>& path) const { |
985 | 0 | return column_properties(path).codec_options(); |
986 | 0 | } |
987 | | |
988 | 0 | bool dictionary_enabled(const std::shared_ptr<schema::ColumnPath>& path) const { |
989 | 0 | return column_properties(path).dictionary_enabled(); |
990 | 0 | } |
991 | | |
992 | 0 | const std::vector<SortingColumn>& sorting_columns() const { return sorting_columns_; } |
993 | | |
994 | 0 | bool statistics_enabled(const std::shared_ptr<schema::ColumnPath>& path) const { |
995 | 0 | return column_properties(path).statistics_enabled(); |
996 | 0 | } |
997 | | |
998 | 0 | size_t max_statistics_size(const std::shared_ptr<schema::ColumnPath>& path) const { |
999 | 0 | return column_properties(path).max_statistics_size(); |
1000 | 0 | } |
1001 | | |
1002 | 0 | bool page_index_enabled(const std::shared_ptr<schema::ColumnPath>& path) const { |
1003 | 0 | return column_properties(path).page_index_enabled(); |
1004 | 0 | } |
1005 | | |
1006 | 0 | bool page_index_enabled() const { |
1007 | 0 | if (default_column_properties_.page_index_enabled()) { |
1008 | 0 | return true; |
1009 | 0 | } |
1010 | 0 | for (const auto& item : column_properties_) { |
1011 | 0 | if (item.second.page_index_enabled()) { |
1012 | 0 | return true; |
1013 | 0 | } |
1014 | 0 | } |
1015 | 0 | return false; |
1016 | 0 | } |
1017 | | |
1018 | | // Return whether bloom filter is enabled for any column. |
1019 | 0 | bool bloom_filter_enabled() const { |
1020 | 0 | return std::any_of(column_properties_.cbegin(), column_properties_.cend(), |
1021 | 0 | [](const auto& p) { return p.second.bloom_filter_enabled(); }); |
1022 | 0 | } |
1023 | | |
1024 | | std::optional<BloomFilterOptions> bloom_filter_options( |
1025 | 0 | const std::shared_ptr<schema::ColumnPath>& path) const { |
1026 | 0 | return column_properties(path).bloom_filter_options(); |
1027 | 0 | } |
1028 | | |
1029 | 0 | inline FileEncryptionProperties* file_encryption_properties() const { |
1030 | 0 | return file_encryption_properties_.get(); |
1031 | 0 | } |
1032 | | |
1033 | | std::shared_ptr<ColumnEncryptionProperties> column_encryption_properties( |
1034 | 0 | const std::string& path) const { |
1035 | 0 | if (file_encryption_properties_) { |
1036 | 0 | return file_encryption_properties_->column_encryption_properties(path); |
1037 | 0 | } else { |
1038 | 0 | return NULLPTR; |
1039 | 0 | } |
1040 | 0 | } Unexecuted instantiation: parquet::WriterProperties::column_encryption_properties(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&) const Unexecuted instantiation: parquet::WriterProperties::column_encryption_properties(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&) const |
1041 | | |
1042 | | // \brief Return the default column properties |
1043 | 0 | const ColumnProperties& default_column_properties() const { |
1044 | 0 | return default_column_properties_; |
1045 | 0 | } |
1046 | | |
1047 | | private: |
1048 | | explicit WriterProperties( |
1049 | | MemoryPool* pool, int64_t dictionary_pagesize_limit, int64_t write_batch_size, |
1050 | | int64_t max_row_group_length, int64_t pagesize, int64_t max_rows_per_page, |
1051 | | ParquetVersion::type version, const std::string& created_by, |
1052 | | bool page_write_checksum_enabled, SizeStatisticsLevel size_statistics_level, |
1053 | | std::shared_ptr<FileEncryptionProperties> file_encryption_properties, |
1054 | | const ColumnProperties& default_column_properties, |
1055 | | const std::unordered_map<std::string, ColumnProperties>& column_properties, |
1056 | | ParquetDataPageVersion data_page_version, bool store_short_decimal_as_integer, |
1057 | | std::vector<SortingColumn> sorting_columns, bool content_defined_chunking_enabled, |
1058 | | CdcOptions content_defined_chunking_options) |
1059 | | : pool_(pool), |
1060 | | dictionary_pagesize_limit_(dictionary_pagesize_limit), |
1061 | | write_batch_size_(write_batch_size), |
1062 | | max_row_group_length_(max_row_group_length), |
1063 | | pagesize_(pagesize), |
1064 | | max_rows_per_page_(max_rows_per_page), |
1065 | | parquet_data_page_version_(data_page_version), |
1066 | | parquet_version_(version), |
1067 | | parquet_created_by_(created_by), |
1068 | | store_decimal_as_integer_(store_short_decimal_as_integer), |
1069 | | page_checksum_enabled_(page_write_checksum_enabled), |
1070 | | size_statistics_level_(size_statistics_level), |
1071 | | file_encryption_properties_(file_encryption_properties), |
1072 | | sorting_columns_(std::move(sorting_columns)), |
1073 | | default_column_properties_(default_column_properties), |
1074 | | column_properties_(column_properties), |
1075 | | content_defined_chunking_enabled_(content_defined_chunking_enabled), |
1076 | 0 | content_defined_chunking_options_(content_defined_chunking_options) {} |
1077 | | |
1078 | | MemoryPool* pool_; |
1079 | | int64_t dictionary_pagesize_limit_; |
1080 | | int64_t write_batch_size_; |
1081 | | int64_t max_row_group_length_; |
1082 | | int64_t pagesize_; |
1083 | | int64_t max_rows_per_page_; |
1084 | | ParquetDataPageVersion parquet_data_page_version_; |
1085 | | ParquetVersion::type parquet_version_; |
1086 | | std::string parquet_created_by_; |
1087 | | bool store_decimal_as_integer_; |
1088 | | bool page_checksum_enabled_; |
1089 | | SizeStatisticsLevel size_statistics_level_; |
1090 | | |
1091 | | std::shared_ptr<FileEncryptionProperties> file_encryption_properties_; |
1092 | | |
1093 | | std::vector<SortingColumn> sorting_columns_; |
1094 | | |
1095 | | ColumnProperties default_column_properties_; |
1096 | | std::unordered_map<std::string, ColumnProperties> column_properties_; |
1097 | | |
1098 | | bool content_defined_chunking_enabled_; |
1099 | | CdcOptions content_defined_chunking_options_; |
1100 | | }; |
1101 | | |
1102 | | PARQUET_EXPORT const std::shared_ptr<WriterProperties>& default_writer_properties(); |
1103 | | |
1104 | | // ---------------------------------------------------------------------- |
1105 | | // Properties specific to Apache Arrow columnar read and write |
1106 | | |
// Default for whether Arrow readers/writers use threads.
// `inline constexpr` gives one definition shared by every translation unit
// that includes this header, matching the other kArrowDefault* constants.
inline constexpr bool kArrowDefaultUseThreads = false;

// Default number of rows to read when using ::arrow::RecordBatchReader
inline constexpr int64_t kArrowDefaultBatchSize = 64 * 1024;
1111 | | |
1112 | | constexpr inline ::arrow::Type::type kArrowDefaultBinaryType = ::arrow::Type::BINARY; |
1113 | | constexpr inline ::arrow::Type::type kArrowDefaultListType = ::arrow::Type::LIST; |
1114 | | |
1115 | | /// EXPERIMENTAL: Properties for configuring FileReader behavior. |
1116 | | class PARQUET_EXPORT ArrowReaderProperties { |
1117 | | public: |
1118 | | explicit ArrowReaderProperties(bool use_threads = kArrowDefaultUseThreads) |
1119 | 103k | : use_threads_(use_threads), |
1120 | 103k | read_dict_indices_(), |
1121 | 103k | batch_size_(kArrowDefaultBatchSize), |
1122 | 103k | pre_buffer_(true), |
1123 | 103k | cache_options_(::arrow::io::CacheOptions::LazyDefaults()), |
1124 | 103k | coerce_int96_timestamp_unit_(::arrow::TimeUnit::NANO), |
1125 | 103k | binary_type_(kArrowDefaultBinaryType), |
1126 | 103k | list_type_(kArrowDefaultListType), |
1127 | 103k | arrow_extensions_enabled_(false), |
1128 | 103k | should_load_statistics_(false), |
1129 | 103k | smallest_decimal_enabled_(false) {} |
1130 | | |
1131 | | /// \brief Set whether to use the IO thread pool to parse columns in parallel. |
1132 | | /// |
1133 | | /// Default is false. |
1134 | 0 | void set_use_threads(bool use_threads) { use_threads_ = use_threads; } |
1135 | | /// Return whether will use multiple threads. |
1136 | 60.9k | bool use_threads() const { return use_threads_; } |
1137 | | |
1138 | | /// \brief Set whether to read a particular column as dictionary encoded. |
1139 | | /// |
1140 | | /// If the file metadata contains a serialized Arrow schema, then ... |
1141 | | //// |
1142 | | /// This is only supported for columns with a Parquet physical type of |
1143 | | /// BYTE_ARRAY, such as string or binary types. |
1144 | 0 | void set_read_dictionary(int column_index, bool read_dict) { |
1145 | 0 | if (read_dict) { |
1146 | 0 | read_dict_indices_.insert(column_index); |
1147 | 0 | } else { |
1148 | 0 | read_dict_indices_.erase(column_index); |
1149 | 0 | } |
1150 | 0 | } |
1151 | | /// Return whether the column at the index will be read as dictionary. |
1152 | 257k | bool read_dictionary(int column_index) const { |
1153 | 257k | if (read_dict_indices_.find(column_index) != read_dict_indices_.end()) { |
1154 | 0 | return true; |
1155 | 257k | } else { |
1156 | 257k | return false; |
1157 | 257k | } |
1158 | 257k | } |
1159 | | |
1160 | | /// \brief Set the Arrow binary type to read BYTE_ARRAY columns as. |
1161 | | /// |
1162 | | /// Allowed values are Type::BINARY, Type::LARGE_BINARY and Type::BINARY_VIEW. |
1163 | | /// Default is Type::BINARY. |
1164 | | /// |
1165 | | /// If a BYTE_ARRAY column has the STRING logical type, it is read as the |
1166 | | /// Arrow string type corresponding to the configured binary type (for example |
1167 | | /// Type::LARGE_STRING if the configured binary type is Type::LARGE_BINARY). |
1168 | | /// |
1169 | | /// However, if a serialized Arrow schema is found in the Parquet metadata, |
1170 | | /// this setting is ignored and the Arrow schema takes precedence |
1171 | | /// (see ArrowWriterProperties::store_schema). |
1172 | 0 | void set_binary_type(::arrow::Type::type value) { binary_type_ = value; } |
1173 | | /// Return the Arrow binary type to read BYTE_ARRAY columns as. |
1174 | 46.7k | ::arrow::Type::type binary_type() const { return binary_type_; } |
1175 | | |
1176 | | /// \brief Set the Arrow list type to read Parquet list columns as. |
1177 | | /// |
1178 | | /// Allowed values are Type::LIST and Type::LARGE_LIST. |
1179 | | /// Default is Type::LIST. |
1180 | | /// |
1181 | | /// However, if a serialized Arrow schema is found in the Parquet metadata, |
1182 | | /// this setting is ignored and the Arrow schema takes precedence |
1183 | | /// (see ArrowWriterProperties::store_schema). |
1184 | 0 | void set_list_type(::arrow::Type::type value) { list_type_ = value; } |
1185 | | /// Return the Arrow list type to read Parquet list columns as. |
1186 | 34.5k | ::arrow::Type::type list_type() const { return list_type_; } |
1187 | | |
1188 | | /// \brief Set the maximum number of rows to read into a record batch. |
1189 | | /// |
1190 | | /// Will only be fewer rows when there are no more rows in the file. |
1191 | | /// Note that some APIs such as ReadTable may ignore this setting. |
1192 | 33.4k | void set_batch_size(int64_t batch_size) { batch_size_ = batch_size; } |
1193 | | /// Return the batch size in rows. |
1194 | | /// |
1195 | | /// Note that some APIs such as ReadTable may ignore this setting. |
1196 | 0 | int64_t batch_size() const { return batch_size_; } |
1197 | | |
1198 | | /// Enable read coalescing (default true). |
1199 | | /// |
1200 | | /// When enabled, the Arrow reader will pre-buffer necessary regions |
1201 | | /// of the file in-memory. This is intended to improve performance on |
1202 | | /// high-latency filesystems (e.g. Amazon S3). |
1203 | 0 | void set_pre_buffer(bool pre_buffer) { pre_buffer_ = pre_buffer; } |
1204 | | /// Return whether read coalescing is enabled. |
1205 | 69.6k | bool pre_buffer() const { return pre_buffer_; } |
1206 | | |
1207 | | /// Set options for read coalescing. This can be used to tune the |
1208 | | /// implementation for characteristics of different filesystems. |
1209 | 0 | void set_cache_options(::arrow::io::CacheOptions options) { cache_options_ = options; } |
1210 | | /// Return the options for read coalescing. |
1211 | 69.6k | const ::arrow::io::CacheOptions& cache_options() const { return cache_options_; } |
1212 | | |
1213 | | /// Set execution context for read coalescing. |
1214 | 0 | void set_io_context(const ::arrow::io::IOContext& ctx) { io_context_ = ctx; } |
1215 | | /// Return the execution context used for read coalescing. |
1216 | 69.6k | const ::arrow::io::IOContext& io_context() const { return io_context_; } |
1217 | | |
1218 | | /// Set timestamp unit to use for deprecated INT96-encoded timestamps |
1219 | | /// (default is NANO). |
1220 | 0 | void set_coerce_int96_timestamp_unit(::arrow::TimeUnit::type unit) { |
1221 | 0 | coerce_int96_timestamp_unit_ = unit; |
1222 | 0 | } |
1223 | | |
1224 | 9.25k | ::arrow::TimeUnit::type coerce_int96_timestamp_unit() const { |
1225 | 9.25k | return coerce_int96_timestamp_unit_; |
1226 | 9.25k | } |
1227 | | |
1228 | | /// Enable Parquet-supported Arrow extension types. |
1229 | | /// |
1230 | | /// When enabled, Parquet logical types will be mapped to their corresponding Arrow |
1231 | | /// extension types at read time, if such exist. Currently only arrow::extension::json() |
1232 | | /// extension type is supported. Columns whose LogicalType is JSON will be interpreted |
1233 | | /// as arrow::extension::json(), with storage type inferred from the serialized Arrow |
1234 | | /// schema if present, or `utf8` by default. |
1235 | 0 | void set_arrow_extensions_enabled(bool extensions_enabled) { |
1236 | 0 | arrow_extensions_enabled_ = extensions_enabled; |
1237 | 0 | } |
1238 | 24.1k | bool get_arrow_extensions_enabled() const { return arrow_extensions_enabled_; } |
1239 | | |
1240 | | /// \brief Set whether to load statistics as much as possible. |
1241 | | /// |
1242 | | /// Default is false. |
1243 | 0 | void set_should_load_statistics(bool should_load_statistics) { |
1244 | 0 | should_load_statistics_ = should_load_statistics; |
1245 | 0 | } |
1246 | | /// Return whether loading statistics as much as possible. |
1247 | 153k | bool should_load_statistics() const { return should_load_statistics_; } |
1248 | | |
1249 | | /// \brief Set whether to infer Decimal32/64 from Parquet decimal logical types. |
1250 | | /// |
1251 | | /// Default is false for compatibility, meaning that only Decimal128 and Decimal256 |
1252 | | /// can be inferred. |
1253 | 0 | void set_smallest_decimal_enabled(bool smallest_decimal_enable) { |
1254 | 0 | smallest_decimal_enabled_ = smallest_decimal_enable; |
1255 | 0 | } |
1256 | | /// \brief Whether to infer Decimal32/64 from Parquet decimal logical types. |
1257 | | /// |
1258 | | /// When enabled, Parquet decimal columns will be inferred as the smallest possible |
1259 | | /// Arrow Decimal type. |
1260 | | /// When disabled, Parquet decimal columns will be inferred as either Decimal128 or |
1261 | | /// Decimal256, but not Decimal32/64. |
1262 | | /// |
1263 | | /// Note: if an Arrow schema is found in the Parquet metadata, it will take priority and |
1264 | | /// this setting will be ignored. |
1265 | 9.49k | bool smallest_decimal_enabled() const { return smallest_decimal_enabled_; } |
1266 | | |
1267 | | private: |
1268 | | bool use_threads_; |
1269 | | std::unordered_set<int> read_dict_indices_; |
1270 | | int64_t batch_size_; |
1271 | | bool pre_buffer_; |
1272 | | ::arrow::io::IOContext io_context_; |
1273 | | ::arrow::io::CacheOptions cache_options_; |
1274 | | ::arrow::TimeUnit::type coerce_int96_timestamp_unit_; |
1275 | | ::arrow::Type::type binary_type_; |
1276 | | ::arrow::Type::type list_type_; |
1277 | | bool arrow_extensions_enabled_; |
1278 | | bool should_load_statistics_; |
1279 | | bool smallest_decimal_enabled_; |
1280 | | }; |
1281 | | |
1282 | | /// EXPERIMENTAL: Constructs the default ArrowReaderProperties |
1283 | | PARQUET_EXPORT |
1284 | | ArrowReaderProperties default_arrow_reader_properties(); |
1285 | | |
1286 | | class PARQUET_EXPORT ArrowWriterProperties { |
1287 | | public: |
1288 | | enum EngineVersion { |
1289 | | V1, // Supports only nested lists. |
1290 | | V2 // Full support for all nesting combinations |
1291 | | }; |
1292 | | class Builder { |
1293 | | public: |
1294 | | Builder() |
1295 | 0 | : write_timestamps_as_int96_(false), |
1296 | 0 | coerce_timestamps_enabled_(false), |
1297 | 0 | coerce_timestamps_unit_(::arrow::TimeUnit::SECOND), |
1298 | 0 | truncated_timestamps_allowed_(false), |
1299 | 0 | store_schema_(false), |
1300 | 0 | compliant_nested_types_(true), |
1301 | 0 | engine_version_(V2), |
1302 | 0 | use_threads_(kArrowDefaultUseThreads), |
1303 | 0 | executor_(NULLPTR), |
1304 | 0 | write_time_adjusted_to_utc_(false) {} |
1305 | | |
1306 | | /// \brief Disable writing legacy int96 timestamps (default disabled). |
1307 | 0 | Builder* disable_deprecated_int96_timestamps() { |
1308 | 0 | write_timestamps_as_int96_ = false; |
1309 | 0 | return this; |
1310 | 0 | } |
1311 | | |
1312 | | /// \brief Enable writing legacy int96 timestamps (default disabled). |
1313 | | /// |
1314 | | /// May be turned on to write timestamps compatible with older Parquet writers. |
1315 | | /// This takes precedent over coerce_timestamps. |
1316 | 0 | Builder* enable_deprecated_int96_timestamps() { |
1317 | 0 | write_timestamps_as_int96_ = true; |
1318 | 0 | return this; |
1319 | 0 | } |
1320 | | |
1321 | | /// \brief Coerce all timestamps to the specified time unit. |
1322 | | /// \param unit time unit to truncate to. |
1323 | | /// For Parquet versions 1.0 and 2.4, nanoseconds are casted to microseconds. |
1324 | 0 | Builder* coerce_timestamps(::arrow::TimeUnit::type unit) { |
1325 | 0 | coerce_timestamps_enabled_ = true; |
1326 | 0 | coerce_timestamps_unit_ = unit; |
1327 | 0 | return this; |
1328 | 0 | } |
1329 | | |
1330 | | /// \brief Allow loss of data when truncating timestamps. |
1331 | | /// |
1332 | | /// This is disallowed by default and an error will be returned. |
1333 | 0 | Builder* allow_truncated_timestamps() { |
1334 | 0 | truncated_timestamps_allowed_ = true; |
1335 | 0 | return this; |
1336 | 0 | } |
1337 | | |
1338 | | /// \brief Disallow loss of data when truncating timestamps (default). |
1339 | 0 | Builder* disallow_truncated_timestamps() { |
1340 | 0 | truncated_timestamps_allowed_ = false; |
1341 | 0 | return this; |
1342 | 0 | } |
1343 | | |
1344 | | /// \brief EXPERIMENTAL: Write binary serialized Arrow schema to the file, |
1345 | | /// to enable certain read options (like "read_dictionary") to be set |
1346 | | /// automatically |
1347 | 0 | Builder* store_schema() { |
1348 | 0 | store_schema_ = true; |
1349 | 0 | return this; |
1350 | 0 | } |
1351 | | |
1352 | | /// \brief When enabled, will not preserve Arrow field names for list types. |
1353 | | /// |
1354 | | /// Instead of using the field names Arrow uses for the values array of |
1355 | | /// list types (default "item"), will use "element", as is specified in |
1356 | | /// the Parquet spec. |
1357 | | /// |
1358 | | /// This is enabled by default. |
1359 | 0 | Builder* enable_compliant_nested_types() { |
1360 | 0 | compliant_nested_types_ = true; |
1361 | 0 | return this; |
1362 | 0 | } |
1363 | | |
1364 | | /// Preserve Arrow list field name. |
1365 | 0 | Builder* disable_compliant_nested_types() { |
1366 | 0 | compliant_nested_types_ = false; |
1367 | 0 | return this; |
1368 | 0 | } |
1369 | | |
1370 | | /// Set the version of the Parquet writer engine. |
1371 | 0 | Builder* set_engine_version(EngineVersion version) { |
1372 | 0 | engine_version_ = version; |
1373 | 0 | return this; |
1374 | 0 | } |
1375 | | |
1376 | | /// \brief Set whether to use multiple threads to write columns |
1377 | | /// in parallel in the buffered row group mode. |
1378 | | /// |
1379 | | /// WARNING: If writing multiple files in parallel in the same |
1380 | | /// executor, deadlock may occur if use_threads is true. Please |
1381 | | /// disable it in this case. |
1382 | | /// |
1383 | | /// Default is false. |
1384 | 0 | Builder* set_use_threads(bool use_threads) { |
1385 | 0 | use_threads_ = use_threads; |
1386 | 0 | return this; |
1387 | 0 | } |
1388 | | |
1389 | | /// \brief Set the executor to write columns in parallel in the |
1390 | | /// buffered row group mode. |
1391 | | /// |
1392 | | /// Default is nullptr and the default cpu executor will be used. |
1393 | 0 | Builder* set_executor(::arrow::internal::Executor* executor) { |
1394 | 0 | executor_ = executor; |
1395 | 0 | return this; |
1396 | 0 | } |
1397 | | |
1398 | | /// \brief Set the value of isAdjustedTOUTC when writing a TIME column |
1399 | | /// |
1400 | | /// Default is false because Arrow TIME data is expressed in an unspecified timezone. |
1401 | | /// Note this setting doesn't affect TIMESTAMP data. |
1402 | 0 | Builder* set_time_adjusted_to_utc(bool adjusted) { |
1403 | 0 | write_time_adjusted_to_utc_ = adjusted; |
1404 | 0 | return this; |
1405 | 0 | } |
1406 | | |
1407 | | /// Create the final properties. |
1408 | 0 | std::shared_ptr<ArrowWriterProperties> build() { |
1409 | 0 | return std::shared_ptr<ArrowWriterProperties>(new ArrowWriterProperties( |
1410 | 0 | write_timestamps_as_int96_, coerce_timestamps_enabled_, coerce_timestamps_unit_, |
1411 | 0 | truncated_timestamps_allowed_, store_schema_, compliant_nested_types_, |
1412 | 0 | engine_version_, use_threads_, executor_, write_time_adjusted_to_utc_)); |
1413 | 0 | } |
1414 | | |
1415 | | private: |
1416 | | bool write_timestamps_as_int96_; |
1417 | | |
1418 | | bool coerce_timestamps_enabled_; |
1419 | | ::arrow::TimeUnit::type coerce_timestamps_unit_; |
1420 | | bool truncated_timestamps_allowed_; |
1421 | | |
1422 | | bool store_schema_; |
1423 | | bool compliant_nested_types_; |
1424 | | EngineVersion engine_version_; |
1425 | | |
1426 | | bool use_threads_; |
1427 | | ::arrow::internal::Executor* executor_; |
1428 | | |
1429 | | bool write_time_adjusted_to_utc_; |
1430 | | }; |
1431 | | |
1432 | 0 | bool support_deprecated_int96_timestamps() const { return write_timestamps_as_int96_; } |
1433 | | |
1434 | 0 | bool coerce_timestamps_enabled() const { return coerce_timestamps_enabled_; } |
1435 | 0 | ::arrow::TimeUnit::type coerce_timestamps_unit() const { |
1436 | 0 | return coerce_timestamps_unit_; |
1437 | 0 | } |
1438 | | |
1439 | 0 | bool truncated_timestamps_allowed() const { return truncated_timestamps_allowed_; } |
1440 | | |
1441 | 0 | bool store_schema() const { return store_schema_; } |
1442 | | |
1443 | | /// \brief Enable nested type naming according to the parquet specification. |
1444 | | /// |
1445 | | /// Older versions of arrow wrote out field names for nested lists based on the name |
1446 | | /// of the field. According to the parquet specification they should always be |
1447 | | /// "element". |
1448 | 0 | bool compliant_nested_types() const { return compliant_nested_types_; } |
1449 | | |
1450 | | /// \brief The underlying engine version to use when writing Arrow data. |
1451 | | /// |
1452 | | /// V2 is currently the latest V1 is considered deprecated but left in |
1453 | | /// place in case there are bugs detected in V2. |
1454 | 0 | EngineVersion engine_version() const { return engine_version_; } |
1455 | | |
1456 | | /// \brief Returns whether the writer will use multiple threads |
1457 | | /// to write columns in parallel in the buffered row group mode. |
1458 | 0 | bool use_threads() const { return use_threads_; } |
1459 | | |
1460 | | /// \brief Returns the executor used to write columns in parallel. |
1461 | | ::arrow::internal::Executor* executor() const; |
1462 | | |
1463 | | /// \brief The value of isAdjustedTOUTC when writing a TIME column |
1464 | | /// |
1465 | | /// Note this setting doesn't affect TIMESTAMP data. |
1466 | 0 | bool write_time_adjusted_to_utc() const { return write_time_adjusted_to_utc_; } |
1467 | | |
1468 | | private: |
1469 | | explicit ArrowWriterProperties(bool write_nanos_as_int96, |
1470 | | bool coerce_timestamps_enabled, |
1471 | | ::arrow::TimeUnit::type coerce_timestamps_unit, |
1472 | | bool truncated_timestamps_allowed, bool store_schema, |
1473 | | bool compliant_nested_types, |
1474 | | EngineVersion engine_version, bool use_threads, |
1475 | | ::arrow::internal::Executor* executor, |
1476 | | bool write_time_adjusted_to_utc) |
1477 | 0 | : write_timestamps_as_int96_(write_nanos_as_int96), |
1478 | 0 | coerce_timestamps_enabled_(coerce_timestamps_enabled), |
1479 | 0 | coerce_timestamps_unit_(coerce_timestamps_unit), |
1480 | 0 | truncated_timestamps_allowed_(truncated_timestamps_allowed), |
1481 | 0 | store_schema_(store_schema), |
1482 | 0 | compliant_nested_types_(compliant_nested_types), |
1483 | 0 | engine_version_(engine_version), |
1484 | 0 | use_threads_(use_threads), |
1485 | 0 | executor_(executor), |
1486 | 0 | write_time_adjusted_to_utc_(write_time_adjusted_to_utc) {} |
1487 | | |
1488 | | const bool write_timestamps_as_int96_; |
1489 | | const bool coerce_timestamps_enabled_; |
1490 | | const ::arrow::TimeUnit::type coerce_timestamps_unit_; |
1491 | | const bool truncated_timestamps_allowed_; |
1492 | | const bool store_schema_; |
1493 | | const bool compliant_nested_types_; |
1494 | | const EngineVersion engine_version_; |
1495 | | const bool use_threads_; |
1496 | | ::arrow::internal::Executor* executor_; |
1497 | | const bool write_time_adjusted_to_utc_; |
1498 | | }; |
1499 | | |
1500 | | /// \brief State object used for writing Arrow data directly to a Parquet |
1501 | | /// column chunk. API possibly not stable |
1502 | | struct ArrowWriteContext { |
1503 | | ArrowWriteContext(MemoryPool* memory_pool, ArrowWriterProperties* properties) |
1504 | | : memory_pool(memory_pool), |
1505 | | properties(properties), |
1506 | | data_buffer(AllocateBuffer(memory_pool)), |
1507 | 0 | def_levels_buffer(AllocateBuffer(memory_pool)) {} |
1508 | | |
1509 | | template <typename T> |
1510 | | ::arrow::Status GetScratchData(const int64_t num_values, T** out) { |
1511 | | ARROW_RETURN_NOT_OK(this->data_buffer->Resize(num_values * sizeof(T), false)); |
1512 | | *out = reinterpret_cast<T*>(this->data_buffer->mutable_data()); |
1513 | | return ::arrow::Status::OK(); |
1514 | | } |
1515 | | |
1516 | | MemoryPool* memory_pool; |
1517 | | const ArrowWriterProperties* properties; |
1518 | | |
1519 | | // Buffer used for storing the data of an array converted to the physical type |
1520 | | // as expected by parquet-cpp. |
1521 | | std::shared_ptr<ResizableBuffer> data_buffer; |
1522 | | |
1523 | | // We use the shared ownership of this buffer |
1524 | | std::shared_ptr<ResizableBuffer> def_levels_buffer; |
1525 | | }; |
1526 | | |
1527 | | PARQUET_EXPORT |
1528 | | std::shared_ptr<ArrowWriterProperties> default_arrow_writer_properties(); |
1529 | | |
1530 | | } // namespace parquet |