Coverage Report

Created: 2026-06-08 06:17

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/arrow/cpp/src/parquet/properties.h
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#pragma once
19
20
#include <memory>
21
#include <string>
22
#include <unordered_map>
23
#include <unordered_set>
24
#include <utility>
25
26
#include "arrow/buffer.h"
27
#include "arrow/io/caching.h"
28
#include "arrow/type_fwd.h"
29
#include "arrow/util/compression.h"
30
#include "arrow/util/type_fwd.h"
31
#include "parquet/encryption/encryption.h"
32
#include "parquet/exception.h"
33
#include "parquet/parquet_version.h"
34
#include "parquet/platform.h"
35
#include "parquet/schema.h"
36
#include "parquet/type_fwd.h"
37
#include "parquet/types.h"
38
39
namespace parquet {
40
41
/// Controls serialization format of data pages.  parquet-format v2.0.0
42
/// introduced a new data page metadata type DataPageV2 and serialized page
43
/// structure (for example, encoded levels are no longer compressed). Prior to
44
/// the completion of PARQUET-457 in 2020, this library did not implement
45
/// DataPageV2 correctly, so if you use the V2 data page format, you may have
46
/// forward compatibility issues (older versions of the library will be unable
47
/// to read the files). Note that some Parquet implementations do not implement
48
/// DataPageV2 at all.
49
enum class ParquetDataPageVersion { V1, V2 };
50
51
/// Controls the level of size statistics that are written to the file.
52
enum class SizeStatisticsLevel : uint8_t {
53
  // No size statistics are written.
54
  None = 0,
55
  // Only column chunk size statistics are written.
56
  ColumnChunk,
57
  // Both size statistics in the column chunk and page index are written.
58
  PageAndColumnChunk
59
};
60
61
/// Align the default buffer size to a small multiple of a page size.
62
constexpr int64_t kDefaultBufferSize = 4096 * 4;
63
64
constexpr int32_t kDefaultThriftStringSizeLimit = 100 * 1000 * 1000;
65
// Structs in the thrift definition are relatively large (at least 300 bytes).
66
// This limits total memory to the same order of magnitude as
67
// kDefaultThriftStringSizeLimit.
68
constexpr int32_t kDefaultThriftContainerSizeLimit = 1000 * 1000;
69
70
// PARQUET-978: Minimize footer reads by reading 64 KB from the end of the file
71
constexpr int64_t kDefaultFooterReadSize = 64 * 1024;
72
73
class PARQUET_EXPORT ReaderProperties {
74
 public:
75
  explicit ReaderProperties(MemoryPool* pool = ::arrow::default_memory_pool())
76
21.0k
      : pool_(pool) {}
77
78
588k
  MemoryPool* memory_pool() const { return pool_; }
79
80
  std::shared_ptr<ArrowInputStream> GetStream(std::shared_ptr<ArrowInputFile> source,
81
                                              int64_t start, int64_t num_bytes);
82
83
  /// Buffered stream reading allows the user to control the memory usage of
84
  /// parquet readers. This ensures that all `RandomAccessFile::ReadAt` calls are
85
  /// wrapped in a buffered reader that uses a fixed-size buffer (of size
86
  /// `buffer_size()`) instead of the full size of the ReadAt.
87
  ///
88
  /// The primary reason for this control knob is resource control and not
89
  /// performance.
90
0
  bool is_buffered_stream_enabled() const { return buffered_stream_enabled_; }
91
  /// Enable buffered stream reading.
92
0
  void enable_buffered_stream() { buffered_stream_enabled_ = true; }
93
  /// Disable buffered stream reading.
94
0
  void disable_buffered_stream() { buffered_stream_enabled_ = false; }
95
96
0
  bool read_dense_for_nullable() const { return read_dense_for_nullable_; }
97
0
  void enable_read_dense_for_nullable() { read_dense_for_nullable_ = true; }
98
0
  void disable_read_dense_for_nullable() { read_dense_for_nullable_ = false; }
99
100
  /// Return the size of the buffered stream buffer.
101
0
  int64_t buffer_size() const { return buffer_size_; }
102
  /// Set the size of the buffered stream buffer in bytes.
103
0
  void set_buffer_size(int64_t size) { buffer_size_ = size; }
104
105
  /// \brief Return the size limit on thrift strings.
106
  ///
107
  /// This limit helps prevent space and time bombs in files, but may need to
108
  /// be increased in order to read files with especially large headers.
109
1.67M
  int32_t thrift_string_size_limit() const { return thrift_string_size_limit_; }
110
  /// Set the size limit on thrift strings.
111
0
  void set_thrift_string_size_limit(int32_t size) { thrift_string_size_limit_ = size; }
112
113
  /// \brief Return the size limit on thrift containers.
114
  ///
115
  /// This limit helps prevent space and time bombs in files, but may need to
116
  /// be increased in order to read files with especially large headers.
117
1.67M
  int32_t thrift_container_size_limit() const { return thrift_container_size_limit_; }
118
  /// Set the size limit on thrift containers.
119
0
  void set_thrift_container_size_limit(int32_t size) {
120
0
    thrift_container_size_limit_ = size;
121
0
  }
122
123
  /// Set the decryption properties.
124
21.0k
  void file_decryption_properties(std::shared_ptr<FileDecryptionProperties> decryption) {
125
21.0k
    file_decryption_properties_ = std::move(decryption);
126
21.0k
  }
127
  /// Return the decryption properties.
128
18.6k
  const std::shared_ptr<FileDecryptionProperties>& file_decryption_properties() const {
129
18.6k
    return file_decryption_properties_;
130
18.6k
  }
131
132
1.56M
  bool page_checksum_verification() const { return page_checksum_verification_; }
133
0
  void set_page_checksum_verification(bool check_crc) {
134
0
    page_checksum_verification_ = check_crc;
135
0
  }
136
137
  // Set the default read size to read the footer from a file. For high latency
138
  // file systems and files with large metadata (>64KB) this can increase performance
139
  // by reducing the number of round-trips to retrieve the entire file metadata.
140
0
  void set_footer_read_size(size_t size) { footer_read_size_ = size; }
141
21.0k
  size_t footer_read_size() const { return footer_read_size_; }
142
143
 private:
144
  MemoryPool* pool_;
145
  int64_t buffer_size_ = kDefaultBufferSize;
146
  int32_t thrift_string_size_limit_ = kDefaultThriftStringSizeLimit;
147
  int32_t thrift_container_size_limit_ = kDefaultThriftContainerSizeLimit;
148
  bool buffered_stream_enabled_ = false;
149
  bool page_checksum_verification_ = false;
150
  // Used with a RecordReader.
151
  bool read_dense_for_nullable_ = false;
152
  size_t footer_read_size_ = kDefaultFooterReadSize;
153
  std::shared_ptr<FileDecryptionProperties> file_decryption_properties_;
154
};
155
156
ReaderProperties PARQUET_EXPORT default_reader_properties();
157
158
static constexpr int64_t kDefaultDataPageSize = 1024 * 1024;
159
static constexpr int64_t kDefaultMaxRowsPerPage = 20'000;
160
static constexpr bool DEFAULT_IS_DICTIONARY_ENABLED = true;
161
static constexpr int64_t DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT = kDefaultDataPageSize;
162
static constexpr int64_t DEFAULT_WRITE_BATCH_SIZE = 1024;
163
static constexpr int64_t DEFAULT_MAX_ROW_GROUP_LENGTH = 1024 * 1024;
164
static constexpr bool DEFAULT_ARE_STATISTICS_ENABLED = true;
165
static constexpr int64_t DEFAULT_MAX_STATISTICS_SIZE = 4096;
166
static constexpr Encoding::type DEFAULT_ENCODING = Encoding::UNKNOWN;
167
static const char DEFAULT_CREATED_BY[] = CREATED_BY_VERSION;
168
static constexpr Compression::type DEFAULT_COMPRESSION_TYPE = Compression::UNCOMPRESSED;
169
static constexpr bool DEFAULT_IS_PAGE_INDEX_ENABLED = true;
170
static constexpr SizeStatisticsLevel DEFAULT_SIZE_STATISTICS_LEVEL =
171
    SizeStatisticsLevel::PageAndColumnChunk;
172
173
struct PARQUET_EXPORT BloomFilterOptions {
174
  /// Expected number of distinct values (NDV) in the bloom filter.
175
  ///
176
  /// Bloom filters are most effective for high-cardinality columns. A good default
177
  /// is to set ndv equal to the number of rows. Lower values reduce disk usage but
178
  /// may not be worthwhile for very small NDVs.
179
  ///
180
  /// Increasing ndv (without increasing fpp) increases disk and memory usage.
181
  int32_t ndv = 1 << 20;
182
183
  /// False-positive probability (FPP) of the bloom filter.
184
  ///
185
  /// Lower FPP values require more disk and memory space. For a fixed ndv, the
186
  /// space requirement grows roughly proportional to log(1/fpp). Recommended
187
  /// values are 0.1, 0.05, or 0.01. Very small values are counterproductive as
188
  /// the bitset may exceed the size of the actual data. Set ndv appropriately
189
  /// to minimize space usage.
190
  ///
191
  /// Below is a table to demonstrate estimated size using common values.
192
  ///
193
  /// | ndv        | fpp   | bits/key | size      |
194
  /// |:-----------|:------|:---------|:----------|
195
  /// | 100,000    | 0.10  | 10.5     | 128 KiB   |
196
  /// | 100,000    | 0.05  | 10.5     | 128 KiB   |
197
  /// | 100,000    | 0.01  | 10.5     | 128 KiB   |
198
  /// | 1,000,000  | 0.10  | 8.4      | 1024 KiB  |
199
  /// | 1,000,000  | 0.05  | 8.4      | 1024 KiB  |
200
  /// | 1,000,000  | 0.01  | 16.8     | 2048 KiB  |
201
  /// | 10,000,000 | 0.10  | 6.7      | 8192 KiB  |
202
  /// | 10,000,000 | 0.05  | 13.4     | 16384 KiB |
203
  /// | 10,000,000 | 0.01  | 13.4     | 16384 KiB |
204
  double fpp = 0.05;
205
};
206
207
class PARQUET_EXPORT ColumnProperties {
208
 public:
209
  ColumnProperties(Encoding::type encoding = DEFAULT_ENCODING,
210
                   Compression::type codec = DEFAULT_COMPRESSION_TYPE,
211
                   bool dictionary_enabled = DEFAULT_IS_DICTIONARY_ENABLED,
212
                   bool statistics_enabled = DEFAULT_ARE_STATISTICS_ENABLED,
213
                   size_t max_stats_size = DEFAULT_MAX_STATISTICS_SIZE,
214
                   bool page_index_enabled = DEFAULT_IS_PAGE_INDEX_ENABLED)
215
      : encoding_(encoding),
216
        codec_(codec),
217
        dictionary_enabled_(dictionary_enabled),
218
        statistics_enabled_(statistics_enabled),
219
        max_stats_size_(max_stats_size),
220
0
        page_index_enabled_(page_index_enabled) {}
221
222
0
  void set_encoding(Encoding::type encoding) { encoding_ = encoding; }
223
224
0
  void set_compression(Compression::type codec) { codec_ = codec; }
225
226
0
  void set_dictionary_enabled(bool dictionary_enabled) {
227
0
    dictionary_enabled_ = dictionary_enabled;
228
0
  }
229
230
0
  void set_statistics_enabled(bool statistics_enabled) {
231
0
    statistics_enabled_ = statistics_enabled;
232
0
  }
233
234
0
  void set_max_statistics_size(size_t max_stats_size) {
235
0
    max_stats_size_ = max_stats_size;
236
0
  }
237
238
0
  void set_compression_level(int compression_level) {
239
0
    if (!codec_options_) {
240
0
      codec_options_ = std::make_shared<CodecOptions>();
241
0
    }
242
0
    codec_options_->compression_level = compression_level;
243
0
  }
244
245
0
  void set_codec_options(const std::shared_ptr<CodecOptions>& codec_options) {
246
0
    codec_options_ = codec_options;
247
0
  }
248
249
0
  void set_page_index_enabled(bool page_index_enabled) {
250
0
    page_index_enabled_ = page_index_enabled;
251
0
  }
252
253
0
  void set_bloom_filter_options(const BloomFilterOptions& bloom_filter_options) {
254
0
    if (bloom_filter_options.fpp >= 1.0 || bloom_filter_options.fpp <= 0.0) {
255
0
      throw ParquetException(
256
0
          "Bloom filter false positive probability must be in (0.0, 1.0), got " +
257
0
          std::to_string(bloom_filter_options.fpp));
258
0
    }
259
0
    bloom_filter_options_ = bloom_filter_options;
260
0
  }
261
262
0
  Encoding::type encoding() const { return encoding_; }
263
264
0
  Compression::type compression() const { return codec_; }
265
266
0
  bool dictionary_enabled() const { return dictionary_enabled_; }
267
268
0
  bool statistics_enabled() const { return statistics_enabled_; }
269
270
0
  size_t max_statistics_size() const { return max_stats_size_; }
271
272
0
  int compression_level() const {
273
0
    if (!codec_options_) {
274
0
      return ::arrow::util::kUseDefaultCompressionLevel;
275
0
    }
276
0
    return codec_options_->compression_level;
277
0
  }
278
279
0
  const std::shared_ptr<CodecOptions>& codec_options() const { return codec_options_; }
280
281
0
  bool page_index_enabled() const { return page_index_enabled_; }
282
283
0
  std::optional<BloomFilterOptions> bloom_filter_options() const {
284
0
    return bloom_filter_options_;
285
0
  }
286
287
0
  bool bloom_filter_enabled() const { return bloom_filter_options_.has_value(); }
288
289
 private:
290
  Encoding::type encoding_;
291
  Compression::type codec_;
292
  bool dictionary_enabled_;
293
  bool statistics_enabled_;
294
  size_t max_stats_size_;
295
  std::shared_ptr<CodecOptions> codec_options_;
296
  bool page_index_enabled_;
297
  std::optional<BloomFilterOptions> bloom_filter_options_;
298
};
299
300
/// EXPERIMENTAL: Options for content-defined chunking.
301
///
302
/// Content-defined chunking is an experimental feature that optimizes parquet
303
/// files for content addressable storage (CAS) systems by writing data pages
304
/// according to content-defined chunk boundaries. This allows for more
305
/// efficient deduplication of data across files, hence more efficient network
306
/// transfers and storage.
307
/// Each content-defined chunk is written as a separate parquet data page. The
308
/// following options control the chunks' size and the chunking process. Note
309
/// that the chunk size is calculated based on the logical value of the data,
310
/// before any encoding or compression is applied.
311
struct PARQUET_EXPORT CdcOptions {
312
  /// Minimum chunk size in bytes, default is 256 KiB
313
  /// The rolling hash will not be updated until this size is reached for each chunk.
314
  /// Note that all data sent through the hash function is counted towards the chunk
315
  /// size, including definition and repetition levels if present.
316
  int64_t min_chunk_size = 256 * 1024;
317
  /// Maximum chunk size in bytes, default is 1024 KiB
318
  /// The chunker will create a new chunk whenever the chunk size exceeds this value.
319
  /// Note that the parquet writer has a related `pagesize` property that controls
320
  /// the maximum size of a parquet data page after encoding. While setting
321
  /// `pagesize` to a smaller value than `max_chunk_size` doesn't affect the
322
  /// chunking effectiveness, it results in more small parquet data pages.
323
  int64_t max_chunk_size = 1024 * 1024;
324
  /// Number of bits by which to adjust the gearhash mask in order to center the chunk size
325
  /// around the average size more aggressively, default is 0
326
  /// Increasing the normalization level increases the probability of finding a chunk,
327
  /// improving the deduplication ratio, but also increasing the number of small chunks
328
  /// resulting in many small parquet data pages. The default value provides a good
329
  /// balance between deduplication ratio and fragmentation.
330
  /// Use norm_level=1 or norm_level=2 to reach a higher deduplication ratio at the
331
  /// expense of fragmentation. Negative values can also be used to reduce the
332
  /// probability of finding a chunk, resulting in larger chunks and fewer data pages.
333
  /// Note that values outside [-3, 3] are not recommended, prefer using the default
334
  /// value of 0 for most use cases.
335
  int norm_level = 0;
336
};
337
338
class PARQUET_EXPORT WriterProperties {
339
 public:
340
  class PARQUET_EXPORT Builder {
341
   public:
342
    Builder()
343
        : pool_(::arrow::default_memory_pool()),
344
          dictionary_pagesize_limit_(DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT),
345
          write_batch_size_(DEFAULT_WRITE_BATCH_SIZE),
346
          max_row_group_length_(DEFAULT_MAX_ROW_GROUP_LENGTH),
347
          pagesize_(kDefaultDataPageSize),
348
          max_rows_per_page_(kDefaultMaxRowsPerPage),
349
          version_(ParquetVersion::PARQUET_2_6),
350
          data_page_version_(ParquetDataPageVersion::V1),
351
          created_by_(DEFAULT_CREATED_BY),
352
          store_decimal_as_integer_(false),
353
          page_checksum_enabled_(false),
354
          size_statistics_level_(DEFAULT_SIZE_STATISTICS_LEVEL),
355
          content_defined_chunking_enabled_(false),
356
0
          content_defined_chunking_options_({}) {}
357
358
    explicit Builder(const WriterProperties& properties)
359
        : pool_(properties.memory_pool()),
360
          dictionary_pagesize_limit_(properties.dictionary_pagesize_limit()),
361
          write_batch_size_(properties.write_batch_size()),
362
          max_row_group_length_(properties.max_row_group_length()),
363
          pagesize_(properties.data_pagesize()),
364
          max_rows_per_page_(properties.max_rows_per_page()),
365
          version_(properties.version()),
366
          data_page_version_(properties.data_page_version()),
367
          created_by_(properties.created_by()),
368
          store_decimal_as_integer_(properties.store_decimal_as_integer()),
369
          page_checksum_enabled_(properties.page_checksum_enabled()),
370
          size_statistics_level_(properties.size_statistics_level()),
371
          sorting_columns_(properties.sorting_columns()),
372
          default_column_properties_(properties.default_column_properties()),
373
          content_defined_chunking_enabled_(
374
              properties.content_defined_chunking_enabled()),
375
          content_defined_chunking_options_(
376
0
              properties.content_defined_chunking_options()) {
377
0
      CopyColumnSpecificProperties(properties);
378
0
    }
379
380
    /// \brief EXPERIMENTAL: Use content-defined page chunking for all columns.
381
    ///
382
    /// Optimize parquet files for content addressable storage (CAS) systems by writing
383
    /// data pages according to content-defined chunk boundaries. This allows for more
384
    /// efficient deduplication of data across files, hence more efficient network
385
    /// transfers and storage. The chunking is based on a rolling hash algorithm that
386
    /// identifies chunk boundaries based on the actual content of the data.
387
    ///
388
    /// Note that only the WriteArrow() interface is supported at the moment.
389
0
    Builder* enable_content_defined_chunking() {
390
0
      content_defined_chunking_enabled_ = true;
391
0
      return this;
392
0
    }
393
394
    /// \brief EXPERIMENTAL: Disable content-defined page chunking for all columns.
395
0
    Builder* disable_content_defined_chunking() {
396
0
      content_defined_chunking_enabled_ = false;
397
0
      return this;
398
0
    }
399
400
    /// \brief EXPERIMENTAL: Specify content-defined chunking options, see CdcOptions.
401
0
    Builder* content_defined_chunking_options(const CdcOptions& options) {
402
0
      content_defined_chunking_options_ = options;
403
0
      return this;
404
0
    }
405
406
    /// Specify the memory pool for the writer. Default default_memory_pool.
407
0
    Builder* memory_pool(MemoryPool* pool) {
408
0
      pool_ = pool;
409
0
      return this;
410
0
    }
411
412
    /// Enable dictionary encoding in general for all columns. Default
413
    /// enabled.
414
0
    Builder* enable_dictionary() {
415
0
      default_column_properties_.set_dictionary_enabled(true);
416
0
      return this;
417
0
    }
418
419
    /// Disable dictionary encoding in general for all columns. Default
420
    /// enabled.
421
0
    Builder* disable_dictionary() {
422
0
      default_column_properties_.set_dictionary_enabled(false);
423
0
      return this;
424
0
    }
425
426
    /// Enable dictionary encoding for column specified by `path`. Default
427
    /// enabled.
428
0
    Builder* enable_dictionary(const std::string& path) {
429
0
      dictionary_enabled_[path] = true;
430
0
      return this;
431
0
    }
432
433
    /// Enable dictionary encoding for column specified by `path`. Default
434
    /// enabled.
435
0
    Builder* enable_dictionary(const std::shared_ptr<schema::ColumnPath>& path) {
436
0
      return this->enable_dictionary(path->ToDotString());
437
0
    }
438
439
    /// Disable dictionary encoding for column specified by `path`. Default
440
    /// enabled.
441
0
    Builder* disable_dictionary(const std::string& path) {
442
0
      dictionary_enabled_[path] = false;
443
0
      return this;
444
0
    }
445
446
    /// Disable dictionary encoding for column specified by `path`. Default
447
    /// enabled.
448
0
    Builder* disable_dictionary(const std::shared_ptr<schema::ColumnPath>& path) {
449
0
      return this->disable_dictionary(path->ToDotString());
450
0
    }
451
452
    /// Specify the dictionary page size limit per row group. Default 1MB.
453
0
    Builder* dictionary_pagesize_limit(int64_t dictionary_psize_limit) {
454
0
      dictionary_pagesize_limit_ = dictionary_psize_limit;
455
0
      return this;
456
0
    }
457
458
    /// Specify the write batch size while writing batches of Arrow values
459
    /// into Parquet. Default 1024.
460
0
    Builder* write_batch_size(int64_t write_batch_size) {
461
0
      write_batch_size_ = write_batch_size;
462
0
      return this;
463
0
    }
464
465
    /// Specify the max number of rows to put in a single row group.
466
    /// Default 1Mi rows.
467
0
    Builder* max_row_group_length(int64_t max_row_group_length) {
468
0
      max_row_group_length_ = max_row_group_length;
469
0
      return this;
470
0
    }
471
472
    /// Specify the data page size.
473
    /// Default 1MB.
474
0
    Builder* data_pagesize(int64_t pg_size) {
475
0
      pagesize_ = pg_size;
476
0
      return this;
477
0
    }
478
479
    /// Specify the maximum number of rows per data page.
480
    /// Default 20K rows.
481
0
    Builder* max_rows_per_page(int64_t max_rows) {
482
0
      max_rows_per_page_ = max_rows;
483
0
      return this;
484
0
    }
485
486
    /// Specify the data page version.
487
    /// Default V1.
488
0
    Builder* data_page_version(ParquetDataPageVersion data_page_version) {
489
0
      data_page_version_ = data_page_version;
490
0
      return this;
491
0
    }
492
493
    /// Specify the Parquet file version.
494
    /// Default PARQUET_2_6.
495
0
    Builder* version(ParquetVersion::type version) {
496
0
      version_ = version;
497
0
      return this;
498
0
    }
499
500
0
    Builder* created_by(const std::string& created_by) {
501
0
      created_by_ = created_by;
502
0
      return this;
503
0
    }
504
505
0
    Builder* enable_page_checksum() {
506
0
      page_checksum_enabled_ = true;
507
0
      return this;
508
0
    }
509
510
0
    Builder* disable_page_checksum() {
511
0
      page_checksum_enabled_ = false;
512
0
      return this;
513
0
    }
514
515
    /// \brief Define the encoding that is used when we don't utilise dictionary encoding.
516
    ///
517
    /// This is only applied if dictionary encoding is disabled. If the dictionary grows
518
    /// too large we always fall back to the PLAIN encoding.
519
0
    Builder* encoding(Encoding::type encoding_type) {
520
0
      if (encoding_type == Encoding::PLAIN_DICTIONARY ||
521
0
          encoding_type == Encoding::RLE_DICTIONARY) {
522
0
        throw ParquetException("Can't use dictionary encoding as fallback encoding");
523
0
      }
524
0
525
0
      default_column_properties_.set_encoding(encoding_type);
526
0
      return this;
527
0
    }
528
529
    /// \brief Define the encoding that is used when we don't utilise dictionary encoding.
530
    ///
531
    /// This is only applied if dictionary encoding is disabled. If the dictionary grows
532
    /// too large we always fall back to the PLAIN encoding.
533
0
    Builder* encoding(const std::string& path, Encoding::type encoding_type) {
534
0
      if (encoding_type == Encoding::PLAIN_DICTIONARY ||
535
0
          encoding_type == Encoding::RLE_DICTIONARY) {
536
0
        throw ParquetException("Can't use dictionary encoding as fallback encoding");
537
0
      }
538
539
0
      encodings_[path] = encoding_type;
540
0
      return this;
541
0
    }
542
543
    /// \brief Define the encoding that is used when we don't utilise dictionary encoding.
544
    ///
545
    /// This is only applied if dictionary encoding is disabled. If the dictionary grows
546
    /// too large we always fall back to the PLAIN encoding.
547
    Builder* encoding(const std::shared_ptr<schema::ColumnPath>& path,
548
0
                      Encoding::type encoding_type) {
549
0
      return this->encoding(path->ToDotString(), encoding_type);
550
0
    }
551
552
    /// Specify compression codec in general for all columns.
553
    /// Default UNCOMPRESSED.
554
0
    Builder* compression(Compression::type codec) {
555
0
      default_column_properties_.set_compression(codec);
556
0
      return this;
557
0
    }
558
559
    /// Specify max statistics size to store min max value.
560
    /// Default 4KB.
561
0
    Builder* max_statistics_size(size_t max_stats_sz) {
562
0
      default_column_properties_.set_max_statistics_size(max_stats_sz);
563
0
      return this;
564
0
    }
565
566
    /// Specify compression codec for the column specified by `path`.
567
    /// Default UNCOMPRESSED.
568
0
    Builder* compression(const std::string& path, Compression::type codec) {
569
0
      codecs_[path] = codec;
570
0
      return this;
571
0
    }
572
573
    /// Specify compression codec for the column specified by `path`.
574
    /// Default UNCOMPRESSED.
575
    Builder* compression(const std::shared_ptr<schema::ColumnPath>& path,
576
0
                         Compression::type codec) {
577
0
      return this->compression(path->ToDotString(), codec);
578
0
    }
579
580
    /// \brief Specify the default compression level for the compressor in
581
    /// every column.  In case a column does not have an explicitly specified
582
    /// compression level, the default one would be used.
583
    ///
584
    /// The provided compression level is compressor specific. Users should
585
    /// familiarize themselves with the available levels for the selected
586
    /// compressor.  If the compressor does not allow for selecting different
587
    /// compression levels, calling this function would not have any effect.
588
    /// Parquet and Arrow do not validate the passed compression level.  If no
589
    /// level is selected by the user or if the special
590
    /// std::numeric_limits<int>::min() value is passed, then Arrow selects the
591
    /// compression level.
592
    ///
593
    /// If other compressor-specific options need to be set in addition to the compression
594
    /// level, use the codec_options method.
595
0
    Builder* compression_level(int compression_level) {
596
0
      default_column_properties_.set_compression_level(compression_level);
597
0
      return this;
598
0
    }
599
600
    /// \brief Specify a compression level for the compressor for the column
601
    /// described by path.
602
    ///
603
    /// The provided compression level is compressor specific. Users should
604
    /// familiarize themselves with the available levels for the selected
605
    /// compressor.  If the compressor does not allow for selecting different
606
    /// compression levels, calling this function would not have any effect.
607
    /// Parquet and Arrow do not validate the passed compression level.  If no
608
    /// level is selected by the user or if the special
609
    /// std::numeric_limits<int>::min() value is passed, then Arrow selects the
610
    /// compression level.
611
0
    Builder* compression_level(const std::string& path, int compression_level) {
612
0
      if (!codec_options_[path]) {
613
0
        codec_options_[path] = std::make_shared<CodecOptions>();
614
0
      }
615
0
      codec_options_[path]->compression_level = compression_level;
616
0
      return this;
617
0
    }
618
619
    /// \brief Specify a compression level for the compressor for the column
620
    /// described by path.
621
    ///
622
    /// The provided compression level is compressor specific. Users should
623
    /// familiarize themselves with the available levels for the selected
624
    /// compressor.  If the compressor does not allow for selecting different
625
    /// compression levels, calling this function would not have any effect.
626
    /// Parquet and Arrow do not validate the passed compression level.  If no
627
    /// level is selected by the user or if the special
628
    /// std::numeric_limits<int>::min() value is passed, then Arrow selects the
629
    /// compression level.
630
    Builder* compression_level(const std::shared_ptr<schema::ColumnPath>& path,
631
0
                               int compression_level) {
632
0
      return this->compression_level(path->ToDotString(), compression_level);
633
0
    }
634
635
    /// \brief Specify the default codec options for the compressor in
636
    /// every column.
637
    ///
638
    /// The codec options allow configuring the compression level as well
639
    /// as other codec-specific options.
640
    Builder* codec_options(
641
0
        const std::shared_ptr<::arrow::util::CodecOptions>& codec_options) {
642
0
      default_column_properties_.set_codec_options(codec_options);
643
0
      return this;
644
0
    }
645
646
    /// \brief Specify the codec options for the compressor for the column
647
    /// described by path.
648
    Builder* codec_options(
649
        const std::string& path,
650
0
        const std::shared_ptr<::arrow::util::CodecOptions>& codec_options) {
651
0
      codec_options_[path] = codec_options;
652
0
      return this;
653
0
    }
654
655
    /// \brief Specify the codec options for the compressor for the column
656
    /// described by path.
657
    Builder* codec_options(
658
        const std::shared_ptr<schema::ColumnPath>& path,
659
0
        const std::shared_ptr<::arrow::util::CodecOptions>& codec_options) {
660
0
      return this->codec_options(path->ToDotString(), codec_options);
661
0
    }
662
663
    /// Define the file encryption properties.
664
    /// Default NULL.
665
    Builder* encryption(
666
0
        std::shared_ptr<FileEncryptionProperties> file_encryption_properties) {
667
0
      file_encryption_properties_ = std::move(file_encryption_properties);
668
0
      return this;
669
0
    }
670
671
    /// Enable statistics in general.
672
    /// Default enabled.
673
0
    Builder* enable_statistics() {
674
0
      default_column_properties_.set_statistics_enabled(true);
675
0
      return this;
676
0
    }
677
678
    /// Disable statistics in general.
679
    /// Default enabled.
680
0
    Builder* disable_statistics() {
681
0
      default_column_properties_.set_statistics_enabled(false);
682
0
      return this;
683
0
    }
684
685
    /// Enable statistics for the column specified by `path`.
686
    /// Default enabled.
687
0
    Builder* enable_statistics(const std::string& path) {
688
0
      statistics_enabled_[path] = true;
689
0
      return this;
690
0
    }
691
692
    /// Enable statistics for the column specified by `path`.
693
    /// Default enabled.
694
0
    Builder* enable_statistics(const std::shared_ptr<schema::ColumnPath>& path) {
695
0
      return this->enable_statistics(path->ToDotString());
696
0
    }
697
698
    /// Define the sorting columns.
699
    /// Default empty.
700
    ///
701
    /// If sorting columns are set, user should ensure that records
702
    /// are sorted by sorting columns. Otherwise, the stored data
703
    /// will be inconsistent with sorting_columns metadata.
704
0
    Builder* set_sorting_columns(std::vector<SortingColumn> sorting_columns) {
705
0
      sorting_columns_ = std::move(sorting_columns);
706
0
      return this;
707
0
    }
708
709
    /// Disable statistics for the column specified by `path`.
710
    /// Default enabled.
711
0
    Builder* disable_statistics(const std::string& path) {
712
0
      statistics_enabled_[path] = false;
713
0
      return this;
714
0
    }
715
716
    /// Disable statistics for the column specified by `path`.
717
    /// Default enabled.
718
0
    Builder* disable_statistics(const std::shared_ptr<schema::ColumnPath>& path) {
719
0
      return this->disable_statistics(path->ToDotString());
720
0
    }
721
722
    /// Disable bloom filter for the column specified by `path`.
723
    /// Default disabled.
724
0
    Builder* disable_bloom_filter(const std::string& path) {
725
0
      bloom_filter_options_.erase(path);
726
0
      return this;
727
0
    }
728
729
    /// Disable bloom filter for the column specified by `path`.
730
    /// Default disabled.
731
0
    Builder* disable_bloom_filter(const std::shared_ptr<schema::ColumnPath>& path) {
732
0
      return this->disable_bloom_filter(path->ToDotString());
733
0
    }
734
735
    /// Enable bloom filter for the column specified by `path`.
736
    ///
737
    /// Default disabled.
738
    ///
739
    /// \note Bloom filter is not supported for boolean columns. ParquetException will
740
    /// be thrown during write if the column is of boolean type.
741
    Builder* enable_bloom_filter(const std::string& path,
742
0
                                 const BloomFilterOptions& bloom_filter_options) {
743
0
      bloom_filter_options_[path] = bloom_filter_options;
744
0
      return this;
745
0
    }
746
747
    /// Enable bloom filter for the column specified by `path`.
748
    ///
749
    /// Default disabled.
750
    ///
751
    /// \note Bloom filter is not supported for boolean columns. ParquetException will
752
    /// be thrown during write if the column is of boolean type.
753
    Builder* enable_bloom_filter(const std::shared_ptr<schema::ColumnPath>& path,
754
0
                                 const BloomFilterOptions& bloom_filter_options) {
755
0
      return this->enable_bloom_filter(path->ToDotString(), bloom_filter_options);
756
0
    }
757
758
    /// Allow decimals with 1 <= precision <= 18 to be stored as integers.
759
    ///
760
    /// In Parquet, DECIMAL can be stored in any of the following physical types:
761
    /// - int32: for 1 <= precision <= 9.
762
    /// - int64: for 10 <= precision <= 18.
763
    /// - fixed_len_byte_array: precision is limited by the array size.
764
    ///   Length n can store <= floor(log_10(2^(8*n - 1) - 1)) base-10 digits.
765
    /// - binary: precision is unlimited. The minimum number of bytes to store
766
    ///   the unscaled value is used.
767
    ///
768
    /// By default, this is DISABLED and all decimal types annotate fixed_len_byte_array.
769
    ///
770
    /// When enabled, the C++ writer uses the following physical types to store decimals:
771
    /// - int32: for 1 <= precision <= 9.
772
    /// - int64: for 10 <= precision <= 18.
773
    /// - fixed_len_byte_array: for precision > 18.
774
    ///
775
    /// As a consequence, decimal columns stored in integer types are more compact.
776
0
    Builder* enable_store_decimal_as_integer() {
777
0
      store_decimal_as_integer_ = true;
778
0
      return this;
779
0
    }
780
781
    /// Disable decimal logical type with 1 <= precision <= 18 to be stored
782
    /// as integer physical type.
783
    ///
784
    /// Default disabled.
785
0
    Builder* disable_store_decimal_as_integer() {
786
0
      store_decimal_as_integer_ = false;
787
0
      return this;
788
0
    }
789
790
    /// Enable writing page index in general for all columns. Default enabled.
791
    ///
792
    /// Writing statistics to the page index disables the old method of writing
793
    /// statistics to each data page header.
794
    /// The page index makes filtering more efficient than the page header, as
795
    /// it gathers all the statistics for a Parquet file in a single place,
796
    /// avoiding scattered I/O.
797
    ///
798
    /// Please check the link below for more details:
799
    /// https://github.com/apache/parquet-format/blob/master/PageIndex.md
800
0
    Builder* enable_write_page_index() {
801
0
      default_column_properties_.set_page_index_enabled(true);
802
0
      return this;
803
0
    }
804
805
    /// Disable writing page index in general for all columns. Default enabled.
806
0
    Builder* disable_write_page_index() {
807
0
      default_column_properties_.set_page_index_enabled(false);
808
0
      return this;
809
0
    }
810
811
    /// Enable writing page index for column specified by `path`. Default enabled.
812
0
    Builder* enable_write_page_index(const std::string& path) {
813
0
      page_index_enabled_[path] = true;
814
0
      return this;
815
0
    }
816
817
    /// Enable writing page index for column specified by `path`. Default enabled.
818
0
    Builder* enable_write_page_index(const std::shared_ptr<schema::ColumnPath>& path) {
819
0
      return this->enable_write_page_index(path->ToDotString());
820
0
    }
821
822
    /// Disable writing page index for column specified by `path`. Default enabled.
823
0
    Builder* disable_write_page_index(const std::string& path) {
824
0
      page_index_enabled_[path] = false;
825
0
      return this;
826
0
    }
827
828
    /// Disable writing page index for column specified by `path`. Default enabled.
829
0
    Builder* disable_write_page_index(const std::shared_ptr<schema::ColumnPath>& path) {
830
0
      return this->disable_write_page_index(path->ToDotString());
831
0
    }
832
833
    /// \brief Set the level to write size statistics for all columns. Default is
834
    /// PageAndColumnChunk.
835
    ///
836
    /// \param level The level to write size statistics. Note that if page index is not
837
    /// enabled, page level size statistics will not be written even if the level
838
    /// is set to PageAndColumnChunk.
839
0
    Builder* set_size_statistics_level(SizeStatisticsLevel level) {
840
0
      size_statistics_level_ = level;
841
0
      return this;
842
0
    }
843
844
    /// \brief Build the WriterProperties with the builder parameters.
845
    /// \return The WriterProperties defined by the builder.
846
0
    std::shared_ptr<WriterProperties> build() {
847
0
      std::unordered_map<std::string, ColumnProperties> column_properties;
848
0
      auto get = [&](const std::string& key) -> ColumnProperties& {
849
0
        auto it = column_properties.find(key);
850
0
        if (it == column_properties.end())
851
0
          return column_properties[key] = default_column_properties_;
852
0
        else
853
0
          return it->second;
854
0
      };
855
0
856
0
      for (const auto& item : encodings_) get(item.first).set_encoding(item.second);
857
0
      for (const auto& item : codecs_) get(item.first).set_compression(item.second);
858
0
      for (const auto& item : codec_options_)
859
0
        get(item.first).set_codec_options(item.second);
860
0
      for (const auto& item : dictionary_enabled_)
861
0
        get(item.first).set_dictionary_enabled(item.second);
862
0
      for (const auto& item : statistics_enabled_)
863
0
        get(item.first).set_statistics_enabled(item.second);
864
0
      for (const auto& item : page_index_enabled_)
865
0
        get(item.first).set_page_index_enabled(item.second);
866
0
      for (const auto& item : bloom_filter_options_)
867
0
        get(item.first).set_bloom_filter_options(item.second);
868
0
869
0
      return std::shared_ptr<WriterProperties>(new WriterProperties(
870
0
          pool_, dictionary_pagesize_limit_, write_batch_size_, max_row_group_length_,
871
0
          pagesize_, max_rows_per_page_, version_, created_by_, page_checksum_enabled_,
872
0
          size_statistics_level_, std::move(file_encryption_properties_),
873
0
          default_column_properties_, column_properties, data_page_version_,
874
0
          store_decimal_as_integer_, std::move(sorting_columns_),
875
0
          content_defined_chunking_enabled_, content_defined_chunking_options_));
876
0
    }
877
878
   private:
879
    void CopyColumnSpecificProperties(const WriterProperties& properties);
880
881
    MemoryPool* pool_;
882
    int64_t dictionary_pagesize_limit_;
883
    int64_t write_batch_size_;
884
    int64_t max_row_group_length_;
885
    int64_t pagesize_;
886
    int64_t max_rows_per_page_;
887
    ParquetVersion::type version_;
888
    ParquetDataPageVersion data_page_version_;
889
    std::string created_by_;
890
    bool store_decimal_as_integer_;
891
    bool page_checksum_enabled_;
892
    SizeStatisticsLevel size_statistics_level_;
893
894
    std::shared_ptr<FileEncryptionProperties> file_encryption_properties_;
895
896
    // If empty, there are no sorting columns.
897
    std::vector<SortingColumn> sorting_columns_;
898
899
    // Settings used for each column unless overridden in any of the maps below
900
    ColumnProperties default_column_properties_;
901
    std::unordered_map<std::string, Encoding::type> encodings_;
902
    std::unordered_map<std::string, Compression::type> codecs_;
903
    std::unordered_map<std::string, std::shared_ptr<CodecOptions>> codec_options_;
904
    std::unordered_map<std::string, bool> dictionary_enabled_;
905
    std::unordered_map<std::string, bool> statistics_enabled_;
906
    std::unordered_map<std::string, bool> page_index_enabled_;
907
    std::unordered_map<std::string, BloomFilterOptions> bloom_filter_options_;
908
909
    bool content_defined_chunking_enabled_;
910
    CdcOptions content_defined_chunking_options_;
911
  };
912
913
0
  inline MemoryPool* memory_pool() const { return pool_; }
914
915
0
  inline int64_t dictionary_pagesize_limit() const { return dictionary_pagesize_limit_; }
916
917
0
  inline int64_t write_batch_size() const { return write_batch_size_; }
918
919
0
  inline int64_t max_row_group_length() const { return max_row_group_length_; }
920
921
0
  inline int64_t data_pagesize() const { return pagesize_; }
922
923
0
  inline int64_t max_rows_per_page() const { return max_rows_per_page_; }
924
925
0
  inline ParquetDataPageVersion data_page_version() const {
926
0
    return parquet_data_page_version_;
927
0
  }
928
929
0
  inline ParquetVersion::type version() const { return parquet_version_; }
930
931
0
  inline std::string created_by() const { return parquet_created_by_; }
932
933
0
  inline bool store_decimal_as_integer() const { return store_decimal_as_integer_; }
934
935
0
  inline bool page_checksum_enabled() const { return page_checksum_enabled_; }
936
937
0
  inline bool content_defined_chunking_enabled() const {
938
0
    return content_defined_chunking_enabled_;
939
0
  }
940
0
  inline CdcOptions content_defined_chunking_options() const {
941
0
    return content_defined_chunking_options_;
942
0
  }
943
944
0
  inline SizeStatisticsLevel size_statistics_level() const {
945
0
    return size_statistics_level_;
946
0
  }
947
948
0
  /// \brief Return the dictionary index encoding implied by the Parquet version.
  ///
  /// \return Encoding::PLAIN_DICTIONARY when the configured version is
  /// ParquetVersion::PARQUET_1_0, Encoding::RLE_DICTIONARY for any other version.
  inline Encoding::type dictionary_index_encoding() const {
949
0
    if (parquet_version_ == ParquetVersion::PARQUET_1_0) {
950
0
      return Encoding::PLAIN_DICTIONARY;
951
0
    } else {
952
0
      return Encoding::RLE_DICTIONARY;
953
0
    }
954
0
  }
955
956
0
  /// \brief Return the dictionary page encoding implied by the Parquet version.
  ///
  /// \return Encoding::PLAIN_DICTIONARY when the configured version is
  /// ParquetVersion::PARQUET_1_0, Encoding::PLAIN for any other version.
  inline Encoding::type dictionary_page_encoding() const {
957
0
    if (parquet_version_ == ParquetVersion::PARQUET_1_0) {
958
0
      return Encoding::PLAIN_DICTIONARY;
959
0
    } else {
960
0
      return Encoding::PLAIN;
961
0
    }
962
0
  }
963
964
  /// \brief Return the properties that apply to the column described by path.
  ///
  /// The lookup key is the path's dot-string. When no column-specific entry
  /// exists, the default column properties are returned instead.
  const ColumnProperties& column_properties(
965
0
      const std::shared_ptr<schema::ColumnPath>& path) const {
966
0
    auto it = column_properties_.find(path->ToDotString());
967
0
    if (it != column_properties_.end()) return it->second;
968
0
    return default_column_properties_;
969
0
  }
970
971
0
  Encoding::type encoding(const std::shared_ptr<schema::ColumnPath>& path) const {
972
0
    return column_properties(path).encoding();
973
0
  }
974
975
0
  Compression::type compression(const std::shared_ptr<schema::ColumnPath>& path) const {
976
0
    return column_properties(path).compression();
977
0
  }
978
979
0
  int compression_level(const std::shared_ptr<schema::ColumnPath>& path) const {
980
0
    return column_properties(path).compression_level();
981
0
  }
982
983
  const std::shared_ptr<CodecOptions> codec_options(
984
0
      const std::shared_ptr<schema::ColumnPath>& path) const {
985
0
    return column_properties(path).codec_options();
986
0
  }
987
988
0
  bool dictionary_enabled(const std::shared_ptr<schema::ColumnPath>& path) const {
989
0
    return column_properties(path).dictionary_enabled();
990
0
  }
991
992
0
  const std::vector<SortingColumn>& sorting_columns() const { return sorting_columns_; }
993
994
0
  bool statistics_enabled(const std::shared_ptr<schema::ColumnPath>& path) const {
995
0
    return column_properties(path).statistics_enabled();
996
0
  }
997
998
0
  size_t max_statistics_size(const std::shared_ptr<schema::ColumnPath>& path) const {
999
0
    return column_properties(path).max_statistics_size();
1000
0
  }
1001
1002
0
  bool page_index_enabled(const std::shared_ptr<schema::ColumnPath>& path) const {
1003
0
    return column_properties(path).page_index_enabled();
1004
0
  }
1005
1006
0
  /// \brief Return whether page index writing is enabled for any column.
  ///
  /// True when it is enabled in the default column properties or in at least
  /// one column-specific entry; false otherwise.
  bool page_index_enabled() const {
1007
0
    if (default_column_properties_.page_index_enabled()) {
1008
0
      return true;
1009
0
    }
1010
0
    for (const auto& item : column_properties_) {
1011
0
      if (item.second.page_index_enabled()) {
1012
0
        return true;
1013
0
      }
1014
0
    }
1015
0
    return false;
1016
0
  }
1017
1018
  // Return whether bloom filter is enabled for any column.
1019
0
  bool bloom_filter_enabled() const {
1020
0
    return std::any_of(column_properties_.cbegin(), column_properties_.cend(),
1021
0
                       [](const auto& p) { return p.second.bloom_filter_enabled(); });
1022
0
  }
1023
1024
  std::optional<BloomFilterOptions> bloom_filter_options(
1025
0
      const std::shared_ptr<schema::ColumnPath>& path) const {
1026
0
    return column_properties(path).bloom_filter_options();
1027
0
  }
1028
1029
0
  /// \brief Return a raw, non-owning pointer to the file encryption properties.
  ///
  /// The pointer is null when no file encryption properties are held.
  inline FileEncryptionProperties* file_encryption_properties() const {
1030
0
    return file_encryption_properties_.get();
1031
0
  }
1032
1033
  /// \brief Return the encryption properties for the column described by path.
  ///
  /// The lookup is delegated to the file encryption properties. When no file
  /// encryption properties are set, NULLPTR is returned.
  std::shared_ptr<ColumnEncryptionProperties> column_encryption_properties(
1034
0
      const std::string& path) const {
1035
0
    if (file_encryption_properties_) {
1036
0
      return file_encryption_properties_->column_encryption_properties(path);
1037
0
    } else {
1038
0
      return NULLPTR;
1039
0
    }
1040
0
  }
Unexecuted instantiation: parquet::WriterProperties::column_encryption_properties(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&) const
Unexecuted instantiation: parquet::WriterProperties::column_encryption_properties(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&) const
1041
1042
  /// \brief Return the default column properties.
1043
0
  const ColumnProperties& default_column_properties() const {
1044
0
    return default_column_properties_;
1045
0
  }
1046
1047
 private:
1048
  explicit WriterProperties(
1049
      MemoryPool* pool, int64_t dictionary_pagesize_limit, int64_t write_batch_size,
1050
      int64_t max_row_group_length, int64_t pagesize, int64_t max_rows_per_page,
1051
      ParquetVersion::type version, const std::string& created_by,
1052
      bool page_write_checksum_enabled, SizeStatisticsLevel size_statistics_level,
1053
      std::shared_ptr<FileEncryptionProperties> file_encryption_properties,
1054
      const ColumnProperties& default_column_properties,
1055
      const std::unordered_map<std::string, ColumnProperties>& column_properties,
1056
      ParquetDataPageVersion data_page_version, bool store_short_decimal_as_integer,
1057
      std::vector<SortingColumn> sorting_columns, bool content_defined_chunking_enabled,
1058
      CdcOptions content_defined_chunking_options)
1059
      : pool_(pool),
1060
        dictionary_pagesize_limit_(dictionary_pagesize_limit),
1061
        write_batch_size_(write_batch_size),
1062
        max_row_group_length_(max_row_group_length),
1063
        pagesize_(pagesize),
1064
        max_rows_per_page_(max_rows_per_page),
1065
        parquet_data_page_version_(data_page_version),
1066
        parquet_version_(version),
1067
        parquet_created_by_(created_by),
1068
        store_decimal_as_integer_(store_short_decimal_as_integer),
1069
        page_checksum_enabled_(page_write_checksum_enabled),
1070
        size_statistics_level_(size_statistics_level),
1071
        file_encryption_properties_(file_encryption_properties),
1072
        sorting_columns_(std::move(sorting_columns)),
1073
        default_column_properties_(default_column_properties),
1074
        column_properties_(column_properties),
1075
        content_defined_chunking_enabled_(content_defined_chunking_enabled),
1076
0
        content_defined_chunking_options_(content_defined_chunking_options) {}
1077
1078
  MemoryPool* pool_;
1079
  int64_t dictionary_pagesize_limit_;
1080
  int64_t write_batch_size_;
1081
  int64_t max_row_group_length_;
1082
  int64_t pagesize_;
1083
  int64_t max_rows_per_page_;
1084
  ParquetDataPageVersion parquet_data_page_version_;
1085
  ParquetVersion::type parquet_version_;
1086
  std::string parquet_created_by_;
1087
  bool store_decimal_as_integer_;
1088
  bool page_checksum_enabled_;
1089
  SizeStatisticsLevel size_statistics_level_;
1090
1091
  std::shared_ptr<FileEncryptionProperties> file_encryption_properties_;
1092
1093
  std::vector<SortingColumn> sorting_columns_;
1094
1095
  ColumnProperties default_column_properties_;
1096
  std::unordered_map<std::string, ColumnProperties> column_properties_;
1097
1098
  bool content_defined_chunking_enabled_;
1099
  CdcOptions content_defined_chunking_options_;
1100
};
1101
1102
PARQUET_EXPORT const std::shared_ptr<WriterProperties>& default_writer_properties();
1103
1104
// ----------------------------------------------------------------------
1105
// Properties specific to Apache Arrow columnar read and write
1106
1107
/// Default value of the use_threads setting for the Arrow reader and writer
/// properties. Declared `constexpr inline` (like the other defaults in this
/// header) so that all translation units share a single definition.
constexpr inline bool kArrowDefaultUseThreads = false;
1108
1109
// Default number of rows to read when using ::arrow::RecordBatchReader
1110
// 65536 rows. Declared `constexpr inline` (like the other defaults in this
// header) so that all translation units share a single definition.
constexpr inline int64_t kArrowDefaultBatchSize = 64 * 1024;
1111
1112
// Default Arrow type that ArrowReaderProperties uses for its binary type setting.
constexpr inline ::arrow::Type::type kArrowDefaultBinaryType = ::arrow::Type::BINARY;
1113
// Default Arrow type that ArrowReaderProperties uses for its list type setting.
constexpr inline ::arrow::Type::type kArrowDefaultListType = ::arrow::Type::LIST;
1114
1115
/// EXPERIMENTAL: Properties for configuring FileReader behavior.
1116
class PARQUET_EXPORT ArrowReaderProperties {
1117
 public:
1118
  explicit ArrowReaderProperties(bool use_threads = kArrowDefaultUseThreads)
1119
103k
      : use_threads_(use_threads),
1120
103k
        read_dict_indices_(),
1121
103k
        batch_size_(kArrowDefaultBatchSize),
1122
103k
        pre_buffer_(true),
1123
103k
        cache_options_(::arrow::io::CacheOptions::LazyDefaults()),
1124
103k
        coerce_int96_timestamp_unit_(::arrow::TimeUnit::NANO),
1125
103k
        binary_type_(kArrowDefaultBinaryType),
1126
103k
        list_type_(kArrowDefaultListType),
1127
103k
        arrow_extensions_enabled_(false),
1128
103k
        should_load_statistics_(false),
1129
103k
        smallest_decimal_enabled_(false) {}
1130
1131
  /// \brief Set whether to use the IO thread pool to parse columns in parallel.
1132
  ///
1133
  /// Default is false.
1134
0
  void set_use_threads(bool use_threads) { use_threads_ = use_threads; }
1135
  /// Return whether multiple threads will be used.
1136
60.9k
  bool use_threads() const { return use_threads_; }
1137
1138
  /// \brief Set whether to read a particular column as dictionary encoded.
1139
  ///
1140
  /// If the file metadata contains a serialized Arrow schema, this option may
  /// be set automatically from it (see ArrowWriterProperties::Builder::store_schema).
1141
  ///
1142
  /// This is only supported for columns with a Parquet physical type of
1143
  /// BYTE_ARRAY, such as string or binary types.
1144
0
  void set_read_dictionary(int column_index, bool read_dict) {
1145
0
    if (read_dict) {
1146
0
      read_dict_indices_.insert(column_index);
1147
0
    } else {
1148
0
      read_dict_indices_.erase(column_index);
1149
0
    }
1150
0
  }
1151
  /// Return whether the column at the index will be read as dictionary.
1152
257k
  bool read_dictionary(int column_index) const {
1153
257k
    if (read_dict_indices_.find(column_index) != read_dict_indices_.end()) {
1154
0
      return true;
1155
257k
    } else {
1156
257k
      return false;
1157
257k
    }
1158
257k
  }
1159
1160
  /// \brief Set the Arrow binary type to read BYTE_ARRAY columns as.
1161
  ///
1162
  /// Allowed values are Type::BINARY, Type::LARGE_BINARY and Type::BINARY_VIEW.
1163
  /// Default is Type::BINARY.
1164
  ///
1165
  /// If a BYTE_ARRAY column has the STRING logical type, it is read as the
1166
  /// Arrow string type corresponding to the configured binary type (for example
1167
  /// Type::LARGE_STRING if the configured binary type is Type::LARGE_BINARY).
1168
  ///
1169
  /// However, if a serialized Arrow schema is found in the Parquet metadata,
1170
  /// this setting is ignored and the Arrow schema takes precedence
1171
  /// (see ArrowWriterProperties::store_schema).
1172
0
  void set_binary_type(::arrow::Type::type value) { binary_type_ = value; }
1173
  /// Return the Arrow binary type to read BYTE_ARRAY columns as.
1174
46.7k
  ::arrow::Type::type binary_type() const { return binary_type_; }
1175
1176
  /// \brief Set the Arrow list type to read Parquet list columns as.
1177
  ///
1178
  /// Allowed values are Type::LIST and Type::LARGE_LIST.
1179
  /// Default is Type::LIST.
1180
  ///
1181
  /// However, if a serialized Arrow schema is found in the Parquet metadata,
1182
  /// this setting is ignored and the Arrow schema takes precedence
1183
  /// (see ArrowWriterProperties::store_schema).
1184
0
  void set_list_type(::arrow::Type::type value) { list_type_ = value; }
1185
  /// Return the Arrow list type to read Parquet list columns as.
1186
34.5k
  ::arrow::Type::type list_type() const { return list_type_; }
1187
1188
  /// \brief Set the maximum number of rows to read into a record batch.
1189
  ///
1190
  /// A batch will only have fewer rows when there are no more rows in the file.
1191
  /// Note that some APIs such as ReadTable may ignore this setting.
1192
33.4k
  void set_batch_size(int64_t batch_size) { batch_size_ = batch_size; }
1193
  /// Return the batch size in rows.
1194
  ///
1195
  /// Note that some APIs such as ReadTable may ignore this setting.
1196
0
  int64_t batch_size() const { return batch_size_; }
1197
1198
  /// Enable read coalescing (default true).
1199
  ///
1200
  /// When enabled, the Arrow reader will pre-buffer necessary regions
1201
  /// of the file in-memory. This is intended to improve performance on
1202
  /// high-latency filesystems (e.g. Amazon S3).
1203
0
  void set_pre_buffer(bool pre_buffer) { pre_buffer_ = pre_buffer; }
1204
  /// Return whether read coalescing is enabled.
1205
69.6k
  bool pre_buffer() const { return pre_buffer_; }
1206
1207
  /// Set options for read coalescing. This can be used to tune the
1208
  /// implementation for characteristics of different filesystems.
1209
0
  void set_cache_options(::arrow::io::CacheOptions options) { cache_options_ = options; }
1210
  /// Return the options for read coalescing.
1211
69.6k
  const ::arrow::io::CacheOptions& cache_options() const { return cache_options_; }
1212
1213
  /// Set execution context for read coalescing.
1214
0
  void set_io_context(const ::arrow::io::IOContext& ctx) { io_context_ = ctx; }
1215
  /// Return the execution context used for read coalescing.
1216
69.6k
  const ::arrow::io::IOContext& io_context() const { return io_context_; }
1217
1218
  /// Set timestamp unit to use for deprecated INT96-encoded timestamps
1219
  /// (default is NANO).
1220
0
  void set_coerce_int96_timestamp_unit(::arrow::TimeUnit::type unit) {
1221
0
    coerce_int96_timestamp_unit_ = unit;
1222
0
  }
1223
1224
9.25k
  /// Return the timestamp unit used for deprecated INT96-encoded timestamps.
  ::arrow::TimeUnit::type coerce_int96_timestamp_unit() const {
1225
9.25k
    return coerce_int96_timestamp_unit_;
1226
9.25k
  }
1227
1228
  /// Enable Parquet-supported Arrow extension types.
1229
  ///
1230
  /// When enabled, Parquet logical types will be mapped to their corresponding Arrow
1231
  /// extension types at read time, if such exist. Currently only arrow::extension::json()
1232
  /// extension type is supported. Columns whose LogicalType is JSON will be interpreted
1233
  /// as arrow::extension::json(), with storage type inferred from the serialized Arrow
1234
  /// schema if present, or `utf8` by default.
1235
0
  void set_arrow_extensions_enabled(bool extensions_enabled) {
1236
0
    arrow_extensions_enabled_ = extensions_enabled;
1237
0
  }
1238
24.1k
  /// Return whether Parquet-supported Arrow extension types are enabled.
  bool get_arrow_extensions_enabled() const { return arrow_extensions_enabled_; }
1239
1240
  /// \brief Set whether to load statistics as much as possible.
1241
  ///
1242
  /// Default is false.
1243
0
  void set_should_load_statistics(bool should_load_statistics) {
1244
0
    should_load_statistics_ = should_load_statistics;
1245
0
  }
1246
  /// Return whether statistics are loaded as much as possible.
1247
153k
  bool should_load_statistics() const { return should_load_statistics_; }
1248
1249
  /// \brief Set whether to infer Decimal32/64 from Parquet decimal logical types.
1250
  ///
1251
  /// Default is false for compatibility, meaning that only Decimal128 and Decimal256
1252
  /// can be inferred.
1253
0
  void set_smallest_decimal_enabled(bool smallest_decimal_enable) {
1254
0
    smallest_decimal_enabled_ = smallest_decimal_enable;
1255
0
  }
1256
  /// \brief Whether to infer Decimal32/64 from Parquet decimal logical types.
1257
  ///
1258
  /// When enabled, Parquet decimal columns will be inferred as the smallest possible
1259
  /// Arrow Decimal type.
1260
  /// When disabled, Parquet decimal columns will be inferred as either Decimal128 or
1261
  /// Decimal256, but not Decimal32/64.
1262
  ///
1263
  /// Note: if an Arrow schema is found in the Parquet metadata, it will take priority and
1264
  /// this setting will be ignored.
1265
9.49k
  bool smallest_decimal_enabled() const { return smallest_decimal_enabled_; }
1266
1267
 private:
1268
  bool use_threads_;
1269
  std::unordered_set<int> read_dict_indices_;
1270
  int64_t batch_size_;
1271
  bool pre_buffer_;
1272
  ::arrow::io::IOContext io_context_;
1273
  ::arrow::io::CacheOptions cache_options_;
1274
  ::arrow::TimeUnit::type coerce_int96_timestamp_unit_;
1275
  ::arrow::Type::type binary_type_;
1276
  ::arrow::Type::type list_type_;
1277
  bool arrow_extensions_enabled_;
1278
  bool should_load_statistics_;
1279
  bool smallest_decimal_enabled_;
1280
};
1281
1282
/// EXPERIMENTAL: Constructs the default ArrowReaderProperties
1283
PARQUET_EXPORT
1284
ArrowReaderProperties default_arrow_reader_properties();
1285
1286
class PARQUET_EXPORT ArrowWriterProperties {
1287
 public:
1288
  enum EngineVersion {
1289
    V1,  // Supports only nested lists.
1290
    V2   // Full support for all nesting combinations
1291
  };
1292
  class Builder {
1293
   public:
1294
    Builder()
1295
0
        : write_timestamps_as_int96_(false),
1296
0
          coerce_timestamps_enabled_(false),
1297
0
          coerce_timestamps_unit_(::arrow::TimeUnit::SECOND),
1298
0
          truncated_timestamps_allowed_(false),
1299
0
          store_schema_(false),
1300
0
          compliant_nested_types_(true),
1301
0
          engine_version_(V2),
1302
0
          use_threads_(kArrowDefaultUseThreads),
1303
0
          executor_(NULLPTR),
1304
0
          write_time_adjusted_to_utc_(false) {}
1305
1306
    /// \brief Disable writing legacy int96 timestamps (default disabled).
1307
0
    Builder* disable_deprecated_int96_timestamps() {
1308
0
      write_timestamps_as_int96_ = false;
1309
0
      return this;
1310
0
    }
1311
1312
    /// \brief Enable writing legacy int96 timestamps (default disabled).
1313
    ///
1314
    /// May be turned on to write timestamps compatible with older Parquet writers.
1315
    /// This takes precedence over coerce_timestamps.
1316
0
    Builder* enable_deprecated_int96_timestamps() {
1317
0
      write_timestamps_as_int96_ = true;
1318
0
      return this;
1319
0
    }
1320
1321
    /// \brief Coerce all timestamps to the specified time unit.
1322
    /// \param unit time unit to truncate to.
1323
    /// For Parquet versions 1.0 and 2.4, nanoseconds are cast to microseconds.
1324
0
    Builder* coerce_timestamps(::arrow::TimeUnit::type unit) {
1325
0
      coerce_timestamps_enabled_ = true;
1326
0
      coerce_timestamps_unit_ = unit;
1327
0
      return this;
1328
0
    }
1329
1330
    /// \brief Allow loss of data when truncating timestamps.
1331
    ///
1332
    /// This is disallowed by default and an error will be returned.
1333
0
    Builder* allow_truncated_timestamps() {
1334
0
      truncated_timestamps_allowed_ = true;
1335
0
      return this;
1336
0
    }
1337
1338
    /// \brief Disallow loss of data when truncating timestamps (default).
1339
0
    Builder* disallow_truncated_timestamps() {
1340
0
      truncated_timestamps_allowed_ = false;
1341
0
      return this;
1342
0
    }
1343
1344
    /// \brief EXPERIMENTAL: Write binary serialized Arrow schema to the file,
1345
    /// to enable certain read options (like "read_dictionary") to be set
1346
    /// automatically
1347
0
    Builder* store_schema() {
1348
0
      store_schema_ = true;
1349
0
      return this;
1350
0
    }
1351
1352
    /// \brief When enabled, will not preserve Arrow field names for list types.
1353
    ///
1354
    /// Instead of using the field names Arrow uses for the values array of
1355
    /// list types (default "item"), will use "element", as is specified in
1356
    /// the Parquet spec.
1357
    ///
1358
    /// This is enabled by default.
1359
0
    Builder* enable_compliant_nested_types() {
1360
0
      compliant_nested_types_ = true;
1361
0
      return this;
1362
0
    }
1363
1364
    /// Preserve Arrow list field name.
1365
0
    Builder* disable_compliant_nested_types() {
1366
0
      compliant_nested_types_ = false;
1367
0
      return this;
1368
0
    }
1369
1370
    /// Set the version of the Parquet writer engine.
1371
0
    Builder* set_engine_version(EngineVersion version) {
1372
0
      engine_version_ = version;
1373
0
      return this;
1374
0
    }
1375
1376
    /// \brief Set whether to use multiple threads to write columns
1377
    /// in parallel in the buffered row group mode.
1378
    ///
1379
    /// WARNING: If writing multiple files in parallel in the same
1380
    /// executor, deadlock may occur if use_threads is true. Please
1381
    /// disable it in this case.
1382
    ///
1383
    /// Default is false.
1384
0
    Builder* set_use_threads(bool use_threads) {
1385
0
      use_threads_ = use_threads;
1386
0
      return this;
1387
0
    }
1388
1389
    /// \brief Set the executor to write columns in parallel in the
1390
    /// buffered row group mode.
1391
    ///
1392
    /// Default is nullptr and the default cpu executor will be used.
    ///
    /// \param executor executor to use; stored as a raw pointer.
    ///        NOTE(review): no ownership is taken here, so presumably the
    ///        caller must keep the executor alive while the resulting
    ///        properties are in use - confirm.
    /// \return this builder, to allow call chaining.
1393
0
    Builder* set_executor(::arrow::internal::Executor* executor) {
1394
0
      executor_ = executor;
1395
0
      return this;
1396
0
    }
1397
1398
    /// \brief Set the value of isAdjustedToUTC when writing a TIME column
1399
    ///
1400
    /// Default is false because Arrow TIME data is expressed in an unspecified timezone.
1401
    /// Note this setting doesn't affect TIMESTAMP data.
    ///
    /// \param adjusted value to record for isAdjustedToUTC.
    /// \return this builder, to allow call chaining.
1402
0
    Builder* set_time_adjusted_to_utc(bool adjusted) {
1403
0
      write_time_adjusted_to_utc_ = adjusted;
1404
0
      return this;
1405
0
    }
1406
1407
    /// \brief Create the final properties.
    ///
    /// Snapshots the builder's current field values into a new
    /// ArrowWriterProperties instance. The builder itself is not modified.
    ///
    /// \return a shared_ptr owning the newly created properties object.
1408
0
    std::shared_ptr<ArrowWriterProperties> build() {
1409
0
      // The ArrowWriterProperties constructor is private, so the object is
      // created with `new` from inside this nested class rather than through
      // std::make_shared. The argument order must match the constructor's
      // parameter order exactly (several adjacent parameters are plain bools).
      return std::shared_ptr<ArrowWriterProperties>(new ArrowWriterProperties(
1410
0
          write_timestamps_as_int96_, coerce_timestamps_enabled_, coerce_timestamps_unit_,
1411
0
          truncated_timestamps_allowed_, store_schema_, compliant_nested_types_,
1412
0
          engine_version_, use_threads_, executor_, write_time_adjusted_to_utc_));
1413
0
    }
1414
1415
   private:
1416
    // Builder state. build() forwards every field below, in declaration
    // order, to the ArrowWriterProperties constructor.

    // Forwarded as the constructor's `write_nanos_as_int96` argument.
    bool write_timestamps_as_int96_;
1417
1418
    // Timestamp coercion settings: enabled flag, target unit, and whether
    // truncation is allowed.
    bool coerce_timestamps_enabled_;
1419
    ::arrow::TimeUnit::type coerce_timestamps_unit_;
1420
    bool truncated_timestamps_allowed_;
1421
1422
    // Set to true by store_schema().
    bool store_schema_;
1423
    // Toggled by enable_/disable_compliant_nested_types().
    bool compliant_nested_types_;
1424
    // Set by set_engine_version().
    EngineVersion engine_version_;
1425
1426
    // Set by set_use_threads().
    bool use_threads_;
1427
    // Set by set_executor(); raw pointer.
    ::arrow::internal::Executor* executor_;
1428
1429
    // Set by set_time_adjusted_to_utc().
    bool write_time_adjusted_to_utc_;
1430
  };
1431
1432
0
  /// \brief Returns the write_timestamps_as_int96_ flag, i.e. (per the member
  /// name) whether timestamps are to be written using the deprecated INT96 type.
  bool support_deprecated_int96_timestamps() const { return write_timestamps_as_int96_; }
1433
1434
0
  /// \brief Returns whether timestamp coercion was enabled for these properties.
  bool coerce_timestamps_enabled() const { return coerce_timestamps_enabled_; }
1435
0
  /// \brief Returns the time unit configured for timestamp coercion.
  ///
  /// NOTE(review): presumably only meaningful when coerce_timestamps_enabled()
  /// returns true - confirm against the writer implementation.
  ::arrow::TimeUnit::type coerce_timestamps_unit() const {
1436
0
    return coerce_timestamps_unit_;
1437
0
  }
1438
1439
0
  /// \brief Returns the truncated_timestamps_allowed_ flag these properties
  /// were constructed with.
  bool truncated_timestamps_allowed() const { return truncated_timestamps_allowed_; }
1440
1441
0
  /// \brief Returns whether the serialized Arrow schema should be written to
  /// the file (see Builder::store_schema()).
  bool store_schema() const { return store_schema_; }
1442
1443
  /// \brief Whether nested type naming according to the parquet specification
  /// is enabled.
1444
  ///
1445
  /// Older versions of arrow wrote out field names for nested lists based on the name
1446
  /// of the field.  According to the parquet specification they should always be
1447
  /// "element".
1448
0
  bool compliant_nested_types() const { return compliant_nested_types_; }
1449
1450
  /// \brief The underlying engine version to use when writing Arrow data.
1451
  ///
1452
  /// V2 is currently the latest. V1 is considered deprecated but left in
1453
  /// place in case there are bugs detected in V2.
1454
0
  EngineVersion engine_version() const { return engine_version_; }
1455
1456
  /// \brief Returns whether the writer will use multiple threads
1457
  /// to write columns in parallel in the buffered row group mode.
  ///
  /// See Builder::set_use_threads() for the deadlock caveat that applies
  /// when this is true.
1458
0
  bool use_threads() const { return use_threads_; }
1459
1460
  /// \brief Returns the executor used to write columns in parallel.
  ///
  /// Declared here only; the definition lives outside this header.
  /// NOTE(review): per the Builder::set_executor() docs the default CPU
  /// executor is used when no executor was set, so this presumably never
  /// returns nullptr - confirm against the definition.
1461
  ::arrow::internal::Executor* executor() const;
1462
1463
  /// \brief The value of isAdjustedToUTC when writing a TIME column
1464
  ///
1465
  /// Note this setting doesn't affect TIMESTAMP data.
1466
0
  bool write_time_adjusted_to_utc() const { return write_time_adjusted_to_utc_; }
1467
1468
 private:
1469
  /// \brief Private constructor; instances are created through Builder::build().
  ///
  /// Each parameter initializes the member of the matching name. The one
  /// exception is `write_nanos_as_int96`, which initializes
  /// write_timestamps_as_int96_.
  explicit ArrowWriterProperties(bool write_nanos_as_int96,
1470
                                 bool coerce_timestamps_enabled,
1471
                                 ::arrow::TimeUnit::type coerce_timestamps_unit,
1472
                                 bool truncated_timestamps_allowed, bool store_schema,
1473
                                 bool compliant_nested_types,
1474
                                 EngineVersion engine_version, bool use_threads,
1475
                                 ::arrow::internal::Executor* executor,
1476
                                 bool write_time_adjusted_to_utc)
1477
0
      : write_timestamps_as_int96_(write_nanos_as_int96),
1478
0
        coerce_timestamps_enabled_(coerce_timestamps_enabled),
1479
0
        coerce_timestamps_unit_(coerce_timestamps_unit),
1480
0
        truncated_timestamps_allowed_(truncated_timestamps_allowed),
1481
0
        store_schema_(store_schema),
1482
0
        compliant_nested_types_(compliant_nested_types),
1483
0
        engine_version_(engine_version),
1484
0
        use_threads_(use_threads),
1485
0
        executor_(executor),
1486
0
        write_time_adjusted_to_utc_(write_time_adjusted_to_utc) {}
1487
1488
  // Settings captured at construction time. Every member is const except
  // executor_, which is a plain (non-const) raw pointer.
  const bool write_timestamps_as_int96_;
1489
  const bool coerce_timestamps_enabled_;
1490
  const ::arrow::TimeUnit::type coerce_timestamps_unit_;
1491
  const bool truncated_timestamps_allowed_;
1492
  const bool store_schema_;
1493
  const bool compliant_nested_types_;
1494
  const EngineVersion engine_version_;
1495
  const bool use_threads_;
1496
  // Raw pointer as passed to the constructor; may be nullptr (the documented
  // Builder default).
  ::arrow::internal::Executor* executor_;
1497
  const bool write_time_adjusted_to_utc_;
1498
};
1499
1500
/// \brief State object used for writing Arrow data directly to a Parquet
1501
/// column chunk. API possibly not stable
///
/// Bundles the memory pool, the writer properties, and two resizable scratch
/// buffers created from that pool.
1502
struct ArrowWriteContext {
1503
  /// \brief Store the pool and properties pointers and create both scratch
  /// buffers via AllocateBuffer(memory_pool).
  ///
  /// \param memory_pool pool handed to AllocateBuffer for both buffers.
  /// \param properties writer properties; taken as a non-const pointer but
  ///        stored as pointer-to-const. Stored as a raw pointer.
  ///        NOTE(review): presumably must outlive this context - confirm.
  ArrowWriteContext(MemoryPool* memory_pool, ArrowWriterProperties* properties)
1504
      : memory_pool(memory_pool),
1505
        properties(properties),
1506
        data_buffer(AllocateBuffer(memory_pool)),
1507
0
        def_levels_buffer(AllocateBuffer(memory_pool)) {}
1508
1509
  /// \brief Resize data_buffer to hold `num_values` elements of type T and
  /// return a typed pointer to its start.
  ///
  /// \param num_values number of T elements the scratch area must hold.
  /// \param[out] out receives data_buffer's mutable data pointer cast to T*.
  ///        It is only written when the resize succeeds.
  /// \return the error from Resize (via ARROW_RETURN_NOT_OK), otherwise OK.
  ///
  /// NOTE(review): the pointer aliases data_buffer, so a later call that
  /// resizes the buffer presumably invalidates it - confirm.
  template <typename T>
1510
  ::arrow::Status GetScratchData(const int64_t num_values, T** out) {
1511
    // NOTE(review): the `false` argument is presumably shrink_to_fit - see
    // ResizableBuffer::Resize.
    ARROW_RETURN_NOT_OK(this->data_buffer->Resize(num_values * sizeof(T), false));
1512
    *out = reinterpret_cast<T*>(this->data_buffer->mutable_data());
1513
    return ::arrow::Status::OK();
1514
  }
1515
1516
  // Pool passed to the constructor; used there to create the buffers below.
  MemoryPool* memory_pool;
1517
  // Writer properties passed to the constructor; stored as a raw pointer.
  const ArrowWriterProperties* properties;
1518
1519
  // Buffer used for storing the data of an array converted to the physical type
1520
  // as expected by parquet-cpp.
1521
  std::shared_ptr<ResizableBuffer> data_buffer;
1522
1523
  // We use the shared ownership of this buffer
1524
  std::shared_ptr<ResizableBuffer> def_levels_buffer;
1525
};
1526
1527
/// \brief Returns a shared pointer to an ArrowWriterProperties instance.
///
/// Declared here only; the definition lives outside this header.
/// NOTE(review): by its name this holds the default settings; whether every
/// call returns the same shared instance is not visible here - confirm.
PARQUET_EXPORT
1528
std::shared_ptr<ArrowWriterProperties> default_arrow_writer_properties();
1529
1530
}  // namespace parquet