Coverage Report

Created: 2025-10-26 07:13

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/rocksdb/db/column_family.h
Line
Count
Source
1
//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
2
//  This source code is licensed under both the GPLv2 (found in the
3
//  COPYING file in the root directory) and Apache 2.0 License
4
//  (found in the LICENSE.Apache file in the root directory).
5
//
6
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7
// Use of this source code is governed by a BSD-style license that can be
8
// found in the LICENSE file. See the AUTHORS file for names of contributors.
9
10
#pragma once
11
12
#include <atomic>
13
#include <string>
14
#include <unordered_map>
15
#include <vector>
16
17
#include "cache/cache_reservation_manager.h"
18
#include "db/memtable_list.h"
19
#include "db/snapshot_checker.h"
20
#include "db/table_cache.h"
21
#include "db/table_properties_collector.h"
22
#include "db/write_batch_internal.h"
23
#include "db/write_controller.h"
24
#include "options/cf_options.h"
25
#include "rocksdb/compaction_job_stats.h"
26
#include "rocksdb/db.h"
27
#include "rocksdb/env.h"
28
#include "rocksdb/options.h"
29
#include "trace_replay/block_cache_tracer.h"
30
#include "util/cast_util.h"
31
#include "util/hash_containers.h"
32
#include "util/thread_local.h"
33
34
namespace ROCKSDB_NAMESPACE {
35
36
class Version;
37
class VersionSet;
38
class VersionStorageInfo;
39
class MemTable;
40
class MemTableListVersion;
41
class CompactionPicker;
42
class Compaction;
43
class InternalKey;
44
class InternalStats;
45
class ColumnFamilyData;
46
class DBImpl;
47
class LogBuffer;
48
class InstrumentedMutex;
49
class InstrumentedMutexLock;
50
struct SuperVersionContext;
51
class BlobFileCache;
52
class BlobSource;
53
54
extern const double kIncSlowdownRatio;
55
// This file contains a list of data structures for managing column family
56
// level metadata.
57
//
58
// The basic relationships among classes declared here are illustrated as
59
// following:
60
//
61
//       +----------------------+    +----------------------+   +--------+
62
//   +---+ ColumnFamilyHandle 1 | +--+ ColumnFamilyHandle 2 |   | DBImpl |
63
//   |   +----------------------+ |  +----------------------+   +----+---+
64
//   | +--------------------------+                                  |
65
//   | |                               +-----------------------------+
66
//   | |                               |
67
//   | | +-----------------------------v-------------------------------+
68
//   | | |                                                             |
69
//   | | |                      ColumnFamilySet                        |
70
//   | | |                                                             |
71
//   | | +-------------+--------------------------+----------------+---+
72
//   | |               |                          |                |
73
//   | +-------------------------------------+    |                |
74
//   |                 |                     |    |                v
75
//   |   +-------------v-------------+ +-----v----v---------+
76
//   |   |                           | |                    |
77
//   |   |     ColumnFamilyData 1    | | ColumnFamilyData 2 |    ......
78
//   |   |                           | |                    |
79
//   +--->                           | |                    |
80
//       |                 +---------+ |                    |
81
//       |                 | MemTable| |                    |
82
//       |                 |  List   | |                    |
83
//       +--------+---+--+-+----+----+ +--------------------++
84
//                |   |  |      |
85
//                |   |  |      |
86
//                |   |  |      +-----------------------+
87
//                |   |  +-----------+                  |
88
//                v   +--------+     |                  |
89
//       +--------+--------+   |     |                  |
90
//       |                 |   |     |       +----------v----------+
91
// +---> |SuperVersion 1.a +----------------->                     |
92
//       |                 +------+  |       | MemTableListVersion |
93
//       +---+-------------+   |  |  |       |                     |
94
//           |                 |  |  |       +----+------------+---+
95
//           |      current    |  |  |            |            |
96
//           |   +-------------+  |  |mem         |            |
97
//           |   |                |  |            |            |
98
//         +-v---v-------+    +---v--v---+  +-----v----+  +----v-----+
99
//         |             |    |          |  |          |  |          |
100
//         | Version 1.a |    | memtable |  | memtable |  | memtable |
101
//         |             |    |   1.a    |  |   1.b    |  |   1.c    |
102
//         +-------------+    |          |  |          |  |          |
103
//                            +----------+  +----------+  +----------+
104
//
105
// DBImpl keeps a ColumnFamilySet, which references all column families by
106
// pointing to respective ColumnFamilyData object of each column family.
107
// This is how DBImpl can list and operate on all the column families.
108
// ColumnFamilyHandle also points to ColumnFamilyData directly, so that
109
// when a user executes a query, it can directly find memtables and Version
110
// as well as SuperVersion to the column family, without going through
111
// ColumnFamilySet.
112
//
113
// ColumnFamilySet points to the latest view of the LSM-tree (list of memtables
114
// and SST files) indirectly, while ongoing operations may hold references
115
// to a current or an out-of-date SuperVersion, which in turn points to a
116
// point-in-time view of the LSM-tree. This guarantees the memtables and SST
117
// files being operated on will not go away, until the SuperVersion is
118
// unreferenced to 0 and destroyed.
119
//
120
// The following graph illustrates possible referencing relationships:
121
//
122
// Column       +--------------+      current       +-----------+
123
// Family +---->+              +------------------->+           |
124
//  Data        | SuperVersion +----------+         | Version A |
125
//              |      3       |   imm    |         |           |
126
// Iter2 +----->+              |  +-------v------+  +-----------+
127
//              +-----+--------+  | MemtableList +----------------> Empty
128
//                    |           |   Version r  |  +-----------+
129
//                    |           +--------------+  |           |
130
//                    +------------------+   current| Version B |
131
//              +--------------+         |   +----->+           |
132
//              |              |         |   |      +-----+-----+
133
// Compaction +>+ SuperVersion +-------------+            ^
134
//    Job       |      2       +------+  |                |current
135
//              |              +----+ |  |     mem        |    +------------+
136
//              +--------------+    | |  +--------------------->            |
137
//                                  | +------------------------> MemTable a |
138
//                                  |          mem        |    |            |
139
//              +--------------+    |                     |    +------------+
140
//              |              +--------------------------+
141
//  Iter1 +-----> SuperVersion |    |                          +------------+
142
//              |      1       +------------------------------>+            |
143
//              |              +-+  |        mem               | MemTable b |
144
//              +--------------+ |  |                          |            |
145
//                               |  |    +--------------+      +-----^------+
146
//                               |  |imm | MemtableList |            |
147
//                               |  +--->+   Version s  +------------+
148
//                               |       +--------------+
149
//                               |       +--------------+
150
//                               |       | MemtableList |
151
//                               +------>+   Version t  +-------->  Empty
152
//                                 imm   +--------------+
153
//
154
// In this example, even if the current LSM-tree consists of Version A and
155
// memtable a, which is also referenced by SuperVersion, two older SuperVersions,
156
// SuperVersion2 and SuperVersion1 still exist, and are referenced by a
157
// compaction job and an old iterator Iter1, respectively. SuperVersion2
158
// contains Version B, memtable a and memtable b; SuperVersion1 contains
159
// Version B and memtable b (mutable). As a result, Version B and memtable b
160
// are prevented from being destroyed or deleted.
161
162
// ColumnFamilyHandleImpl is the class that clients use to access different
163
// column families. It has a non-trivial destructor, which gets called when the client
164
// is done using the column family
165
class ColumnFamilyHandleImpl : public ColumnFamilyHandle {
166
 public:
167
  // create while holding the mutex
168
  ColumnFamilyHandleImpl(ColumnFamilyData* cfd, DBImpl* db,
169
                         InstrumentedMutex* mutex);
170
  // destroy without mutex
171
  virtual ~ColumnFamilyHandleImpl();
172
4.28M
  virtual ColumnFamilyData* cfd() const { return cfd_; }
173
3.84k
  virtual DBImpl* db() const { return db_; }
174
175
  uint32_t GetID() const override;
176
  const std::string& GetName() const override;
177
  Status GetDescriptor(ColumnFamilyDescriptor* desc) override;
178
  const Comparator* GetComparator() const override;
179
180
 private:
181
  ColumnFamilyData* cfd_;
182
  DBImpl* db_;
183
  InstrumentedMutex* mutex_;
184
};
185
186
// Does not ref-count ColumnFamilyData
187
// We use this dummy ColumnFamilyHandleImpl because sometimes MemTableInserter
188
// calls DBImpl methods. When this happens, MemTableInserter needs access to
189
// ColumnFamilyHandle (same as the client would need). In that case, we feed
190
// MemTableInserter a dummy ColumnFamilyHandle and enable it to call DBImpl
191
// methods
192
class ColumnFamilyHandleInternal : public ColumnFamilyHandleImpl {
193
 public:
194
  ColumnFamilyHandleInternal()
195
48.4k
      : ColumnFamilyHandleImpl(nullptr, nullptr, nullptr),
196
48.4k
        internal_cfd_(nullptr) {}
197
198
2.02M
  void SetCFD(ColumnFamilyData* _cfd) { internal_cfd_ = _cfd; }
199
962k
  ColumnFamilyData* cfd() const override { return internal_cfd_; }
200
201
 private:
202
  ColumnFamilyData* internal_cfd_;
203
};
204
205
// holds references to memtable, all immutable memtables and version
206
struct SuperVersion {
207
  // Accessing members of this class is not thread-safe and requires external
208
  // synchronization (i.e. db mutex held or on the write thread).
209
  ColumnFamilyData* cfd;
210
  ReadOnlyMemTable* mem;
211
  MemTableListVersion* imm;
212
  Version* current;
213
  // TODO: do we really need this in addition to what's in current Version?
214
  MutableCFOptions mutable_cf_options;
215
  // Version number of the current SuperVersion
216
  uint64_t version_number;
217
  WriteStallCondition write_stall_condition;
218
  // Each time `full_history_ts_low` collapses history, a new SuperVersion is
219
  // installed. This field tracks the effective `full_history_ts_low` for that
220
  // SuperVersion, to be used by read APIs for sanity checks. This field is
221
  // immutable once SuperVersion is installed. For column family that doesn't
222
  // enable UDT feature, this is an empty string.
223
  std::string full_history_ts_low;
224
225
  // An immutable snapshot of the DB's seqno to time mapping, usually shared
226
  // between SuperVersions.
227
  std::shared_ptr<const SeqnoToTimeMapping> seqno_to_time_mapping{nullptr};
228
229
  // should be called outside the mutex
230
94.4k
  SuperVersion() = default;
231
  ~SuperVersion();
232
  SuperVersion* Ref();
233
  // If Unref() returns true, Cleanup() should be called with mutex held
234
  // before deleting this SuperVersion.
235
  bool Unref();
236
237
  // call these two methods with db mutex held
238
  // Cleanup unrefs mem, imm and current. Also, it stores all memtables
239
  // that need to be deleted in the to_delete vector. Unrefing those
240
  // objects needs to be done in the mutex
241
  void Cleanup();
242
  void Init(
243
      ColumnFamilyData* new_cfd, MemTable* new_mem,
244
      MemTableListVersion* new_imm, Version* new_current,
245
      std::shared_ptr<const SeqnoToTimeMapping> new_seqno_to_time_mapping);
246
247
  // Share the ownership of the seqno to time mapping object referred to in this
248
  // SuperVersion. To be used by the new SuperVersion to be installed after this
249
  // one if seqno to time mapping does not change in between these two
250
  // SuperVersions. Or to share the ownership of the mapping with a FlushJob.
251
10.5k
  std::shared_ptr<const SeqnoToTimeMapping> ShareSeqnoToTimeMapping() {
252
10.5k
    return seqno_to_time_mapping;
253
10.5k
  }
254
255
  // Access the seqno to time mapping object in this SuperVersion.
256
32.5k
  UnownedPtr<const SeqnoToTimeMapping> GetSeqnoToTimeMapping() const {
257
32.5k
    return seqno_to_time_mapping.get();
258
32.5k
  }
259
260
  // The value of dummy is not actually used. kSVInUse takes its address as a
261
  // mark in the thread local storage to indicate the SuperVersion is in use
262
  // by thread. This way, the value of kSVInUse is guaranteed to have no
263
  // conflict with SuperVersion object address and portable on different
264
  // platforms.
265
  static int dummy;
266
  static void* const kSVInUse;
267
  static void* const kSVObsolete;
268
269
 private:
270
  std::atomic<uint32_t> refs;
271
  // We need to_delete because during Cleanup(), imm->Unref() returns
272
  // all memtables that we need to free through this vector. We then
273
  // delete all those memtables outside of mutex, during destruction
274
  autovector<ReadOnlyMemTable*> to_delete;
275
};
276
277
Status CheckCompressionSupported(const ColumnFamilyOptions& cf_options);
278
279
Status CheckConcurrentWritesSupported(const ColumnFamilyOptions& cf_options);
280
281
Status CheckCFPathsSupported(const DBOptions& db_options,
282
                             const ColumnFamilyOptions& cf_options);
283
284
ColumnFamilyOptions SanitizeCfOptions(const ImmutableDBOptions& db_options,
285
                                      bool read_only,
286
                                      const ColumnFamilyOptions& src);
287
// Wrap user defined table properties collector factories from `ioptions`
288
// into internal ones in internal_tbl_prop_coll_factories. Add a system internal
289
// one too.
290
void GetInternalTblPropCollFactory(
291
    const ImmutableCFOptions& ioptions,
292
    InternalTblPropCollFactories* internal_tbl_prop_coll_factories);
293
294
class ColumnFamilySet;
295
296
// This class keeps all the data that a column family needs.
297
// Most methods require DB mutex held, unless otherwise noted
298
class ColumnFamilyData {
299
 public:
300
  ~ColumnFamilyData();
301
302
  // thread-safe
303
2.81M
  uint32_t GetID() const { return id_; }
304
  // thread-safe
305
477k
  const std::string& GetName() const { return name_; }
306
307
  // Ref() can only be called from a context where the caller can guarantee
308
  // that ColumnFamilyData is alive (while holding a non-zero ref already,
309
  // holding a DB mutex, or as the leader in a write batch group).
310
381k
  void Ref() { refs_.fetch_add(1); }
311
312
  // UnrefAndTryDelete() decreases the reference count and frees this object if needed,
313
  // return true if this is freed else false, UnrefAndTryDelete() can only
314
  // be called while holding a DB mutex, or during single-threaded recovery.
315
  bool UnrefAndTryDelete();
316
317
  // SetDropped() can only be called under following conditions:
318
  // 1) Holding a DB mutex,
319
  // 2) from single-threaded write thread, AND
320
  // 3) from single-threaded VersionSet::LogAndApply()
321
  // After dropping column family no other operation on that column family
322
  // will be executed. All the files and memory will be, however, kept around
323
  // until client drops the column family handle. That way, client can still
324
  // access data from dropped column family.
325
  // Column family can be dropped and still alive. In that state:
326
  // *) Compaction and flush are not executed on the dropped column family.
327
  // *) Client can continue reading from column family. Writes will fail unless
328
  // WriteOptions::ignore_missing_column_families is true
329
  // When the dropped column family is unreferenced, then we:
330
  // *) Remove column family from the linked list maintained by ColumnFamilySet
331
  // *) delete all memory associated with that column family
332
  // *) delete all the files associated with that column family
333
  void SetDropped();
334
968k
  bool IsDropped() const { return dropped_.load(std::memory_order_relaxed); }
335
336
  void SetFlushSkipReschedule();
337
338
  bool GetAndClearFlushSkipReschedule();
339
340
  // thread-safe
341
1.28M
  int NumberLevels() const { return ioptions_.num_levels; }
342
343
275k
  void SetLogNumber(uint64_t log_number) { log_number_ = log_number; }
344
1.45M
  uint64_t GetLogNumber() const { return log_number_; }
345
346
  // thread-safe
347
  const FileOptions* soptions() const;
348
8.31M
  const ImmutableOptions& ioptions() const { return ioptions_; }
349
  // REQUIRES: DB mutex held
350
  // This returns the MutableCFOptions used by current SuperVersion
351
  // You should use this API to reference MutableCFOptions most of the time.
352
386
  const MutableCFOptions& GetCurrentMutableCFOptions() const {
353
386
    return super_version_->mutable_cf_options;
354
386
  }
355
  // REQUIRES: DB mutex held
356
  // This returns the latest MutableCFOptions, which may not be in effect yet.
357
1.29M
  const MutableCFOptions& GetLatestMutableCFOptions() const {
358
1.29M
    return mutable_cf_options_;
359
1.29M
  }
360
361
  // REQUIRES: DB mutex held
362
  // Build ColumnFamilyOptions with immutable options and latest mutable
363
  // options.
364
  ColumnFamilyOptions GetLatestCFOptions() const;
365
366
962k
  bool is_delete_range_supported() { return is_delete_range_supported_; }
367
368
  // Validate CF options against DB options
369
  static Status ValidateOptions(const DBOptions& db_options,
370
                                const ColumnFamilyOptions& cf_options);
371
  // REQUIRES: DB mutex held
372
  Status SetOptions(
373
      const DBOptions& db_options,
374
      const std::unordered_map<std::string, std::string>& options_map);
375
376
282k
  InternalStats* internal_stats() { return internal_stats_.get(); }
377
378
117k
  MemTableList* imm() { return &imm_; }
379
3.97M
  MemTable* mem() { return mem_; }
380
381
1.74k
  bool IsEmpty() {
382
1.74k
    return mem()->GetFirstSequenceNumber() == 0 && imm()->NumNotFlushed() == 0;
383
1.74k
  }
384
385
755k
  Version* dummy_versions() { return dummy_versions_; }
386
2.30M
  Version* current() { return current_; }  // REQUIRE: DB mutex held
387
  void SetCurrent(Version* _current);      // REQUIRE: DB mutex held
388
  uint64_t GetNumLiveVersions() const;     // REQUIRE: DB mutex held
389
  uint64_t GetTotalSstFilesSize() const;   // REQUIRE: DB mutex held
390
  uint64_t GetLiveSstFilesSize() const;    // REQUIRE: DB mutex held
391
  uint64_t GetTotalBlobFileSize() const;   // REQUIRE: DB mutex held
392
  // REQUIRE: DB mutex held
393
114k
  void SetMemtable(MemTable* new_mem) {
394
114k
    AssignMemtableID(new_mem);
395
114k
    mem_ = new_mem;
396
114k
    if (ioptions_.disallow_memtable_writes) {
397
0
      mem_->MarkImmutable();
398
0
    }
399
114k
  }
400
401
114k
  void AssignMemtableID(ReadOnlyMemTable* new_imm) {
402
114k
    new_imm->SetID(++last_memtable_id_);
403
114k
  }
404
405
  // calculate the oldest log needed for the durability of this column family
406
  uint64_t OldestLogToKeep();
407
408
  // See Memtable constructor for explanation of earliest_seq param.
409
  // `mutable_cf_options` might need to be a saved copy if calling this without
410
  // holding the DB mutex.
411
  MemTable* ConstructNewMemtable(const MutableCFOptions& mutable_cf_options,
412
                                 SequenceNumber earliest_seq);
413
  void CreateNewMemtable(SequenceNumber earliest_seq);
414
415
559k
  TableCache* table_cache() const { return table_cache_.get(); }
416
0
  BlobFileCache* blob_file_cache() const { return blob_file_cache_.get(); }
417
233k
  BlobSource* blob_source() const { return blob_source_.get(); }
418
419
  // See documentation in compaction_picker.h
420
  // REQUIRES: DB mutex held
421
  bool NeedsCompaction() const;
422
  // REQUIRES: DB mutex held
423
  Compaction* PickCompaction(
424
      const MutableCFOptions& mutable_options,
425
      const MutableDBOptions& mutable_db_options,
426
      const std::vector<SequenceNumber>& existing_snapshots,
427
      const SnapshotChecker* snapshot_checker, LogBuffer* log_buffer,
428
      bool require_max_output_level = false);
429
430
  // Check if the passed range overlaps with any running compactions.
431
  // REQUIRES: DB mutex held
432
  bool RangeOverlapWithCompaction(const Slice& smallest_user_key,
433
                                  const Slice& largest_user_key,
434
                                  int level) const;
435
436
  // Check if the passed ranges overlap with any unflushed memtables
437
  // (immutable or mutable).
438
  //
439
  // @param super_version A referenced SuperVersion that will be held for the
440
  //    duration of this function.
441
  //
442
  // Thread-safe
443
  Status RangesOverlapWithMemtables(const autovector<UserKeyRange>& ranges,
444
                                    SuperVersion* super_version,
445
                                    bool allow_data_in_errors, bool* overlap);
446
447
  // A flag to tell that a manual compaction is to compact all levels together
448
  // instead of a specific level.
449
  static const int kCompactAllLevels;
450
  // A flag to tell that a manual compaction's output is the base level.
451
  static const int kCompactToBaseLevel;
452
  // REQUIRES: DB mutex held
453
  Compaction* CompactRange(const MutableCFOptions& mutable_cf_options,
454
                           const MutableDBOptions& mutable_db_options,
455
                           int input_level, int output_level,
456
                           const CompactRangeOptions& compact_range_options,
457
                           const InternalKey* begin, const InternalKey* end,
458
                           InternalKey** compaction_end, bool* manual_conflict,
459
                           uint64_t max_file_num_to_ignore,
460
                           const std::string& trim_ts);
461
462
6.65k
  CompactionPicker* compaction_picker() { return compaction_picker_.get(); }
463
  // thread-safe
464
4.29M
  const Comparator* user_comparator() const {
465
4.29M
    return internal_comparator_.user_comparator();
466
4.29M
  }
467
  // thread-safe
468
676k
  const InternalKeyComparator& internal_comparator() const {
469
676k
    return internal_comparator_;
470
676k
  }
471
472
20.6k
  const InternalTblPropCollFactories* internal_tbl_prop_coll_factories() const {
473
20.6k
    return &internal_tbl_prop_coll_factories_;
474
20.6k
  }
475
476
195k
  SuperVersion* GetSuperVersion() { return super_version_; }
477
  // thread-safe
478
  // Return an already referenced SuperVersion to be used safely.
479
  SuperVersion* GetReferencedSuperVersion(DBImpl* db);
480
  // thread-safe
481
  // Get SuperVersion stored in thread local storage. If it does not exist,
482
  // get a reference from a current SuperVersion.
483
  SuperVersion* GetThreadLocalSuperVersion(DBImpl* db);
484
  // Try to return SuperVersion back to thread local storage. Return true on
485
  // success and false on failure. It fails when the thread local storage
486
  // contains anything other than SuperVersion::kSVInUse flag.
487
  bool ReturnThreadLocalSuperVersion(SuperVersion* sv);
488
  // thread-safe
489
0
  uint64_t GetSuperVersionNumber() const {
490
0
    return super_version_number_.load();
491
0
  }
492
0
  uint64_t GetSuperVersionNumberRelaxed() const {
493
0
    return super_version_number_.load(std::memory_order_relaxed);
494
0
  }
495
  // Only intended for use by DBImpl::InstallSuperVersion() and variants
496
  void InstallSuperVersion(SuperVersionContext* sv_context,
497
                           InstrumentedMutex* db_mutex,
498
                           std::optional<std::shared_ptr<SeqnoToTimeMapping>>
499
                               new_seqno_to_time_mapping = {});
500
501
  void ResetThreadLocalSuperVersions();
502
503
  // Protected by DB mutex
504
3.49k
  void set_queued_for_flush(bool value) { queued_for_flush_ = value; }
505
9.57k
  void set_queued_for_compaction(bool value) { queued_for_compaction_ = value; }
506
3.49k
  bool queued_for_flush() { return queued_for_flush_; }
507
97.0k
  bool queued_for_compaction() { return queued_for_compaction_; }
508
509
  static std::pair<WriteStallCondition, WriteStallCause>
510
  GetWriteStallConditionAndCause(
511
      int num_unflushed_memtables, int num_l0_files,
512
      uint64_t num_compaction_needed_bytes,
513
      const MutableCFOptions& mutable_cf_options,
514
      const ImmutableCFOptions& immutable_cf_options);
515
516
  // Recalculate some stall conditions, which are changed only during
517
  // compaction, adding new memtable and/or recalculation of compaction score.
518
  WriteStallCondition RecalculateWriteStallConditions(
519
      const MutableCFOptions& mutable_cf_options);
520
521
95.3k
  void set_initialized() { initialized_.store(true); }
522
523
289k
  bool initialized() const { return initialized_.load(); }
524
525
130k
  const ColumnFamilyOptions& initial_cf_options() {
526
130k
    return initial_cf_options_;
527
130k
  }
528
529
  // created_dirs remembers the directories created, so that we don't need to call
530
  // the same data creation operation again.
531
  Status AddDirectories(
532
      std::map<std::string, std::shared_ptr<FSDirectory>>* created_dirs);
533
534
  FSDirectory* GetDataDir(size_t path_id) const;
535
536
  // full_history_ts_low_ can only increase.
537
0
  void SetFullHistoryTsLow(std::string ts_low) {
538
0
    assert(!ts_low.empty());
539
0
    const Comparator* ucmp = user_comparator();
540
0
    assert(ucmp);
541
    // Guard against resurrected full_history_ts_low persisted in MANIFEST
542
    // from previous DB sessions. This could happen if UDT was enabled and then
543
    // disabled.
544
0
    if (ucmp->timestamp_size() == 0) {
545
0
      return;
546
0
    }
547
0
    if (full_history_ts_low_.empty() ||
548
0
        ucmp->CompareTimestamp(ts_low, full_history_ts_low_) > 0) {
549
0
      full_history_ts_low_ = std::move(ts_low);
550
0
    }
551
0
  }
552
553
163k
  const std::string& GetFullHistoryTsLow() const {
554
163k
    const Comparator* ucmp = user_comparator();
555
163k
    assert(ucmp);
556
163k
    if (ucmp->timestamp_size() == 0) {
557
163k
      assert(full_history_ts_low_.empty());
558
163k
    }
559
163k
    return full_history_ts_low_;
560
163k
  }
561
562
  // REQUIRES: DB mutex held.
563
  // Return true if flushing up to MemTables with ID `max_memtable_id`
564
  // should be postponed to retain user-defined timestamps according to the
565
  // user's setting. Called by background flush job.
566
  bool ShouldPostponeFlushToRetainUDT(uint64_t max_memtable_id);
567
568
0
  ThreadLocalPtr* TEST_GetLocalSV() { return local_sv_.get(); }
569
0
  WriteBufferManager* write_buffer_mgr() { return write_buffer_manager_; }
570
  std::shared_ptr<CacheReservationManager>
571
253k
  GetFileMetadataCacheReservationManager() {
572
253k
    return file_metadata_cache_res_mgr_;
573
253k
  }
574
575
  static const uint32_t kDummyColumnFamilyDataId;
576
577
  // Keep track of whether the mempurge feature was ever used.
578
0
  void SetMempurgeUsed() { mempurge_used_ = true; }
579
1.74k
  bool GetMempurgeUsed() { return mempurge_used_; }
580
581
  // Allocate and return a new epoch number
582
18.7k
  uint64_t NewEpochNumber() { return next_epoch_number_.fetch_add(1); }
583
584
  // Get the next epoch number to be assigned
585
65.4k
  uint64_t GetNextEpochNumber() const { return next_epoch_number_.load(); }
586
587
  // Set the next epoch number to be assigned
588
65.4k
  void SetNextEpochNumber(uint64_t next_epoch_number) {
589
65.4k
    next_epoch_number_.store(next_epoch_number);
590
65.4k
  }
591
592
  // Reset the next epoch number to be assigned
593
65.4k
  void ResetNextEpochNumber() { next_epoch_number_.store(1); }
594
595
  // Recover the next epoch number of this CF and epoch number
596
  // of its files (if missing)
597
  void RecoverEpochNumbers();
598
599
40
  int GetUnflushedMemTableCountForWriteStallCheck() const {
600
40
    return (mem_->IsEmpty() ? 0 : 1) + imm_.NumNotFlushed();
601
40
  }
602
603
  // thread-safe, DB mutex not needed.
604
275k
  bool AllowIngestBehind() const {
605
275k
    return ioptions_.cf_allow_ingest_behind || ioptions_.allow_ingest_behind;
606
275k
  }
607
608
 private:
609
  friend class ColumnFamilySet;
610
  ColumnFamilyData(
611
      uint32_t id, const std::string& name, Version* dummy_versions,
612
      Cache* table_cache, WriteBufferManager* write_buffer_manager,
613
      const ColumnFamilyOptions& options, const ImmutableDBOptions& db_options,
614
      const FileOptions* file_options, ColumnFamilySet* column_family_set,
615
      BlockCacheTracer* const block_cache_tracer,
616
      const std::shared_ptr<IOTracer>& io_tracer, const std::string& db_id,
617
      const std::string& db_session_id, bool read_only);
618
619
  std::vector<std::string> GetDbPaths() const;
620
621
  uint32_t id_;
622
  const std::string name_;
623
  Version* dummy_versions_;  // Head of circular doubly-linked list of versions.
624
  Version* current_;         // == dummy_versions->prev_
625
626
  std::atomic<int> refs_;  // outstanding references to ColumnFamilyData
627
  std::atomic<bool> initialized_;
628
  std::atomic<bool> dropped_;  // true if client dropped it
629
630
  // When user-defined timestamps in memtable only feature is enabled, this
631
  // flag indicates a successfully requested flush that should
632
  // skip being rescheduled and hasn't undergone the rescheduling check yet.
633
  // This flag is cleared when a check skips rescheduling a FlushRequest.
634
  // With this flag, automatic flushes in regular cases can continue to
635
  // retain UDTs by getting rescheduled as usual while manual flushes and
636
  // error recovery flushes will proceed without getting rescheduled.
637
  std::atomic<bool> flush_skip_reschedule_;
638
639
  const InternalKeyComparator internal_comparator_;
640
  InternalTblPropCollFactories internal_tbl_prop_coll_factories_;
641
642
  const ColumnFamilyOptions initial_cf_options_;
643
  const ImmutableOptions ioptions_;
644
  MutableCFOptions mutable_cf_options_;
645
646
  const bool is_delete_range_supported_;
647
648
  std::unique_ptr<TableCache> table_cache_;
649
  std::unique_ptr<BlobFileCache> blob_file_cache_;
650
  std::unique_ptr<BlobSource> blob_source_;
651
652
  std::unique_ptr<InternalStats> internal_stats_;
653
654
  WriteBufferManager* write_buffer_manager_;
655
656
  MemTable* mem_;
657
  MemTableList imm_;
658
  SuperVersion* super_version_;
659
660
  // An ordinal representing the current SuperVersion. Updated by
661
  // InstallSuperVersion(), i.e. incremented every time super_version_
662
  // changes.
663
  std::atomic<uint64_t> super_version_number_;
664
665
  // Thread's local copy of SuperVersion pointer
666
  // This needs to be destructed before mutex_
667
  std::unique_ptr<ThreadLocalPtr> local_sv_;
668
669
  // pointers for a circular linked list. we use it to support iterations over
670
  // all column families that are alive (note: dropped column families can also
671
  // be alive as long as client holds a reference)
672
  ColumnFamilyData* next_;
673
  ColumnFamilyData* prev_;
674
675
  // This is the earliest log file number that contains data from this
676
  // Column Family. All earlier log files must be ignored and not
677
  // recovered from
678
  uint64_t log_number_;
679
680
  // An object that keeps all the compaction stats
681
  // and picks the next compaction
682
  std::unique_ptr<CompactionPicker> compaction_picker_;
683
684
  ColumnFamilySet* column_family_set_;
685
686
  std::unique_ptr<WriteControllerToken> write_controller_token_;
687
688
  // If true --> this ColumnFamily is currently present in DBImpl::flush_queue_
689
  bool queued_for_flush_;
690
691
  // If true --> this ColumnFamily is currently present in
692
  // DBImpl::compaction_queue_
693
  bool queued_for_compaction_;
694
695
  uint64_t prev_compaction_needed_bytes_;
696
697
  // if the database was opened with 2pc enabled
698
  bool allow_2pc_;
699
700
  // Memtable id to track flush.
701
  uint64_t last_memtable_id_;
702
703
  // Directories corresponding to cf_paths.
704
  std::vector<std::shared_ptr<FSDirectory>> data_dirs_;
705
706
  bool db_paths_registered_;
707
708
  std::string full_history_ts_low_;
709
710
  // For charging memory usage of file metadata created for newly added files to
711
  // a Version associated with this CFD
712
  std::shared_ptr<CacheReservationManager> file_metadata_cache_res_mgr_;
713
  bool mempurge_used_;
714
715
  std::atomic<uint64_t> next_epoch_number_;
716
};
717
718
// ColumnFamilySet has interesting thread-safety requirements
719
// * CreateColumnFamily() or RemoveColumnFamily() -- need to be protected by DB
720
// mutex AND executed in the write thread.
721
// CreateColumnFamily() should ONLY be called from VersionSet::LogAndApply() AND
722
// single-threaded write thread. It is also called during Recovery and in
723
// DumpManifest().
724
// RemoveColumnFamily() is only called from SetDropped(). DB mutex needs to be
725
// held and it needs to be executed from the write thread. SetDropped() also
726
// guarantees that it will be called only from single-threaded LogAndApply(),
727
// but this condition is not that important.
728
// * Iteration -- hold DB mutex. If you want to release the DB mutex in the
729
// body of the iteration, wrap in a RefedColumnFamilySet.
730
// * GetDefault() -- thread safe
731
// * GetColumnFamily() -- either inside of DB mutex or from a write thread
732
// * GetNextColumnFamilyID(), GetMaxColumnFamily(), UpdateMaxColumnFamily(),
733
// NumberOfColumnFamilies -- inside of DB mutex
734
class ColumnFamilySet {
735
 public:
736
  // ColumnFamilySet supports iteration
737
  class iterator {
738
   public:
739
2.76M
    explicit iterator(ColumnFamilyData* cfd) : current_(cfd) {}
740
    // NOTE: minimum operators for for-loop iteration
741
1.91M
    iterator& operator++() {
742
1.91M
      current_ = current_->next_;
743
1.91M
      return *this;
744
1.91M
    }
745
3.29M
    bool operator!=(const iterator& other) const {
746
3.29M
      return this->current_ != other.current_;
747
3.29M
    }
748
1.91M
    ColumnFamilyData* operator*() { return current_; }
749
750
   private:
751
    ColumnFamilyData* current_;
752
  };
753
754
  ColumnFamilySet(const std::string& dbname,
755
                  const ImmutableDBOptions* db_options,
756
                  const FileOptions& file_options, Cache* table_cache,
757
                  WriteBufferManager* _write_buffer_manager,
758
                  WriteController* _write_controller,
759
                  BlockCacheTracer* const block_cache_tracer,
760
                  const std::shared_ptr<IOTracer>& io_tracer,
761
                  const std::string& db_id, const std::string& db_session_id);
762
  ~ColumnFamilySet();
763
764
  ColumnFamilyData* GetDefault() const;
765
  // GetColumnFamily() calls return nullptr if column family is not found
766
  ColumnFamilyData* GetColumnFamily(uint32_t id) const;
767
  ColumnFamilyData* GetColumnFamily(const std::string& name) const;
768
  // this call will return the next available column family ID. it guarantees
769
  // that there is no column family with id greater than or equal to the
770
  // returned value in the current running instance or anytime in RocksDB
771
  // instance history.
772
  uint32_t GetNextColumnFamilyID();
773
  uint32_t GetMaxColumnFamily();
774
  void UpdateMaxColumnFamily(uint32_t new_max_column_family);
775
  size_t NumberOfColumnFamilies() const;
776
777
  ColumnFamilyData* CreateColumnFamily(const std::string& name, uint32_t id,
778
                                       Version* dummy_version,
779
                                       const ColumnFamilyOptions& options,
780
                                       bool read_only);
781
782
  const UnorderedMap<uint32_t, size_t>& GetRunningColumnFamiliesTimestampSize()
783
39.2k
      const {
784
39.2k
    return running_ts_sz_;
785
39.2k
  }
786
787
  const UnorderedMap<uint32_t, size_t>&
788
1.01M
  GetColumnFamiliesTimestampSizeForRecord() const {
789
1.01M
    return ts_sz_for_record_;
790
1.01M
  }
791
792
1.38M
  iterator begin() { return iterator(dummy_cfd_->next_); }
793
1.38M
  iterator end() { return iterator(dummy_cfd_); }
794
795
102k
  Cache* get_table_cache() { return table_cache_; }
796
797
0
  WriteBufferManager* write_buffer_manager() { return write_buffer_manager_; }
798
799
17.6k
  WriteController* write_controller() { return write_controller_; }
800
801
 private:
802
  friend class ColumnFamilyData;
803
  // helper function that gets called from cfd destructor
804
  // REQUIRES: DB mutex held
805
  void RemoveColumnFamily(ColumnFamilyData* cfd);
806
807
  // column_families_ and column_family_data_ need to be protected:
808
  // * when mutating both conditions have to be satisfied:
809
  // 1. DB mutex locked
810
  // 2. thread currently in single-threaded write thread
811
  // * when reading, at least one condition needs to be satisfied:
812
  // 1. DB mutex locked
813
  // 2. accessed from a single-threaded write thread
814
  UnorderedMap<std::string, uint32_t> column_families_;
815
  UnorderedMap<uint32_t, ColumnFamilyData*> column_family_data_;
816
817
  // Mutating / reading `running_ts_sz_` and `ts_sz_for_record_` follow
818
  // the same requirements as `column_families_` and `column_family_data_`.
819
  // Mapping from column family id to user-defined timestamp size for all
820
  // running column families.
821
  UnorderedMap<uint32_t, size_t> running_ts_sz_;
822
  // Mapping from column family id to user-defined timestamp size for
823
  // column families with non-zero user-defined timestamp size.
824
  UnorderedMap<uint32_t, size_t> ts_sz_for_record_;
825
826
  uint32_t max_column_family_;
827
  const FileOptions file_options_;
828
829
  ColumnFamilyData* dummy_cfd_;
830
  // We don't hold the refcount here, since default column family always exists
831
  // We are also not responsible for cleaning up default_cfd_cache_. This is
832
  // just a cache that makes common case (accessing default column family)
833
  // faster
834
  ColumnFamilyData* default_cfd_cache_;
835
836
  const std::string db_name_;
837
  const ImmutableDBOptions* const db_options_;
838
  Cache* table_cache_;
839
  WriteBufferManager* write_buffer_manager_;
840
  WriteController* write_controller_;
841
  BlockCacheTracer* const block_cache_tracer_;
842
  std::shared_ptr<IOTracer> io_tracer_;
843
  const std::string& db_id_;
844
  std::string db_session_id_;
845
};
846
847
// A wrapper for ColumnFamilySet that supports releasing DB mutex during each
848
// iteration over the iterator, because the cfd is Refed and Unrefed during
849
// each iteration to prevent concurrent CF drop from destroying it (until
850
// Unref).
851
class RefedColumnFamilySet {
852
 public:
853
274
  explicit RefedColumnFamilySet(ColumnFamilySet* cfs) : wrapped_(cfs) {}
854
855
  class iterator {
856
   public:
857
548
    explicit iterator(ColumnFamilySet::iterator wrapped) : wrapped_(wrapped) {
858
548
      MaybeRef(*wrapped_);
859
548
    }
860
548
    ~iterator() { MaybeUnref(*wrapped_); }
861
934
    inline void MaybeRef(ColumnFamilyData* cfd) {
862
934
      if (cfd->GetID() != ColumnFamilyData::kDummyColumnFamilyDataId) {
863
386
        cfd->Ref();
864
386
      }
865
934
    }
866
934
    inline void MaybeUnref(ColumnFamilyData* cfd) {
867
934
      if (cfd->GetID() != ColumnFamilyData::kDummyColumnFamilyDataId) {
868
386
        cfd->UnrefAndTryDelete();
869
386
      }
870
934
    }
871
    // NOTE: minimum operators for for-loop iteration
872
386
    inline iterator& operator++() {
873
386
      ColumnFamilyData* old = *wrapped_;
874
386
      ++wrapped_;
875
      // Can only unref & potentially free cfd after accessing its next_
876
386
      MaybeUnref(old);
877
386
      MaybeRef(*wrapped_);
878
386
      return *this;
879
386
    }
880
660
    inline bool operator!=(const iterator& other) const {
881
660
      return this->wrapped_ != other.wrapped_;
882
660
    }
883
386
    inline ColumnFamilyData* operator*() { return *wrapped_; }
884
885
   private:
886
    ColumnFamilySet::iterator wrapped_;
887
  };
888
889
274
  iterator begin() { return iterator(wrapped_->begin()); }
890
274
  iterator end() { return iterator(wrapped_->end()); }
891
892
 private:
893
  ColumnFamilySet* wrapped_;
894
};
895
896
// We use ColumnFamilyMemTablesImpl to provide WriteBatch a way to access
897
// memtables of different column families (specified by ID in the write batch)
898
class ColumnFamilyMemTablesImpl : public ColumnFamilyMemTables {
899
 public:
900
  explicit ColumnFamilyMemTablesImpl(ColumnFamilySet* column_family_set)
901
48.4k
      : column_family_set_(column_family_set), current_(nullptr) {}
902
903
  // Constructs a ColumnFamilyMemTablesImpl equivalent to one constructed
904
  // with the arguments used to construct *orig.
905
  explicit ColumnFamilyMemTablesImpl(ColumnFamilyMemTablesImpl* orig)
906
0
      : column_family_set_(orig->column_family_set_), current_(nullptr) {}
907
908
  // sets current_ to ColumnFamilyData with column_family_id
909
  // returns false if column family doesn't exist
910
  // REQUIRES: use this function of DBImpl::column_family_memtables_ should be
911
  //           under a DB mutex OR from a write thread
912
  bool Seek(uint32_t column_family_id) override;
913
914
  // Returns log number of the selected column family
915
  // REQUIRES: under a DB mutex OR from a write thread
916
  uint64_t GetLogNumber() const override;
917
918
  // REQUIRES: Seek() called first
919
  // REQUIRES: use this function of DBImpl::column_family_memtables_ should be
920
  //           under a DB mutex OR from a write thread
921
  MemTable* GetMemTable() const override;
922
923
  // Returns column family handle for the selected column family
924
  // REQUIRES: use this function of DBImpl::column_family_memtables_ should be
925
  //           under a DB mutex OR from a write thread
926
  ColumnFamilyHandle* GetColumnFamilyHandle() override;
927
928
  // Cannot be called while another thread is calling Seek().
929
  // REQUIRES: use this function of DBImpl::column_family_memtables_ should be
930
  //           under a DB mutex OR from a write thread
931
6.00M
  ColumnFamilyData* current() override { return current_; }
932
933
 private:
934
  ColumnFamilySet* column_family_set_;
935
  ColumnFamilyData* current_;
936
  ColumnFamilyHandleInternal handle_;
937
};
938
939
// Declaration only; the definition is not in this header.
// Presumably maps a column family handle to that column family's numeric ID.
// NOTE(review): confirm the result for a null `column_family` against the
// definition before relying on it.
uint32_t GetColumnFamilyID(ColumnFamilyHandle* column_family);
940
941
// Declaration only; the definition is not in this header.
// Presumably yields the user comparator of the column family behind
// `column_family`. The pointer return type means a null result is possible.
// NOTE(review): confirm when that happens against the definition.
const Comparator* GetColumnFamilyUserComparator(
942
    ColumnFamilyHandle* column_family);
943
944
// Declaration only; the definition is not in this header.
// Returns a reference rather than a pointer, so the signature offers no way
// to report "no options". NOTE(review): presumably `column_family` must be a
// valid, non-null handle -- confirm against the definition.
const ImmutableOptions& GetImmutableOptions(ColumnFamilyHandle* column_family);
945
946
}  // namespace ROCKSDB_NAMESPACE