/src/rocksdb/db/column_family.h
Line | Count | Source |
1 | | // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. |
2 | | // This source code is licensed under both the GPLv2 (found in the |
3 | | // COPYING file in the root directory) and Apache 2.0 License |
4 | | // (found in the LICENSE.Apache file in the root directory). |
5 | | // |
6 | | // Copyright (c) 2011 The LevelDB Authors. All rights reserved. |
7 | | // Use of this source code is governed by a BSD-style license that can be |
8 | | // found in the LICENSE file. See the AUTHORS file for names of contributors. |
9 | | |
10 | | #pragma once |
11 | | |
12 | | #include <atomic> |
13 | | #include <string> |
14 | | #include <unordered_map> |
15 | | #include <vector> |
16 | | |
17 | | #include "cache/cache_reservation_manager.h" |
18 | | #include "db/memtable_list.h" |
19 | | #include "db/snapshot_checker.h" |
20 | | #include "db/table_cache.h" |
21 | | #include "db/table_properties_collector.h" |
22 | | #include "db/write_batch_internal.h" |
23 | | #include "db/write_controller.h" |
24 | | #include "options/cf_options.h" |
25 | | #include "rocksdb/compaction_job_stats.h" |
26 | | #include "rocksdb/db.h" |
27 | | #include "rocksdb/env.h" |
28 | | #include "rocksdb/options.h" |
29 | | #include "trace_replay/block_cache_tracer.h" |
30 | | #include "util/cast_util.h" |
31 | | #include "util/hash_containers.h" |
32 | | #include "util/thread_local.h" |
33 | | |
34 | | namespace ROCKSDB_NAMESPACE { |
35 | | |
36 | | class Version; |
37 | | class VersionSet; |
38 | | class VersionStorageInfo; |
39 | | class MemTable; |
40 | | class MemTableListVersion; |
41 | | class CompactionPicker; |
42 | | class Compaction; |
43 | | class InternalKey; |
44 | | class InternalStats; |
45 | | class ColumnFamilyData; |
46 | | class DBImpl; |
47 | | class LogBuffer; |
48 | | class InstrumentedMutex; |
49 | | class InstrumentedMutexLock; |
50 | | struct SuperVersionContext; |
51 | | class BlobFileCache; |
52 | | class BlobSource; |
53 | | |
54 | | extern const double kIncSlowdownRatio; |
55 | | // This file contains a list of data structures for managing column family |
56 | | // level metadata. |
57 | | // |
58 | | // The basic relationships among classes declared here are illustrated as |
59 | | // follows: |
60 | | // |
61 | | // +----------------------+ +----------------------+ +--------+ |
62 | | // +---+ ColumnFamilyHandle 1 | +--+ ColumnFamilyHandle 2 | | DBImpl | |
63 | | // | +----------------------+ | +----------------------+ +----+---+ |
64 | | // | +--------------------------+ | |
65 | | // | | +-----------------------------+ |
66 | | // | | | |
67 | | // | | +-----------------------------v-------------------------------+ |
68 | | // | | | | |
69 | | // | | | ColumnFamilySet | |
70 | | // | | | | |
71 | | // | | +-------------+--------------------------+----------------+---+ |
72 | | // | | | | | |
73 | | // | +-------------------------------------+ | | |
74 | | // | | | | v |
75 | | // | +-------------v-------------+ +-----v----v---------+ |
76 | | // | | | | | |
77 | | // | | ColumnFamilyData 1 | | ColumnFamilyData 2 | ...... |
78 | | // | | | | | |
79 | | // +---> | | | |
80 | | // | +---------+ | | |
81 | | // | | MemTable| | | |
82 | | // | | List | | | |
83 | | // +--------+---+--+-+----+----+ +--------------------++ |
84 | | // | | | | |
85 | | // | | | | |
86 | | // | | | +-----------------------+ |
87 | | // | | +-----------+ | |
88 | | // v +--------+ | | |
89 | | // +--------+--------+ | | | |
90 | | // | | | | +----------v----------+ |
91 | | // +---> |SuperVersion 1.a +-----------------> | |
92 | | // | +------+ | | MemTableListVersion | |
93 | | // +---+-------------+ | | | | | |
94 | | // | | | | +----+------------+---+ |
95 | | // | current | | | | | |
96 | | // | +-------------+ | |mem | | |
97 | | // | | | | | | |
98 | | // +-v---v-------+ +---v--v---+ +-----v----+ +----v-----+ |
99 | | // | | | | | | | | |
100 | | // | Version 1.a | | memtable | | memtable | | memtable | |
101 | | // | | | 1.a | | 1.b | | 1.c | |
102 | | // +-------------+ | | | | | | |
103 | | // +----------+ +----------+ +----------+ |
104 | | // |
105 | | // DBImpl keeps a ColumnFamilySet, which references all column families by |
106 | | // pointing to respective ColumnFamilyData object of each column family. |
107 | | // This is how DBImpl can list and operate on all the column families. |
108 | | // ColumnFamilyHandle also points to ColumnFamilyData directly, so that |
109 | | // when a user executes a query, it can directly find memtables and Version |
110 | | // as well as SuperVersion to the column family, without going through |
111 | | // ColumnFamilySet. |
112 | | // |
113 | | // ColumnFamilySet points to the latest view of the LSM-tree (list of memtables |
114 | | // and SST files) indirectly, while ongoing operations may hold references |
115 | | // to a current or an out-of-date SuperVersion, which in turn points to a |
116 | | // point-in-time view of the LSM-tree. This guarantees the memtables and SST |
117 | | // files being operated on will not go away, until the SuperVersion is |
118 | | // unreferenced to 0 and destroyed. |
119 | | // |
120 | | // The following graph illustrates a possible set of referencing relationships: |
121 | | // |
122 | | // Column +--------------+ current +-----------+ |
123 | | // Family +---->+ +------------------->+ | |
124 | | // Data | SuperVersion +----------+ | Version A | |
125 | | // | 3 | imm | | | |
126 | | // Iter2 +----->+ | +-------v------+ +-----------+ |
127 | | // +-----+--------+ | MemtableList +----------------> Empty |
128 | | // | | Version r | +-----------+ |
129 | | // | +--------------+ | | |
130 | | // +------------------+ current| Version B | |
131 | | // +--------------+ | +----->+ | |
132 | | // | | | | +-----+-----+ |
133 | | // Compaction +>+ SuperVersion +-------------+ ^ |
134 | | // Job | 2 +------+ | |current |
135 | | // | +----+ | | mem | +------------+ |
136 | | // +--------------+ | | +---------------------> | |
137 | | // | +------------------------> MemTable a | |
138 | | // | mem | | | |
139 | | // +--------------+ | | +------------+ |
140 | | // | +--------------------------+ |
141 | | // Iter1 +-----> SuperVersion | | +------------+ |
142 | | // | 1 +------------------------------>+ | |
143 | | // | +-+ | mem | MemTable b | |
144 | | // +--------------+ | | | | |
145 | | // | | +--------------+ +-----^------+ |
146 | | // | |imm | MemtableList | | |
147 | | // | +--->+ Version s +------------+ |
148 | | // | +--------------+ |
149 | | // | +--------------+ |
150 | | // | | MemtableList | |
151 | | // +------>+ Version t +--------> Empty |
152 | | // imm +--------------+ |
153 | | // |
154 | | // In this example, even if the current LSM-tree consists of Version A and |
155 | | // memtable a, which is also referenced by SuperVersion, two older SuperVersions, |
156 | | // SuperVersion2 and SuperVersion1, still exist, and are referenced by a |
157 | | // compaction job and an old iterator Iter1, respectively. SuperVersion2 |
158 | | // contains Version B, memtable a and memtable b; SuperVersion1 contains |
159 | | // Version B and memtable b (mutable). As a result, Version B and memtable b |
160 | | // are prevented from being destroyed or deleted. |
161 | | |
162 | | // ColumnFamilyHandleImpl is the class that clients use to access different |
163 | | // column families. It has a non-trivial destructor, which gets called when the |
164 | | // client is done using the column family. |
165 | | class ColumnFamilyHandleImpl : public ColumnFamilyHandle { |
166 | | public: |
167 | | // Create while holding the mutex. |
168 | | ColumnFamilyHandleImpl(ColumnFamilyData* cfd, DBImpl* db, |
169 | | InstrumentedMutex* mutex); |
170 | | // Destroy without holding the mutex. |
171 | | virtual ~ColumnFamilyHandleImpl(); |
172 | 4.28M | virtual ColumnFamilyData* cfd() const { return cfd_; } |
173 | 3.84k | virtual DBImpl* db() const { return db_; } |
174 | | |
175 | | uint32_t GetID() const override; |
176 | | const std::string& GetName() const override; |
177 | | Status GetDescriptor(ColumnFamilyDescriptor* desc) override; |
178 | | const Comparator* GetComparator() const override; |
179 | | |
180 | | private: |
181 | | ColumnFamilyData* cfd_; |
182 | | DBImpl* db_; |
183 | | InstrumentedMutex* mutex_; |
184 | | }; |
185 | | |
186 | | // Does not ref-count ColumnFamilyData. |
187 | | // We use this dummy ColumnFamilyHandleImpl because sometimes MemTableInserter |
188 | | // calls DBImpl methods. When this happens, MemTableInserter needs access to a |
189 | | // ColumnFamilyHandle (same as the client would need). In that case, we feed |
190 | | // MemTableInserter a dummy ColumnFamilyHandle and enable it to call DBImpl |
191 | | // methods. |
192 | | class ColumnFamilyHandleInternal : public ColumnFamilyHandleImpl { |
193 | | public: |
194 | | ColumnFamilyHandleInternal() |
195 | 48.4k | : ColumnFamilyHandleImpl(nullptr, nullptr, nullptr), |
196 | 48.4k | internal_cfd_(nullptr) {} |
197 | | |
198 | 2.02M | void SetCFD(ColumnFamilyData* _cfd) { internal_cfd_ = _cfd; } |
199 | 962k | ColumnFamilyData* cfd() const override { return internal_cfd_; } |
200 | | |
201 | | private: |
202 | | ColumnFamilyData* internal_cfd_; |
203 | | }; |
204 | | |
205 | | // Holds references to the memtable, all immutable memtables and the version. |
206 | | struct SuperVersion { |
207 | | // Accessing members of this class is not thread-safe and requires external |
208 | | // synchronization (i.e. DB mutex held or on the write thread). |
209 | | ColumnFamilyData* cfd; |
210 | | ReadOnlyMemTable* mem; |
211 | | MemTableListVersion* imm; |
212 | | Version* current; |
213 | | // TODO: do we really need this in addition to what's in current Version? |
214 | | MutableCFOptions mutable_cf_options; |
215 | | // Version number of the current SuperVersion. |
216 | | uint64_t version_number; |
217 | | WriteStallCondition write_stall_condition; |
218 | | // Each time `full_history_ts_low` collapses history, a new SuperVersion is |
219 | | // installed. This field tracks the effective `full_history_ts_low` for that |
220 | | // SuperVersion, to be used by read APIs for sanity checks. This field is |
221 | | // immutable once the SuperVersion is installed. For a column family that |
222 | | // doesn't enable the UDT feature, this is an empty string. |
223 | | std::string full_history_ts_low; |
224 | | |
225 | | // An immutable snapshot of the DB's seqno to time mapping, usually shared |
226 | | // between SuperVersions. |
227 | | std::shared_ptr<const SeqnoToTimeMapping> seqno_to_time_mapping{nullptr}; |
228 | | |
229 | | // Should be called outside the mutex. |
230 | 94.4k | SuperVersion() = default; |
231 | | ~SuperVersion(); |
232 | | SuperVersion* Ref(); |
233 | | // If Unref() returns true, Cleanup() should be called with the mutex held |
234 | | // before deleting this SuperVersion. |
235 | | bool Unref(); |
236 | | |
237 | | // Call these two methods with the DB mutex held. |
238 | | // Cleanup() unrefs mem, imm and current. It also stores all memtables |
239 | | // that need to be deleted in the to_delete vector. Unrefing those |
240 | | // objects needs to be done under the mutex. |
241 | | void Cleanup(); |
242 | | void Init( |
243 | | ColumnFamilyData* new_cfd, MemTable* new_mem, |
244 | | MemTableListVersion* new_imm, Version* new_current, |
245 | | std::shared_ptr<const SeqnoToTimeMapping> new_seqno_to_time_mapping); |
246 | | |
247 | | // Share the ownership of the seqno to time mapping object referred to in this |
248 | | // SuperVersion. To be used by the new SuperVersion to be installed after this |
249 | | // one if the seqno to time mapping does not change in between these two |
250 | | // SuperVersions, or to share the ownership of the mapping with a FlushJob. |
251 | 10.5k | std::shared_ptr<const SeqnoToTimeMapping> ShareSeqnoToTimeMapping() { |
252 | 10.5k | return seqno_to_time_mapping; |
253 | 10.5k | } |
254 | | |
255 | | // Access the seqno to time mapping object in this SuperVersion. |
256 | 32.5k | UnownedPtr<const SeqnoToTimeMapping> GetSeqnoToTimeMapping() const { |
257 | 32.5k | return seqno_to_time_mapping.get(); |
258 | 32.5k | } |
259 | | |
260 | | // The value of dummy is not actually used. kSVInUse takes its address as a |
261 | | // mark in the thread local storage to indicate the SuperVersion is in use |
262 | | // by a thread. This way, the value of kSVInUse is guaranteed not to conflict |
263 | | // with any SuperVersion object address and is portable across different |
264 | | // platforms. |
265 | | static int dummy; |
266 | | static void* const kSVInUse; |
267 | | static void* const kSVObsolete; |
268 | | |
269 | | private: |
270 | | std::atomic<uint32_t> refs; |
271 | | // We need to_delete because during Cleanup(), imm->Unref() returns |
272 | | // all memtables that we need to free through this vector. We then |
273 | | // delete all those memtables outside of the mutex, during destruction. |
274 | | autovector<ReadOnlyMemTable*> to_delete; |
275 | | }; |
276 | | |
277 | | Status CheckCompressionSupported(const ColumnFamilyOptions& cf_options); |
278 | | |
279 | | Status CheckConcurrentWritesSupported(const ColumnFamilyOptions& cf_options); |
280 | | |
281 | | Status CheckCFPathsSupported(const DBOptions& db_options, |
282 | | const ColumnFamilyOptions& cf_options); |
283 | | |
284 | | ColumnFamilyOptions SanitizeCfOptions(const ImmutableDBOptions& db_options, |
285 | | bool read_only, |
286 | | const ColumnFamilyOptions& src); |
287 | | // Wrap the user-defined table properties collector factories from the CF |
288 | | // options (`ioptions`) into internal ones in |
289 | | // `internal_tbl_prop_coll_factories`. Add a system internal one too. |
290 | | void GetInternalTblPropCollFactory( |
291 | | const ImmutableCFOptions& ioptions, |
292 | | InternalTblPropCollFactories* internal_tbl_prop_coll_factories); |
293 | | |
294 | | class ColumnFamilySet; |
295 | | |
296 | | // This class keeps all the data that a column family needs. |
297 | | // Most methods require the DB mutex to be held, unless otherwise noted. |
298 | | class ColumnFamilyData { |
299 | | public: |
300 | | ~ColumnFamilyData(); |
301 | | |
302 | | // thread-safe |
303 | 2.81M | uint32_t GetID() const { return id_; } |
304 | | // thread-safe |
305 | 477k | const std::string& GetName() const { return name_; } |
306 | | |
307 | | // Ref() can only be called from a context where the caller can guarantee |
308 | | // that ColumnFamilyData is alive (while holding a non-zero ref already, |
309 | | // holding a DB mutex, or as the leader in a write batch group). |
310 | 381k | void Ref() { refs_.fetch_add(1); } |
311 | | |
312 | | // UnrefAndTryDelete() decreases the reference count and frees this object if |
313 | | // needed. Returns true if this is freed, else false. UnrefAndTryDelete() can |
314 | | // only be called while holding a DB mutex, or during single-threaded recovery. |
315 | | bool UnrefAndTryDelete(); |
316 | | |
317 | | // SetDropped() can only be called under the following conditions: |
318 | | // 1) Holding a DB mutex, |
319 | | // 2) from single-threaded write thread, AND |
320 | | // 3) from single-threaded VersionSet::LogAndApply() |
321 | | // After dropping a column family, no other operation on that column family |
322 | | // will be executed. All the files and memory will be, however, kept around |
323 | | // until the client drops the column family handle. That way, the client can |
324 | | // still access data from the dropped column family. |
325 | | // A column family can be dropped and still be alive. In that state: |
326 | | // *) Compaction and flush are not executed on the dropped column family. |
327 | | // *) Client can continue reading from column family. Writes will fail unless |
328 | | // WriteOptions::ignore_missing_column_families is true |
329 | | // When the dropped column family is unreferenced, then we: |
330 | | // *) Remove column family from the linked list maintained by ColumnFamilySet |
331 | | // *) delete all memory associated with that column family |
332 | | // *) delete all the files associated with that column family |
333 | | void SetDropped(); |
334 | 968k | bool IsDropped() const { return dropped_.load(std::memory_order_relaxed); } |
335 | | |
336 | | void SetFlushSkipReschedule(); |
337 | | |
338 | | bool GetAndClearFlushSkipReschedule(); |
339 | | |
340 | | // thread-safe |
341 | 1.28M | int NumberLevels() const { return ioptions_.num_levels; } |
342 | | |
343 | 275k | void SetLogNumber(uint64_t log_number) { log_number_ = log_number; } |
344 | 1.45M | uint64_t GetLogNumber() const { return log_number_; } |
345 | | |
346 | | // thread-safe |
347 | | const FileOptions* soptions() const; |
348 | 8.31M | const ImmutableOptions& ioptions() const { return ioptions_; } |
349 | | // REQUIRES: DB mutex held |
350 | | // This returns the MutableCFOptions used by the current SuperVersion. |
351 | | // You should use this API to reference MutableCFOptions most of the time. |
352 | 386 | const MutableCFOptions& GetCurrentMutableCFOptions() const { |
353 | 386 | return super_version_->mutable_cf_options; |
354 | 386 | } |
355 | | // REQUIRES: DB mutex held |
356 | | // This returns the latest MutableCFOptions, which may not be in effect yet. |
357 | 1.29M | const MutableCFOptions& GetLatestMutableCFOptions() const { |
358 | 1.29M | return mutable_cf_options_; |
359 | 1.29M | } |
360 | | |
361 | | // REQUIRES: DB mutex held |
362 | | // Build ColumnFamilyOptions with immutable options and latest mutable |
363 | | // options. |
364 | | ColumnFamilyOptions GetLatestCFOptions() const; |
365 | | |
366 | 962k | bool is_delete_range_supported() { return is_delete_range_supported_; } |
367 | | |
368 | | // Validate CF options against DB options. |
369 | | static Status ValidateOptions(const DBOptions& db_options, |
370 | | const ColumnFamilyOptions& cf_options); |
371 | | // REQUIRES: DB mutex held |
372 | | Status SetOptions( |
373 | | const DBOptions& db_options, |
374 | | const std::unordered_map<std::string, std::string>& options_map); |
375 | | |
376 | 282k | InternalStats* internal_stats() { return internal_stats_.get(); } |
377 | | |
378 | 117k | MemTableList* imm() { return &imm_; } |
379 | 3.97M | MemTable* mem() { return mem_; } |
380 | | |
381 | 1.74k | bool IsEmpty() { |
382 | 1.74k | return mem()->GetFirstSequenceNumber() == 0 && imm()->NumNotFlushed() == 0; |
383 | 1.74k | } |
384 | | |
385 | 755k | Version* dummy_versions() { return dummy_versions_; } |
386 | 2.30M | Version* current() { return current_; } // REQUIRES: DB mutex held |
387 | | void SetCurrent(Version* _current); // REQUIRES: DB mutex held |
388 | | uint64_t GetNumLiveVersions() const; // REQUIRES: DB mutex held |
389 | | uint64_t GetTotalSstFilesSize() const; // REQUIRES: DB mutex held |
390 | | uint64_t GetLiveSstFilesSize() const; // REQUIRES: DB mutex held |
391 | | uint64_t GetTotalBlobFileSize() const; // REQUIRES: DB mutex held |
392 | | // REQUIRES: DB mutex held |
393 | 114k | void SetMemtable(MemTable* new_mem) { |
394 | 114k | AssignMemtableID(new_mem); |
395 | 114k | mem_ = new_mem; |
396 | 114k | if (ioptions_.disallow_memtable_writes) { |
397 | 0 | mem_->MarkImmutable(); |
398 | 0 | } |
399 | 114k | } |
400 | | |
401 | 114k | void AssignMemtableID(ReadOnlyMemTable* new_imm) { |
402 | 114k | new_imm->SetID(++last_memtable_id_); |
403 | 114k | } |
404 | | |
405 | | // Calculate the oldest log needed for the durability of this column family. |
406 | | uint64_t OldestLogToKeep(); |
407 | | |
408 | | // See the MemTable constructor for an explanation of the earliest_seq param. |
409 | | // `mutable_cf_options` might need to be a saved copy if calling this without |
410 | | // holding the DB mutex. |
411 | | MemTable* ConstructNewMemtable(const MutableCFOptions& mutable_cf_options, |
412 | | SequenceNumber earliest_seq); |
413 | | void CreateNewMemtable(SequenceNumber earliest_seq); |
414 | | |
415 | 559k | TableCache* table_cache() const { return table_cache_.get(); } |
416 | 0 | BlobFileCache* blob_file_cache() const { return blob_file_cache_.get(); } |
417 | 233k | BlobSource* blob_source() const { return blob_source_.get(); } |
418 | | |
419 | | // See documentation in compaction_picker.h |
420 | | // REQUIRES: DB mutex held |
421 | | bool NeedsCompaction() const; |
422 | | // REQUIRES: DB mutex held |
423 | | Compaction* PickCompaction( |
424 | | const MutableCFOptions& mutable_options, |
425 | | const MutableDBOptions& mutable_db_options, |
426 | | const std::vector<SequenceNumber>& existing_snapshots, |
427 | | const SnapshotChecker* snapshot_checker, LogBuffer* log_buffer, |
428 | | bool require_max_output_level = false); |
429 | | |
430 | | // Check if the passed range overlaps with any running compactions. |
431 | | // REQUIRES: DB mutex held |
432 | | bool RangeOverlapWithCompaction(const Slice& smallest_user_key, |
433 | | const Slice& largest_user_key, |
434 | | int level) const; |
435 | | |
436 | | // Check if the passed ranges overlap with any unflushed memtables |
437 | | // (immutable or mutable). |
438 | | // |
439 | | // @param super_version A referenced SuperVersion that will be held for the |
440 | | // duration of this function. |
441 | | // |
442 | | // Thread-safe |
443 | | Status RangesOverlapWithMemtables(const autovector<UserKeyRange>& ranges, |
444 | | SuperVersion* super_version, |
445 | | bool allow_data_in_errors, bool* overlap); |
446 | | |
447 | | // A flag telling a manual compaction to compact all levels together |
448 | | // instead of a specific level. |
449 | | static const int kCompactAllLevels; |
450 | | // A flag telling that a manual compaction's output level is the base level. |
451 | | static const int kCompactToBaseLevel; |
452 | | // REQUIRES: DB mutex held |
453 | | Compaction* CompactRange(const MutableCFOptions& mutable_cf_options, |
454 | | const MutableDBOptions& mutable_db_options, |
455 | | int input_level, int output_level, |
456 | | const CompactRangeOptions& compact_range_options, |
457 | | const InternalKey* begin, const InternalKey* end, |
458 | | InternalKey** compaction_end, bool* manual_conflict, |
459 | | uint64_t max_file_num_to_ignore, |
460 | | const std::string& trim_ts); |
461 | | |
462 | 6.65k | CompactionPicker* compaction_picker() { return compaction_picker_.get(); } |
463 | | // thread-safe |
464 | 4.29M | const Comparator* user_comparator() const { |
465 | 4.29M | return internal_comparator_.user_comparator(); |
466 | 4.29M | } |
467 | | // thread-safe |
468 | 676k | const InternalKeyComparator& internal_comparator() const { |
469 | 676k | return internal_comparator_; |
470 | 676k | } |
471 | | |
472 | 20.6k | const InternalTblPropCollFactories* internal_tbl_prop_coll_factories() const { |
473 | 20.6k | return &internal_tbl_prop_coll_factories_; |
474 | 20.6k | } |
475 | | |
476 | 195k | SuperVersion* GetSuperVersion() { return super_version_; } |
477 | | // thread-safe |
478 | | // Return an already referenced SuperVersion to be used safely. |
479 | | SuperVersion* GetReferencedSuperVersion(DBImpl* db); |
480 | | // thread-safe |
481 | | // Get the SuperVersion stored in thread local storage. If it does not exist, |
482 | | // get a reference from the current SuperVersion. |
483 | | SuperVersion* GetThreadLocalSuperVersion(DBImpl* db); |
484 | | // Try to return the SuperVersion back to thread local storage. Returns true |
485 | | // on success and false on failure. It fails when the thread local storage |
486 | | // contains anything other than the SuperVersion::kSVInUse flag. |
487 | | bool ReturnThreadLocalSuperVersion(SuperVersion* sv); |
488 | | // thread-safe |
489 | 0 | uint64_t GetSuperVersionNumber() const { |
490 | 0 | return super_version_number_.load(); |
491 | 0 | } |
492 | 0 | uint64_t GetSuperVersionNumberRelaxed() const { |
493 | 0 | return super_version_number_.load(std::memory_order_relaxed); |
494 | 0 | } |
495 | | // Only intended for use by DBImpl::InstallSuperVersion() and variants. |
496 | | void InstallSuperVersion(SuperVersionContext* sv_context, |
497 | | InstrumentedMutex* db_mutex, |
498 | | std::optional<std::shared_ptr<SeqnoToTimeMapping>> |
499 | | new_seqno_to_time_mapping = {}); |
500 | | |
501 | | void ResetThreadLocalSuperVersions(); |
502 | | |
503 | | // Protected by DB mutex |
504 | 3.49k | void set_queued_for_flush(bool value) { queued_for_flush_ = value; } |
505 | 9.57k | void set_queued_for_compaction(bool value) { queued_for_compaction_ = value; } |
506 | 3.49k | bool queued_for_flush() { return queued_for_flush_; } |
507 | 97.0k | bool queued_for_compaction() { return queued_for_compaction_; } |
508 | | |
509 | | static std::pair<WriteStallCondition, WriteStallCause> |
510 | | GetWriteStallConditionAndCause( |
511 | | int num_unflushed_memtables, int num_l0_files, |
512 | | uint64_t num_compaction_needed_bytes, |
513 | | const MutableCFOptions& mutable_cf_options, |
514 | | const ImmutableCFOptions& immutable_cf_options); |
515 | | |
516 | | // Recalculate some stall conditions, which are changed only during |
517 | | // compaction, adding new memtable and/or recalculation of compaction score. |
518 | | WriteStallCondition RecalculateWriteStallConditions( |
519 | | const MutableCFOptions& mutable_cf_options); |
520 | | |
521 | 95.3k | void set_initialized() { initialized_.store(true); } |
522 | | |
523 | 289k | bool initialized() const { return initialized_.load(); } |
524 | | |
525 | 130k | const ColumnFamilyOptions& initial_cf_options() { |
526 | 130k | return initial_cf_options_; |
527 | 130k | } |
528 | | |
529 | | // `created_dirs` remembers the directories created, so that we don't need to |
530 | | // call the same creation operation again. |
531 | | Status AddDirectories( |
532 | | std::map<std::string, std::shared_ptr<FSDirectory>>* created_dirs); |
533 | | |
534 | | FSDirectory* GetDataDir(size_t path_id) const; |
535 | | |
536 | | // full_history_ts_low_ can only increase. |
537 | 0 | void SetFullHistoryTsLow(std::string ts_low) { |
538 | 0 | assert(!ts_low.empty()); |
539 | 0 | const Comparator* ucmp = user_comparator(); |
540 | 0 | assert(ucmp); |
541 | | // Guard against resurrected full_history_ts_low persisted in MANIFEST |
542 | | // from previous DB sessions. This could happen if UDT was enabled and then |
543 | | // disabled. |
544 | 0 | if (ucmp->timestamp_size() == 0) { |
545 | 0 | return; |
546 | 0 | } |
547 | 0 | if (full_history_ts_low_.empty() || |
548 | 0 | ucmp->CompareTimestamp(ts_low, full_history_ts_low_) > 0) { |
549 | 0 | full_history_ts_low_ = std::move(ts_low); |
550 | 0 | } |
551 | 0 | } |
552 | | |
553 | 163k | const std::string& GetFullHistoryTsLow() const { |
554 | 163k | const Comparator* ucmp = user_comparator(); |
555 | 163k | assert(ucmp); |
556 | 163k | if (ucmp->timestamp_size() == 0) { |
557 | 163k | assert(full_history_ts_low_.empty()); |
558 | 163k | } |
559 | 163k | return full_history_ts_low_; |
560 | 163k | } |
561 | | |
562 | | // REQUIRES: DB mutex held. |
563 | | // Return true if flushing up to MemTables with ID `max_memtable_id` |
564 | | // should be postponed to retain user-defined timestamps according to the |
565 | | // user's setting. Called by background flush job. |
566 | | bool ShouldPostponeFlushToRetainUDT(uint64_t max_memtable_id); |
567 | | |
568 | 0 | ThreadLocalPtr* TEST_GetLocalSV() { return local_sv_.get(); } |
569 | 0 | WriteBufferManager* write_buffer_mgr() { return write_buffer_manager_; } |
570 | | std::shared_ptr<CacheReservationManager> |
571 | 253k | GetFileMetadataCacheReservationManager() { |
572 | 253k | return file_metadata_cache_res_mgr_; |
573 | 253k | } |
574 | | |
575 | | static const uint32_t kDummyColumnFamilyDataId; |
576 | | |
577 | | // Keep track of whether the mempurge feature was ever used. |
578 | 0 | void SetMempurgeUsed() { mempurge_used_ = true; } |
579 | 1.74k | bool GetMempurgeUsed() { return mempurge_used_; } |
580 | | |
581 | | // Allocate and return a new epoch number. |
582 | 18.7k | uint64_t NewEpochNumber() { return next_epoch_number_.fetch_add(1); } |
583 | | |
584 | | // Get the next epoch number to be assigned. |
585 | 65.4k | uint64_t GetNextEpochNumber() const { return next_epoch_number_.load(); } |
586 | | |
587 | | // Set the next epoch number to be assigned. |
588 | 65.4k | void SetNextEpochNumber(uint64_t next_epoch_number) { |
589 | 65.4k | next_epoch_number_.store(next_epoch_number); |
590 | 65.4k | } |
591 | | |
592 | | // Reset the next epoch number to be assigned (back to 1). |
593 | 65.4k | void ResetNextEpochNumber() { next_epoch_number_.store(1); } |
594 | | |
595 | | // Recover the next epoch number of this CF and the epoch number |
596 | | // of its files (if missing). |
597 | | void RecoverEpochNumbers(); |
598 | | |
599 | 40 | int GetUnflushedMemTableCountForWriteStallCheck() const { |
600 | 40 | return (mem_->IsEmpty() ? 0 : 1) + imm_.NumNotFlushed(); |
601 | 40 | } |
602 | | |
603 | | // thread-safe, DB mutex not needed. |
604 | 275k | bool AllowIngestBehind() const { |
605 | 275k | return ioptions_.cf_allow_ingest_behind || ioptions_.allow_ingest_behind; |
606 | 275k | } |
607 | | |
608 | | private: |
609 | | friend class ColumnFamilySet; |
610 | | ColumnFamilyData( |
611 | | uint32_t id, const std::string& name, Version* dummy_versions, |
612 | | Cache* table_cache, WriteBufferManager* write_buffer_manager, |
613 | | const ColumnFamilyOptions& options, const ImmutableDBOptions& db_options, |
614 | | const FileOptions* file_options, ColumnFamilySet* column_family_set, |
615 | | BlockCacheTracer* const block_cache_tracer, |
616 | | const std::shared_ptr<IOTracer>& io_tracer, const std::string& db_id, |
617 | | const std::string& db_session_id, bool read_only); |
618 | | |
619 | | std::vector<std::string> GetDbPaths() const; |
620 | | |
621 | | uint32_t id_; |
622 | | const std::string name_; |
623 | | Version* dummy_versions_; // Head of circular doubly-linked list of versions. |
624 | | Version* current_; // == dummy_versions->prev_ |
625 | | |
626 | | std::atomic<int> refs_; // outstanding references to ColumnFamilyData |
627 | | std::atomic<bool> initialized_; |
628 | | std::atomic<bool> dropped_; // true if client dropped it |
629 | | |
630 | | // When user-defined timestamps in memtable only feature is enabled, this |
631 | | // flag indicates a successfully requested flush that should |
632 | | // skip being rescheduled and hasn't undergone the rescheduling check yet. |
633 | | // This flag is cleared when a check skips rescheduling a FlushRequest. |
634 | | // With this flag, automatic flushes in regular cases can continue to |
635 | | // retain UDTs by getting rescheduled as usual while manual flushes and |
636 | | // error recovery flushes will proceed without getting rescheduled. |
637 | | std::atomic<bool> flush_skip_reschedule_; |
638 | | |
639 | | const InternalKeyComparator internal_comparator_; |
640 | | InternalTblPropCollFactories internal_tbl_prop_coll_factories_; |
641 | | |
642 | | const ColumnFamilyOptions initial_cf_options_; |
643 | | const ImmutableOptions ioptions_; |
644 | | MutableCFOptions mutable_cf_options_; |
645 | | |
646 | | const bool is_delete_range_supported_; |
647 | | |
648 | | std::unique_ptr<TableCache> table_cache_; |
649 | | std::unique_ptr<BlobFileCache> blob_file_cache_; |
650 | | std::unique_ptr<BlobSource> blob_source_; |
651 | | |
652 | | std::unique_ptr<InternalStats> internal_stats_; |
653 | | |
654 | | WriteBufferManager* write_buffer_manager_; |
655 | | |
656 | | MemTable* mem_; |
657 | | MemTableList imm_; |
658 | | SuperVersion* super_version_; |
659 | | |
660 | | // An ordinal representing the current SuperVersion. Updated by |
661 | | // InstallSuperVersion(), i.e. incremented every time super_version_ |
662 | | // changes. |
663 | | std::atomic<uint64_t> super_version_number_; |
664 | | |
665 | | // Thread's local copy of the SuperVersion pointer. |
666 | | // This needs to be destructed before mutex_. |
667 | | std::unique_ptr<ThreadLocalPtr> local_sv_; |
668 | | |
669 | | // Pointers for a circular linked list. We use it to support iterations over |
670 | | // all column families that are alive (note: dropped column families can also |
671 | | // be alive as long as the client holds a reference). |
672 | | ColumnFamilyData* next_; |
673 | | ColumnFamilyData* prev_; |
674 | | |
675 | | // This is the earliest log file number that contains data from this |
676 | | // Column Family. All earlier log files must be ignored and not |
677 | | // recovered from. |
678 | | uint64_t log_number_; |
679 | | |
680 | | // An object that keeps all the compaction stats |
681 | | // and picks the next compaction. |
682 | | std::unique_ptr<CompactionPicker> compaction_picker_; |
683 | | |
684 | | ColumnFamilySet* column_family_set_; |
685 | | |
686 | | std::unique_ptr<WriteControllerToken> write_controller_token_; |
687 | | |
688 | | // If true --> this ColumnFamily is currently present in DBImpl::flush_queue_ |
689 | | bool queued_for_flush_; |
690 | | |
691 | | // If true --> this ColumnFamily is currently present in |
692 | | // DBImpl::compaction_queue_ |
693 | | bool queued_for_compaction_; |
694 | | |
695 | | uint64_t prev_compaction_needed_bytes_; |
696 | | |
697 | | // Whether the database was opened with 2PC enabled. |
698 | | bool allow_2pc_; |
699 | | |
700 | | // Memtable id to track flush. |
701 | | uint64_t last_memtable_id_; |
702 | | |
703 | | // Directories corresponding to cf_paths. |
704 | | std::vector<std::shared_ptr<FSDirectory>> data_dirs_; |
705 | | |
706 | | bool db_paths_registered_; |
707 | | |
708 | | std::string full_history_ts_low_; |
709 | | |
710 | | // For charging memory usage of file metadata created for newly added files to |
711 | | // a Version associated with this CFD. |
712 | | std::shared_ptr<CacheReservationManager> file_metadata_cache_res_mgr_; |
713 | | bool mempurge_used_; |
714 | | |
715 | | std::atomic<uint64_t> next_epoch_number_; |
716 | | }; |
717 | | |
718 | | // ColumnFamilySet has interesting thread-safety requirements |
719 | | // * CreateColumnFamily() or RemoveColumnFamily() -- need to be protected by DB |
720 | | // mutex AND executed in the write thread. |
721 | | // CreateColumnFamily() should ONLY be called from VersionSet::LogAndApply() AND |
722 | | // single-threaded write thread. It is also called during Recovery and in |
723 | | // DumpManifest(). |
724 | | // RemoveColumnFamily() is only called from SetDropped(). DB mutex needs to be |
725 | | // held and it needs to be executed from the write thread. SetDropped() also |
726 | | // guarantees that it will be called only from single-threaded LogAndApply(), |
727 | | // but this condition is not that important. |
728 | | // * Iteration -- hold DB mutex. If you want to release the DB mutex in the |
729 | | // body of the iteration, wrap in a RefedColumnFamilySet. |
730 | | // * GetDefault() -- thread safe |
731 | | // * GetColumnFamily() -- either inside of DB mutex or from a write thread |
732 | | // * GetNextColumnFamilyID(), GetMaxColumnFamily(), UpdateMaxColumnFamily(), |
733 | | // NumberOfColumnFamilies -- inside of DB mutex |
734 | | class ColumnFamilySet { |
735 | | public: |
736 | | // ColumnFamilySet supports iteration |
737 | | class iterator { |
738 | | public: |
739 | 2.76M | explicit iterator(ColumnFamilyData* cfd) : current_(cfd) {} |
740 | | // NOTE: minimum operators for for-loop iteration |
741 | 1.91M | iterator& operator++() { |
742 | 1.91M | current_ = current_->next_; |
743 | 1.91M | return *this; |
744 | 1.91M | } |
745 | 3.29M | bool operator!=(const iterator& other) const { |
746 | 3.29M | return this->current_ != other.current_; |
747 | 3.29M | } |
748 | 1.91M | ColumnFamilyData* operator*() { return current_; } |
749 | | |
750 | | private: |
751 | | ColumnFamilyData* current_; |
752 | | }; |
753 | | |
754 | | ColumnFamilySet(const std::string& dbname, |
755 | | const ImmutableDBOptions* db_options, |
756 | | const FileOptions& file_options, Cache* table_cache, |
757 | | WriteBufferManager* _write_buffer_manager, |
758 | | WriteController* _write_controller, |
759 | | BlockCacheTracer* const block_cache_tracer, |
760 | | const std::shared_ptr<IOTracer>& io_tracer, |
761 | | const std::string& db_id, const std::string& db_session_id); |
762 | | ~ColumnFamilySet(); |
763 | | |
764 | | ColumnFamilyData* GetDefault() const; |
765 | | // GetColumnFamily() calls return nullptr if column family is not found |
766 | | ColumnFamilyData* GetColumnFamily(uint32_t id) const; |
767 | | ColumnFamilyData* GetColumnFamily(const std::string& name) const; |
768 | | // this call will return the next available column family ID. it guarantees |
769 | | // that there is no column family with id greater than or equal to the |
770 | | // returned value in the current running instance or anytime in RocksDB |
771 | | // instance history. |
772 | | uint32_t GetNextColumnFamilyID(); |
773 | | uint32_t GetMaxColumnFamily(); |
774 | | void UpdateMaxColumnFamily(uint32_t new_max_column_family); |
775 | | size_t NumberOfColumnFamilies() const; |
776 | | |
777 | | ColumnFamilyData* CreateColumnFamily(const std::string& name, uint32_t id, |
778 | | Version* dummy_version, |
779 | | const ColumnFamilyOptions& options, |
780 | | bool read_only); |
781 | | |
782 | | const UnorderedMap<uint32_t, size_t>& GetRunningColumnFamiliesTimestampSize() |
783 | 39.2k | const { |
784 | 39.2k | return running_ts_sz_; |
785 | 39.2k | } |
786 | | |
787 | | const UnorderedMap<uint32_t, size_t>& |
788 | 1.01M | GetColumnFamiliesTimestampSizeForRecord() const { |
789 | 1.01M | return ts_sz_for_record_; |
790 | 1.01M | } |
791 | | |
792 | 1.38M | iterator begin() { return iterator(dummy_cfd_->next_); } |
793 | 1.38M | iterator end() { return iterator(dummy_cfd_); } |
794 | | |
795 | 102k | Cache* get_table_cache() { return table_cache_; } |
796 | | |
797 | 0 | WriteBufferManager* write_buffer_manager() { return write_buffer_manager_; } |
798 | | |
799 | 17.6k | WriteController* write_controller() { return write_controller_; } |
800 | | |
801 | | private: |
802 | | friend class ColumnFamilyData; |
803 | | // helper function that gets called from cfd destructor |
804 | | // REQUIRES: DB mutex held |
805 | | void RemoveColumnFamily(ColumnFamilyData* cfd); |
806 | | |
807 | | // column_families_ and column_family_data_ need to be protected: |
808 | | // * when mutating both conditions have to be satisfied: |
809 | | // 1. DB mutex locked |
810 | | // 2. thread currently in single-threaded write thread |
811 | | // * when reading, at least one condition needs to be satisfied: |
812 | | // 1. DB mutex locked |
813 | | // 2. accessed from a single-threaded write thread |
814 | | UnorderedMap<std::string, uint32_t> column_families_; |
815 | | UnorderedMap<uint32_t, ColumnFamilyData*> column_family_data_; |
816 | | |
817 | | // Mutating / reading `running_ts_sz_` and `ts_sz_for_record_` follow |
818 | | // the same requirements as `column_families_` and `column_family_data_`. |
819 | | // Mapping from column family id to user-defined timestamp size for all |
820 | | // running column families. |
821 | | UnorderedMap<uint32_t, size_t> running_ts_sz_; |
822 | | // Mapping from column family id to user-defined timestamp size for |
823 | | // column families with non-zero user-defined timestamp size. |
824 | | UnorderedMap<uint32_t, size_t> ts_sz_for_record_; |
825 | | |
826 | | uint32_t max_column_family_; |
827 | | const FileOptions file_options_; |
828 | | |
829 | | ColumnFamilyData* dummy_cfd_; |
830 | | // We don't hold the refcount here, since default column family always exists |
831 | | // We are also not responsible for cleaning up default_cfd_cache_. This is |
832 | | // just a cache that makes common case (accessing default column family) |
833 | | // faster |
834 | | ColumnFamilyData* default_cfd_cache_; |
835 | | |
836 | | const std::string db_name_; |
837 | | const ImmutableDBOptions* const db_options_; |
838 | | Cache* table_cache_; |
839 | | WriteBufferManager* write_buffer_manager_; |
840 | | WriteController* write_controller_; |
841 | | BlockCacheTracer* const block_cache_tracer_; |
842 | | std::shared_ptr<IOTracer> io_tracer_; |
843 | | const std::string& db_id_; |
844 | | std::string db_session_id_; |
845 | | }; |
846 | | |
847 | | // A wrapper for ColumnFamilySet that supports releasing DB mutex during each |
848 | | // iteration over the iterator, because the cfd is Refed and Unrefed during |
849 | | // each iteration to prevent concurrent CF drop from destroying it (until |
850 | | // Unref). |
851 | | class RefedColumnFamilySet { |
852 | | public: |
853 | 274 | explicit RefedColumnFamilySet(ColumnFamilySet* cfs) : wrapped_(cfs) {} |
854 | | |
855 | | class iterator { |
856 | | public: |
857 | 548 | explicit iterator(ColumnFamilySet::iterator wrapped) : wrapped_(wrapped) { |
858 | 548 | MaybeRef(*wrapped_); |
859 | 548 | } |
860 | 548 | ~iterator() { MaybeUnref(*wrapped_); } |
861 | 934 | inline void MaybeRef(ColumnFamilyData* cfd) { |
862 | 934 | if (cfd->GetID() != ColumnFamilyData::kDummyColumnFamilyDataId) { |
863 | 386 | cfd->Ref(); |
864 | 386 | } |
865 | 934 | } |
866 | 934 | inline void MaybeUnref(ColumnFamilyData* cfd) { |
867 | 934 | if (cfd->GetID() != ColumnFamilyData::kDummyColumnFamilyDataId) { |
868 | 386 | cfd->UnrefAndTryDelete(); |
869 | 386 | } |
870 | 934 | } |
871 | | // NOTE: minimum operators for for-loop iteration |
872 | 386 | inline iterator& operator++() { |
873 | 386 | ColumnFamilyData* old = *wrapped_; |
874 | 386 | ++wrapped_; |
875 | | // Can only unref & potentially free cfd after accessing its next_ |
876 | 386 | MaybeUnref(old); |
877 | 386 | MaybeRef(*wrapped_); |
878 | 386 | return *this; |
879 | 386 | } |
880 | 660 | inline bool operator!=(const iterator& other) const { |
881 | 660 | return this->wrapped_ != other.wrapped_; |
882 | 660 | } |
883 | 386 | inline ColumnFamilyData* operator*() { return *wrapped_; } |
884 | | |
885 | | private: |
886 | | ColumnFamilySet::iterator wrapped_; |
887 | | }; |
888 | | |
889 | 274 | iterator begin() { return iterator(wrapped_->begin()); } |
890 | 274 | iterator end() { return iterator(wrapped_->end()); } |
891 | | |
892 | | private: |
893 | | ColumnFamilySet* wrapped_; |
894 | | }; |
895 | | |
896 | | // We use ColumnFamilyMemTablesImpl to provide WriteBatch a way to access |
897 | | // memtables of different column families (specified by ID in the write batch) |
898 | | class ColumnFamilyMemTablesImpl : public ColumnFamilyMemTables { |
899 | | public: |
900 | | explicit ColumnFamilyMemTablesImpl(ColumnFamilySet* column_family_set) |
901 | 48.4k | : column_family_set_(column_family_set), current_(nullptr) {} |
902 | | |
903 | | // Constructs a ColumnFamilyMemTablesImpl equivalent to one constructed |
904 | | // with the arguments used to construct *orig. |
905 | | explicit ColumnFamilyMemTablesImpl(ColumnFamilyMemTablesImpl* orig) |
906 | 0 | : column_family_set_(orig->column_family_set_), current_(nullptr) {} |
907 | | |
908 | | // sets current_ to ColumnFamilyData with column_family_id |
909 | | // returns false if column family doesn't exist |
910 | | // REQUIRES: use this function of DBImpl::column_family_memtables_ should be |
911 | | // under a DB mutex OR from a write thread |
912 | | bool Seek(uint32_t column_family_id) override; |
913 | | |
914 | | // Returns log number of the selected column family |
915 | | // REQUIRES: under a DB mutex OR from a write thread |
916 | | uint64_t GetLogNumber() const override; |
917 | | |
918 | | // REQUIRES: Seek() called first |
919 | | // REQUIRES: use this function of DBImpl::column_family_memtables_ should be |
920 | | // under a DB mutex OR from a write thread |
921 | | MemTable* GetMemTable() const override; |
922 | | |
923 | | // Returns column family handle for the selected column family |
924 | | // REQUIRES: use this function of DBImpl::column_family_memtables_ should be |
925 | | // under a DB mutex OR from a write thread |
926 | | ColumnFamilyHandle* GetColumnFamilyHandle() override; |
927 | | |
928 | | // Cannot be called while another thread is calling Seek(). |
929 | | // REQUIRES: use this function of DBImpl::column_family_memtables_ should be |
930 | | // under a DB mutex OR from a write thread |
931 | 6.00M | ColumnFamilyData* current() override { return current_; } |
932 | | |
933 | | private: |
934 | | ColumnFamilySet* column_family_set_; |
935 | | ColumnFamilyData* current_; |
936 | | ColumnFamilyHandleInternal handle_; |
937 | | }; |
938 | | |
939 | | uint32_t GetColumnFamilyID(ColumnFamilyHandle* column_family); |
940 | | |
941 | | const Comparator* GetColumnFamilyUserComparator( |
942 | | ColumnFamilyHandle* column_family); |
943 | | |
944 | | const ImmutableOptions& GetImmutableOptions(ColumnFamilyHandle* column_family); |
945 | | |
946 | | } // namespace ROCKSDB_NAMESPACE |