Coverage Report

Created: 2025-07-23 07:17

/src/rocksdb/db/table_cache.cc

Line  Count  Source
   1         //  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
   2         //  This source code is licensed under both the GPLv2 (found in the
   3         //  COPYING file in the root directory) and Apache 2.0 License
   4         //  (found in the LICENSE.Apache file in the root directory).
   5         //
   6         // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
   7         // Use of this source code is governed by a BSD-style license that can be
   8         // found in the LICENSE file. See the AUTHORS file for names of contributors.
   9
  10         #include "db/table_cache.h"
  11
  12         #include "db/dbformat.h"
  13         #include "db/range_tombstone_fragmenter.h"
  14         #include "db/snapshot_impl.h"
  15         #include "db/version_edit.h"
  16         #include "file/file_util.h"
  17         #include "file/filename.h"
  18         #include "file/random_access_file_reader.h"
  19         #include "monitoring/perf_context_imp.h"
  20         #include "rocksdb/advanced_options.h"
  21         #include "rocksdb/statistics.h"
  22         #include "table/block_based/block_based_table_reader.h"
  23         #include "table/get_context.h"
  24         #include "table/internal_iterator.h"
  25         #include "table/iterator_wrapper.h"
  26         #include "table/multiget_context.h"
  27         #include "table/table_builder.h"
  28         #include "table/table_reader.h"
  29         #include "test_util/sync_point.h"
  30         #include "util/cast_util.h"
  31         #include "util/coding.h"
  32         #include "util/stop_watch.h"
  33
  34         // Generate the regular and coroutine versions of some methods by
  35         // including table_cache_sync_and_async.h twice
  36         // Macros in the header will expand differently based on whether
  37         // WITH_COROUTINES or WITHOUT_COROUTINES is defined
  38         // clang-format off
  39         #define WITHOUT_COROUTINES
  40         #include "db/table_cache_sync_and_async.h"
  41         #undef WITHOUT_COROUTINES
  42         #define WITH_COROUTINES
  43         #include "db/table_cache_sync_and_async.h"
  44         #undef WITH_COROUTINES
  45         // clang-format on
  46
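The include-twice pattern above compiles one set of method bodies in two flavors: the first pass (WITHOUT_COROUTINES) expands into the regular blocking methods, the second pass (WITH_COROUTINES) expands into coroutine-based ones. A rough standalone sketch of the same trick, using hypothetical file and function names rather than the real contents of table_cache_sync_and_async.h:

    // sum_sync_and_async.h -- hypothetical header, deliberately without an
    // include guard so it can be included twice with different macros set.
    #if defined(WITHOUT_COROUTINES)
    #define SUM_FN SumSync
    #elif defined(WITH_COROUTINES)
    #define SUM_FN SumAsync
    #endif
    inline long SUM_FN(const long* v, int n) {
      long total = 0;
      for (int i = 0; i < n; ++i) total += v[i];
      return total;  // a real header would swap in coroutine-style calls here
    }
    #undef SUM_FN

    // sum.cc -- mirrors the include sequence in the listing above.
    #define WITHOUT_COROUTINES
    #include "sum_sync_and_async.h"  // emits SumSync
    #undef WITHOUT_COROUTINES
    #define WITH_COROUTINES
    #include "sum_sync_and_async.h"  // emits SumAsync
    #undef WITH_COROUTINES
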
  47         namespace ROCKSDB_NAMESPACE {
  48
  49         namespace {
  50
  51   165k  static Slice GetSliceForFileNumber(const uint64_t* file_number) {
  52   165k    return Slice(reinterpret_cast<const char*>(file_number),
  53   165k                 sizeof(*file_number));
  54   165k  }
  55
  56      0  void AppendVarint64(IterKey* key, uint64_t v) {
  57      0    char buf[10];
  58      0    auto ptr = EncodeVarint64(buf, v);
  59      0    key->TrimAppend(key->Size(), buf, ptr - buf);
  60      0  }
  61
  62         }  // anonymous namespace
  63
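GetSliceForFileNumber builds the table-cache key by reinterpreting the 8 file-number bytes in place, while AppendVarint64 (unexercised in this run, hence count 0) produces the variable-length encoding used later for row-cache keys. A rough standalone sketch of both encodings without RocksDB's Slice/IterKey types; EncodeVarint64's format is the usual 7-bits-per-byte varint:

    #include <cstdint>
    #include <string>

    // Fixed-width key: the raw 8 bytes of the file number, native byte order.
    static std::string FileNumberKey(uint64_t file_number) {
      return std::string(reinterpret_cast<const char*>(&file_number),
                         sizeof(file_number));
    }

    // Varint encoding: 7 payload bits per byte, high bit set on every byte
    // except the last, matching what EncodeVarint64 produces.
    static void AppendVarint64(std::string* key, uint64_t v) {
      while (v >= 0x80) {
        key->push_back(static_cast<char>((v & 0x7f) | 0x80));
        v >>= 7;
      }
      key->push_back(static_cast<char>(v));
    }
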
  64         const int kLoadConcurency = 128;
  65
  66         TableCache::TableCache(const ImmutableOptions& ioptions,
  67                                const FileOptions* file_options, Cache* const cache,
  68                                BlockCacheTracer* const block_cache_tracer,
  69                                const std::shared_ptr<IOTracer>& io_tracer,
  70                                const std::string& db_session_id)
  71   118k      : ioptions_(ioptions),
  72   118k        file_options_(*file_options),
  73   118k        cache_(cache),
  74   118k        immortal_tables_(false),
  75   118k        block_cache_tracer_(block_cache_tracer),
  76   118k        loader_mutex_(kLoadConcurency),
  77   118k        io_tracer_(io_tracer),
  78   118k        db_session_id_(db_session_id) {
  79   118k    if (ioptions_.row_cache) {
  80             // If the same cache is shared by multiple instances, we need to
  81             // disambiguate its entries.
  82      0      PutVarint64(&row_cache_id_, ioptions_.row_cache->NewId());
  83      0    }
  84   118k  }
  85
  86   118k  TableCache::~TableCache() = default;
  87
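The constructor's row_cache_id_ prefix is how several TableCache instances can safely share one row cache: each instance asks the shared cache for a fresh id (Cache::NewId()) and prepends its varint encoding to every row-cache key it writes. A minimal sketch of the idea with illustrative types only (a plain counter standing in for NewId, a string prefix standing in for the varint):

    #include <atomic>
    #include <cstdint>
    #include <string>

    // Stand-in for the shared cache's NewId(): a process-wide counter.
    struct SharedIdSource {
      std::atomic<uint64_t> next{1};
      uint64_t NewId() { return next.fetch_add(1); }
    };

    // Each table-cache-like client remembers its own id and prefixes every
    // key, so two clients caching the same user key never collide.
    class RowCacheClient {
     public:
      explicit RowCacheClient(SharedIdSource& ids)
          : prefix_(std::to_string(ids.NewId()) + '/') {}
      std::string MakeRowCacheKey(const std::string& user_key) const {
        return prefix_ + user_key;
      }
     private:
      std::string prefix_;
    };
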
  88         Status TableCache::GetTableReader(
  89             const ReadOptions& ro, const FileOptions& file_options,
  90             const InternalKeyComparator& internal_comparator,
  91             const FileMetaData& file_meta, bool sequential_mode,
  92             HistogramImpl* file_read_hist, std::unique_ptr<TableReader>* table_reader,
  93             const MutableCFOptions& mutable_cf_options, bool skip_filters, int level,
  94             bool prefetch_index_and_filter_in_cache,
  95   126k      size_t max_file_size_for_l0_meta_pin, Temperature file_temperature) {
  96   126k    std::string fname = TableFileName(
  97   126k        ioptions_.cf_paths, file_meta.fd.GetNumber(), file_meta.fd.GetPathId());
  98   126k    std::unique_ptr<FSRandomAccessFile> file;
  99   126k    FileOptions fopts = file_options;
 100   126k    fopts.temperature = file_temperature;
 101   126k    Status s = PrepareIOFromReadOptions(ro, ioptions_.clock, fopts.io_options);
 102   126k    TEST_SYNC_POINT_CALLBACK("TableCache::GetTableReader:BeforeOpenFile",
 103   126k                             const_cast<Status*>(&s));
 104   126k    if (s.ok()) {
 105   125k      s = ioptions_.fs->NewRandomAccessFile(fname, fopts, &file, nullptr);
 106   125k    }
 107   126k    if (s.ok()) {
 108   125k      RecordTick(ioptions_.stats, NO_FILE_OPENS);
 109   125k    } else if (s.IsPathNotFound()) {
 110      0      fname = Rocks2LevelTableFileName(fname);
 111             // If this file is also not found, we want to use the error message
 112             // that contains the table file name which is less confusing.
 113      0      Status temp_s =
 114      0          PrepareIOFromReadOptions(ro, ioptions_.clock, fopts.io_options);
 115      0      if (temp_s.ok()) {
 116      0        temp_s = ioptions_.fs->NewRandomAccessFile(fname, file_options, &file,
 117      0                                                   nullptr);
 118      0      }
 119      0      if (temp_s.ok()) {
 120      0        RecordTick(ioptions_.stats, NO_FILE_OPENS);
 121      0        s = temp_s;
 122      0      }
 123      0    }
 124
 125   126k    if (s.ok()) {
 126   125k      if (!sequential_mode && ioptions_.advise_random_on_open) {
 127   125k        file->Hint(FSRandomAccessFile::kRandom);
 128   125k      }
 129   125k      if (ioptions_.default_temperature != Temperature::kUnknown &&
 130   125k          file_temperature == Temperature::kUnknown) {
 131      0        file_temperature = ioptions_.default_temperature;
 132      0      }
 133   125k      StopWatch sw(ioptions_.clock, ioptions_.stats, TABLE_OPEN_IO_MICROS);
 134   125k      std::unique_ptr<RandomAccessFileReader> file_reader(
 135   125k          new RandomAccessFileReader(std::move(file), fname, ioptions_.clock,
 136   125k                                     io_tracer_, ioptions_.stats, SST_READ_MICROS,
 137   125k                                     file_read_hist, ioptions_.rate_limiter.get(),
 138   125k                                     ioptions_.listeners, file_temperature,
 139   125k                                     level == ioptions_.num_levels - 1));
 140   125k      UniqueId64x2 expected_unique_id;
 141   125k      if (ioptions_.verify_sst_unique_id_in_manifest) {
 142   125k        expected_unique_id = file_meta.unique_id;
 143  18.4E      } else {
 144  18.4E        expected_unique_id = kNullUniqueId64x2;  // null ID == no verification
 145  18.4E      }
 146   125k      s = mutable_cf_options.table_factory->NewTableReader(
 147   125k          ro,
 148   125k          TableReaderOptions(
 149   125k              ioptions_, mutable_cf_options.prefix_extractor,
 150   125k              mutable_cf_options.compression_manager.get(), file_options,
 151   125k              internal_comparator,
 152   125k              mutable_cf_options.block_protection_bytes_per_key, skip_filters,
 153   125k              immortal_tables_, false /* force_direct_prefetch */, level,
 154   125k              block_cache_tracer_, max_file_size_for_l0_meta_pin, db_session_id_,
 155   125k              file_meta.fd.GetNumber(), expected_unique_id,
 156   125k              file_meta.fd.largest_seqno, file_meta.tail_size,
 157   125k              file_meta.user_defined_timestamps_persisted),
 158   125k          std::move(file_reader), file_meta.fd.GetFileSize(), table_reader,
 159   125k          prefetch_index_and_filter_in_cache);
 160   125k      TEST_SYNC_POINT("TableCache::GetTableReader:0");
 161   125k    }
 162   126k    return s;
 163   126k  }
 164
 165      0  Cache::Handle* TableCache::Lookup(Cache* cache, uint64_t file_number) {
 166           // NOTE: sharing same Cache with BlobFileCache
 167      0    Slice key = GetSliceForFileNumber(&file_number);
 168      0    return cache->Lookup(key);
 169      0  }
 170
 171         Status TableCache::FindTable(
 172             const ReadOptions& ro, const FileOptions& file_options,
 173             const InternalKeyComparator& internal_comparator,
 174             const FileMetaData& file_meta, TypedHandle** handle,
 175             const MutableCFOptions& mutable_cf_options, const bool no_io,
 176             HistogramImpl* file_read_hist, bool skip_filters, int level,
 177             bool prefetch_index_and_filter_in_cache,
 178   156k      size_t max_file_size_for_l0_meta_pin, Temperature file_temperature) {
 179   156k    PERF_TIMER_GUARD_WITH_CLOCK(find_table_nanos, ioptions_.clock);
 180   156k    uint64_t number = file_meta.fd.GetNumber();
 181           // NOTE: sharing same Cache with BlobFileCache
 182   156k    Slice key = GetSliceForFileNumber(&number);
 183   156k    *handle = cache_.Lookup(key);
 184   156k    TEST_SYNC_POINT_CALLBACK("TableCache::FindTable:0",
 185   156k                             const_cast<bool*>(&no_io));
 186
 187   156k    if (*handle == nullptr) {
 188   126k      if (no_io) {
 189      0        return Status::Incomplete("Table not found in table_cache, no_io is set");
 190      0      }
 191   126k      MutexLock load_lock(&loader_mutex_.Get(key));
 192             // We check the cache again under loading mutex
 193   126k      *handle = cache_.Lookup(key);
 194   126k      if (*handle != nullptr) {
 195      0        return Status::OK();
 196      0      }
 197
 198   126k      std::unique_ptr<TableReader> table_reader;
 199   126k      Status s = GetTableReader(ro, file_options, internal_comparator, file_meta,
 200   126k                                false /* sequential mode */, file_read_hist,
 201   126k                                &table_reader, mutable_cf_options, skip_filters,
 202   126k                                level, prefetch_index_and_filter_in_cache,
 203   126k                                max_file_size_for_l0_meta_pin, file_temperature);
 204   126k      if (!s.ok()) {
 205      0        assert(table_reader == nullptr);
 206      0        RecordTick(ioptions_.stats, NO_FILE_ERRORS);
 207               // We do not cache error results so that if the error is transient,
 208               // or somebody repairs the file, we recover automatically.
 209      0        IGNORE_STATUS_IF_ERROR(s);
 210   126k      } else {
 211   126k        s = cache_.Insert(key, table_reader.get(), 1, handle);
 212   126k        if (s.ok()) {
 213                 // Release ownership of table reader.
 214   125k          table_reader.release();
 215   125k        }
 216   126k      }
 217   126k      return s;
 218   126k    }
 219  30.8k    return Status::OK();
 220   156k  }
 221
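FindTable's shape is a classic double-checked load: probe the cache without any load lock, and only on a miss take a per-key stripe of loader_mutex_ (sized by kLoadConcurency above), re-check under the lock, and then pay for GetTableReader at most once per file even when many threads race on the same SST. A rough standalone sketch of that control flow with ordinary standard-library types; the names are illustrative, not RocksDB's:

    #include <array>
    #include <functional>
    #include <memory>
    #include <mutex>
    #include <string>
    #include <unordered_map>

    class StripedLoaderCache {
     public:
      std::shared_ptr<std::string> FindOrLoad(
          const std::string& key,
          const std::function<std::shared_ptr<std::string>()>& open_fn) {
        if (auto hit = Lookup(key)) return hit;              // fast path, no load lock
        std::lock_guard<std::mutex> load_lock(StripeFor(key));
        if (auto hit = Lookup(key)) return hit;              // re-check under the stripe
        auto value = open_fn();                              // expensive open, done once
        std::lock_guard<std::mutex> map_lock(map_mutex_);
        map_[key] = value;
        return value;
      }

     private:
      std::shared_ptr<std::string> Lookup(const std::string& key) {
        std::lock_guard<std::mutex> map_lock(map_mutex_);
        auto it = map_.find(key);
        if (it == map_.end()) return nullptr;
        return it->second;
      }
      std::mutex& StripeFor(const std::string& key) {
        return stripes_[std::hash<std::string>{}(key) % stripes_.size()];
      }
      std::array<std::mutex, 128> stripes_;  // cf. kLoadConcurency above
      std::mutex map_mutex_;
      std::unordered_map<std::string, std::shared_ptr<std::string>> map_;
    };
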
 222         InternalIterator* TableCache::NewIterator(
 223             const ReadOptions& options, const FileOptions& file_options,
 224             const InternalKeyComparator& icomparator, const FileMetaData& file_meta,
 225             RangeDelAggregator* range_del_agg,
 226             const MutableCFOptions& mutable_cf_options, TableReader** table_reader_ptr,
 227             HistogramImpl* file_read_hist, TableReaderCaller caller, Arena* arena,
 228             bool skip_filters, int level, size_t max_file_size_for_l0_meta_pin,
 229             const InternalKey* smallest_compaction_key,
 230             const InternalKey* largest_compaction_key, bool allow_unprepared_value,
 231             const SequenceNumber* read_seqno,
 232  82.3k      std::unique_ptr<TruncatedRangeDelIterator>* range_del_iter) {
 233  82.3k    PERF_TIMER_GUARD(new_table_iterator_nanos);
 234
 235  82.3k    Status s;
 236  82.3k    TableReader* table_reader = nullptr;
 237  82.3k    TypedHandle* handle = nullptr;
 238  82.3k    if (table_reader_ptr != nullptr) {
 239      0      *table_reader_ptr = nullptr;
 240      0    }
 241  82.3k    bool for_compaction = caller == TableReaderCaller::kCompaction;
 242  82.3k    auto& fd = file_meta.fd;
 243  82.3k    table_reader = fd.table_reader;
 244  82.3k    if (table_reader == nullptr) {
 245  23.9k      s = FindTable(options, file_options, icomparator, file_meta, &handle,
 246  23.9k                    mutable_cf_options,
 247  23.9k                    options.read_tier == kBlockCacheTier /* no_io */,
 248  23.9k                    file_read_hist, skip_filters, level,
 249  23.9k                    true /* prefetch_index_and_filter_in_cache */,
 250  23.9k                    max_file_size_for_l0_meta_pin, file_meta.temperature);
 251  23.9k      if (s.ok()) {
 252  23.9k        table_reader = cache_.Value(handle);
 253  23.9k      }
 254  23.9k    }
 255  82.3k    InternalIterator* result = nullptr;
 256  82.3k    if (s.ok()) {
 257  82.3k      if (options.table_filter &&
 258  82.3k          !options.table_filter(*table_reader->GetTableProperties())) {
 259      0        result = NewEmptyInternalIterator<Slice>(arena);
 260  82.3k      } else {
 261  82.3k        result = table_reader->NewIterator(
 262  82.3k            options, mutable_cf_options.prefix_extractor.get(), arena,
 263  82.3k            skip_filters, caller, file_options.compaction_readahead_size,
 264  82.3k            allow_unprepared_value);
 265  82.3k      }
 266  82.3k      if (handle != nullptr) {
 267  23.9k        cache_.RegisterReleaseAsCleanup(handle, *result);
 268  23.9k        handle = nullptr;  // prevent from releasing below
 269  23.9k      }
 270
 271  82.3k      if (for_compaction) {
 272  25.0k        table_reader->SetupForCompaction();
 273  25.0k      }
 274  82.3k      if (table_reader_ptr != nullptr) {
 275      0        *table_reader_ptr = table_reader;
 276      0      }
 277  82.3k    }
 278  82.3k    if (s.ok() && !options.ignore_range_deletions) {
 279  82.3k      if (range_del_iter != nullptr) {
 280  54.6k        auto new_range_del_iter =
 281  54.6k            read_seqno ? table_reader->NewRangeTombstoneIterator(
 282  13.8k                             *read_seqno, options.timestamp)
 283  54.6k                       : table_reader->NewRangeTombstoneIterator(options);
 284  54.6k        if (new_range_del_iter == nullptr || new_range_del_iter->empty()) {
 285  50.9k          delete new_range_del_iter;
 286  50.9k          *range_del_iter = nullptr;
 287  50.9k        } else {
 288  3.72k          *range_del_iter = std::make_unique<TruncatedRangeDelIterator>(
 289  3.72k              std::unique_ptr<FragmentedRangeTombstoneIterator>(
 290  3.72k                  new_range_del_iter),
 291  3.72k              &icomparator, &file_meta.smallest, &file_meta.largest);
 292  3.72k        }
 293  54.6k      }
 294  82.3k      if (range_del_agg != nullptr) {
 295  28.7k        if (range_del_agg->AddFile(fd.GetNumber())) {
 296  28.7k          std::unique_ptr<FragmentedRangeTombstoneIterator> new_range_del_iter(
 297  28.7k              static_cast<FragmentedRangeTombstoneIterator*>(
 298  28.7k                  table_reader->NewRangeTombstoneIterator(options)));
 299  28.7k          if (new_range_del_iter != nullptr) {
 300      0            s = new_range_del_iter->status();
 301      0          }
 302  28.7k          if (s.ok()) {
 303  28.7k            const InternalKey* smallest = &file_meta.smallest;
 304  28.7k            const InternalKey* largest = &file_meta.largest;
 305  28.7k            if (smallest_compaction_key != nullptr) {
 306  5.36k              smallest = smallest_compaction_key;
 307  5.36k            }
 308  28.7k            if (largest_compaction_key != nullptr) {
 309  5.36k              largest = largest_compaction_key;
 310  5.36k            }
 311  28.7k            range_del_agg->AddTombstones(std::move(new_range_del_iter), smallest,
 312  28.7k                                         largest);
 313  28.7k          }
 314  28.7k        }
 315  28.7k      }
 316  82.3k    }
 317
 318  82.3k    if (handle != nullptr) {
 319      0      cache_.Release(handle);
 320      0    }
 321  82.3k    if (!s.ok()) {
 322      0      assert(result == nullptr);
 323      0      result = NewErrorInternalIterator<Slice>(s, arena);
 324      0    }
 325  82.3k    return result;
 326  82.3k  }
 327
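NewIterator never hands the cache handle back to the caller; RegisterReleaseAsCleanup instead ties the handle's release to the returned iterator's own destruction, and the local handle is nulled so the release at the bottom of the function is skipped. A tiny sketch of that ownership pattern with a generic cleanup callback (illustrative types only, not RocksDB's):

    #include <functional>
    #include <utility>

    // The iterator pins its backing cache entry and releases it when it is
    // destroyed, so callers never touch the cache handle directly.
    class PinningIterator {
     public:
      explicit PinningIterator(std::function<void()> release)
          : release_(std::move(release)) {}
      PinningIterator(const PinningIterator&) = delete;
      PinningIterator& operator=(const PinningIterator&) = delete;
      ~PinningIterator() {
        if (release_) release_();  // e.g. cache->Release(handle)
      }
     private:
      std::function<void()> release_;
    };
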
 328         Status TableCache::GetRangeTombstoneIterator(
 329             const ReadOptions& options,
 330             const InternalKeyComparator& internal_comparator,
 331             const FileMetaData& file_meta, const MutableCFOptions& mutable_cf_options,
 332      0      std::unique_ptr<FragmentedRangeTombstoneIterator>* out_iter) {
 333      0    assert(out_iter);
 334      0    const FileDescriptor& fd = file_meta.fd;
 335      0    Status s;
 336      0    TableReader* t = fd.table_reader;
 337      0    TypedHandle* handle = nullptr;
 338      0    if (t == nullptr) {
 339      0      s = FindTable(options, file_options_, internal_comparator, file_meta,
 340      0                    &handle, mutable_cf_options);
 341      0      if (s.ok()) {
 342      0        t = cache_.Value(handle);
 343      0      }
 344      0    }
 345      0    if (s.ok()) {
 346             // Note: NewRangeTombstoneIterator could return nullptr
 347      0      out_iter->reset(t->NewRangeTombstoneIterator(options));
 348      0    }
 349      0    if (handle) {
 350      0      if (*out_iter) {
 351      0        cache_.RegisterReleaseAsCleanup(handle, **out_iter);
 352      0      } else {
 353      0        cache_.Release(handle);
 354      0      }
 355      0    }
 356      0    return s;
 357      0  }
 358
 359         uint64_t TableCache::CreateRowCacheKeyPrefix(const ReadOptions& options,
 360                                                      const FileDescriptor& fd,
 361                                                      const Slice& internal_key,
 362                                                      GetContext* get_context,
 363      0                                               IterKey& row_cache_key) {
 364      0    uint64_t fd_number = fd.GetNumber();
 365           // We use the user key as cache key instead of the internal key,
 366           // otherwise the whole cache would be invalidated every time the
 367           // sequence key increases. However, to support caching snapshot
 368           // reads, we append a sequence number (incremented by 1 to
 369           // distinguish from 0) rather than the internal_key seq no
 370           // to determine row cache entry visibility.
 371           // If the snapshot is larger than the largest seqno in the file,
 372           // all data should be exposed to the snapshot, so we treat it
 373           // the same as there is no snapshot. The exception is that if
 374           // a seq-checking callback is registered, some internal keys
 375           // may still be filtered out.
 376      0    uint64_t cache_entry_seq_no = 0;
 377
 378           // Maybe we can include the whole file if snapshot == fd.largest_seqno.
 379      0    if (options.snapshot != nullptr &&
 380      0        (get_context->has_callback() ||
 381      0         static_cast_with_check<const SnapshotImpl>(options.snapshot)
 382      0                 ->GetSequenceNumber() <= fd.largest_seqno)) {
 383             // We should consider using options.snapshot->GetSequenceNumber()
 384             // instead of GetInternalKeySeqno(k), which will make the code
 385             // easier to understand.
 386      0      cache_entry_seq_no = 1 + GetInternalKeySeqno(internal_key);
 387      0    }
 388
 389           // Compute row cache key.
 390      0    row_cache_key.TrimAppend(row_cache_key.Size(), row_cache_id_.data(),
 391      0                             row_cache_id_.size());
 392      0    AppendVarint64(&row_cache_key, fd_number);
 393      0    AppendVarint64(&row_cache_key, cache_entry_seq_no);
 394
 395           // Provide a sequence number for callback checking on cache hit.
 396           // As cache_entry_seq_no starts at 1, decrease its value by 1 to get
 397           // a sequence number aligned with the get context's logic.
 398      0    return cache_entry_seq_no == 0 ? 0 : cache_entry_seq_no - 1;
 399      0  }
 400
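CreateRowCacheKeyPrefix lays out the row-cache key as [row_cache_id_][varint file number][varint seq-for-visibility], and GetFromRowCache below appends the user key to complete it; the returned value hands back a sequence number aligned with the get path. A small sketch of that layout with everything made explicit; it appends fixed-width 8-byte fields where the real code uses varints, and the names are illustrative:

    #include <cstdint>
    #include <string>

    static void AppendFixed64(std::string* key, uint64_t v) {
      key->append(reinterpret_cast<const char*>(&v), sizeof(v));
    }

    // Builds [id prefix][file number][seq-for-visibility][user key] and returns
    // the seqno to use for callback checking on a cache hit.
    static uint64_t BuildRowCacheKey(const std::string& row_cache_id,
                                     uint64_t file_number, bool snapshot_read,
                                     uint64_t internal_key_seqno,
                                     const std::string& user_key,
                                     std::string* key) {
      // 0 means "no snapshot constraint"; otherwise store seqno + 1 so that a
      // legitimate seqno of 0 stays distinguishable from "no constraint".
      uint64_t cache_entry_seq_no = snapshot_read ? internal_key_seqno + 1 : 0;
      key->assign(row_cache_id);
      AppendFixed64(key, file_number);
      AppendFixed64(key, cache_entry_seq_no);
      key->append(user_key);
      return cache_entry_seq_no == 0 ? 0 : cache_entry_seq_no - 1;
    }
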
 401         bool TableCache::GetFromRowCache(const Slice& user_key, IterKey& row_cache_key,
 402                                          size_t prefix_size, GetContext* get_context,
 403      0                                   Status* read_status, SequenceNumber seq_no) {
 404      0    bool found = false;
 405
 406      0    row_cache_key.TrimAppend(prefix_size, user_key.data(), user_key.size());
 407      0    RowCacheInterface row_cache{ioptions_.row_cache.get()};
 408      0    if (auto row_handle = row_cache.Lookup(row_cache_key.GetUserKey())) {
 409             // Cleanable routine to release the cache entry
 410      0      Cleanable value_pinner;
 411             // If we get here, the value is located in the cache.
 412             // found_row_cache_entry points to the value in the cache,
 413             // and value_pinner holds the cleanup procedure for the cached entry.
 414             // After replayGetContextLog() returns, get_context.pinnable_slice_
 415             // will point to the cache entry buffer (or a copy based on it) and
 416             // the cleanup routine under value_pinner will be delegated to
 417             // get_context.pinnable_slice_. The cache entry is released when
 418             // get_context.pinnable_slice_ is reset.
 419      0      row_cache.RegisterReleaseAsCleanup(row_handle, value_pinner);
 420             // On a row cache hit, the cache key is known to equal row_cache_key,
 421             // so row_cache_key's seq no can be used to construct the InternalKey.
 422      0      *read_status = replayGetContextLog(*row_cache.Value(row_handle), user_key,
 423      0                                         get_context, &value_pinner, seq_no);
 424      0      RecordTick(ioptions_.stats, ROW_CACHE_HIT);
 425      0      found = true;
 426      0    } else {
 427      0      RecordTick(ioptions_.stats, ROW_CACHE_MISS);
 428      0    }
 429      0    return found;
 430      0  }
 431
 432         Status TableCache::Get(const ReadOptions& options,
 433                                const InternalKeyComparator& internal_comparator,
 434                                const FileMetaData& file_meta, const Slice& k,
 435                                GetContext* get_context,
 436                                const MutableCFOptions& mutable_cf_options,
 437                                HistogramImpl* file_read_hist, bool skip_filters,
 438  1.69k                        int level, size_t max_file_size_for_l0_meta_pin) {
 439  1.69k    auto& fd = file_meta.fd;
 440  1.69k    std::string* row_cache_entry = nullptr;
 441  1.69k    bool done = false;
 442  1.69k    IterKey row_cache_key;
 443  1.69k    std::string row_cache_entry_buffer;
 444
 445           // Check row cache if enabled.
 446           // Reuse row_cache_key sequence number when row cache hits.
 447  1.69k    Status s;
 448  1.69k    if (ioptions_.row_cache && !get_context->NeedToReadSequence()) {
 449      0      auto user_key = ExtractUserKey(k);
 450      0      uint64_t cache_entry_seq_no =
 451      0          CreateRowCacheKeyPrefix(options, fd, k, get_context, row_cache_key);
 452      0      done = GetFromRowCache(user_key, row_cache_key, row_cache_key.Size(),
 453      0                             get_context, &s, cache_entry_seq_no);
 454      0      if (!done) {
 455      0        row_cache_entry = &row_cache_entry_buffer;
 456      0      }
 457      0    }
 458  1.69k    TableReader* t = fd.table_reader;
 459  1.69k    TypedHandle* handle = nullptr;
 460  1.69k    if (s.ok() && !done) {
 461  1.69k      if (t == nullptr) {
 462      0        s = FindTable(options, file_options_, internal_comparator, file_meta,
 463      0                      &handle, mutable_cf_options,
 464      0                      options.read_tier == kBlockCacheTier /* no_io */,
 465      0                      file_read_hist, skip_filters, level,
 466      0                      true /* prefetch_index_and_filter_in_cache */,
 467      0                      max_file_size_for_l0_meta_pin, file_meta.temperature);
 468      0        if (s.ok()) {
 469      0          t = cache_.Value(handle);
 470      0        }
 471      0      }
 472  1.69k      SequenceNumber* max_covering_tombstone_seq =
 473  1.69k          get_context->max_covering_tombstone_seq();
 474  1.69k      if (s.ok() && max_covering_tombstone_seq != nullptr &&
 475  1.69k          !options.ignore_range_deletions) {
 476  1.69k        std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter(
 477  1.69k            t->NewRangeTombstoneIterator(options));
 478  1.69k        if (range_del_iter != nullptr) {
 479      0          SequenceNumber seq =
 480      0              range_del_iter->MaxCoveringTombstoneSeqnum(ExtractUserKey(k));
 481      0          if (seq > *max_covering_tombstone_seq) {
 482      0            *max_covering_tombstone_seq = seq;
 483      0            if (get_context->NeedTimestamp()) {
 484      0              get_context->SetTimestampFromRangeTombstone(
 485      0                  range_del_iter->timestamp());
 486      0            }
 487      0          }
 488      0        }
 489  1.69k      }
 490  1.69k      if (s.ok()) {
 491  1.69k        get_context->SetReplayLog(row_cache_entry);  // nullptr if no cache.
 492  1.69k        s = t->Get(options, k, get_context,
 493  1.69k                   mutable_cf_options.prefix_extractor.get(), skip_filters);
 494  1.69k        get_context->SetReplayLog(nullptr);
 495  1.69k      } else if (options.read_tier == kBlockCacheTier && s.IsIncomplete()) {
 496               // Couldn't find table in cache and couldn't open it because of no_io.
 497      0        get_context->MarkKeyMayExist();
 498      0        done = true;
 499      0      }
 500  1.69k    }
 501
 502           // Put the replay log in row cache only if something was found.
 503  1.69k    if (!done && s.ok() && row_cache_entry && !row_cache_entry->empty()) {
 504      0      RowCacheInterface row_cache{ioptions_.row_cache.get()};
 505      0      size_t charge = row_cache_entry->capacity() + sizeof(std::string);
 506      0      auto row_ptr = new std::string(std::move(*row_cache_entry));
 507      0      Status rcs = row_cache.Insert(row_cache_key.GetUserKey(), row_ptr, charge);
 508      0      if (!rcs.ok()) {
 509               // If row cache is full, it's OK to continue, but we keep ownership of
 510               // row_ptr.
 511      0        delete row_ptr;
 512      0      }
 513      0    }
 514
 515  1.69k    if (handle != nullptr) {
 516      0      cache_.Release(handle);
 517      0    }
 518  1.69k    return s;
 519  1.69k  }
 520
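At the bottom of Get(), the replay log captured via SetReplayLog is handed to the row cache as a heap-allocated string; the cache takes ownership only when the insert succeeds, so the failure path deletes the allocation itself. A small sketch of that handoff, with a hypothetical try_insert callable standing in for RowCacheInterface::Insert:

    #include <functional>
    #include <string>

    // try_insert returns true if the cache accepted the entry and took
    // ownership of the pointer; on failure we still own it and must free it.
    void PutReplayLogInRowCache(
        const std::function<bool(const std::string&, std::string*, size_t)>& try_insert,
        const std::string& key, std::string&& replay_log) {
      auto* row_ptr = new std::string(std::move(replay_log));
      size_t charge = row_ptr->capacity() + sizeof(std::string);
      if (!try_insert(key, row_ptr, charge)) {
        delete row_ptr;  // insert failed, ownership never transferred
      }
    }
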
 521         void TableCache::UpdateRangeTombstoneSeqnums(
 522             const ReadOptions& options, TableReader* t,
 523      0      MultiGetContext::Range& table_range) {
 524      0    std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter(
 525      0        t->NewRangeTombstoneIterator(options));
 526      0    if (range_del_iter != nullptr) {
 527      0      for (auto iter = table_range.begin(); iter != table_range.end(); ++iter) {
 528      0        SequenceNumber* max_covering_tombstone_seq =
 529      0            iter->get_context->max_covering_tombstone_seq();
 530      0        SequenceNumber seq =
 531      0            range_del_iter->MaxCoveringTombstoneSeqnum(iter->ukey_with_ts);
 532      0        if (seq > *max_covering_tombstone_seq) {
 533      0          *max_covering_tombstone_seq = seq;
 534      0          if (iter->get_context->NeedTimestamp()) {
 535      0            iter->get_context->SetTimestampFromRangeTombstone(
 536      0                range_del_iter->timestamp());
 537      0          }
 538      0        }
 539      0      }
 540      0    }
 541      0  }
 542
 543         Status TableCache::MultiGetFilter(
 544             const ReadOptions& options,
 545             const InternalKeyComparator& internal_comparator,
 546             const FileMetaData& file_meta, const MutableCFOptions& mutable_cf_options,
 547             HistogramImpl* file_read_hist, int level,
 548      0      MultiGetContext::Range* mget_range, TypedHandle** table_handle) {
 549      0    auto& fd = file_meta.fd;
 550      0    IterKey row_cache_key;
 551      0    std::string row_cache_entry_buffer;
 552
 553           // Check if we need to use the row cache. If yes, then we cannot do the
 554           // filtering here, since the filtering needs to happen after the row cache
 555           // lookup.
 556      0    KeyContext& first_key = *mget_range->begin();
 557      0    if (ioptions_.row_cache && !first_key.get_context->NeedToReadSequence()) {
 558      0      return Status::NotSupported();
 559      0    }
 560      0    Status s;
 561      0    TableReader* t = fd.table_reader;
 562      0    TypedHandle* handle = nullptr;
 563      0    MultiGetContext::Range tombstone_range(*mget_range, mget_range->begin(),
 564      0                                           mget_range->end());
 565      0    if (t == nullptr) {
 566      0      s = FindTable(options, file_options_, internal_comparator, file_meta,
 567      0                    &handle, mutable_cf_options,
 568      0                    options.read_tier == kBlockCacheTier /* no_io */,
 569      0                    file_read_hist,
 570      0                    /*skip_filters=*/false, level,
 571      0                    true /* prefetch_index_and_filter_in_cache */,
 572      0                    /*max_file_size_for_l0_meta_pin=*/0, file_meta.temperature);
 573      0      if (s.ok()) {
 574      0        t = cache_.Value(handle);
 575      0      }
 576      0      *table_handle = handle;
 577      0    }
 578      0    if (s.ok()) {
 579      0      s = t->MultiGetFilter(options, mutable_cf_options.prefix_extractor.get(),
 580      0                            mget_range);
 581      0    }
 582      0    if (s.ok() && !options.ignore_range_deletions) {
 583             // Update the range tombstone sequence numbers for the keys here
 584             // as TableCache::MultiGet may or may not be called, and even if it
 585             // is, it may be called with fewer keys in the range due to filtering.
 586      0      UpdateRangeTombstoneSeqnums(options, t, tombstone_range);
 587      0    }
 588      0    if (mget_range->empty() && handle) {
 589      0      cache_.Release(handle);
 590      0      *table_handle = nullptr;
 591      0    }
 592
 593      0    return s;
 594      0  }
 595
 596         Status TableCache::GetTableProperties(
 597             const FileOptions& file_options, const ReadOptions& read_options,
 598             const InternalKeyComparator& internal_comparator,
 599             const FileMetaData& file_meta,
 600             std::shared_ptr<const TableProperties>* properties,
 601   160k      const MutableCFOptions& mutable_cf_options, bool no_io) {
 602   160k    auto table_reader = file_meta.fd.table_reader;
 603           // table already been pre-loaded?
 604   160k    if (table_reader) {
 605   160k      *properties = table_reader->GetTableProperties();
 606
 607   160k      return Status::OK();
 608   160k    }
 609
 610      0    TypedHandle* table_handle = nullptr;
 611      0    Status s = FindTable(read_options, file_options, internal_comparator,
 612      0                         file_meta, &table_handle, mutable_cf_options, no_io);
 613      0    if (!s.ok()) {
 614      0      return s;
 615      0    }
 616      0    assert(table_handle);
 617      0    auto table = cache_.Value(table_handle);
 618      0    *properties = table->GetTableProperties();
 619      0    cache_.Release(table_handle);
 620      0    return s;
 621      0  }
 622
 623         Status TableCache::ApproximateKeyAnchors(
 624             const ReadOptions& ro, const InternalKeyComparator& internal_comparator,
 625             const FileMetaData& file_meta, const MutableCFOptions& mutable_cf_options,
 626
 627      0      std::vector<TableReader::Anchor>& anchors) {
 628      0    Status s;
 629      0    TableReader* t = file_meta.fd.table_reader;
 630      0    TypedHandle* handle = nullptr;
 631      0    if (t == nullptr) {
 632      0      s = FindTable(ro, file_options_, internal_comparator, file_meta, &handle,
 633      0                    mutable_cf_options);
 634      0      if (s.ok()) {
 635      0        t = cache_.Value(handle);
 636      0      }
 637      0    }
 638      0    if (s.ok() && t != nullptr) {
 639      0      s = t->ApproximateKeyAnchors(ro, anchors);
 640      0    }
 641      0    if (handle != nullptr) {
 642      0      cache_.Release(handle);
 643      0    }
 644      0    return s;
 645      0  }
 646
 647         size_t TableCache::GetMemoryUsageByTableReader(
 648             const FileOptions& file_options, const ReadOptions& read_options,
 649             const InternalKeyComparator& internal_comparator,
 650      0      const FileMetaData& file_meta, const MutableCFOptions& mutable_cf_options) {
 651      0    auto table_reader = file_meta.fd.table_reader;
 652           // table already been pre-loaded?
 653      0    if (table_reader) {
 654      0      return table_reader->ApproximateMemoryUsage();
 655      0    }
 656
 657      0    TypedHandle* table_handle = nullptr;
 658      0    Status s =
 659      0        FindTable(read_options, file_options, internal_comparator, file_meta,
 660      0                  &table_handle, mutable_cf_options, true /* no_io */);
 661      0    if (!s.ok()) {
 662      0      return 0;
 663      0    }
 664      0    assert(table_handle);
 665      0    auto table = cache_.Value(table_handle);
 666      0    auto ret = table->ApproximateMemoryUsage();
 667      0    cache_.Release(table_handle);
 668      0    return ret;
 669      0  }
 670
 671  8.27k  void TableCache::Evict(Cache* cache, uint64_t file_number) {
 672  8.27k    cache->Erase(GetSliceForFileNumber(&file_number));
 673  8.27k  }
 674
 675         uint64_t TableCache::ApproximateOffsetOf(
 676             const ReadOptions& read_options, const Slice& key,
 677             const FileMetaData& file_meta, TableReaderCaller caller,
 678             const InternalKeyComparator& internal_comparator,
 679      0      const MutableCFOptions& mutable_cf_options) {
 680      0    uint64_t result = 0;
 681      0    TableReader* table_reader = file_meta.fd.table_reader;
 682      0    TypedHandle* table_handle = nullptr;
 683      0    if (table_reader == nullptr) {
 684      0      Status s =
 685      0          FindTable(read_options, file_options_, internal_comparator, file_meta,
 686      0                    &table_handle, mutable_cf_options, false /* no_io */);
 687      0      if (s.ok()) {
 688      0        table_reader = cache_.Value(table_handle);
 689      0      }
 690      0    }
 691
 692      0    if (table_reader != nullptr) {
 693      0      result = table_reader->ApproximateOffsetOf(read_options, key, caller);
 694      0    }
 695      0    if (table_handle != nullptr) {
 696      0      cache_.Release(table_handle);
 697      0    }
 698
 699      0    return result;
 700      0  }
 701
 702         uint64_t TableCache::ApproximateSize(
 703             const ReadOptions& read_options, const Slice& start, const Slice& end,
 704             const FileMetaData& file_meta, TableReaderCaller caller,
 705             const InternalKeyComparator& internal_comparator,
 706      0      const MutableCFOptions& mutable_cf_options) {
 707      0    uint64_t result = 0;
 708      0    TableReader* table_reader = file_meta.fd.table_reader;
 709      0    TypedHandle* table_handle = nullptr;
 710      0    if (table_reader == nullptr) {
 711      0      Status s =
 712      0          FindTable(read_options, file_options_, internal_comparator, file_meta,
 713      0                    &table_handle, mutable_cf_options, false /* no_io */);
 714      0      if (s.ok()) {
 715      0        table_reader = cache_.Value(table_handle);
 716      0      }
 717      0    }
 718
 719      0    if (table_reader != nullptr) {
 720      0      result = table_reader->ApproximateSize(read_options, start, end, caller);
 721      0    }
 722      0    if (table_handle != nullptr) {
 723      0      cache_.Release(table_handle);
 724      0    }
 725
 726      0    return result;
 727      0  }
 728
 729         void TableCache::ReleaseObsolete(Cache* cache, uint64_t file_number,
 730                                          Cache::Handle* h,
 731   126k                                   uint32_t uncache_aggressiveness) {
 732   126k    CacheInterface typed_cache(cache);
 733   126k    TypedHandle* table_handle = reinterpret_cast<TypedHandle*>(h);
 734   126k    if (table_handle == nullptr) {
 735      0      table_handle = typed_cache.Lookup(GetSliceForFileNumber(&file_number));
 736      0    }
 737   126k    if (table_handle != nullptr) {
 738   126k      TableReader* table_reader = typed_cache.Value(table_handle);
 739   126k      table_reader->MarkObsolete(uncache_aggressiveness);
 740   126k      typed_cache.ReleaseAndEraseIfLastRef(table_handle);
 741   126k    }
 742   126k  }
 743
 744         }  // namespace ROCKSDB_NAMESPACE