Coverage Report

Created: 2024-07-27 06:53

/src/rocksdb/db/table_cache.cc
Line | Count | Source
1
//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
2
//  This source code is licensed under both the GPLv2 (found in the
3
//  COPYING file in the root directory) and Apache 2.0 License
4
//  (found in the LICENSE.Apache file in the root directory).
5
//
6
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7
// Use of this source code is governed by a BSD-style license that can be
8
// found in the LICENSE file. See the AUTHORS file for names of contributors.
9
10
#include "db/table_cache.h"
11
12
#include "db/dbformat.h"
13
#include "db/range_tombstone_fragmenter.h"
14
#include "db/snapshot_impl.h"
15
#include "db/version_edit.h"
16
#include "file/file_util.h"
17
#include "file/filename.h"
18
#include "file/random_access_file_reader.h"
19
#include "monitoring/perf_context_imp.h"
20
#include "rocksdb/advanced_options.h"
21
#include "rocksdb/statistics.h"
22
#include "table/block_based/block_based_table_reader.h"
23
#include "table/get_context.h"
24
#include "table/internal_iterator.h"
25
#include "table/iterator_wrapper.h"
26
#include "table/multiget_context.h"
27
#include "table/table_builder.h"
28
#include "table/table_reader.h"
29
#include "test_util/sync_point.h"
30
#include "util/cast_util.h"
31
#include "util/coding.h"
32
#include "util/stop_watch.h"
33
34
// Generate the regular and coroutine versions of some methods by
35
// including table_cache_sync_and_async.h twice
36
// Macros in the header will expand differently based on whether
37
// WITH_COROUTINES or WITHOUT_COROUTINES is defined
38
// clang-format off
39
#define WITHOUT_COROUTINES
40
#include "db/table_cache_sync_and_async.h"
41
#undef WITHOUT_COROUTINES
42
#define WITH_COROUTINES
43
#include "db/table_cache_sync_and_async.h"
44
#undef WITH_COROUTINES
45
// clang-format on
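
As an aside, the include-twice trick above can be illustrated with a small, self-contained sketch. The macro and function names below are hypothetical stand-ins, not the actual contents of table_cache_sync_and_async.h; the point is only that one shared body is expanded once per flavor.

    #include <iostream>
    #include <string>

    // Hypothetical illustration: the same body compiles twice, once as the
    // "sync" flavor and once under a different name standing in for the
    // coroutine flavor that WITH_COROUTINES selects in the real header.
    #define DEFINE_SYNC_AND_ASYNC(SUFFIX)                \
      std::string Get##SUFFIX(const std::string& key) {  \
        return "value-for-" + key; /* shared body */     \
      }

    DEFINE_SYNC_AND_ASYNC(Sync)   // stands in for the WITHOUT_COROUTINES pass
    DEFINE_SYNC_AND_ASYNC(Async)  // stands in for the WITH_COROUTINES pass
    #undef DEFINE_SYNC_AND_ASYNC

    int main() {
      std::cout << GetSync("a") << "\n" << GetAsync("b") << "\n";
      return 0;
    }
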
46
47
namespace ROCKSDB_NAMESPACE {
48
49
namespace {
50
51
8.96k
static Slice GetSliceForFileNumber(const uint64_t* file_number) {
52
8.96k
  return Slice(reinterpret_cast<const char*>(file_number),
53
8.96k
               sizeof(*file_number));
54
8.96k
}
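
The helper above is the entire key scheme for the table cache: the key is simply the 8 raw bytes of the file number as laid out in memory (host byte order, which is fine because the key never leaves the process). Lookup() and Evict() further down reuse the same helper. A rough stand-alone equivalent, for illustration only:

    #include <cstdint>
    #include <cstring>
    #include <string>

    // Illustrative only: builds the same 8-byte key that
    // GetSliceForFileNumber() wraps in a Slice, but as an owning std::string.
    std::string TableCacheKey(uint64_t file_number) {
      std::string key(sizeof(file_number), '\0');
      std::memcpy(&key[0], &file_number, sizeof(file_number));
      return key;  // e.g. file 123 -> the bytes of uint64_t{123} on this host
    }
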
55
56
57
0
void AppendVarint64(IterKey* key, uint64_t v) {
58
0
  char buf[10];
59
0
  auto ptr = EncodeVarint64(buf, v);
60
0
  key->TrimAppend(key->Size(), buf, ptr - buf);
61
0
}
62
63
64
}  // anonymous namespace
65
66
const int kLoadConcurency = 128;
67
68
TableCache::TableCache(const ImmutableOptions& ioptions,
69
                       const FileOptions* file_options, Cache* const cache,
70
                       BlockCacheTracer* const block_cache_tracer,
71
                       const std::shared_ptr<IOTracer>& io_tracer,
72
                       const std::string& db_session_id)
73
    : ioptions_(ioptions),
74
      file_options_(*file_options),
75
      cache_(cache),
76
      immortal_tables_(false),
77
      block_cache_tracer_(block_cache_tracer),
78
      loader_mutex_(kLoadConcurency),
79
      io_tracer_(io_tracer),
80
9.02k
      db_session_id_(db_session_id) {
81
9.02k
  if (ioptions_.row_cache) {
82
    // If the same cache is shared by multiple instances, we need to
83
    // disambiguate its entries.
84
0
    PutVarint64(&row_cache_id_, ioptions_.row_cache->NewId());
85
0
  }
86
9.02k
}
87
88
9.02k
TableCache::~TableCache() = default;
89
90
Status TableCache::GetTableReader(
91
    const ReadOptions& ro, const FileOptions& file_options,
92
    const InternalKeyComparator& internal_comparator,
93
    const FileMetaData& file_meta, bool sequential_mode,
94
    uint8_t block_protection_bytes_per_key, HistogramImpl* file_read_hist,
95
    std::unique_ptr<TableReader>* table_reader,
96
    const std::shared_ptr<const SliceTransform>& prefix_extractor,
97
    bool skip_filters, int level, bool prefetch_index_and_filter_in_cache,
98
4.48k
    size_t max_file_size_for_l0_meta_pin, Temperature file_temperature) {
99
4.48k
  std::string fname = TableFileName(
100
4.48k
      ioptions_.cf_paths, file_meta.fd.GetNumber(), file_meta.fd.GetPathId());
101
4.48k
  std::unique_ptr<FSRandomAccessFile> file;
102
4.48k
  FileOptions fopts = file_options;
103
4.48k
  fopts.temperature = file_temperature;
104
4.48k
  Status s = PrepareIOFromReadOptions(ro, ioptions_.clock, fopts.io_options);
105
4.48k
  TEST_SYNC_POINT_CALLBACK("TableCache::GetTableReader:BeforeOpenFile",
106
4.48k
                           const_cast<Status*>(&s));
107
4.48k
  if (s.ok()) {
108
4.48k
    s = ioptions_.fs->NewRandomAccessFile(fname, fopts, &file, nullptr);
109
4.48k
  }
110
4.48k
  if (s.ok()) {
111
4.48k
    RecordTick(ioptions_.stats, NO_FILE_OPENS);
112
4.48k
  } else if (s.IsPathNotFound()) {
113
0
    fname = Rocks2LevelTableFileName(fname);
114
    // If this file is also not found, we want to use the error message
115
    // that contains the table file name, which is less confusing.
116
0
    Status temp_s =
117
0
        PrepareIOFromReadOptions(ro, ioptions_.clock, fopts.io_options);
118
0
    if (temp_s.ok()) {
119
0
      temp_s = ioptions_.fs->NewRandomAccessFile(fname, file_options, &file,
120
0
                                                 nullptr);
121
0
    }
122
0
    if (temp_s.ok()) {
123
0
      RecordTick(ioptions_.stats, NO_FILE_OPENS);
124
0
      s = temp_s;
125
0
    }
126
0
  }
127
128
4.48k
  if (s.ok()) {
129
4.48k
    if (!sequential_mode && ioptions_.advise_random_on_open) {
130
4.48k
      file->Hint(FSRandomAccessFile::kRandom);
131
4.48k
    }
132
4.48k
    if (ioptions_.default_temperature != Temperature::kUnknown &&
133
4.48k
        file_temperature == Temperature::kUnknown) {
134
0
      file_temperature = ioptions_.default_temperature;
135
0
    }
136
4.48k
    StopWatch sw(ioptions_.clock, ioptions_.stats, TABLE_OPEN_IO_MICROS);
137
4.48k
    std::unique_ptr<RandomAccessFileReader> file_reader(
138
4.48k
        new RandomAccessFileReader(std::move(file), fname, ioptions_.clock,
139
4.48k
                                   io_tracer_, ioptions_.stats, SST_READ_MICROS,
140
4.48k
                                   file_read_hist, ioptions_.rate_limiter.get(),
141
4.48k
                                   ioptions_.listeners, file_temperature,
142
4.48k
                                   level == ioptions_.num_levels - 1));
143
4.48k
    UniqueId64x2 expected_unique_id;
144
4.48k
    if (ioptions_.verify_sst_unique_id_in_manifest) {
145
4.48k
      expected_unique_id = file_meta.unique_id;
146
4.48k
    } else {
147
0
      expected_unique_id = kNullUniqueId64x2;  // null ID == no verification
148
0
    }
149
4.48k
    s = ioptions_.table_factory->NewTableReader(
150
4.48k
        ro,
151
4.48k
        TableReaderOptions(
152
4.48k
            ioptions_, prefix_extractor, file_options, internal_comparator,
153
4.48k
            block_protection_bytes_per_key, skip_filters, immortal_tables_,
154
4.48k
            false /* force_direct_prefetch */, level, block_cache_tracer_,
155
4.48k
            max_file_size_for_l0_meta_pin, db_session_id_,
156
4.48k
            file_meta.fd.GetNumber(), expected_unique_id,
157
4.48k
            file_meta.fd.largest_seqno, file_meta.tail_size,
158
4.48k
            file_meta.user_defined_timestamps_persisted),
159
4.48k
        std::move(file_reader), file_meta.fd.GetFileSize(), table_reader,
160
4.48k
        prefetch_index_and_filter_in_cache);
161
4.48k
    TEST_SYNC_POINT("TableCache::GetTableReader:0");
162
4.48k
  }
163
4.48k
  return s;
164
4.48k
}
165
166
0
Cache::Handle* TableCache::Lookup(Cache* cache, uint64_t file_number) {
167
0
  Slice key = GetSliceForFileNumber(&file_number);
168
0
  return cache->Lookup(key);
169
0
}
170
171
Status TableCache::FindTable(
172
    const ReadOptions& ro, const FileOptions& file_options,
173
    const InternalKeyComparator& internal_comparator,
174
    const FileMetaData& file_meta, TypedHandle** handle,
175
    uint8_t block_protection_bytes_per_key,
176
    const std::shared_ptr<const SliceTransform>& prefix_extractor,
177
    const bool no_io, HistogramImpl* file_read_hist, bool skip_filters,
178
    int level, bool prefetch_index_and_filter_in_cache,
179
8.96k
    size_t max_file_size_for_l0_meta_pin, Temperature file_temperature) {
180
8.96k
  PERF_TIMER_GUARD_WITH_CLOCK(find_table_nanos, ioptions_.clock);
181
8.96k
  uint64_t number = file_meta.fd.GetNumber();
182
8.96k
  Slice key = GetSliceForFileNumber(&number);
183
8.96k
  *handle = cache_.Lookup(key);
184
8.96k
  TEST_SYNC_POINT_CALLBACK("TableCache::FindTable:0",
185
8.96k
                           const_cast<bool*>(&no_io));
186
187
8.96k
  if (*handle == nullptr) {
188
4.48k
    if (no_io) {
189
0
      return Status::Incomplete("Table not found in table_cache, no_io is set");
190
0
    }
191
4.48k
    MutexLock load_lock(&loader_mutex_.Get(key));
192
    // We check the cache again under the loading mutex
193
4.48k
    *handle = cache_.Lookup(key);
194
4.48k
    if (*handle != nullptr) {
195
0
      return Status::OK();
196
0
    }
197
198
4.48k
    std::unique_ptr<TableReader> table_reader;
199
4.48k
    Status s = GetTableReader(ro, file_options, internal_comparator, file_meta,
200
4.48k
                              false /* sequential mode */,
201
4.48k
                              block_protection_bytes_per_key, file_read_hist,
202
4.48k
                              &table_reader, prefix_extractor, skip_filters,
203
4.48k
                              level, prefetch_index_and_filter_in_cache,
204
4.48k
                              max_file_size_for_l0_meta_pin, file_temperature);
205
4.48k
    if (!s.ok()) {
206
0
      assert(table_reader == nullptr);
207
0
      RecordTick(ioptions_.stats, NO_FILE_ERRORS);
208
      // We do not cache error results so that if the error is transient,
209
      // or somebody repairs the file, we recover automatically.
210
4.48k
    } else {
211
4.48k
      s = cache_.Insert(key, table_reader.get(), 1, handle);
212
4.48k
      if (s.ok()) {
213
        // Release ownership of table reader.
214
4.48k
        table_reader.release();
215
4.48k
      }
216
4.48k
    }
217
4.48k
    return s;
218
4.48k
  }
219
4.48k
  return Status::OK();
220
8.96k
}
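
FindTable above is a per-key double-checked load: probe the cache, and only on a miss take a mutex (striped by key via loader_mutex_), re-probe, then do the expensive open and insert. The sketch below shows the same shape with hypothetical types; RocksDB's Cache is internally synchronized, so the extra cache_mutex here exists only to keep the toy std::map correct.

    #include <map>
    #include <memory>
    #include <mutex>
    #include <string>

    struct Table {};  // stands in for TableReader

    std::mutex cache_mutex;   // guards the toy map; a real cache is thread-safe
    std::mutex loader_mutex;  // serializes the expensive open (striped per key in RocksDB)
    std::map<std::string, std::shared_ptr<Table>> cache;

    std::shared_ptr<Table> CacheLookup(const std::string& key) {
      std::lock_guard<std::mutex> lock(cache_mutex);
      auto it = cache.find(key);
      return it == cache.end() ? nullptr : it->second;
    }

    std::shared_ptr<Table> FindOrLoad(const std::string& key) {
      if (auto t = CacheLookup(key)) return t;            // fast path: no loader lock
      std::lock_guard<std::mutex> load_lock(loader_mutex);
      if (auto t = CacheLookup(key)) return t;            // re-check under the lock
      auto table = std::make_shared<Table>();             // stands in for GetTableReader()
      std::lock_guard<std::mutex> lock(cache_mutex);
      cache.emplace(key, table);                          // note: errors are not cached
      return table;
    }
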
221
222
InternalIterator* TableCache::NewIterator(
223
    const ReadOptions& options, const FileOptions& file_options,
224
    const InternalKeyComparator& icomparator, const FileMetaData& file_meta,
225
    RangeDelAggregator* range_del_agg,
226
    const std::shared_ptr<const SliceTransform>& prefix_extractor,
227
    TableReader** table_reader_ptr, HistogramImpl* file_read_hist,
228
    TableReaderCaller caller, Arena* arena, bool skip_filters, int level,
229
    size_t max_file_size_for_l0_meta_pin,
230
    const InternalKey* smallest_compaction_key,
231
    const InternalKey* largest_compaction_key, bool allow_unprepared_value,
232
    uint8_t block_protection_bytes_per_key, const SequenceNumber* read_seqno,
233
8.96k
    std::unique_ptr<TruncatedRangeDelIterator>* range_del_iter) {
234
8.96k
  PERF_TIMER_GUARD(new_table_iterator_nanos);
235
236
8.96k
  Status s;
237
8.96k
  TableReader* table_reader = nullptr;
238
8.96k
  TypedHandle* handle = nullptr;
239
8.96k
  if (table_reader_ptr != nullptr) {
240
0
    *table_reader_ptr = nullptr;
241
0
  }
242
8.96k
  bool for_compaction = caller == TableReaderCaller::kCompaction;
243
8.96k
  auto& fd = file_meta.fd;
244
8.96k
  table_reader = fd.table_reader;
245
8.96k
  if (table_reader == nullptr) {
246
4.48k
    s = FindTable(options, file_options, icomparator, file_meta, &handle,
247
4.48k
                  block_protection_bytes_per_key, prefix_extractor,
248
4.48k
                  options.read_tier == kBlockCacheTier /* no_io */,
249
4.48k
                  file_read_hist, skip_filters, level,
250
4.48k
                  true /* prefetch_index_and_filter_in_cache */,
251
4.48k
                  max_file_size_for_l0_meta_pin, file_meta.temperature);
252
4.48k
    if (s.ok()) {
253
4.48k
      table_reader = cache_.Value(handle);
254
4.48k
    }
255
4.48k
  }
256
8.96k
  InternalIterator* result = nullptr;
257
8.96k
  if (s.ok()) {
258
8.96k
    if (options.table_filter &&
259
8.96k
        !options.table_filter(*table_reader->GetTableProperties())) {
260
0
      result = NewEmptyInternalIterator<Slice>(arena);
261
8.96k
    } else {
262
8.96k
      result = table_reader->NewIterator(
263
8.96k
          options, prefix_extractor.get(), arena, skip_filters, caller,
264
8.96k
          file_options.compaction_readahead_size, allow_unprepared_value);
265
8.96k
    }
266
8.96k
    if (handle != nullptr) {
267
4.48k
      cache_.RegisterReleaseAsCleanup(handle, *result);
268
4.48k
      handle = nullptr;  // prevent from releasing below
269
4.48k
    }
270
271
8.96k
    if (for_compaction) {
272
0
      table_reader->SetupForCompaction();
273
0
    }
274
8.96k
    if (table_reader_ptr != nullptr) {
275
0
      *table_reader_ptr = table_reader;
276
0
    }
277
8.96k
  }
278
8.96k
  if (s.ok() && !options.ignore_range_deletions) {
279
8.96k
    if (range_del_iter != nullptr) {
280
4.48k
      auto new_range_del_iter =
281
4.48k
          read_seqno ? table_reader->NewRangeTombstoneIterator(
282
0
                           *read_seqno, options.timestamp)
283
4.48k
                     : table_reader->NewRangeTombstoneIterator(options);
284
4.48k
      if (new_range_del_iter == nullptr || new_range_del_iter->empty()) {
285
1.57k
        delete new_range_del_iter;
286
1.57k
        *range_del_iter = nullptr;
287
2.90k
      } else {
288
2.90k
        *range_del_iter = std::make_unique<TruncatedRangeDelIterator>(
289
2.90k
            std::unique_ptr<FragmentedRangeTombstoneIterator>(
290
2.90k
                new_range_del_iter),
291
2.90k
            &icomparator, &file_meta.smallest, &file_meta.largest);
292
2.90k
      }
293
4.48k
    }
294
8.96k
    if (range_del_agg != nullptr) {
295
0
      if (range_del_agg->AddFile(fd.GetNumber())) {
296
0
        std::unique_ptr<FragmentedRangeTombstoneIterator> new_range_del_iter(
297
0
            static_cast<FragmentedRangeTombstoneIterator*>(
298
0
                table_reader->NewRangeTombstoneIterator(options)));
299
0
        if (new_range_del_iter != nullptr) {
300
0
          s = new_range_del_iter->status();
301
0
        }
302
0
        if (s.ok()) {
303
0
          const InternalKey* smallest = &file_meta.smallest;
304
0
          const InternalKey* largest = &file_meta.largest;
305
0
          if (smallest_compaction_key != nullptr) {
306
0
            smallest = smallest_compaction_key;
307
0
          }
308
0
          if (largest_compaction_key != nullptr) {
309
0
            largest = largest_compaction_key;
310
0
          }
311
0
          range_del_agg->AddTombstones(std::move(new_range_del_iter), smallest,
312
0
                                       largest);
313
0
        }
314
0
      }
315
0
    }
316
8.96k
  }
317
318
8.96k
  if (handle != nullptr) {
319
0
    cache_.Release(handle);
320
0
  }
321
8.96k
  if (!s.ok()) {
322
0
    assert(result == nullptr);
323
0
    result = NewErrorInternalIterator<Slice>(s, arena);
324
0
  }
325
8.96k
  return result;
326
8.96k
}
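
One lifetime detail in NewIterator worth spelling out: once cache_.RegisterReleaseAsCleanup(handle, *result) runs, the local handle is nulled so the Release at the end of the function is skipped, and the cache pin is dropped only when the iterator is destroyed. A stripped-down sketch of that hand-off (hypothetical types, not RocksDB's Cleanable API):

    #include <functional>
    #include <iostream>
    #include <utility>
    #include <vector>

    // Hypothetical iterator that runs registered cleanups in its destructor,
    // mirroring how the cache handle's release is handed to the iterator.
    struct MiniIterator {
      std::vector<std::function<void()>> cleanups;
      void RegisterCleanup(std::function<void()> fn) {
        cleanups.push_back(std::move(fn));
      }
      ~MiniIterator() {
        for (auto& fn : cleanups) fn();  // e.g. cache.Release(handle)
      }
    };

    int main() {
      int pinned = 1;
      {
        MiniIterator it;
        it.RegisterCleanup([&pinned] { pinned = 0; });  // hand off the release
        // ... iterate while the table reader stays pinned ...
      }  // iterator destroyed here -> cleanup runs -> "handle released"
      std::cout << "pinned=" << pinned << "\n";  // prints pinned=0
      return 0;
    }
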
327
328
Status TableCache::GetRangeTombstoneIterator(
329
    const ReadOptions& options,
330
    const InternalKeyComparator& internal_comparator,
331
    const FileMetaData& file_meta, uint8_t block_protection_bytes_per_key,
332
0
    std::unique_ptr<FragmentedRangeTombstoneIterator>* out_iter) {
333
0
  assert(out_iter);
334
0
  const FileDescriptor& fd = file_meta.fd;
335
0
  Status s;
336
0
  TableReader* t = fd.table_reader;
337
0
  TypedHandle* handle = nullptr;
338
0
  if (t == nullptr) {
339
0
    s = FindTable(options, file_options_, internal_comparator, file_meta,
340
0
                  &handle, block_protection_bytes_per_key);
341
0
    if (s.ok()) {
342
0
      t = cache_.Value(handle);
343
0
    }
344
0
  }
345
0
  if (s.ok()) {
346
    // Note: NewRangeTombstoneIterator could return nullptr
347
0
    out_iter->reset(t->NewRangeTombstoneIterator(options));
348
0
  }
349
0
  if (handle) {
350
0
    if (*out_iter) {
351
0
      cache_.RegisterReleaseAsCleanup(handle, **out_iter);
352
0
    } else {
353
0
      cache_.Release(handle);
354
0
    }
355
0
  }
356
0
  return s;
357
0
}
358
359
uint64_t TableCache::CreateRowCacheKeyPrefix(const ReadOptions& options,
360
                                             const FileDescriptor& fd,
361
                                             const Slice& internal_key,
362
                                             GetContext* get_context,
363
0
                                             IterKey& row_cache_key) {
364
0
  uint64_t fd_number = fd.GetNumber();
365
  // We use the user key as cache key instead of the internal key,
366
  // otherwise the whole cache would be invalidated every time the
367
  // sequence key increases. However, to support caching snapshot
368
  // reads, we append a sequence number (incremented by 1 to
369
  // distinguish from 0) rather than the internal_key's seqno
370
  // to determine row cache entry visibility.
371
  // If the snapshot is larger than the largest seqno in the file,
372
  // all data should be exposed to the snapshot, so we treat it
373
  // the same as there is no snapshot. The exception is that if
374
  // a seq-checking callback is registered, some internal keys
375
  // may still be filtered out.
376
0
  uint64_t cache_entry_seq_no = 0;
377
378
  // Maybe we can include the whole file if snapshot == fd.largest_seqno.
379
0
  if (options.snapshot != nullptr &&
380
0
      (get_context->has_callback() ||
381
0
       static_cast_with_check<const SnapshotImpl>(options.snapshot)
382
0
               ->GetSequenceNumber() <= fd.largest_seqno)) {
383
    // We should consider using options.snapshot->GetSequenceNumber()
384
    // instead of GetInternalKeySeqno(k), which will make the code
385
    // easier to understand.
386
0
    cache_entry_seq_no = 1 + GetInternalKeySeqno(internal_key);
387
0
  }
388
389
  // Compute row cache key.
390
0
  row_cache_key.TrimAppend(row_cache_key.Size(), row_cache_id_.data(),
391
0
                           row_cache_id_.size());
392
0
  AppendVarint64(&row_cache_key, fd_number);
393
0
  AppendVarint64(&row_cache_key, cache_entry_seq_no);
394
395
  // Provide a sequence number for callback checking on cache hit.
396
  // As cache_entry_seq_no starts at 1, decrease its value by 1 to get
397
  // a sequence number aligned with the get context's logic.
398
0
  return cache_entry_seq_no == 0 ? 0 : cache_entry_seq_no - 1;
399
0
}
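
To make the key layout concrete: the prefix built here is row_cache_id_, then the varint-encoded file number, then the adjusted sequence number (0 for no snapshot, otherwise the visible seqno + 1); GetFromRowCache later appends the user key with TrimAppend. The sketch below shows the same layout with fixed-width fields instead of varints, purely for illustration:

    #include <cstdint>
    #include <string>

    // Illustrative layout only; the real code builds this incrementally in an
    // IterKey and encodes the two numbers with EncodeVarint64.
    std::string BuildRowCacheKey(const std::string& row_cache_id,
                                 uint64_t file_number, uint64_t seq_plus_one,
                                 const std::string& user_key) {
      std::string key = row_cache_id;  // per-instance prefix for shared caches
      key.append(reinterpret_cast<const char*>(&file_number),
                 sizeof(file_number));
      key.append(reinterpret_cast<const char*>(&seq_plus_one),
                 sizeof(seq_plus_one));
      key.append(user_key);  // appended per lookup in GetFromRowCache
      return key;
    }
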
400
401
bool TableCache::GetFromRowCache(const Slice& user_key, IterKey& row_cache_key,
402
                                 size_t prefix_size, GetContext* get_context,
403
0
                                 Status* read_status, SequenceNumber seq_no) {
404
0
  bool found = false;
405
406
0
  row_cache_key.TrimAppend(prefix_size, user_key.data(), user_key.size());
407
0
  RowCacheInterface row_cache{ioptions_.row_cache.get()};
408
0
  if (auto row_handle = row_cache.Lookup(row_cache_key.GetUserKey())) {
409
    // Cleanable routine to release the cache entry
410
0
    Cleanable value_pinner;
411
    // If we get here, the value is located in the cache.
412
    // found_row_cache_entry points to the value in the cache,
413
    // and value_pinner holds the cleanup procedure for the cached entry.
414
    // After replayGetContextLog() returns, get_context.pinnable_slice_
415
    // will point to the cache entry buffer (or a copy based on it), and the
416
    // cleanup routine under value_pinner will be delegated to
417
    // get_context.pinnable_slice_. The cache entry is released when
418
    // get_context.pinnable_slice_ is reset.
419
0
    row_cache.RegisterReleaseAsCleanup(row_handle, value_pinner);
420
    // On a row cache hit, since the cache key is the same as row_cache_key,
421
    // we can use row_cache_key's seqno to construct the InternalKey.
422
0
    *read_status = replayGetContextLog(*row_cache.Value(row_handle), user_key,
423
0
                                       get_context, &value_pinner, seq_no);
424
0
    RecordTick(ioptions_.stats, ROW_CACHE_HIT);
425
0
    found = true;
426
0
  } else {
427
0
    RecordTick(ioptions_.stats, ROW_CACHE_MISS);
428
0
  }
429
0
  return found;
430
0
}
431
432
Status TableCache::Get(
433
    const ReadOptions& options,
434
    const InternalKeyComparator& internal_comparator,
435
    const FileMetaData& file_meta, const Slice& k, GetContext* get_context,
436
    uint8_t block_protection_bytes_per_key,
437
    const std::shared_ptr<const SliceTransform>& prefix_extractor,
438
    HistogramImpl* file_read_hist, bool skip_filters, int level,
439
0
    size_t max_file_size_for_l0_meta_pin) {
440
0
  auto& fd = file_meta.fd;
441
0
  std::string* row_cache_entry = nullptr;
442
0
  bool done = false;
443
0
  IterKey row_cache_key;
444
0
  std::string row_cache_entry_buffer;
445
446
  // Check row cache if enabled.
447
  // On a row cache hit, reuse the sequence number from row_cache_key.
448
0
  Status s;
449
0
  if (ioptions_.row_cache && !get_context->NeedToReadSequence()) {
450
0
    auto user_key = ExtractUserKey(k);
451
0
    uint64_t cache_entry_seq_no =
452
0
        CreateRowCacheKeyPrefix(options, fd, k, get_context, row_cache_key);
453
0
    done = GetFromRowCache(user_key, row_cache_key, row_cache_key.Size(),
454
0
                           get_context, &s, cache_entry_seq_no);
455
0
    if (!done) {
456
0
      row_cache_entry = &row_cache_entry_buffer;
457
0
    }
458
0
  }
459
0
  TableReader* t = fd.table_reader;
460
0
  TypedHandle* handle = nullptr;
461
0
  if (s.ok() && !done) {
462
0
    if (t == nullptr) {
463
0
      s = FindTable(options, file_options_, internal_comparator, file_meta,
464
0
                    &handle, block_protection_bytes_per_key, prefix_extractor,
465
0
                    options.read_tier == kBlockCacheTier /* no_io */,
466
0
                    file_read_hist, skip_filters, level,
467
0
                    true /* prefetch_index_and_filter_in_cache */,
468
0
                    max_file_size_for_l0_meta_pin, file_meta.temperature);
469
0
      if (s.ok()) {
470
0
        t = cache_.Value(handle);
471
0
      }
472
0
    }
473
0
    SequenceNumber* max_covering_tombstone_seq =
474
0
        get_context->max_covering_tombstone_seq();
475
0
    if (s.ok() && max_covering_tombstone_seq != nullptr &&
476
0
        !options.ignore_range_deletions) {
477
0
      std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter(
478
0
          t->NewRangeTombstoneIterator(options));
479
0
      if (range_del_iter != nullptr) {
480
0
        SequenceNumber seq =
481
0
            range_del_iter->MaxCoveringTombstoneSeqnum(ExtractUserKey(k));
482
0
        if (seq > *max_covering_tombstone_seq) {
483
0
          *max_covering_tombstone_seq = seq;
484
0
          if (get_context->NeedTimestamp()) {
485
0
            get_context->SetTimestampFromRangeTombstone(
486
0
                range_del_iter->timestamp());
487
0
          }
488
0
        }
489
0
      }
490
0
    }
491
0
    if (s.ok()) {
492
0
      get_context->SetReplayLog(row_cache_entry);  // nullptr if no cache.
493
0
      s = t->Get(options, k, get_context, prefix_extractor.get(), skip_filters);
494
0
      get_context->SetReplayLog(nullptr);
495
0
    } else if (options.read_tier == kBlockCacheTier && s.IsIncomplete()) {
496
      // Couldn't find table in cache and couldn't open it because of no_io.
497
0
      get_context->MarkKeyMayExist();
498
0
      done = true;
499
0
    }
500
0
  }
501
502
  // Put the replay log in row cache only if something was found.
503
0
  if (!done && s.ok() && row_cache_entry && !row_cache_entry->empty()) {
504
0
    RowCacheInterface row_cache{ioptions_.row_cache.get()};
505
0
    size_t charge = row_cache_entry->capacity() + sizeof(std::string);
506
0
    auto row_ptr = new std::string(std::move(*row_cache_entry));
507
0
    Status rcs = row_cache.Insert(row_cache_key.GetUserKey(), row_ptr, charge);
508
0
    if (!rcs.ok()) {
509
      // If row cache is full, it's OK to continue, but we keep ownership of
510
      // row_ptr.
511
0
      delete row_ptr;
512
0
    }
513
0
  }
514
515
0
  if (handle != nullptr) {
516
0
    cache_.Release(handle);
517
0
  }
518
0
  return s;
519
0
}
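
A small ownership detail in the row-cache insert above: the replay log is moved into a heap-allocated string first, the charge accounts for both the string object and its buffer, and if Insert() rejects the entry (for example a full cache with a strict capacity limit) the caller still owns the allocation and must delete it. The same pattern with a stub cache, names hypothetical:

    #include <memory>
    #include <string>

    struct MiniRowCache {
      // Stub standing in for a real cache; pretend it is full and rejects inserts.
      bool Insert(const std::string& /*key*/, std::string* /*value*/,
                  size_t /*charge*/) {
        return false;
      }
    };

    void PutRow(MiniRowCache& cache, const std::string& key,
                std::string&& payload) {
      auto row = std::make_unique<std::string>(std::move(payload));
      // Mirrors row_cache_entry->capacity() + sizeof(std::string) above.
      size_t charge = row->capacity() + sizeof(std::string);
      if (cache.Insert(key, row.get(), charge)) {
        row.release();  // cache took ownership of the allocation
      }
      // else: unique_ptr still owns it and frees it automatically
    }
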
520
521
void TableCache::UpdateRangeTombstoneSeqnums(
522
    const ReadOptions& options, TableReader* t,
523
0
    MultiGetContext::Range& table_range) {
524
0
  std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter(
525
0
      t->NewRangeTombstoneIterator(options));
526
0
  if (range_del_iter != nullptr) {
527
0
    for (auto iter = table_range.begin(); iter != table_range.end(); ++iter) {
528
0
      SequenceNumber* max_covering_tombstone_seq =
529
0
          iter->get_context->max_covering_tombstone_seq();
530
0
      SequenceNumber seq =
531
0
          range_del_iter->MaxCoveringTombstoneSeqnum(iter->ukey_with_ts);
532
0
      if (seq > *max_covering_tombstone_seq) {
533
0
        *max_covering_tombstone_seq = seq;
534
0
        if (iter->get_context->NeedTimestamp()) {
535
0
          iter->get_context->SetTimestampFromRangeTombstone(
536
0
              range_del_iter->timestamp());
537
0
        }
538
0
      }
539
0
    }
540
0
  }
541
0
}
542
543
Status TableCache::MultiGetFilter(
544
    const ReadOptions& options,
545
    const InternalKeyComparator& internal_comparator,
546
    const FileMetaData& file_meta,
547
    const std::shared_ptr<const SliceTransform>& prefix_extractor,
548
    HistogramImpl* file_read_hist, int level,
549
    MultiGetContext::Range* mget_range, TypedHandle** table_handle,
550
0
    uint8_t block_protection_bytes_per_key) {
551
0
  auto& fd = file_meta.fd;
552
0
  IterKey row_cache_key;
553
0
  std::string row_cache_entry_buffer;
554
555
  // Check if we need to use the row cache. If yes, then we cannot do the
556
  // filtering here, since the filtering needs to happen after the row cache
557
  // lookup.
558
0
  KeyContext& first_key = *mget_range->begin();
559
0
  if (ioptions_.row_cache && !first_key.get_context->NeedToReadSequence()) {
560
0
    return Status::NotSupported();
561
0
  }
562
0
  Status s;
563
0
  TableReader* t = fd.table_reader;
564
0
  TypedHandle* handle = nullptr;
565
0
  MultiGetContext::Range tombstone_range(*mget_range, mget_range->begin(),
566
0
                                         mget_range->end());
567
0
  if (t == nullptr) {
568
0
    s = FindTable(options, file_options_, internal_comparator, file_meta,
569
0
                  &handle, block_protection_bytes_per_key, prefix_extractor,
570
0
                  options.read_tier == kBlockCacheTier /* no_io */,
571
0
                  file_read_hist,
572
0
                  /*skip_filters=*/false, level,
573
0
                  true /* prefetch_index_and_filter_in_cache */,
574
0
                  /*max_file_size_for_l0_meta_pin=*/0, file_meta.temperature);
575
0
    if (s.ok()) {
576
0
      t = cache_.Value(handle);
577
0
    }
578
0
    *table_handle = handle;
579
0
  }
580
0
  if (s.ok()) {
581
0
    s = t->MultiGetFilter(options, prefix_extractor.get(), mget_range);
582
0
  }
583
0
  if (s.ok() && !options.ignore_range_deletions) {
584
    // Update the range tombstone sequence numbers for the keys here
585
    // as TableCache::MultiGet may or may not be called, and even if it
586
    // is, it may be called with fewer keys in the range due to filtering.
587
0
    UpdateRangeTombstoneSeqnums(options, t, tombstone_range);
588
0
  }
589
0
  if (mget_range->empty() && handle) {
590
0
    cache_.Release(handle);
591
0
    *table_handle = nullptr;
592
0
  }
593
594
0
  return s;
595
0
}
596
597
Status TableCache::GetTableProperties(
598
    const FileOptions& file_options, const ReadOptions& read_options,
599
    const InternalKeyComparator& internal_comparator,
600
    const FileMetaData& file_meta,
601
    std::shared_ptr<const TableProperties>* properties,
602
    uint8_t block_protection_bytes_per_key,
603
4.48k
    const std::shared_ptr<const SliceTransform>& prefix_extractor, bool no_io) {
604
4.48k
  auto table_reader = file_meta.fd.table_reader;
605
  // Has the table already been pre-loaded?
606
4.48k
  if (table_reader) {
607
4.48k
    *properties = table_reader->GetTableProperties();
608
609
4.48k
    return Status::OK();
610
4.48k
  }
611
612
0
  TypedHandle* table_handle = nullptr;
613
0
  Status s = FindTable(read_options, file_options, internal_comparator,
614
0
                       file_meta, &table_handle, block_protection_bytes_per_key,
615
0
                       prefix_extractor, no_io);
616
0
  if (!s.ok()) {
617
0
    return s;
618
0
  }
619
0
  assert(table_handle);
620
0
  auto table = cache_.Value(table_handle);
621
0
  *properties = table->GetTableProperties();
622
0
  cache_.Release(table_handle);
623
0
  return s;
624
0
}
625
626
Status TableCache::ApproximateKeyAnchors(
627
    const ReadOptions& ro, const InternalKeyComparator& internal_comparator,
628
    const FileMetaData& file_meta, uint8_t block_protection_bytes_per_key,
629
0
    std::vector<TableReader::Anchor>& anchors) {
630
0
  Status s;
631
0
  TableReader* t = file_meta.fd.table_reader;
632
0
  TypedHandle* handle = nullptr;
633
0
  if (t == nullptr) {
634
0
    s = FindTable(ro, file_options_, internal_comparator, file_meta, &handle,
635
0
                  block_protection_bytes_per_key);
636
0
    if (s.ok()) {
637
0
      t = cache_.Value(handle);
638
0
    }
639
0
  }
640
0
  if (s.ok() && t != nullptr) {
641
0
    s = t->ApproximateKeyAnchors(ro, anchors);
642
0
  }
643
0
  if (handle != nullptr) {
644
0
    cache_.Release(handle);
645
0
  }
646
0
  return s;
647
0
}
648
649
size_t TableCache::GetMemoryUsageByTableReader(
650
    const FileOptions& file_options, const ReadOptions& read_options,
651
    const InternalKeyComparator& internal_comparator,
652
    const FileMetaData& file_meta, uint8_t block_protection_bytes_per_key,
653
0
    const std::shared_ptr<const SliceTransform>& prefix_extractor) {
654
0
  auto table_reader = file_meta.fd.table_reader;
655
  // Has the table already been pre-loaded?
656
0
  if (table_reader) {
657
0
    return table_reader->ApproximateMemoryUsage();
658
0
  }
659
660
0
  TypedHandle* table_handle = nullptr;
661
0
  Status s = FindTable(read_options, file_options, internal_comparator,
662
0
                       file_meta, &table_handle, block_protection_bytes_per_key,
663
0
                       prefix_extractor, true /* no_io */);
664
0
  if (!s.ok()) {
665
0
    return 0;
666
0
  }
667
0
  assert(table_handle);
668
0
  auto table = cache_.Value(table_handle);
669
0
  auto ret = table->ApproximateMemoryUsage();
670
0
  cache_.Release(table_handle);
671
0
  return ret;
672
0
}
673
674
0
void TableCache::Evict(Cache* cache, uint64_t file_number) {
675
0
  cache->Erase(GetSliceForFileNumber(&file_number));
676
0
}
677
678
uint64_t TableCache::ApproximateOffsetOf(
679
    const ReadOptions& read_options, const Slice& key,
680
    const FileMetaData& file_meta, TableReaderCaller caller,
681
    const InternalKeyComparator& internal_comparator,
682
    uint8_t block_protection_bytes_per_key,
683
0
    const std::shared_ptr<const SliceTransform>& prefix_extractor) {
684
0
  uint64_t result = 0;
685
0
  TableReader* table_reader = file_meta.fd.table_reader;
686
0
  TypedHandle* table_handle = nullptr;
687
0
  if (table_reader == nullptr) {
688
0
    Status s =
689
0
        FindTable(read_options, file_options_, internal_comparator, file_meta,
690
0
                  &table_handle, block_protection_bytes_per_key,
691
0
                  prefix_extractor, false /* no_io */);
692
0
    if (s.ok()) {
693
0
      table_reader = cache_.Value(table_handle);
694
0
    }
695
0
  }
696
697
0
  if (table_reader != nullptr) {
698
0
    result = table_reader->ApproximateOffsetOf(read_options, key, caller);
699
0
  }
700
0
  if (table_handle != nullptr) {
701
0
    cache_.Release(table_handle);
702
0
  }
703
704
0
  return result;
705
0
}
706
707
uint64_t TableCache::ApproximateSize(
708
    const ReadOptions& read_options, const Slice& start, const Slice& end,
709
    const FileMetaData& file_meta, TableReaderCaller caller,
710
    const InternalKeyComparator& internal_comparator,
711
    uint8_t block_protection_bytes_per_key,
712
0
    const std::shared_ptr<const SliceTransform>& prefix_extractor) {
713
0
  uint64_t result = 0;
714
0
  TableReader* table_reader = file_meta.fd.table_reader;
715
0
  TypedHandle* table_handle = nullptr;
716
0
  if (table_reader == nullptr) {
717
0
    Status s =
718
0
        FindTable(read_options, file_options_, internal_comparator, file_meta,
719
0
                  &table_handle, block_protection_bytes_per_key,
720
0
                  prefix_extractor, false /* no_io */);
721
0
    if (s.ok()) {
722
0
      table_reader = cache_.Value(table_handle);
723
0
    }
724
0
  }
725
726
0
  if (table_reader != nullptr) {
727
0
    result = table_reader->ApproximateSize(read_options, start, end, caller);
728
0
  }
729
0
  if (table_handle != nullptr) {
730
0
    cache_.Release(table_handle);
731
0
  }
732
733
0
  return result;
734
0
}
735
736
void TableCache::ReleaseObsolete(Cache* cache, Cache::Handle* h,
737
4.48k
                                 uint32_t uncache_aggressiveness) {
738
4.48k
  CacheInterface typed_cache(cache);
739
4.48k
  TypedHandle* table_handle = reinterpret_cast<TypedHandle*>(h);
740
4.48k
  TableReader* table_reader = typed_cache.Value(table_handle);
741
4.48k
  table_reader->MarkObsolete(uncache_aggressiveness);
742
4.48k
  typed_cache.ReleaseAndEraseIfLastRef(table_handle);
743
4.48k
}
744
745
}  // namespace ROCKSDB_NAMESPACE