Coverage Report

Created: 2026-05-31 07:45

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/rocksdb/env/io_posix.h
Line
Count
Source
1
//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
2
//  This source code is licensed under both the GPLv2 (found in the
3
//  COPYING file in the root directory) and Apache 2.0 License
4
//  (found in the LICENSE.Apache file in the root directory).
5
//
6
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7
// Use of this source code is governed by a BSD-style license that can be
8
// found in the LICENSE file. See the AUTHORS file for names of contributors.
9
#pragma once
10
#include <errno.h>
11
#if defined(ROCKSDB_IOURING_PRESENT)
12
#include <liburing.h>
13
#include <pthread.h>
14
#include <sys/uio.h>
15
16
#include <cstdio>
17
18
#include "util/string_util.h"
19
20
// Compatibility defines for io_uring flags that may not be present in older
21
// kernel headers. These values are fixed and won't change, so it's safe to
22
// define them even if the running kernel doesn't support them.
23
#ifndef IORING_SETUP_SINGLE_ISSUER
24
#define IORING_SETUP_SINGLE_ISSUER (1U << 12)
25
#endif
26
#ifndef IORING_SETUP_DEFER_TASKRUN
27
#define IORING_SETUP_DEFER_TASKRUN (1U << 13)
28
#endif
29
#endif
30
#include <unistd.h>
31
32
#include <atomic>
33
#include <functional>
34
#include <map>
35
#include <string>
36
37
#include "port/lang.h"
38
#include "port/port.h"
39
#include "rocksdb/env.h"
40
#include "rocksdb/file_system.h"
41
#include "rocksdb/io_status.h"
42
#include "test_util/sync_point.h"
43
#include "util/mutexlock.h"
44
#include "util/thread_local.h"
45
46
// For non-Linux platforms, the following macros are used only as
47
// placeholders.
48
#if !(defined OS_LINUX) && !(defined OS_FREEBSD) && !(defined CYGWIN) && \
49
    !(defined OS_AIX) && !(defined OS_ANDROID)
50
#define POSIX_FADV_NORMAL 0     /* [MC1] no further special treatment */
51
#define POSIX_FADV_RANDOM 1     /* [MC1] expect random page refs */
52
#define POSIX_FADV_SEQUENTIAL 2 /* [MC1] expect sequential page refs */
53
#define POSIX_FADV_WILLNEED 3   /* [MC1] will need these pages */
54
#define POSIX_FADV_DONTNEED 4   /* [MC1] don't need these pages */
55
56
#define POSIX_MADV_NORMAL 0     /* [MC1] no further special treatment */
57
#define POSIX_MADV_RANDOM 1     /* [MC1] expect random page refs */
58
#define POSIX_MADV_SEQUENTIAL 2 /* [MC1] expect sequential page refs */
59
#define POSIX_MADV_WILLNEED 3   /* [MC1] will need these pages */
60
#define POSIX_MADV_DONTNEED 4   /* [MC1] don't need these pages */
61
#endif
62
63
namespace ROCKSDB_NAMESPACE {
64
std::string IOErrorMsg(const std::string& context,
65
                       const std::string& file_name);
66
// file_name can be left empty if it is unknown.
67
IOStatus IOError(const std::string& context, const std::string& file_name,
68
                 int err_number);
69
70
// SyncPoint payload used by deterministic TSAN regression tests to observe
71
// which virtual address range a freshly created mapping occupies.
72
struct TsanMappedMemoryInfo {
73
  // Start of the mapping; TsanAnnotateMappedMemory() stores its `mem`
  // argument here with the volatile qualifier removed.
  const void* addr;
74
  // Size that accompanies `addr`, as passed to TsanAnnotateMappedMemory().
  size_t size;
75
};
76
77
#ifdef __SANITIZE_THREAD__
78
extern "C" void AnnotateNewMemory(const char* file, int line,
79
                                  const volatile void* mem, long size);
80
#endif  // __SANITIZE_THREAD__
81
82
0
inline void TsanAnnotateMappedMemory(const volatile void* mem, size_t size) {
83
0
  TsanMappedMemoryInfo info{const_cast<const void*>(mem), size};
84
0
  TEST_SYNC_POINT_CALLBACK("TsanAnnotateMappedMemory", &info);
85
0
  (void)info;
86
#ifdef __SANITIZE_THREAD__
87
  if (mem != nullptr && size != 0) {
88
    // TSAN does not understand that a new mmap or io_uring setup can legally
89
    // reuse a virtual address from an unrelated, previously unmapped region.
90
    // Reset the shadow state as soon as the new mapping exists so later
91
    // accesses are not reported against stale accesses from the old mapping.
92
    AnnotateNewMemory(__FILE__, __LINE__, mem, static_cast<long>(size));
93
  }
94
#endif  // __SANITIZE_THREAD__
95
0
}
96
97
class PosixHelper {
98
 public:
99
154k
  static const std::string& GetLogicalBlockSizeFileName() {
100
154k
    static const std::string kLogicalBlockSizeFileName = "logical_block_size";
101
154k
    return kLogicalBlockSizeFileName;
102
154k
  }
103
14.5k
  static const std::string& GetMaxSectorsKBFileName() {
104
14.5k
    static const std::string kMaxSectorsKBFileName = "max_sectors_kb";
105
14.5k
    return kMaxSectorsKBFileName;
106
14.5k
  }
107
  static size_t GetUniqueIdFromFile(int fd, char* id, size_t max_size);
108
  static size_t GetLogicalBlockSizeOfFd(int fd);
109
  static Status GetLogicalBlockSizeOfDirectory(const std::string& directory,
110
                                               size_t* size);
111
112
  static Status GetMaxSectorsKBOfDirectory(const std::string& directory,
113
                                           size_t* kb);
114
115
 private:
116
  static const size_t kDefaultMaxSectorsKB = 2 * 1024;
117
118
  static size_t GetMaxSectorsKBOfFd(int fd);
119
120
  // Return the value in the specified `file_name` under
121
  // `/sys/block/xxx/queue/` for the device where the file of `fd` is on.
122
  // If not found, then return the specified `default_return_value`
123
  static size_t GetQueueSysfsFileValueOfFd(int fd, const std::string& file_name,
124
                                           size_t default_return_value);
125
126
  /// Return the value in the specified `file_name` under
127
  // `/sys/block/xxx/queue/` for the device where `directory` is on.
128
  // If not found, then return the specified `default_return_value`
129
  static Status GetQueueSysfsFileValueofDirectory(const std::string& directory,
130
                                                  const std::string& file_name,
131
                                                  size_t* value);
132
};
133
134
/*
135
 * DirectIOHelper
136
 */
137
0
// Returns true when `off` is a multiple of `sector_size`.
// `sector_size` is expected to be a power of two; this is asserted in builds
// where assert() is active.
inline bool IsSectorAligned(const size_t off, size_t sector_size) {
  const size_t low_bits_mask = sector_size - 1;
  assert((sector_size & low_bits_mask) == 0);
  const size_t remainder = off & low_bits_mask;
  return remainder == 0;
}
141
142
#ifndef NDEBUG
143
// Pointer overload: returns true when the address stored in `ptr` is a
// multiple of `sector_size`.
inline bool IsSectorAligned(const void* ptr, size_t sector_size) {
  const uintptr_t address = reinterpret_cast<uintptr_t>(ptr);
  return (address % sector_size) == 0;
}
146
#endif
147
148
#if defined(ROCKSDB_IOURING_PRESENT)
149
struct Posix_IOHandle {
150
  Posix_IOHandle(struct io_uring* _iu,
151
                 std::function<void(FSReadRequest&, void*)> _cb, void* _cb_arg,
152
                 uint64_t _offset, size_t _len, char* _scratch,
153
                 bool _use_direct_io, size_t _alignment)
154
      : iu(_iu),
155
        cb(_cb),
156
        cb_arg(_cb_arg),
157
        offset(_offset),
158
        len(_len),
159
        scratch(_scratch),
160
        use_direct_io(_use_direct_io),
161
        alignment(_alignment),
162
        is_finished(false),
163
        is_being_aborted(false),
164
        req_count(0) {}
165
166
  struct iovec iov;
167
  struct io_uring* iu;
168
  std::function<void(FSReadRequest&, void*)> cb;
169
  void* cb_arg;
170
  uint64_t offset;
171
  size_t len;
172
  char* scratch;
173
  bool use_direct_io;
174
  size_t alignment;
175
  bool is_finished;
176
  // is_being_aborted is set by AbortIO when a cancel request is submitted.
177
  // Used to distinguish between aborted handles (expect 2 completions) and
178
  // non-aborted handles (expect 1 completion) when processing completions.
179
  bool is_being_aborted;
180
  // req_count is used by AbortIO API to keep track of number of requests.
181
  uint32_t req_count;
182
};
183
184
inline void UpdateResult(struct io_uring_cqe* cqe, const std::string& file_name,
185
                         size_t len, size_t iov_len, bool async_read,
186
                         bool use_direct_io, size_t alignment,
187
                         size_t& finished_len, FSReadRequest* req,
188
                         size_t& bytes_read, bool& read_again) {
189
  read_again = false;
190
  if (cqe->res < 0) {
191
    req->result = Slice(req->scratch, 0);
192
    req->status = IOError("Req failed", file_name, cqe->res);
193
  } else {
194
    bytes_read = static_cast<size_t>(cqe->res);
195
    TEST_SYNC_POINT_CALLBACK("UpdateResults::io_uring_result", &bytes_read);
196
    if (bytes_read == iov_len) {
197
      req->result = Slice(req->scratch, req->len);
198
      req->status = IOStatus::OK();
199
    } else if (bytes_read == 0) {
200
      /// cqe->res == 0 can means EOF, or can mean partial results. See
201
      // comment
202
      // https://github.com/facebook/rocksdb/pull/6441#issuecomment-589843435
203
      // Fall back to pread in this case.
204
      if (use_direct_io && !IsSectorAligned(finished_len, alignment)) {
205
        // Bytes reads don't fill sectors. Should only happen at the end
206
        // of the file.
207
        req->result = Slice(req->scratch, finished_len);
208
        req->status = IOStatus::OK();
209
      } else {
210
        if (async_read) {
211
          // No  bytes read. It can means EOF. In case of partial results, it's
212
          // caller responsibility to call read/readasync again.
213
          req->result = Slice(req->scratch, 0);
214
          req->status = IOStatus::OK();
215
        } else {
216
          read_again = true;
217
        }
218
      }
219
    } else if (bytes_read < iov_len) {
220
      assert(bytes_read > 0);
221
      if (async_read) {
222
        req->result = Slice(req->scratch, bytes_read);
223
        req->status = IOStatus::OK();
224
      } else {
225
        assert(bytes_read + finished_len < len);
226
        finished_len += bytes_read;
227
      }
228
    } else {
229
      req->result = Slice(req->scratch, 0);
230
      req->status = IOError("Req returned more bytes than requested", file_name,
231
                            cqe->res);
232
    }
233
  }
234
#ifdef NDEBUG
235
  (void)len;
236
#endif
237
}
238
239
// Finalize a completed async read request.
240
// Processes the CQE result, marks the handle as finished, and invokes the
241
// callback. This is shared between Poll and AbortIO (for non-aborted handles).
242
inline void FinalizeAsyncRead(struct io_uring* iu, struct io_uring_cqe* cqe,
243
                              Posix_IOHandle* posix_handle) {
244
  FSReadRequest req;
245
  req.scratch = posix_handle->scratch;
246
  req.offset = posix_handle->offset;
247
  req.len = posix_handle->len;
248
249
  size_t finished_len = 0;
250
  size_t bytes_read = 0;
251
  bool read_again = false;
252
  UpdateResult(cqe, "", req.len, posix_handle->iov.iov_len, true /*async_read*/,
253
               posix_handle->use_direct_io, posix_handle->alignment,
254
               finished_len, &req, bytes_read, read_again);
255
  posix_handle->is_finished = true;
256
  io_uring_cqe_seen(iu, cqe);
257
  posix_handle->cb(req, posix_handle->cb_arg);
258
}
259
#endif
260
261
#ifdef OS_LINUX
262
// Files under a specific directory have the same logical block size.
263
// This class caches the logical block size for the specified directories to
264
// save the CPU cost of computing the size.
265
// Safe for concurrent access from multiple threads without any external
266
// synchronization.
267
class LogicalBlockSizeCache {
268
 public:
269
  LogicalBlockSizeCache(
270
      std::function<size_t(int)> get_logical_block_size_of_fd =
271
          PosixHelper::GetLogicalBlockSizeOfFd,
272
      std::function<Status(const std::string&, size_t*)>
273
          get_logical_block_size_of_directory =
274
              PosixHelper::GetLogicalBlockSizeOfDirectory)
275
4
      : get_logical_block_size_of_fd_(get_logical_block_size_of_fd),
276
4
        get_logical_block_size_of_directory_(
277
4
            get_logical_block_size_of_directory) {}
278
279
  // Takes the following actions:
280
  // 1. Increases reference count of the directories;
281
  // 2. If the directory's logical block size is not cached,
282
  //    compute the buffer size and cache the result.
283
  Status RefAndCacheLogicalBlockSize(
284
      const std::vector<std::string>& directories);
285
286
  // Takes the following actions:
287
  // 1. Decreases reference count of the directories;
288
  // 2. If the reference count of a directory reaches 0, remove the directory
289
  //    from the cache.
290
  void UnrefAndTryRemoveCachedLogicalBlockSize(
291
      const std::vector<std::string>& directories);
292
293
  // Returns the logical block size for the file.
294
  //
295
  // If the file is under a cached directory, return the cached size.
296
  // Otherwise, the size is computed.
297
  size_t GetLogicalBlockSize(const std::string& fname, int fd);
298
299
0
  int GetRefCount(const std::string& dir) {
300
0
    ReadLock lock(&cache_mutex_);
301
0
    auto it = cache_.find(dir);
302
0
    if (it == cache_.end()) {
303
0
      return 0;
304
0
    }
305
0
    return it->second.ref;
306
0
  }
307
308
0
  size_t Size() const { return cache_.size(); }
309
310
0
  bool Contains(const std::string& dir) {
311
0
    ReadLock lock(&cache_mutex_);
312
0
    return cache_.find(dir) != cache_.end();
313
0
  }
314
315
 private:
316
  struct CacheValue {
317
49.7k
    CacheValue() : size(0), ref(0) {}
318
319
    // Logical block size of the directory.
320
    size_t size;
321
    // Reference count of the directory.
322
    int ref;
323
  };
324
325
  std::function<size_t(int)> get_logical_block_size_of_fd_;
326
  std::function<Status(const std::string&, size_t*)>
327
      get_logical_block_size_of_directory_;
328
329
  std::map<std::string, CacheValue> cache_;
330
  port::RWMutex cache_mutex_;
331
};
332
#endif
333
334
class PosixSequentialFile : public FSSequentialFile {
335
 private:
336
  std::string filename_;
337
  FILE* file_;
338
  int fd_;
339
  bool use_direct_io_;
340
  size_t logical_sector_size_;
341
342
 public:
343
  PosixSequentialFile(const std::string& fname, FILE* file, int fd,
344
                      size_t logical_block_size, const EnvOptions& options);
345
  virtual ~PosixSequentialFile();
346
347
  IOStatus Read(size_t n, const IOOptions& opts, Slice* result, char* scratch,
348
                IODebugContext* dbg) override;
349
  IOStatus PositionedRead(uint64_t offset, size_t n, const IOOptions& opts,
350
                          Slice* result, char* scratch,
351
                          IODebugContext* dbg) override;
352
  IOStatus Skip(uint64_t n) override;
353
  IOStatus InvalidateCache(size_t offset, size_t length) override;
354
539k
  bool use_direct_io() const override { return use_direct_io_; }
355
265k
  size_t GetRequiredBufferAlignment() const override {
356
265k
    return logical_sector_size_;
357
265k
  }
358
};
359
360
#if defined(ROCKSDB_IOURING_PRESENT)
361
// io_uring instance queue depth
362
// Used as the entry count for io_uring_queue_init() in CreateIOUring() and to
// size the SQE array range annotated by TsanAnnotateIOUringMemory().
const unsigned int kIoUringDepth = 256;
363
364
inline void TsanAnnotateIOUringMemory(struct io_uring* iu) {
365
  TsanAnnotateMappedMemory(iu->sq.ring_ptr, iu->sq.ring_sz);
366
  // CQ and SQ can share the same mmap region.
367
  if (iu->cq.ring_ptr != iu->sq.ring_ptr) {
368
    TsanAnnotateMappedMemory(iu->cq.ring_ptr, iu->cq.ring_sz);
369
  }
370
  TsanAnnotateMappedMemory(
371
      iu->sq.sqes, static_cast<size_t>(kIoUringDepth) * sizeof(io_uring_sqe));
372
}
373
374
inline void DeleteIOUring(void* p) {
375
  struct io_uring* iu = static_cast<struct io_uring*>(p);
376
  io_uring_queue_exit(iu);
377
  delete iu;
378
}
379
380
inline struct io_uring* CreateIOUring() {
381
  struct io_uring* new_io_uring = new struct io_uring;
382
  unsigned int flags = 0;
383
  flags |= IORING_SETUP_SINGLE_ISSUER;
384
  flags |= IORING_SETUP_DEFER_TASKRUN;
385
  int ret = io_uring_queue_init(kIoUringDepth, new_io_uring, flags);
386
  if (ret) {
387
    fprintf(stdout, "CreateIOUring failed: %s (errno=%d), thread=%lu\n",
388
            errnoStr(-ret).c_str(), -ret,
389
            static_cast<unsigned long>(pthread_self()));
390
    delete new_io_uring;
391
    new_io_uring = nullptr;
392
  } else {
393
    TsanAnnotateIOUringMemory(new_io_uring);
394
  }
395
  return new_io_uring;
396
}
397
#endif  // defined(ROCKSDB_IOURING_PRESENT)
398
399
class PosixRandomAccessFile : public FSRandomAccessFile {
400
 protected:
401
  std::string filename_;
402
  int fd_;
403
  bool use_direct_io_;
404
  size_t logical_sector_size_;
405
#if defined(ROCKSDB_IOURING_PRESENT)
406
  ThreadLocalPtr* thread_local_async_read_io_urings_;
407
  ThreadLocalPtr* thread_local_multi_read_io_urings_;
408
#endif
409
410
 public:
411
  PosixRandomAccessFile(const std::string& fname, int fd,
412
                        size_t logical_block_size, const EnvOptions& options
413
#if defined(ROCKSDB_IOURING_PRESENT)
414
                        ,
415
                        ThreadLocalPtr* thread_local_async_read_io_urings,
416
                        ThreadLocalPtr* thread_local_multi_read_io_urings
417
#endif
418
  );
419
  virtual ~PosixRandomAccessFile();
420
421
  IOStatus Read(uint64_t offset, size_t n, const IOOptions& opts, Slice* result,
422
                char* scratch, IODebugContext* dbg) const override;
423
424
  IOStatus MultiRead(FSReadRequest* reqs, size_t num_reqs,
425
                     const IOOptions& options, IODebugContext* dbg) override;
426
427
  IOStatus Prefetch(uint64_t offset, size_t n, const IOOptions& opts,
428
                    IODebugContext* dbg) override;
429
430
#if defined(OS_LINUX) || defined(OS_MACOSX) || defined(OS_AIX)
431
  size_t GetUniqueId(char* id, size_t max_size) const override;
432
#endif
433
  void Hint(AccessPattern pattern) override;
434
  IOStatus InvalidateCache(size_t offset, size_t length) override;
435
2.31M
  bool use_direct_io() const override { return use_direct_io_; }
436
493k
  size_t GetRequiredBufferAlignment() const override {
437
493k
    return logical_sector_size_;
438
493k
  }
439
440
  virtual IOStatus ReadAsync(FSReadRequest& req, const IOOptions& opts,
441
                             std::function<void(FSReadRequest&, void*)> cb,
442
                             void* cb_arg, void** io_handle,
443
                             IOHandleDeleter* del_fn,
444
                             IODebugContext* dbg) override;
445
446
  virtual IOStatus GetFileSize(uint64_t* result) override;
447
};
448
449
class PosixWritableFile : public FSWritableFile {
450
 protected:
451
  const std::string filename_;
452
  const bool use_direct_io_;
453
  int fd_;
454
  uint64_t filesize_;
455
  size_t logical_sector_size_;
456
#ifdef ROCKSDB_FALLOCATE_PRESENT
457
  bool allow_fallocate_;
458
  bool fallocate_with_keep_size_;
459
#endif
460
#ifdef ROCKSDB_RANGESYNC_PRESENT
461
  // Even if the syscall is present, the filesystem may still not properly
462
  // support it, so we need to do a dynamic check too.
463
  bool sync_file_range_supported_;
464
#endif  // ROCKSDB_RANGESYNC_PRESENT
465
466
 public:
467
  explicit PosixWritableFile(const std::string& fname, int fd,
468
                             size_t logical_block_size,
469
                             const EnvOptions& options,
470
                             uint64_t initial_file_size);
471
  virtual ~PosixWritableFile();
472
473
  // Need to implement this so the file is truncated correctly
474
  // with direct I/O
475
  IOStatus Truncate(uint64_t size, const IOOptions& opts,
476
                    IODebugContext* dbg) override;
477
  IOStatus Close(const IOOptions& opts, IODebugContext* dbg) override;
478
  IOStatus Append(const Slice& data, const IOOptions& opts,
479
                  IODebugContext* dbg) override;
480
  IOStatus Append(const Slice& data, const IOOptions& opts,
481
                  const DataVerificationInfo& /* verification_info */,
482
0
                  IODebugContext* dbg) override {
483
0
    return Append(data, opts, dbg);
484
0
  }
485
  IOStatus PositionedAppend(const Slice& data, uint64_t offset,
486
                            const IOOptions& opts,
487
                            IODebugContext* dbg) override;
488
  IOStatus PositionedAppend(const Slice& data, uint64_t offset,
489
                            const IOOptions& opts,
490
                            const DataVerificationInfo& /* verification_info */,
491
0
                            IODebugContext* dbg) override {
492
0
    return PositionedAppend(data, offset, opts, dbg);
493
0
  }
494
  IOStatus Flush(const IOOptions& opts, IODebugContext* dbg) override;
495
  IOStatus Sync(const IOOptions& opts, IODebugContext* dbg) override;
496
  IOStatus Fsync(const IOOptions& opts, IODebugContext* dbg) override;
497
  bool IsSyncThreadSafe() const override;
498
59.7M
  bool use_direct_io() const override { return use_direct_io_; }
499
  void SetWriteLifeTimeHint(Env::WriteLifeTimeHint hint) override;
500
  uint64_t GetFileSize(const IOOptions& opts, IODebugContext* dbg) override;
501
  IOStatus InvalidateCache(size_t offset, size_t length) override;
502
275k
  size_t GetRequiredBufferAlignment() const override {
503
275k
    return logical_sector_size_;
504
275k
  }
505
#ifdef ROCKSDB_FALLOCATE_PRESENT
506
  IOStatus Allocate(uint64_t offset, uint64_t len, const IOOptions& opts,
507
                    IODebugContext* dbg) override;
508
#endif
509
  IOStatus RangeSync(uint64_t offset, uint64_t nbytes, const IOOptions& opts,
510
                     IODebugContext* dbg) override;
511
#ifdef OS_LINUX
512
  size_t GetUniqueId(char* id, size_t max_size) const override;
513
#endif
514
};
515
516
// mmap() based random-access
517
class PosixMmapReadableFile : public FSRandomAccessFile {
518
 private:
519
  int fd_;
520
  std::string filename_;
  // NOTE(review): presumably the `base` / `length` constructor arguments are
  // stored in the next two members -- the constructor body is not visible
  // here, confirm in the .cc file.
521
  void* mmapped_region_;
522
  size_t length_;
523
524
 public:
525
  PosixMmapReadableFile(const int fd, const std::string& fname, void* base,
526
                        size_t length, const EnvOptions& options);
527
  virtual ~PosixMmapReadableFile();
528
  IOStatus Read(uint64_t offset, size_t n, const IOOptions& opts, Slice* result,
529
                char* scratch, IODebugContext* dbg) const override;
530
  void Hint(AccessPattern pattern) override;
531
  IOStatus InvalidateCache(size_t offset, size_t length) override;
532
  virtual IOStatus GetFileSize(uint64_t* result) override;
533
};
534
535
class PosixMmapFile : public FSWritableFile {
536
 private:
537
  std::string filename_;
538
  int fd_;
539
  size_t page_size_;
540
  size_t map_size_;       // How much extra memory to map at a time
541
  char* base_;            // The mapped region
542
  char* limit_;           // Limit of the mapped region
543
  char* dst_;             // Where to write next  (in range [base_,limit_])
544
  char* last_sync_;       // Where have we synced up to
545
  uint64_t file_offset_;  // Offset of base_ in file
546
#ifdef ROCKSDB_FALLOCATE_PRESENT
547
  bool allow_fallocate_;  // If false, fallocate calls are bypassed
548
  bool fallocate_with_keep_size_;
549
#endif
550
551
  // Roundup x to a multiple of y
552
0
  static size_t Roundup(size_t x, size_t y) { return ((x + y - 1) / y) * y; }
553
554
0
  size_t TruncateToPageBoundary(size_t s) {
555
0
    s -= (s & (page_size_ - 1));
556
0
    assert((s % page_size_) == 0);
557
0
    return s;
558
0
  }
559
560
  IOStatus MapNewRegion();
561
  IOStatus UnmapCurrentRegion();
562
  IOStatus Msync();
563
564
 public:
565
  PosixMmapFile(const std::string& fname, int fd, size_t page_size,
566
                const EnvOptions& options, uint64_t initial_file_size);
567
  ~PosixMmapFile();
568
569
  // Means Close() will properly take care of truncate
570
  // and it does not need any additional information
571
  IOStatus Truncate(uint64_t /*size*/, const IOOptions& /*opts*/,
572
0
                    IODebugContext* /*dbg*/) override {
573
0
    return IOStatus::OK();
574
0
  }
575
  IOStatus Close(const IOOptions& opts, IODebugContext* dbg) override;
576
  IOStatus Append(const Slice& data, const IOOptions& opts,
577
                  IODebugContext* dbg) override;
578
  IOStatus Append(const Slice& data, const IOOptions& opts,
579
                  const DataVerificationInfo& /* verification_info */,
580
0
                  IODebugContext* dbg) override {
581
0
    return Append(data, opts, dbg);
582
0
  }
583
  IOStatus Flush(const IOOptions& opts, IODebugContext* dbg) override;
584
  IOStatus Sync(const IOOptions& opts, IODebugContext* dbg) override;
585
  IOStatus Fsync(const IOOptions& opts, IODebugContext* dbg) override;
586
  uint64_t GetFileSize(const IOOptions& opts, IODebugContext* dbg) override;
587
  IOStatus InvalidateCache(size_t offset, size_t length) override;
588
#ifdef ROCKSDB_FALLOCATE_PRESENT
589
  IOStatus Allocate(uint64_t offset, uint64_t len, const IOOptions& opts,
590
                    IODebugContext* dbg) override;
591
#endif
592
};
593
594
// FSRandomRWFile implementation that keeps a file name and a file descriptor.
// All member functions are defined outside this header.
class PosixRandomRWFile : public FSRandomRWFile {
595
 public:
596
  explicit PosixRandomRWFile(const std::string& fname, int fd,
597
                             const EnvOptions& options);
598
  virtual ~PosixRandomRWFile();
599
600
  // Positional write and read at `offset`.
  IOStatus Write(uint64_t offset, const Slice& data, const IOOptions& opts,
601
                 IODebugContext* dbg) override;
602
603
  IOStatus Read(uint64_t offset, size_t n, const IOOptions& opts, Slice* result,
604
                char* scratch, IODebugContext* dbg) const override;
605
606
  IOStatus Flush(const IOOptions& opts, IODebugContext* dbg) override;
607
  IOStatus Sync(const IOOptions& opts, IODebugContext* dbg) override;
608
  IOStatus Fsync(const IOOptions& opts, IODebugContext* dbg) override;
609
  IOStatus Close(const IOOptions& opts, IODebugContext* dbg) override;
610
611
 private:
612
  const std::string filename_;
613
  int fd_;
614
};
615
616
// MemoryMappedFileBuffer subclass that adds its own destructor (defined
// outside this header).
struct PosixMemoryMappedFileBuffer : public MemoryMappedFileBuffer {
617
  // Forwards `_base` and `_length` unchanged to the base-class constructor.
  PosixMemoryMappedFileBuffer(void* _base, size_t _length)
618
0
      : MemoryMappedFileBuffer(_base, _length) {}
619
  virtual ~PosixMemoryMappedFileBuffer();
620
};
621
622
// FSDirectory implementation that keeps a directory file descriptor, the
// directory name and an `is_btrfs_` flag. All member functions are defined
// outside this header.
class PosixDirectory : public FSDirectory {
623
 public:
624
  explicit PosixDirectory(int fd, const std::string& directory_name);
625
  ~PosixDirectory();
626
  IOStatus Fsync(const IOOptions& opts, IODebugContext* dbg) override;
627
628
  IOStatus Close(const IOOptions& opts, IODebugContext* dbg) override;
629
630
  // Fsync variant that additionally receives DirFsyncOptions.
  IOStatus FsyncWithDirOptions(
631
      const IOOptions&, IODebugContext*,
632
      const DirFsyncOptions& dir_fsync_options) override;
633
634
 private:
635
  int fd_;
636
  // NOTE(review): presumably records whether the directory is on btrfs; how
  // it is set and used is not visible here -- confirm in the .cc file.
  bool is_btrfs_;
637
  const std::string directory_name_;
638
};
639
640
}  // namespace ROCKSDB_NAMESPACE