Coverage Report

Created: 2024-07-27 06:53

/src/rocksdb/env/io_posix.cc
Line
Count
Source (jump to first uncovered line)
1
//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
2
//  This source code is licensed under both the GPLv2 (found in the
3
//  COPYING file in the root directory) and Apache 2.0 License
4
//  (found in the LICENSE.Apache file in the root directory).
5
//
6
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7
// Use of this source code is governed by a BSD-style license that can be
8
// found in the LICENSE file. See the AUTHORS file for names of contributors.
9
10
#ifdef ROCKSDB_LIB_IO_POSIX
11
#include "env/io_posix.h"
12
13
#include <fcntl.h>
14
15
#include <algorithm>
16
#include <cerrno>
17
#if defined(OS_LINUX)
18
#include <linux/fs.h>
19
#ifndef FALLOC_FL_KEEP_SIZE
20
#include <linux/falloc.h>
21
#endif
22
#endif
23
#include <sys/ioctl.h>
24
#include <sys/mman.h>
25
#include <sys/stat.h>
26
#include <sys/types.h>
27
28
#include <cstdio>
29
#include <cstdlib>
30
#include <cstring>
31
#ifdef OS_LINUX
32
#include <sys/statfs.h>
33
#include <sys/sysmacros.h>
34
#endif
35
#include "monitoring/iostats_context_imp.h"
36
#include "port/port.h"
37
#include "port/stack_trace.h"
38
#include "rocksdb/slice.h"
39
#include "test_util/sync_point.h"
40
#include "util/autovector.h"
41
#include "util/coding.h"
42
#include "util/string_util.h"
43
44
#if defined(OS_LINUX) && !defined(F_SET_RW_HINT)
45
#define F_LINUX_SPECIFIC_BASE 1024
46
#define F_SET_RW_HINT (F_LINUX_SPECIFIC_BASE + 12)
47
#endif
48
49
namespace ROCKSDB_NAMESPACE {
50
51
// Builds the message prefix used for I/O error statuses: `context` on its own
// when `file_name` is empty, otherwise "<context>: <file_name>".
std::string IOErrorMsg(const std::string& context,
                       const std::string& file_name) {
  std::string message(context);
  if (!file_name.empty()) {
    message.append(": ").append(file_name);
  }
  return message;
}
58
59
// file_name can be left empty if it is not unkown.
60
IOStatus IOError(const std::string& context, const std::string& file_name,
61
0
                 int err_number) {
62
0
  switch (err_number) {
63
0
    case ENOSPC: {
64
0
      IOStatus s = IOStatus::NoSpace(IOErrorMsg(context, file_name),
65
0
                                     errnoStr(err_number).c_str());
66
0
      s.SetRetryable(true);
67
0
      return s;
68
0
    }
69
0
    case ESTALE:
70
0
      return IOStatus::IOError(IOStatus::kStaleFile);
71
0
    case ENOENT:
72
0
      return IOStatus::PathNotFound(IOErrorMsg(context, file_name),
73
0
                                    errnoStr(err_number).c_str());
74
0
    default:
75
0
      return IOStatus::IOError(IOErrorMsg(context, file_name),
76
0
                               errnoStr(err_number).c_str());
77
0
  }
78
0
}
79
80
// Portability shim for posix_fadvise(). When OS_LINUX is defined the call is
// forwarded and its result returned unchanged; otherwise the arguments are
// ignored and 0 is returned.
int Fadvise(int fd, off_t offset, size_t len, int advice) {
#if !defined(OS_LINUX)
  // No fadvise on this platform: swallow the arguments and do nothing.
  static_cast<void>(fd);
  static_cast<void>(offset);
  static_cast<void>(len);
  static_cast<void>(advice);
  return 0;
#else
  return posix_fadvise(fd, offset, len, advice);
#endif
}
93
94
// Portability shim for posix_madvise(). When OS_LINUX is defined the call is
// forwarded and its result returned unchanged; otherwise the arguments are
// ignored and 0 is returned.
int Madvise(void* addr, size_t len, int advice) {
#if !defined(OS_LINUX)
  // No madvise on this platform: swallow the arguments and do nothing.
  static_cast<void>(addr);
  static_cast<void>(len);
  static_cast<void>(advice);
  return 0;
#else
  return posix_madvise(addr, len, advice);
#endif
}
106
107
namespace {
108
109
// On MacOS (and probably *BSD), the posix write and pwrite calls do not support
110
// buffers larger than 2^31-1 bytes. These two wrappers fix this issue by
111
// cutting the buffer in 1GB chunks. We use this chunk size to be sure to keep
112
// the writes aligned.
113
114
1.51M
// Writes all `nbyte` bytes of `buf` to `fd` at the descriptor's current
// position. Each write() call is capped at 1GB, short writes are resumed where
// they stopped, and EINTR is retried. Returns false on any other write()
// failure, true once everything has been written.
bool PosixWrite(int fd, const char* buf, size_t nbyte) {
  const size_t kLimit1Gb = 1UL << 30;

  const char* cursor = buf;
  for (size_t remaining = nbyte; remaining != 0;) {
    const size_t chunk = remaining < kLimit1Gb ? remaining : kLimit1Gb;
    const ssize_t written = write(fd, cursor, chunk);
    if (written >= 0) {
      remaining -= written;
      cursor += written;
    } else if (errno != EINTR) {
      return false;
    }
    // written < 0 with EINTR: retry the same chunk.
  }
  return true;
}
135
136
0
// Writes all `nbyte` bytes of `buf` to `fd` starting at file offset `offset`
// using pwrite(). Each call is capped at 1GB, short writes are resumed at the
// advanced offset, and EINTR is retried. Returns false on any other pwrite()
// failure, true once everything has been written.
bool PosixPositionedWrite(int fd, const char* buf, size_t nbyte, off_t offset) {
  const size_t kLimit1Gb = 1UL << 30;

  const char* cursor = buf;
  for (size_t remaining = nbyte; remaining != 0;) {
    const size_t chunk = remaining < kLimit1Gb ? remaining : kLimit1Gb;
    const ssize_t written = pwrite(fd, cursor, chunk, offset);
    if (written >= 0) {
      remaining -= written;
      offset += written;
      cursor += written;
    } else if (errno != EINTR) {
      return false;
    }
    // written < 0 with EINTR: retry the same chunk at the same offset.
  }

  return true;
}
159
160
#ifdef ROCKSDB_RANGESYNC_PRESENT
161
162
#if !defined(ZFS_SUPER_MAGIC)
163
// The magic number for ZFS was not exposed until recently. It should be fixed
164
// forever so we can just copy the magic number here.
165
72.1k
#define ZFS_SUPER_MAGIC 0x2fc12fc1
166
#endif
167
168
72.1k
bool IsSyncFileRangeSupported(int fd) {
169
  // This function tracks and checks for cases where we know `sync_file_range`
170
  // definitely will not work properly despite passing the compile-time check
171
  // (`ROCKSDB_RANGESYNC_PRESENT`). If we are unsure, or if any of the checks
172
  // fail in unexpected ways, we allow `sync_file_range` to be used. This way
173
  // should minimize risk of impacting existing use cases.
174
72.1k
  struct statfs buf;
175
72.1k
  int ret = fstatfs(fd, &buf);
176
72.1k
  assert(ret == 0);
177
72.1k
  if (ret == 0 && buf.f_type == ZFS_SUPER_MAGIC) {
178
    // Testing on ZFS showed the writeback did not happen asynchronously when
179
    // `sync_file_range` was called, even though it returned success. Avoid it
180
    // and use `fdatasync` instead to preserve the contract of `bytes_per_sync`,
181
    // even though this'll incur extra I/O for metadata.
182
0
    return false;
183
0
  }
184
185
72.1k
  ret = sync_file_range(fd, 0 /* offset */, 0 /* nbytes */, 0 /* flags */);
186
72.1k
  assert(!(ret == -1 && errno != ENOSYS));
187
72.1k
  if (ret == -1 && errno == ENOSYS) {
188
    // `sync_file_range` is not implemented on all platforms even if
189
    // compile-time checks pass and a supported filesystem is in-use. For
190
    // example, using ext4 on WSL (Windows Subsystem for Linux),
191
    // `sync_file_range()` returns `ENOSYS`
192
    // ("Function not implemented").
193
0
    return false;
194
0
  }
195
  // None of the known cases matched, so allow `sync_file_range` use.
196
72.1k
  return true;
197
72.1k
}
198
199
#undef ZFS_SUPER_MAGIC
200
201
#endif  // ROCKSDB_RANGESYNC_PRESENT
202
203
}  // anonymous namespace
204
205
/*
206
 * PosixSequentialFile
207
 */
208
// Records the file name, the FILE* stream, the raw descriptor and the logical
// block size, and takes the direct-I/O flag from `options.use_direct_reads`.
PosixSequentialFile::PosixSequentialFile(const std::string& fname, FILE* file,
                                         int fd, size_t logical_block_size,
                                         const EnvOptions& options)
    : filename_(fname),
      file_(file),
      fd_(fd),
      use_direct_io_(options.use_direct_reads),
      logical_sector_size_(logical_block_size) {
  // Direct reads and mmap reads must not both be requested.
  assert(!(options.use_direct_reads && options.use_mmap_reads));
}
218
219
40.6k
// Releases the underlying handle: the raw descriptor in direct-I/O mode, the
// FILE* stream otherwise.
PosixSequentialFile::~PosixSequentialFile() {
  if (use_direct_io()) {
    assert(fd_);
    close(fd_);
    return;
  }
  assert(file_);
  fclose(file_);
}
228
229
// Buffered sequential read of up to `n` bytes into `scratch` (not valid in
// direct-I/O mode). `*result` always refers to the bytes actually read. A
// short read caused by end-of-file is OK; any other short read yields an
// IOError built from errno.
IOStatus PosixSequentialFile::Read(size_t n, const IOOptions& /*opts*/,
                                   Slice* result, char* scratch,
                                   IODebugContext* /*dbg*/) {
  assert(result != nullptr && !use_direct_io());

  size_t bytes_read = 0;
  for (;;) {
    clearerr(file_);
    bytes_read = fread_unlocked(scratch, 1, n, file_);
    // Retry only when nothing was read and the stream error is EINTR.
    if (bytes_read != 0 || !ferror(file_) || errno != EINTR) {
      break;
    }
  }
  *result = Slice(scratch, bytes_read);

  IOStatus s;
  if (bytes_read < n) {
    if (!feof(file_)) {
      // A partial read with an error: return a non-ok status
      s = IOError("While reading file sequentially", filename_, errno);
    } else {
      // Hitting the end of the file leaves the status ok. Clear the EOF
      // indicator so reads can continue if new data is written to the file.
      clearerr(file_);
    }
  }
  return s;
}
253
254
// Direct-I/O read of up to `n` bytes at `offset` into `scratch` via pread().
// `offset`, `n` and `scratch` must all be aligned to
// GetRequiredBufferAlignment() (asserted). EINTR is retried; the loop stops at
// EOF (pread() returning 0), on error, or after a read that does not fill
// whole sectors. On error `*result` is empty and an IOError built from errno
// is returned; otherwise `*result` covers the bytes read and the status is OK.
IOStatus PosixSequentialFile::PositionedRead(uint64_t offset, size_t n,
                                             const IOOptions& /*opts*/,
                                             Slice* result, char* scratch,
                                             IODebugContext* /*dbg*/) {
  assert(use_direct_io());
  assert(IsSectorAligned(offset, GetRequiredBufferAlignment()));
  assert(IsSectorAligned(n, GetRequiredBufferAlignment()));
  assert(IsSectorAligned(scratch, GetRequiredBufferAlignment()));

  IOStatus s;
  // Start at 0, not -1: a zero-length request never enters the loop and must
  // not be reported as a failure using whatever stale value errno holds.
  ssize_t r = 0;
  size_t left = n;
  char* ptr = scratch;
  while (left > 0) {
    r = pread(fd_, ptr, left, static_cast<off_t>(offset));
    if (r <= 0) {
      if (r == -1 && errno == EINTR) {
        continue;
      }
      break;
    }
    ptr += r;
    offset += r;
    left -= r;
    if (!IsSectorAligned(r, GetRequiredBufferAlignment())) {
      // Bytes reads don't fill sectors. Should only happen at the end
      // of the file.
      break;
    }
  }
  if (r < 0) {
    // An error: return a non-ok status
    s = IOError("While pread " + std::to_string(n) + " bytes from offset " +
                    std::to_string(offset),
                filename_, errno);
  }
  *result = Slice(scratch, (r < 0) ? 0 : n - left);
  return s;
}
293
294
0
// Advances the stream position by `n` bytes with fseek(SEEK_CUR). Returns OK
// on success, otherwise an IOError built from errno.
IOStatus PosixSequentialFile::Skip(uint64_t n) {
  const int seek_ret = fseek(file_, static_cast<long int>(n), SEEK_CUR);
  if (seek_ret == 0) {
    return IOStatus::OK();
  }
  return IOError("While fseek to skip " + std::to_string(n) + " bytes",
                 filename_, errno);
}
301
302
0
// Asks the OS to drop cached pages for [offset, offset + length) of this file.
// A no-op returning OK when OS_LINUX is not defined or in direct-I/O mode.
// Returns an IOError if the fadvise call reports failure.
IOStatus PosixSequentialFile::InvalidateCache(size_t offset, size_t length) {
#ifndef OS_LINUX
  (void)offset;
  (void)length;
  return IOStatus::OK();
#else
  if (!use_direct_io()) {
    // free OS pages
    const int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED);
    if (ret != 0) {
      // posix_fadvise() returns the error number directly and does not set
      // errno, so report `ret` rather than a possibly stale errno.
      return IOError("While fadvise NotNeeded offset " +
                         std::to_string(offset) + " len " +
                         std::to_string(length),
                     filename_, ret);
    }
  }
  return IOStatus::OK();
#endif
}
321
322
/*
323
 * PosixRandomAccessFile
324
 */
325
#if defined(OS_LINUX)
326
0
size_t PosixHelper::GetUniqueIdFromFile(int fd, char* id, size_t max_size) {
327
0
  if (max_size < kMaxVarint64Length * 3) {
328
0
    return 0;
329
0
  }
330
331
0
  struct stat buf;
332
0
  int result = fstat(fd, &buf);
333
0
  if (result == -1) {
334
0
    return 0;
335
0
  }
336
337
0
  long version = 0;
338
0
  result = ioctl(fd, FS_IOC_GETVERSION, &version);
339
0
  TEST_SYNC_POINT_CALLBACK("GetUniqueIdFromFile:FS_IOC_GETVERSION", &result);
340
0
  if (result == -1) {
341
0
    return 0;
342
0
  }
343
0
  uint64_t uversion = (uint64_t)version;
344
345
0
  char* rid = id;
346
0
  rid = EncodeVarint64(rid, buf.st_dev);
347
0
  rid = EncodeVarint64(rid, buf.st_ino);
348
0
  rid = EncodeVarint64(rid, uversion);
349
0
  assert(rid >= id);
350
0
  return static_cast<size_t>(rid - id);
351
0
}
352
#endif
353
354
#if defined(OS_MACOSX) || defined(OS_AIX)
355
size_t PosixHelper::GetUniqueIdFromFile(int fd, char* id, size_t max_size) {
356
  if (max_size < kMaxVarint64Length * 3) {
357
    return 0;
358
  }
359
360
  struct stat buf;
361
  int result = fstat(fd, &buf);
362
  if (result == -1) {
363
    return 0;
364
  }
365
366
  char* rid = id;
367
  rid = EncodeVarint64(rid, buf.st_dev);
368
  rid = EncodeVarint64(rid, buf.st_ino);
369
  rid = EncodeVarint64(rid, buf.st_gen);
370
  assert(rid >= id);
371
  return static_cast<size_t>(rid - id);
372
}
373
#endif
374
375
#ifdef OS_LINUX
376
18.0k
// Returns `path` without its final '/' when the path is longer than one
// character and ends in '/'. At most one slash is removed, and a lone "/" is
// returned unchanged.
std::string RemoveTrailingSlash(const std::string& path) {
  const bool has_trailing_slash = path.size() > 1 && path.back() == '/';
  if (!has_trailing_slash) {
    return path;
  }
  return path.substr(0, path.size() - 1);
}
383
384
// Takes one reference on the cache entry of every directory in `directories`
// (trailing slash stripped), creating entries as needed. Block sizes for
// directories not yet cached are looked up outside the write lock; if any
// lookup fails its status is returned and no reference is taken.
Status LogicalBlockSizeCache::RefAndCacheLogicalBlockSize(
    const std::vector<std::string>& directories) {
  std::vector<std::string> normalized;
  normalized.reserve(directories.size());
  for (const std::string& raw_dir : directories) {
    normalized.emplace_back(RemoveTrailingSlash(raw_dir));
  }

  // Directories that have no cache entry yet, mapped to the size to store.
  std::map<std::string, size_t> pending_sizes;
  {
    ReadLock read_guard(&cache_mutex_);
    for (const std::string& dir : normalized) {
      const bool cached = cache_.find(dir) != cache_.end();
      if (!cached) {
        pending_sizes.emplace(dir, 0);
      }
    }
  }

  Status s;
  for (auto& pending : pending_sizes) {
    s = get_logical_block_size_of_directory_(pending.first, &pending.second);
    if (!s.ok()) {
      return s;
    }
  }

  WriteLock write_guard(&cache_mutex_);
  for (const std::string& dir : normalized) {
    auto& entry = cache_[dir];
    entry.ref++;
    const auto looked_up = pending_sizes.find(dir);
    if (looked_up != pending_sizes.end()) {
      entry.size = looked_up->second;
    }
  }
  return s;
}
421
422
void LogicalBlockSizeCache::UnrefAndTryRemoveCachedLogicalBlockSize(
423
9.02k
    const std::vector<std::string>& directories) {
424
9.02k
  std::vector<std::string> dirs;
425
9.02k
  dirs.reserve(directories.size());
426
9.02k
  for (auto& dir : directories) {
427
9.02k
    dirs.emplace_back(RemoveTrailingSlash(dir));
428
9.02k
  }
429
430
9.02k
  WriteLock lock(&cache_mutex_);
431
9.02k
  for (const auto& dir : dirs) {
432
9.02k
    auto it = cache_.find(dir);
433
9.02k
    if (it != cache_.end() && !(--(it->second.ref))) {
434
9.02k
      cache_.erase(it);
435
9.02k
    }
436
9.02k
  }
437
9.02k
}
438
439
size_t LogicalBlockSizeCache::GetLogicalBlockSize(const std::string& fname,
440
0
                                                  int fd) {
441
0
  std::string dir = fname.substr(0, fname.find_last_of('/'));
442
0
  if (dir.empty()) {
443
0
    dir = "/";
444
0
  }
445
0
  {
446
0
    ReadLock lock(&cache_mutex_);
447
0
    auto it = cache_.find(dir);
448
0
    if (it != cache_.end()) {
449
0
      return it->second.size;
450
0
    }
451
0
  }
452
0
  return get_logical_block_size_of_fd_(fd);
453
0
}
454
#endif
455
456
// Opens `directory` read-only, stores the logical block size reported by
// GetLogicalBlockSizeOfFd() in `*size`, and closes it again. Returns an
// IOError if the directory cannot be opened.
Status PosixHelper::GetLogicalBlockSizeOfDirectory(const std::string& directory,
                                                   size_t* size) {
  const int dir_fd = open(directory.c_str(), O_DIRECTORY | O_RDONLY);
  if (dir_fd != -1) {
    *size = PosixHelper::GetLogicalBlockSizeOfFd(dir_fd);
    close(dir_fd);
    return Status::OK();
  }
  return Status::IOError("Cannot open directory " + directory);
}
466
467
9.02k
size_t PosixHelper::GetLogicalBlockSizeOfFd(int fd) {
468
9.02k
#ifdef OS_LINUX
469
9.02k
  struct stat buf;
470
9.02k
  int result = fstat(fd, &buf);
471
9.02k
  if (result == -1) {
472
0
    return kDefaultPageSize;
473
0
  }
474
9.02k
  if (major(buf.st_dev) == 0) {
475
    // Unnamed devices (e.g. non-device mounts), reserved as null device number.
476
    // These don't have an entry in /sys/dev/block/. Return a sensible default.
477
9.02k
    return kDefaultPageSize;
478
9.02k
  }
479
480
  // Reading queue/logical_block_size does not require special permissions.
481
0
  const int kBufferSize = 100;
482
0
  char path[kBufferSize];
483
0
  char real_path[PATH_MAX + 1];
484
0
  snprintf(path, kBufferSize, "/sys/dev/block/%u:%u", major(buf.st_dev),
485
0
           minor(buf.st_dev));
486
0
  if (realpath(path, real_path) == nullptr) {
487
0
    return kDefaultPageSize;
488
0
  }
489
0
  std::string device_dir(real_path);
490
0
  if (!device_dir.empty() && device_dir.back() == '/') {
491
0
    device_dir.pop_back();
492
0
  }
493
  // NOTE: sda3 and nvme0n1p1 do not have a `queue/` subdir, only the parent sda
494
  // and nvme0n1 have it.
495
  // $ ls -al '/sys/dev/block/8:3'
496
  // lrwxrwxrwx. 1 root root 0 Jun 26 01:38 /sys/dev/block/8:3 ->
497
  // ../../block/sda/sda3
498
  // $ ls -al '/sys/dev/block/259:4'
499
  // lrwxrwxrwx 1 root root 0 Jan 31 16:04 /sys/dev/block/259:4 ->
500
  // ../../devices/pci0000:17/0000:17:00.0/0000:18:00.0/nvme/nvme0/nvme0n1/nvme0n1p1
501
0
  size_t parent_end = device_dir.rfind('/', device_dir.length() - 1);
502
0
  if (parent_end == std::string::npos) {
503
0
    return kDefaultPageSize;
504
0
  }
505
0
  size_t parent_begin = device_dir.rfind('/', parent_end - 1);
506
0
  if (parent_begin == std::string::npos) {
507
0
    return kDefaultPageSize;
508
0
  }
509
0
  std::string parent =
510
0
      device_dir.substr(parent_begin + 1, parent_end - parent_begin - 1);
511
0
  std::string child = device_dir.substr(parent_end + 1, std::string::npos);
512
0
  if (parent != "block" &&
513
0
      (child.compare(0, 4, "nvme") || child.find('p') != std::string::npos)) {
514
0
    device_dir = device_dir.substr(0, parent_end);
515
0
  }
516
0
  std::string fname = device_dir + "/queue/logical_block_size";
517
0
  FILE* fp;
518
0
  size_t size = 0;
519
0
  fp = fopen(fname.c_str(), "r");
520
0
  if (fp != nullptr) {
521
0
    char* line = nullptr;
522
0
    size_t len = 0;
523
0
    if (getline(&line, &len, fp) != -1) {
524
0
      sscanf(line, "%zu", &size);
525
0
    }
526
0
    free(line);
527
0
    fclose(fp);
528
0
  }
529
0
  if (size != 0 && (size & (size - 1)) == 0) {
530
0
    return size;
531
0
  }
532
0
#endif
533
0
  (void)fd;
534
0
  return kDefaultPageSize;
535
0
}
536
537
/*
538
 * PosixRandomAccessFile
539
 *
540
 * pread() based random-access
541
 */
542
// Records the file name, descriptor and logical block size, and takes the
// direct-I/O flag from `options.use_direct_reads`. When io_uring support is
// compiled in, also keeps the thread-local io_uring holder.
PosixRandomAccessFile::PosixRandomAccessFile(
    const std::string& fname, int fd, size_t logical_block_size,
    const EnvOptions& options
#if defined(ROCKSDB_IOURING_PRESENT)
    ,
    ThreadLocalPtr* thread_local_io_urings
#endif
    )
    : filename_(fname),
      fd_(fd),
      use_direct_io_(options.use_direct_reads),
      logical_sector_size_(logical_block_size)
#if defined(ROCKSDB_IOURING_PRESENT)
      ,
      thread_local_io_urings_(thread_local_io_urings)
#endif
{
  // Direct reads and mmap reads must not both be requested ...
  assert(!(options.use_direct_reads && options.use_mmap_reads));
  // ... and this class is never used for mmap reads at all.
  assert(!options.use_mmap_reads);
}
562
563
13.5k
// Closes the file descriptor held by this object.
PosixRandomAccessFile::~PosixRandomAccessFile() {
  close(fd_);
}
564
565
// Reads up to `n` bytes at `offset` into `scratch` via pread(). In direct-I/O
// mode `offset`, `n` and `scratch` must be aligned to
// GetRequiredBufferAlignment() (asserted). EINTR is retried and short reads
// are continued; the loop stops at EOF (pread() returning 0), on error, or —
// in direct-I/O mode — after a read that is not a multiple of the alignment.
// On error `*result` is empty and an IOError built from errno is returned;
// otherwise `*result` covers the bytes read and the status is OK.
IOStatus PosixRandomAccessFile::Read(uint64_t offset, size_t n,
                                     const IOOptions& /*opts*/, Slice* result,
                                     char* scratch,
                                     IODebugContext* /*dbg*/) const {
  if (use_direct_io()) {
    assert(IsSectorAligned(offset, GetRequiredBufferAlignment()));
    assert(IsSectorAligned(n, GetRequiredBufferAlignment()));
    assert(IsSectorAligned(scratch, GetRequiredBufferAlignment()));
  }
  IOStatus s;
  // Start at 0, not -1: a zero-length request never enters the loop and must
  // not be reported as a failure using whatever stale value errno holds.
  ssize_t r = 0;
  size_t left = n;
  char* ptr = scratch;
  while (left > 0) {
    r = pread(fd_, ptr, left, static_cast<off_t>(offset));
    if (r <= 0) {
      if (r == -1 && errno == EINTR) {
        continue;
      }
      break;
    }
    ptr += r;
    offset += r;
    left -= r;
    if (use_direct_io() &&
        r % static_cast<ssize_t>(GetRequiredBufferAlignment()) != 0) {
      // Bytes reads don't fill sectors. Should only happen at the end
      // of the file.
      break;
    }
  }
  if (r < 0) {
    // An error: return a non-ok status
    s = IOError("While pread offset " + std::to_string(offset) + " len " +
                    std::to_string(n),
                filename_, errno);
  }
  *result = Slice(scratch, (r < 0) ? 0 : n - left);
  return s;
}
605
606
// Serves `num_reqs` read requests. Without io_uring (not compiled in, or no
// ring could be obtained for this thread) it delegates to
// FSRandomAccessFile::MultiRead(). With io_uring, requests are submitted in
// batches of at most kIoUringDepth; a request that completes short is queued
// again for the remainder in the next batch. In direct-I/O mode every
// request's offset, length and scratch buffer must be sector-aligned
// (asserted).
IOStatus PosixRandomAccessFile::MultiRead(FSReadRequest* reqs, size_t num_reqs,
                                          const IOOptions& options,
                                          IODebugContext* dbg) {
  if (use_direct_io()) {
    for (size_t i = 0; i < num_reqs; i++) {
      assert(IsSectorAligned(reqs[i].offset, GetRequiredBufferAlignment()));
      assert(IsSectorAligned(reqs[i].len, GetRequiredBufferAlignment()));
      assert(IsSectorAligned(reqs[i].scratch, GetRequiredBufferAlignment()));
    }
  }

#if defined(ROCKSDB_IOURING_PRESENT)
  // Fetch this thread's ring, creating and caching it on first use.
  struct io_uring* iu = nullptr;
  if (thread_local_io_urings_) {
    iu = static_cast<struct io_uring*>(thread_local_io_urings_->Get());
    if (iu == nullptr) {
      iu = CreateIOUring();
      if (iu != nullptr) {
        thread_local_io_urings_->Reset(iu);
      }
    }
  }

  // Init failed, platform doesn't support io_uring. Fall back to
  // serialized reads
  if (iu == nullptr) {
    return FSRandomAccessFile::MultiRead(reqs, num_reqs, options, dbg);
  }

  IOStatus ios = IOStatus::OK();

  // Per-request bookkeeping: the iovec handed to the kernel and how many bytes
  // of the request have completed so far.
  struct WrappedReadRequest {
    FSReadRequest* req;
    struct iovec iov;
    size_t finished_len;
    explicit WrappedReadRequest(FSReadRequest* r) : req(r), finished_len(0) {}
  };

  autovector<WrappedReadRequest, 32> req_wraps;
  autovector<WrappedReadRequest*, 4> incomplete_rq_list;
  // Requests submitted in the current batch; used to validate cqe user data.
  std::unordered_set<WrappedReadRequest*> wrap_cache;

  for (size_t i = 0; i < num_reqs; i++) {
    req_wraps.emplace_back(&reqs[i]);
  }

  size_t reqs_off = 0;
  while (num_reqs > reqs_off || !incomplete_rq_list.empty()) {
    size_t this_reqs = (num_reqs - reqs_off) + incomplete_rq_list.size();

    // If requests exceed depth, split it into batches
    if (this_reqs > kIoUringDepth) {
      this_reqs = kIoUringDepth;
    }

    assert(incomplete_rq_list.size() <= this_reqs);
    for (size_t i = 0; i < this_reqs; i++) {
      // Resubmit unfinished requests first, then take new ones.
      WrappedReadRequest* rep_to_submit;
      if (i < incomplete_rq_list.size()) {
        rep_to_submit = incomplete_rq_list[i];
      } else {
        rep_to_submit = &req_wraps[reqs_off++];
      }
      assert(rep_to_submit->req->len > rep_to_submit->finished_len);
      rep_to_submit->iov.iov_base =
          rep_to_submit->req->scratch + rep_to_submit->finished_len;
      rep_to_submit->iov.iov_len =
          rep_to_submit->req->len - rep_to_submit->finished_len;

      struct io_uring_sqe* sqe;
      sqe = io_uring_get_sqe(iu);
      io_uring_prep_readv(
          sqe, fd_, &rep_to_submit->iov, 1,
          rep_to_submit->req->offset + rep_to_submit->finished_len);
      io_uring_sqe_set_data(sqe, rep_to_submit);
      wrap_cache.emplace(rep_to_submit);
    }
    incomplete_rq_list.clear();

    ssize_t ret =
        io_uring_submit_and_wait(iu, static_cast<unsigned int>(this_reqs));
    TEST_SYNC_POINT_CALLBACK(
        "PosixRandomAccessFile::MultiRead:io_uring_submit_and_wait:return1",
        &ret);
    TEST_SYNC_POINT_CALLBACK(
        "PosixRandomAccessFile::MultiRead:io_uring_submit_and_wait:return2",
        iu);

    if (static_cast<size_t>(ret) != this_reqs) {
      fprintf(stderr, "ret = %ld this_reqs: %ld\n", (long)ret, (long)this_reqs);
      // If an error happens and we submitted fewer than expected, it is an
      // exceptional case and we don't retry here. We should still consume
      // what was submitted to the ring.
      for (ssize_t i = 0; i < ret; i++) {
        struct io_uring_cqe* cqe = nullptr;
        io_uring_wait_cqe(iu, &cqe);
        if (cqe != nullptr) {
          io_uring_cqe_seen(iu, cqe);
        }
      }
      return IOStatus::IOError("io_uring_submit_and_wait() requested " +
                               std::to_string(this_reqs) + " but returned " +
                               std::to_string(ret));
    }

    for (size_t i = 0; i < this_reqs; i++) {
      struct io_uring_cqe* cqe = nullptr;
      WrappedReadRequest* req_wrap;

      // We could use the peek variant here, but this seems safer in terms
      // of our initial wait not reaping all completions
      ret = io_uring_wait_cqe(iu, &cqe);
      TEST_SYNC_POINT_CALLBACK(
          "PosixRandomAccessFile::MultiRead:io_uring_wait_cqe:return", &ret);
      if (ret) {
        ios = IOStatus::IOError("io_uring_wait_cqe() returns " +
                                std::to_string(ret));

        if (cqe != nullptr) {
          io_uring_cqe_seen(iu, cqe);
        }
        continue;
      }

      req_wrap = static_cast<WrappedReadRequest*>(io_uring_cqe_get_data(cqe));
      // Reset cqe data to catch any stray reuse of it
      static_cast<struct io_uring_cqe*>(cqe)->user_data = 0xd5d5d5d5d5d5d5d5;
      // Check that we got a valid unique cqe data
      auto wrap_check = wrap_cache.find(req_wrap);
      if (wrap_check == wrap_cache.end()) {
        fprintf(stderr,
                "PosixRandomAccessFile::MultiRead: "
                "Bad cqe data from IO uring - %p\n",
                req_wrap);
        port::PrintStack();
        ios = IOStatus::IOError("io_uring_cqe_get_data() returned " +
                                std::to_string((uint64_t)req_wrap));
        // Mark the completion as consumed even though its data is unusable;
        // otherwise it stays at the head of the completion queue and is
        // returned again by the next io_uring_wait_cqe().
        io_uring_cqe_seen(iu, cqe);
        continue;
      }
      wrap_cache.erase(wrap_check);

      FSReadRequest* req = req_wrap->req;
      size_t bytes_read = 0;
      bool read_again = false;
      UpdateResult(cqe, filename_, req->len, req_wrap->iov.iov_len,
                   false /*async_read*/, use_direct_io(),
                   GetRequiredBufferAlignment(), req_wrap->finished_len, req,
                   bytes_read, read_again);
      int32_t res = cqe->res;
      if (res >= 0) {
        if (bytes_read == 0) {
          if (read_again) {
            // Finish the remainder of this request with a synchronous Read().
            Slice tmp_slice;
            req->status =
                Read(req->offset + req_wrap->finished_len,
                     req->len - req_wrap->finished_len, options, &tmp_slice,
                     req->scratch + req_wrap->finished_len, dbg);
            req->result =
                Slice(req->scratch, req_wrap->finished_len + tmp_slice.size());
          }
          // else It means EOF so no need to do anything.
        } else if (bytes_read < req_wrap->iov.iov_len) {
          // Short read: resubmit the rest in the next batch.
          incomplete_rq_list.push_back(req_wrap);
        }
      }
      io_uring_cqe_seen(iu, cqe);
    }
    wrap_cache.clear();
  }
  return ios;
#else
  return FSRandomAccessFile::MultiRead(reqs, num_reqs, options, dbg);
#endif
}
780
781
// Hints the OS to read [offset, offset + n) ahead of time: readahead() on
// Linux, fcntl(F_RDADVISE) on macOS. Nothing is done in direct-I/O mode or on
// other platforms. Returns an IOError built from errno if the hint call
// returns -1, OK otherwise.
IOStatus PosixRandomAccessFile::Prefetch(uint64_t offset, size_t n,
                                         const IOOptions& /*opts*/,
                                         IODebugContext* /*dbg*/) {
  IOStatus s;
  if (use_direct_io()) {
    return s;
  }

  ssize_t hint_ret = 0;
#ifdef OS_LINUX
  hint_ret = readahead(fd_, offset, n);
#endif
#ifdef OS_MACOSX
  radvisory advice;
  advice.ra_offset = static_cast<off_t>(offset);
  advice.ra_count = static_cast<int>(n);
  hint_ret = fcntl(fd_, F_RDADVISE, &advice);
#endif
  if (hint_ret == -1) {
    s = IOError("While prefetching offset " + std::to_string(offset) +
                    " len " + std::to_string(n),
                filename_, errno);
  }
  return s;
}
804
805
#if defined(OS_LINUX) || defined(OS_MACOSX) || defined(OS_AIX)
806
0
size_t PosixRandomAccessFile::GetUniqueId(char* id, size_t max_size) const {
807
0
  return PosixHelper::GetUniqueIdFromFile(fd_, id, max_size);
808
0
}
809
#endif
810
811
4.48k
// Passes the expected access pattern for the whole file to the OS through
// Fadvise(), whose return value is ignored. Does nothing in direct-I/O mode.
// An unrecognized pattern trips an assert and issues no advice.
void PosixRandomAccessFile::Hint(AccessPattern pattern) {
  if (use_direct_io()) {
    return;
  }

  int advice;
  switch (pattern) {
    case kNormal:
      advice = POSIX_FADV_NORMAL;
      break;
    case kRandom:
      advice = POSIX_FADV_RANDOM;
      break;
    case kSequential:
      advice = POSIX_FADV_SEQUENTIAL;
      break;
    case kWillNeed:
      advice = POSIX_FADV_WILLNEED;
      break;
    case kWontNeed:
      advice = POSIX_FADV_DONTNEED;
      break;
    default:
      assert(false);
      return;
  }
  // offset 0 / len 0 is what every pattern passes.
  Fadvise(fd_, 0, 0, advice);
}
836
837
0
// Asks the OS to drop cached pages for [offset, offset + length) of this file.
// A no-op returning OK in direct-I/O mode or when OS_LINUX is not defined.
// Returns an IOError if the fadvise call reports failure.
IOStatus PosixRandomAccessFile::InvalidateCache(size_t offset, size_t length) {
  if (use_direct_io()) {
    return IOStatus::OK();
  }
#ifndef OS_LINUX
  (void)offset;
  (void)length;
  return IOStatus::OK();
#else
  // free OS pages
  const int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED);
  if (ret == 0) {
    return IOStatus::OK();
  }
  // posix_fadvise() returns the error number directly and does not set errno,
  // so report `ret` rather than a possibly stale errno.
  return IOError("While fadvise NotNeeded offset " + std::to_string(offset) +
                     " len " + std::to_string(length),
                 filename_, ret);
#endif
}
856
857
// Submits `req` as a single asynchronous readv on this thread's io_uring.
// On the io_uring path `*io_handle` receives a newly allocated Posix_IOHandle
// (carrying `cb` and `cb_arg`) and `*del_fn` a deleter for it; both are set
// before submission, so they are also valid when an error is returned after
// that point. Returns NotSupported when io_uring is not compiled in or no ring
// could be obtained, IOError when no submission queue entry is available or
// the submit call fails, OK otherwise. In direct-I/O mode the request's
// offset, length and scratch buffer must be sector-aligned (asserted).
IOStatus PosixRandomAccessFile::ReadAsync(
    FSReadRequest& req, const IOOptions& /*opts*/,
    std::function<void(FSReadRequest&, void*)> cb, void* cb_arg,
    void** io_handle, IOHandleDeleter* del_fn, IODebugContext* /*dbg*/) {
  if (use_direct_io()) {
    assert(IsSectorAligned(req.offset, GetRequiredBufferAlignment()));
    assert(IsSectorAligned(req.len, GetRequiredBufferAlignment()));
    assert(IsSectorAligned(req.scratch, GetRequiredBufferAlignment()));
  }

#if defined(ROCKSDB_IOURING_PRESENT)
  // Fetch this thread's ring, creating and caching it on first use.
  struct io_uring* iu = nullptr;
  if (thread_local_io_urings_) {
    iu = static_cast<struct io_uring*>(thread_local_io_urings_->Get());
    if (iu == nullptr) {
      iu = CreateIOUring();
      if (iu != nullptr) {
        thread_local_io_urings_->Reset(iu);
      }
    }
  }

  // Init failed, platform doesn't support io_uring.
  if (iu == nullptr) {
    return IOStatus::NotSupported("ReadAsync");
  }

  // Deleter handed back to the caller together with the handle.
  IOHandleDeleter deletefn = [](void* args) -> void {
    delete (static_cast<Posix_IOHandle*>(args));
  };

  // Initialize Posix_IOHandle.
  Posix_IOHandle* posix_handle =
      new Posix_IOHandle(iu, cb, cb_arg, req.offset, req.len, req.scratch,
                         use_direct_io(), GetRequiredBufferAlignment());
  posix_handle->iov.iov_base = req.scratch;
  posix_handle->iov.iov_len = req.len;

  *io_handle = static_cast<void*>(posix_handle);
  *del_fn = deletefn;

  // Grab a submission queue entry. io_uring_get_sqe() returns NULL when the
  // submission queue is full; the handle is already owned by the caller via
  // *io_handle / *del_fn, so just report the failure.
  struct io_uring_sqe* sqe = io_uring_get_sqe(iu);
  if (sqe == nullptr) {
    fprintf(stderr, "io_uring_get_sqe error: submission queue is full\n");
    return IOStatus::IOError(
        "io_uring_get_sqe() returned no submission queue entry");
  }

  io_uring_prep_readv(sqe, fd_, /*sqe->addr=*/&posix_handle->iov,
                      /*sqe->len=*/1, /*sqe->offset=*/posix_handle->offset);

  // Sets sqe->user_data to posix_handle.
  io_uring_sqe_set_data(sqe, posix_handle);

  // Hand the entry to the kernel.
  ssize_t ret = io_uring_submit(iu);
  if (ret < 0) {
    fprintf(stderr, "io_uring_submit error: %ld\n", long(ret));
    return IOStatus::IOError("io_uring_submit() requested but returned " +
                             std::to_string(ret));
  }
  return IOStatus::OK();
#else
  (void)req;
  (void)cb;
  (void)cb_arg;
  (void)io_handle;
  (void)del_fn;
  return IOStatus::NotSupported("ReadAsync");
#endif
}
928
929
/*
930
 * PosixMmapReadableFile
931
 *
932
 * mmap() based random-access
933
 */
934
// base[0,length-1] contains the mmapped contents of the file.
935
PosixMmapReadableFile::PosixMmapReadableFile(const int fd,
                                             const std::string& fname,
                                             void* base, size_t length,
                                             const EnvOptions& options)
    : fd_(fd), filename_(fname), mmapped_region_(base), length_(length) {
  // No-op assignment whose only purpose is to count as a use of fd_ and
  // keep the compiler from warning about it.
  fd_ = fd_ + 0;
#ifdef NDEBUG
  // The asserts below vanish in release builds; mark the parameter as used.
  (void)options;
#else
  // This class serves mmap reads only, never direct I/O.
  assert(options.use_mmap_reads);
  assert(!options.use_direct_reads);
#endif
}
947
948
0
// Unmaps the region received at construction time and closes the file
// descriptor. Failures cannot be returned from a destructor, so a munmap
// failure is only reported as a diagnostic line.
PosixMmapReadableFile::~PosixMmapReadableFile() {
  if (munmap(mmapped_region_, length_) != 0) {
    // Diagnostics go to stderr so they never mix with regular program output
    // on stdout.
    fprintf(stderr, "failed to munmap %p length %" ROCKSDB_PRIszt " \n",
            mmapped_region_, length_);
  }
  close(fd_);
}
956
957
// Returns a Slice that points directly into the mapped region; the scratch
// buffer is never used.
//
// offset: position of the first byte to expose. An offset beyond length_
//         yields an EINVAL-based error and an empty result.
// n:      number of bytes requested. It is clamped so the Slice never
//         extends past length_.
IOStatus PosixMmapReadableFile::Read(uint64_t offset, size_t n,
                                     const IOOptions& /*opts*/, Slice* result,
                                     char* /*scratch*/,
                                     IODebugContext* /*dbg*/) const {
  if (offset > length_) {
    *result = Slice();
    return IOError("While mmap read offset " + std::to_string(offset) +
                       " larger than file length " + std::to_string(length_),
                   filename_, EINVAL);
  }
  // offset <= length_ holds here, so the subtraction cannot underflow.
  // Comparing n against the remaining byte count (instead of evaluating
  // offset + n) avoids unsigned wrap-around for very large n, which would
  // otherwise skip the clamp and produce an out-of-bounds Slice.
  const uint64_t remaining = length_ - offset;
  if (n > remaining) {
    n = static_cast<size_t>(remaining);
  }
  *result = Slice(static_cast<char*>(mmapped_region_) + offset, n);
  return IOStatus::OK();
}
973
974
0
// Translates the requested access pattern into the matching POSIX_MADV_*
// value and applies it to the whole mapped region. An unknown pattern trips
// an assert in debug builds and is otherwise ignored.
void PosixMmapReadableFile::Hint(AccessPattern pattern) {
  int advice;
  switch (pattern) {
    case kNormal:
      advice = POSIX_MADV_NORMAL;
      break;
    case kRandom:
      advice = POSIX_MADV_RANDOM;
      break;
    case kSequential:
      advice = POSIX_MADV_SEQUENTIAL;
      break;
    case kWillNeed:
      advice = POSIX_MADV_WILLNEED;
      break;
    case kWontNeed:
      advice = POSIX_MADV_DONTNEED;
      break;
    default:
      assert(false);
      return;
  }
  Madvise(mmapped_region_, length_, advice);
}
996
997
0
// Asks the OS to drop its cached pages for [offset, offset + length) of the
// underlying file. This is a no-op that reports success on non-Linux builds.
IOStatus PosixMmapReadableFile::InvalidateCache(size_t offset, size_t length) {
#ifndef OS_LINUX
  (void)offset;
  (void)length;
  return IOStatus::OK();
#else
  // free OS pages
  const int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED);
  if (ret == 0) {
    return IOStatus::OK();
  }
  // posix_fadvise() reports failure through its return value and leaves
  // errno untouched, so the returned code is what must be reported.
  // NOTE(review): assumes Fadvise() forwards posix_fadvise()'s return value
  // unchanged -- confirm against its definition.
  return IOError("While fadvise not needed. Offset " + std::to_string(offset) +
                     " len " + std::to_string(length),
                 filename_, ret);
#endif
}
1013
1014
/*
1015
 * PosixMmapFile
1016
 *
1017
 * We preallocate up to an extra megabyte and use memcpy to append new
1018
 * data to the file.  This is safe since we either properly close the
1019
 * file before reading from it, or for log files, the reading code
1020
 * knows enough to skip zero suffixes.
1021
 */
1022
0
// Releases the currently mapped window, if any. On success file_offset_ is
// advanced by the window size, all window pointers are reset, and the size
// used for the next mapping is doubled while it is still below 1MB.
IOStatus PosixMmapFile::UnmapCurrentRegion() {
  TEST_KILL_RANDOM("PosixMmapFile::UnmapCurrentRegion:0");
  if (base_ != nullptr) {
    if (munmap(base_, limit_ - base_) != 0) {
      // munmap() returns -1 and stores the cause in errno; the return value
      // itself is not an error number.
      return IOError("While munmap", filename_, errno);
    }
    file_offset_ += limit_ - base_;
    base_ = nullptr;
    limit_ = nullptr;
    last_sync_ = nullptr;
    dst_ = nullptr;

    // Increase the amount we map the next time, but capped at 1MB
    if (map_size_ < (1 << 20)) {
      map_size_ *= 2;
    }
  }
  return IOStatus::OK();
}
1042
1043
0
// Maps the next map_size_ bytes of the file, starting at file_offset_, and
// points base_/limit_/dst_/last_sync_ at the new window. When fallocate is
// allowed, the space is reserved first. Platforms without fallocate support
// get NotSupported.
IOStatus PosixMmapFile::MapNewRegion() {
#ifdef ROCKSDB_FALLOCATE_PRESENT
  assert(base_ == nullptr);
  TEST_KILL_RANDOM("PosixMmapFile::UnmapCurrentRegion:0");
  // we can't fallocate with FALLOC_FL_KEEP_SIZE here
  if (allow_fallocate_) {
    IOSTATS_TIMER_GUARD(allocate_nanos);
    int alloc_status = fallocate(fd_, 0, file_offset_, map_size_);
    if (alloc_status != 0) {
      // fallback to posix_fallocate
      alloc_status = posix_fallocate(fd_, file_offset_, map_size_);
    }
    if (alloc_status != 0) {
      // The separator before "Error" keeps the file name readable in the
      // resulting message.
      return IOStatus::IOError("Error allocating space to file : " + filename_ +
                               " Error : " + errnoStr(alloc_status).c_str());
    }
  }

  TEST_KILL_RANDOM("PosixMmapFile::Append:1");
  void* ptr = mmap(nullptr, map_size_, PROT_READ | PROT_WRITE, MAP_SHARED, fd_,
                   file_offset_);
  if (ptr == MAP_FAILED) {
    // Capture errno right away so the reason for the failure is reported
    // rather than lost.
    const int mmap_errno = errno;
    return IOStatus::IOError("MMap failed on " + filename_ + ": " +
                             errnoStr(mmap_errno).c_str());
  }
  TEST_KILL_RANDOM("PosixMmapFile::Append:2");

  base_ = static_cast<char*>(ptr);
  limit_ = base_ + map_size_;
  dst_ = base_;
  last_sync_ = base_;
  return IOStatus::OK();
#else
  return IOStatus::NotSupported("This platform doesn't support fallocate()");
#endif
}
1078
1079
0
// Synchronously writes back every page of the current window that holds
// bytes appended since the previous sync. Does nothing when no new bytes
// were appended.
IOStatus PosixMmapFile::Msync() {
  if (dst_ != last_sync_) {
    // Page-aligned offsets (relative to base_) of the pages holding the
    // first and the last byte that still need syncing.
    const size_t first_page = TruncateToPageBoundary(last_sync_ - base_);
    const size_t last_page = TruncateToPageBoundary(dst_ - base_ - 1);
    last_sync_ = dst_;
    TEST_KILL_RANDOM("PosixMmapFile::Msync:0");
    const size_t sync_len = last_page - first_page + page_size_;
    if (msync(base_ + first_page, sync_len, MS_SYNC) < 0) {
      return IOError("While msync", filename_, errno);
    }
  }
  return IOStatus::OK();
}
1094
1095
// Starts with no mapped window; the first window size is 64KB rounded up to
// a multiple of the page size.
PosixMmapFile::PosixMmapFile(const std::string& fname, int fd, size_t page_size,
                             const EnvOptions& options)
    : filename_(fname),
      fd_(fd),
      page_size_(page_size),
      map_size_(Roundup(65536, page_size)),
      base_(nullptr),
      limit_(nullptr),
      dst_(nullptr),
      last_sync_(nullptr),
      file_offset_(0) {
  // The page size has to be a power of two, and the options must request
  // mmap writes without direct I/O.
  assert((page_size & (page_size - 1)) == 0);
  assert(options.use_mmap_writes);
  assert(!options.use_direct_writes);
#ifdef ROCKSDB_FALLOCATE_PRESENT
  allow_fallocate_ = options.allow_fallocate;
  fallocate_with_keep_size_ = options.fallocate_with_keep_size;
#else
  (void)options;
#endif
}
1116
1117
0
// Closes the file if it is still open; a close failure cannot be reported
// from here and is deliberately discarded.
PosixMmapFile::~PosixMmapFile() {
  if (fd_ < 0) {
    return;
  }
  IOStatus close_status = PosixMmapFile::Close(IOOptions(), nullptr);
  close_status.PermitUncheckedError();
}
1123
1124
// Copies `data` into the mapped window, replacing the window with a fresh
// one whenever the current one is full. Returns the first unmap/map error.
IOStatus PosixMmapFile::Append(const Slice& data, const IOOptions& /*opts*/,
                               IODebugContext* /*dbg*/) {
  const char* cursor = data.data();
  for (size_t remaining = data.size(); remaining > 0;) {
    assert(base_ <= dst_);
    assert(dst_ <= limit_);
    const size_t room = limit_ - dst_;
    if (room == 0) {
      // Window exhausted (or never mapped): swap it for a new one. The copy
      // below is then zero bytes long and the next iteration sees the new
      // window's capacity.
      IOStatus s = UnmapCurrentRegion();
      if (!s.ok()) {
        return s;
      }
      s = MapNewRegion();
      if (!s.ok()) {
        return s;
      }
      TEST_KILL_RANDOM("PosixMmapFile::Append:0");
    }

    const size_t chunk = (room < remaining) ? room : remaining;
    assert(dst_);
    memcpy(dst_, cursor, chunk);
    dst_ += chunk;
    cursor += chunk;
    remaining -= chunk;
  }
  return IOStatus::OK();
}
1153
1154
// Unmaps the active window, trims the unused bytes at the end of the file,
// and closes the descriptor. The first error encountered is returned; fd_
// and the window pointers are cleared regardless.
IOStatus PosixMmapFile::Close(const IOOptions& /*opts*/,
                              IODebugContext* /*dbg*/) {
  // Has to be measured before unmapping, which resets limit_ and dst_.
  const size_t tail_slack = limit_ - dst_;

  IOStatus status = UnmapCurrentRegion();
  if (!status.ok()) {
    status = IOError("While closing mmapped file", filename_, errno);
  } else if (tail_slack > 0) {
    // Trim the extra space at the end of the file
    if (ftruncate(fd_, file_offset_ - tail_slack) < 0) {
      status = IOError("While ftruncating mmaped file", filename_, errno);
    }
  }

  if (close(fd_) < 0 && status.ok()) {
    status = IOError("While closing mmapped file", filename_, errno);
  }

  fd_ = -1;
  base_ = nullptr;
  limit_ = nullptr;
  return status;
}
1180
1181
// Nothing to flush: this implementation performs no work here and always
// reports success.
IOStatus PosixMmapFile::Flush(const IOOptions& /*opts*/,
                              IODebugContext* /*dbg*/) {
  return IOStatus::OK();
}
1185
1186
// Syncs the file descriptor (F_FULLFSYNC when available, fdatasync
// otherwise) and then msyncs the not-yet-synced part of the mapped window.
IOStatus PosixMmapFile::Sync(const IOOptions& /*opts*/,
                             IODebugContext* /*dbg*/) {
#ifdef HAVE_FULLFSYNC
  const int rc = ::fcntl(fd_, F_FULLFSYNC);
  if (rc < 0) {
    return IOError("while fcntl(F_FULLSYNC) mmapped file", filename_, errno);
  }
#else   // HAVE_FULLFSYNC
  const int rc = fdatasync(fd_);
  if (rc < 0) {
    return IOError("While fdatasync mmapped file", filename_, errno);
  }
#endif  // HAVE_FULLFSYNC

  return Msync();
}
1200
1201
/**
1202
 * Flush data as well as metadata to stable storage.
1203
 */
1204
// Same as Sync(), except that fsync() is used instead of fdatasync() on
// platforms without F_FULLFSYNC.
IOStatus PosixMmapFile::Fsync(const IOOptions& /*opts*/,
                              IODebugContext* /*dbg*/) {
#ifdef HAVE_FULLFSYNC
  const int rc = ::fcntl(fd_, F_FULLFSYNC);
  if (rc < 0) {
    return IOError("While fcntl(F_FULLSYNC) on mmaped file", filename_, errno);
  }
#else   // HAVE_FULLFSYNC
  const int rc = fsync(fd_);
  if (rc < 0) {
    return IOError("While fsync mmaped file", filename_, errno);
  }
#endif  // HAVE_FULLFSYNC

  return Msync();
}
1218
1219
/**
1220
 * Get the size of valid data in the file. This will not match the
1221
 * size that is returned from the filesystem because we use mmap
1222
 * to extend file by map_size every time.
1223
 */
1224
uint64_t PosixMmapFile::GetFileSize(const IOOptions& /*opts*/,
1225
0
                                    IODebugContext* /*dbg*/) {
1226
0
  size_t used = dst_ - base_;
1227
0
  return file_offset_ + used;
1228
0
}
1229
1230
0
// Asks the OS to drop its cached pages for [offset, offset + length) of the
// file. This is a no-op that reports success on non-Linux builds.
IOStatus PosixMmapFile::InvalidateCache(size_t offset, size_t length) {
#ifndef OS_LINUX
  (void)offset;
  (void)length;
  return IOStatus::OK();
#else
  // free OS pages
  const int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED);
  if (ret == 0) {
    return IOStatus::OK();
  }
  // posix_fadvise() reports failure through its return value and leaves
  // errno untouched, so the returned code is what must be reported.
  // NOTE(review): assumes Fadvise() forwards posix_fadvise()'s return value
  // unchanged -- confirm against its definition.
  return IOError("While fadvise NotNeeded mmapped file", filename_, ret);
#endif
}
1244
1245
#ifdef ROCKSDB_FALLOCATE_PRESENT
1246
// Reserves `len` bytes starting at `offset` with fallocate(), keeping the
// file size unchanged when fallocate_with_keep_size_ is set. Succeeds
// without doing anything when fallocate is disabled.
IOStatus PosixMmapFile::Allocate(uint64_t offset, uint64_t len,
                                 const IOOptions& /*opts*/,
                                 IODebugContext* /*dbg*/) {
  assert(offset <= static_cast<uint64_t>(std::numeric_limits<off_t>::max()));
  assert(len <= static_cast<uint64_t>(std::numeric_limits<off_t>::max()));
  TEST_KILL_RANDOM("PosixMmapFile::Allocate:0");
  if (!allow_fallocate_) {
    return IOStatus::OK();
  }
  const int mode = fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0;
  if (fallocate(fd_, mode, static_cast<off_t>(offset),
                static_cast<off_t>(len)) == 0) {
    return IOStatus::OK();
  }
  return IOError("While fallocate offset " + std::to_string(offset) +
                     " len " + std::to_string(len),
                 filename_, errno);
}
1266
#endif
1267
1268
/*
1269
 * PosixWritableFile
1270
 *
1271
 * Use posix write to write data to a file.
1272
 */
1273
// Wraps an already-open descriptor for write()-based appends. The file is
// treated as empty (filesize_ == 0) at construction.
PosixWritableFile::PosixWritableFile(const std::string& fname, int fd,
                                     size_t logical_block_size,
                                     const EnvOptions& options)
    : FSWritableFile(options),
      filename_(fname),
      use_direct_io_(options.use_direct_writes),
      fd_(fd),
      filesize_(0),
      logical_sector_size_(logical_block_size) {
  // mmap writes are not handled by this class.
  assert(!options.use_mmap_writes);
#ifdef ROCKSDB_RANGESYNC_PRESENT
  sync_file_range_supported_ = IsSyncFileRangeSupported(fd_);
#endif  // ROCKSDB_RANGESYNC_PRESENT
#ifdef ROCKSDB_FALLOCATE_PRESENT
  allow_fallocate_ = options.allow_fallocate;
  fallocate_with_keep_size_ = options.fallocate_with_keep_size;
#endif
}
1291
1292
72.1k
// Closes the file if the owner has not done so; the resulting status is
// intentionally dropped.
PosixWritableFile::~PosixWritableFile() {
  if (fd_ < 0) {
    return;
  }
  IOStatus close_status = PosixWritableFile::Close(IOOptions(), nullptr);
  close_status.PermitUncheckedError();
}
1298
1299
// Writes `data` at the current file position and grows filesize_ by the
// number of bytes written. With direct I/O both the buffer address and its
// size are asserted to be sector aligned.
IOStatus PosixWritableFile::Append(const Slice& data, const IOOptions& /*opts*/,
                                   IODebugContext* /*dbg*/) {
  if (use_direct_io()) {
    assert(IsSectorAligned(data.size(), GetRequiredBufferAlignment()));
    assert(IsSectorAligned(data.data(), GetRequiredBufferAlignment()));
  }
  const size_t byte_count = data.size();
  if (PosixWrite(fd_, data.data(), byte_count)) {
    filesize_ += byte_count;
    return IOStatus::OK();
  }
  return IOError("While appending to file", filename_, errno);
}
1315
1316
// Writes `data` at the absolute position `offset` and then records
// offset + data.size() as the file size. With direct I/O the offset, the
// buffer address and its size are asserted to be sector aligned.
IOStatus PosixWritableFile::PositionedAppend(const Slice& data, uint64_t offset,
                                             const IOOptions& /*opts*/,
                                             IODebugContext* /*dbg*/) {
  if (use_direct_io()) {
    assert(IsSectorAligned(offset, GetRequiredBufferAlignment()));
    assert(IsSectorAligned(data.size(), GetRequiredBufferAlignment()));
    assert(IsSectorAligned(data.data(), GetRequiredBufferAlignment()));
  }
  assert(offset <= static_cast<uint64_t>(std::numeric_limits<off_t>::max()));
  const size_t byte_count = data.size();
  if (PosixPositionedWrite(fd_, data.data(), byte_count,
                           static_cast<off_t>(offset))) {
    filesize_ = offset + byte_count;
    return IOStatus::OK();
  }
  return IOError("While pwrite to file at offset " + std::to_string(offset),
                 filename_, errno);
}
1334
1335
// Resizes the file to `size` bytes with ftruncate() and, on success, records
// the new size in filesize_.
IOStatus PosixWritableFile::Truncate(uint64_t size, const IOOptions& /*opts*/,
                                     IODebugContext* /*dbg*/) {
  if (ftruncate(fd_, size) < 0) {
    return IOError("While ftruncate file to size " + std::to_string(size),
                   filename_, errno);
  }
  filesize_ = size;
  return IOStatus::OK();
}
1347
1348
// Gives back any preallocated-but-unwritten space and closes the
// descriptor. Only a close() failure is surfaced to the caller; trimming is
// best effort. fd_ is always reset to -1.
IOStatus PosixWritableFile::Close(const IOOptions& /*opts*/,
                                  IODebugContext* /*dbg*/) {
  size_t block_size;
  size_t last_allocated_block;
  GetPreallocationStatus(&block_size, &last_allocated_block);
  TEST_SYNC_POINT_CALLBACK("PosixWritableFile::Close", &last_allocated_block);
  if (last_allocated_block > 0) {
    // trim the extra space preallocated at the end of the file
    // NOTE(ljin): we probably don't want to surface failure as an IOError,
    // but it will be nice to log these errors.
    int dummy __attribute__((__unused__));
    dummy = ftruncate(fd_, filesize_);
#if defined(ROCKSDB_FALLOCATE_PRESENT) && defined(FALLOC_FL_PUNCH_HOLE)
    // in some file systems, ftruncate only trims trailing space if the
    // new file size is smaller than the current size. Calling fallocate
    // with FALLOC_FL_PUNCH_HOLE flag to explicitly release these unused
    // blocks. FALLOC_FL_PUNCH_HOLE is supported on at least the following
    // filesystems:
    //   XFS (since Linux 2.6.38)
    //   ext4 (since Linux 3.0)
    //   Btrfs (since Linux 3.7)
    //   tmpfs (since Linux 3.5)
    // We ignore error since failure of this operation does not affect
    // correctness.
    struct stat file_stats;
    if (fstat(fd_, &file_stats) == 0) {
      // After ftruncate, compare the number of blocks the size calls for
      // with the number actually allocated. A mismatch means ftruncate did
      // not release the tail, so punch a hole instead.
      const auto blocks_needed =
          (file_stats.st_size + file_stats.st_blksize - 1) /
          file_stats.st_blksize;
      const auto blocks_held =
          file_stats.st_blocks / (file_stats.st_blksize / 512);
      if (blocks_needed != blocks_held) {
        IOSTATS_TIMER_GUARD(allocate_nanos);
        if (allow_fallocate_) {
          fallocate(fd_, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, filesize_,
                    block_size * last_allocated_block - filesize_);
        }
      }
    }
#endif
  }

  IOStatus status;
  if (close(fd_) < 0) {
    status = IOError("While closing file after writing", filename_, errno);
  }
  fd_ = -1;
  return status;
}
1397
1398
// write out the cached data to the OS cache
1399
// This implementation performs no work on Flush and always reports success.
IOStatus PosixWritableFile::Flush(const IOOptions& /*opts*/,
                                  IODebugContext* /*dbg*/) {
  return IOStatus::OK();
}
1403
1404
// Syncs the file with F_FULLFSYNC when it is available, and with
// fdatasync() otherwise.
IOStatus PosixWritableFile::Sync(const IOOptions& /*opts*/,
                                 IODebugContext* /*dbg*/) {
#ifdef HAVE_FULLFSYNC
  const int rc = ::fcntl(fd_, F_FULLFSYNC);
  if (rc < 0) {
    return IOError("while fcntl(F_FULLFSYNC)", filename_, errno);
  }
#else   // HAVE_FULLFSYNC
  const int rc = fdatasync(fd_);
  if (rc < 0) {
    return IOError("While fdatasync", filename_, errno);
  }
#endif  // HAVE_FULLFSYNC
  return IOStatus::OK();
}
1417
1418
// Syncs the file with F_FULLFSYNC when it is available, and with fsync()
// otherwise.
IOStatus PosixWritableFile::Fsync(const IOOptions& /*opts*/,
                                  IODebugContext* /*dbg*/) {
#ifdef HAVE_FULLFSYNC
  const int rc = ::fcntl(fd_, F_FULLFSYNC);
  if (rc < 0) {
    return IOError("while fcntl(F_FULLFSYNC)", filename_, errno);
  }
#else   // HAVE_FULLFSYNC
  const int rc = fsync(fd_);
  if (rc < 0) {
    return IOError("While fsync", filename_, errno);
  }
#endif  // HAVE_FULLFSYNC
  return IOStatus::OK();
}
1431
1432
0
// Always answers true for this file type.
bool PosixWritableFile::IsSyncThreadSafe() const {
  return true;
}
1433
1434
uint64_t PosixWritableFile::GetFileSize(const IOOptions& /*opts*/,
1435
1.49M
                                        IODebugContext* /*dbg*/) {
1436
1.49M
  return filesize_;
1437
1.49M
}
1438
1439
13.5k
// Passes the write-lifetime hint to the kernel via fcntl(F_SET_RW_HINT) and
// remembers it in write_hint_ when the call succeeds. Skipped entirely when
// the hint is unchanged, on non-Linux builds, and under Valgrind.
void PosixWritableFile::SetWriteLifeTimeHint(Env::WriteLifeTimeHint hint) {
// The Valgrind exclusion suppresses its "Unimplemented functionality" error.
#if defined(OS_LINUX) && !defined(ROCKSDB_VALGRIND_RUN)
  if (hint == write_hint_) {
    return;
  }
  // fcntl() is handed a pointer to a 64-bit copy of the hint.
  uint64_t kernel_hint = hint;
  if (fcntl(fd_, F_SET_RW_HINT, &kernel_hint) == 0) {
    write_hint_ = hint;
  }
#else
  (void)hint;
#endif  // OS_LINUX && !ROCKSDB_VALGRIND_RUN
}
1458
1459
0
// Asks the OS to drop its cached pages for [offset, offset + length) of the
// file. Returns success without doing anything for direct I/O files and on
// non-Linux builds.
IOStatus PosixWritableFile::InvalidateCache(size_t offset, size_t length) {
  if (use_direct_io()) {
    return IOStatus::OK();
  }
#ifndef OS_LINUX
  (void)offset;
  (void)length;
  return IOStatus::OK();
#else
  // free OS pages
  const int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED);
  if (ret == 0) {
    return IOStatus::OK();
  }
  // posix_fadvise() reports failure through its return value and leaves
  // errno untouched, so the returned code is what must be reported.
  // NOTE(review): assumes Fadvise() forwards posix_fadvise()'s return value
  // unchanged -- confirm against its definition.
  return IOError("While fadvise NotNeeded", filename_, ret);
#endif
}
1476
1477
#ifdef ROCKSDB_FALLOCATE_PRESENT
1478
// Reserves `len` bytes starting at `offset` with fallocate(), keeping the
// file size unchanged when fallocate_with_keep_size_ is set. Succeeds
// without doing anything when fallocate is disabled. Time spent here is
// charged to the allocate_nanos I/O stat.
IOStatus PosixWritableFile::Allocate(uint64_t offset, uint64_t len,
                                     const IOOptions& /*opts*/,
                                     IODebugContext* /*dbg*/) {
  assert(offset <= static_cast<uint64_t>(std::numeric_limits<off_t>::max()));
  assert(len <= static_cast<uint64_t>(std::numeric_limits<off_t>::max()));
  TEST_KILL_RANDOM("PosixWritableFile::Allocate:0");
  IOSTATS_TIMER_GUARD(allocate_nanos);
  if (!allow_fallocate_) {
    return IOStatus::OK();
  }
  const int mode = fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0;
  if (fallocate(fd_, mode, static_cast<off_t>(offset),
                static_cast<off_t>(len)) == 0) {
    return IOStatus::OK();
  }
  return IOError("While fallocate offset " + std::to_string(offset) +
                     " len " + std::to_string(len),
                 filename_, errno);
}
1499
#endif
1500
1501
// Starts writeback for a byte range through sync_file_range() when the
// platform and the file support it; otherwise defers to the base class.
IOStatus PosixWritableFile::RangeSync(uint64_t offset, uint64_t nbytes,
                                      const IOOptions& opts,
                                      IODebugContext* dbg) {
#ifdef ROCKSDB_RANGESYNC_PRESENT
  assert(offset <= static_cast<uint64_t>(std::numeric_limits<off_t>::max()));
  assert(nbytes <= static_cast<uint64_t>(std::numeric_limits<off_t>::max()));
  if (sync_file_range_supported_) {
    off_t range_start = static_cast<off_t>(offset);
    off_t range_len = static_cast<off_t>(nbytes);
    unsigned int flags = SYNC_FILE_RANGE_WRITE;
    if (strict_bytes_per_sync_) {
      // Specifying `SYNC_FILE_RANGE_WAIT_BEFORE` together with an offset/length
      // that spans all bytes written so far tells `sync_file_range` to wait for
      // any outstanding writeback requests to finish before issuing a new one.
      range_start = 0;
      range_len = static_cast<off_t>(offset + nbytes);
      flags = SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE;
    }
    const int ret = sync_file_range(fd_, range_start, range_len, flags);
    if (ret != 0) {
      return IOError("While sync_file_range returned " + std::to_string(ret),
                     filename_, errno);
    }
    return IOStatus::OK();
  }
#endif  // ROCKSDB_RANGESYNC_PRESENT
  return FSWritableFile::RangeSync(offset, nbytes, opts, dbg);
}
1529
1530
#ifdef OS_LINUX
1531
0
size_t PosixWritableFile::GetUniqueId(char* id, size_t max_size) const {
1532
0
  return PosixHelper::GetUniqueIdFromFile(fd_, id, max_size);
1533
0
}
1534
#endif
1535
1536
/*
1537
 * PosixRandomRWFile
1538
 */
1539
1540
// Stores the file name and the already-open descriptor; the options are
// not used.
PosixRandomRWFile::PosixRandomRWFile(const std::string& fname, int fd,
                                     const EnvOptions& /*options*/)
    : filename_(fname), fd_(fd) {
}
1543
1544
0
// Closes the file if it is still open and discards the resulting status.
PosixRandomRWFile::~PosixRandomRWFile() {
  if (fd_ < 0) {
    return;
  }
  IOStatus close_status = Close(IOOptions(), nullptr);
  close_status.PermitUncheckedError();
}
1550
1551
// Writes the whole of `data` at absolute position `offset`.
IOStatus PosixRandomRWFile::Write(uint64_t offset, const Slice& data,
                                  const IOOptions& /*opts*/,
                                  IODebugContext* /*dbg*/) {
  const bool written = PosixPositionedWrite(fd_, data.data(), data.size(),
                                            static_cast<off_t>(offset));
  if (written) {
    return IOStatus::OK();
  }
  return IOError("While write random read/write file at offset " +
                     std::to_string(offset),
                 filename_, errno);
}
1564
1565
// Reads up to `n` bytes starting at `offset` into `scratch`, retrying after
// EINTR and after short reads, and stopping early at end of file. `result`
// refers to the bytes actually read.
IOStatus PosixRandomRWFile::Read(uint64_t offset, size_t n,
                                 const IOOptions& /*opts*/, Slice* result,
                                 char* scratch, IODebugContext* /*dbg*/) const {
  char* cursor = scratch;
  size_t remaining = n;
  while (remaining > 0) {
    const ssize_t got = pread(fd_, cursor, remaining, offset);
    if (got == 0) {
      // Nothing more to read
      break;
    }
    if (got < 0) {
      if (errno == EINTR) {
        // read was interrupted, try again.
        continue;
      }
      // error while reading from file
      return IOError("While reading random read/write file offset " +
                         std::to_string(offset) + " len " + std::to_string(n),
                     filename_, errno);
    }

    // Advance past the `got` bytes just read.
    cursor += got;
    offset += got;
    remaining -= got;
  }

  *result = Slice(scratch, n - remaining);
  return IOStatus::OK();
}
1595
1596
// This implementation performs no work on Flush and always reports success.
IOStatus PosixRandomRWFile::Flush(const IOOptions& /*opts*/,
                                  IODebugContext* /*dbg*/) {
  return IOStatus::OK();
}
1600
1601
// Syncs the file with F_FULLFSYNC when it is available, and with
// fdatasync() otherwise.
IOStatus PosixRandomRWFile::Sync(const IOOptions& /*opts*/,
                                 IODebugContext* /*dbg*/) {
#ifdef HAVE_FULLFSYNC
  const int rc = ::fcntl(fd_, F_FULLFSYNC);
  if (rc < 0) {
    return IOError("while fcntl(F_FULLFSYNC) random rw file", filename_, errno);
  }
#else   // HAVE_FULLFSYNC
  const int rc = fdatasync(fd_);
  if (rc < 0) {
    return IOError("While fdatasync random read/write file", filename_, errno);
  }
#endif  // HAVE_FULLFSYNC
  return IOStatus::OK();
}
1614
1615
// Syncs the file with F_FULLFSYNC when it is available, and with fsync()
// otherwise.
IOStatus PosixRandomRWFile::Fsync(const IOOptions& /*opts*/,
                                  IODebugContext* /*dbg*/) {
#ifdef HAVE_FULLFSYNC
  const int rc = ::fcntl(fd_, F_FULLFSYNC);
  if (rc < 0) {
    return IOError("While fcntl(F_FULLSYNC) random rw file", filename_, errno);
  }
#else   // HAVE_FULLFSYNC
  const int rc = fsync(fd_);
  if (rc < 0) {
    return IOError("While fsync random read/write file", filename_, errno);
  }
#endif  // HAVE_FULLFSYNC
  return IOStatus::OK();
}
1628
1629
// Closes the descriptor and marks this object as closed. A close() failure
// is returned to the caller.
IOStatus PosixRandomRWFile::Close(const IOOptions& /*opts*/,
                                  IODebugContext* /*dbg*/) {
  IOStatus s;
  if (close(fd_) < 0) {
    s = IOError("While close random read/write file", filename_, errno);
  }
  // Forget the descriptor even when close() failed. Otherwise the destructor
  // would call close() on the same number a second time, and that number may
  // by then belong to a file opened elsewhere in the process.
  fd_ = -1;
  return s;
}
1637
1638
0
// Releases the mapping described by base_ and length_.
PosixMemoryMappedFileBuffer::~PosixMemoryMappedFileBuffer() {
  // TODO should have error handling though not much we can do...
  void* const mapping_start = this->base_;
  munmap(mapping_start, length_);
}
1642
1643
/*
1644
 * PosixDirectory
1645
 */
1646
#if !defined(BTRFS_SUPER_MAGIC)
1647
// The magic number for BTRFS is fixed, if it's not defined, define it here
1648
31.5k
#define BTRFS_SUPER_MAGIC 0x9123683E
1649
#endif
1650
// Takes an open directory descriptor and, on Linux, records whether the
// directory lives on a btrfs filesystem (is_btrfs_ stays false if fstatfs
// fails or on other platforms).
PosixDirectory::PosixDirectory(int fd, const std::string& directory_name)
    : fd_(fd), directory_name_(directory_name) {
  is_btrfs_ = false;
#ifdef OS_LINUX
  struct statfs fs_info;
  if (fstatfs(fd, &fs_info) == 0) {
    // Cast the magic number to the field's own type before comparing.
    const auto btrfs_magic =
        static_cast<decltype(fs_info.f_type)>(BTRFS_SUPER_MAGIC);
    is_btrfs_ = (fs_info.f_type == btrfs_magic);
  }
#endif
}
1660
1661
31.5k
// Closes the directory descriptor if it is still open and discards the
// resulting status.
PosixDirectory::~PosixDirectory() {
  if (fd_ < 0) {
    return;
  }
  IOStatus close_status = PosixDirectory::Close(IOOptions(), nullptr);
  close_status.PermitUncheckedError();
}
1667
1668
0
// Convenience form of FsyncWithDirOptions that uses default-constructed
// DirFsyncOptions.
IOStatus PosixDirectory::Fsync(const IOOptions& opts, IODebugContext* dbg) {
  const DirFsyncOptions default_options;
  return FsyncWithDirOptions(opts, dbg, default_options);
}
1671
1672
// Users who want the file entries synced in Directory project must call a
1673
// Fsync or FsyncWithDirOptions function before Close
1674
IOStatus PosixDirectory::Close(const IOOptions& /*opts*/,
1675
31.5k
                               IODebugContext* /*dbg*/) {
1676
31.5k
  IOStatus s = IOStatus::OK();
1677
31.5k
  if (close(fd_) < 0) {
1678
0
    s = IOError("While closing directory ", directory_name_, errno);
1679
31.5k
  } else {
1680
31.5k
    fd_ = -1;
1681
31.5k
  }
1682
31.5k
  return s;
1683
31.5k
}
1684
1685
// Makes directory-entry changes durable.
//
// On btrfs two cases are special: a newly created, already-synced file
// needs no directory fsync at all, and a rename only needs the renamed file
// itself fsynced. Every other case fsyncs the directory descriptor
// (F_FULLFSYNC where available). On AIX the function does nothing and
// returns OK.
IOStatus PosixDirectory::FsyncWithDirOptions(
    const IOOptions& /*opts*/, IODebugContext* /*dbg*/,
    const DirFsyncOptions& dir_fsync_options) {
  assert(fd_ >= 0);  // Check use after close
  IOStatus s = IOStatus::OK();
#ifndef OS_AIX
  if (is_btrfs_) {
    // skip dir fsync for new file creation, which is not needed for btrfs
    if (dir_fsync_options.reason == DirFsyncOptions::kNewFileSynced) {
      return s;
    }
    // skip dir fsync for renaming file, only need to sync new file
    if (dir_fsync_options.reason == DirFsyncOptions::kFileRenamed) {
      std::string new_name = dir_fsync_options.renamed_new_name;
      assert(!new_name.empty());
      int fd;
      do {
        IOSTATS_TIMER_GUARD(open_nanos);
        fd = open(new_name.c_str(), O_RDONLY);
      } while (fd < 0 && errno == EINTR);
      if (fd < 0) {
        // Nothing was opened, so there is nothing to fsync or close.
        s = IOError("While open renaming file", new_name, errno);
      } else {
        if (fsync(fd) < 0) {
          s = IOError("While fsync renaming file", new_name, errno);
        }
        // Always release the descriptor, but keep an earlier fsync error:
        // it is the more informative failure.
        if (close(fd) < 0 && s.ok()) {
          s = IOError("While closing file after fsync", new_name, errno);
        }
      }
      return s;
    }
    // fallback to dir-fsync for kDefault, kDirRenamed and kFileDeleted
  }

  // skip fsync/fcntl when fd_ == -1 since this file descriptor has been closed
  // in either the de-construction or the close function, data must have been
  // fsync-ed before de-construction and close is called
#ifdef HAVE_FULLFSYNC
  // btrfs is a Linux file system, while currently F_FULLFSYNC is available on
  // Mac OS.
  assert(!is_btrfs_);
  if (fd_ != -1 && ::fcntl(fd_, F_FULLFSYNC) < 0) {
    return IOError("while fcntl(F_FULLFSYNC)", "a directory", errno);
  }
#else   // HAVE_FULLFSYNC
  if (fd_ != -1 && fsync(fd_) == -1) {
    s = IOError("While fsync", "a directory", errno);
  }
#endif  // HAVE_FULLFSYNC
#endif  // OS_AIX
  return s;
}
1736
}  // namespace ROCKSDB_NAMESPACE
1737
#endif