/src/rocksdb/env/io_posix.cc
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. |
2 | | // This source code is licensed under both the GPLv2 (found in the |
3 | | // COPYING file in the root directory) and Apache 2.0 License |
4 | | // (found in the LICENSE.Apache file in the root directory). |
5 | | // |
6 | | // Copyright (c) 2011 The LevelDB Authors. All rights reserved. |
7 | | // Use of this source code is governed by a BSD-style license that can be |
8 | | // found in the LICENSE file. See the AUTHORS file for names of contributors. |
9 | | |
10 | | #ifdef ROCKSDB_LIB_IO_POSIX |
11 | | #include "env/io_posix.h" |
12 | | |
13 | | #include <fcntl.h> |
14 | | |
15 | | #include <algorithm> |
16 | | #include <cerrno> |
17 | | #if defined(OS_LINUX) |
18 | | #include <linux/fs.h> |
19 | | #ifndef FALLOC_FL_KEEP_SIZE |
20 | | #include <linux/falloc.h> |
21 | | #endif |
22 | | #endif |
23 | | #include <sys/ioctl.h> |
24 | | #include <sys/mman.h> |
25 | | #include <sys/stat.h> |
26 | | #include <sys/types.h> |
27 | | |
28 | | #include <cstdio> |
29 | | #include <cstdlib> |
30 | | #include <cstring> |
31 | | #ifdef OS_LINUX |
32 | | #include <sys/statfs.h> |
33 | | #include <sys/sysmacros.h> |
34 | | #endif |
35 | | #include "monitoring/iostats_context_imp.h" |
36 | | #include "port/port.h" |
37 | | #include "port/stack_trace.h" |
38 | | #include "rocksdb/slice.h" |
39 | | #include "test_util/sync_point.h" |
40 | | #include "util/autovector.h" |
41 | | #include "util/coding.h" |
42 | | #include "util/string_util.h" |
43 | | |
44 | | #if defined(OS_LINUX) && !defined(F_SET_RW_HINT) |
45 | | #define F_LINUX_SPECIFIC_BASE 1024 |
46 | | #define F_SET_RW_HINT (F_LINUX_SPECIFIC_BASE + 12) |
47 | | #endif |
48 | | |
49 | | namespace ROCKSDB_NAMESPACE { |
50 | | |
// Builds the text of an I/O error message. With an empty `file_name` the
// result is just `context`; otherwise it is "<context>: <file_name>".
std::string IOErrorMsg(const std::string& context,
                       const std::string& file_name) {
  return file_name.empty() ? context : context + ": " + file_name;
}
58 | | |
59 | | // file_name can be left empty if it is unknown. |
60 | | IOStatus IOError(const std::string& context, const std::string& file_name, |
61 | 0 | int err_number) { |
62 | 0 | switch (err_number) { |
63 | 0 | case ENOSPC: { |
64 | 0 | IOStatus s = IOStatus::NoSpace(IOErrorMsg(context, file_name), |
65 | 0 | errnoStr(err_number).c_str()); |
66 | 0 | s.SetRetryable(true); |
67 | 0 | return s; |
68 | 0 | } |
69 | 0 | case ESTALE: |
70 | 0 | return IOStatus::IOError(IOStatus::kStaleFile); |
71 | 0 | case ENOENT: |
72 | 0 | return IOStatus::PathNotFound(IOErrorMsg(context, file_name), |
73 | 0 | errnoStr(err_number).c_str()); |
74 | 0 | default: |
75 | 0 | return IOStatus::IOError(IOErrorMsg(context, file_name), |
76 | 0 | errnoStr(err_number).c_str()); |
77 | 0 | } |
78 | 0 | } |
79 | | |
// Portability shim around posix_fadvise(). When OS_LINUX is defined the
// arguments are forwarded unchanged and the call's return value is returned;
// otherwise the function does nothing and returns 0.
int Fadvise(int fd, off_t offset, size_t len, int advice) {
#ifndef OS_LINUX
  // Nothing to forward to: consume the arguments and report success.
  static_cast<void>(fd);
  static_cast<void>(offset);
  static_cast<void>(len);
  static_cast<void>(advice);
  return 0;
#else
  return posix_fadvise(fd, offset, len, advice);
#endif
}
93 | | |
// A wrapper for madvise: when OS_LINUX is defined it forwards to
// posix_madvise() and returns that call's result; on every other platform it
// does nothing and simply returns 0.
int Madvise(void* addr, size_t len, int advice) {
#ifdef OS_LINUX
  return posix_madvise(addr, len, advice);
#else
  (void)addr;
  (void)len;
  (void)advice;
  return 0;  // simply do nothing.
#endif
}
106 | | |
107 | | namespace { |
108 | | |
109 | | // On MacOS (and probably *BSD), the posix write and pwrite calls do not support |
110 | | // buffers larger than 2^31-1 bytes. These two wrappers fix this issue by |
111 | | // cutting the buffer in 1GB chunks. We use this chunk size to be sure to keep |
112 | | // the writes aligned. |
113 | | |
// Writes all `nbyte` bytes of `buf` to `fd` using write(), issuing at most
// 1GB per call and resuming after short writes. A call interrupted by EINTR
// is retried. Returns false on the first other write() failure, true once
// everything has been written.
bool PosixWrite(int fd, const char* buf, size_t nbyte) {
  const size_t kMaxChunk = 1UL << 30;

  const char* cursor = buf;
  size_t remaining = nbyte;

  while (remaining > 0) {
    const size_t chunk = remaining < kMaxChunk ? remaining : kMaxChunk;
    const ssize_t written = write(fd, cursor, chunk);
    if (written >= 0) {
      cursor += written;
      remaining -= written;
    } else if (errno != EINTR) {
      return false;
    }
    // written < 0 with EINTR: loop around and retry the same chunk.
  }
  return true;
}
135 | | |
// Writes all `nbyte` bytes of `buf` to `fd` starting at file position
// `offset`, using pwrite() with at most 1GB per call and resuming after short
// writes. A call interrupted by EINTR is retried. Returns false on the first
// other pwrite() failure, true once everything has been written.
bool PosixPositionedWrite(int fd, const char* buf, size_t nbyte, off_t offset) {
  const size_t kMaxChunk = 1UL << 30;

  const char* cursor = buf;
  size_t remaining = nbyte;

  while (remaining > 0) {
    const size_t chunk = remaining < kMaxChunk ? remaining : kMaxChunk;
    const ssize_t written = pwrite(fd, cursor, chunk, offset);
    if (written >= 0) {
      remaining -= written;
      offset += written;
      cursor += written;
    } else if (errno != EINTR) {
      return false;
    }
    // written < 0 with EINTR: loop around and retry the same chunk.
  }

  return true;
}
159 | | |
160 | | #ifdef ROCKSDB_RANGESYNC_PRESENT |
161 | | |
162 | | #if !defined(ZFS_SUPER_MAGIC) |
163 | | // The magic number for ZFS was not exposed until recently. It should be fixed |
164 | | // forever so we can just copy the magic number here. |
165 | 72.1k | #define ZFS_SUPER_MAGIC 0x2fc12fc1 |
166 | | #endif |
167 | | |
168 | 72.1k | bool IsSyncFileRangeSupported(int fd) { |
169 | | // This function tracks and checks for cases where we know `sync_file_range` |
170 | | // definitely will not work properly despite passing the compile-time check |
171 | | // (`ROCKSDB_RANGESYNC_PRESENT`). If we are unsure, or if any of the checks |
172 | | // fail in unexpected ways, we allow `sync_file_range` to be used. This way |
173 | | // should minimize risk of impacting existing use cases. |
174 | 72.1k | struct statfs buf; |
175 | 72.1k | int ret = fstatfs(fd, &buf); |
176 | 72.1k | assert(ret == 0); |
177 | 72.1k | if (ret == 0 && buf.f_type == ZFS_SUPER_MAGIC) { |
178 | | // Testing on ZFS showed the writeback did not happen asynchronously when |
179 | | // `sync_file_range` was called, even though it returned success. Avoid it |
180 | | // and use `fdatasync` instead to preserve the contract of `bytes_per_sync`, |
181 | | // even though this'll incur extra I/O for metadata. |
182 | 0 | return false; |
183 | 0 | } |
184 | | |
185 | 72.1k | ret = sync_file_range(fd, 0 /* offset */, 0 /* nbytes */, 0 /* flags */); |
186 | 72.1k | assert(!(ret == -1 && errno != ENOSYS)); |
187 | 72.1k | if (ret == -1 && errno == ENOSYS) { |
188 | | // `sync_file_range` is not implemented on all platforms even if |
189 | | // compile-time checks pass and a supported filesystem is in-use. For |
190 | | // example, using ext4 on WSL (Windows Subsystem for Linux), |
191 | | // `sync_file_range()` returns `ENOSYS` |
192 | | // ("Function not implemented"). |
193 | 0 | return false; |
194 | 0 | } |
195 | | // None of the known cases matched, so allow `sync_file_range` use. |
196 | 72.1k | return true; |
197 | 72.1k | } |
198 | | |
199 | | #undef ZFS_SUPER_MAGIC |
200 | | |
201 | | #endif // ROCKSDB_RANGESYNC_PRESENT |
202 | | |
203 | | } // anonymous namespace |
204 | | |
205 | | /* |
206 | | * PosixSequentialFile |
207 | | */ |
// Records the file name, the stdio stream, the raw descriptor and the logical
// block size supplied by the caller. Direct I/O mode is taken from
// `options.use_direct_reads`.
PosixSequentialFile::PosixSequentialFile(const std::string& fname, FILE* file,
                                         int fd, size_t logical_block_size,
                                         const EnvOptions& options)
    : filename_(fname),
      file_(file),
      fd_(fd),
      use_direct_io_(options.use_direct_reads),
      logical_sector_size_(logical_block_size) {
  // Direct reads and mmap reads must not both be requested.
  assert(!options.use_direct_reads || !options.use_mmap_reads);
}
218 | | |
219 | 40.6k | PosixSequentialFile::~PosixSequentialFile() { |
220 | 40.6k | if (!use_direct_io()) { |
221 | 40.6k | assert(file_); |
222 | 40.6k | fclose(file_); |
223 | 40.6k | } else { |
224 | 0 | assert(fd_); |
225 | 0 | close(fd_); |
226 | 0 | } |
227 | 40.6k | } |
228 | | |
229 | | IOStatus PosixSequentialFile::Read(size_t n, const IOOptions& /*opts*/, |
230 | | Slice* result, char* scratch, |
231 | 69.1k | IODebugContext* /*dbg*/) { |
232 | 69.1k | assert(result != nullptr && !use_direct_io()); |
233 | 69.1k | IOStatus s; |
234 | 69.1k | size_t r = 0; |
235 | 69.1k | do { |
236 | 69.1k | clearerr(file_); |
237 | 69.1k | r = fread_unlocked(scratch, 1, n, file_); |
238 | 69.1k | } while (r == 0 && ferror(file_) && errno == EINTR); |
239 | 69.1k | *result = Slice(scratch, r); |
240 | 69.1k | if (r < n) { |
241 | 58.6k | if (feof(file_)) { |
242 | | // We leave status as ok if we hit the end of the file |
243 | | // We also clear the error so that the reads can continue |
244 | | // if a new data is written to the file |
245 | 58.6k | clearerr(file_); |
246 | 58.6k | } else { |
247 | | // A partial read with an error: return a non-ok status |
248 | 0 | s = IOError("While reading file sequentially", filename_, errno); |
249 | 0 | } |
250 | 58.6k | } |
251 | 69.1k | return s; |
252 | 69.1k | } |
253 | | |
254 | | IOStatus PosixSequentialFile::PositionedRead(uint64_t offset, size_t n, |
255 | | const IOOptions& /*opts*/, |
256 | | Slice* result, char* scratch, |
257 | 0 | IODebugContext* /*dbg*/) { |
258 | 0 | assert(use_direct_io()); |
259 | 0 | assert(IsSectorAligned(offset, GetRequiredBufferAlignment())); |
260 | 0 | assert(IsSectorAligned(n, GetRequiredBufferAlignment())); |
261 | 0 | assert(IsSectorAligned(scratch, GetRequiredBufferAlignment())); |
262 | |
|
263 | 0 | IOStatus s; |
264 | 0 | ssize_t r = -1; |
265 | 0 | size_t left = n; |
266 | 0 | char* ptr = scratch; |
267 | 0 | while (left > 0) { |
268 | 0 | r = pread(fd_, ptr, left, static_cast<off_t>(offset)); |
269 | 0 | if (r <= 0) { |
270 | 0 | if (r == -1 && errno == EINTR) { |
271 | 0 | continue; |
272 | 0 | } |
273 | 0 | break; |
274 | 0 | } |
275 | 0 | ptr += r; |
276 | 0 | offset += r; |
277 | 0 | left -= r; |
278 | 0 | if (!IsSectorAligned(r, GetRequiredBufferAlignment())) { |
279 | | // Bytes reads don't fill sectors. Should only happen at the end |
280 | | // of the file. |
281 | 0 | break; |
282 | 0 | } |
283 | 0 | } |
284 | 0 | if (r < 0) { |
285 | | // An error: return a non-ok status |
286 | 0 | s = IOError("While pread " + std::to_string(n) + " bytes from offset " + |
287 | 0 | std::to_string(offset), |
288 | 0 | filename_, errno); |
289 | 0 | } |
290 | 0 | *result = Slice(scratch, (r < 0) ? 0 : n - left); |
291 | 0 | return s; |
292 | 0 | } |
293 | | |
294 | 0 | IOStatus PosixSequentialFile::Skip(uint64_t n) { |
295 | 0 | if (fseek(file_, static_cast<long int>(n), SEEK_CUR)) { |
296 | 0 | return IOError("While fseek to skip " + std::to_string(n) + " bytes", |
297 | 0 | filename_, errno); |
298 | 0 | } |
299 | 0 | return IOStatus::OK(); |
300 | 0 | } |
301 | | |
302 | 0 | IOStatus PosixSequentialFile::InvalidateCache(size_t offset, size_t length) { |
303 | | #ifndef OS_LINUX |
304 | | (void)offset; |
305 | | (void)length; |
306 | | return IOStatus::OK(); |
307 | | #else |
308 | 0 | if (!use_direct_io()) { |
309 | | // free OS pages |
310 | 0 | int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED); |
311 | 0 | if (ret != 0) { |
312 | 0 | return IOError("While fadvise NotNeeded offset " + |
313 | 0 | std::to_string(offset) + " len " + |
314 | 0 | std::to_string(length), |
315 | 0 | filename_, errno); |
316 | 0 | } |
317 | 0 | } |
318 | 0 | return IOStatus::OK(); |
319 | 0 | #endif |
320 | 0 | } |
321 | | |
322 | | /* |
323 | | * PosixRandomAccessFile |
324 | | */ |
325 | | #if defined(OS_LINUX) |
326 | 0 | size_t PosixHelper::GetUniqueIdFromFile(int fd, char* id, size_t max_size) { |
327 | 0 | if (max_size < kMaxVarint64Length * 3) { |
328 | 0 | return 0; |
329 | 0 | } |
330 | | |
331 | 0 | struct stat buf; |
332 | 0 | int result = fstat(fd, &buf); |
333 | 0 | if (result == -1) { |
334 | 0 | return 0; |
335 | 0 | } |
336 | | |
337 | 0 | long version = 0; |
338 | 0 | result = ioctl(fd, FS_IOC_GETVERSION, &version); |
339 | 0 | TEST_SYNC_POINT_CALLBACK("GetUniqueIdFromFile:FS_IOC_GETVERSION", &result); |
340 | 0 | if (result == -1) { |
341 | 0 | return 0; |
342 | 0 | } |
343 | 0 | uint64_t uversion = (uint64_t)version; |
344 | |
|
345 | 0 | char* rid = id; |
346 | 0 | rid = EncodeVarint64(rid, buf.st_dev); |
347 | 0 | rid = EncodeVarint64(rid, buf.st_ino); |
348 | 0 | rid = EncodeVarint64(rid, uversion); |
349 | 0 | assert(rid >= id); |
350 | 0 | return static_cast<size_t>(rid - id); |
351 | 0 | } |
352 | | #endif |
353 | | |
354 | | #if defined(OS_MACOSX) || defined(OS_AIX) |
355 | | size_t PosixHelper::GetUniqueIdFromFile(int fd, char* id, size_t max_size) { |
356 | | if (max_size < kMaxVarint64Length * 3) { |
357 | | return 0; |
358 | | } |
359 | | |
360 | | struct stat buf; |
361 | | int result = fstat(fd, &buf); |
362 | | if (result == -1) { |
363 | | return 0; |
364 | | } |
365 | | |
366 | | char* rid = id; |
367 | | rid = EncodeVarint64(rid, buf.st_dev); |
368 | | rid = EncodeVarint64(rid, buf.st_ino); |
369 | | rid = EncodeVarint64(rid, buf.st_gen); |
370 | | assert(rid >= id); |
371 | | return static_cast<size_t>(rid - id); |
372 | | } |
373 | | #endif |
374 | | |
375 | | #ifdef OS_LINUX |
// Returns `path` with one trailing '/' removed. Paths of length 0 or 1
// (including "/") are returned unchanged, and at most one slash is stripped.
std::string RemoveTrailingSlash(const std::string& path) {
  const bool has_trailing_slash = path.size() > 1 && path.back() == '/';
  return has_trailing_slash ? path.substr(0, path.size() - 1) : path;
}
383 | | |
384 | | Status LogicalBlockSizeCache::RefAndCacheLogicalBlockSize( |
385 | 9.02k | const std::vector<std::string>& directories) { |
386 | 9.02k | std::vector<std::string> dirs; |
387 | 9.02k | dirs.reserve(directories.size()); |
388 | 9.02k | for (auto& d : directories) { |
389 | 9.02k | dirs.emplace_back(RemoveTrailingSlash(d)); |
390 | 9.02k | } |
391 | | |
392 | 9.02k | std::map<std::string, size_t> dir_sizes; |
393 | 9.02k | { |
394 | 9.02k | ReadLock lock(&cache_mutex_); |
395 | 9.02k | for (const auto& dir : dirs) { |
396 | 9.02k | if (cache_.find(dir) == cache_.end()) { |
397 | 9.02k | dir_sizes.emplace(dir, 0); |
398 | 9.02k | } |
399 | 9.02k | } |
400 | 9.02k | } |
401 | | |
402 | 9.02k | Status s; |
403 | 9.02k | for (auto& dir_size : dir_sizes) { |
404 | 9.02k | s = get_logical_block_size_of_directory_(dir_size.first, &dir_size.second); |
405 | 9.02k | if (!s.ok()) { |
406 | 0 | return s; |
407 | 0 | } |
408 | 9.02k | } |
409 | | |
410 | 9.02k | WriteLock lock(&cache_mutex_); |
411 | 9.02k | for (const auto& dir : dirs) { |
412 | 9.02k | auto& v = cache_[dir]; |
413 | 9.02k | v.ref++; |
414 | 9.02k | auto dir_size = dir_sizes.find(dir); |
415 | 9.02k | if (dir_size != dir_sizes.end()) { |
416 | 9.02k | v.size = dir_size->second; |
417 | 9.02k | } |
418 | 9.02k | } |
419 | 9.02k | return s; |
420 | 9.02k | } |
421 | | |
422 | | void LogicalBlockSizeCache::UnrefAndTryRemoveCachedLogicalBlockSize( |
423 | 9.02k | const std::vector<std::string>& directories) { |
424 | 9.02k | std::vector<std::string> dirs; |
425 | 9.02k | dirs.reserve(directories.size()); |
426 | 9.02k | for (auto& dir : directories) { |
427 | 9.02k | dirs.emplace_back(RemoveTrailingSlash(dir)); |
428 | 9.02k | } |
429 | | |
430 | 9.02k | WriteLock lock(&cache_mutex_); |
431 | 9.02k | for (const auto& dir : dirs) { |
432 | 9.02k | auto it = cache_.find(dir); |
433 | 9.02k | if (it != cache_.end() && !(--(it->second.ref))) { |
434 | 9.02k | cache_.erase(it); |
435 | 9.02k | } |
436 | 9.02k | } |
437 | 9.02k | } |
438 | | |
439 | | size_t LogicalBlockSizeCache::GetLogicalBlockSize(const std::string& fname, |
440 | 0 | int fd) { |
441 | 0 | std::string dir = fname.substr(0, fname.find_last_of('/')); |
442 | 0 | if (dir.empty()) { |
443 | 0 | dir = "/"; |
444 | 0 | } |
445 | 0 | { |
446 | 0 | ReadLock lock(&cache_mutex_); |
447 | 0 | auto it = cache_.find(dir); |
448 | 0 | if (it != cache_.end()) { |
449 | 0 | return it->second.size; |
450 | 0 | } |
451 | 0 | } |
452 | 0 | return get_logical_block_size_of_fd_(fd); |
453 | 0 | } |
454 | | #endif |
455 | | |
456 | | Status PosixHelper::GetLogicalBlockSizeOfDirectory(const std::string& directory, |
457 | 9.02k | size_t* size) { |
458 | 9.02k | int fd = open(directory.c_str(), O_DIRECTORY | O_RDONLY); |
459 | 9.02k | if (fd == -1) { |
460 | 0 | return Status::IOError("Cannot open directory " + directory); |
461 | 0 | } |
462 | 9.02k | *size = PosixHelper::GetLogicalBlockSizeOfFd(fd); |
463 | 9.02k | close(fd); |
464 | 9.02k | return Status::OK(); |
465 | 9.02k | } |
466 | | |
// Returns the logical block size of the device that holds the file behind
// `fd`. When OS_LINUX is defined, the value is read from the device's
// `queue/logical_block_size` entry under sysfs. kDefaultPageSize is returned
// whenever the lookup cannot be completed (fstat/realpath failure, device
// major number 0, unexpected path shape, unreadable file) or the value read
// is not a non-zero power of two. Without OS_LINUX it always returns
// kDefaultPageSize.
size_t PosixHelper::GetLogicalBlockSizeOfFd(int fd) {
#ifdef OS_LINUX
  struct stat buf;
  int result = fstat(fd, &buf);
  if (result == -1) {
    return kDefaultPageSize;
  }
  if (major(buf.st_dev) == 0) {
    // Unnamed devices (e.g. non-device mounts), reserved as null device number.
    // These don't have an entry in /sys/dev/block/. Return a sensible default.
    return kDefaultPageSize;
  }

  // Reading queue/logical_block_size does not require special permissions.
  const int kBufferSize = 100;
  char path[kBufferSize];
  char real_path[PATH_MAX + 1];
  snprintf(path, kBufferSize, "/sys/dev/block/%u:%u", major(buf.st_dev),
           minor(buf.st_dev));
  // Resolve the /sys/dev/block/<major>:<minor> symlink to the device's real
  // sysfs directory.
  if (realpath(path, real_path) == nullptr) {
    return kDefaultPageSize;
  }
  std::string device_dir(real_path);
  if (!device_dir.empty() && device_dir.back() == '/') {
    device_dir.pop_back();
  }
  // NOTE: sda3 and nvme0n1p1 do not have a `queue/` subdir, only the parent sda
  // and nvme0n1 have it.
  // $ ls -al '/sys/dev/block/8:3'
  // lrwxrwxrwx. 1 root root 0 Jun 26 01:38 /sys/dev/block/8:3 ->
  // ../../block/sda/sda3
  // $ ls -al '/sys/dev/block/259:4'
  // lrwxrwxrwx 1 root root 0 Jan 31 16:04 /sys/dev/block/259:4 ->
  // ../../devices/pci0000:17/0000:17:00.0/0000:18:00.0/nvme/nvme0/nvme0n1/nvme0n1p1
  // Split off the last two path components: `child` is the final component,
  // `parent` the one before it.
  size_t parent_end = device_dir.rfind('/', device_dir.length() - 1);
  if (parent_end == std::string::npos) {
    return kDefaultPageSize;
  }
  size_t parent_begin = device_dir.rfind('/', parent_end - 1);
  if (parent_begin == std::string::npos) {
    return kDefaultPageSize;
  }
  std::string parent =
      device_dir.substr(parent_begin + 1, parent_end - parent_begin - 1);
  std::string child = device_dir.substr(parent_end + 1, std::string::npos);
  // Step up to the parent directory unless the parent component is "block" or
  // `child` starts with "nvme" and contains no 'p'.
  if (parent != "block" &&
      (child.compare(0, 4, "nvme") || child.find('p') != std::string::npos)) {
    device_dir = device_dir.substr(0, parent_end);
  }
  std::string fname = device_dir + "/queue/logical_block_size";
  FILE* fp;
  size_t size = 0;
  fp = fopen(fname.c_str(), "r");
  if (fp != nullptr) {
    char* line = nullptr;
    size_t len = 0;
    // getline() allocates `line`; it is released with free() below.
    if (getline(&line, &len, fp) != -1) {
      sscanf(line, "%zu", &size);
    }
    free(line);
    fclose(fp);
  }
  // Accept the value only if it is a non-zero power of two.
  if (size != 0 && (size & (size - 1)) == 0) {
    return size;
  }
#endif
  (void)fd;
  return kDefaultPageSize;
}
536 | | |
537 | | /* |
538 | | * PosixRandomAccessFile |
539 | | * |
540 | | * pread() based random-access |
541 | | */ |
// Records the file name, descriptor and logical block size supplied by the
// caller; direct I/O mode is taken from `options.use_direct_reads`. When
// ROCKSDB_IOURING_PRESENT is defined, the constructor also takes and stores
// the `thread_local_io_urings` pointer.
PosixRandomAccessFile::PosixRandomAccessFile(
    const std::string& fname, int fd, size_t logical_block_size,
    const EnvOptions& options
#if defined(ROCKSDB_IOURING_PRESENT)
    ,
    ThreadLocalPtr* thread_local_io_urings
#endif
    )
    : filename_(fname),
      fd_(fd),
      use_direct_io_(options.use_direct_reads),
      logical_sector_size_(logical_block_size)
#if defined(ROCKSDB_IOURING_PRESENT)
      ,
      thread_local_io_urings_(thread_local_io_urings)
#endif
{
  // Direct reads and mmap reads must not both be requested, and this class
  // must not be constructed with mmap reads enabled at all.
  assert(!options.use_direct_reads || !options.use_mmap_reads);
  assert(!options.use_mmap_reads);
}
562 | | |
563 | 13.5k | PosixRandomAccessFile::~PosixRandomAccessFile() { close(fd_); } |
564 | | |
565 | | IOStatus PosixRandomAccessFile::Read(uint64_t offset, size_t n, |
566 | | const IOOptions& /*opts*/, Slice* result, |
567 | | char* scratch, |
568 | 29.6k | IODebugContext* /*dbg*/) const { |
569 | 29.6k | if (use_direct_io()) { |
570 | 0 | assert(IsSectorAligned(offset, GetRequiredBufferAlignment())); |
571 | 0 | assert(IsSectorAligned(n, GetRequiredBufferAlignment())); |
572 | 0 | assert(IsSectorAligned(scratch, GetRequiredBufferAlignment())); |
573 | 0 | } |
574 | 29.6k | IOStatus s; |
575 | 29.6k | ssize_t r = -1; |
576 | 29.6k | size_t left = n; |
577 | 29.6k | char* ptr = scratch; |
578 | 59.3k | while (left > 0) { |
579 | 29.6k | r = pread(fd_, ptr, left, static_cast<off_t>(offset)); |
580 | 29.6k | if (r <= 0) { |
581 | 0 | if (r == -1 && errno == EINTR) { |
582 | 0 | continue; |
583 | 0 | } |
584 | 0 | break; |
585 | 0 | } |
586 | 29.6k | ptr += r; |
587 | 29.6k | offset += r; |
588 | 29.6k | left -= r; |
589 | 29.6k | if (use_direct_io() && |
590 | 29.6k | r % static_cast<ssize_t>(GetRequiredBufferAlignment()) != 0) { |
591 | | // Bytes reads don't fill sectors. Should only happen at the end |
592 | | // of the file. |
593 | 0 | break; |
594 | 0 | } |
595 | 29.6k | } |
596 | 29.6k | if (r < 0) { |
597 | | // An error: return a non-ok status |
598 | 0 | s = IOError("While pread offset " + std::to_string(offset) + " len " + |
599 | 0 | std::to_string(n), |
600 | 0 | filename_, errno); |
601 | 0 | } |
602 | 29.6k | *result = Slice(scratch, (r < 0) ? 0 : n - left); |
603 | 29.6k | return s; |
604 | 29.6k | } |
605 | | |
// Performs `num_reqs` reads described by `reqs`.
// When ROCKSDB_IOURING_PRESENT is defined and a per-thread io_uring can be
// obtained, the requests are submitted through io_uring in batches of at most
// kIoUringDepth; partially completed requests are resubmitted for their
// remaining bytes. Otherwise the call is forwarded to
// FSRandomAccessFile::MultiRead.
// In direct I/O mode each request's offset, len and scratch must satisfy
// GetRequiredBufferAlignment().
// The returned status covers io_uring level failures; results are reported
// per request (req->status / req->result are set here on the re-read path,
// and presumably by UpdateResult otherwise - TODO confirm).
IOStatus PosixRandomAccessFile::MultiRead(FSReadRequest* reqs, size_t num_reqs,
                                          const IOOptions& options,
                                          IODebugContext* dbg) {
  if (use_direct_io()) {
    for (size_t i = 0; i < num_reqs; i++) {
      assert(IsSectorAligned(reqs[i].offset, GetRequiredBufferAlignment()));
      assert(IsSectorAligned(reqs[i].len, GetRequiredBufferAlignment()));
      assert(IsSectorAligned(reqs[i].scratch, GetRequiredBufferAlignment()));
    }
  }

#if defined(ROCKSDB_IOURING_PRESENT)
  // Fetch this thread's io_uring, creating and storing it on first use.
  struct io_uring* iu = nullptr;
  if (thread_local_io_urings_) {
    iu = static_cast<struct io_uring*>(thread_local_io_urings_->Get());
    if (iu == nullptr) {
      iu = CreateIOUring();
      if (iu != nullptr) {
        thread_local_io_urings_->Reset(iu);
      }
    }
  }

  // Init failed, platform doesn't support io_uring. Fall back to
  // serialized reads
  if (iu == nullptr) {
    return FSRandomAccessFile::MultiRead(reqs, num_reqs, options, dbg);
  }

  IOStatus ios = IOStatus::OK();

  // Per-request bookkeeping: the caller's request, the iovec handed to
  // io_uring, and how many bytes have been read so far.
  struct WrappedReadRequest {
    FSReadRequest* req;
    struct iovec iov;
    size_t finished_len;
    explicit WrappedReadRequest(FSReadRequest* r) : req(r), finished_len(0) {}
  };

  autovector<WrappedReadRequest, 32> req_wraps;
  // Requests that completed with fewer bytes than asked for and must be
  // resubmitted in the next batch.
  autovector<WrappedReadRequest*, 4> incomplete_rq_list;
  // Requests currently in flight; used to validate completion user data.
  std::unordered_set<WrappedReadRequest*> wrap_cache;

  for (size_t i = 0; i < num_reqs; i++) {
    req_wraps.emplace_back(&reqs[i]);
  }

  // Index of the next not-yet-submitted entry in req_wraps.
  size_t reqs_off = 0;
  while (num_reqs > reqs_off || !incomplete_rq_list.empty()) {
    size_t this_reqs = (num_reqs - reqs_off) + incomplete_rq_list.size();

    // If requests exceed depth, split it into batches
    if (this_reqs > kIoUringDepth) {
      this_reqs = kIoUringDepth;
    }

    assert(incomplete_rq_list.size() <= this_reqs);
    // Queue the batch: leftovers from the previous round first, then new
    // requests.
    for (size_t i = 0; i < this_reqs; i++) {
      WrappedReadRequest* rep_to_submit;
      if (i < incomplete_rq_list.size()) {
        rep_to_submit = incomplete_rq_list[i];
      } else {
        rep_to_submit = &req_wraps[reqs_off++];
      }
      assert(rep_to_submit->req->len > rep_to_submit->finished_len);
      // Read only the part that is still missing.
      rep_to_submit->iov.iov_base =
          rep_to_submit->req->scratch + rep_to_submit->finished_len;
      rep_to_submit->iov.iov_len =
          rep_to_submit->req->len - rep_to_submit->finished_len;

      struct io_uring_sqe* sqe;
      sqe = io_uring_get_sqe(iu);
      io_uring_prep_readv(
          sqe, fd_, &rep_to_submit->iov, 1,
          rep_to_submit->req->offset + rep_to_submit->finished_len);
      io_uring_sqe_set_data(sqe, rep_to_submit);
      wrap_cache.emplace(rep_to_submit);
    }
    incomplete_rq_list.clear();

    ssize_t ret =
        io_uring_submit_and_wait(iu, static_cast<unsigned int>(this_reqs));
    TEST_SYNC_POINT_CALLBACK(
        "PosixRandomAccessFile::MultiRead:io_uring_submit_and_wait:return1",
        &ret);
    TEST_SYNC_POINT_CALLBACK(
        "PosixRandomAccessFile::MultiRead:io_uring_submit_and_wait:return2",
        iu);

    if (static_cast<size_t>(ret) != this_reqs) {
      fprintf(stderr, "ret = %ld this_reqs: %ld\n", (long)ret, (long)this_reqs);
      // If error happens and we submitted fewer than expected, it is an
      // exception case and we don't retry here. We should still consume
      // what is submitted in the ring.
      for (ssize_t i = 0; i < ret; i++) {
        struct io_uring_cqe* cqe = nullptr;
        io_uring_wait_cqe(iu, &cqe);
        if (cqe != nullptr) {
          io_uring_cqe_seen(iu, cqe);
        }
      }
      return IOStatus::IOError("io_uring_submit_and_wait() requested " +
                               std::to_string(this_reqs) + " but returned " +
                               std::to_string(ret));
    }

    // Reap one completion per submitted request.
    for (size_t i = 0; i < this_reqs; i++) {
      struct io_uring_cqe* cqe = nullptr;
      WrappedReadRequest* req_wrap;

      // We could use the peek variant here, but this seems safer in terms
      // of our initial wait not reaping all completions
      ret = io_uring_wait_cqe(iu, &cqe);
      TEST_SYNC_POINT_CALLBACK(
          "PosixRandomAccessFile::MultiRead:io_uring_wait_cqe:return", &ret);
      if (ret) {
        ios = IOStatus::IOError("io_uring_wait_cqe() returns " +
                                std::to_string(ret));

        if (cqe != nullptr) {
          io_uring_cqe_seen(iu, cqe);
        }
        continue;
      }

      req_wrap = static_cast<WrappedReadRequest*>(io_uring_cqe_get_data(cqe));
      // Reset cqe data to catch any stray reuse of it
      static_cast<struct io_uring_cqe*>(cqe)->user_data = 0xd5d5d5d5d5d5d5d5;
      // Check that we got a valid unique cqe data
      auto wrap_check = wrap_cache.find(req_wrap);
      if (wrap_check == wrap_cache.end()) {
        fprintf(stderr,
                "PosixRandomAccessFile::MultiRead: "
                "Bad cqe data from IO uring - %p\n",
                req_wrap);
        port::PrintStack();
        ios = IOStatus::IOError("io_uring_cqe_get_data() returned " +
                                std::to_string((uint64_t)req_wrap));
        // NOTE(review): this path skips io_uring_cqe_seen() for `cqe` -
        // confirm that is intended.
        continue;
      }
      wrap_cache.erase(wrap_check);

      FSReadRequest* req = req_wrap->req;
      size_t bytes_read = 0;
      bool read_again = false;
      // Presumably fills in bytes_read / read_again and updates
      // req_wrap->finished_len and req from the completion - verify against
      // UpdateResult's definition.
      UpdateResult(cqe, filename_, req->len, req_wrap->iov.iov_len,
                   false /*async_read*/, use_direct_io(),
                   GetRequiredBufferAlignment(), req_wrap->finished_len, req,
                   bytes_read, read_again);
      int32_t res = cqe->res;
      if (res >= 0) {
        if (bytes_read == 0) {
          if (read_again) {
            // Finish the remainder with a synchronous Read() and splice the
            // result onto the bytes already received.
            Slice tmp_slice;
            req->status =
                Read(req->offset + req_wrap->finished_len,
                     req->len - req_wrap->finished_len, options, &tmp_slice,
                     req->scratch + req_wrap->finished_len, dbg);
            req->result =
                Slice(req->scratch, req_wrap->finished_len + tmp_slice.size());
          }
          // else It means EOF so no need to do anything.
        } else if (bytes_read < req_wrap->iov.iov_len) {
          // Short read: resubmit for the remaining bytes in the next batch.
          incomplete_rq_list.push_back(req_wrap);
        }
      }
      io_uring_cqe_seen(iu, cqe);
    }
    wrap_cache.clear();
  }
  return ios;
#else
  return FSRandomAccessFile::MultiRead(reqs, num_reqs, options, dbg);
#endif
}
780 | | |
781 | | IOStatus PosixRandomAccessFile::Prefetch(uint64_t offset, size_t n, |
782 | | const IOOptions& /*opts*/, |
783 | 5.53k | IODebugContext* /*dbg*/) { |
784 | 5.53k | IOStatus s; |
785 | 5.53k | if (!use_direct_io()) { |
786 | 5.53k | ssize_t r = 0; |
787 | 5.53k | #ifdef OS_LINUX |
788 | 5.53k | r = readahead(fd_, offset, n); |
789 | 5.53k | #endif |
790 | | #ifdef OS_MACOSX |
791 | | radvisory advice; |
792 | | advice.ra_offset = static_cast<off_t>(offset); |
793 | | advice.ra_count = static_cast<int>(n); |
794 | | r = fcntl(fd_, F_RDADVISE, &advice); |
795 | | #endif |
796 | 5.53k | if (r == -1) { |
797 | 0 | s = IOError("While prefetching offset " + std::to_string(offset) + |
798 | 0 | " len " + std::to_string(n), |
799 | 0 | filename_, errno); |
800 | 0 | } |
801 | 5.53k | } |
802 | 5.53k | return s; |
803 | 5.53k | } |
804 | | |
805 | | #if defined(OS_LINUX) || defined(OS_MACOSX) || defined(OS_AIX) |
806 | 0 | size_t PosixRandomAccessFile::GetUniqueId(char* id, size_t max_size) const { |
807 | 0 | return PosixHelper::GetUniqueIdFromFile(fd_, id, max_size); |
808 | 0 | } |
809 | | #endif |
810 | | |
811 | 4.48k | void PosixRandomAccessFile::Hint(AccessPattern pattern) { |
812 | 4.48k | if (use_direct_io()) { |
813 | 0 | return; |
814 | 0 | } |
815 | 4.48k | switch (pattern) { |
816 | 0 | case kNormal: |
817 | 0 | Fadvise(fd_, 0, 0, POSIX_FADV_NORMAL); |
818 | 0 | break; |
819 | 4.48k | case kRandom: |
820 | 4.48k | Fadvise(fd_, 0, 0, POSIX_FADV_RANDOM); |
821 | 4.48k | break; |
822 | 0 | case kSequential: |
823 | 0 | Fadvise(fd_, 0, 0, POSIX_FADV_SEQUENTIAL); |
824 | 0 | break; |
825 | 0 | case kWillNeed: |
826 | 0 | Fadvise(fd_, 0, 0, POSIX_FADV_WILLNEED); |
827 | 0 | break; |
828 | 0 | case kWontNeed: |
829 | 0 | Fadvise(fd_, 0, 0, POSIX_FADV_DONTNEED); |
830 | 0 | break; |
831 | 0 | default: |
832 | 0 | assert(false); |
833 | 0 | break; |
834 | 4.48k | } |
835 | 4.48k | } |
836 | | |
837 | 0 | IOStatus PosixRandomAccessFile::InvalidateCache(size_t offset, size_t length) { |
838 | 0 | if (use_direct_io()) { |
839 | 0 | return IOStatus::OK(); |
840 | 0 | } |
841 | | #ifndef OS_LINUX |
842 | | (void)offset; |
843 | | (void)length; |
844 | | return IOStatus::OK(); |
845 | | #else |
846 | | // free OS pages |
847 | 0 | int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED); |
848 | 0 | if (ret == 0) { |
849 | 0 | return IOStatus::OK(); |
850 | 0 | } |
851 | 0 | return IOError("While fadvise NotNeeded offset " + std::to_string(offset) + |
852 | 0 | " len " + std::to_string(length), |
853 | 0 | filename_, errno); |
854 | 0 | #endif |
855 | 0 | } |
856 | | |
857 | | IOStatus PosixRandomAccessFile::ReadAsync( |
858 | | FSReadRequest& req, const IOOptions& /*opts*/, |
859 | | std::function<void(FSReadRequest&, void*)> cb, void* cb_arg, |
860 | 0 | void** io_handle, IOHandleDeleter* del_fn, IODebugContext* /*dbg*/) { |
861 | 0 | if (use_direct_io()) { |
862 | 0 | assert(IsSectorAligned(req.offset, GetRequiredBufferAlignment())); |
863 | 0 | assert(IsSectorAligned(req.len, GetRequiredBufferAlignment())); |
864 | 0 | assert(IsSectorAligned(req.scratch, GetRequiredBufferAlignment())); |
865 | 0 | } |
866 | |
|
867 | | #if defined(ROCKSDB_IOURING_PRESENT) |
868 | | // io_uring_queue_init. |
869 | | struct io_uring* iu = nullptr; |
870 | | if (thread_local_io_urings_) { |
871 | | iu = static_cast<struct io_uring*>(thread_local_io_urings_->Get()); |
872 | | if (iu == nullptr) { |
873 | | iu = CreateIOUring(); |
874 | | if (iu != nullptr) { |
875 | | thread_local_io_urings_->Reset(iu); |
876 | | } |
877 | | } |
878 | | } |
879 | | |
880 | | // Init failed, platform doesn't support io_uring. |
881 | | if (iu == nullptr) { |
882 | | return IOStatus::NotSupported("ReadAsync"); |
883 | | } |
884 | | |
885 | | // Allocate io_handle. |
886 | | IOHandleDeleter deletefn = [](void* args) -> void { |
887 | | delete (static_cast<Posix_IOHandle*>(args)); |
888 | | args = nullptr; |
889 | | }; |
890 | | |
891 | | // Initialize Posix_IOHandle. |
892 | | Posix_IOHandle* posix_handle = |
893 | | new Posix_IOHandle(iu, cb, cb_arg, req.offset, req.len, req.scratch, |
894 | | use_direct_io(), GetRequiredBufferAlignment()); |
895 | | posix_handle->iov.iov_base = req.scratch; |
896 | | posix_handle->iov.iov_len = req.len; |
897 | | |
898 | | *io_handle = static_cast<void*>(posix_handle); |
899 | | *del_fn = deletefn; |
900 | | |
901 | | // Step 3: io_uring_sqe_set_data |
902 | | struct io_uring_sqe* sqe; |
903 | | sqe = io_uring_get_sqe(iu); |
904 | | |
905 | | io_uring_prep_readv(sqe, fd_, /*sqe->addr=*/&posix_handle->iov, |
906 | | /*sqe->len=*/1, /*sqe->offset=*/posix_handle->offset); |
907 | | |
908 | | // Sets sqe->user_data to posix_handle. |
909 | | io_uring_sqe_set_data(sqe, posix_handle); |
910 | | |
911 | | // Step 4: io_uring_submit |
912 | | ssize_t ret = io_uring_submit(iu); |
913 | | if (ret < 0) { |
914 | | fprintf(stderr, "io_uring_submit error: %ld\n", long(ret)); |
915 | | return IOStatus::IOError("io_uring_submit() requested but returned " + |
916 | | std::to_string(ret)); |
917 | | } |
918 | | return IOStatus::OK(); |
919 | | #else |
920 | 0 | (void)req; |
921 | 0 | (void)cb; |
922 | 0 | (void)cb_arg; |
923 | 0 | (void)io_handle; |
924 | 0 | (void)del_fn; |
925 | 0 | return IOStatus::NotSupported("ReadAsync"); |
926 | 0 | #endif |
927 | 0 | } |
928 | | |
929 | | /* |
930 | | * PosixMmapReadableFile |
931 | | * |
932 | | * mmap() based random-access |
933 | | */ |
934 | | // base[0,length-1] contains the mmapped contents of the file. |
935 | | PosixMmapReadableFile::PosixMmapReadableFile(const int fd, |
936 | | const std::string& fname, |
937 | | void* base, size_t length, |
938 | | const EnvOptions& options) |
939 | 0 | : fd_(fd), filename_(fname), mmapped_region_(base), length_(length) { |
940 | 0 | #ifdef NDEBUG |
941 | 0 | (void)options; |
942 | 0 | #endif |
943 | 0 | fd_ = fd_ + 0; // suppress the warning for used variables |
944 | 0 | assert(options.use_mmap_reads); |
945 | 0 | assert(!options.use_direct_reads); |
946 | 0 | } |
947 | | |
948 | 0 | PosixMmapReadableFile::~PosixMmapReadableFile() { |
949 | 0 | int ret = munmap(mmapped_region_, length_); |
950 | 0 | if (ret != 0) { |
951 | 0 | fprintf(stdout, "failed to munmap %p length %" ROCKSDB_PRIszt " \n", |
952 | 0 | mmapped_region_, length_); |
953 | 0 | } |
954 | 0 | close(fd_); |
955 | 0 | } |
956 | | |
957 | | IOStatus PosixMmapReadableFile::Read(uint64_t offset, size_t n, |
958 | | const IOOptions& /*opts*/, Slice* result, |
959 | | char* /*scratch*/, |
960 | 0 | IODebugContext* /*dbg*/) const { |
961 | 0 | IOStatus s; |
962 | 0 | if (offset > length_) { |
963 | 0 | *result = Slice(); |
964 | 0 | return IOError("While mmap read offset " + std::to_string(offset) + |
965 | 0 | " larger than file length " + std::to_string(length_), |
966 | 0 | filename_, EINVAL); |
967 | 0 | } else if (offset + n > length_) { |
968 | 0 | n = static_cast<size_t>(length_ - offset); |
969 | 0 | } |
970 | 0 | *result = Slice(static_cast<char*>(mmapped_region_) + offset, n); |
971 | 0 | return s; |
972 | 0 | } |
973 | | |
974 | 0 | void PosixMmapReadableFile::Hint(AccessPattern pattern) { |
975 | 0 | switch (pattern) { |
976 | 0 | case kNormal: |
977 | 0 | Madvise(mmapped_region_, length_, POSIX_MADV_NORMAL); |
978 | 0 | break; |
979 | 0 | case kRandom: |
980 | 0 | Madvise(mmapped_region_, length_, POSIX_MADV_RANDOM); |
981 | 0 | break; |
982 | 0 | case kSequential: |
983 | 0 | Madvise(mmapped_region_, length_, POSIX_MADV_SEQUENTIAL); |
984 | 0 | break; |
985 | 0 | case kWillNeed: |
986 | 0 | Madvise(mmapped_region_, length_, POSIX_MADV_WILLNEED); |
987 | 0 | break; |
988 | 0 | case kWontNeed: |
989 | 0 | Madvise(mmapped_region_, length_, POSIX_MADV_DONTNEED); |
990 | 0 | break; |
991 | 0 | default: |
992 | 0 | assert(false); |
993 | 0 | break; |
994 | 0 | } |
995 | 0 | } |
996 | | |
997 | 0 | IOStatus PosixMmapReadableFile::InvalidateCache(size_t offset, size_t length) { |
998 | | #ifndef OS_LINUX |
999 | | (void)offset; |
1000 | | (void)length; |
1001 | | return IOStatus::OK(); |
1002 | | #else |
1003 | | // free OS pages |
1004 | 0 | int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED); |
1005 | 0 | if (ret == 0) { |
1006 | 0 | return IOStatus::OK(); |
1007 | 0 | } |
1008 | 0 | return IOError("While fadvise not needed. Offset " + std::to_string(offset) + |
1009 | 0 | " len" + std::to_string(length), |
1010 | 0 | filename_, errno); |
1011 | 0 | #endif |
1012 | 0 | } |
1013 | | |
1014 | | /* |
1015 | | * PosixMmapFile |
1016 | | * |
1017 | | * We preallocate up to an extra megabyte and use memcpy to append new |
1018 | | * data to the file. This is safe since we either properly close the |
1019 | | * file before reading from it, or for log files, the reading code |
1020 | | * knows enough to skip zero suffixes. |
1021 | | */ |
1022 | 0 | IOStatus PosixMmapFile::UnmapCurrentRegion() { |
1023 | 0 | TEST_KILL_RANDOM("PosixMmapFile::UnmapCurrentRegion:0"); |
1024 | 0 | if (base_ != nullptr) { |
1025 | 0 | int munmap_status = munmap(base_, limit_ - base_); |
1026 | 0 | if (munmap_status != 0) { |
1027 | 0 | return IOError("While munmap", filename_, munmap_status); |
1028 | 0 | } |
1029 | 0 | file_offset_ += limit_ - base_; |
1030 | 0 | base_ = nullptr; |
1031 | 0 | limit_ = nullptr; |
1032 | 0 | last_sync_ = nullptr; |
1033 | 0 | dst_ = nullptr; |
1034 | | |
1035 | | // Increase the amount we map the next time, but capped at 1MB |
1036 | 0 | if (map_size_ < (1 << 20)) { |
1037 | 0 | map_size_ *= 2; |
1038 | 0 | } |
1039 | 0 | } |
1040 | 0 | return IOStatus::OK(); |
1041 | 0 | } |
1042 | | |
1043 | 0 | IOStatus PosixMmapFile::MapNewRegion() { |
1044 | 0 | #ifdef ROCKSDB_FALLOCATE_PRESENT |
1045 | 0 | assert(base_ == nullptr); |
1046 | 0 | TEST_KILL_RANDOM("PosixMmapFile::UnmapCurrentRegion:0"); |
1047 | | // we can't fallocate with FALLOC_FL_KEEP_SIZE here |
1048 | 0 | if (allow_fallocate_) { |
1049 | 0 | IOSTATS_TIMER_GUARD(allocate_nanos); |
1050 | 0 | int alloc_status = fallocate(fd_, 0, file_offset_, map_size_); |
1051 | 0 | if (alloc_status != 0) { |
1052 | | // fallback to posix_fallocate |
1053 | 0 | alloc_status = posix_fallocate(fd_, file_offset_, map_size_); |
1054 | 0 | } |
1055 | 0 | if (alloc_status != 0) { |
1056 | 0 | return IOStatus::IOError("Error allocating space to file : " + filename_ + |
1057 | 0 | "Error : " + errnoStr(alloc_status).c_str()); |
1058 | 0 | } |
1059 | 0 | } |
1060 | | |
1061 | 0 | TEST_KILL_RANDOM("PosixMmapFile::Append:1"); |
1062 | 0 | void* ptr = mmap(nullptr, map_size_, PROT_READ | PROT_WRITE, MAP_SHARED, fd_, |
1063 | 0 | file_offset_); |
1064 | 0 | if (ptr == MAP_FAILED) { |
1065 | 0 | return IOStatus::IOError("MMap failed on " + filename_); |
1066 | 0 | } |
1067 | 0 | TEST_KILL_RANDOM("PosixMmapFile::Append:2"); |
1068 | |
|
1069 | 0 | base_ = static_cast<char*>(ptr); |
1070 | 0 | limit_ = base_ + map_size_; |
1071 | 0 | dst_ = base_; |
1072 | 0 | last_sync_ = base_; |
1073 | 0 | return IOStatus::OK(); |
1074 | | #else |
1075 | | return IOStatus::NotSupported("This platform doesn't support fallocate()"); |
1076 | | #endif |
1077 | 0 | } |
1078 | | |
1079 | 0 | IOStatus PosixMmapFile::Msync() { |
1080 | 0 | if (dst_ == last_sync_) { |
1081 | 0 | return IOStatus::OK(); |
1082 | 0 | } |
1083 | | // Find the beginnings of the pages that contain the first and last |
1084 | | // bytes to be synced. |
1085 | 0 | size_t p1 = TruncateToPageBoundary(last_sync_ - base_); |
1086 | 0 | size_t p2 = TruncateToPageBoundary(dst_ - base_ - 1); |
1087 | 0 | last_sync_ = dst_; |
1088 | 0 | TEST_KILL_RANDOM("PosixMmapFile::Msync:0"); |
1089 | 0 | if (msync(base_ + p1, p2 - p1 + page_size_, MS_SYNC) < 0) { |
1090 | 0 | return IOError("While msync", filename_, errno); |
1091 | 0 | } |
1092 | 0 | return IOStatus::OK(); |
1093 | 0 | } |
1094 | | |
1095 | | PosixMmapFile::PosixMmapFile(const std::string& fname, int fd, size_t page_size, |
1096 | | const EnvOptions& options) |
1097 | | : filename_(fname), |
1098 | | fd_(fd), |
1099 | | page_size_(page_size), |
1100 | | map_size_(Roundup(65536, page_size)), |
1101 | | base_(nullptr), |
1102 | | limit_(nullptr), |
1103 | | dst_(nullptr), |
1104 | | last_sync_(nullptr), |
1105 | 0 | file_offset_(0) { |
1106 | 0 | #ifdef ROCKSDB_FALLOCATE_PRESENT |
1107 | 0 | allow_fallocate_ = options.allow_fallocate; |
1108 | 0 | fallocate_with_keep_size_ = options.fallocate_with_keep_size; |
1109 | | #else |
1110 | | (void)options; |
1111 | | #endif |
1112 | 0 | assert((page_size & (page_size - 1)) == 0); |
1113 | 0 | assert(options.use_mmap_writes); |
1114 | 0 | assert(!options.use_direct_writes); |
1115 | 0 | } |
1116 | | |
1117 | 0 | PosixMmapFile::~PosixMmapFile() { |
1118 | 0 | if (fd_ >= 0) { |
1119 | 0 | IOStatus s = PosixMmapFile::Close(IOOptions(), nullptr); |
1120 | 0 | s.PermitUncheckedError(); |
1121 | 0 | } |
1122 | 0 | } |
1123 | | |
1124 | | IOStatus PosixMmapFile::Append(const Slice& data, const IOOptions& /*opts*/, |
1125 | 0 | IODebugContext* /*dbg*/) { |
1126 | 0 | const char* src = data.data(); |
1127 | 0 | size_t left = data.size(); |
1128 | 0 | while (left > 0) { |
1129 | 0 | assert(base_ <= dst_); |
1130 | 0 | assert(dst_ <= limit_); |
1131 | 0 | size_t avail = limit_ - dst_; |
1132 | 0 | if (avail == 0) { |
1133 | 0 | IOStatus s = UnmapCurrentRegion(); |
1134 | 0 | if (!s.ok()) { |
1135 | 0 | return s; |
1136 | 0 | } |
1137 | 0 | s = MapNewRegion(); |
1138 | 0 | if (!s.ok()) { |
1139 | 0 | return s; |
1140 | 0 | } |
1141 | 0 | TEST_KILL_RANDOM("PosixMmapFile::Append:0"); |
1142 | 0 | } |
1143 | | |
1144 | 0 | size_t n = (left <= avail) ? left : avail; |
1145 | 0 | assert(dst_); |
1146 | 0 | memcpy(dst_, src, n); |
1147 | 0 | dst_ += n; |
1148 | 0 | src += n; |
1149 | 0 | left -= n; |
1150 | 0 | } |
1151 | 0 | return IOStatus::OK(); |
1152 | 0 | } |
1153 | | |
1154 | | IOStatus PosixMmapFile::Close(const IOOptions& /*opts*/, |
1155 | 0 | IODebugContext* /*dbg*/) { |
1156 | 0 | IOStatus s; |
1157 | 0 | size_t unused = limit_ - dst_; |
1158 | |
|
1159 | 0 | s = UnmapCurrentRegion(); |
1160 | 0 | if (!s.ok()) { |
1161 | 0 | s = IOError("While closing mmapped file", filename_, errno); |
1162 | 0 | } else if (unused > 0) { |
1163 | | // Trim the extra space at the end of the file |
1164 | 0 | if (ftruncate(fd_, file_offset_ - unused) < 0) { |
1165 | 0 | s = IOError("While ftruncating mmaped file", filename_, errno); |
1166 | 0 | } |
1167 | 0 | } |
1168 | |
|
1169 | 0 | if (close(fd_) < 0) { |
1170 | 0 | if (s.ok()) { |
1171 | 0 | s = IOError("While closing mmapped file", filename_, errno); |
1172 | 0 | } |
1173 | 0 | } |
1174 | |
|
1175 | 0 | fd_ = -1; |
1176 | 0 | base_ = nullptr; |
1177 | 0 | limit_ = nullptr; |
1178 | 0 | return s; |
1179 | 0 | } |
1180 | | |
1181 | | IOStatus PosixMmapFile::Flush(const IOOptions& /*opts*/, |
1182 | 0 | IODebugContext* /*dbg*/) { |
1183 | 0 | return IOStatus::OK(); |
1184 | 0 | } |
1185 | | |
1186 | | IOStatus PosixMmapFile::Sync(const IOOptions& /*opts*/, |
1187 | 0 | IODebugContext* /*dbg*/) { |
1188 | | #ifdef HAVE_FULLFSYNC |
1189 | | if (::fcntl(fd_, F_FULLFSYNC) < 0) { |
1190 | | return IOError("while fcntl(F_FULLSYNC) mmapped file", filename_, errno); |
1191 | | } |
1192 | | #else // HAVE_FULLFSYNC |
1193 | 0 | if (fdatasync(fd_) < 0) { |
1194 | 0 | return IOError("While fdatasync mmapped file", filename_, errno); |
1195 | 0 | } |
1196 | 0 | #endif // HAVE_FULLFSYNC |
1197 | | |
1198 | 0 | return Msync(); |
1199 | 0 | } |
1200 | | |
1201 | | /** |
1202 | | * Flush data as well as metadata to stable storage. |
1203 | | */ |
1204 | | IOStatus PosixMmapFile::Fsync(const IOOptions& /*opts*/, |
1205 | 0 | IODebugContext* /*dbg*/) { |
1206 | | #ifdef HAVE_FULLFSYNC |
1207 | | if (::fcntl(fd_, F_FULLFSYNC) < 0) { |
1208 | | return IOError("While fcntl(F_FULLSYNC) on mmaped file", filename_, errno); |
1209 | | } |
1210 | | #else // HAVE_FULLFSYNC |
1211 | 0 | if (fsync(fd_) < 0) { |
1212 | 0 | return IOError("While fsync mmaped file", filename_, errno); |
1213 | 0 | } |
1214 | 0 | #endif // HAVE_FULLFSYNC |
1215 | | |
1216 | 0 | return Msync(); |
1217 | 0 | } |
1218 | | |
1219 | | /** |
1220 | | * Get the size of valid data in the file. This will not match the |
1221 | | * size that is returned from the filesystem because we use mmap |
1222 | | * to extend file by map_size every time. |
1223 | | */ |
1224 | | uint64_t PosixMmapFile::GetFileSize(const IOOptions& /*opts*/, |
1225 | 0 | IODebugContext* /*dbg*/) { |
1226 | 0 | size_t used = dst_ - base_; |
1227 | 0 | return file_offset_ + used; |
1228 | 0 | } |
1229 | | |
1230 | 0 | IOStatus PosixMmapFile::InvalidateCache(size_t offset, size_t length) { |
1231 | | #ifndef OS_LINUX |
1232 | | (void)offset; |
1233 | | (void)length; |
1234 | | return IOStatus::OK(); |
1235 | | #else |
1236 | | // free OS pages |
1237 | 0 | int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED); |
1238 | 0 | if (ret == 0) { |
1239 | 0 | return IOStatus::OK(); |
1240 | 0 | } |
1241 | 0 | return IOError("While fadvise NotNeeded mmapped file", filename_, errno); |
1242 | 0 | #endif |
1243 | 0 | } |
1244 | | |
1245 | | #ifdef ROCKSDB_FALLOCATE_PRESENT |
1246 | | IOStatus PosixMmapFile::Allocate(uint64_t offset, uint64_t len, |
1247 | | const IOOptions& /*opts*/, |
1248 | 0 | IODebugContext* /*dbg*/) { |
1249 | 0 | assert(offset <= static_cast<uint64_t>(std::numeric_limits<off_t>::max())); |
1250 | 0 | assert(len <= static_cast<uint64_t>(std::numeric_limits<off_t>::max())); |
1251 | 0 | TEST_KILL_RANDOM("PosixMmapFile::Allocate:0"); |
1252 | 0 | int alloc_status = 0; |
1253 | 0 | if (allow_fallocate_) { |
1254 | 0 | alloc_status = |
1255 | 0 | fallocate(fd_, fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0, |
1256 | 0 | static_cast<off_t>(offset), static_cast<off_t>(len)); |
1257 | 0 | } |
1258 | 0 | if (alloc_status == 0) { |
1259 | 0 | return IOStatus::OK(); |
1260 | 0 | } else { |
1261 | 0 | return IOError("While fallocate offset " + std::to_string(offset) + |
1262 | 0 | " len " + std::to_string(len), |
1263 | 0 | filename_, errno); |
1264 | 0 | } |
1265 | 0 | } |
1266 | | #endif |
1267 | | |
1268 | | /* |
1269 | | * PosixWritableFile |
1270 | | * |
1271 | | * Use posix write to write data to a file. |
1272 | | */ |
1273 | | PosixWritableFile::PosixWritableFile(const std::string& fname, int fd, |
1274 | | size_t logical_block_size, |
1275 | | const EnvOptions& options) |
1276 | | : FSWritableFile(options), |
1277 | | filename_(fname), |
1278 | | use_direct_io_(options.use_direct_writes), |
1279 | | fd_(fd), |
1280 | | filesize_(0), |
1281 | 72.1k | logical_sector_size_(logical_block_size) { |
1282 | 72.1k | #ifdef ROCKSDB_FALLOCATE_PRESENT |
1283 | 72.1k | allow_fallocate_ = options.allow_fallocate; |
1284 | 72.1k | fallocate_with_keep_size_ = options.fallocate_with_keep_size; |
1285 | 72.1k | #endif |
1286 | 72.1k | #ifdef ROCKSDB_RANGESYNC_PRESENT |
1287 | 72.1k | sync_file_range_supported_ = IsSyncFileRangeSupported(fd_); |
1288 | 72.1k | #endif // ROCKSDB_RANGESYNC_PRESENT |
1289 | 72.1k | assert(!options.use_mmap_writes); |
1290 | 72.1k | } |
1291 | | |
1292 | 72.1k | PosixWritableFile::~PosixWritableFile() { |
1293 | 72.1k | if (fd_ >= 0) { |
1294 | 18.0k | IOStatus s = PosixWritableFile::Close(IOOptions(), nullptr); |
1295 | 18.0k | s.PermitUncheckedError(); |
1296 | 18.0k | } |
1297 | 72.1k | } |
1298 | | |
1299 | | IOStatus PosixWritableFile::Append(const Slice& data, const IOOptions& /*opts*/, |
1300 | 1.51M | IODebugContext* /*dbg*/) { |
1301 | 1.51M | if (use_direct_io()) { |
1302 | 0 | assert(IsSectorAligned(data.size(), GetRequiredBufferAlignment())); |
1303 | 0 | assert(IsSectorAligned(data.data(), GetRequiredBufferAlignment())); |
1304 | 0 | } |
1305 | 1.51M | const char* src = data.data(); |
1306 | 1.51M | size_t nbytes = data.size(); |
1307 | | |
1308 | 1.51M | if (!PosixWrite(fd_, src, nbytes)) { |
1309 | 0 | return IOError("While appending to file", filename_, errno); |
1310 | 0 | } |
1311 | | |
1312 | 1.51M | filesize_ += nbytes; |
1313 | 1.51M | return IOStatus::OK(); |
1314 | 1.51M | } |
1315 | | |
1316 | | IOStatus PosixWritableFile::PositionedAppend(const Slice& data, uint64_t offset, |
1317 | | const IOOptions& /*opts*/, |
1318 | 0 | IODebugContext* /*dbg*/) { |
1319 | 0 | if (use_direct_io()) { |
1320 | 0 | assert(IsSectorAligned(offset, GetRequiredBufferAlignment())); |
1321 | 0 | assert(IsSectorAligned(data.size(), GetRequiredBufferAlignment())); |
1322 | 0 | assert(IsSectorAligned(data.data(), GetRequiredBufferAlignment())); |
1323 | 0 | } |
1324 | 0 | assert(offset <= static_cast<uint64_t>(std::numeric_limits<off_t>::max())); |
1325 | 0 | const char* src = data.data(); |
1326 | 0 | size_t nbytes = data.size(); |
1327 | 0 | if (!PosixPositionedWrite(fd_, src, nbytes, static_cast<off_t>(offset))) { |
1328 | 0 | return IOError("While pwrite to file at offset " + std::to_string(offset), |
1329 | 0 | filename_, errno); |
1330 | 0 | } |
1331 | 0 | filesize_ = offset + nbytes; |
1332 | 0 | return IOStatus::OK(); |
1333 | 0 | } |
1334 | | |
1335 | | IOStatus PosixWritableFile::Truncate(uint64_t size, const IOOptions& /*opts*/, |
1336 | 4.51k | IODebugContext* /*dbg*/) { |
1337 | 4.51k | IOStatus s; |
1338 | 4.51k | int r = ftruncate(fd_, size); |
1339 | 4.51k | if (r < 0) { |
1340 | 0 | s = IOError("While ftruncate file to size " + std::to_string(size), |
1341 | 0 | filename_, errno); |
1342 | 4.51k | } else { |
1343 | 4.51k | filesize_ = size; |
1344 | 4.51k | } |
1345 | 4.51k | return s; |
1346 | 4.51k | } |
1347 | | |
1348 | | IOStatus PosixWritableFile::Close(const IOOptions& /*opts*/, |
1349 | 72.1k | IODebugContext* /*dbg*/) { |
1350 | 72.1k | IOStatus s; |
1351 | | |
1352 | 72.1k | size_t block_size; |
1353 | 72.1k | size_t last_allocated_block; |
1354 | 72.1k | GetPreallocationStatus(&block_size, &last_allocated_block); |
1355 | 72.1k | TEST_SYNC_POINT_CALLBACK("PosixWritableFile::Close", &last_allocated_block); |
1356 | 72.1k | if (last_allocated_block > 0) { |
1357 | | // trim the extra space preallocated at the end of the file |
1358 | | // NOTE(ljin): we probably don't want to surface failure as an IOError, |
1359 | | // but it will be nice to log these errors. |
1360 | 18.0k | int dummy __attribute__((__unused__)); |
1361 | 18.0k | dummy = ftruncate(fd_, filesize_); |
1362 | 18.0k | #if defined(ROCKSDB_FALLOCATE_PRESENT) && defined(FALLOC_FL_PUNCH_HOLE) |
1363 | | // in some file systems, ftruncate only trims trailing space if the |
1364 | | // new file size is smaller than the current size. Calling fallocate |
1365 | | // with FALLOC_FL_PUNCH_HOLE flag to explicitly release these unused |
1366 | | // blocks. FALLOC_FL_PUNCH_HOLE is supported on at least the following |
1367 | | // filesystems: |
1368 | | // XFS (since Linux 2.6.38) |
1369 | | // ext4 (since Linux 3.0) |
1370 | | // Btrfs (since Linux 3.7) |
1371 | | // tmpfs (since Linux 3.5) |
1372 | | // We ignore error since failure of this operation does not affect |
1373 | | // correctness. |
1374 | 18.0k | struct stat file_stats; |
1375 | 18.0k | int result = fstat(fd_, &file_stats); |
1376 | | // After ftruncate, we check whether ftruncate has the correct behavior. |
1377 | | // If not, we should hack it with FALLOC_FL_PUNCH_HOLE |
1378 | 18.0k | if (result == 0 && |
1379 | 18.0k | (file_stats.st_size + file_stats.st_blksize - 1) / |
1380 | 18.0k | file_stats.st_blksize != |
1381 | 18.0k | file_stats.st_blocks / (file_stats.st_blksize / 512)) { |
1382 | 0 | IOSTATS_TIMER_GUARD(allocate_nanos); |
1383 | 0 | if (allow_fallocate_) { |
1384 | 0 | fallocate(fd_, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, filesize_, |
1385 | 0 | block_size * last_allocated_block - filesize_); |
1386 | 0 | } |
1387 | 0 | } |
1388 | 18.0k | #endif |
1389 | 18.0k | } |
1390 | | |
1391 | 72.1k | if (close(fd_) < 0) { |
1392 | 0 | s = IOError("While closing file after writing", filename_, errno); |
1393 | 0 | } |
1394 | 72.1k | fd_ = -1; |
1395 | 72.1k | return s; |
1396 | 72.1k | } |
1397 | | |
1398 | | // write out the cached data to the OS cache |
1399 | | IOStatus PosixWritableFile::Flush(const IOOptions& /*opts*/, |
1400 | 1.57M | IODebugContext* /*dbg*/) { |
1401 | 1.57M | return IOStatus::OK(); |
1402 | 1.57M | } |
1403 | | |
1404 | | IOStatus PosixWritableFile::Sync(const IOOptions& /*opts*/, |
1405 | 36.0k | IODebugContext* /*dbg*/) { |
1406 | | #ifdef HAVE_FULLFSYNC |
1407 | | if (::fcntl(fd_, F_FULLFSYNC) < 0) { |
1408 | | return IOError("while fcntl(F_FULLFSYNC)", filename_, errno); |
1409 | | } |
1410 | | #else // HAVE_FULLFSYNC |
1411 | 36.0k | if (fdatasync(fd_) < 0) { |
1412 | 0 | return IOError("While fdatasync", filename_, errno); |
1413 | 0 | } |
1414 | 36.0k | #endif // HAVE_FULLFSYNC |
1415 | 36.0k | return IOStatus::OK(); |
1416 | 36.0k | } |
1417 | | |
1418 | | IOStatus PosixWritableFile::Fsync(const IOOptions& /*opts*/, |
1419 | 9.02k | IODebugContext* /*dbg*/) { |
1420 | | #ifdef HAVE_FULLFSYNC |
1421 | | if (::fcntl(fd_, F_FULLFSYNC) < 0) { |
1422 | | return IOError("while fcntl(F_FULLFSYNC)", filename_, errno); |
1423 | | } |
1424 | | #else // HAVE_FULLFSYNC |
1425 | 9.02k | if (fsync(fd_) < 0) { |
1426 | 0 | return IOError("While fsync", filename_, errno); |
1427 | 0 | } |
1428 | 9.02k | #endif // HAVE_FULLFSYNC |
1429 | 9.02k | return IOStatus::OK(); |
1430 | 9.02k | } |
1431 | | |
1432 | 0 | bool PosixWritableFile::IsSyncThreadSafe() const { return true; } |
1433 | | |
1434 | | uint64_t PosixWritableFile::GetFileSize(const IOOptions& /*opts*/, |
1435 | 1.49M | IODebugContext* /*dbg*/) { |
1436 | 1.49M | return filesize_; |
1437 | 1.49M | } |
1438 | | |
1439 | 13.5k | void PosixWritableFile::SetWriteLifeTimeHint(Env::WriteLifeTimeHint hint) { |
1440 | 13.5k | #ifdef OS_LINUX |
1441 | | // Suppress Valgrind "Unimplemented functionality" error. |
1442 | 13.5k | #ifndef ROCKSDB_VALGRIND_RUN |
1443 | 13.5k | uint64_t fcntl_hint = hint; |
1444 | | |
1445 | 13.5k | if (hint == write_hint_) { |
1446 | 0 | return; |
1447 | 0 | } |
1448 | 13.5k | if (fcntl(fd_, F_SET_RW_HINT, &fcntl_hint) == 0) { |
1449 | 13.5k | write_hint_ = hint; |
1450 | 13.5k | } |
1451 | | #else |
1452 | | (void)hint; |
1453 | | #endif // ROCKSDB_VALGRIND_RUN |
1454 | | #else |
1455 | | (void)hint; |
1456 | | #endif // OS_LINUX |
1457 | 13.5k | } |
1458 | | |
1459 | 0 | IOStatus PosixWritableFile::InvalidateCache(size_t offset, size_t length) { |
1460 | 0 | if (use_direct_io()) { |
1461 | 0 | return IOStatus::OK(); |
1462 | 0 | } |
1463 | | #ifndef OS_LINUX |
1464 | | (void)offset; |
1465 | | (void)length; |
1466 | | return IOStatus::OK(); |
1467 | | #else |
1468 | | // free OS pages |
1469 | 0 | int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED); |
1470 | 0 | if (ret == 0) { |
1471 | 0 | return IOStatus::OK(); |
1472 | 0 | } |
1473 | 0 | return IOError("While fadvise NotNeeded", filename_, errno); |
1474 | 0 | #endif |
1475 | 0 | } |
1476 | | |
1477 | | #ifdef ROCKSDB_FALLOCATE_PRESENT |
1478 | | IOStatus PosixWritableFile::Allocate(uint64_t offset, uint64_t len, |
1479 | | const IOOptions& /*opts*/, |
1480 | 18.0k | IODebugContext* /*dbg*/) { |
1481 | 18.0k | assert(offset <= static_cast<uint64_t>(std::numeric_limits<off_t>::max())); |
1482 | 18.0k | assert(len <= static_cast<uint64_t>(std::numeric_limits<off_t>::max())); |
1483 | 18.0k | TEST_KILL_RANDOM("PosixWritableFile::Allocate:0"); |
1484 | 18.0k | IOSTATS_TIMER_GUARD(allocate_nanos); |
1485 | 18.0k | int alloc_status = 0; |
1486 | 18.0k | if (allow_fallocate_) { |
1487 | 18.0k | alloc_status = |
1488 | 18.0k | fallocate(fd_, fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0, |
1489 | 18.0k | static_cast<off_t>(offset), static_cast<off_t>(len)); |
1490 | 18.0k | } |
1491 | 18.0k | if (alloc_status == 0) { |
1492 | 18.0k | return IOStatus::OK(); |
1493 | 18.0k | } else { |
1494 | 0 | return IOError("While fallocate offset " + std::to_string(offset) + |
1495 | 0 | " len " + std::to_string(len), |
1496 | 0 | filename_, errno); |
1497 | 0 | } |
1498 | 18.0k | } |
1499 | | #endif |
1500 | | |
1501 | | IOStatus PosixWritableFile::RangeSync(uint64_t offset, uint64_t nbytes, |
1502 | | const IOOptions& opts, |
1503 | 0 | IODebugContext* dbg) { |
1504 | 0 | #ifdef ROCKSDB_RANGESYNC_PRESENT |
1505 | 0 | assert(offset <= static_cast<uint64_t>(std::numeric_limits<off_t>::max())); |
1506 | 0 | assert(nbytes <= static_cast<uint64_t>(std::numeric_limits<off_t>::max())); |
1507 | 0 | if (sync_file_range_supported_) { |
1508 | 0 | int ret; |
1509 | 0 | if (strict_bytes_per_sync_) { |
1510 | | // Specifying `SYNC_FILE_RANGE_WAIT_BEFORE` together with an offset/length |
1511 | | // that spans all bytes written so far tells `sync_file_range` to wait for |
1512 | | // any outstanding writeback requests to finish before issuing a new one. |
1513 | 0 | ret = |
1514 | 0 | sync_file_range(fd_, 0, static_cast<off_t>(offset + nbytes), |
1515 | 0 | SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE); |
1516 | 0 | } else { |
1517 | 0 | ret = sync_file_range(fd_, static_cast<off_t>(offset), |
1518 | 0 | static_cast<off_t>(nbytes), SYNC_FILE_RANGE_WRITE); |
1519 | 0 | } |
1520 | 0 | if (ret != 0) { |
1521 | 0 | return IOError("While sync_file_range returned " + std::to_string(ret), |
1522 | 0 | filename_, errno); |
1523 | 0 | } |
1524 | 0 | return IOStatus::OK(); |
1525 | 0 | } |
1526 | 0 | #endif // ROCKSDB_RANGESYNC_PRESENT |
1527 | 0 | return FSWritableFile::RangeSync(offset, nbytes, opts, dbg); |
1528 | 0 | } |
1529 | | |
1530 | | #ifdef OS_LINUX |
1531 | 0 | size_t PosixWritableFile::GetUniqueId(char* id, size_t max_size) const { |
1532 | 0 | return PosixHelper::GetUniqueIdFromFile(fd_, id, max_size); |
1533 | 0 | } |
1534 | | #endif |
1535 | | |
1536 | | /* |
1537 | | * PosixRandomRWFile |
1538 | | */ |
1539 | | |
1540 | | PosixRandomRWFile::PosixRandomRWFile(const std::string& fname, int fd, |
1541 | | const EnvOptions& /*options*/) |
1542 | 0 | : filename_(fname), fd_(fd) {} |
1543 | | |
1544 | 0 | PosixRandomRWFile::~PosixRandomRWFile() { |
1545 | 0 | if (fd_ >= 0) { |
1546 | 0 | IOStatus s = Close(IOOptions(), nullptr); |
1547 | 0 | s.PermitUncheckedError(); |
1548 | 0 | } |
1549 | 0 | } |
1550 | | |
1551 | | IOStatus PosixRandomRWFile::Write(uint64_t offset, const Slice& data, |
1552 | | const IOOptions& /*opts*/, |
1553 | 0 | IODebugContext* /*dbg*/) { |
1554 | 0 | const char* src = data.data(); |
1555 | 0 | size_t nbytes = data.size(); |
1556 | 0 | if (!PosixPositionedWrite(fd_, src, nbytes, static_cast<off_t>(offset))) { |
1557 | 0 | return IOError("While write random read/write file at offset " + |
1558 | 0 | std::to_string(offset), |
1559 | 0 | filename_, errno); |
1560 | 0 | } |
1561 | | |
1562 | 0 | return IOStatus::OK(); |
1563 | 0 | } |
1564 | | |
1565 | | IOStatus PosixRandomRWFile::Read(uint64_t offset, size_t n, |
1566 | | const IOOptions& /*opts*/, Slice* result, |
1567 | 0 | char* scratch, IODebugContext* /*dbg*/) const { |
1568 | 0 | size_t left = n; |
1569 | 0 | char* ptr = scratch; |
1570 | 0 | while (left > 0) { |
1571 | 0 | ssize_t done = pread(fd_, ptr, left, offset); |
1572 | 0 | if (done < 0) { |
1573 | | // error while reading from file |
1574 | 0 | if (errno == EINTR) { |
1575 | | // read was interrupted, try again. |
1576 | 0 | continue; |
1577 | 0 | } |
1578 | 0 | return IOError("While reading random read/write file offset " + |
1579 | 0 | std::to_string(offset) + " len " + std::to_string(n), |
1580 | 0 | filename_, errno); |
1581 | 0 | } else if (done == 0) { |
1582 | | // Nothing more to read |
1583 | 0 | break; |
1584 | 0 | } |
1585 | | |
1586 | | // Read `done` bytes |
1587 | 0 | ptr += done; |
1588 | 0 | offset += done; |
1589 | 0 | left -= done; |
1590 | 0 | } |
1591 | | |
1592 | 0 | *result = Slice(scratch, n - left); |
1593 | 0 | return IOStatus::OK(); |
1594 | 0 | } |
1595 | | |
1596 | | IOStatus PosixRandomRWFile::Flush(const IOOptions& /*opts*/, |
1597 | 0 | IODebugContext* /*dbg*/) { |
1598 | 0 | return IOStatus::OK(); |
1599 | 0 | } |
1600 | | |
1601 | | IOStatus PosixRandomRWFile::Sync(const IOOptions& /*opts*/, |
1602 | 0 | IODebugContext* /*dbg*/) { |
1603 | | #ifdef HAVE_FULLFSYNC |
1604 | | if (::fcntl(fd_, F_FULLFSYNC) < 0) { |
1605 | | return IOError("while fcntl(F_FULLFSYNC) random rw file", filename_, errno); |
1606 | | } |
1607 | | #else // HAVE_FULLFSYNC |
1608 | 0 | if (fdatasync(fd_) < 0) { |
1609 | 0 | return IOError("While fdatasync random read/write file", filename_, errno); |
1610 | 0 | } |
1611 | 0 | #endif // HAVE_FULLFSYNC |
1612 | 0 | return IOStatus::OK(); |
1613 | 0 | } |
1614 | | |
1615 | | IOStatus PosixRandomRWFile::Fsync(const IOOptions& /*opts*/, |
1616 | 0 | IODebugContext* /*dbg*/) { |
1617 | | #ifdef HAVE_FULLFSYNC |
1618 | | if (::fcntl(fd_, F_FULLFSYNC) < 0) { |
1619 | | return IOError("While fcntl(F_FULLSYNC) random rw file", filename_, errno); |
1620 | | } |
1621 | | #else // HAVE_FULLFSYNC |
1622 | 0 | if (fsync(fd_) < 0) { |
1623 | 0 | return IOError("While fsync random read/write file", filename_, errno); |
1624 | 0 | } |
1625 | 0 | #endif // HAVE_FULLFSYNC |
1626 | 0 | return IOStatus::OK(); |
1627 | 0 | } |
1628 | | |
1629 | | IOStatus PosixRandomRWFile::Close(const IOOptions& /*opts*/, |
1630 | 0 | IODebugContext* /*dbg*/) { |
1631 | 0 | if (close(fd_) < 0) { |
1632 | 0 | return IOError("While close random read/write file", filename_, errno); |
1633 | 0 | } |
1634 | 0 | fd_ = -1; |
1635 | 0 | return IOStatus::OK(); |
1636 | 0 | } |
1637 | | |
1638 | 0 | PosixMemoryMappedFileBuffer::~PosixMemoryMappedFileBuffer() { |
1639 | | // TODO should have error handling though not much we can do... |
1640 | 0 | munmap(this->base_, length_); |
1641 | 0 | } |
1642 | | |
1643 | | /* |
1644 | | * PosixDirectory |
1645 | | */ |
1646 | | #if !defined(BTRFS_SUPER_MAGIC) |
1647 | | // The magic number for BTRFS is fixed, if it's not defined, define it here |
1648 | 31.5k | #define BTRFS_SUPER_MAGIC 0x9123683E |
1649 | | #endif |
1650 | | PosixDirectory::PosixDirectory(int fd, const std::string& directory_name) |
1651 | 31.5k | : fd_(fd), directory_name_(directory_name) { |
1652 | 31.5k | is_btrfs_ = false; |
1653 | 31.5k | #ifdef OS_LINUX |
1654 | 31.5k | struct statfs buf; |
1655 | 31.5k | int ret = fstatfs(fd, &buf); |
1656 | 31.5k | is_btrfs_ = (ret == 0 && buf.f_type == static_cast<decltype(buf.f_type)>( |
1657 | 31.5k | BTRFS_SUPER_MAGIC)); |
1658 | 31.5k | #endif |
1659 | 31.5k | } |
1660 | | |
1661 | 31.5k | PosixDirectory::~PosixDirectory() { |
1662 | 31.5k | if (fd_ >= 0) { |
1663 | 9.02k | IOStatus s = PosixDirectory::Close(IOOptions(), nullptr); |
1664 | 9.02k | s.PermitUncheckedError(); |
1665 | 9.02k | } |
1666 | 31.5k | } |
1667 | | |
1668 | 0 | IOStatus PosixDirectory::Fsync(const IOOptions& opts, IODebugContext* dbg) { |
1669 | 0 | return FsyncWithDirOptions(opts, dbg, DirFsyncOptions()); |
1670 | 0 | } |
1671 | | |
1672 | | // Users who want the file entries synced in Directory project must call a |
1673 | | // Fsync or FsyncWithDirOptions function before Close |
1674 | | IOStatus PosixDirectory::Close(const IOOptions& /*opts*/, |
1675 | 31.5k | IODebugContext* /*dbg*/) { |
1676 | 31.5k | IOStatus s = IOStatus::OK(); |
1677 | 31.5k | if (close(fd_) < 0) { |
1678 | 0 | s = IOError("While closing directory ", directory_name_, errno); |
1679 | 31.5k | } else { |
1680 | 31.5k | fd_ = -1; |
1681 | 31.5k | } |
1682 | 31.5k | return s; |
1683 | 31.5k | } |
1684 | | |
1685 | | IOStatus PosixDirectory::FsyncWithDirOptions( |
1686 | | const IOOptions& /*opts*/, IODebugContext* /*dbg*/, |
1687 | 27.0k | const DirFsyncOptions& dir_fsync_options) { |
1688 | 27.0k | assert(fd_ >= 0); // Check use after close |
1689 | 27.0k | IOStatus s = IOStatus::OK(); |
1690 | 27.0k | #ifndef OS_AIX |
1691 | 27.0k | if (is_btrfs_) { |
1692 | | // skip dir fsync for new file creation, which is not needed for btrfs |
1693 | 0 | if (dir_fsync_options.reason == DirFsyncOptions::kNewFileSynced) { |
1694 | 0 | return s; |
1695 | 0 | } |
1696 | | // skip dir fsync for renaming file, only need to sync new file |
1697 | 0 | if (dir_fsync_options.reason == DirFsyncOptions::kFileRenamed) { |
1698 | 0 | std::string new_name = dir_fsync_options.renamed_new_name; |
1699 | 0 | assert(!new_name.empty()); |
1700 | 0 | int fd; |
1701 | 0 | do { |
1702 | 0 | IOSTATS_TIMER_GUARD(open_nanos); |
1703 | 0 | fd = open(new_name.c_str(), O_RDONLY); |
1704 | 0 | } while (fd < 0 && errno == EINTR); |
1705 | 0 | if (fd < 0) { |
1706 | 0 | s = IOError("While open renaming file", new_name, errno); |
1707 | 0 | } else if (fsync(fd) < 0) { |
1708 | 0 | s = IOError("While fsync renaming file", new_name, errno); |
1709 | 0 | } |
1710 | 0 | if (close(fd) < 0) { |
1711 | 0 | s = IOError("While closing file after fsync", new_name, errno); |
1712 | 0 | } |
1713 | 0 | return s; |
1714 | 0 | } |
1715 | | // fallback to dir-fsync for kDefault, kDirRenamed and kFileDeleted |
1716 | 0 | } |
1717 | | |
1718 | | // skip fsync/fcntl when fd_ == -1 since this file descriptor has been closed |
1719 | | // in either the de-construction or the close function, data must have been |
1720 | | // fsync-ed before de-construction and close is called |
1721 | | #ifdef HAVE_FULLFSYNC |
1722 | | // btrfs is a Linux file system, while currently F_FULLFSYNC is available on |
1723 | | // Mac OS. |
1724 | | assert(!is_btrfs_); |
1725 | | if (fd_ != -1 && ::fcntl(fd_, F_FULLFSYNC) < 0) { |
1726 | | return IOError("while fcntl(F_FULLFSYNC)", "a directory", errno); |
1727 | | } |
1728 | | #else // HAVE_FULLFSYNC |
1729 | 27.0k | if (fd_ != -1 && fsync(fd_) == -1) { |
1730 | 0 | s = IOError("While fsync", "a directory", errno); |
1731 | 0 | } |
1732 | 27.0k | #endif // HAVE_FULLFSYNC |
1733 | 27.0k | #endif // OS_AIX |
1734 | 27.0k | return s; |
1735 | 27.0k | } |
1736 | | } // namespace ROCKSDB_NAMESPACE |
1737 | | #endif |