/src/rocksdb/env/io_posix.h
Line | Count | Source |
1 | | // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. |
2 | | // This source code is licensed under both the GPLv2 (found in the |
3 | | // COPYING file in the root directory) and Apache 2.0 License |
4 | | // (found in the LICENSE.Apache file in the root directory). |
5 | | // |
6 | | // Copyright (c) 2011 The LevelDB Authors. All rights reserved. |
7 | | // Use of this source code is governed by a BSD-style license that can be |
8 | | // found in the LICENSE file. See the AUTHORS file for names of contributors. |
9 | | #pragma once |
10 | | #include <errno.h> |
11 | | #if defined(ROCKSDB_IOURING_PRESENT) |
12 | | #include <liburing.h> |
13 | | #include <pthread.h> |
14 | | #include <sys/uio.h> |
15 | | |
16 | | #include <cstdio> |
17 | | |
18 | | #include "util/string_util.h" |
19 | | |
20 | | // Compatibility defines for io_uring flags that may not be present in older |
21 | | // kernel headers. These values are fixed and won't change, so it's safe to |
22 | | // define them even if the running kernel doesn't support them. |
23 | | #ifndef IORING_SETUP_SINGLE_ISSUER |
24 | | #define IORING_SETUP_SINGLE_ISSUER (1U << 12) |
25 | | #endif |
26 | | #ifndef IORING_SETUP_DEFER_TASKRUN |
27 | | #define IORING_SETUP_DEFER_TASKRUN (1U << 13) |
28 | | #endif |
29 | | #endif |
30 | | #include <unistd.h> |
31 | | |
32 | | #include <atomic> |
33 | | #include <functional> |
34 | | #include <map> |
35 | | #include <string> |
36 | | |
37 | | #include "port/lang.h" |
38 | | #include "port/port.h" |
39 | | #include "rocksdb/env.h" |
40 | | #include "rocksdb/file_system.h" |
41 | | #include "rocksdb/io_status.h" |
42 | | #include "test_util/sync_point.h" |
43 | | #include "util/mutexlock.h" |
44 | | #include "util/thread_local.h" |
45 | | |
46 | | // For platforms other than the ones listed below, the following macros are |
47 | | // used only as placeholders. |
48 | | #if !(defined OS_LINUX) && !(defined OS_FREEBSD) && !(defined CYGWIN) && \ |
49 | | !(defined OS_AIX) && !(defined OS_ANDROID) |
50 | | #define POSIX_FADV_NORMAL 0 /* [MC1] no further special treatment */ |
51 | | #define POSIX_FADV_RANDOM 1 /* [MC1] expect random page refs */ |
52 | | #define POSIX_FADV_SEQUENTIAL 2 /* [MC1] expect sequential page refs */ |
53 | | #define POSIX_FADV_WILLNEED 3 /* [MC1] will need these pages */ |
54 | | #define POSIX_FADV_DONTNEED 4 /* [MC1] don't need these pages */ |
55 | | |
56 | | #define POSIX_MADV_NORMAL 0 /* [MC1] no further special treatment */ |
57 | | #define POSIX_MADV_RANDOM 1 /* [MC1] expect random page refs */ |
58 | | #define POSIX_MADV_SEQUENTIAL 2 /* [MC1] expect sequential page refs */ |
59 | | #define POSIX_MADV_WILLNEED 3 /* [MC1] will need these pages */ |
60 | | #define POSIX_MADV_DONTNEED 4 /* [MC1] don't need these pages */ |
61 | | #endif |
62 | | |
63 | | namespace ROCKSDB_NAMESPACE { |
64 | | std::string IOErrorMsg(const std::string& context, |
65 | | const std::string& file_name); |
66 | | // file_name can be left empty if it is unknown. |
67 | | IOStatus IOError(const std::string& context, const std::string& file_name, |
68 | | int err_number); |
69 | | |
70 | | // SyncPoint payload used by deterministic TSAN regression tests to observe |
71 | | // which virtual address range a freshly created mapping occupies. |
72 | | struct TsanMappedMemoryInfo { |
73 | | const void* addr; |
74 | | size_t size; |
75 | | }; |
76 | | |
77 | | #ifdef __SANITIZE_THREAD__ |
78 | | extern "C" void AnnotateNewMemory(const char* file, int line, |
79 | | const volatile void* mem, long size); |
80 | | #endif // __SANITIZE_THREAD__ |
81 | | |
82 | 0 | inline void TsanAnnotateMappedMemory(const volatile void* mem, size_t size) { |
83 | 0 | TsanMappedMemoryInfo info{const_cast<const void*>(mem), size}; |
84 | 0 | TEST_SYNC_POINT_CALLBACK("TsanAnnotateMappedMemory", &info); |
85 | 0 | (void)info; |
86 | | #ifdef __SANITIZE_THREAD__ |
87 | | if (mem != nullptr && size != 0) { |
88 | | // TSAN does not understand that a new mmap or io_uring setup can legally |
89 | | // reuse a virtual address from an unrelated, previously unmapped region. |
90 | | // Reset the shadow state as soon as the new mapping exists so later |
91 | | // accesses are not reported against stale accesses from the old mapping. |
92 | | AnnotateNewMemory(__FILE__, __LINE__, mem, static_cast<long>(size)); |
93 | | } |
94 | | #endif // __SANITIZE_THREAD__ |
95 | 0 | } |
96 | | |
97 | | class PosixHelper { |
98 | | public: |
99 | 154k | static const std::string& GetLogicalBlockSizeFileName() { |
100 | 154k | static const std::string kLogicalBlockSizeFileName = "logical_block_size"; |
101 | 154k | return kLogicalBlockSizeFileName; |
102 | 154k | } |
103 | 14.5k | static const std::string& GetMaxSectorsKBFileName() { |
104 | 14.5k | static const std::string kMaxSectorsKBFileName = "max_sectors_kb"; |
105 | 14.5k | return kMaxSectorsKBFileName; |
106 | 14.5k | } |
107 | | static size_t GetUniqueIdFromFile(int fd, char* id, size_t max_size); |
108 | | static size_t GetLogicalBlockSizeOfFd(int fd); |
109 | | static Status GetLogicalBlockSizeOfDirectory(const std::string& directory, |
110 | | size_t* size); |
111 | | |
112 | | static Status GetMaxSectorsKBOfDirectory(const std::string& directory, |
113 | | size_t* kb); |
114 | | |
115 | | private: |
116 | | static const size_t kDefaultMaxSectorsKB = 2 * 1024; |
117 | | |
118 | | static size_t GetMaxSectorsKBOfFd(int fd); |
119 | | |
120 | | // Return the value in the specified `file_name` under |
121 | | // `/sys/block/xxx/queue/` for the device where the file of `fd` is on. |
122 | | // If not found, then return the specified `default_return_value` |
123 | | static size_t GetQueueSysfsFileValueOfFd(int fd, const std::string& file_name, |
124 | | size_t default_return_value); |
125 | | |
126 | | // Look up the value in the specified `file_name` under |
127 | | // `/sys/block/xxx/queue/` for the device where `directory` is on. |
128 | | // The result is passed back through `value`; no default is taken here, so a failed lookup is presumably reported via the returned Status (confirm in the .cc). |
129 | | static Status GetQueueSysfsFileValueofDirectory(const std::string& directory, |
130 | | const std::string& file_name, |
131 | | size_t* value); |
132 | | }; |
133 | | |
134 | | /* |
135 | | * DirectIOHelper |
136 | | */ |
137 | 0 | inline bool IsSectorAligned(const size_t off, size_t sector_size) { |
138 | 0 | assert((sector_size & (sector_size - 1)) == 0); |
139 | 0 | return (off & (sector_size - 1)) == 0; |
140 | 0 | } |
141 | | |
142 | | #ifndef NDEBUG |
143 | | inline bool IsSectorAligned(const void* ptr, size_t sector_size) { |
144 | | return uintptr_t(ptr) % sector_size == 0; |
145 | | } |
146 | | #endif |
147 | | |
148 | | #if defined(ROCKSDB_IOURING_PRESENT) |
149 | | struct Posix_IOHandle { |
150 | | Posix_IOHandle(struct io_uring* _iu, |
151 | | std::function<void(FSReadRequest&, void*)> _cb, void* _cb_arg, |
152 | | uint64_t _offset, size_t _len, char* _scratch, |
153 | | bool _use_direct_io, size_t _alignment) |
154 | | : iu(_iu), |
155 | | cb(_cb), |
156 | | cb_arg(_cb_arg), |
157 | | offset(_offset), |
158 | | len(_len), |
159 | | scratch(_scratch), |
160 | | use_direct_io(_use_direct_io), |
161 | | alignment(_alignment), |
162 | | is_finished(false), |
163 | | is_being_aborted(false), |
164 | | req_count(0) {} |
165 | | |
166 | | struct iovec iov; |
167 | | struct io_uring* iu; |
168 | | std::function<void(FSReadRequest&, void*)> cb; |
169 | | void* cb_arg; |
170 | | uint64_t offset; |
171 | | size_t len; |
172 | | char* scratch; |
173 | | bool use_direct_io; |
174 | | size_t alignment; |
175 | | bool is_finished; |
176 | | // is_being_aborted is set by AbortIO when a cancel request is submitted. |
177 | | // Used to distinguish between aborted handles (expect 2 completions) and |
178 | | // non-aborted handles (expect 1 completion) when processing completions. |
179 | | bool is_being_aborted; |
180 | | // req_count is used by AbortIO API to keep track of number of requests. |
181 | | uint32_t req_count; |
182 | | }; |
183 | | |
184 | | inline void UpdateResult(struct io_uring_cqe* cqe, const std::string& file_name, |
185 | | size_t len, size_t iov_len, bool async_read, |
186 | | bool use_direct_io, size_t alignment, |
187 | | size_t& finished_len, FSReadRequest* req, |
188 | | size_t& bytes_read, bool& read_again) { |
189 | | read_again = false; |
190 | | if (cqe->res < 0) { |
191 | | req->result = Slice(req->scratch, 0); |
192 | | req->status = IOError("Req failed", file_name, -cqe->res);  // cqe->res holds -errno; IOError() takes a positive errno |
193 | | } else { |
194 | | bytes_read = static_cast<size_t>(cqe->res); |
195 | | TEST_SYNC_POINT_CALLBACK("UpdateResults::io_uring_result", &bytes_read); |
196 | | if (bytes_read == iov_len) { |
197 | | req->result = Slice(req->scratch, req->len); |
198 | | req->status = IOStatus::OK(); |
199 | | } else if (bytes_read == 0) { |
200 | | // cqe->res == 0 can mean EOF, or can mean partial results. See |
201 | | // comment |
202 | | // https://github.com/facebook/rocksdb/pull/6441#issuecomment-589843435 |
203 | | // Fall back to pread in this case. |
204 | | if (use_direct_io && !IsSectorAligned(finished_len, alignment)) { |
205 | | // Bytes read don't fill sectors. Should only happen at the end |
206 | | // of the file. |
207 | | req->result = Slice(req->scratch, finished_len); |
208 | | req->status = IOStatus::OK(); |
209 | | } else { |
210 | | if (async_read) { |
211 | | // No bytes read. It can mean EOF. In case of partial results, it's |
212 | | // the caller's responsibility to call read/readasync again. |
213 | | req->result = Slice(req->scratch, 0); |
214 | | req->status = IOStatus::OK(); |
215 | | } else { |
216 | | read_again = true; |
217 | | } |
218 | | } |
219 | | } else if (bytes_read < iov_len) { |
220 | | assert(bytes_read > 0); |
221 | | if (async_read) { |
222 | | req->result = Slice(req->scratch, bytes_read); |
223 | | req->status = IOStatus::OK(); |
224 | | } else { |
225 | | assert(bytes_read + finished_len < len); |
226 | | finished_len += bytes_read; |
227 | | } |
228 | | } else { |
229 | | req->result = Slice(req->scratch, 0); |
230 | | req->status = IOError("Req returned more bytes than requested", file_name, |
231 | | EIO);  // cqe->res is a byte count here, not an errno |
232 | | } |
233 | | } |
234 | | #ifdef NDEBUG |
235 | | (void)len; |
236 | | #endif |
237 | | } |
238 | | |
239 | | // Finalize a completed async read request. |
240 | | // Processes the CQE result, marks the handle as finished, and invokes the |
241 | | // callback. This is shared between Poll and AbortIO (for non-aborted handles). |
242 | | inline void FinalizeAsyncRead(struct io_uring* iu, struct io_uring_cqe* cqe, |
243 | | Posix_IOHandle* posix_handle) { |
244 | | FSReadRequest req; |
245 | | req.scratch = posix_handle->scratch; |
246 | | req.offset = posix_handle->offset; |
247 | | req.len = posix_handle->len; |
248 | | |
249 | | size_t finished_len = 0; |
250 | | size_t bytes_read = 0; |
251 | | bool read_again = false; |
252 | | UpdateResult(cqe, "", req.len, posix_handle->iov.iov_len, true /*async_read*/, |
253 | | posix_handle->use_direct_io, posix_handle->alignment, |
254 | | finished_len, &req, bytes_read, read_again); |
255 | | posix_handle->is_finished = true; |
256 | | io_uring_cqe_seen(iu, cqe); |
257 | | posix_handle->cb(req, posix_handle->cb_arg); |
258 | | } |
259 | | #endif |
260 | | |
261 | | #ifdef OS_LINUX |
262 | | // Files under a specific directory have the same logical block size. |
263 | | // This class caches the logical block size for the specified directories to |
264 | | // save the CPU cost of computing the size. |
265 | | // Safe for concurrent access from multiple threads without any external |
266 | | // synchronization. |
267 | | class LogicalBlockSizeCache { |
268 | | public: |
269 | | LogicalBlockSizeCache( |
270 | | std::function<size_t(int)> get_logical_block_size_of_fd = |
271 | | PosixHelper::GetLogicalBlockSizeOfFd, |
272 | | std::function<Status(const std::string&, size_t*)> |
273 | | get_logical_block_size_of_directory = |
274 | | PosixHelper::GetLogicalBlockSizeOfDirectory) |
275 | 4 | : get_logical_block_size_of_fd_(get_logical_block_size_of_fd), |
276 | 4 | get_logical_block_size_of_directory_( |
277 | 4 | get_logical_block_size_of_directory) {} |
278 | | |
279 | | // Takes the following actions: |
280 | | // 1. Increases reference count of the directories; |
281 | | // 2. If the directory's logical block size is not cached, |
282 | | // compute the logical block size and cache the result. |
283 | | Status RefAndCacheLogicalBlockSize( |
284 | | const std::vector<std::string>& directories); |
285 | | |
286 | | // Takes the following actions: |
287 | | // 1. Decreases reference count of the directories; |
288 | | // 2. If the reference count of a directory reaches 0, remove the directory |
289 | | // from the cache. |
290 | | void UnrefAndTryRemoveCachedLogicalBlockSize( |
291 | | const std::vector<std::string>& directories); |
292 | | |
293 | | // Returns the logical block size for the file. |
294 | | // |
295 | | // If the file is under a cached directory, return the cached size. |
296 | | // Otherwise, the size is computed. |
297 | | size_t GetLogicalBlockSize(const std::string& fname, int fd); |
298 | | |
299 | 0 | int GetRefCount(const std::string& dir) { |
300 | 0 | ReadLock lock(&cache_mutex_); |
301 | 0 | auto it = cache_.find(dir); |
302 | 0 | if (it == cache_.end()) { |
303 | 0 | return 0; |
304 | 0 | } |
305 | 0 | return it->second.ref; |
306 | 0 | } |
307 | | |
308 | 0 | size_t Size() const { return cache_.size(); } |
309 | | |
310 | 0 | bool Contains(const std::string& dir) { |
311 | 0 | ReadLock lock(&cache_mutex_); |
312 | 0 | return cache_.find(dir) != cache_.end(); |
313 | 0 | } |
314 | | |
315 | | private: |
316 | | struct CacheValue { |
317 | 49.7k | CacheValue() : size(0), ref(0) {} |
318 | | |
319 | | // Logical block size of the directory. |
320 | | size_t size; |
321 | | // Reference count of the directory. |
322 | | int ref; |
323 | | }; |
324 | | |
325 | | std::function<size_t(int)> get_logical_block_size_of_fd_; |
326 | | std::function<Status(const std::string&, size_t*)> |
327 | | get_logical_block_size_of_directory_; |
328 | | |
329 | | std::map<std::string, CacheValue> cache_; |
330 | | port::RWMutex cache_mutex_; |
331 | | }; |
332 | | #endif |
333 | | |
334 | | class PosixSequentialFile : public FSSequentialFile { |
335 | | private: |
336 | | std::string filename_; |
337 | | FILE* file_; |
338 | | int fd_; |
339 | | bool use_direct_io_; |
340 | | size_t logical_sector_size_; |
341 | | |
342 | | public: |
343 | | PosixSequentialFile(const std::string& fname, FILE* file, int fd, |
344 | | size_t logical_block_size, const EnvOptions& options); |
345 | | virtual ~PosixSequentialFile(); |
346 | | |
347 | | IOStatus Read(size_t n, const IOOptions& opts, Slice* result, char* scratch, |
348 | | IODebugContext* dbg) override; |
349 | | IOStatus PositionedRead(uint64_t offset, size_t n, const IOOptions& opts, |
350 | | Slice* result, char* scratch, |
351 | | IODebugContext* dbg) override; |
352 | | IOStatus Skip(uint64_t n) override; |
353 | | IOStatus InvalidateCache(size_t offset, size_t length) override; |
354 | 539k | bool use_direct_io() const override { return use_direct_io_; } |
355 | 265k | size_t GetRequiredBufferAlignment() const override { |
356 | 265k | return logical_sector_size_; |
357 | 265k | } |
358 | | }; |
359 | | |
360 | | #if defined(ROCKSDB_IOURING_PRESENT) |
361 | | // io_uring instance queue depth |
362 | | const unsigned int kIoUringDepth = 256; |
363 | | |
364 | | inline void TsanAnnotateIOUringMemory(struct io_uring* iu) { |
365 | | TsanAnnotateMappedMemory(iu->sq.ring_ptr, iu->sq.ring_sz); |
366 | | // CQ and SQ can share the same mmap region. |
367 | | if (iu->cq.ring_ptr != iu->sq.ring_ptr) { |
368 | | TsanAnnotateMappedMemory(iu->cq.ring_ptr, iu->cq.ring_sz); |
369 | | } |
370 | | TsanAnnotateMappedMemory( |
371 | | iu->sq.sqes, static_cast<size_t>(kIoUringDepth) * sizeof(io_uring_sqe)); |
372 | | } |
373 | | |
374 | | inline void DeleteIOUring(void* p) { |
375 | | struct io_uring* iu = static_cast<struct io_uring*>(p); |
376 | | io_uring_queue_exit(iu); |
377 | | delete iu; |
378 | | } |
379 | | |
380 | | inline struct io_uring* CreateIOUring() { |
381 | | struct io_uring* new_io_uring = new struct io_uring; |
382 | | unsigned int flags = 0; |
383 | | flags |= IORING_SETUP_SINGLE_ISSUER; |
384 | | flags |= IORING_SETUP_DEFER_TASKRUN; |
385 | | int ret = io_uring_queue_init(kIoUringDepth, new_io_uring, flags); |
386 | | if (ret) { |
387 | | fprintf(stdout, "CreateIOUring failed: %s (errno=%d), thread=%lu\n", |
388 | | errnoStr(-ret).c_str(), -ret, |
389 | | static_cast<unsigned long>(pthread_self())); |
390 | | delete new_io_uring; |
391 | | new_io_uring = nullptr; |
392 | | } else { |
393 | | TsanAnnotateIOUringMemory(new_io_uring); |
394 | | } |
395 | | return new_io_uring; |
396 | | } |
397 | | #endif // defined(ROCKSDB_IOURING_PRESENT) |
398 | | |
399 | | class PosixRandomAccessFile : public FSRandomAccessFile { |
400 | | protected: |
401 | | std::string filename_; |
402 | | int fd_; |
403 | | bool use_direct_io_; |
404 | | size_t logical_sector_size_; |
405 | | #if defined(ROCKSDB_IOURING_PRESENT) |
406 | | ThreadLocalPtr* thread_local_async_read_io_urings_; |
407 | | ThreadLocalPtr* thread_local_multi_read_io_urings_; |
408 | | #endif |
409 | | |
410 | | public: |
411 | | PosixRandomAccessFile(const std::string& fname, int fd, |
412 | | size_t logical_block_size, const EnvOptions& options |
413 | | #if defined(ROCKSDB_IOURING_PRESENT) |
414 | | , |
415 | | ThreadLocalPtr* thread_local_async_read_io_urings, |
416 | | ThreadLocalPtr* thread_local_multi_read_io_urings |
417 | | #endif |
418 | | ); |
419 | | virtual ~PosixRandomAccessFile(); |
420 | | |
421 | | IOStatus Read(uint64_t offset, size_t n, const IOOptions& opts, Slice* result, |
422 | | char* scratch, IODebugContext* dbg) const override; |
423 | | |
424 | | IOStatus MultiRead(FSReadRequest* reqs, size_t num_reqs, |
425 | | const IOOptions& options, IODebugContext* dbg) override; |
426 | | |
427 | | IOStatus Prefetch(uint64_t offset, size_t n, const IOOptions& opts, |
428 | | IODebugContext* dbg) override; |
429 | | |
430 | | #if defined(OS_LINUX) || defined(OS_MACOSX) || defined(OS_AIX) |
431 | | size_t GetUniqueId(char* id, size_t max_size) const override; |
432 | | #endif |
433 | | void Hint(AccessPattern pattern) override; |
434 | | IOStatus InvalidateCache(size_t offset, size_t length) override; |
435 | 2.31M | bool use_direct_io() const override { return use_direct_io_; } |
436 | 493k | size_t GetRequiredBufferAlignment() const override { |
437 | 493k | return logical_sector_size_; |
438 | 493k | } |
439 | | |
440 | | virtual IOStatus ReadAsync(FSReadRequest& req, const IOOptions& opts, |
441 | | std::function<void(FSReadRequest&, void*)> cb, |
442 | | void* cb_arg, void** io_handle, |
443 | | IOHandleDeleter* del_fn, |
444 | | IODebugContext* dbg) override; |
445 | | |
446 | | virtual IOStatus GetFileSize(uint64_t* result) override; |
447 | | }; |
448 | | |
449 | | class PosixWritableFile : public FSWritableFile { |
450 | | protected: |
451 | | const std::string filename_; |
452 | | const bool use_direct_io_; |
453 | | int fd_; |
454 | | uint64_t filesize_; |
455 | | size_t logical_sector_size_; |
456 | | #ifdef ROCKSDB_FALLOCATE_PRESENT |
457 | | bool allow_fallocate_; |
458 | | bool fallocate_with_keep_size_; |
459 | | #endif |
460 | | #ifdef ROCKSDB_RANGESYNC_PRESENT |
461 | | // Even if the syscall is present, the filesystem may still not properly |
462 | | // support it, so we need to do a dynamic check too. |
463 | | bool sync_file_range_supported_; |
464 | | #endif // ROCKSDB_RANGESYNC_PRESENT |
465 | | |
466 | | public: |
467 | | explicit PosixWritableFile(const std::string& fname, int fd, |
468 | | size_t logical_block_size, |
469 | | const EnvOptions& options, |
470 | | uint64_t initial_file_size); |
471 | | virtual ~PosixWritableFile(); |
472 | | |
473 | | // Need to implement this so the file is truncated correctly |
474 | | // with direct I/O |
475 | | IOStatus Truncate(uint64_t size, const IOOptions& opts, |
476 | | IODebugContext* dbg) override; |
477 | | IOStatus Close(const IOOptions& opts, IODebugContext* dbg) override; |
478 | | IOStatus Append(const Slice& data, const IOOptions& opts, |
479 | | IODebugContext* dbg) override; |
480 | | IOStatus Append(const Slice& data, const IOOptions& opts, |
481 | | const DataVerificationInfo& /* verification_info */, |
482 | 0 | IODebugContext* dbg) override { |
483 | 0 | return Append(data, opts, dbg); |
484 | 0 | } |
485 | | IOStatus PositionedAppend(const Slice& data, uint64_t offset, |
486 | | const IOOptions& opts, |
487 | | IODebugContext* dbg) override; |
488 | | IOStatus PositionedAppend(const Slice& data, uint64_t offset, |
489 | | const IOOptions& opts, |
490 | | const DataVerificationInfo& /* verification_info */, |
491 | 0 | IODebugContext* dbg) override { |
492 | 0 | return PositionedAppend(data, offset, opts, dbg); |
493 | 0 | } |
494 | | IOStatus Flush(const IOOptions& opts, IODebugContext* dbg) override; |
495 | | IOStatus Sync(const IOOptions& opts, IODebugContext* dbg) override; |
496 | | IOStatus Fsync(const IOOptions& opts, IODebugContext* dbg) override; |
497 | | bool IsSyncThreadSafe() const override; |
498 | 59.7M | bool use_direct_io() const override { return use_direct_io_; } |
499 | | void SetWriteLifeTimeHint(Env::WriteLifeTimeHint hint) override; |
500 | | uint64_t GetFileSize(const IOOptions& opts, IODebugContext* dbg) override; |
501 | | IOStatus InvalidateCache(size_t offset, size_t length) override; |
502 | 275k | size_t GetRequiredBufferAlignment() const override { |
503 | 275k | return logical_sector_size_; |
504 | 275k | } |
505 | | #ifdef ROCKSDB_FALLOCATE_PRESENT |
506 | | IOStatus Allocate(uint64_t offset, uint64_t len, const IOOptions& opts, |
507 | | IODebugContext* dbg) override; |
508 | | #endif |
509 | | IOStatus RangeSync(uint64_t offset, uint64_t nbytes, const IOOptions& opts, |
510 | | IODebugContext* dbg) override; |
511 | | #ifdef OS_LINUX |
512 | | size_t GetUniqueId(char* id, size_t max_size) const override; |
513 | | #endif |
514 | | }; |
515 | | |
516 | | // mmap() based random-access |
517 | | class PosixMmapReadableFile : public FSRandomAccessFile { |
518 | | private: |
519 | | int fd_; |
520 | | std::string filename_; |
521 | | void* mmapped_region_; |
522 | | size_t length_; |
523 | | |
524 | | public: |
525 | | PosixMmapReadableFile(const int fd, const std::string& fname, void* base, |
526 | | size_t length, const EnvOptions& options); |
527 | | virtual ~PosixMmapReadableFile(); |
528 | | IOStatus Read(uint64_t offset, size_t n, const IOOptions& opts, Slice* result, |
529 | | char* scratch, IODebugContext* dbg) const override; |
530 | | void Hint(AccessPattern pattern) override; |
531 | | IOStatus InvalidateCache(size_t offset, size_t length) override; |
532 | | virtual IOStatus GetFileSize(uint64_t* result) override; |
533 | | }; |
534 | | |
535 | | class PosixMmapFile : public FSWritableFile { |
536 | | private: |
537 | | std::string filename_; |
538 | | int fd_; |
539 | | size_t page_size_; |
540 | | size_t map_size_; // How much extra memory to map at a time |
541 | | char* base_; // The mapped region |
542 | | char* limit_; // Limit of the mapped region |
543 | | char* dst_; // Where to write next (in range [base_,limit_]) |
544 | | char* last_sync_; // Where have we synced up to |
545 | | uint64_t file_offset_; // Offset of base_ in file |
546 | | #ifdef ROCKSDB_FALLOCATE_PRESENT |
547 | | bool allow_fallocate_; // If false, fallocate calls are bypassed |
548 | | bool fallocate_with_keep_size_; |
549 | | #endif |
550 | | |
551 | | // Roundup x to a multiple of y |
552 | 0 | static size_t Roundup(size_t x, size_t y) { return ((x + y - 1) / y) * y; } |
553 | | |
554 | 0 | size_t TruncateToPageBoundary(size_t s) { |
555 | 0 | s -= (s & (page_size_ - 1)); |
556 | 0 | assert((s % page_size_) == 0); |
557 | 0 | return s; |
558 | 0 | } |
559 | | |
560 | | IOStatus MapNewRegion(); |
561 | | IOStatus UnmapCurrentRegion(); |
562 | | IOStatus Msync(); |
563 | | |
564 | | public: |
565 | | PosixMmapFile(const std::string& fname, int fd, size_t page_size, |
566 | | const EnvOptions& options, uint64_t initial_file_size); |
567 | | ~PosixMmapFile(); |
568 | | |
569 | | // Means Close() will properly take care of truncate |
570 | | // and it does not need any additional information |
571 | | IOStatus Truncate(uint64_t /*size*/, const IOOptions& /*opts*/, |
572 | 0 | IODebugContext* /*dbg*/) override { |
573 | 0 | return IOStatus::OK(); |
574 | 0 | } |
575 | | IOStatus Close(const IOOptions& opts, IODebugContext* dbg) override; |
576 | | IOStatus Append(const Slice& data, const IOOptions& opts, |
577 | | IODebugContext* dbg) override; |
578 | | IOStatus Append(const Slice& data, const IOOptions& opts, |
579 | | const DataVerificationInfo& /* verification_info */, |
580 | 0 | IODebugContext* dbg) override { |
581 | 0 | return Append(data, opts, dbg); |
582 | 0 | } |
583 | | IOStatus Flush(const IOOptions& opts, IODebugContext* dbg) override; |
584 | | IOStatus Sync(const IOOptions& opts, IODebugContext* dbg) override; |
585 | | IOStatus Fsync(const IOOptions& opts, IODebugContext* dbg) override; |
586 | | uint64_t GetFileSize(const IOOptions& opts, IODebugContext* dbg) override; |
587 | | IOStatus InvalidateCache(size_t offset, size_t length) override; |
588 | | #ifdef ROCKSDB_FALLOCATE_PRESENT |
589 | | IOStatus Allocate(uint64_t offset, uint64_t len, const IOOptions& opts, |
590 | | IODebugContext* dbg) override; |
591 | | #endif |
592 | | }; |
593 | | |
594 | | class PosixRandomRWFile : public FSRandomRWFile { |
595 | | public: |
596 | | explicit PosixRandomRWFile(const std::string& fname, int fd, |
597 | | const EnvOptions& options); |
598 | | virtual ~PosixRandomRWFile(); |
599 | | |
600 | | IOStatus Write(uint64_t offset, const Slice& data, const IOOptions& opts, |
601 | | IODebugContext* dbg) override; |
602 | | |
603 | | IOStatus Read(uint64_t offset, size_t n, const IOOptions& opts, Slice* result, |
604 | | char* scratch, IODebugContext* dbg) const override; |
605 | | |
606 | | IOStatus Flush(const IOOptions& opts, IODebugContext* dbg) override; |
607 | | IOStatus Sync(const IOOptions& opts, IODebugContext* dbg) override; |
608 | | IOStatus Fsync(const IOOptions& opts, IODebugContext* dbg) override; |
609 | | IOStatus Close(const IOOptions& opts, IODebugContext* dbg) override; |
610 | | |
611 | | private: |
612 | | const std::string filename_; |
613 | | int fd_; |
614 | | }; |
615 | | |
616 | | struct PosixMemoryMappedFileBuffer : public MemoryMappedFileBuffer { |
617 | | PosixMemoryMappedFileBuffer(void* _base, size_t _length) |
618 | 0 | : MemoryMappedFileBuffer(_base, _length) {} |
619 | | virtual ~PosixMemoryMappedFileBuffer(); |
620 | | }; |
621 | | |
622 | | class PosixDirectory : public FSDirectory { |
623 | | public: |
624 | | explicit PosixDirectory(int fd, const std::string& directory_name); |
625 | | ~PosixDirectory(); |
626 | | IOStatus Fsync(const IOOptions& opts, IODebugContext* dbg) override; |
627 | | |
628 | | IOStatus Close(const IOOptions& opts, IODebugContext* dbg) override; |
629 | | |
630 | | IOStatus FsyncWithDirOptions( |
631 | | const IOOptions&, IODebugContext*, |
632 | | const DirFsyncOptions& dir_fsync_options) override; |
633 | | |
634 | | private: |
635 | | int fd_; |
636 | | bool is_btrfs_; |
637 | | const std::string directory_name_; |
638 | | }; |
639 | | |
640 | | } // namespace ROCKSDB_NAMESPACE |