Line data Source code
1 : // Copyright 2023 The LevelDB-Go and Pebble Authors. All rights reserved. Use
2 : // of this source code is governed by a BSD-style license that can be found in
3 : // the LICENSE file.
4 :
5 : //go:build linux
6 : // +build linux
7 :
8 : package vfs
9 :
10 : import (
11 : "io/fs"
12 : "os"
13 : "syscall"
14 :
15 : "github.com/cockroachdb/errors"
16 : "golang.org/x/sys/unix"
17 : )
18 :
19 2 : func wrapOSFileImpl(f *os.File) File {
20 2 : lf := &linuxFile{File: f, fd: f.Fd()}
21 2 : if lf.fd != InvalidFd {
22 2 : lf.useSyncRange = isSyncRangeSupported(lf.fd)
23 2 : }
24 2 : return lf
25 : }
26 :
27 2 : func (defaultFS) OpenDir(name string) (File, error) {
28 2 : f, err := os.OpenFile(name, syscall.O_CLOEXEC, 0)
29 2 : if err != nil {
30 0 : return nil, errors.WithStack(err)
31 0 : }
32 2 : return &linuxDir{f}, nil
33 : }
34 :
35 : // Assert that linuxFile and linuxDir implement vfs.File.
36 : var (
37 : _ File = (*linuxDir)(nil)
38 : _ File = (*linuxFile)(nil)
39 : )
40 :
41 : type linuxDir struct {
42 : *os.File
43 : }
44 :
45 0 : func (d *linuxDir) Prefetch(offset int64, length int64) error { return nil }
46 0 : func (d *linuxDir) Preallocate(offset, length int64) error { return nil }
47 0 : func (d *linuxDir) Stat() (FileInfo, error) { return maybeWrapFileInfo(d.File.Stat()) }
48 0 : func (d *linuxDir) SyncData() error { return d.Sync() }
49 0 : func (d *linuxDir) SyncTo(offset int64) (fullSync bool, err error) { return false, nil }
50 :
// linuxFile is the File implementation produced by wrapOSFileImpl for regular
// files. It embeds the *os.File and issues Linux-specific syscalls directly
// against the cached descriptor.
type linuxFile struct {
	*os.File
	// fd is the raw descriptor obtained from File.Fd() when the wrapper is
	// constructed.
	fd uintptr
	// useSyncRange records the result of isSyncRangeSupported for fd. When
	// true, SyncTo uses sync_file_range; otherwise it falls back to fdatasync.
	useSyncRange bool
}
56 :
57 1 : func (f *linuxFile) Prefetch(offset int64, length int64) error {
58 1 : _, _, err := unix.Syscall(unix.SYS_READAHEAD, uintptr(f.fd), uintptr(offset), uintptr(length))
59 1 : return err
60 1 : }
61 :
62 2 : func (f *linuxFile) Preallocate(offset, length int64) error {
63 2 : return unix.Fallocate(int(f.fd), unix.FALLOC_FL_KEEP_SIZE, offset, length)
64 2 : }
65 :
66 2 : func (f *linuxFile) Stat() (FileInfo, error) {
67 2 : fi, err := f.File.Stat()
68 2 : if err != nil {
69 0 : return nil, err
70 0 : }
71 2 : return defaultFileInfo{fi}, nil
72 : }
73 :
74 2 : func (f *linuxFile) SyncData() error {
75 2 : return unix.Fdatasync(int(f.fd))
76 2 : }
77 :
78 1 : func (f *linuxFile) SyncTo(offset int64) (fullSync bool, err error) {
79 1 : if !f.useSyncRange {
80 0 : // Use fdatasync, which does provide persistence guarantees but won't
81 0 : // update all file metadata. From the `fdatasync` man page:
82 0 : //
83 0 : // fdatasync() is similar to fsync(), but does not flush modified
84 0 : // metadata unless that metadata is needed in order to allow a
85 0 : // subsequent data retrieval to be correctly handled. For example,
86 0 : // changes to st_atime or st_mtime (respectively, time of last access
87 0 : // and time of last modification; see stat(2)) do not require flushing
88 0 : // because they are not necessary for a subsequent data read to be
89 0 : // handled correctly. On the other hand, a change to the file size
90 0 : // (st_size, as made by say ftruncate(2)), would require a metadata
91 0 : // flush.
92 0 : if err = unix.Fdatasync(int(f.fd)); err != nil {
93 0 : return false, err
94 0 : }
95 0 : return true, nil
96 : }
97 :
98 1 : const (
99 1 : waitBefore = 0x1
100 1 : write = 0x2
101 1 : // waitAfter = 0x4
102 1 : )
103 1 :
104 1 : // By specifying write|waitBefore for the flags, we're instructing
105 1 : // SyncFileRange to a) wait for any outstanding data being written to finish,
106 1 : // and b) to queue any other dirty data blocks in the range [0,offset] for
107 1 : // writing. The actual writing of this data will occur asynchronously. The
108 1 : // use of `waitBefore` is to limit how much dirty data is allowed to
109 1 : // accumulate. Linux sometimes behaves poorly when a large amount of dirty
110 1 : // data accumulates, impacting other I/O operations.
111 1 : return false, unix.SyncFileRange(int(f.fd), 0, offset, write|waitBefore)
112 : }
113 :
// syncFileRange is the signature of a sync_file_range-style call: it takes a
// file descriptor, a starting offset, a byte count and a flags word. It exists
// so that syncRangeSmokeTest can be handed a substitute implementation.
type syncFileRange func(fd int, offset, nbytes int64, flags int) error
115 :
116 : // sync_file_range depends on both the filesystem, and the broader kernel
117 : // support. In particular, Windows Subsystem for Linux does not support
118 : // sync_file_range, even when used with ext{2,3,4}. syncRangeSmokeTest performs
119 : // a test of of sync_file_range, returning false on ENOSYS, and true otherwise.
120 2 : func syncRangeSmokeTest(fd uintptr, syncFn syncFileRange) bool {
121 2 : err := syncFn(int(fd), 0 /* offset */, 0 /* nbytes */, 0 /* flags */)
122 2 : return err != unix.ENOSYS
123 2 : }
124 :
125 2 : func isSyncRangeSupported(fd uintptr) bool {
126 2 : var stat unix.Statfs_t
127 2 : if err := unix.Fstatfs(int(fd), &stat); err != nil {
128 0 : return false
129 0 : }
130 :
131 : // Allowlist which filesystems we allow using sync_file_range with as some
132 : // filesystems treat that syscall as a noop (notably ZFS). A allowlist is
133 : // used instead of a denylist in order to have a more graceful failure mode
134 : // in case a filesystem we haven't tested is encountered. Currently only
135 : // ext2/3/4 are known to work properly.
136 2 : const extMagic = 0xef53
137 2 : switch stat.Type {
138 2 : case extMagic:
139 2 : return syncRangeSmokeTest(fd, unix.SyncFileRange)
140 : }
141 0 : return false
142 : }
143 :
144 0 : func deviceIDFromFileInfo(finfo fs.FileInfo) DeviceID {
145 0 : statInfo := finfo.Sys().(*syscall.Stat_t)
146 0 : return DeviceID{
147 0 : major: unix.Major(uint64(statInfo.Dev)),
148 0 : minor: unix.Minor(uint64(statInfo.Dev)),
149 0 : }
150 0 : }
|