Commit 265a3926 authored by Jakob Unterwurzacher's avatar Jakob Unterwurzacher

fuse: Increase MAX_KERNEL_WRITE to 1 MiB & enable CAP_MAX_PAGES

Kernel 4.20 allows writes & reads up to 1 MiB (before: 128 kiB)
via CAP_MAX_PAGES & MaxPages.

Instead of exposing MaxPages in the API, we follow what libfuse
does, and calculate MaxPages from MaxWrite (rounding up).

Contrary to what libfuse does, we also set max_read to the same
value as MaxWrite. This prevents reads getting larger than writes
due to the rounding-up for MaxPages, which is unexpected. This
also changes the default behavior of go-fuse, which was 64 kiB
writes, but 128 kiB for reads. Now it is 128 kiB for both.

The tests are implemented in the fs package because it's
easier there. They also test MaxReadAhead.

Tested on Linux 4.19.0 and Linux 6.1.7 via all.bash,
and on 6.1.7 also via the gocryptfs test suite.

Supersedes https://github.com/hanwen/go-fuse/pull/347

Change-Id: I5a1d4ee91945155c367888da7a90814a24a9ee6e
parent 915cf541
// Copyright 2022 the Go-FUSE Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package fs
import (
"context"
"fmt"
"io/ioutil"
"strconv"
"strings"
"sync"
"syscall"
"testing"
"golang.org/x/sys/unix"
"github.com/hanwen/go-fuse/v2/fuse"
)
// maxWriteTestRoot is the root node of the filesystem mounted by
// TestMaxWrite. It keeps track of the biggest READ and WRITE requests
// that the file handles pointing back at it have seen.
type maxWriteTestRoot struct {
	Inode

	// The embedded mutex is taken around every access to the
	// counters below.
	sync.Mutex

	// largestRead is the size in bytes of the biggest read request
	// seen since the last resetStats.
	largestRead int
	// largestWrite is the size in bytes of the biggest write request
	// seen since the last resetStats.
	largestWrite int
}
// VM_READAHEAD is the readahead size in bytes (128 kiB) that TestMaxWrite
// expects when MountOptions.MaxReadAhead is left at zero.
// Presumably mirrors the kernel's VM_READAHEAD_PAGES, see
// https://github.com/torvalds/linux/blob/e2ae0d4a6b0ba461542f0fd0ba0b828658013e9f/include/linux/pagemap.h#L999
const VM_READAHEAD = 128 * 1024
// Compile-time check that *maxWriteTestRoot implements NodeOnAdder.
var _ NodeOnAdder = (*maxWriteTestRoot)(nil)

// OnAdd populates the root directory with a single child named "file".
// The child node keeps a pointer back to the root so that its file
// handles can report request sizes there.
func (n *maxWriteTestRoot) OnAdd(ctx context.Context) {
	fileNode := &maxWriteTestNode{maxWriteTestRoot: n}
	child := n.Inode.NewInode(ctx, fileNode, StableAttr{})
	n.Inode.AddChild("file", child, false)
}
// resetStats zeroes the largest observed read and write sizes while
// holding the root's mutex.
func (n *maxWriteTestRoot) resetStats() {
	n.Lock()
	defer n.Unlock()

	n.largestRead, n.largestWrite = 0, 0
}
// maxWriteTestNode is the node behind the single "file" entry that
// maxWriteTestRoot.OnAdd creates.
type maxWriteTestNode struct {
	Inode

	// maxWriteTestRoot points at the root node that collects the
	// request size statistics; it is handed on to every file handle
	// returned by Open.
	maxWriteTestRoot *maxWriteTestRoot
}
// Compile-time check that *maxWriteTestNode implements NodeGetattrer.
var _ NodeGetattrer = (*maxWriteTestNode)(nil)

// Getattr reports a fixed file size of 1 GiB and leaves every other
// attribute untouched. It always succeeds.
// NOTE(review): the large size is presumably there so that the 2 MiB
// test reads are not cut short at end-of-file - confirm.
func (n *maxWriteTestNode) Getattr(ctx context.Context, f FileHandle, out *fuse.AttrOut) syscall.Errno {
	const oneGiB = 1 << 30
	out.Size = oneGiB
	return 0
}
// Compile-time check that *maxWriteTestNode implements NodeOpener.
var _ NodeOpener = (*maxWriteTestNode)(nil)

// Open hands out a fresh maxWriteTestFH that shares the node's
// statistics root. No FUSE open flags are set and the call never fails.
func (n *maxWriteTestNode) Open(ctx context.Context, flags uint32) (fh FileHandle, fuseFlags uint32, errno syscall.Errno) {
	fh = &maxWriteTestFH{maxWriteTestRoot: n.maxWriteTestRoot}
	return fh, 0, OK
}
// maxWriteTestFH is the file handle returned by maxWriteTestNode.Open.
// Its Read and Write methods record the size of each request.
type maxWriteTestFH struct {
	// maxWriteTestRoot is where the largest observed request sizes
	// are stored.
	maxWriteTestRoot *maxWriteTestRoot
}
// Compile-time check that *maxWriteTestFH implements FileReader.
var _ FileReader = (*maxWriteTestFH)(nil)

// Read records the size of the request buffer if it is the largest seen
// so far, then returns that same buffer wrapped with fuse.ReadResultData.
// The offset is ignored and the call always succeeds.
func (fh *maxWriteTestFH) Read(ctx context.Context, data []byte, off int64) (fuse.ReadResult, syscall.Errno) {
	stats := fh.maxWriteTestRoot
	size := len(data)

	stats.Lock()
	if size > stats.largestRead {
		stats.largestRead = size
	}
	stats.Unlock()

	return fuse.ReadResultData(data), 0
}
// Compile-time check that *maxWriteTestFH implements FileWriter.
var _ FileWriter = (*maxWriteTestFH)(nil)

// Write records the size of the request if it is the largest seen so
// far and reports the whole buffer as written. The payload itself is
// discarded, the offset is ignored, and the call always succeeds.
func (fh *maxWriteTestFH) Write(ctx context.Context, data []byte, off int64) (written uint32, errno syscall.Errno) {
	stats := fh.maxWriteTestRoot
	size := len(data)

	stats.Lock()
	if size > stats.largestWrite {
		stats.largestWrite = size
	}
	stats.Unlock()

	return uint32(size), 0
}
// TestMaxWrite mounts the test filesystem once per combination of the
// MaxWrite and MaxReadAhead mount options and verifies that the request
// sizes the kernel actually sends (and the readahead value it reports in
// sysfs) match what those options should produce.
func TestMaxWrite(t *testing.T) {
	const (
		kiB = 1024
		miB = 1024 * kiB
	)

	testcases := []fuse.MountOptions{
		{MaxWrite: 4 * kiB},   // one page = lower limit in all Linux versions
		{MaxWrite: 8 * kiB},
		{MaxWrite: 9999},      // deliberately unaligned, let's see what happens
		{MaxWrite: 64 * kiB},  // the old go-fuse default
		{MaxWrite: 128 * kiB}, // upper limit in Linux v4.19 and older
		{MaxWrite: 1 * miB},   // upper limit in Linux v4.20+
	}
	// Cycle through readahead values: 4, 8, 16, 32, 64 and 128 kiB.
	for ra := 4 * kiB; ra <= 128*kiB; ra *= 2 {
		testcases = append(testcases, fuse.MountOptions{
			MaxWrite:     128 * kiB,
			MaxReadAhead: ra,
		})
	}
	testcases = append(testcases,
		// Both options at their defaults.
		fuse.MountOptions{},
		// Default MaxWrite, explicit readahead.
		fuse.MountOptions{MaxReadAhead: 4 * kiB},
	)

	for _, opts := range testcases {
		subtest := fmt.Sprintf("MaxWr%d.MaxRa%d", opts.MaxWrite, opts.MaxReadAhead)
		t.Run(subtest, func(t *testing.T) {
			root := &maxWriteTestRoot{}
			root.resetStats()
			mntDir, srv, clean := testMount(t, root, &Options{MountOptions: opts})
			defer clean()

			// stats returns a consistent snapshot of the counters.
			stats := func() (largestRead, largestWrite int) {
				root.Lock()
				defer root.Unlock()
				return root.largestRead, root.largestWrite
			}

			// A zero MaxReadAhead means the kernel default applies.
			wantReadAhead := opts.MaxReadAhead
			if wantReadAhead == 0 {
				wantReadAhead = VM_READAHEAD
			}
			if haveReadAhead := bdiReadahead(mntDir); haveReadAhead != wantReadAhead {
				t.Errorf("Readahead mismatch: have=bdiReadahead=%d want=%d", haveReadAhead, wantReadAhead)
			}

			wantSize := opts.MaxWrite
			switch {
			case srv.KernelSettings().Flags&fuse.CAP_MAX_PAGES == 0 && wantSize > 128*kiB:
				// Kernel 4.19 and lower don't have CAP_MAX_PAGES and
				// limit requests to 128 kiB.
				wantSize = 128 * kiB
			case opts.MaxWrite == 0:
				// MaxWrite left at its default.
				wantSize = 128 * kiB
			}

			// Ask for 2 MiB at a time. That is more than the kernel
			// supports, so the sizes that reach the filesystem expose
			// the limits that are actually in effect.
			buf := make([]byte, 2*miB)
			filePath := mntDir + "/file"

			// Direct I/O
			fdDirect, err := syscall.Open(filePath, syscall.O_RDWR|syscall.O_DIRECT, 0600)
			if err != nil {
				t.Fatal(err)
			}
			defer syscall.Close(fdDirect)

			if _, err := syscall.Pwrite(fdDirect, buf, 0); err != nil {
				t.Errorf("write failed: %v", err)
			}
			if _, wr := stats(); wr != wantSize {
				t.Errorf("Direct I/O largestWrite: have=%d, want=%d", wr, wantSize)
			}

			if _, err := syscall.Pread(fdDirect, buf, 0); err != nil {
				t.Errorf("read failed: %v", err)
			}
			if rd, _ := stats(); rd != wantSize {
				t.Errorf("Direct I/O largestRead: have=%d, want=%d", rd, wantSize)
			}

			root.resetStats()

			// Buffered I/O
			fdBuffered, err := syscall.Open(filePath, syscall.O_RDWR, 0600)
			if err != nil {
				t.Fatal(err)
			}
			defer syscall.Close(fdBuffered)

			// Buffered read
			if _, err := syscall.Pread(fdBuffered, buf, 0); err != nil {
				t.Errorf("read failed: %v", err)
			}
			// On Linux 4.19 the result is exactly MaxReadAhead, while 6.0
			// also sends larger reads of up to 128 kiB. The value is only
			// logged; nothing is asserted about it.
			rd, _ := stats()
			t.Logf("Buffered I/O largestRead: have=%d", rd)

			// Buffered write
			if _, err := syscall.Pwrite(fdBuffered, buf, 0); err != nil {
				t.Errorf("write failed: %v", err)
			}
			if _, wr := stats(); wr != wantSize {
				t.Errorf("Buffered I/O largestWrite: have=%d, want=%d", wr, wantSize)
			}
		})
	}
}
// bdiReadahead returns the readahead size, in bytes, of the filesystem
// mounted at mnt. It stats mnt to obtain the device number and reads the
// value (in kiB) from /sys/class/bdi/<major>:<minor>/read_ahead_kb .
//
// Any failure along the way (stat, reading the sysfs file, parsing the
// number) results in a panic.
func bdiReadahead(mnt string) int {
	var st syscall.Stat_t
	if err := syscall.Stat(mnt, &st); err != nil {
		panic(err)
	}

	major, minor := unix.Major(st.Dev), unix.Minor(st.Dev)
	sysfsPath := fmt.Sprintf("/sys/class/bdi/%d:%d/read_ahead_kb", major, minor)

	raw, err := ioutil.ReadFile(sysfsPath)
	if err != nil {
		panic(err)
	}

	// The file holds a decimal number followed by a newline.
	kb, err := strconv.Atoi(strings.TrimSpace(string(raw)))
	if err != nil {
		panic(err)
	}
	return kb * 1024
}
...@@ -153,12 +153,42 @@ type MountOptions struct { ...@@ -153,12 +153,42 @@ type MountOptions struct {
// async I/O. Concurrency for synchronous I/O is not limited. // async I/O. Concurrency for synchronous I/O is not limited.
MaxBackground int MaxBackground int
// Write size to use. If 0, use default. This number is // MaxWrite is the max size for read and write requests. If 0, use
// capped at the kernel maximum. // go-fuse default (currently 128 kiB).
// This number is internally capped at MAX_KERNEL_WRITE (higher values don't make
// sense).
//
// Non-direct-io reads are mostly served via kernel readahead, which is
// additionally subject to the MaxReadAhead limit.
//
// Implementation notes:
//
// There are four values the Linux kernel looks at when deciding the request size:
// * MaxWrite, passed via InitOut.MaxWrite. Limits the WRITE size.
// * max_read, passed via a string mount option. Limits the READ size.
// go-fuse sets max_read equal to MaxWrite.
// You can see the current max_read value in /proc/self/mounts .
// * MaxPages, passed via InitOut.MaxPages. In Linux 4.20 and later, the value
// can go up to 1 MiB and go-fuse calculates the MaxPages value acc.
// to MaxWrite, rounding up.
// On older kernels, the value is fixed at 128 kiB and the
// passed value is ignored. No request can be larger than MaxPages, so
// READ and WRITE are effectively capped at MaxPages.
// * MaxReadAhead, passed via InitOut.MaxReadAhead.
MaxWrite int MaxWrite int
// Max read ahead to use. If 0, use default. This number is // MaxReadAhead is the max read ahead size to use. It controls how much data the
// capped at the kernel maximum. // kernel reads in advance to satisfy future read requests from applications.
// How much exactly is subject to clever heuristics in the kernel
// (see https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/mm/readahead.c?h=v6.2-rc5#n375
// if you are brave) and hence also depends on the kernel version.
//
// If 0, use kernel default. This number is capped at the kernel maximum
// (128 kiB on Linux) and cannot be larger than MaxWrite.
//
// MaxReadAhead only affects buffered reads (=non-direct-io), but even then, the
// kernel can and does send larger reads to satisfy read requests from applications
// (up to MaxWrite or VM_READAHEAD_PAGES=128 kiB, whichever is less).
MaxReadAhead int MaxReadAhead int
// If IgnoreSecurityLabels is set, all security related xattr // If IgnoreSecurityLabels is set, all security related xattr
......
...@@ -70,6 +70,12 @@ const ( ...@@ -70,6 +70,12 @@ const (
_OP_NOTIFY_DELETE = uint32(104) // protocol version 18 _OP_NOTIFY_DELETE = uint32(104) // protocol version 18
_OPCODE_COUNT = uint32(105) _OPCODE_COUNT = uint32(105)
// Constants from Linux kernel fs/fuse/fuse_i.h
// Default MaxPages value in all kernel versions
_FUSE_DEFAULT_MAX_PAGES_PER_REQ = 32
// Upper MaxPages limit in Linux v4.20+ (v4.19 and older: 32)
_FUSE_MAX_MAX_PAGES = 256
) )
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
...@@ -90,7 +96,7 @@ func doInit(server *Server, req *request) { ...@@ -90,7 +96,7 @@ func doInit(server *Server, req *request) {
server.reqMu.Lock() server.reqMu.Lock()
server.kernelSettings = *input server.kernelSettings = *input
server.kernelSettings.Flags = input.Flags & (CAP_ASYNC_READ | CAP_BIG_WRITES | CAP_FILE_OPS | server.kernelSettings.Flags = input.Flags & (CAP_ASYNC_READ | CAP_BIG_WRITES | CAP_FILE_OPS |
CAP_READDIRPLUS | CAP_NO_OPEN_SUPPORT | CAP_PARALLEL_DIROPS) CAP_READDIRPLUS | CAP_NO_OPEN_SUPPORT | CAP_PARALLEL_DIROPS | CAP_MAX_PAGES)
if server.opts.EnableLocks { if server.opts.EnableLocks {
server.kernelSettings.Flags |= CAP_FLOCK_LOCKS | CAP_POSIX_LOCKS server.kernelSettings.Flags |= CAP_FLOCK_LOCKS | CAP_POSIX_LOCKS
...@@ -123,6 +129,11 @@ func doInit(server *Server, req *request) { ...@@ -123,6 +129,11 @@ func doInit(server *Server, req *request) {
if input.Minor >= 13 { if input.Minor >= 13 {
server.setSplice() server.setSplice()
} }
// maxPages is the maximum request size we want the kernel to use, in units of
// memory pages (usually 4kiB). Linux v4.19 and older ignore this and always use
// 128kiB.
maxPages := (server.opts.MaxWrite-1)/syscall.Getpagesize() + 1 // Round up
server.reqMu.Unlock() server.reqMu.Unlock()
out := (*InitOut)(req.outData()) out := (*InitOut)(req.outData())
...@@ -134,6 +145,7 @@ func doInit(server *Server, req *request) { ...@@ -134,6 +145,7 @@ func doInit(server *Server, req *request) {
MaxWrite: uint32(server.opts.MaxWrite), MaxWrite: uint32(server.opts.MaxWrite),
CongestionThreshold: uint16(server.opts.MaxBackground * 3 / 4), CongestionThreshold: uint16(server.opts.MaxBackground * 3 / 4),
MaxBackground: uint16(server.opts.MaxBackground), MaxBackground: uint16(server.opts.MaxBackground),
MaxPages: uint16(maxPages),
} }
if server.opts.MaxReadAhead != 0 && uint32(server.opts.MaxReadAhead) < out.MaxReadAhead { if server.opts.MaxReadAhead != 0 && uint32(server.opts.MaxReadAhead) < out.MaxReadAhead {
...@@ -536,6 +548,7 @@ func getHandler(o uint32) *operationHandler { ...@@ -536,6 +548,7 @@ func getHandler(o uint32) *operationHandler {
return operationHandlers[o] return operationHandlers[o]
} }
// maximum size of all input headers
var maxInputSize uintptr var maxInputSize uintptr
func init() { func init() {
......
...@@ -21,13 +21,16 @@ import ( ...@@ -21,13 +21,16 @@ import (
) )
const ( const (
// The kernel caps writes at 128k. // Linux v4.20+ caps requests at 1 MiB. Older kernels at 128 kiB.
MAX_KERNEL_WRITE = 128 * 1024 MAX_KERNEL_WRITE = 1024 * 1024
// Linux kernel constant from include/uapi/linux/fuse.h // Linux kernel constant from include/uapi/linux/fuse.h
// Reads from /dev/fuse that are smaller fail with EINVAL. // Reads from /dev/fuse that are smaller fail with EINVAL.
_FUSE_MIN_READ_BUFFER = 8192 _FUSE_MIN_READ_BUFFER = 8192
// defaultMaxWrite is the default value for MountOptions.MaxWrite
defaultMaxWrite = 128 * 1024 // 128 kiB
minMaxReaders = 2 minMaxReaders = 2
maxMaxReaders = 16 maxMaxReaders = 16
) )
...@@ -167,11 +170,12 @@ func NewServer(fs RawFileSystem, mountPoint string, opts *MountOptions) (*Server ...@@ -167,11 +170,12 @@ func NewServer(fs RawFileSystem, mountPoint string, opts *MountOptions) (*Server
o.MaxWrite = 0 o.MaxWrite = 0
} }
if o.MaxWrite == 0 { if o.MaxWrite == 0 {
o.MaxWrite = 1 << 16 o.MaxWrite = defaultMaxWrite
} }
if o.MaxWrite > MAX_KERNEL_WRITE { if o.MaxWrite > MAX_KERNEL_WRITE {
o.MaxWrite = MAX_KERNEL_WRITE o.MaxWrite = MAX_KERNEL_WRITE
} }
if o.Name == "" { if o.Name == "" {
name := fs.String() name := fs.String()
l := len(name) l := len(name)
...@@ -254,13 +258,13 @@ func (o *MountOptions) optionsStrings() []string { ...@@ -254,13 +258,13 @@ func (o *MountOptions) optionsStrings() []string {
if o.AllowOther { if o.AllowOther {
r = append(r, "allow_other") r = append(r, "allow_other")
} }
if o.FsName != "" { if o.FsName != "" {
r = append(r, "fsname="+o.FsName) r = append(r, "fsname="+o.FsName)
} }
if o.Name != "" { if o.Name != "" {
r = append(r, "subtype="+o.Name) r = append(r, "subtype="+o.Name)
} }
r = append(r, fmt.Sprintf("max_read=%d", o.MaxWrite))
// OSXFUSE applies a 60-second timeout for file operations. This // OSXFUSE applies a 60-second timeout for file operations. This
// is inconsistent with how FUSE works on Linux, where operations // is inconsistent with how FUSE works on Linux, where operations
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment