aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
authorMike Snitzer <snitzer@kernel.org>2025-11-11 09:59:31 -0500
committerChuck Lever <chuck.lever@oracle.com>2025-12-01 09:57:10 -0500
commit06c5c97293e3fca99ce15da157068edf45a7c6e4 (patch)
tree1493d0a45a2cc0077914c18d91b9027c2537f43e /fs
parente3e8e176ca4876e6212582022ad80835dddc9de4 (diff)
downloadtip-06c5c97293e3fca99ce15da157068edf45a7c6e4.tar.gz
NFSD: Implement NFSD_IO_DIRECT for NFS WRITE
When NFSD_IO_DIRECT is selected via the /sys/kernel/debug/nfsd/io_cache_write experimental tunable, split incoming unaligned NFS WRITE requests into a prefix, middle and suffix segment, as needed. The middle segment is now DIO-aligned and the prefix and/or suffix are unaligned. Synchronous buffered IO is used for the unaligned segments, and IOCB_DIRECT is used for the middle DIO-aligned extent. Although IOCB_DIRECT avoids the use of the page cache, by itself it doesn't guarantee data durability. For UNSTABLE WRITE requests, durability is obtained by a subsequent NFS COMMIT request. Signed-off-by: Mike Snitzer <snitzer@kernel.org> Co-developed-by: Chuck Lever <chuck.lever@oracle.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Diffstat (limited to 'fs')
-rw-r--r--fs/nfsd/debugfs.c1
-rw-r--r--fs/nfsd/trace.h2
-rw-r--r--fs/nfsd/vfs.c145
3 files changed, 144 insertions, 4 deletions
diff --git a/fs/nfsd/debugfs.c b/fs/nfsd/debugfs.c
index 00eb1ecef6ac18..7f44689e0a53ed 100644
--- a/fs/nfsd/debugfs.c
+++ b/fs/nfsd/debugfs.c
@@ -108,6 +108,7 @@ static int nfsd_io_cache_write_set(void *data, u64 val)
switch (val) {
case NFSD_IO_BUFFERED:
case NFSD_IO_DONTCACHE:
+ case NFSD_IO_DIRECT:
nfsd_io_cache_write = val;
break;
default:
diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
index 85a1521ad7574d..5ae2a611e57f4b 100644
--- a/fs/nfsd/trace.h
+++ b/fs/nfsd/trace.h
@@ -469,6 +469,8 @@ DEFINE_NFSD_IO_EVENT(read_io_done);
DEFINE_NFSD_IO_EVENT(read_done);
DEFINE_NFSD_IO_EVENT(write_start);
DEFINE_NFSD_IO_EVENT(write_opened);
+DEFINE_NFSD_IO_EVENT(write_direct);
+DEFINE_NFSD_IO_EVENT(write_vector);
DEFINE_NFSD_IO_EVENT(write_io_done);
DEFINE_NFSD_IO_EVENT(write_done);
DEFINE_NFSD_IO_EVENT(commit_start);
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 5333d49910d9ae..ab46301da4ae0b 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1254,6 +1254,136 @@ static int wait_for_concurrent_writes(struct file *file)
return err;
}
+struct nfsd_write_dio_seg {
+ struct iov_iter iter;
+ int flags;
+};
+
+static unsigned long
+iov_iter_bvec_offset(const struct iov_iter *iter)
+{
+ return (unsigned long)(iter->bvec->bv_offset + iter->iov_offset);
+}
+
+static void
+nfsd_write_dio_seg_init(struct nfsd_write_dio_seg *segment,
+ struct bio_vec *bvec, unsigned int nvecs,
+ unsigned long total, size_t start, size_t len,
+ struct kiocb *iocb)
+{
+ iov_iter_bvec(&segment->iter, ITER_SOURCE, bvec, nvecs, total);
+ if (start)
+ iov_iter_advance(&segment->iter, start);
+ iov_iter_truncate(&segment->iter, len);
+ segment->flags = iocb->ki_flags;
+}
+
+static unsigned int
+nfsd_write_dio_iters_init(struct nfsd_file *nf, struct bio_vec *bvec,
+ unsigned int nvecs, struct kiocb *iocb,
+ unsigned long total,
+ struct nfsd_write_dio_seg segments[3])
+{
+ u32 offset_align = nf->nf_dio_offset_align;
+ loff_t prefix_end, orig_end, middle_end;
+ u32 mem_align = nf->nf_dio_mem_align;
+ size_t prefix, middle, suffix;
+ loff_t offset = iocb->ki_pos;
+ unsigned int nsegs = 0;
+
+ /*
+ * Check if direct I/O is feasible for this write request.
+ * If alignments are not available, the write is too small,
+ * or no alignment can be found, fall back to buffered I/O.
+ */
+ if (unlikely(!mem_align || !offset_align) ||
+ unlikely(total < max(offset_align, mem_align)))
+ goto no_dio;
+
+ prefix_end = round_up(offset, offset_align);
+ orig_end = offset + total;
+ middle_end = round_down(orig_end, offset_align);
+
+ prefix = prefix_end - offset;
+ middle = middle_end - prefix_end;
+ suffix = orig_end - middle_end;
+
+ if (!middle)
+ goto no_dio;
+
+ if (prefix)
+ nfsd_write_dio_seg_init(&segments[nsegs++], bvec,
+ nvecs, total, 0, prefix, iocb);
+
+ nfsd_write_dio_seg_init(&segments[nsegs], bvec, nvecs,
+ total, prefix, middle, iocb);
+
+ /*
+ * Check if the bvec iterator is aligned for direct I/O.
+ *
+ * bvecs generated from RPC receive buffers are contiguous: After
+ * the first bvec, all subsequent bvecs start at bv_offset zero
+ * (page-aligned). Therefore, only the first bvec is checked.
+ */
+ if (iov_iter_bvec_offset(&segments[nsegs].iter) & (mem_align - 1))
+ goto no_dio;
+ segments[nsegs].flags |= IOCB_DIRECT;
+ nsegs++;
+
+ if (suffix)
+ nfsd_write_dio_seg_init(&segments[nsegs++], bvec, nvecs, total,
+ prefix + middle, suffix, iocb);
+
+ return nsegs;
+
+no_dio:
+ /* No DIO alignment possible - pack into single non-DIO segment. */
+ nfsd_write_dio_seg_init(&segments[0], bvec, nvecs, total, 0,
+ total, iocb);
+ return 1;
+}
+
+static noinline_for_stack int
+nfsd_direct_write(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ struct nfsd_file *nf, unsigned int nvecs,
+ unsigned long *cnt, struct kiocb *kiocb)
+{
+ struct nfsd_write_dio_seg segments[3];
+ struct file *file = nf->nf_file;
+ unsigned int nsegs, i;
+ ssize_t host_err;
+
+ nsegs = nfsd_write_dio_iters_init(nf, rqstp->rq_bvec, nvecs,
+ kiocb, *cnt, segments);
+
+ *cnt = 0;
+ for (i = 0; i < nsegs; i++) {
+ kiocb->ki_flags = segments[i].flags;
+ if (kiocb->ki_flags & IOCB_DIRECT)
+ trace_nfsd_write_direct(rqstp, fhp, kiocb->ki_pos,
+ segments[i].iter.count);
+ else {
+ trace_nfsd_write_vector(rqstp, fhp, kiocb->ki_pos,
+ segments[i].iter.count);
+ /*
+ * Mark the I/O buffer as evict-able to reduce
+ * memory contention.
+ */
+ if (nf->nf_file->f_op->fop_flags & FOP_DONTCACHE)
+ kiocb->ki_flags |= IOCB_DONTCACHE;
+ }
+
+ host_err = vfs_iocb_iter_write(file, kiocb, &segments[i].iter);
+ if (host_err < 0)
+ return host_err;
+ *cnt += host_err;
+ if (host_err < segments[i].iter.count)
+ break; /* partial write */
+ }
+
+ return 0;
+}
+
/**
* nfsd_vfs_write - write data to an already-open file
* @rqstp: RPC execution context
@@ -1328,25 +1458,32 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp,
}
nvecs = xdr_buf_to_bvec(rqstp->rq_bvec, rqstp->rq_maxpages, payload);
- iov_iter_bvec(&iter, ITER_SOURCE, rqstp->rq_bvec, nvecs, *cnt);
+
since = READ_ONCE(file->f_wb_err);
if (verf)
nfsd_copy_write_verifier(verf, nn);
switch (nfsd_io_cache_write) {
- case NFSD_IO_BUFFERED:
+ case NFSD_IO_DIRECT:
+ host_err = nfsd_direct_write(rqstp, fhp, nf, nvecs,
+ cnt, &kiocb);
break;
case NFSD_IO_DONTCACHE:
if (file->f_op->fop_flags & FOP_DONTCACHE)
kiocb.ki_flags |= IOCB_DONTCACHE;
+ fallthrough;
+ case NFSD_IO_BUFFERED:
+ iov_iter_bvec(&iter, ITER_SOURCE, rqstp->rq_bvec, nvecs, *cnt);
+ host_err = vfs_iocb_iter_write(file, &kiocb, &iter);
+ if (host_err < 0)
+ break;
+ *cnt = host_err;
break;
}
- host_err = vfs_iocb_iter_write(file, &kiocb, &iter);
if (host_err < 0) {
commit_reset_write_verifier(nn, rqstp, host_err);
goto out_nfserr;
}
- *cnt = host_err;
nfsd_stats_io_write_add(nn, exp, *cnt);
fsnotify_modify(file);
host_err = filemap_check_wb_err(file->f_mapping, since);