From cd95e366c9e380ca4fbe91cb38756f06d2ad869f Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 30 May 2025 20:32:04 +0300 Subject: [PATCH 01/24] fs/read_write: Fix spelling typo 'implemenation' --> 'implementation'. Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/20250530173204.3611576-1-andriy.shevchenko@linux.intel.com Signed-off-by: Christian Brauner --- fs/read_write.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/read_write.c b/fs/read_write.c index 0ef70e128c4a..81a9474dc396 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -237,7 +237,7 @@ EXPORT_SYMBOL(generic_llseek_cookie); * @offset: file offset to seek to * @whence: type of seek * - * This is a generic implemenation of ->llseek useable for all normal local + * This is a generic implementation of ->llseek useable for all normal local * filesystems. It just updates the file offset to the value specified by * @offset and @whence. */ From dd765ba8723958514eab2fc742bef69019a21069 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 30 May 2025 05:25:35 -0600 Subject: [PATCH 02/24] fs/pipe: set FMODE_NOWAIT in create_pipe_files() Rather than have the caller set the FMODE_NOWAIT flags for both output files, move it to create_pipe_files() where other f_mode flags are set anyway with stream_open(). With that, both __do_pipe_flags() and io_pipe() can remove the manual setting of the NOWAIT flags. No intended functional changes, just a code cleanup. Signed-off-by: Jens Axboe Link: https://lore.kernel.org/1f0473f8-69f3-4eb1-aa77-3334c6a71d24@kernel.dk Signed-off-by: Christian Brauner --- fs/pipe.c | 8 +++++--- io_uring/openclose.c | 2 -- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/pipe.c b/fs/pipe.c index 45077c37bad1..731622d0738d 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -963,6 +963,11 @@ int create_pipe_files(struct file **res, int flags) res[1] = f; stream_open(inode, res[0]); stream_open(inode, res[1]); + + /* pipe groks IOCB_NOWAIT */ + res[0]->f_mode |= FMODE_NOWAIT; + res[1]->f_mode |= FMODE_NOWAIT; + /* * Disable permission and pre-content events, but enable legacy * inotify events for legacy users. @@ -997,9 +1002,6 @@ static int __do_pipe_flags(int *fd, struct file **files, int flags) audit_fd_pair(fdr, fdw); fd[0] = fdr; fd[1] = fdw; - /* pipe groks IOCB_NOWAIT */ - files[0]->f_mode |= FMODE_NOWAIT; - files[1]->f_mode |= FMODE_NOWAIT; return 0; err_fdr: diff --git a/io_uring/openclose.c b/io_uring/openclose.c index 83e36ad4e31b..d70700e5cef8 100644 --- a/io_uring/openclose.c +++ b/io_uring/openclose.c @@ -416,8 +416,6 @@ int io_pipe(struct io_kiocb *req, unsigned int issue_flags) ret = create_pipe_files(files, p->flags); if (ret) return ret; - files[0]->f_mode |= FMODE_NOWAIT; - files[1]->f_mode |= FMODE_NOWAIT; if (!!p->file_slot) ret = io_pipe_fixed(req, files, issue_flags); From d209f6e122950d9b6f329f3538b785dd709001e5 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 2 Jun 2025 07:58:54 -0400 Subject: [PATCH 03/24] filelock: add new locks_wake_up_waiter() helper Currently the function that does this takes a struct file_lock, but __locks_wake_up_blocks() deals with both locks and leases. Currently this works because both file_lock and file_lease have the file_lock_core at the beginning of the struct, but it's fragile to rely on that. Add a new locks_wake_up_waiter() function and call that from __locks_wake_up_blocks(). Signed-off-by: Jeff Layton Link: https://lore.kernel.org/20250602-filelock-6-16-v1-1-7da5b2c930fd@kernel.org Signed-off-by: Christian Brauner --- fs/locks.c | 2 +- include/linux/filelock.h | 7 ++++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/fs/locks.c b/fs/locks.c index 1619cddfa7a4..f96024feab17 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -712,7 +712,7 @@ static void __locks_wake_up_blocks(struct file_lock_core *blocker) fl->fl_lmops && fl->fl_lmops->lm_notify) fl->fl_lmops->lm_notify(fl); else - locks_wake_up(fl); + locks_wake_up_waiter(waiter); /* * The setting of flc_blocker to NULL marks the "done" diff --git a/include/linux/filelock.h b/include/linux/filelock.h index c412ded9171e..c2ce8ba05d06 100644 --- a/include/linux/filelock.h +++ b/include/linux/filelock.h @@ -175,9 +175,14 @@ static inline bool lock_is_write(struct file_lock *fl) return fl->c.flc_type == F_WRLCK; } +static inline void locks_wake_up_waiter(struct file_lock_core *flc) +{ + wake_up(&flc->flc_wait); +} + static inline void locks_wake_up(struct file_lock *fl) { - wake_up(&fl->c.flc_wait); + locks_wake_up_waiter(&fl->c); } static inline bool locks_can_async_lock(const struct file_operations *fops) From 88b1de54979c7ac8b1faec646c600ce66f0da24e Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 15 Jun 2025 01:32:16 +0100 Subject: [PATCH 04/24] don't duplicate vfs_open() in kernel_file_open() Signed-off-by: Al Viro Link: https://lore.kernel.org/20250615003216.GB3011112@ZenIV Reviewed-by: Christian Brauner Signed-off-by: Christian Brauner --- fs/open.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/fs/open.c b/fs/open.c index 7828234a7caa..b29d1e077164 100644 --- a/fs/open.c +++ b/fs/open.c @@ -1204,14 +1204,11 @@ struct file *kernel_file_open(const struct path *path, int flags, if (IS_ERR(f)) return f; - f->f_path = *path; - error = do_dentry_open(f, NULL); + error = vfs_open(path, f); if (error) { fput(f); return ERR_PTR(error); } - - fsnotify_open(f); return f; } EXPORT_SYMBOL_GPL(kernel_file_open); From b5ba648a7dd2b42725ae2d07335f0551111437a2 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 15 Jun 2025 01:33:21 +0100 Subject: [PATCH 05/24] proc_fd_getattr(): don't bother with S_ISDIR() check that thing is callable only as ->i_op->getattr() instance and only for directory inodes (/proc/*/fd and /proc/*/task/*/fd) Signed-off-by: Al Viro Link: https://lore.kernel.org/20250615003321.GC3011112@ZenIV Reviewed-by: Christian Brauner Signed-off-by: Christian Brauner --- fs/proc/fd.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/fs/proc/fd.c b/fs/proc/fd.c index 37aa778d1af7..9eeccff49b2a 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -352,18 +352,9 @@ static int proc_fd_getattr(struct mnt_idmap *idmap, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); - int rv = 0; generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); - - /* If it's a directory, put the number of open fds there */ - if (S_ISDIR(inode->i_mode)) { - rv = proc_readfd_count(inode, &stat->size); - if (rv < 0) - return rv; - } - - return rv; + return proc_readfd_count(inode, &stat->size); } const struct inode_operations proc_fd_inode_operations = { From bc9241367aac08de44633fd957b2452a6da8e6d4 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 13 Jun 2025 09:28:10 +1000 Subject: [PATCH 06/24] VFS: change old_dir and new_dir in struct renamedata to dentrys all users of 'struct renamedata' have the dentry for the old and new directories, and often have no use for the inode except to store it in the renamedata. This patch changes struct renamedata to hold the dentry, rather than the inode, for the old and new directories, and changes callers to match. The names are also changed from a _dir suffix to _parent. This is consistent with other usage in namei.c and elsewhere. This results in the removal of several local variables and several dereferences of ->d_inode at the cost of adding ->d_inode dereferences to vfs_rename(). Acked-by: Miklos Szeredi Reviewed-by: Chuck Lever Reviewed-by: Namjae Jeon Signed-off-by: NeilBrown Link: https://lore.kernel.org/174977089072.608730.4244531834577097454@noble.neil.brown.name Signed-off-by: Christian Brauner --- fs/cachefiles/namei.c | 4 ++-- fs/ecryptfs/inode.c | 4 ++-- fs/namei.c | 7 ++++--- fs/nfsd/vfs.c | 7 ++----- fs/overlayfs/copy_up.c | 6 +++--- fs/overlayfs/dir.c | 16 ++++++++-------- fs/overlayfs/overlayfs.h | 16 ++++++++-------- fs/overlayfs/readdir.c | 2 +- fs/overlayfs/super.c | 2 +- fs/overlayfs/util.c | 2 +- fs/smb/server/vfs.c | 4 ++-- include/linux/fs.h | 8 ++++---- 12 files changed, 38 insertions(+), 40 deletions(-) diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index aecfc5c37b49..91dfd0231877 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -388,10 +388,10 @@ int cachefiles_bury_object(struct cachefiles_cache *cache, } else { struct renamedata rd = { .old_mnt_idmap = &nop_mnt_idmap, - .old_dir = d_inode(dir), + .old_parent = dir, .old_dentry = rep, .new_mnt_idmap = &nop_mnt_idmap, - .new_dir = d_inode(cache->graveyard), + .new_parent = cache->graveyard, .new_dentry = grave, }; trace_cachefiles_rename(object, d_inode(rep)->i_ino, why); diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 493d7f194956..bd317d943d62 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -635,10 +635,10 @@ ecryptfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, } rd.old_mnt_idmap = &nop_mnt_idmap; - rd.old_dir = d_inode(lower_old_dir_dentry); + rd.old_parent = lower_old_dir_dentry; rd.old_dentry = lower_old_dentry; rd.new_mnt_idmap = &nop_mnt_idmap; - rd.new_dir = d_inode(lower_new_dir_dentry); + rd.new_parent = lower_new_dir_dentry; rd.new_dentry = lower_new_dentry; rc = vfs_rename(&rd); if (rc) diff --git a/fs/namei.c b/fs/namei.c index 4bb889fc980b..981da44e1291 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -5007,7 +5007,8 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname int vfs_rename(struct renamedata *rd) { int error; - struct inode *old_dir = rd->old_dir, *new_dir = rd->new_dir; + struct inode *old_dir = d_inode(rd->old_parent); + struct inode *new_dir = d_inode(rd->new_parent); struct dentry *old_dentry = rd->old_dentry; struct dentry *new_dentry = rd->new_dentry; struct inode **delegated_inode = rd->delegated_inode; @@ -5266,10 +5267,10 @@ int do_renameat2(int olddfd, struct filename *from, int newdfd, if (error) goto exit5; - rd.old_dir = old_path.dentry->d_inode; + rd.old_parent = old_path.dentry; rd.old_dentry = old_dentry; rd.old_mnt_idmap = mnt_idmap(old_path.mnt); - rd.new_dir = new_path.dentry->d_inode; + rd.new_parent = new_path.dentry; rd.new_dentry = new_dentry; rd.new_mnt_idmap = mnt_idmap(new_path.mnt); rd.delegated_inode = &delegated_inode; diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index cd689df2ca5d..7d522e426b2d 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1864,7 +1864,6 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, struct svc_fh *tfhp, char *tname, int tlen) { struct dentry *fdentry, *tdentry, *odentry, *ndentry, *trap; - struct inode *fdir, *tdir; int type = S_IFDIR; __be32 err; int host_err; @@ -1880,10 +1879,8 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, goto out; fdentry = ffhp->fh_dentry; - fdir = d_inode(fdentry); tdentry = tfhp->fh_dentry; - tdir = d_inode(tdentry); err = nfserr_perm; if (!flen || isdotent(fname, flen) || !tlen || isdotent(tname, tlen)) @@ -1944,10 +1941,10 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, } else { struct renamedata rd = { .old_mnt_idmap = &nop_mnt_idmap, - .old_dir = fdir, + .old_parent = fdentry, .old_dentry = odentry, .new_mnt_idmap = &nop_mnt_idmap, - .new_dir = tdir, + .new_parent = tdentry, .new_dentry = ndentry, }; int retries; diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index d7310fcf3888..8a3c0d18ec2e 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -563,7 +563,7 @@ static int ovl_create_index(struct dentry *dentry, const struct ovl_fh *fh, if (IS_ERR(index)) { err = PTR_ERR(index); } else { - err = ovl_do_rename(ofs, dir, temp, dir, index, 0); + err = ovl_do_rename(ofs, indexdir, temp, indexdir, index, 0); dput(index); } out: @@ -762,7 +762,7 @@ static int ovl_copy_up_workdir(struct ovl_copy_up_ctx *c) { struct ovl_fs *ofs = OVL_FS(c->dentry->d_sb); struct inode *inode; - struct inode *udir = d_inode(c->destdir), *wdir = d_inode(c->workdir); + struct inode *wdir = d_inode(c->workdir); struct path path = { .mnt = ovl_upper_mnt(ofs) }; struct dentry *temp, *upper, *trap; struct ovl_cu_creds cc; @@ -829,7 +829,7 @@ static int ovl_copy_up_workdir(struct ovl_copy_up_ctx *c) if (IS_ERR(upper)) goto cleanup; - err = ovl_do_rename(ofs, wdir, temp, udir, upper, 0); + err = ovl_do_rename(ofs, c->workdir, temp, c->destdir, upper, 0); dput(upper); if (err) goto cleanup; diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index fe493f3ed6b6..4fc221ea6480 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -107,7 +107,7 @@ static struct dentry *ovl_whiteout(struct ovl_fs *ofs) } /* Caller must hold i_mutex on both workdir and dir */ -int ovl_cleanup_and_whiteout(struct ovl_fs *ofs, struct inode *dir, +int ovl_cleanup_and_whiteout(struct ovl_fs *ofs, struct dentry *dir, struct dentry *dentry) { struct inode *wdir = ofs->workdir->d_inode; @@ -123,7 +123,7 @@ int ovl_cleanup_and_whiteout(struct ovl_fs *ofs, struct inode *dir, if (d_is_dir(dentry)) flags = RENAME_EXCHANGE; - err = ovl_do_rename(ofs, wdir, whiteout, dir, dentry, flags); + err = ovl_do_rename(ofs, ofs->workdir, whiteout, dir, dentry, flags); if (err) goto kill_whiteout; if (flags) @@ -384,7 +384,7 @@ static struct dentry *ovl_clear_empty(struct dentry *dentry, if (err) goto out_cleanup; - err = ovl_do_rename(ofs, wdir, opaquedir, udir, upper, RENAME_EXCHANGE); + err = ovl_do_rename(ofs, workdir, opaquedir, upperdir, upper, RENAME_EXCHANGE); if (err) goto out_cleanup; @@ -491,14 +491,14 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode, if (err) goto out_cleanup; - err = ovl_do_rename(ofs, wdir, newdentry, udir, upper, + err = ovl_do_rename(ofs, workdir, newdentry, upperdir, upper, RENAME_EXCHANGE); if (err) goto out_cleanup; ovl_cleanup(ofs, wdir, upper); } else { - err = ovl_do_rename(ofs, wdir, newdentry, udir, upper, 0); + err = ovl_do_rename(ofs, workdir, newdentry, upperdir, upper, 0); if (err) goto out_cleanup; } @@ -774,7 +774,7 @@ static int ovl_remove_and_whiteout(struct dentry *dentry, goto out_dput_upper; } - err = ovl_cleanup_and_whiteout(ofs, d_inode(upperdir), upper); + err = ovl_cleanup_and_whiteout(ofs, upperdir, upper); if (err) goto out_d_drop; @@ -1246,8 +1246,8 @@ static int ovl_rename(struct mnt_idmap *idmap, struct inode *olddir, if (err) goto out_dput; - err = ovl_do_rename(ofs, old_upperdir->d_inode, olddentry, - new_upperdir->d_inode, newdentry, flags); + err = ovl_do_rename(ofs, old_upperdir, olddentry, + new_upperdir, newdentry, flags); if (err) goto out_dput; diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 8baaba0a3fe5..78deb89e16b5 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -353,19 +353,19 @@ static inline int ovl_do_remove_acl(struct ovl_fs *ofs, struct dentry *dentry, return vfs_remove_acl(ovl_upper_mnt_idmap(ofs), dentry, acl_name); } -static inline int ovl_do_rename(struct ovl_fs *ofs, struct inode *olddir, - struct dentry *olddentry, struct inode *newdir, +static inline int ovl_do_rename(struct ovl_fs *ofs, struct dentry *olddir, + struct dentry *olddentry, struct dentry *newdir, struct dentry *newdentry, unsigned int flags) { int err; struct renamedata rd = { .old_mnt_idmap = ovl_upper_mnt_idmap(ofs), - .old_dir = olddir, - .old_dentry = olddentry, + .old_parent = olddir, + .old_dentry = olddentry, .new_mnt_idmap = ovl_upper_mnt_idmap(ofs), - .new_dir = newdir, - .new_dentry = newdentry, - .flags = flags, + .new_parent = newdir, + .new_dentry = newdentry, + .flags = flags, }; pr_debug("rename(%pd2, %pd2, 0x%x)\n", olddentry, newdentry, flags); @@ -826,7 +826,7 @@ static inline void ovl_copyflags(struct inode *from, struct inode *to) /* dir.c */ extern const struct inode_operations ovl_dir_inode_operations; -int ovl_cleanup_and_whiteout(struct ovl_fs *ofs, struct inode *dir, +int ovl_cleanup_and_whiteout(struct ovl_fs *ofs, struct dentry *dir, struct dentry *dentry); struct ovl_cattr { dev_t rdev; diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index 474c80d210d1..68cca52ae2ac 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -1235,7 +1235,7 @@ int ovl_indexdir_cleanup(struct ovl_fs *ofs) * Whiteout orphan index to block future open by * handle after overlay nlink dropped to zero. */ - err = ovl_cleanup_and_whiteout(ofs, dir, index); + err = ovl_cleanup_and_whiteout(ofs, indexdir, index); } else { /* Cleanup orphan index entries */ err = ovl_cleanup(ofs, dir, index); diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index e19940d649ca..cf99b276fdfb 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -580,7 +580,7 @@ static int ovl_check_rename_whiteout(struct ovl_fs *ofs) /* Name is inline and stable - using snapshot as a copy helper */ take_dentry_name_snapshot(&name, temp); - err = ovl_do_rename(ofs, dir, temp, dir, dest, RENAME_WHITEOUT); + err = ovl_do_rename(ofs, workdir, temp, workdir, dest, RENAME_WHITEOUT); if (err) { if (err == -EINVAL) err = 0; diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index dcccb4b4a66c..2b4754c645ee 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -1115,7 +1115,7 @@ static void ovl_cleanup_index(struct dentry *dentry) } else if (ovl_index_all(dentry->d_sb)) { /* Whiteout orphan index to block future open by handle */ err = ovl_cleanup_and_whiteout(OVL_FS(dentry->d_sb), - dir, index); + indexdir, index); } else { /* Cleanup orphan index entries */ err = ovl_cleanup(ofs, dir, index); diff --git a/fs/smb/server/vfs.c b/fs/smb/server/vfs.c index ba45e809555a..2f0171896e5d 100644 --- a/fs/smb/server/vfs.c +++ b/fs/smb/server/vfs.c @@ -764,10 +764,10 @@ int ksmbd_vfs_rename(struct ksmbd_work *work, const struct path *old_path, } rd.old_mnt_idmap = mnt_idmap(old_path->mnt), - rd.old_dir = d_inode(old_parent), + rd.old_parent = old_parent, rd.old_dentry = old_child, rd.new_mnt_idmap = mnt_idmap(new_path.mnt), - rd.new_dir = new_path.dentry->d_inode, + rd.new_parent = new_path.dentry, rd.new_dentry = new_dentry, rd.flags = flags, rd.delegated_inode = NULL, diff --git a/include/linux/fs.h b/include/linux/fs.h index 96c7925a6551..1d9586a78041 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2004,20 +2004,20 @@ int vfs_unlink(struct mnt_idmap *, struct inode *, struct dentry *, /** * struct renamedata - contains all information required for renaming * @old_mnt_idmap: idmap of the old mount the inode was found from - * @old_dir: parent of source + * @old_parent: parent of source * @old_dentry: source * @new_mnt_idmap: idmap of the new mount the inode was found from - * @new_dir: parent of destination + * @new_parent: parent of destination * @new_dentry: destination * @delegated_inode: returns an inode needing a delegation break * @flags: rename flags */ struct renamedata { struct mnt_idmap *old_mnt_idmap; - struct inode *old_dir; + struct dentry *old_parent; struct dentry *old_dentry; struct mnt_idmap *new_mnt_idmap; - struct inode *new_dir; + struct dentry *new_parent; struct dentry *new_dentry; struct inode **delegated_inode; unsigned int flags; From ffaf1bf3737f706e4e9be876de4bc3c8fc578091 Mon Sep 17 00:00:00 2001 From: RubenKelevra Date: Wed, 18 Jun 2025 01:09:27 +0200 Subject: [PATCH 07/24] fs_context: fix parameter name in infofc() macro MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The macro takes a parameter called "p" but references "fc" internally. This happens to compile as long as callers pass a variable named fc, but breaks otherwise. Rename the first parameter to “fc” to match the usage and to be consistent with warnfc() / errorfc(). Fixes: a3ff937b33d9 ("prefix-handling analogues of errorf() and friends") Signed-off-by: RubenKelevra Link: https://lore.kernel.org/20250617230927.1790401-1-rubenkelevra@gmail.com Signed-off-by: Christian Brauner --- include/linux/fs_context.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/fs_context.h b/include/linux/fs_context.h index a19e4bd32e4d..7773eb870039 100644 --- a/include/linux/fs_context.h +++ b/include/linux/fs_context.h @@ -200,7 +200,7 @@ void logfc(struct fc_log *log, const char *prefix, char level, const char *fmt, */ #define infof(fc, fmt, ...) __logfc(fc, 'i', fmt, ## __VA_ARGS__) #define info_plog(p, fmt, ...) __plog(p, 'i', fmt, ## __VA_ARGS__) -#define infofc(p, fmt, ...) __plog((&(fc)->log), 'i', fmt, ## __VA_ARGS__) +#define infofc(fc, fmt, ...) __plog((&(fc)->log), 'i', fmt, ## __VA_ARGS__) /** * warnf - Store supplementary warning message From 6ae58121126dcf8efcc2611f216a36a5e50b8ad9 Mon Sep 17 00:00:00 2001 From: Pankaj Raghav Date: Wed, 18 Jun 2025 09:58:21 +0200 Subject: [PATCH 08/24] fs/buffer: remove comment about hard sectorsize Commit e1defc4ff0cf ("block: Do away with the notion of hardsect_size") changed hardsect_size to logical block size. The comment on top still says hardsect_size. Remove the comment as the code is pretty clear. While we are at it, format the relevant code. Reviewed-by: "Martin K. Petersen" Signed-off-by: Pankaj Raghav Link: https://lore.kernel.org/20250618075821.111459-1-p.raghav@samsung.com Reviewed-by: Daniel Gomez Signed-off-by: Christian Brauner --- fs/buffer.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/buffer.c b/fs/buffer.c index 8cf4a1dc481e..a14d281c6a74 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1122,9 +1122,8 @@ __getblk_slow(struct block_device *bdev, sector_t block, { bool blocking = gfpflags_allow_blocking(gfp); - /* Size must be multiple of hard sectorsize */ - if (unlikely(size & (bdev_logical_block_size(bdev)-1) || - (size < 512 || size > PAGE_SIZE))) { + if (unlikely(size & (bdev_logical_block_size(bdev) - 1) || + (size < 512 || size > PAGE_SIZE))) { printk(KERN_ERR "getblk(): invalid block size %d requested\n", size); printk(KERN_ERR "logical block size: %d\n", From 2773d282cd56464f62e9b4703c41d2f733a67842 Mon Sep 17 00:00:00 2001 From: Junxuan Liao Date: Sun, 22 Jun 2025 23:01:32 -0500 Subject: [PATCH 09/24] docs/vfs: update references to i_mutex to i_rwsem VFS has switched to i_rwsem for ten years now (9902af79c01a: parallel lookups actual switch to rwsem), but the VFS documentation and comments still has references to i_mutex. Signed-off-by: Junxuan Liao Link: https://lore.kernel.org/72223729-5471-474a-af3c-f366691fba82@cs.wisc.edu Signed-off-by: Christian Brauner --- Documentation/filesystems/vfs.rst | 5 +++-- fs/attr.c | 10 +++++----- fs/buffer.c | 2 +- fs/dcache.c | 10 +++++----- fs/direct-io.c | 8 ++++---- fs/inode.c | 9 ++++----- fs/libfs.c | 5 +++-- fs/locks.c | 2 +- fs/namei.c | 22 +++++++++++----------- fs/namespace.c | 2 +- fs/stack.c | 4 ++-- fs/xattr.c | 2 +- include/linux/exportfs.h | 4 ++-- include/linux/fs.h | 6 +++--- include/linux/fs_stack.h | 2 +- include/linux/quotaops.h | 2 +- 16 files changed, 48 insertions(+), 47 deletions(-) diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst index fd32a9a17bfb..dd9da7e04a99 100644 --- a/Documentation/filesystems/vfs.rst +++ b/Documentation/filesystems/vfs.rst @@ -758,8 +758,9 @@ process is more complicated and uses write_begin/write_end or dirty_folio to write data into the address_space, and writepages to writeback data to storage. -Adding and removing pages to/from an address_space is protected by the -inode's i_mutex. +Removing pages from an address_space requires holding the inode's i_rwsem +exclusively, while adding pages to the address_space requires holding the +inode's i_mapping->invalidate_lock exclusively. When data is written to a page, the PG_Dirty flag should be set. It typically remains set until writepages asks for it to be written. This diff --git a/fs/attr.c b/fs/attr.c index 9caf63d20d03..5425c1dbbff9 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -230,7 +230,7 @@ EXPORT_SYMBOL(setattr_prepare); * @inode: the inode to be truncated * @offset: the new size to assign to the inode * - * inode_newsize_ok must be called with i_mutex held. + * inode_newsize_ok must be called with i_rwsem held exclusively. * * inode_newsize_ok will check filesystem limits and ulimits to check that the * new inode size is within limits. inode_newsize_ok will also send SIGXFSZ @@ -318,7 +318,7 @@ static void setattr_copy_mgtime(struct inode *inode, const struct iattr *attr) * @inode: the inode to be updated * @attr: the new attributes * - * setattr_copy must be called with i_mutex held. + * setattr_copy must be called with i_rwsem held exclusively. * * setattr_copy updates the inode's metadata with that specified * in attr on idmapped mounts. Necessary permission checks to determine @@ -403,13 +403,13 @@ EXPORT_SYMBOL(may_setattr); * @attr: new attributes * @delegated_inode: returns inode, if the inode is delegated * - * The caller must hold the i_mutex on the affected object. + * The caller must hold the i_rwsem exclusively on the affected object. * * If notify_change discovers a delegation in need of breaking, * it will return -EWOULDBLOCK and return a reference to the inode in * delegated_inode. The caller should then break the delegation and * retry. Because breaking a delegation may take a long time, the - * caller should drop the i_mutex before doing so. + * caller should drop the i_rwsem before doing so. * * Alternatively, a caller may pass NULL for delegated_inode. This may * be appropriate for callers that expect the underlying filesystem not @@ -456,7 +456,7 @@ int notify_change(struct mnt_idmap *idmap, struct dentry *dentry, if (S_ISLNK(inode->i_mode)) return -EOPNOTSUPP; - /* Flag setting protected by i_mutex */ + /* Flag setting protected by i_rwsem */ if (is_sxid(attr->ia_mode)) inode->i_flags &= ~S_NOSEC; } diff --git a/fs/buffer.c b/fs/buffer.c index a14d281c6a74..1d34200f69c8 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2609,7 +2609,7 @@ EXPORT_SYMBOL(cont_write_begin); * holes and correct delalloc and unwritten extent mapping on filesystems that * support these features. * - * We are not allowed to take the i_mutex here so we have to play games to + * We are not allowed to take the i_rwsem here so we have to play games to * protect against truncate races as the page could now be beyond EOF. Because * truncate writes the inode size before removing pages, once we have the * page lock we can determine safely if the page is beyond EOF. If it is not diff --git a/fs/dcache.c b/fs/dcache.c index 03d58b2d4fa3..ab8465ae9cad 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2774,10 +2774,10 @@ static void copy_name(struct dentry *dentry, struct dentry *target) * @target: new dentry * @exchange: exchange the two dentries * - * Update the dcache to reflect the move of a file name. Negative - * dcache entries should not be moved in this way. Caller must hold - * rename_lock, the i_mutex of the source and target directories, - * and the sb->s_vfs_rename_mutex if they differ. See lock_rename(). + * Update the dcache to reflect the move of a file name. Negative dcache + * entries should not be moved in this way. Caller must hold rename_lock, the + * i_rwsem of the source and target directories (exclusively), and the sb-> + * s_vfs_rename_mutex if they differ. See lock_rename(). */ static void __d_move(struct dentry *dentry, struct dentry *target, bool exchange) @@ -2923,7 +2923,7 @@ struct dentry *d_ancestor(struct dentry *p1, struct dentry *p2) * This helper attempts to cope with remotely renamed directories * * It assumes that the caller is already holding - * dentry->d_parent->d_inode->i_mutex, and rename_lock + * dentry->d_parent->d_inode->i_rwsem, and rename_lock * * Note: If ever the locking in lock_rename() changes, then please * remember to update this too... diff --git a/fs/direct-io.c b/fs/direct-io.c index bbd05f1a2145..1694ee9a9382 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -1083,8 +1083,8 @@ static inline int drop_refcount(struct dio *dio) * The locking rules are governed by the flags parameter: * - if the flags value contains DIO_LOCKING we use a fancy locking * scheme for dumb filesystems. - * For writes this function is called under i_mutex and returns with - * i_mutex held, for reads, i_mutex is not held on entry, but it is + * For writes this function is called under i_rwsem and returns with + * i_rwsem held, for reads, i_rwsem is not held on entry, but it is * taken and dropped again before returning. * - if the flags value does NOT contain DIO_LOCKING we don't use any * internal locking but rather rely on the filesystem to synchronize @@ -1094,7 +1094,7 @@ static inline int drop_refcount(struct dio *dio) * counter before starting direct I/O, and decrement it once we are done. * Truncate can wait for it to reach zero to provide exclusion. It is * expected that filesystem provide exclusion between new direct I/O - * and truncates. For DIO_LOCKING filesystems this is done by i_mutex, + * and truncates. For DIO_LOCKING filesystems this is done by i_rwsem, * but other filesystems need to take care of this on their own. * * NOTE: if you pass "sdio" to anything by pointer make sure that function @@ -1279,7 +1279,7 @@ ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, /* * All block lookups have been performed. For READ requests - * we can let i_mutex go now that its achieved its purpose + * we can let i_rwsem go now that its achieved its purpose * of protecting us from looking up uninitialized blocks. */ if (iov_iter_rw(iter) == READ && (dio->flags & DIO_LOCKING)) diff --git a/fs/inode.c b/fs/inode.c index 99318b157a9a..a0150e2ef22a 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1158,9 +1158,8 @@ void lockdep_annotate_inode_mutex_key(struct inode *inode) /* Set new key only if filesystem hasn't already changed it */ if (lockdep_match_class(&inode->i_rwsem, &type->i_mutex_key)) { /* - * ensure nobody is actually holding i_mutex + * ensure nobody is actually holding i_rwsem */ - // mutex_destroy(&inode->i_mutex); init_rwsem(&inode->i_rwsem); lockdep_set_class(&inode->i_rwsem, &type->i_mutex_dir_key); @@ -2615,7 +2614,7 @@ EXPORT_SYMBOL(inode_dio_finished); * proceed with a truncate or equivalent operation. * * Must be called under a lock that serializes taking new references - * to i_dio_count, usually by inode->i_mutex. + * to i_dio_count, usually by inode->i_rwsem. */ void inode_dio_wait(struct inode *inode) { @@ -2633,7 +2632,7 @@ EXPORT_SYMBOL(inode_dio_wait_interruptible); /* * inode_set_flags - atomically set some inode flags * - * Note: the caller should be holding i_mutex, or else be sure that + * Note: the caller should be holding i_rwsem exclusively, or else be sure that * they have exclusive access to the inode structure (i.e., while the * inode is being instantiated). The reason for the cmpxchg() loop * --- which wouldn't be necessary if all code paths which modify @@ -2641,7 +2640,7 @@ EXPORT_SYMBOL(inode_dio_wait_interruptible); * code path which doesn't today so we use cmpxchg() out of an abundance * of caution. * - * In the long run, i_mutex is overkill, and we should probably look + * In the long run, i_rwsem is overkill, and we should probably look * at using the i_lock spinlock to protect i_flags, and then make sure * it is so documented in include/linux/fs.h and that all code follows * the locking convention!! diff --git a/fs/libfs.c b/fs/libfs.c index 9ea0ecc325a8..4d1862f589e8 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -946,7 +946,8 @@ EXPORT_SYMBOL(simple_write_begin); * simple_write_end does the minimum needed for updating a folio after * writing is done. It has the same API signature as the .write_end of * address_space_operations vector. So it can just be set onto .write_end for - * FSes that don't need any other processing. i_mutex is assumed to be held. + * FSes that don't need any other processing. i_rwsem is assumed to be held + * exclusively. * Block based filesystems should use generic_write_end(). * NOTE: Even though i_size might get updated by this function, mark_inode_dirty * is not called, so a filesystem that actually does store data in .write_inode @@ -973,7 +974,7 @@ static int simple_write_end(struct file *file, struct address_space *mapping, } /* * No need to use i_size_read() here, the i_size - * cannot change under us because we hold the i_mutex. + * cannot change under us because we hold the i_rwsem. */ if (last_pos > inode->i_size) i_size_write(inode, last_pos); diff --git a/fs/locks.c b/fs/locks.c index f96024feab17..559f02aa4172 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1794,7 +1794,7 @@ generic_add_lease(struct file *filp, int arg, struct file_lease **flp, void **pr /* * In the delegation case we need mutual exclusion with - * a number of operations that take the i_mutex. We trylock + * a number of operations that take the i_rwsem. We trylock * because delegations are an optional optimization, and if * there's some chance of a conflict--we'd rather not * bother, maybe that's a sign this just isn't a good file to diff --git a/fs/namei.c b/fs/namei.c index 981da44e1291..f5c157290ce2 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1469,7 +1469,7 @@ static int __traverse_mounts(struct path *path, unsigned flags, bool *jumped, int ret = 0; while (flags & DCACHE_MANAGED_DENTRY) { - /* Allow the filesystem to manage the transit without i_mutex + /* Allow the filesystem to manage the transit without i_rwsem * being held. */ if (flags & DCACHE_MANAGE_TRANSIT) { ret = path->dentry->d_op->d_manage(path, false); @@ -2945,7 +2945,7 @@ EXPORT_SYMBOL(try_lookup_noperm); * Note that this routine is purely a helper for filesystem usage and should * not be called by generic code. It does no permission checking. * - * The caller must hold base->i_mutex. + * The caller must hold base->i_rwsem. */ struct dentry *lookup_noperm(struct qstr *name, struct dentry *base) { @@ -2971,7 +2971,7 @@ EXPORT_SYMBOL(lookup_noperm); * * This can be used for in-kernel filesystem clients such as file servers. * - * The caller must hold base->i_mutex. + * The caller must hold base->i_rwsem. */ struct dentry *lookup_one(struct mnt_idmap *idmap, struct qstr *name, struct dentry *base) @@ -4542,13 +4542,13 @@ SYSCALL_DEFINE1(rmdir, const char __user *, pathname) * @dentry: victim * @delegated_inode: returns victim inode, if the inode is delegated. * - * The caller must hold dir->i_mutex. + * The caller must hold dir->i_rwsem exclusively. * * If vfs_unlink discovers a delegation, it will return -EWOULDBLOCK and * return a reference to the inode in delegated_inode. The caller * should then break the delegation on that inode and retry. Because * breaking a delegation may take a long time, the caller should drop - * dir->i_mutex before doing so. + * dir->i_rwsem before doing so. * * Alternatively, a caller may pass NULL for delegated_inode. This may * be appropriate for callers that expect the underlying filesystem not @@ -4607,7 +4607,7 @@ EXPORT_SYMBOL(vfs_unlink); /* * Make sure that the actual truncation of the file will occur outside its - * directory's i_mutex. Truncate can take a long time if there is a lot of + * directory's i_rwsem. Truncate can take a long time if there is a lot of * writeout happening, and we don't want to prevent access to the directory * while waiting on the I/O. */ @@ -4785,13 +4785,13 @@ SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newn * @new_dentry: where to create the new link * @delegated_inode: returns inode needing a delegation break * - * The caller must hold dir->i_mutex + * The caller must hold dir->i_rwsem exclusively. * * If vfs_link discovers a delegation on the to-be-linked file in need * of breaking, it will return -EWOULDBLOCK and return a reference to the * inode in delegated_inode. The caller should then break the delegation * and retry. Because breaking a delegation may take a long time, the - * caller should drop the i_mutex before doing so. + * caller should drop the i_rwsem before doing so. * * Alternatively, a caller may pass NULL for delegated_inode. This may * be appropriate for callers that expect the underlying filesystem not @@ -4987,7 +4987,7 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname * c) we may have to lock up to _four_ objects - parents and victim (if it exists), * and source (if it's a non-directory or a subdirectory that moves to * different parent). - * And that - after we got ->i_mutex on parents (until then we don't know + * And that - after we got ->i_rwsem on parents (until then we don't know * whether the target exists). Solution: try to be smart with locking * order for inodes. We rely on the fact that tree topology may change * only under ->s_vfs_rename_mutex _and_ that parent of the object we @@ -4999,9 +4999,9 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname * has no more than 1 dentry. If "hybrid" objects will ever appear, * we'd better make sure that there's no link(2) for them. * d) conversion from fhandle to dentry may come in the wrong moment - when - * we are removing the target. Solution: we will have to grab ->i_mutex + * we are removing the target. Solution: we will have to grab ->i_rwsem * in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on - * ->i_mutex on parents, which works but leads to some truly excessive + * ->i_rwsem on parents, which works but leads to some truly excessive * locking]. */ int vfs_rename(struct renamedata *rd) diff --git a/fs/namespace.c b/fs/namespace.c index e13d9ab4f564..8a1bfdf862f8 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2053,7 +2053,7 @@ static int do_umount(struct mount *mnt, int flags) * detach_mounts allows lazily unmounting those mounts instead of * leaking them. * - * The caller may hold dentry->d_inode->i_mutex. + * The caller may hold dentry->d_inode->i_rwsem. */ void __detach_mounts(struct dentry *dentry) { diff --git a/fs/stack.c b/fs/stack.c index f18920119944..d8c782e064e3 100644 --- a/fs/stack.c +++ b/fs/stack.c @@ -3,7 +3,7 @@ #include #include -/* does _NOT_ require i_mutex to be held. +/* does _NOT_ require i_rwsem to be held. * * This function cannot be inlined since i_size_{read,write} is rather * heavy-weight on 32-bit systems @@ -41,7 +41,7 @@ void fsstack_copy_inode_size(struct inode *dst, struct inode *src) * If CONFIG_SMP or CONFIG_PREEMPTION on 32-bit, it's vital for * fsstack_copy_inode_size() to hold some lock around * i_size_write(), otherwise i_size_read() may spin forever (see - * include/linux/fs.h). We don't necessarily hold i_mutex when this + * include/linux/fs.h). We don't necessarily hold i_rwsem when this * is called, so take i_lock for that case. * * And if on 32-bit, continue our effort to keep the two halves of diff --git a/fs/xattr.c b/fs/xattr.c index 8ec5b0204bfd..c32e7d56a5d3 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -215,7 +215,7 @@ EXPORT_SYMBOL(__vfs_setxattr); * * returns the result of the internal setxattr or setsecurity operations. * - * This function requires the caller to lock the inode's i_mutex before it + * This function requires the caller to lock the inode's i_rwsem before it * is executed. It also assumes that the caller will make the appropriate * permission checks. */ diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h index 25c4a5afbd44..cfb0dd1ea49c 100644 --- a/include/linux/exportfs.h +++ b/include/linux/exportfs.h @@ -230,7 +230,7 @@ struct handle_to_path_ctx { * directory. The name should be stored in the @name (with the * understanding that it is already pointing to a %NAME_MAX+1 sized * buffer. get_name() should return %0 on success, a negative error code - * or error. @get_name will be called without @parent->i_mutex held. + * or error. @get_name will be called without @parent->i_rwsem held. * * get_parent: * @get_parent should find the parent directory for the given @child which @@ -247,7 +247,7 @@ struct handle_to_path_ctx { * @commit_metadata should commit metadata changes to stable storage. * * Locking rules: - * get_parent is called with child->d_inode->i_mutex down + * get_parent is called with child->d_inode->i_rwsem down * get_name is not (which is possibly inconsistent) */ diff --git a/include/linux/fs.h b/include/linux/fs.h index 1d9586a78041..09e3e80b0528 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -837,7 +837,7 @@ static inline void inode_fake_hash(struct inode *inode) } /* - * inode->i_mutex nesting subclasses for the lock validator: + * inode->i_rwsem nesting subclasses for the lock validator: * * 0: the object of the current VFS operation * 1: parent @@ -989,7 +989,7 @@ static inline loff_t i_size_read(const struct inode *inode) /* * NOTE: unlike i_size_read(), i_size_write() does need locking around it - * (normally i_mutex), otherwise on 32bit/SMP an update of i_size_seqcount + * (normally i_rwsem), otherwise on 32bit/SMP an update of i_size_seqcount * can be lost, resulting in subsequent i_size_read() calls spinning forever. */ static inline void i_size_write(struct inode *inode, loff_t i_size) @@ -1921,7 +1921,7 @@ static inline void sb_end_intwrite(struct super_block *sb) * freeze protection should be the outermost lock. In particular, we have: * * sb_start_write - * -> i_mutex (write path, truncate, directory ops, ...) + * -> i_rwsem (write path, truncate, directory ops, ...) * -> s_umount (freeze_super, thaw_super) */ static inline void sb_start_write(struct super_block *sb) diff --git a/include/linux/fs_stack.h b/include/linux/fs_stack.h index 2b1f74b24070..0cc2fa283305 100644 --- a/include/linux/fs_stack.h +++ b/include/linux/fs_stack.h @@ -3,7 +3,7 @@ #define _LINUX_FS_STACK_H /* This file defines generic functions used primarily by stackable - * filesystems; none of these functions require i_mutex to be held. + * filesystems; none of these functions require i_rwsem to be held. */ #include diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h index 06cc8888199e..c334f82ed385 100644 --- a/include/linux/quotaops.h +++ b/include/linux/quotaops.h @@ -19,7 +19,7 @@ static inline struct quota_info *sb_dqopt(struct super_block *sb) return &sb->s_dquot; } -/* i_mutex must being held */ +/* i_rwsem must being held */ static inline bool is_quota_modification(struct mnt_idmap *idmap, struct inode *inode, struct iattr *ia) { From 2b7c9664c3ce9d8da1113a1ca2546046de8eb93e Mon Sep 17 00:00:00 2001 From: Dmitry Antipov Date: Fri, 20 Jun 2025 09:30:59 +0300 Subject: [PATCH 10/24] fs: annotate suspected data race between poll_schedule_timeout() and pollwake() When running almost any select()/poll() workload intense enough, KCSAN is likely to report data races around using 'triggered' flag of 'struct poll_wqueues'. For example, running 'find /' on a tty console may trigger the following: BUG: KCSAN: data-race in poll_schedule_timeout / pollwake write to 0xffffc900030cfb90 of 4 bytes by task 97 on cpu 5: pollwake+0xd1/0x130 __wake_up_common_lock+0x7f/0xd0 n_tty_receive_buf_common+0x776/0xc30 n_tty_receive_buf2+0x3d/0x60 tty_ldisc_receive_buf+0x6b/0x100 tty_port_default_receive_buf+0x63/0xa0 flush_to_ldisc+0x169/0x3c0 process_scheduled_works+0x6fe/0xf40 worker_thread+0x53b/0x7b0 kthread+0x4f8/0x590 ret_from_fork+0x28c/0x450 ret_from_fork_asm+0x1a/0x30 read to 0xffffc900030cfb90 of 4 bytes by task 5802 on cpu 4: poll_schedule_timeout+0x96/0x160 do_sys_poll+0x966/0xb30 __se_sys_ppoll+0x1c3/0x210 __x64_sys_ppoll+0x71/0x90 x64_sys_call+0x3079/0x32b0 do_syscall_64+0xfa/0x3b0 entry_SYSCALL_64_after_hwframe+0x77/0x7f According to Jan, "there's no practical issue here because it is hard to imagine how the compiler could compile the above code using some intermediate values stored into 'triggered' or multiple fetches from 'triggered'". Nevertheless, silence KCSAN by using WRITE_ONCE() in __pollwake() and READ_ONCE() in poll_schedule_timeout(), respectively. Link: https://lore.kernel.org/linux-fsdevel/bwx72orsztfjx6aoftzzkl7wle3hi4syvusuwc7x36nw6t235e@bjwrosehblty Signed-off-by: Dmitry Antipov Link: https://lore.kernel.org/20250620063059.1800689-1-dmantipov@yandex.ru Acked-by: Marco Elver Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/select.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/select.c b/fs/select.c index 9fb650d03d52..082cf60c7e23 100644 --- a/fs/select.c +++ b/fs/select.c @@ -192,7 +192,7 @@ static int __pollwake(wait_queue_entry_t *wait, unsigned mode, int sync, void *k * and is paired with smp_store_mb() in poll_schedule_timeout. */ smp_wmb(); - pwq->triggered = 1; + WRITE_ONCE(pwq->triggered, 1); /* * Perform the default wake up operation using a dummy @@ -237,7 +237,7 @@ static int poll_schedule_timeout(struct poll_wqueues *pwq, int state, int rc = -EINTR; set_current_state(state); - if (!pwq->triggered) + if (!READ_ONCE(pwq->triggered)) rc = schedule_hrtimeout_range(expires, slack, HRTIMER_MODE_ABS); __set_current_state(TASK_RUNNING); From 06a705356d7543b4eb2f41bef1ee0d5951700f7a Mon Sep 17 00:00:00 2001 From: Ankit Chauhan Date: Thu, 19 Jun 2025 08:45:36 +0530 Subject: [PATCH 11/24] fs/ecryptfs: replace snprintf with sysfs_emit in show function Use sysfs_emit() instead of snprintf() in version_show() function to follow the preferred kernel API. Signed-off-by: Ankit Chauhan Link: https://lore.kernel.org/20250619031536.19352-1-ankitchauhan2065@gmail.com Signed-off-by: Christian Brauner --- fs/ecryptfs/main.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index 8dd1d7189c3b..0db21e592fe7 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include "ecryptfs_kernel.h" @@ -764,7 +765,7 @@ static struct kobject *ecryptfs_kobj; static ssize_t version_show(struct kobject *kobj, struct kobj_attribute *attr, char *buff) { - return snprintf(buff, PAGE_SIZE, "%d\n", ECRYPTFS_VERSIONING_MASK); + return sysfs_emit(buff, "%d\n", ECRYPTFS_VERSIONING_MASK); } static struct kobj_attribute version_attr = __ATTR_RO(version); From b39f7d75dc41b5f5d028192cd5d66cff71179f35 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 24 Jun 2025 14:21:27 +0100 Subject: [PATCH 12/24] fs: Remove three arguments from block_write_end() block_write_end() looks like it can be used as a ->write_end() implementation. However, it can't as it does not unlock nor put the folio. Since it does not use the 'file', 'mapping' nor 'fsdata' arguments, remove them. Signed-off-by: "Matthew Wilcox (Oracle)" Link: https://lore.kernel.org/20250624132130.1590285-1-willy@infradead.org Signed-off-by: Christian Brauner --- block/fops.c | 2 +- fs/buffer.c | 7 +++---- fs/ext2/dir.c | 2 +- fs/ext4/inode.c | 5 ++--- fs/iomap/buffered-io.c | 3 +-- fs/minix/dir.c | 2 +- fs/nilfs2/dir.c | 2 +- fs/nilfs2/recovery.c | 3 +-- fs/ufs/dir.c | 2 +- include/linux/buffer_head.h | 4 +--- 10 files changed, 13 insertions(+), 19 deletions(-) diff --git a/block/fops.c b/block/fops.c index 1309861d4c2c..35cea0cb304d 100644 --- a/block/fops.c +++ b/block/fops.c @@ -507,7 +507,7 @@ static int blkdev_write_end(struct file *file, struct address_space *mapping, void *fsdata) { int ret; - ret = block_write_end(file, mapping, pos, len, copied, folio, fsdata); + ret = block_write_end(pos, len, copied, folio); folio_unlock(folio); folio_put(folio); diff --git a/fs/buffer.c b/fs/buffer.c index 1d34200f69c8..d61073143127 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2270,9 +2270,8 @@ int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len, } EXPORT_SYMBOL(block_write_begin); -int block_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct folio *folio, void *fsdata) +int block_write_end(loff_t pos, unsigned len, unsigned copied, + struct folio *folio) { size_t start = pos - folio_pos(folio); @@ -2311,7 +2310,7 @@ int generic_write_end(struct file *file, struct address_space *mapping, loff_t old_size = inode->i_size; bool i_size_changed = false; - copied = block_write_end(file, mapping, pos, len, copied, folio, fsdata); + copied = block_write_end(pos, len, copied, folio); /* * No need to use i_size_read() here, the i_size cannot change under us diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c index 402fecf90a44..b07b3b369710 100644 --- a/fs/ext2/dir.c +++ b/fs/ext2/dir.c @@ -87,7 +87,7 @@ static void ext2_commit_chunk(struct folio *folio, loff_t pos, unsigned len) struct inode *dir = mapping->host; inode_inc_iversion(dir); - block_write_end(NULL, mapping, pos, len, len, folio, NULL); + block_write_end(pos, len, len, folio); if (pos+len > dir->i_size) { i_size_write(dir, pos+len); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index be9a4cba35fd..e6aa7ca6d842 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1424,7 +1424,7 @@ static int ext4_write_end(struct file *file, return ext4_write_inline_data_end(inode, pos, len, copied, folio); - copied = block_write_end(file, mapping, pos, len, copied, folio, fsdata); + copied = block_write_end(pos, len, copied, folio); /* * it's important to update i_size while still holding folio lock: * page writeout could otherwise come in and zero beyond i_size. @@ -3144,8 +3144,7 @@ static int ext4_da_do_write_end(struct address_space *mapping, * block_write_end() will mark the inode as dirty with I_DIRTY_PAGES * flag, which all that's needed to trigger page writeback. */ - copied = block_write_end(NULL, mapping, pos, len, copied, - folio, NULL); + copied = block_write_end(pos, len, copied, folio); new_i_size = pos + copied; /* diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 3729391a18f3..775e8f1f0286 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -923,8 +923,7 @@ static bool iomap_write_end(struct iomap_iter *iter, size_t len, size_t copied, if (srcmap->flags & IOMAP_F_BUFFER_HEAD) { size_t bh_written; - bh_written = block_write_end(NULL, iter->inode->i_mapping, pos, - len, copied, folio, NULL); + bh_written = block_write_end(pos, len, copied, folio); WARN_ON_ONCE(bh_written != copied && bh_written != 0); return bh_written == copied; } diff --git a/fs/minix/dir.c b/fs/minix/dir.c index dd2a425b41f0..19052fc47e9e 100644 --- a/fs/minix/dir.c +++ b/fs/minix/dir.c @@ -45,7 +45,7 @@ static void dir_commit_chunk(struct folio *folio, loff_t pos, unsigned len) struct address_space *mapping = folio->mapping; struct inode *dir = mapping->host; - block_write_end(NULL, mapping, pos, len, len, folio, NULL); + block_write_end(pos, len, len, folio); if (pos+len > dir->i_size) { i_size_write(dir, pos+len); diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c index 9b7f8e9655a2..6ca3d74be1e1 100644 --- a/fs/nilfs2/dir.c +++ b/fs/nilfs2/dir.c @@ -96,7 +96,7 @@ static void nilfs_commit_chunk(struct folio *folio, int err; nr_dirty = nilfs_page_count_clean_buffers(folio, from, to); - copied = block_write_end(NULL, mapping, pos, len, len, folio, NULL); + copied = block_write_end(pos, len, len, folio); if (pos + copied > dir->i_size) i_size_write(dir, pos + copied); if (IS_DIRSYNC(dir)) diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c index 22aecf6e2344..a9c61d0492cb 100644 --- a/fs/nilfs2/recovery.c +++ b/fs/nilfs2/recovery.c @@ -560,8 +560,7 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs, if (unlikely(err)) goto failed_folio; - block_write_end(NULL, inode->i_mapping, pos, blocksize, - blocksize, folio, NULL); + block_write_end(pos, blocksize, blocksize, folio); folio_unlock(folio); folio_put(folio); diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c index 88d0062cfdb9..0388a1bae326 100644 --- a/fs/ufs/dir.c +++ b/fs/ufs/dir.c @@ -48,7 +48,7 @@ static void ufs_commit_chunk(struct folio *folio, loff_t pos, unsigned len) struct inode *dir = mapping->host; inode_inc_iversion(dir); - block_write_end(NULL, mapping, pos, len, len, folio, NULL); + block_write_end(pos, len, len, folio); if (pos+len > dir->i_size) { i_size_write(dir, pos+len); mark_inode_dirty(dir); diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 0029ff880e27..178eb90e9cf3 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -262,9 +262,7 @@ int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, get_block_t *get_block); int __block_write_begin(struct folio *folio, loff_t pos, unsigned len, get_block_t *get_block); -int block_write_end(struct file *, struct address_space *, - loff_t, unsigned len, unsigned copied, - struct folio *, void *); +int block_write_end(loff_t pos, unsigned len, unsigned copied, struct folio *); int generic_write_end(struct file *, struct address_space *, loff_t, unsigned len, unsigned copied, struct folio *, void *); From 04a2c4b4511d186b0fce685da21085a5d4acd370 Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Sun, 29 Jun 2025 03:40:21 -0400 Subject: [PATCH 13/24] fs: Prevent file descriptor table allocations exceeding INT_MAX When sysctl_nr_open is set to a very high value (for example, 1073741816 as set by systemd), processes attempting to use file descriptors near the limit can trigger massive memory allocation attempts that exceed INT_MAX, resulting in a WARNING in mm/slub.c: WARNING: CPU: 0 PID: 44 at mm/slub.c:5027 __kvmalloc_node_noprof+0x21a/0x288 This happens because kvmalloc_array() and kvmalloc() check if the requested size exceeds INT_MAX and emit a warning when the allocation is not flagged with __GFP_NOWARN. Specifically, when nr_open is set to 1073741816 (0x3ffffff8) and a process calls dup2(oldfd, 1073741880), the kernel attempts to allocate: - File descriptor array: 1073741880 * 8 bytes = 8,589,935,040 bytes - Multiple bitmaps: ~400MB - Total allocation size: > 8GB (exceeding INT_MAX = 2,147,483,647) Reproducer: 1. Set /proc/sys/fs/nr_open to 1073741816: # echo 1073741816 > /proc/sys/fs/nr_open 2. Run a program that uses a high file descriptor: #include #include int main() { struct rlimit rlim = {1073741824, 1073741824}; setrlimit(RLIMIT_NOFILE, &rlim); dup2(2, 1073741880); // Triggers the warning return 0; } 3. Observe WARNING in dmesg at mm/slub.c:5027 systemd commit a8b627a introduced automatic bumping of fs.nr_open to the maximum possible value. The rationale was that systems with memory control groups (memcg) no longer need separate file descriptor limits since memory is properly accounted. However, this change overlooked that: 1. The kernel's allocation functions still enforce INT_MAX as a maximum size regardless of memcg accounting 2. Programs and tests that legitimately test file descriptor limits can inadvertently trigger massive allocations 3. The resulting allocations (>8GB) are impractical and will always fail systemd's algorithm starts with INT_MAX and keeps halving the value until the kernel accepts it. On most systems, this results in nr_open being set to 1073741816 (0x3ffffff8), which is just under 1GB of file descriptors. While processes rarely use file descriptors near this limit in normal operation, certain selftests (like tools/testing/selftests/core/unshare_test.c) and programs that test file descriptor limits can trigger this issue. Fix this by adding a check in alloc_fdtable() to ensure the requested allocation size does not exceed INT_MAX. This causes the operation to fail with -EMFILE instead of triggering a kernel warning and avoids the impractical >8GB memory allocation request. Fixes: 9cfe015aa424 ("get rid of NR_OPEN and introduce a sysctl_nr_open") Cc: stable@vger.kernel.org Signed-off-by: Sasha Levin Link: https://lore.kernel.org/20250629074021.1038845-1-sashal@kernel.org Signed-off-by: Christian Brauner --- fs/file.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/fs/file.c b/fs/file.c index 3a3146664cf3..441238261372 100644 --- a/fs/file.c +++ b/fs/file.c @@ -197,6 +197,21 @@ static struct fdtable *alloc_fdtable(unsigned int slots_wanted) return ERR_PTR(-EMFILE); } + /* + * Check if the allocation size would exceed INT_MAX. kvmalloc_array() + * and kvmalloc() will warn if the allocation size is greater than + * INT_MAX, as filp_cachep objects are not __GFP_NOWARN. + * + * This can happen when sysctl_nr_open is set to a very high value and + * a process tries to use a file descriptor near that limit. For example, + * if sysctl_nr_open is set to 1073741816 (0x3ffffff8) - which is what + * systemd typically sets it to - then trying to use a file descriptor + * close to that value will require allocating a file descriptor table + * that exceeds 8GB in size. + */ + if (unlikely(nr > INT_MAX / sizeof(struct file *))) + return ERR_PTR(-EMFILE); + fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL_ACCOUNT); if (!fdt) goto out; From 77eb64439ad52d8afb57bb4dae24a2743c68f50d Mon Sep 17 00:00:00 2001 From: Pankaj Raghav Date: Thu, 26 Jun 2025 13:32:23 +0200 Subject: [PATCH 14/24] fs/buffer: remove the min and max limit checks in __getblk_slow() All filesystems will already check the max and min value of their block size during their initialization. __getblk_slow() is a very low-level function to have these checks. Remove them and only check for logical block size alignment. As this check with logical block size alignment might never trigger, add WARN_ON_ONCE() to the check. As WARN_ON_ONCE() will already print the stack, remove the call to dump_stack(). Suggested-by: Matthew Wilcox Reviewed-by: Jan Kara Signed-off-by: Pankaj Raghav Link: https://lore.kernel.org/20250626113223.181399-1-p.raghav@samsung.com Reviewed-by: Baokun Li Signed-off-by: Christian Brauner --- fs/buffer.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/fs/buffer.c b/fs/buffer.c index d61073143127..565fe88773c2 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1122,14 +1122,9 @@ __getblk_slow(struct block_device *bdev, sector_t block, { bool blocking = gfpflags_allow_blocking(gfp); - if (unlikely(size & (bdev_logical_block_size(bdev) - 1) || - (size < 512 || size > PAGE_SIZE))) { - printk(KERN_ERR "getblk(): invalid block size %d requested\n", - size); - printk(KERN_ERR "logical block size: %d\n", - bdev_logical_block_size(bdev)); - - dump_stack(); + if (WARN_ON_ONCE(!IS_ALIGNED(size, bdev_logical_block_size(bdev)))) { + printk(KERN_ERR "getblk(): block size %d not aligned to logical block size %d\n", + size, bdev_logical_block_size(bdev)); return NULL; } From 25050181b61aa0153f3a378a5472d134f8e4ef25 Mon Sep 17 00:00:00 2001 From: Pankaj Raghav Date: Mon, 30 Jun 2025 12:40:18 +0200 Subject: [PATCH 15/24] fs/libfs: don't assume blocksize <= PAGE_SIZE in generic_check_addressable Since [1], it is possible for filesystems to have blocksize > PAGE_SIZE of the system. Remove the assumption and make the check generic for all blocksizes in generic_check_addressable(). [1] https://lore.kernel.org/linux-xfs/20240822135018.1931258-1-kernel@pankajraghav.com/ Reviewed-by: Jan Kara Signed-off-by: Pankaj Raghav Link: https://lore.kernel.org/20250630104018.213985-1-p.raghav@samsung.com Reviewed-by: Zhang Yi Reviewed-by: Baokun Li Signed-off-by: Christian Brauner --- fs/libfs.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/fs/libfs.c b/fs/libfs.c index 4d1862f589e8..f99ecc300647 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -1584,13 +1584,17 @@ EXPORT_SYMBOL(generic_file_fsync); int generic_check_addressable(unsigned blocksize_bits, u64 num_blocks) { u64 last_fs_block = num_blocks - 1; - u64 last_fs_page = - last_fs_block >> (PAGE_SHIFT - blocksize_bits); + u64 last_fs_page, max_bytes; + + if (check_shl_overflow(num_blocks, blocksize_bits, &max_bytes)) + return -EFBIG; + + last_fs_page = (max_bytes >> PAGE_SHIFT) - 1; if (unlikely(num_blocks == 0)) return 0; - if ((blocksize_bits < 9) || (blocksize_bits > PAGE_SHIFT)) + if (blocksize_bits < 9) return -EINVAL; if ((last_fs_block > (sector_t)(~0ULL) >> (blocksize_bits - 9)) || From 3bc4e4410830d556b0f40dfa6671bfcaeacc1599 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 9 Jul 2025 11:06:36 +0200 Subject: [PATCH 16/24] vfs: Remove unnecessary list_for_each_entry_safe() from evict_inodes() evict_inodes() uses list_for_each_entry_safe() to iterate sb->s_inodes list. However, since we use i_lru list entry for our local temporary list of inodes to destroy, the inode is guaranteed to stay in sb->s_inodes list while we hold sb->s_inode_list_lock. So there is no real need for safe iteration variant and we can use list_for_each_entry() just fine. Signed-off-by: Jan Kara Link: https://lore.kernel.org/20250709090635.26319-2-jack@suse.cz Reviewed-by: Matthew Wilcox (Oracle) Signed-off-by: Christian Brauner --- fs/inode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/inode.c b/fs/inode.c index a0150e2ef22a..01ebdc40021e 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -865,12 +865,12 @@ static void dispose_list(struct list_head *head) */ void evict_inodes(struct super_block *sb) { - struct inode *inode, *next; + struct inode *inode; LIST_HEAD(dispose); again: spin_lock(&sb->s_inode_list_lock); - list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) { + list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { if (atomic_read(&inode->i_count)) continue; From f2e467a48287c868818085aa35389a224d226732 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Fri, 11 Jul 2025 18:33:36 +0200 Subject: [PATCH 17/24] eventpoll: Fix semi-unbounded recursion Ensure that epoll instances can never form a graph deeper than EP_MAX_NESTS+1 links. Currently, ep_loop_check_proc() ensures that the graph is loop-free and does some recursion depth checks, but those recursion depth checks don't limit the depth of the resulting tree for two reasons: - They don't look upwards in the tree. - If there are multiple downwards paths of different lengths, only one of the paths is actually considered for the depth check since commit 28d82dc1c4ed ("epoll: limit paths"). Essentially, the current recursion depth check in ep_loop_check_proc() just serves to prevent it from recursing too deeply while checking for loops. A more thorough check is done in reverse_path_check() after the new graph edge has already been created; this checks, among other things, that no paths going upwards from any non-epoll file with a length of more than 5 edges exist. However, this check does not apply to non-epoll files. As a result, it is possible to recurse to a depth of at least roughly 500, tested on v6.15. (I am unsure if deeper recursion is possible; and this may have changed with commit 8c44dac8add7 ("eventpoll: Fix priority inversion problem").) To fix it: 1. In ep_loop_check_proc(), note the subtree depth of each visited node, and use subtree depths for the total depth calculation even when a subtree has already been visited. 2. Add ep_get_upwards_depth_proc() for similarly determining the maximum depth of an upwards walk. 3. In ep_loop_check(), use these values to limit the total path length between epoll nodes to EP_MAX_NESTS edges. Fixes: 22bacca48a17 ("epoll: prevent creating circular epoll structures") Cc: stable@vger.kernel.org Signed-off-by: Jann Horn Link: https://lore.kernel.org/20250711-epoll-recursion-fix-v1-1-fb2457c33292@google.com Signed-off-by: Christian Brauner --- fs/eventpoll.c | 60 ++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 46 insertions(+), 14 deletions(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index d4dbffdedd08..44648cc09250 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -218,6 +218,7 @@ struct eventpoll { /* used to optimize loop detection check */ u64 gen; struct hlist_head refs; + u8 loop_check_depth; /* * usage count, used together with epitem->dying to @@ -2142,23 +2143,24 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, } /** - * ep_loop_check_proc - verify that adding an epoll file inside another - * epoll structure does not violate the constraints, in - * terms of closed loops, or too deep chains (which can - * result in excessive stack usage). + * ep_loop_check_proc - verify that adding an epoll file @ep inside another + * epoll file does not create closed loops, and + * determine the depth of the subtree starting at @ep * * @ep: the &struct eventpoll to be currently checked. * @depth: Current depth of the path being checked. * - * Return: %zero if adding the epoll @file inside current epoll - * structure @ep does not violate the constraints, or %-1 otherwise. + * Return: depth of the subtree, or INT_MAX if we found a loop or went too deep. */ static int ep_loop_check_proc(struct eventpoll *ep, int depth) { - int error = 0; + int result = 0; struct rb_node *rbp; struct epitem *epi; + if (ep->gen == loop_check_gen) + return ep->loop_check_depth; + mutex_lock_nested(&ep->mtx, depth + 1); ep->gen = loop_check_gen; for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) { @@ -2166,13 +2168,11 @@ static int ep_loop_check_proc(struct eventpoll *ep, int depth) if (unlikely(is_file_epoll(epi->ffd.file))) { struct eventpoll *ep_tovisit; ep_tovisit = epi->ffd.file->private_data; - if (ep_tovisit->gen == loop_check_gen) - continue; if (ep_tovisit == inserting_into || depth > EP_MAX_NESTS) - error = -1; + result = INT_MAX; else - error = ep_loop_check_proc(ep_tovisit, depth + 1); - if (error != 0) + result = max(result, ep_loop_check_proc(ep_tovisit, depth + 1) + 1); + if (result > EP_MAX_NESTS) break; } else { /* @@ -2186,9 +2186,27 @@ static int ep_loop_check_proc(struct eventpoll *ep, int depth) list_file(epi->ffd.file); } } + ep->loop_check_depth = result; mutex_unlock(&ep->mtx); - return error; + return result; +} + +/** + * ep_get_upwards_depth_proc - determine depth of @ep when traversed upwards + */ +static int ep_get_upwards_depth_proc(struct eventpoll *ep, int depth) +{ + int result = 0; + struct epitem *epi; + + if (ep->gen == loop_check_gen) + return ep->loop_check_depth; + hlist_for_each_entry_rcu(epi, &ep->refs, fllink) + result = max(result, ep_get_upwards_depth_proc(epi->ep, depth + 1) + 1); + ep->gen = loop_check_gen; + ep->loop_check_depth = result; + return result; } /** @@ -2204,8 +2222,22 @@ static int ep_loop_check_proc(struct eventpoll *ep, int depth) */ static int ep_loop_check(struct eventpoll *ep, struct eventpoll *to) { + int depth, upwards_depth; + inserting_into = ep; - return ep_loop_check_proc(to, 0); + /* + * Check how deep down we can get from @to, and whether it is possible + * to loop up to @ep. + */ + depth = ep_loop_check_proc(to, 0); + if (depth > EP_MAX_NESTS) + return -1; + /* Check how far up we can go from @ep. */ + rcu_read_lock(); + upwards_depth = ep_get_upwards_depth_proc(ep, 0); + rcu_read_unlock(); + + return (depth+1+upwards_depth > EP_MAX_NESTS) ? -1 : 0; } static void clear_tfile_check_list(void) From e7b840fd4956da86e93a8258155cf3e127f509e7 Mon Sep 17 00:00:00 2001 From: Taotao Chen Date: Wed, 16 Jul 2025 09:36:03 +0000 Subject: [PATCH 18/24] drm/i915: Use kernel_write() in shmem object create Replace the write_begin/write_end loop in i915_gem_object_create_shmem_from_data() with call to kernel_write(). This function initializes shmem-backed GEM objects. kernel_write() simplifies the code by removing manual folio handling. Part of a series refactoring address_space_operations write_begin and write_end callbacks to use struct kiocb for passing write context and flags. Signed-off-by: Taotao Chen Link: https://lore.kernel.org/20250716093559.217344-2-chentaotao@didiglobal.com Signed-off-by: Christian Brauner --- drivers/gpu/drm/i915/gem/i915_gem_shmem.c | 33 +++++++---------------- 1 file changed, 9 insertions(+), 24 deletions(-) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_shmem.c b/drivers/gpu/drm/i915/gem/i915_gem_shmem.c index 19a3eb82dc6a..1e8f66ac48ca 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_shmem.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_shmem.c @@ -637,9 +637,8 @@ i915_gem_object_create_shmem_from_data(struct drm_i915_private *i915, { struct drm_i915_gem_object *obj; struct file *file; - const struct address_space_operations *aops; - loff_t pos; - int err; + loff_t pos = 0; + ssize_t err; GEM_WARN_ON(IS_DGFX(i915)); obj = i915_gem_object_create_shmem(i915, round_up(size, PAGE_SIZE)); @@ -649,29 +648,15 @@ i915_gem_object_create_shmem_from_data(struct drm_i915_private *i915, GEM_BUG_ON(obj->write_domain != I915_GEM_DOMAIN_CPU); file = obj->base.filp; - aops = file->f_mapping->a_ops; - pos = 0; - do { - unsigned int len = min_t(typeof(size), size, PAGE_SIZE); - struct folio *folio; - void *fsdata; + err = kernel_write(file, data, size, &pos); - err = aops->write_begin(file, file->f_mapping, pos, len, - &folio, &fsdata); - if (err < 0) - goto fail; + if (err < 0) + goto fail; - memcpy_to_folio(folio, offset_in_folio(folio, pos), data, len); - - err = aops->write_end(file, file->f_mapping, pos, len, len, - folio, fsdata); - if (err < 0) - goto fail; - - size -= len; - data += len; - pos += len; - } while (size); + if (err != size) { + err = -EIO; + goto fail; + } return obj; From 048832a3f4003113c3a0f060b08376c103622099 Mon Sep 17 00:00:00 2001 From: Taotao Chen Date: Wed, 16 Jul 2025 09:36:04 +0000 Subject: [PATCH 19/24] drm/i915: Refactor shmem_pwrite() to use kiocb and write_iter Refactors shmem_pwrite() to replace the ->write_begin/end logic with a write_iter-based implementation using kiocb and iov_iter. While kernel_write() was considered, it caused about 50% performance regression. vfs_write() is not exported for kernel use. Therefore, file->f_op->write_iter() is called directly with a synchronously initialized kiocb to preserve performance and remove write_begin usage. Performance results use gem_pwrite on Intel CPU i7-10700 (average of 10 runs): - ./gem_pwrite --run-subtest bench -s 16384 Before: 0.205s, After: 0.214s - ./gem_pwrite --run-subtest bench -s 524288 Before: 6.1021s, After: 4.8047s Part of a series refactoring address_space_operations write_begin and write_end callbacks to use struct kiocb for passing write context and flags. Signed-off-by: Taotao Chen Link: https://lore.kernel.org/20250716093559.217344-3-chentaotao@didiglobal.com Signed-off-by: Christian Brauner --- drivers/gpu/drm/i915/gem/i915_gem_shmem.c | 76 ++++++----------------- 1 file changed, 19 insertions(+), 57 deletions(-) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_shmem.c b/drivers/gpu/drm/i915/gem/i915_gem_shmem.c index 1e8f66ac48ca..9cbb0f68a5bb 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_shmem.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_shmem.c @@ -6,6 +6,7 @@ #include #include #include +#include #include @@ -400,12 +401,12 @@ static int shmem_pwrite(struct drm_i915_gem_object *obj, const struct drm_i915_gem_pwrite *arg) { - struct address_space *mapping = obj->base.filp->f_mapping; - const struct address_space_operations *aops = mapping->a_ops; char __user *user_data = u64_to_user_ptr(arg->data_ptr); - u64 remain; - loff_t pos; - unsigned int pg; + struct file *file = obj->base.filp; + struct kiocb kiocb; + struct iov_iter iter; + ssize_t written; + u64 size = arg->size; /* Caller already validated user args */ GEM_BUG_ON(!access_ok(user_data, arg->size)); @@ -428,63 +429,24 @@ shmem_pwrite(struct drm_i915_gem_object *obj, if (obj->mm.madv != I915_MADV_WILLNEED) return -EFAULT; - /* - * Before the pages are instantiated the object is treated as being - * in the CPU domain. The pages will be clflushed as required before - * use, and we can freely write into the pages directly. If userspace - * races pwrite with any other operation; corruption will ensue - - * that is userspace's prerogative! - */ + if (size > MAX_RW_COUNT) + return -EFBIG; - remain = arg->size; - pos = arg->offset; - pg = offset_in_page(pos); + if (!file->f_op->write_iter) + return -EINVAL; - do { - unsigned int len, unwritten; - struct folio *folio; - void *data, *vaddr; - int err; - char __maybe_unused c; + init_sync_kiocb(&kiocb, file); + kiocb.ki_pos = arg->offset; + iov_iter_ubuf(&iter, ITER_SOURCE, (void __user *)user_data, size); - len = PAGE_SIZE - pg; - if (len > remain) - len = remain; + written = file->f_op->write_iter(&kiocb, &iter); + BUG_ON(written == -EIOCBQUEUED); - /* Prefault the user page to reduce potential recursion */ - err = __get_user(c, user_data); - if (err) - return err; + if (written != size) + return -EIO; - err = __get_user(c, user_data + len - 1); - if (err) - return err; - - err = aops->write_begin(obj->base.filp, mapping, pos, len, - &folio, &data); - if (err < 0) - return err; - - vaddr = kmap_local_folio(folio, offset_in_folio(folio, pos)); - pagefault_disable(); - unwritten = __copy_from_user_inatomic(vaddr, user_data, len); - pagefault_enable(); - kunmap_local(vaddr); - - err = aops->write_end(obj->base.filp, mapping, pos, len, - len - unwritten, folio, data); - if (err < 0) - return err; - - /* We don't handle -EFAULT, leave it to the caller to check */ - if (unwritten) - return -ENODEV; - - remain -= len; - user_data += len; - pos += len; - pg = 0; - } while (remain); + if (written < 0) + return written; return 0; } From e9d8e2bf23206825ca9b4d3caf587945ba807939 Mon Sep 17 00:00:00 2001 From: Taotao Chen Date: Wed, 16 Jul 2025 09:36:06 +0000 Subject: [PATCH 20/24] fs: change write_begin/write_end interface to take struct kiocb * Change the address_space_operations callbacks write_begin() and write_end() to take struct kiocb * as the first argument instead of struct file *. Update all affected function prototypes, implementations, call sites, and related documentation across VFS, filesystems, and block layer. Part of a series refactoring address_space_operations write_begin and write_end callbacks to use struct kiocb for passing write context and flags. Signed-off-by: Taotao Chen Link: https://lore.kernel.org/20250716093559.217344-4-chentaotao@didiglobal.com Signed-off-by: Christian Brauner --- Documentation/filesystems/locking.rst | 4 ++-- Documentation/filesystems/vfs.rst | 6 +++--- block/fops.c | 13 ++++++++----- fs/adfs/inode.c | 9 +++++---- fs/affs/file.c | 26 ++++++++++++++----------- fs/bcachefs/fs-io-buffered.c | 4 ++-- fs/bcachefs/fs-io-buffered.h | 4 ++-- fs/bfs/file.c | 7 ++++--- fs/buffer.c | 26 ++++++++++++------------- fs/ceph/addr.c | 10 +++++++--- fs/ecryptfs/mmap.c | 10 +++++----- fs/exfat/file.c | 11 +++++------ fs/exfat/inode.c | 16 ++++++++------- fs/ext2/inode.c | 11 ++++++----- fs/ext4/inode.c | 18 +++++++++-------- fs/f2fs/data.c | 8 +++++--- fs/fat/inode.c | 18 +++++++++-------- fs/fuse/file.c | 14 +++++++++----- fs/hfs/hfs_fs.h | 2 +- fs/hfs/inode.c | 4 ++-- fs/hfsplus/hfsplus_fs.h | 6 ++++-- fs/hfsplus/inode.c | 8 +++++--- fs/hostfs/hostfs_kern.c | 8 +++++--- fs/hpfs/file.c | 18 +++++++++-------- fs/hugetlbfs/inode.c | 9 +++++---- fs/jffs2/file.c | 28 +++++++++++++++------------ fs/jfs/inode.c | 16 ++++++++------- fs/libfs.c | 11 ++++++----- fs/minix/inode.c | 7 ++++--- fs/nfs/file.c | 8 ++++++-- fs/nilfs2/inode.c | 8 +++++--- fs/ntfs3/file.c | 4 ++-- fs/ntfs3/inode.c | 7 ++++--- fs/ntfs3/ntfs_fs.h | 10 ++++++---- fs/ocfs2/aops.c | 6 ++++-- fs/omfs/file.c | 7 ++++--- fs/orangefs/inode.c | 16 ++++++++------- fs/ubifs/file.c | 8 +++++--- fs/udf/inode.c | 11 +++++++---- fs/ufs/inode.c | 16 ++++++++------- fs/vboxsf/file.c | 5 +++-- include/linux/buffer_head.h | 4 ++-- include/linux/fs.h | 11 ++++++----- mm/filemap.c | 4 ++-- mm/shmem.c | 12 ++++++------ 45 files changed, 267 insertions(+), 202 deletions(-) diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst index 2e567e341c3b..580581281ed7 100644 --- a/Documentation/filesystems/locking.rst +++ b/Documentation/filesystems/locking.rst @@ -253,10 +253,10 @@ prototypes:: int (*writepages)(struct address_space *, struct writeback_control *); bool (*dirty_folio)(struct address_space *, struct folio *folio); void (*readahead)(struct readahead_control *); - int (*write_begin)(struct file *, struct address_space *mapping, + int (*write_begin)(const struct kiocb *, struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata); - int (*write_end)(struct file *, struct address_space *mapping, + int (*write_end)(const struct kiocb *, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct folio *folio, void *fsdata); sector_t (*bmap)(struct address_space *, sector_t); diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst index dd9da7e04a99..57604b07bdc9 100644 --- a/Documentation/filesystems/vfs.rst +++ b/Documentation/filesystems/vfs.rst @@ -823,10 +823,10 @@ cache in your filesystem. The following members are defined: int (*writepages)(struct address_space *, struct writeback_control *); bool (*dirty_folio)(struct address_space *, struct folio *); void (*readahead)(struct readahead_control *); - int (*write_begin)(struct file *, struct address_space *mapping, + int (*write_begin)(const struct kiocb *, struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata); - int (*write_end)(struct file *, struct address_space *mapping, + struct page **pagep, void **fsdata); + int (*write_end)(const struct kiocb *, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct folio *folio, void *fsdata); sector_t (*bmap)(struct address_space *, sector_t); diff --git a/block/fops.c b/block/fops.c index 35cea0cb304d..f34e7315c83c 100644 --- a/block/fops.c +++ b/block/fops.c @@ -496,15 +496,18 @@ static void blkdev_readahead(struct readahead_control *rac) mpage_readahead(rac, blkdev_get_block); } -static int blkdev_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, struct folio **foliop, void **fsdata) +static int blkdev_write_begin(const struct kiocb *iocb, + struct address_space *mapping, loff_t pos, + unsigned len, struct folio **foliop, + void **fsdata) { return block_write_begin(mapping, pos, len, foliop, blkdev_get_block); } -static int blkdev_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, struct folio *folio, - void *fsdata) +static int blkdev_write_end(const struct kiocb *iocb, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct folio *folio, void *fsdata) { int ret; ret = block_write_end(pos, len, copied, folio); diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c index 21527189e430..6830f8bc8d4e 100644 --- a/fs/adfs/inode.c +++ b/fs/adfs/inode.c @@ -53,13 +53,14 @@ static void adfs_write_failed(struct address_space *mapping, loff_t to) truncate_pagecache(inode, inode->i_size); } -static int adfs_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, - struct folio **foliop, void **fsdata) +static int adfs_write_begin(const struct kiocb *iocb, + struct address_space *mapping, + loff_t pos, unsigned len, + struct folio **foliop, void **fsdata) { int ret; - ret = cont_write_begin(file, mapping, pos, len, foliop, fsdata, + ret = cont_write_begin(iocb, mapping, pos, len, foliop, fsdata, adfs_get_block, &ADFS_I(mapping->host)->mmu_private); if (unlikely(ret)) diff --git a/fs/affs/file.c b/fs/affs/file.c index 7a71018e3f67..219ea0353906 100644 --- a/fs/affs/file.c +++ b/fs/affs/file.c @@ -415,13 +415,14 @@ affs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) return ret; } -static int affs_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, - struct folio **foliop, void **fsdata) +static int affs_write_begin(const struct kiocb *iocb, + struct address_space *mapping, + loff_t pos, unsigned len, + struct folio **foliop, void **fsdata) { int ret; - ret = cont_write_begin(file, mapping, pos, len, foliop, fsdata, + ret = cont_write_begin(iocb, mapping, pos, len, foliop, fsdata, affs_get_block, &AFFS_I(mapping->host)->mmu_private); if (unlikely(ret)) @@ -430,14 +431,15 @@ static int affs_write_begin(struct file *file, struct address_space *mapping, return ret; } -static int affs_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned int len, unsigned int copied, +static int affs_write_end(const struct kiocb *iocb, + struct address_space *mapping, loff_t pos, + unsigned int len, unsigned int copied, struct folio *folio, void *fsdata) { struct inode *inode = mapping->host; int ret; - ret = generic_write_end(file, mapping, pos, len, copied, folio, fsdata); + ret = generic_write_end(iocb, mapping, pos, len, copied, folio, fsdata); /* Clear Archived bit on file writes, as AmigaOS would do */ if (AFFS_I(inode)->i_protect & FIBF_ARCHIVED) { @@ -645,7 +647,8 @@ static int affs_read_folio_ofs(struct file *file, struct folio *folio) return err; } -static int affs_write_begin_ofs(struct file *file, struct address_space *mapping, +static int affs_write_begin_ofs(const struct kiocb *iocb, + struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { @@ -684,9 +687,10 @@ static int affs_write_begin_ofs(struct file *file, struct address_space *mapping return err; } -static int affs_write_end_ofs(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct folio *folio, void *fsdata) +static int affs_write_end_ofs(const struct kiocb *iocb, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct folio *folio, void *fsdata) { struct inode *inode = mapping->host; struct super_block *sb = inode->i_sb; diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c index 66bacdd49f78..1c54b9b5bd69 100644 --- a/fs/bcachefs/fs-io-buffered.c +++ b/fs/bcachefs/fs-io-buffered.c @@ -674,7 +674,7 @@ int bch2_writepages(struct address_space *mapping, struct writeback_control *wbc /* buffered writes: */ -int bch2_write_begin(struct file *file, struct address_space *mapping, +int bch2_write_begin(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { @@ -757,7 +757,7 @@ int bch2_write_begin(struct file *file, struct address_space *mapping, return bch2_err_class(ret); } -int bch2_write_end(struct file *file, struct address_space *mapping, +int bch2_write_end(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct folio *folio, void *fsdata) { diff --git a/fs/bcachefs/fs-io-buffered.h b/fs/bcachefs/fs-io-buffered.h index 3207ebbb4ab4..14de91c27656 100644 --- a/fs/bcachefs/fs-io-buffered.h +++ b/fs/bcachefs/fs-io-buffered.h @@ -10,9 +10,9 @@ int bch2_read_folio(struct file *, struct folio *); int bch2_writepages(struct address_space *, struct writeback_control *); void bch2_readahead(struct readahead_control *); -int bch2_write_begin(struct file *, struct address_space *, loff_t pos, +int bch2_write_begin(const struct kiocb *, struct address_space *, loff_t pos, unsigned len, struct folio **, void **); -int bch2_write_end(struct file *, struct address_space *, loff_t, +int bch2_write_end(const struct kiocb *, struct address_space *, loff_t, unsigned len, unsigned copied, struct folio *, void *); ssize_t bch2_write_iter(struct kiocb *, struct iov_iter *); diff --git a/fs/bfs/file.c b/fs/bfs/file.c index fa66a09e496a..10dc0151ea55 100644 --- a/fs/bfs/file.c +++ b/fs/bfs/file.c @@ -170,9 +170,10 @@ static void bfs_write_failed(struct address_space *mapping, loff_t to) truncate_pagecache(inode, inode->i_size); } -static int bfs_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, - struct folio **foliop, void **fsdata) +static int bfs_write_begin(const struct kiocb *iocb, + struct address_space *mapping, + loff_t pos, unsigned len, + struct folio **foliop, void **fsdata) { int ret; diff --git a/fs/buffer.c b/fs/buffer.c index 565fe88773c2..ead4dc85debd 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2297,9 +2297,9 @@ int block_write_end(loff_t pos, unsigned len, unsigned copied, } EXPORT_SYMBOL(block_write_end); -int generic_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct folio *folio, void *fsdata) +int generic_write_end(const struct kiocb *iocb, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct folio *folio, void *fsdata) { struct inode *inode = mapping->host; loff_t old_size = inode->i_size; @@ -2494,7 +2494,8 @@ int generic_cont_expand_simple(struct inode *inode, loff_t size) } EXPORT_SYMBOL(generic_cont_expand_simple); -static int cont_expand_zero(struct file *file, struct address_space *mapping, +static int cont_expand_zero(const struct kiocb *iocb, + struct address_space *mapping, loff_t pos, loff_t *bytes) { struct inode *inode = mapping->host; @@ -2518,12 +2519,12 @@ static int cont_expand_zero(struct file *file, struct address_space *mapping, } len = PAGE_SIZE - zerofrom; - err = aops->write_begin(file, mapping, curpos, len, + err = aops->write_begin(iocb, mapping, curpos, len, &folio, &fsdata); if (err) goto out; folio_zero_range(folio, offset_in_folio(folio, curpos), len); - err = aops->write_end(file, mapping, curpos, len, len, + err = aops->write_end(iocb, mapping, curpos, len, len, folio, fsdata); if (err < 0) goto out; @@ -2551,12 +2552,12 @@ static int cont_expand_zero(struct file *file, struct address_space *mapping, } len = offset - zerofrom; - err = aops->write_begin(file, mapping, curpos, len, + err = aops->write_begin(iocb, mapping, curpos, len, &folio, &fsdata); if (err) goto out; folio_zero_range(folio, offset_in_folio(folio, curpos), len); - err = aops->write_end(file, mapping, curpos, len, len, + err = aops->write_end(iocb, mapping, curpos, len, len, folio, fsdata); if (err < 0) goto out; @@ -2571,17 +2572,16 @@ static int cont_expand_zero(struct file *file, struct address_space *mapping, * For moronic filesystems that do not allow holes in file. * We may have to extend the file. */ -int cont_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, - struct folio **foliop, void **fsdata, - get_block_t *get_block, loff_t *bytes) +int cont_write_begin(const struct kiocb *iocb, struct address_space *mapping, + loff_t pos, unsigned len, struct folio **foliop, + void **fsdata, get_block_t *get_block, loff_t *bytes) { struct inode *inode = mapping->host; unsigned int blocksize = i_blocksize(inode); unsigned int zerofrom; int err; - err = cont_expand_zero(file, mapping, pos, bytes); + err = cont_expand_zero(iocb, mapping, pos, bytes); if (err) return err; diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 60a621b00c65..02468c848cce 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1864,10 +1864,12 @@ static int ceph_netfs_check_write_begin(struct file *file, loff_t pos, unsigned * We are only allowed to write into/dirty the page if the page is * clean, or already dirty within the same snap context. */ -static int ceph_write_begin(struct file *file, struct address_space *mapping, +static int ceph_write_begin(const struct kiocb *iocb, + struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { + struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); int r; @@ -1885,10 +1887,12 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping, * we don't do anything in here that simple_write_end doesn't do * except adjust dirty page accounting */ -static int ceph_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, +static int ceph_write_end(const struct kiocb *iocb, + struct address_space *mapping, loff_t pos, + unsigned len, unsigned copied, struct folio *folio, void *fsdata) { + struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); struct ceph_client *cl = ceph_inode_to_client(inode); bool check_cap = false; diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c index 60f0ac8744b5..2c2b12fedeae 100644 --- a/fs/ecryptfs/mmap.c +++ b/fs/ecryptfs/mmap.c @@ -228,7 +228,7 @@ static int fill_zeros_to_end_of_page(struct folio *folio, unsigned int to) /** * ecryptfs_write_begin - * @file: The eCryptfs file + * @iocb: I/O control block for the eCryptfs file * @mapping: The eCryptfs object * @pos: The file offset at which to start writing * @len: Length of the write @@ -239,7 +239,7 @@ static int fill_zeros_to_end_of_page(struct folio *folio, unsigned int to) * * Returns zero on success; non-zero otherwise */ -static int ecryptfs_write_begin(struct file *file, +static int ecryptfs_write_begin(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata) @@ -322,7 +322,7 @@ static int ecryptfs_write_begin(struct file *file, * Note, this will increase i_size. */ if (index != 0) { if (prev_page_end_size > i_size_read(mapping->host)) { - rc = ecryptfs_truncate(file->f_path.dentry, + rc = ecryptfs_truncate(iocb->ki_filp->f_path.dentry, prev_page_end_size); if (rc) { printk(KERN_ERR "%s: Error on attempt to " @@ -429,7 +429,7 @@ int ecryptfs_write_inode_size_to_metadata(struct inode *ecryptfs_inode) /** * ecryptfs_write_end - * @file: The eCryptfs file object + * @iocb: I/O control block for the eCryptfs file * @mapping: The eCryptfs object * @pos: The file position * @len: The length of the data (unused) @@ -437,7 +437,7 @@ int ecryptfs_write_inode_size_to_metadata(struct inode *ecryptfs_inode) * @folio: The eCryptfs folio * @fsdata: The fsdata (unused) */ -static int ecryptfs_write_end(struct file *file, +static int ecryptfs_write_end(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct folio *folio, void *fsdata) diff --git a/fs/exfat/file.c b/fs/exfat/file.c index 841a5b18e3df..70f53edd0a10 100644 --- a/fs/exfat/file.c +++ b/fs/exfat/file.c @@ -532,11 +532,10 @@ int exfat_file_fsync(struct file *filp, loff_t start, loff_t end, int datasync) return blkdev_issue_flush(inode->i_sb->s_bdev); } -static int exfat_extend_valid_size(struct file *file, loff_t new_valid_size) +static int exfat_extend_valid_size(struct inode *inode, loff_t new_valid_size) { int err; loff_t pos; - struct inode *inode = file_inode(file); struct exfat_inode_info *ei = EXFAT_I(inode); struct address_space *mapping = inode->i_mapping; const struct address_space_operations *ops = mapping->a_ops; @@ -551,14 +550,14 @@ static int exfat_extend_valid_size(struct file *file, loff_t new_valid_size) if (pos + len > new_valid_size) len = new_valid_size - pos; - err = ops->write_begin(file, mapping, pos, len, &folio, NULL); + err = ops->write_begin(NULL, mapping, pos, len, &folio, NULL); if (err) goto out; off = offset_in_folio(folio, pos); folio_zero_new_buffers(folio, off, off + len); - err = ops->write_end(file, mapping, pos, len, len, folio, NULL); + err = ops->write_end(NULL, mapping, pos, len, len, folio, NULL); if (err < 0) goto out; pos += len; @@ -604,7 +603,7 @@ static ssize_t exfat_file_write_iter(struct kiocb *iocb, struct iov_iter *iter) } if (pos > valid_size) { - ret = exfat_extend_valid_size(file, pos); + ret = exfat_extend_valid_size(inode, pos); if (ret < 0 && ret != -ENOSPC) { exfat_err(inode->i_sb, "write: fail to zero from %llu to %llu(%zd)", @@ -665,7 +664,7 @@ static vm_fault_t exfat_page_mkwrite(struct vm_fault *vmf) start + vma->vm_end - vma->vm_start); if (ei->valid_size < end) { - err = exfat_extend_valid_size(file, end); + err = exfat_extend_valid_size(inode, end); if (err < 0) { inode_unlock(inode); return vmf_fs_error(err); diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c index b22c02d6000f..c10844e1e16c 100644 --- a/fs/exfat/inode.c +++ b/fs/exfat/inode.c @@ -446,9 +446,10 @@ static void exfat_write_failed(struct address_space *mapping, loff_t to) } } -static int exfat_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned int len, - struct folio **foliop, void **fsdata) +static int exfat_write_begin(const struct kiocb *iocb, + struct address_space *mapping, + loff_t pos, unsigned int len, + struct folio **foliop, void **fsdata) { int ret; @@ -463,15 +464,16 @@ static int exfat_write_begin(struct file *file, struct address_space *mapping, return ret; } -static int exfat_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned int len, unsigned int copied, - struct folio *folio, void *fsdata) +static int exfat_write_end(const struct kiocb *iocb, + struct address_space *mapping, + loff_t pos, unsigned int len, unsigned int copied, + struct folio *folio, void *fsdata) { struct inode *inode = mapping->host; struct exfat_inode_info *ei = EXFAT_I(inode); int err; - err = generic_write_end(file, mapping, pos, len, copied, folio, fsdata); + err = generic_write_end(iocb, mapping, pos, len, copied, folio, fsdata); if (err < len) exfat_write_failed(mapping, pos+len); diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 30f8201c155f..d35ca26eee3c 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -915,7 +915,7 @@ static void ext2_readahead(struct readahead_control *rac) } static int -ext2_write_begin(struct file *file, struct address_space *mapping, +ext2_write_begin(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { int ret; @@ -926,13 +926,14 @@ ext2_write_begin(struct file *file, struct address_space *mapping, return ret; } -static int ext2_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct folio *folio, void *fsdata) +static int ext2_write_end(const struct kiocb *iocb, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct folio *folio, void *fsdata) { int ret; - ret = generic_write_end(file, mapping, pos, len, copied, folio, fsdata); + ret = generic_write_end(iocb, mapping, pos, len, copied, folio, fsdata); if (ret < len) ext2_write_failed(mapping, pos + len); return ret; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index e6aa7ca6d842..9a16efd072bb 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1252,7 +1252,8 @@ int ext4_block_write_begin(handle_t *handle, struct folio *folio, * and the ext4_write_end(). So doing the jbd2_journal_start at the start of * ext4_write_begin() is the right place. */ -static int ext4_write_begin(struct file *file, struct address_space *mapping, +static int ext4_write_begin(const struct kiocb *iocb, + struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { @@ -1400,12 +1401,12 @@ static int write_end_fn(handle_t *handle, struct inode *inode, /* * We need to pick up the new inode size which generic_commit_write gave us - * `file' can be NULL - eg, when called from page_symlink(). + * `iocb` can be NULL - eg, when called from page_symlink(). * * ext4 never places buffers on inode->i_mapping->i_private_list. metadata * buffers are managed internally. */ -static int ext4_write_end(struct file *file, +static int ext4_write_end(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct folio *folio, void *fsdata) @@ -1510,7 +1511,7 @@ static void ext4_journalled_zero_new_buffers(handle_t *handle, } while (bh != head); } -static int ext4_journalled_write_end(struct file *file, +static int ext4_journalled_write_end(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct folio *folio, void *fsdata) @@ -3036,7 +3037,8 @@ static int ext4_nonda_switch(struct super_block *sb) return 0; } -static int ext4_da_write_begin(struct file *file, struct address_space *mapping, +static int ext4_da_write_begin(const struct kiocb *iocb, + struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { @@ -3054,7 +3056,7 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, if (ext4_nonda_switch(inode->i_sb) || ext4_verity_in_progress(inode)) { *fsdata = (void *)FALL_BACK_TO_NONDELALLOC; - return ext4_write_begin(file, mapping, pos, + return ext4_write_begin(iocb, mapping, pos, len, foliop, fsdata); } *fsdata = (void *)0; @@ -3195,7 +3197,7 @@ static int ext4_da_do_write_end(struct address_space *mapping, return copied; } -static int ext4_da_write_end(struct file *file, +static int ext4_da_write_end(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct folio *folio, void *fsdata) @@ -3204,7 +3206,7 @@ static int ext4_da_write_end(struct file *file, int write_mode = (int)(unsigned long)fsdata; if (write_mode == FALL_BACK_TO_NONDELALLOC) - return ext4_write_end(file, mapping, pos, + return ext4_write_end(iocb, mapping, pos, len, copied, folio, fsdata); trace_ext4_da_write_end(inode, pos, len, copied); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 31e892842625..711ad80b38d0 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -3519,8 +3519,10 @@ static int prepare_atomic_write_begin(struct f2fs_sb_info *sbi, return 0; } -static int f2fs_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, struct folio **foliop, void **fsdata) +static int f2fs_write_begin(const struct kiocb *iocb, + struct address_space *mapping, + loff_t pos, unsigned len, struct folio **foliop, + void **fsdata) { struct inode *inode = mapping->host; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -3656,7 +3658,7 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, return err; } -static int f2fs_write_end(struct file *file, +static int f2fs_write_end(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct folio *folio, void *fsdata) diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 3852bb66358c..9648ed097816 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -219,13 +219,14 @@ static void fat_write_failed(struct address_space *mapping, loff_t to) } } -static int fat_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, - struct folio **foliop, void **fsdata) +static int fat_write_begin(const struct kiocb *iocb, + struct address_space *mapping, + loff_t pos, unsigned len, + struct folio **foliop, void **fsdata) { int err; - err = cont_write_begin(file, mapping, pos, len, + err = cont_write_begin(iocb, mapping, pos, len, foliop, fsdata, fat_get_block, &MSDOS_I(mapping->host)->mmu_private); if (err < 0) @@ -233,13 +234,14 @@ static int fat_write_begin(struct file *file, struct address_space *mapping, return err; } -static int fat_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct folio *folio, void *fsdata) +static int fat_write_end(const struct kiocb *iocb, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct folio *folio, void *fsdata) { struct inode *inode = mapping->host; int err; - err = generic_write_end(file, mapping, pos, len, copied, folio, fsdata); + err = generic_write_end(iocb, mapping, pos, len, copied, folio, fsdata); if (err < len) fat_write_failed(mapping, pos + len); if (!(err < 0) && !(MSDOS_I(inode)->i_attrs & ATTR_ARCH)) { diff --git a/fs/fuse/file.c b/fs/fuse/file.c index f102afc03359..21c6f8654bfe 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -2213,10 +2213,13 @@ static int fuse_writepages(struct address_space *mapping, * It's worthy to make sure that space is reserved on disk for the write, * but how to implement it without killing performance need more thinking. */ -static int fuse_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, struct folio **foliop, void **fsdata) +static int fuse_write_begin(const struct kiocb *iocb, + struct address_space *mapping, + loff_t pos, unsigned len, struct folio **foliop, + void **fsdata) { pgoff_t index = pos >> PAGE_SHIFT; + struct file *file = iocb->ki_filp; struct fuse_conn *fc = get_fuse_conn(file_inode(file)); struct folio *folio; loff_t fsize; @@ -2256,9 +2259,10 @@ static int fuse_write_begin(struct file *file, struct address_space *mapping, return err; } -static int fuse_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct folio *folio, void *fsdata) +static int fuse_write_end(const struct kiocb *iocb, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct folio *folio, void *fsdata) { struct inode *inode = folio->mapping->host; diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h index a0c7cb0f79fc..c3fd3172fdd6 100644 --- a/fs/hfs/hfs_fs.h +++ b/fs/hfs/hfs_fs.h @@ -201,7 +201,7 @@ extern int hfs_get_block(struct inode *, sector_t, struct buffer_head *, int); extern const struct address_space_operations hfs_aops; extern const struct address_space_operations hfs_btree_aops; -int hfs_write_begin(struct file *file, struct address_space *mapping, +int hfs_write_begin(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata); extern struct inode *hfs_new_inode(struct inode *, const struct qstr *, umode_t); extern void hfs_inode_write_fork(struct inode *, struct hfs_extent *, __be32 *, __be32 *); diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index a81ce7a740b9..096f338134f9 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -44,12 +44,12 @@ static void hfs_write_failed(struct address_space *mapping, loff_t to) } } -int hfs_write_begin(struct file *file, struct address_space *mapping, +int hfs_write_begin(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { int ret; - ret = cont_write_begin(file, mapping, pos, len, foliop, fsdata, + ret = cont_write_begin(iocb, mapping, pos, len, foliop, fsdata, hfs_get_block, &HFS_I(mapping->host)->phys_size); if (unlikely(ret)) diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index 2f089bff0095..3d5c65aef3b2 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -473,8 +473,10 @@ extern const struct address_space_operations hfsplus_aops; extern const struct address_space_operations hfsplus_btree_aops; extern const struct dentry_operations hfsplus_dentry_operations; -int hfsplus_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, struct folio **foliop, void **fsdata); +int hfsplus_write_begin(const struct kiocb *iocb, + struct address_space *mapping, + loff_t pos, unsigned len, struct folio **foliop, + void **fsdata); struct inode *hfsplus_new_inode(struct super_block *sb, struct inode *dir, umode_t mode); void hfsplus_delete_inode(struct inode *inode); diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index f331e9574217..97d75bb2c388 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -38,12 +38,14 @@ static void hfsplus_write_failed(struct address_space *mapping, loff_t to) } } -int hfsplus_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, struct folio **foliop, void **fsdata) +int hfsplus_write_begin(const struct kiocb *iocb, + struct address_space *mapping, loff_t pos, + unsigned len, struct folio **foliop, + void **fsdata) { int ret; - ret = cont_write_begin(file, mapping, pos, len, foliop, fsdata, + ret = cont_write_begin(iocb, mapping, pos, len, foliop, fsdata, hfsplus_get_block, &HFSPLUS_I(mapping->host)->phys_size); if (unlikely(ret)) diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 702c41317589..6c2cf0cdf3d6 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -445,7 +445,8 @@ static int hostfs_read_folio(struct file *file, struct folio *folio) return ret; } -static int hostfs_write_begin(struct file *file, struct address_space *mapping, +static int hostfs_write_begin(const struct kiocb *iocb, + struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { @@ -458,7 +459,8 @@ static int hostfs_write_begin(struct file *file, struct address_space *mapping, return 0; } -static int hostfs_write_end(struct file *file, struct address_space *mapping, +static int hostfs_write_end(const struct kiocb *iocb, + struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct folio *folio, void *fsdata) { @@ -468,7 +470,7 @@ static int hostfs_write_end(struct file *file, struct address_space *mapping, int err; buffer = kmap_local_folio(folio, from); - err = write_file(FILE_HOSTFS_I(file)->fd, &pos, buffer, copied); + err = write_file(FILE_HOSTFS_I(iocb->ki_filp)->fd, &pos, buffer, copied); kunmap_local(buffer); if (!folio_test_uptodate(folio) && err == folio_size(folio)) diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c index 449a3fc1b8d9..7b95a3d2e2a6 100644 --- a/fs/hpfs/file.c +++ b/fs/hpfs/file.c @@ -188,13 +188,14 @@ static void hpfs_write_failed(struct address_space *mapping, loff_t to) hpfs_unlock(inode->i_sb); } -static int hpfs_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, - struct folio **foliop, void **fsdata) +static int hpfs_write_begin(const struct kiocb *iocb, + struct address_space *mapping, + loff_t pos, unsigned len, + struct folio **foliop, void **fsdata) { int ret; - ret = cont_write_begin(file, mapping, pos, len, foliop, fsdata, + ret = cont_write_begin(iocb, mapping, pos, len, foliop, fsdata, hpfs_get_block, &hpfs_i(mapping->host)->mmu_private); if (unlikely(ret)) @@ -203,13 +204,14 @@ static int hpfs_write_begin(struct file *file, struct address_space *mapping, return ret; } -static int hpfs_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct folio *folio, void *fsdata) +static int hpfs_write_end(const struct kiocb *iocb, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct folio *folio, void *fsdata) { struct inode *inode = mapping->host; int err; - err = generic_write_end(file, mapping, pos, len, copied, folio, fsdata); + err = generic_write_end(iocb, mapping, pos, len, copied, folio, fsdata); if (err < len) hpfs_write_failed(mapping, pos + len); if (!(err < 0)) { diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index e4de5425838d..541aae502d4d 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -311,7 +311,7 @@ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to) return retval; } -static int hugetlbfs_write_begin(struct file *file, +static int hugetlbfs_write_begin(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata) @@ -319,9 +319,10 @@ static int hugetlbfs_write_begin(struct file *file, return -EINVAL; } -static int hugetlbfs_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct folio *folio, void *fsdata) +static int hugetlbfs_write_end(const struct kiocb *iocb, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct folio *folio, void *fsdata) { BUG(); return -EINVAL; diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c index 13c18ccc13b0..adec3af9bf8d 100644 --- a/fs/jffs2/file.c +++ b/fs/jffs2/file.c @@ -21,12 +21,14 @@ #include #include "nodelist.h" -static int jffs2_write_end(struct file *filp, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct folio *folio, void *fsdata); -static int jffs2_write_begin(struct file *filp, struct address_space *mapping, - loff_t pos, unsigned len, - struct folio **foliop, void **fsdata); +static int jffs2_write_end(const struct kiocb *iocb, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct folio *folio, void *fsdata); +static int jffs2_write_begin(const struct kiocb *iocb, + struct address_space *mapping, + loff_t pos, unsigned len, + struct folio **foliop, void **fsdata); static int jffs2_read_folio(struct file *filp, struct folio *folio); int jffs2_fsync(struct file *filp, loff_t start, loff_t end, int datasync) @@ -121,9 +123,10 @@ static int jffs2_read_folio(struct file *file, struct folio *folio) return ret; } -static int jffs2_write_begin(struct file *filp, struct address_space *mapping, - loff_t pos, unsigned len, - struct folio **foliop, void **fsdata) +static int jffs2_write_begin(const struct kiocb *iocb, + struct address_space *mapping, + loff_t pos, unsigned len, + struct folio **foliop, void **fsdata) { struct folio *folio; struct inode *inode = mapping->host; @@ -235,9 +238,10 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping, return ret; } -static int jffs2_write_end(struct file *filp, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct folio *folio, void *fsdata) +static int jffs2_write_end(const struct kiocb *iocb, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct folio *folio, void *fsdata) { /* Actually commit the write from the page cache page we're looking at. * For now, we write the full page out each time. It sucks, but it's simple diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index 60fc92dee24d..083e7fa54709 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -290,9 +290,10 @@ static void jfs_write_failed(struct address_space *mapping, loff_t to) } } -static int jfs_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, - struct folio **foliop, void **fsdata) +static int jfs_write_begin(const struct kiocb *iocb, + struct address_space *mapping, + loff_t pos, unsigned len, + struct folio **foliop, void **fsdata) { int ret; @@ -303,13 +304,14 @@ static int jfs_write_begin(struct file *file, struct address_space *mapping, return ret; } -static int jfs_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, struct folio *folio, - void *fsdata) +static int jfs_write_end(const struct kiocb *iocb, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct folio *folio, void *fsdata) { int ret; - ret = generic_write_end(file, mapping, pos, len, copied, folio, fsdata); + ret = generic_write_end(iocb, mapping, pos, len, copied, folio, fsdata); if (ret < len) jfs_write_failed(mapping, pos + len); return ret; diff --git a/fs/libfs.c b/fs/libfs.c index f99ecc300647..52196566ccbc 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -910,7 +910,7 @@ static int simple_read_folio(struct file *file, struct folio *folio) return 0; } -int simple_write_begin(struct file *file, struct address_space *mapping, +int simple_write_begin(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { @@ -935,7 +935,7 @@ EXPORT_SYMBOL(simple_write_begin); /** * simple_write_end - .write_end helper for non-block-device FSes - * @file: See .write_end of address_space_operations + * @iocb: kernel I/O control block * @mapping: " * @pos: " * @len: " @@ -956,9 +956,10 @@ EXPORT_SYMBOL(simple_write_begin); * * Use *ONLY* with simple_read_folio() */ -static int simple_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct folio *folio, void *fsdata) +static int simple_write_end(const struct kiocb *iocb, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct folio *folio, void *fsdata) { struct inode *inode = folio->mapping->host; loff_t last_pos = pos + copied; diff --git a/fs/minix/inode.c b/fs/minix/inode.c index f007e389d5d2..df9d11479caf 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -442,9 +442,10 @@ static void minix_write_failed(struct address_space *mapping, loff_t to) } } -static int minix_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, - struct folio **foliop, void **fsdata) +static int minix_write_begin(const struct kiocb *iocb, + struct address_space *mapping, + loff_t pos, unsigned len, + struct folio **foliop, void **fsdata) { int ret; diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 033feeab8c34..2bd557ca1af9 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -342,12 +342,14 @@ static bool nfs_want_read_modify_write(struct file *file, struct folio *folio, * If the writer ends up delaying the write, the writer needs to * increment the page use counts until he is done with the page. */ -static int nfs_write_begin(struct file *file, struct address_space *mapping, +static int nfs_write_begin(const struct kiocb *iocb, + struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { fgf_t fgp = FGP_WRITEBEGIN; struct folio *folio; + struct file *file = iocb->ki_filp; int once_thru = 0; int ret; @@ -377,10 +379,12 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping, return ret; } -static int nfs_write_end(struct file *file, struct address_space *mapping, +static int nfs_write_end(const struct kiocb *iocb, + struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct folio *folio, void *fsdata) { + struct file *file = iocb->ki_filp; struct nfs_open_context *ctx = nfs_file_open_context(file); unsigned offset = offset_in_folio(folio, pos); int status; diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 6613b8fcceb0..c2ccafdf4a19 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -218,7 +218,8 @@ void nilfs_write_failed(struct address_space *mapping, loff_t to) } } -static int nilfs_write_begin(struct file *file, struct address_space *mapping, +static int nilfs_write_begin(const struct kiocb *iocb, + struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata) @@ -237,7 +238,8 @@ static int nilfs_write_begin(struct file *file, struct address_space *mapping, return err; } -static int nilfs_write_end(struct file *file, struct address_space *mapping, +static int nilfs_write_end(const struct kiocb *iocb, + struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct folio *folio, void *fsdata) { @@ -248,7 +250,7 @@ static int nilfs_write_end(struct file *file, struct address_space *mapping, nr_dirty = nilfs_page_count_clean_buffers(folio, start, start + copied); - copied = generic_write_end(file, mapping, pos, len, copied, folio, + copied = generic_write_end(iocb, mapping, pos, len, copied, folio, fsdata); nilfs_set_file_dirty(inode, nr_dirty); err = nilfs_transaction_commit(inode->i_sb); diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c index 1e99a35691cd..1590c2a3c48f 100644 --- a/fs/ntfs3/file.c +++ b/fs/ntfs3/file.c @@ -154,13 +154,13 @@ static int ntfs_extend_initialized_size(struct file *file, if (pos + len > new_valid) len = new_valid - pos; - err = ntfs_write_begin(file, mapping, pos, len, &folio, NULL); + err = ntfs_write_begin(NULL, mapping, pos, len, &folio, NULL); if (err) goto out; folio_zero_range(folio, zerofrom, folio_size(folio) - zerofrom); - err = ntfs_write_end(file, mapping, pos, len, len, folio, NULL); + err = ntfs_write_end(NULL, mapping, pos, len, len, folio, NULL); if (err < 0) goto out; pos += len; diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c index 0f0d27d4644a..dad088e64b3c 100644 --- a/fs/ntfs3/inode.c +++ b/fs/ntfs3/inode.c @@ -912,7 +912,7 @@ static int ntfs_get_block_write_begin(struct inode *inode, sector_t vbn, bh_result, create, GET_BLOCK_WRITE_BEGIN); } -int ntfs_write_begin(struct file *file, struct address_space *mapping, +int ntfs_write_begin(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, u32 len, struct folio **foliop, void **fsdata) { int err; @@ -957,7 +957,8 @@ int ntfs_write_begin(struct file *file, struct address_space *mapping, /* * ntfs_write_end - Address_space_operations::write_end. */ -int ntfs_write_end(struct file *file, struct address_space *mapping, loff_t pos, +int ntfs_write_end(const struct kiocb *iocb, + struct address_space *mapping, loff_t pos, u32 len, u32 copied, struct folio *folio, void *fsdata) { struct inode *inode = mapping->host; @@ -989,7 +990,7 @@ int ntfs_write_end(struct file *file, struct address_space *mapping, loff_t pos, folio_unlock(folio); folio_put(folio); } else { - err = generic_write_end(file, mapping, pos, len, copied, folio, + err = generic_write_end(iocb, mapping, pos, len, copied, folio, fsdata); } diff --git a/fs/ntfs3/ntfs_fs.h b/fs/ntfs3/ntfs_fs.h index 36b8052660d5..921257773eec 100644 --- a/fs/ntfs3/ntfs_fs.h +++ b/fs/ntfs3/ntfs_fs.h @@ -702,10 +702,12 @@ struct inode *ntfs_iget5(struct super_block *sb, const struct MFT_REF *ref, int ntfs_set_size(struct inode *inode, u64 new_size); int ntfs_get_block(struct inode *inode, sector_t vbn, struct buffer_head *bh_result, int create); -int ntfs_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, u32 len, struct folio **foliop, void **fsdata); -int ntfs_write_end(struct file *file, struct address_space *mapping, loff_t pos, - u32 len, u32 copied, struct folio *folio, void *fsdata); +int ntfs_write_begin(const struct kiocb *iocb, struct address_space *mapping, + loff_t pos, u32 len, struct folio **foliop, + void **fsdata); +int ntfs_write_end(const struct kiocb *iocb, struct address_space *mapping, + loff_t pos, u32 len, u32 copied, struct folio *folio, + void *fsdata); int ntfs3_write_inode(struct inode *inode, struct writeback_control *wbc); int ntfs_sync_inode(struct inode *inode); int inode_read_data(struct inode *inode, void *data, size_t bytes); diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 40b6bce12951..2203438738f6 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -1856,7 +1856,8 @@ int ocfs2_write_begin_nolock(struct address_space *mapping, return ret; } -static int ocfs2_write_begin(struct file *file, struct address_space *mapping, +static int ocfs2_write_begin(const struct kiocb *iocb, + struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { @@ -2047,7 +2048,8 @@ int ocfs2_write_end_nolock(struct address_space *mapping, loff_t pos, return copied; } -static int ocfs2_write_end(struct file *file, struct address_space *mapping, +static int ocfs2_write_end(const struct kiocb *iocb, + struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct folio *folio, void *fsdata) { diff --git a/fs/omfs/file.c b/fs/omfs/file.c index 98358d405b6a..8d70f816b0c9 100644 --- a/fs/omfs/file.c +++ b/fs/omfs/file.c @@ -310,9 +310,10 @@ static void omfs_write_failed(struct address_space *mapping, loff_t to) } } -static int omfs_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, - struct folio **foliop, void **fsdata) +static int omfs_write_begin(const struct kiocb *iocb, + struct address_space *mapping, + loff_t pos, unsigned len, + struct folio **foliop, void **fsdata) { int ret; diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index 08a6f372a352..a7ab63776735 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -285,9 +285,10 @@ static int orangefs_read_folio(struct file *file, struct folio *folio) return ret; } -static int orangefs_write_begin(struct file *file, - struct address_space *mapping, loff_t pos, unsigned len, - struct folio **foliop, void **fsdata) +static int orangefs_write_begin(const struct kiocb *iocb, + struct address_space *mapping, loff_t pos, + unsigned len, struct folio **foliop, + void **fsdata) { struct orangefs_write_range *wr; struct folio *folio; @@ -340,9 +341,10 @@ static int orangefs_write_begin(struct file *file, return 0; } -static int orangefs_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, struct folio *folio, - void *fsdata) +static int orangefs_write_end(const struct kiocb *iocb, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct folio *folio, void *fsdata) { struct inode *inode = folio->mapping->host; loff_t last_pos = pos + copied; @@ -372,7 +374,7 @@ static int orangefs_write_end(struct file *file, struct address_space *mapping, folio_unlock(folio); folio_put(folio); - mark_inode_dirty_sync(file_inode(file)); + mark_inode_dirty_sync(file_inode(iocb->ki_filp)); return copied; } diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index bf311c38d9a8..cee8bec1ea26 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -404,7 +404,8 @@ static int allocate_budget(struct ubifs_info *c, struct folio *folio, * there is a plenty of flash space and the budget will be acquired quickly, * without forcing write-back. The slow path does not make this assumption. */ -static int ubifs_write_begin(struct file *file, struct address_space *mapping, +static int ubifs_write_begin(const struct kiocb *iocb, + struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { @@ -514,8 +515,9 @@ static void cancel_budget(struct ubifs_info *c, struct folio *folio, } } -static int ubifs_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, +static int ubifs_write_end(const struct kiocb *iocb, + struct address_space *mapping, loff_t pos, + unsigned len, unsigned copied, struct folio *folio, void *fsdata) { struct inode *inode = mapping->host; diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 4386dd845e40..356b75676fa9 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -244,10 +244,12 @@ static void udf_readahead(struct readahead_control *rac) mpage_readahead(rac, udf_get_block); } -static int udf_write_begin(struct file *file, struct address_space *mapping, +static int udf_write_begin(const struct kiocb *iocb, + struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { + struct file *file = iocb->ki_filp; struct udf_inode_info *iinfo = UDF_I(file_inode(file)); struct folio *folio; int ret; @@ -271,15 +273,16 @@ static int udf_write_begin(struct file *file, struct address_space *mapping, return 0; } -static int udf_write_end(struct file *file, struct address_space *mapping, +static int udf_write_end(const struct kiocb *iocb, + struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct folio *folio, void *fsdata) { - struct inode *inode = file_inode(file); + struct inode *inode = file_inode(iocb->ki_filp); loff_t last_pos; if (UDF_I(inode)->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) - return generic_write_end(file, mapping, pos, len, copied, folio, + return generic_write_end(iocb, mapping, pos, len, copied, folio, fsdata); last_pos = pos + copied; if (last_pos > inode->i_size) diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index 7dc38fdef2ea..8361c00e8fa6 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -474,9 +474,10 @@ static void ufs_write_failed(struct address_space *mapping, loff_t to) } } -static int ufs_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, - struct folio **foliop, void **fsdata) +static int ufs_write_begin(const struct kiocb *iocb, + struct address_space *mapping, + loff_t pos, unsigned len, + struct folio **foliop, void **fsdata) { int ret; @@ -487,13 +488,14 @@ static int ufs_write_begin(struct file *file, struct address_space *mapping, return ret; } -static int ufs_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct folio *folio, void *fsdata) +static int ufs_write_end(const struct kiocb *iocb, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct folio *folio, void *fsdata) { int ret; - ret = generic_write_end(file, mapping, pos, len, copied, folio, fsdata); + ret = generic_write_end(iocb, mapping, pos, len, copied, folio, fsdata); if (ret < len) ufs_write_failed(mapping, pos + len); return ret; diff --git a/fs/vboxsf/file.c b/fs/vboxsf/file.c index b492794f8e9a..af01e3beaa42 100644 --- a/fs/vboxsf/file.c +++ b/fs/vboxsf/file.c @@ -300,12 +300,13 @@ static int vboxsf_writepages(struct address_space *mapping, return error; } -static int vboxsf_write_end(struct file *file, struct address_space *mapping, +static int vboxsf_write_end(const struct kiocb *iocb, + struct address_space *mapping, loff_t pos, unsigned int len, unsigned int copied, struct folio *folio, void *fsdata) { struct inode *inode = mapping->host; - struct vboxsf_handle *sf_handle = file->private_data; + struct vboxsf_handle *sf_handle = iocb->ki_filp->private_data; size_t from = offset_in_folio(folio, pos); u32 nwritten = len; u8 *buf; diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 178eb90e9cf3..b16b88bfbc3e 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -263,11 +263,11 @@ int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len, int __block_write_begin(struct folio *folio, loff_t pos, unsigned len, get_block_t *get_block); int block_write_end(loff_t pos, unsigned len, unsigned copied, struct folio *); -int generic_write_end(struct file *, struct address_space *, +int generic_write_end(const struct kiocb *, struct address_space *, loff_t, unsigned len, unsigned copied, struct folio *, void *); void folio_zero_new_buffers(struct folio *folio, size_t from, size_t to); -int cont_write_begin(struct file *, struct address_space *, loff_t, +int cont_write_begin(const struct kiocb *, struct address_space *, loff_t, unsigned, struct folio **, void **, get_block_t *, loff_t *); int generic_cont_expand_simple(struct inode *inode, loff_t size); diff --git a/include/linux/fs.h b/include/linux/fs.h index 09e3e80b0528..df8c503100c4 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -444,10 +444,10 @@ struct address_space_operations { void (*readahead)(struct readahead_control *); - int (*write_begin)(struct file *, struct address_space *mapping, + int (*write_begin)(const struct kiocb *, struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata); - int (*write_end)(struct file *, struct address_space *mapping, + int (*write_end)(const struct kiocb *, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct folio *folio, void *fsdata); @@ -3598,9 +3598,10 @@ extern void simple_recursive_removal(struct dentry *, extern int noop_fsync(struct file *, loff_t, loff_t, int); extern ssize_t noop_direct_IO(struct kiocb *iocb, struct iov_iter *iter); extern int simple_empty(struct dentry *); -extern int simple_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, - struct folio **foliop, void **fsdata); +extern int simple_write_begin(const struct kiocb *iocb, + struct address_space *mapping, + loff_t pos, unsigned len, + struct folio **foliop, void **fsdata); extern const struct address_space_operations ram_aops; extern int always_delete_dentry(const struct dentry *); extern struct inode *alloc_anon_inode(struct super_block *); diff --git a/mm/filemap.c b/mm/filemap.c index bada249b9fb7..ba089d75fc86 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -4109,7 +4109,7 @@ ssize_t generic_perform_write(struct kiocb *iocb, struct iov_iter *i) break; } - status = a_ops->write_begin(file, mapping, pos, bytes, + status = a_ops->write_begin(iocb, mapping, pos, bytes, &folio, &fsdata); if (unlikely(status < 0)) break; @@ -4130,7 +4130,7 @@ ssize_t generic_perform_write(struct kiocb *iocb, struct iov_iter *i) copied = copy_folio_from_iter_atomic(folio, offset, bytes, i); flush_dcache_folio(folio); - status = a_ops->write_end(file, mapping, pos, bytes, copied, + status = a_ops->write_end(iocb, mapping, pos, bytes, copied, folio, fsdata); if (unlikely(status != copied)) { iov_iter_revert(i, copied - max(status, 0L)); diff --git a/mm/shmem.c b/mm/shmem.c index 0c5fb4ffa03a..2229425e1b29 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3266,9 +3266,9 @@ static const struct inode_operations shmem_symlink_inode_operations; static const struct inode_operations shmem_short_symlink_operations; static int -shmem_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, - struct folio **foliop, void **fsdata) +shmem_write_begin(const struct kiocb *iocb, struct address_space *mapping, + loff_t pos, unsigned len, + struct folio **foliop, void **fsdata) { struct inode *inode = mapping->host; struct shmem_inode_info *info = SHMEM_I(inode); @@ -3300,9 +3300,9 @@ shmem_write_begin(struct file *file, struct address_space *mapping, } static int -shmem_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct folio *folio, void *fsdata) +shmem_write_end(const struct kiocb *iocb, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct folio *folio, void *fsdata) { struct inode *inode = mapping->host; From b799474b9aeb46ec698874d4de1a799de8b5f64f Mon Sep 17 00:00:00 2001 From: Taotao Chen Date: Wed, 16 Jul 2025 09:36:08 +0000 Subject: [PATCH 21/24] mm/pagemap: add write_begin_get_folio() helper function Add write_begin_get_folio() to simplify the common folio lookup logic used by filesystem ->write_begin() implementations. This helper wraps __filemap_get_folio() with common flags such as FGP_WRITEBEGIN, conditional FGP_DONTCACHE, and set folio order based on the write length. Part of a series refactoring address_space_operations write_begin and write_end callbacks to use struct kiocb for passing write context and flags. Signed-off-by: Taotao Chen Link: https://lore.kernel.org/20250716093559.217344-5-chentaotao@didiglobal.com Signed-off-by: Christian Brauner --- include/linux/pagemap.h | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index e63fbfbd5b0f..ce2bcdcadb73 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -750,6 +750,33 @@ struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index, struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index, fgf_t fgp_flags, gfp_t gfp); +/** + * write_begin_get_folio - Get folio for write_begin with flags. + * @iocb: The kiocb passed from write_begin (may be NULL). + * @mapping: The address space to search. + * @index: The page cache index. + * @len: Length of data being written. + * + * This is a helper for filesystem write_begin() implementations. + * It wraps __filemap_get_folio(), setting appropriate flags in + * the write begin context. + * + * Return: A folio or an ERR_PTR. + */ +static inline struct folio *write_begin_get_folio(const struct kiocb *iocb, + struct address_space *mapping, pgoff_t index, size_t len) +{ + fgf_t fgp_flags = FGP_WRITEBEGIN; + + fgp_flags |= fgf_set_order(len); + + if (iocb && iocb->ki_flags & IOCB_DONTCACHE) + fgp_flags |= FGP_DONTCACHE; + + return __filemap_get_folio(mapping, index, fgp_flags, + mapping_gfp_mask(mapping)); +} + /** * filemap_get_folio - Find and get a folio. * @mapping: The address_space to search. From ae21c0c0ac56aa734327e9c8b7dfef4270ab54d4 Mon Sep 17 00:00:00 2001 From: Taotao Chen Date: Wed, 16 Jul 2025 09:36:09 +0000 Subject: [PATCH 22/24] ext4: support uncached buffered I/O Set FOP_DONTCACHE in ext4_file_operations to declare support for uncached buffered I/O. To handle this flag, update ext4_write_begin() and ext4_da_write_begin() to use write_begin_get_folio(), which encapsulates FGP_DONTCACHE logic based on iocb->ki_flags. Part of a series refactoring address_space_operations write_begin and write_end callbacks to use struct kiocb for passing write context and flags. Signed-off-by: Taotao Chen Link: https://lore.kernel.org/20250716093559.217344-6-chentaotao@didiglobal.com Signed-off-by: Christian Brauner --- fs/ext4/file.c | 3 ++- fs/ext4/inode.c | 12 +++--------- 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 21df81347147..274b41a476c8 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -977,7 +977,8 @@ const struct file_operations ext4_file_operations = { .splice_write = iter_file_splice_write, .fallocate = ext4_fallocate, .fop_flags = FOP_MMAP_SYNC | FOP_BUFFER_RASYNC | - FOP_DIO_PARALLEL_WRITE, + FOP_DIO_PARALLEL_WRITE | + FOP_DONTCACHE, }; const struct inode_operations ext4_file_inode_operations = { diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 9a16efd072bb..5c7024051f1e 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1264,7 +1264,6 @@ static int ext4_write_begin(const struct kiocb *iocb, struct folio *folio; pgoff_t index; unsigned from, to; - fgf_t fgp = FGP_WRITEBEGIN; ret = ext4_emergency_state(inode->i_sb); if (unlikely(ret)) @@ -1288,16 +1287,14 @@ static int ext4_write_begin(const struct kiocb *iocb, } /* - * __filemap_get_folio() can take a long time if the + * write_begin_get_folio() can take a long time if the * system is thrashing due to memory pressure, or if the folio * is being written back. So grab it first before we start * the transaction handle. This also allows us to allocate * the folio (if needed) without using GFP_NOFS. */ retry_grab: - fgp |= fgf_set_order(len); - folio = __filemap_get_folio(mapping, index, fgp, - mapping_gfp_mask(mapping)); + folio = write_begin_get_folio(iocb, mapping, index, len); if (IS_ERR(folio)) return PTR_ERR(folio); @@ -3046,7 +3043,6 @@ static int ext4_da_write_begin(const struct kiocb *iocb, struct folio *folio; pgoff_t index; struct inode *inode = mapping->host; - fgf_t fgp = FGP_WRITEBEGIN; ret = ext4_emergency_state(inode->i_sb); if (unlikely(ret)) @@ -3072,9 +3068,7 @@ static int ext4_da_write_begin(const struct kiocb *iocb, } retry: - fgp |= fgf_set_order(len); - folio = __filemap_get_folio(mapping, index, fgp, - mapping_gfp_mask(mapping)); + folio = write_begin_get_folio(iocb, mapping, index, len); if (IS_ERR(folio)) return PTR_ERR(folio); From ecb6cc0fd8cd2d34b983e118aa61dd8c9b052d0d Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Mon, 21 Jul 2025 19:09:55 +0200 Subject: [PATCH 23/24] eventpoll: fix sphinx documentation build warning Sphinx complains that ep_get_upwards_depth_proc() has a kerneldoc-style comment without documenting its parameters. This is an internal function that was not meant to show up in kernel documentation, so fix the warning by changing the comment to a non-kerneldoc one. Fixes: 22bacca48a17 ("epoll: prevent creating circular epoll structures") Reported-by: Stephen Rothwell Closes: https://lore.kernel.org/r/20250717173655.10ecdce6@canb.auug.org.au Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202507171958.aMcW08Cn-lkp@intel.com/ Signed-off-by: Jann Horn Link: https://lore.kernel.org/20250721-epoll-sphinx-fix-v1-1-b695c92bf009@google.com Tested-by: Randy Dunlap Acked-by: Randy Dunlap Signed-off-by: Christian Brauner --- fs/eventpoll.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 44648cc09250..02ac05322b1b 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -2192,9 +2192,7 @@ static int ep_loop_check_proc(struct eventpoll *ep, int depth) return result; } -/** - * ep_get_upwards_depth_proc - determine depth of @ep when traversed upwards - */ +/* ep_get_upwards_depth_proc - determine depth of @ep when traversed upwards */ static int ep_get_upwards_depth_proc(struct eventpoll *ep, int depth) { int result = 0; From 4e8fc4f7208b032674ef8a4977b96484c328515c Mon Sep 17 00:00:00 2001 From: Yue Haibing Date: Wed, 23 Jul 2025 20:23:29 +0800 Subject: [PATCH 24/24] netfs: Remove unused declaration netfs_queue_write_request() Commit c245868524cc ("netfs: Remove the old writeback code") removed the implementation but leave declaration. Signed-off-by: Yue Haibing Link: https://lore.kernel.org/20250723122329.923223-1-yuehaibing@huawei.com Signed-off-by: Christian Brauner --- include/linux/netfs.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 065c17385e53..486d5766fe77 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -443,7 +443,6 @@ size_t netfs_limit_iter(const struct iov_iter *iter, size_t start_offset, size_t max_size, size_t max_segs); void netfs_prepare_write_failed(struct netfs_io_subrequest *subreq); void netfs_write_subrequest_terminated(void *_op, ssize_t transferred_or_error); -void netfs_queue_write_request(struct netfs_io_subrequest *subreq); int netfs_start_io_read(struct inode *inode); void netfs_end_io_read(struct inode *inode);