From 1888635532fbbd6be4a4368621085c3a197279f8 Mon Sep 17 00:00:00 2001 From: Julian Sun Date: Tue, 30 Sep 2025 16:53:15 +0800 Subject: writeback: Wake up waiting tasks when finishing the writeback of a chunk. Writing back a large number of pages can take a lot of time. This issue is exacerbated when the underlying device is slow or subject to block layer rate limiting, which in turn triggers unexpected hung task warnings. We can trigger a wake-up once a chunk has been written back and the waiting time for writeback exceeds half of sysctl_hung_task_timeout_secs. This action allows the hung task detector to be aware of the writeback progress, thereby eliminating these unexpected hung task warnings. This patch has passed the xfstests 'check -g quick' test based on ext4, with no additional failures introduced. Signed-off-by: Julian Sun Reviewed-by: Jan Kara Suggested-by: Peter Zijlstra Signed-off-by: Christian Brauner --- fs/fs-writeback.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'fs/fs-writeback.c') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 2b35e80037feed..61a980a06ceeb3 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -14,6 +14,7 @@ * Additions for address_space-based writeback */ +#include #include #include #include @@ -213,7 +214,8 @@ static void wb_queue_work(struct bdi_writeback *wb, void wb_wait_for_completion(struct wb_completion *done) { atomic_dec(&done->cnt); /* put down the initial count */ - wait_event(*done->waitq, !atomic_read(&done->cnt)); + wait_event(*done->waitq, + ({ done->progress_stamp = jiffies; !atomic_read(&done->cnt); })); } #ifdef CONFIG_CGROUP_WRITEBACK @@ -2014,6 +2016,12 @@ static long writeback_sb_inodes(struct super_block *sb, */ __writeback_single_inode(inode, &wbc); + /* Report progress to inform the hung task detector of the progress.
*/ + if (work->done && work->done->progress_stamp && + (jiffies - work->done->progress_stamp) > HZ * + sysctl_hung_task_timeout_secs / 2) + wake_up_all(work->done->waitq); + wbc_detach_inode(&wbc); work->nr_pages -= write_chunk - wbc.nr_to_write; wrote = write_chunk - wbc.nr_to_write - wbc.pages_skipped; -- cgit 1.2.3-korg From d6e6215907640801b1f407dc9e871b19ca5a3805 Mon Sep 17 00:00:00 2001 From: Julian Sun Date: Tue, 30 Sep 2025 15:18:29 +0800 Subject: writeback: Add logging for slow writeback (exceeds sysctl_hung_task_timeout_secs) When a writeback work lasts for sysctl_hung_task_timeout_secs, we want to identify that there are tasks waiting for a long time; this helps us pinpoint potential issues. Additionally, recording the starting jiffies is useful when debugging a crashed vmcore. Signed-off-by: Julian Sun Signed-off-by: Christian Brauner --- fs/fs-writeback.c | 17 +++++++++++++++-- include/linux/backing-dev-defs.h | 1 + 2 files changed, 16 insertions(+), 2 deletions(-) (limited to 'fs/fs-writeback.c') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 61a980a06ceeb3..e76192d140e3d8 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -201,6 +201,19 @@ static void wb_queue_work(struct bdi_writeback *wb, spin_unlock_irq(&wb->work_lock); } +static bool wb_wait_for_completion_cb(struct wb_completion *done) +{ + unsigned long waited_secs = (jiffies - done->wait_start) / HZ; + + done->progress_stamp = jiffies; + if (waited_secs > sysctl_hung_task_timeout_secs) + pr_info("INFO: The task %s:%d has been waiting for writeback " + "completion for more than %lu seconds.", + current->comm, current->pid, waited_secs); + + return !atomic_read(&done->cnt); +} + /** * wb_wait_for_completion - wait for completion of bdi_writeback_works * @done: target wb_completion @@ -213,9 +226,9 @@ static void wb_queue_work(struct bdi_writeback *wb, */ void wb_wait_for_completion(struct wb_completion *done) { + done->wait_start = jiffies; atomic_dec(&done->cnt); /* put down the
initial count */ - wait_event(*done->waitq, - ({ done->progress_stamp = jiffies; !atomic_read(&done->cnt); })); + wait_event(*done->waitq, wb_wait_for_completion_cb(done)); } #ifdef CONFIG_CGROUP_WRITEBACK diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index c8aa749790b14e..610ef62b6a32d4 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -64,6 +64,7 @@ struct wb_completion { atomic_t cnt; wait_queue_head_t *waitq; unsigned long progress_stamp; /* The jiffies when slow progress is detected */ + unsigned long wait_start; /* The jiffies when waiting for the writeback work to finish */ }; #define __WB_COMPLETION_INIT(_waitq) \ -- cgit 1.2.3-korg From 1bcb413d0cd80efb386751910036a93147fd8dbc Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 24 Oct 2025 10:04:19 +0200 Subject: mm: remove filemap_fdatawrite_wbc Replace filemap_fdatawrite_wbc, which exposes a writeback_control to the callers, with a filemap_writeback helper that takes all the possible arguments and declares the writeback_control itself.
Signed-off-by: Christoph Hellwig Link: https://patch.msgid.link/20251024080431.324236-9-hch@lst.de Reviewed-by: David Hildenbrand Reviewed-by: Jan Kara Reviewed-by: Damien Le Moal Reviewed-by: Johannes Thumshirn Signed-off-by: Christian Brauner --- fs/fs-writeback.c | 6 +++--- include/linux/pagemap.h | 2 -- mm/filemap.c | 54 +++++++++++++++++-------------------------------- 3 files changed, 21 insertions(+), 41 deletions(-) (limited to 'fs/fs-writeback.c') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index e76192d140e3d8..4448de35ec8bc2 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -822,9 +822,9 @@ static void wbc_attach_and_unlock_inode(struct writeback_control *wbc, * @wbc: writeback_control of interest * @inode: target inode * - * This function is to be used by __filemap_fdatawrite_range(), which is an - * alternative entry point into writeback code, and first ensures @inode is - * associated with a bdi_writeback and attaches it to @wbc. + * This function is to be used by filemap_writeback(), which is an alternative + * entry point into writeback code, and first ensures @inode is associated with + * a bdi_writeback and attaches it to @wbc. 
*/ void wbc_attach_fdatawrite_inode(struct writeback_control *wbc, struct inode *inode) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index cebdf160d3ddff..678d8ae23d014c 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -60,8 +60,6 @@ int filemap_fdatawrite_range(struct address_space *mapping, loff_t start, loff_t end); int filemap_check_errors(struct address_space *mapping); void __filemap_set_wb_err(struct address_space *mapping, int err); -int filemap_fdatawrite_wbc(struct address_space *mapping, - struct writeback_control *wbc); int kiocb_write_and_wait(struct kiocb *iocb, size_t count); static inline int filemap_write_and_wait(struct address_space *mapping) diff --git a/mm/filemap.c b/mm/filemap.c index 3d4c4a96c586a5..7126d0587c949e 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -366,31 +366,30 @@ static int filemap_check_and_keep_errors(struct address_space *mapping) return 0; } -/** - * filemap_fdatawrite_wbc - start writeback on mapping dirty pages in range - * @mapping: address space structure to write - * @wbc: the writeback_control controlling the writeout - * - * Call writepages on the mapping using the provided wbc to control the - * writeout. - * - * Return: %0 on success, negative error code otherwise. - */ -int filemap_fdatawrite_wbc(struct address_space *mapping, - struct writeback_control *wbc) +static int filemap_writeback(struct address_space *mapping, loff_t start, + loff_t end, enum writeback_sync_modes sync_mode, + long *nr_to_write) { + struct writeback_control wbc = { + .sync_mode = sync_mode, + .nr_to_write = nr_to_write ? 
*nr_to_write : LONG_MAX, + .range_start = start, + .range_end = end, + }; int ret; if (!mapping_can_writeback(mapping) || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) return 0; - wbc_attach_fdatawrite_inode(wbc, mapping->host); - ret = do_writepages(mapping, wbc); - wbc_detach_inode(wbc); + wbc_attach_fdatawrite_inode(&wbc, mapping->host); + ret = do_writepages(mapping, &wbc); + wbc_detach_inode(&wbc); + + if (!ret && nr_to_write) + *nr_to_write = wbc.nr_to_write; return ret; } -EXPORT_SYMBOL(filemap_fdatawrite_wbc); /** * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range @@ -412,14 +411,7 @@ EXPORT_SYMBOL(filemap_fdatawrite_wbc); int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start, loff_t end, int sync_mode) { - struct writeback_control wbc = { - .sync_mode = sync_mode, - .nr_to_write = LONG_MAX, - .range_start = start, - .range_end = end, - }; - - return filemap_fdatawrite_wbc(mapping, &wbc); + return filemap_writeback(mapping, start, end, sync_mode, NULL); } int filemap_fdatawrite_range(struct address_space *mapping, loff_t start, @@ -475,18 +467,8 @@ EXPORT_SYMBOL(filemap_flush); */ int filemap_flush_nr(struct address_space *mapping, long *nr_to_write) { - struct writeback_control wbc = { - .nr_to_write = *nr_to_write, - .sync_mode = WB_SYNC_NONE, - .range_start = 0, - .range_end = LLONG_MAX, - }; - int ret; - - ret = filemap_fdatawrite_wbc(mapping, &wbc); - if (!ret) - *nr_to_write = wbc.nr_to_write; - return ret; + return filemap_writeback(mapping, 0, LLONG_MAX, WB_SYNC_NONE, + nr_to_write); } EXPORT_SYMBOL_FOR_MODULES(filemap_flush_nr, "btrfs"); -- cgit 1.2.3-korg From 151d0922bf638a4e4235758d04b31f48bfcbb798 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 17 Oct 2025 05:45:47 +0200 Subject: writeback: cleanup writeback_chunk_size Return the pages directly when calculated instead of first assigning them back to a variable, and directly return for the data integrity / tagged case instead of going 
through an else clause. Signed-off-by: Christoph Hellwig Link: https://patch.msgid.link/20251017034611.651385-2-hch@lst.de Reviewed-by: Damien Le Moal Reviewed-by: Darrick J. Wong Reviewed-by: Nirjhar Roy (IBM) Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/fs-writeback.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) (limited to 'fs/fs-writeback.c') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 4448de35ec8bc2..30de37865fa12a 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1908,16 +1908,12 @@ static long writeback_chunk_size(struct bdi_writeback *wb, * (maybe slowly) sync all tagged pages */ if (work->sync_mode == WB_SYNC_ALL || work->tagged_writepages) - pages = LONG_MAX; - else { - pages = min(wb->avg_write_bandwidth / 2, - global_wb_domain.dirty_limit / DIRTY_SCOPE); - pages = min(pages, work->nr_pages); - pages = round_down(pages + MIN_WRITEBACK_PAGES, - MIN_WRITEBACK_PAGES); - } + return LONG_MAX; - return pages; + pages = min(wb->avg_write_bandwidth / 2, + global_wb_domain.dirty_limit / DIRTY_SCOPE); + pages = min(pages, work->nr_pages); + return round_down(pages + MIN_WRITEBACK_PAGES, MIN_WRITEBACK_PAGES); } /* -- cgit 1.2.3-korg From 90db4d4441f58d433ecf74f7e3bd17e0a553c20c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 17 Oct 2025 05:45:48 +0200 Subject: writeback: allow the file system to override MIN_WRITEBACK_PAGES The relatively low minimal writeback size of 4MiB means that written back inodes on rotational media are switched a lot. Besides introducing additional seeks, this also can lead to extreme file fragmentation on zoned devices when a lot of files are cached relative to the available writeback bandwidth. Add a superblock field that allows the file system to override the default size. Signed-off-by: Christoph Hellwig Link: https://patch.msgid.link/20251017034611.651385-3-hch@lst.de Reviewed-by: Jan Kara Reviewed-by: Darrick J. 
Wong Signed-off-by: Christian Brauner --- fs/fs-writeback.c | 14 +++++--------- fs/super.c | 1 + include/linux/fs.h | 1 + include/linux/writeback.h | 5 +++++ 4 files changed, 12 insertions(+), 9 deletions(-) (limited to 'fs/fs-writeback.c') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 30de37865fa12a..52763fa499d624 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -32,11 +32,6 @@ #include #include "internal.h" -/* - * 4MB minimal write chunk size - */ -#define MIN_WRITEBACK_PAGES (4096UL >> (PAGE_SHIFT - 10)) - /* * Passed into wb_writeback(), essentially a subset of writeback_control */ @@ -1889,8 +1884,8 @@ static int writeback_single_inode(struct inode *inode, return ret; } -static long writeback_chunk_size(struct bdi_writeback *wb, - struct wb_writeback_work *work) +static long writeback_chunk_size(struct super_block *sb, + struct bdi_writeback *wb, struct wb_writeback_work *work) { long pages; @@ -1913,7 +1908,8 @@ static long writeback_chunk_size(struct bdi_writeback *wb, pages = min(wb->avg_write_bandwidth / 2, global_wb_domain.dirty_limit / DIRTY_SCOPE); pages = min(pages, work->nr_pages); - return round_down(pages + MIN_WRITEBACK_PAGES, MIN_WRITEBACK_PAGES); + return round_down(pages + sb->s_min_writeback_pages, + sb->s_min_writeback_pages); } /* @@ -2015,7 +2011,7 @@ static long writeback_sb_inodes(struct super_block *sb, inode->i_state |= I_SYNC; wbc_attach_and_unlock_inode(&wbc, inode); - write_chunk = writeback_chunk_size(wb, work); + write_chunk = writeback_chunk_size(inode->i_sb, wb, work); wbc.nr_to_write = write_chunk; wbc.pages_skipped = 0; diff --git a/fs/super.c b/fs/super.c index 5bab94fb7e0358..599c1d2641feb0 100644 --- a/fs/super.c +++ b/fs/super.c @@ -389,6 +389,7 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags, goto fail; if (list_lru_init_memcg(&s->s_inode_lru, s->s_shrink)) goto fail; + s->s_min_writeback_pages = MIN_WRITEBACK_PAGES; return s; fail: diff --git a/include/linux/fs.h 
b/include/linux/fs.h index a5dbfa20f8d74c..6bf369095d2e1b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1583,6 +1583,7 @@ struct super_block { spinlock_t s_inode_wblist_lock; struct list_head s_inodes_wb; /* writeback inodes */ + long s_min_writeback_pages; } __randomize_layout; static inline struct user_namespace *i_user_ns(const struct inode *inode) diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 22dd4adc5667d9..49e1dd96f43e53 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -374,4 +374,9 @@ bool redirty_page_for_writepage(struct writeback_control *, struct page *); void sb_mark_inode_writeback(struct inode *inode); void sb_clear_inode_writeback(struct inode *inode); +/* + * 4MB minimal write chunk size + */ +#define MIN_WRITEBACK_PAGES (4096UL >> (PAGE_SHIFT - 10)) + #endif /* WRITEBACK_H */ -- cgit 1.2.3-korg