From 6d840a18773f36baaecc2d2f7fb18ec5862349e6 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 9 Nov 2023 21:06:02 +0000 Subject: buffer: return bool from grow_dev_folio() Patch series "More buffer_head cleanups", v2. The first patch is a left-over from last cycle. The rest fix "obvious" block size > PAGE_SIZE problems. I haven't tested with a large block size setup (but I have done an ext4 xfstests run). This patch (of 7): Rename grow_dev_page() to grow_dev_folio() and make it return a bool. Document what that bool means; it's more subtle than it first appears. Also rename the 'failed' label to 'unlock' beacuse it's not exactly 'failed'. It just hasn't succeeded. Link: https://lkml.kernel.org/r/20231109210608.2252323-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Hannes Reinecke Cc: Luis Chamberlain Cc: Pankaj Raghav Cc: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/buffer.c | 50 +++++++++++++++++++++++++------------------------- 1 file changed, 25 insertions(+), 25 deletions(-) (limited to 'fs/buffer.c') diff --git a/fs/buffer.c b/fs/buffer.c index 967f34b70aa8f7..8dad6c691e143a 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1024,40 +1024,43 @@ static sector_t folio_init_buffers(struct folio *folio, } /* - * Create the page-cache page that contains the requested block. + * Create the page-cache folio that contains the requested block. * * This is used purely for blockdev mappings. + * + * Returns false if we have a 'permanent' failure. Returns true if + * we succeeded, or the caller should retry. */ -static int -grow_dev_page(struct block_device *bdev, sector_t block, - pgoff_t index, int size, int sizebits, gfp_t gfp) +static bool grow_dev_folio(struct block_device *bdev, sector_t block, + pgoff_t index, unsigned size, int sizebits, gfp_t gfp) { struct inode *inode = bdev->bd_inode; struct folio *folio; struct buffer_head *bh; - sector_t end_block; - int ret = 0; + sector_t end_block = 0; folio = __filemap_get_folio(inode->i_mapping, index, FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp); if (IS_ERR(folio)) - return PTR_ERR(folio); + return false; bh = folio_buffers(folio); if (bh) { if (bh->b_size == size) { end_block = folio_init_buffers(folio, bdev, (sector_t)index << sizebits, size); - goto done; + goto unlock; } + + /* Caller should retry if this call fails */ + end_block = ~0ULL; if (!try_to_free_buffers(folio)) - goto failed; + goto unlock; } - ret = -ENOMEM; bh = folio_alloc_buffers(folio, size, gfp | __GFP_ACCOUNT); if (!bh) - goto failed; + goto unlock; /* * Link the folio to the buffers and initialise them. Take the @@ -1069,20 +1072,19 @@ grow_dev_page(struct block_device *bdev, sector_t block, end_block = folio_init_buffers(folio, bdev, (sector_t)index << sizebits, size); spin_unlock(&inode->i_mapping->private_lock); -done: - ret = (block < end_block) ? 1 : -ENXIO; -failed: +unlock: folio_unlock(folio); folio_put(folio); - return ret; + return block < end_block; } /* - * Create buffers for the specified block device block's page. If - * that page was dirty, the buffers are set dirty also. + * Create buffers for the specified block device block's folio. If + * that folio was dirty, the buffers are set dirty also. Returns false + * if we've hit a permanent error. */ -static int -grow_buffers(struct block_device *bdev, sector_t block, int size, gfp_t gfp) +static bool grow_buffers(struct block_device *bdev, sector_t block, + unsigned size, gfp_t gfp) { pgoff_t index; int sizebits; @@ -1099,11 +1101,11 @@ grow_buffers(struct block_device *bdev, sector_t block, int size, gfp_t gfp) "device %pg\n", __func__, (unsigned long long)block, bdev); - return -EIO; + return false; } - /* Create a page with the proper size buffers.. */ - return grow_dev_page(bdev, block, index, size, sizebits, gfp); + /* Create a folio with the proper size buffers */ + return grow_dev_folio(bdev, block, index, size, sizebits, gfp); } static struct buffer_head * @@ -1124,14 +1126,12 @@ __getblk_slow(struct block_device *bdev, sector_t block, for (;;) { struct buffer_head *bh; - int ret; bh = __find_get_block(bdev, block, size); if (bh) return bh; - ret = grow_buffers(bdev, block, size, gfp); - if (ret < 0) + if (!grow_buffers(bdev, block, size, gfp)) return NULL; } } -- cgit 1.2.3-korg From 382497ada051a6fc79612aba5e30cdfa26364374 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 9 Nov 2023 21:06:03 +0000 Subject: buffer: calculate block number inside folio_init_buffers() The calculation of block from index doesn't work for devices with a block size larger than PAGE_SIZE as we end up shifting by a negative number. Instead, calculate the number of the first block from the folio's position in the block device. We no longer need to pass sizebits to grow_dev_folio(). Link: https://lkml.kernel.org/r/20231109210608.2252323-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Pankaj Raghav Cc: Hannes Reinecke Cc: Luis Chamberlain Cc: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/buffer.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'fs/buffer.c') diff --git a/fs/buffer.c b/fs/buffer.c index 8dad6c691e143a..44e0c0b7f71f48 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -995,11 +995,12 @@ static sector_t blkdev_max_block(struct block_device *bdev, unsigned int size) * Initialise the state of a blockdev folio's buffers. */ static sector_t folio_init_buffers(struct folio *folio, - struct block_device *bdev, sector_t block, int size) + struct block_device *bdev, unsigned size) { struct buffer_head *head = folio_buffers(folio); struct buffer_head *bh = head; bool uptodate = folio_test_uptodate(folio); + sector_t block = div_u64(folio_pos(folio), size); sector_t end_block = blkdev_max_block(bdev, size); do { @@ -1032,7 +1033,7 @@ static sector_t folio_init_buffers(struct folio *folio, * we succeeded, or the caller should retry. */ static bool grow_dev_folio(struct block_device *bdev, sector_t block, - pgoff_t index, unsigned size, int sizebits, gfp_t gfp) + pgoff_t index, unsigned size, gfp_t gfp) { struct inode *inode = bdev->bd_inode; struct folio *folio; @@ -1047,8 +1048,7 @@ static bool grow_dev_folio(struct block_device *bdev, sector_t block, bh = folio_buffers(folio); if (bh) { if (bh->b_size == size) { - end_block = folio_init_buffers(folio, bdev, - (sector_t)index << sizebits, size); + end_block = folio_init_buffers(folio, bdev, size); goto unlock; } @@ -1069,8 +1069,7 @@ static bool grow_dev_folio(struct block_device *bdev, sector_t block, */ spin_lock(&inode->i_mapping->private_lock); link_dev_buffers(folio, bh); - end_block = folio_init_buffers(folio, bdev, - (sector_t)index << sizebits, size); + end_block = folio_init_buffers(folio, bdev, size); spin_unlock(&inode->i_mapping->private_lock); unlock: folio_unlock(folio); @@ -1105,7 +1104,7 @@ static bool grow_buffers(struct block_device *bdev, sector_t block, } /* Create a folio with the proper size buffers */ - return grow_dev_folio(bdev, block, index, size, sizebits, gfp); + return grow_dev_folio(bdev, block, index, size, gfp); } static struct buffer_head * -- cgit 1.2.3-korg From 5f3bd90d9b98855c2e811aa3b4823d583b0020df Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 9 Nov 2023 21:06:04 +0000 Subject: buffer: fix grow_buffers() for block size > PAGE_SIZE We must not shift by a negative number so work in terms of a byte offset to avoid the awkward shift left-or-right-depending-on-sign option. This means we need to use check_mul_overflow() to ensure that a large block number does not result in a wrap. Link: https://lkml.kernel.org/r/20231109210608.2252323-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Nathan Chancellor Cc: Hannes Reinecke Cc: Luis Chamberlain Cc: Pankaj Raghav Cc: Ryusuke Konishi [nathan@kernel.org: add cast in grow_buffers() to avoid a multiplication libcall] Link: https://lkml.kernel.org/r/20231128-avoid-muloti4-grow_buffers-v1-1-bc3d0f0ec483@kernel.org Signed-off-by: Andrew Morton --- fs/buffer.c | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) (limited to 'fs/buffer.c') diff --git a/fs/buffer.c b/fs/buffer.c index 44e0c0b7f71f48..f765a95034cf13 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1085,26 +1085,21 @@ static bool grow_dev_folio(struct block_device *bdev, sector_t block, static bool grow_buffers(struct block_device *bdev, sector_t block, unsigned size, gfp_t gfp) { - pgoff_t index; - int sizebits; - - sizebits = PAGE_SHIFT - __ffs(size); - index = block >> sizebits; + loff_t pos; /* - * Check for a block which wants to lie outside our maximum possible - * pagecache index. (this comparison is done using sector_t types). + * Check for a block which lies outside our maximum possible + * pagecache index. */ - if (unlikely(index != block >> sizebits)) { - printk(KERN_ERR "%s: requested out-of-range block %llu for " - "device %pg\n", + if (check_mul_overflow(block, (sector_t)size, &pos) || pos > MAX_LFS_FILESIZE) { + printk(KERN_ERR "%s: requested out-of-range block %llu for device %pg\n", __func__, (unsigned long long)block, bdev); return false; } /* Create a folio with the proper size buffers */ - return grow_dev_folio(bdev, block, index, size, gfp); + return grow_dev_folio(bdev, block, pos / PAGE_SIZE, size, gfp); } static struct buffer_head * -- cgit 1.2.3-korg From 808441943f6b817f4836752c6e0d1c07507f375e Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 9 Nov 2023 21:06:05 +0000 Subject: buffer: cast block to loff_t before shifting it While sector_t is always defined as a u64 today, that hasn't always been the case and it might not always be the same size as loff_t in the future. Link: https://lkml.kernel.org/r/20231109210608.2252323-5-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Hannes Reinecke Cc: Luis Chamberlain Cc: Pankaj Raghav Cc: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/buffer.c') diff --git a/fs/buffer.c b/fs/buffer.c index f765a95034cf13..1662ddddfa2766 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2008,7 +2008,7 @@ static int iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh, const struct iomap *iomap) { - loff_t offset = block << inode->i_blkbits; + loff_t offset = (loff_t)block << inode->i_blkbits; bh->b_bdev = iomap->bdev; -- cgit 1.2.3-korg From 4b04646caed5449ca97b909bbadca0a7a2762159 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 9 Nov 2023 21:06:06 +0000 Subject: buffer: fix various functions for block size > PAGE_SIZE If i_blkbits is larger than PAGE_SHIFT, we shift by a negative number, which is undefined. It is safe to shift the block left as a block device must be smaller than MAX_LFS_FILESIZE, which is guaranteed to fit in loff_t. Link: https://lkml.kernel.org/r/20231109210608.2252323-6-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Pankaj Raghav Cc: Hannes Reinecke Cc: Luis Chamberlain Cc: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/buffer.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'fs/buffer.c') diff --git a/fs/buffer.c b/fs/buffer.c index 1662ddddfa2766..d7526432664353 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -199,7 +199,7 @@ __find_get_block_slow(struct block_device *bdev, sector_t block) int all_mapped = 1; static DEFINE_RATELIMIT_STATE(last_warned, HZ, 1); - index = block >> (PAGE_SHIFT - bd_inode->i_blkbits); + index = ((loff_t)block << bd_inode->i_blkbits) / PAGE_SIZE; folio = __filemap_get_folio(bd_mapping, index, FGP_ACCESSED, 0); if (IS_ERR(folio)) goto out; @@ -1693,13 +1693,13 @@ void clean_bdev_aliases(struct block_device *bdev, sector_t block, sector_t len) struct inode *bd_inode = bdev->bd_inode; struct address_space *bd_mapping = bd_inode->i_mapping; struct folio_batch fbatch; - pgoff_t index = block >> (PAGE_SHIFT - bd_inode->i_blkbits); + pgoff_t index = ((loff_t)block << bd_inode->i_blkbits) / PAGE_SIZE; pgoff_t end; int i, count; struct buffer_head *bh; struct buffer_head *head; - end = (block + len - 1) >> (PAGE_SHIFT - bd_inode->i_blkbits); + end = ((loff_t)(block + len - 1) << bd_inode->i_blkbits) / PAGE_SIZE; folio_batch_init(&fbatch); while (filemap_get_folios(bd_mapping, &index, end, &fbatch)) { count = folio_batch_count(&fbatch); @@ -2660,8 +2660,8 @@ int block_truncate_page(struct address_space *mapping, return 0; length = blocksize - length; - iblock = (sector_t)index << (PAGE_SHIFT - inode->i_blkbits); - + iblock = ((loff_t)index * PAGE_SIZE) >> inode->i_blkbits; + folio = filemap_grab_folio(mapping, index); if (IS_ERR(folio)) return PTR_ERR(folio); -- cgit 1.2.3-korg From b0619401b8cdafcf32ad352a8e9a225ab0b4b10d Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 9 Nov 2023 21:06:07 +0000 Subject: buffer: handle large folios in __block_write_begin_int() When __block_write_begin_int() was converted to support folios, we did not expect large folios to be passed to it. With the current work to support large block size storage devices, this will no longer be true so change the checks on 'from' and 'to' to be related to the size of the folio instead of PAGE_SIZE. Also remove an assumption that the block size is smaller than PAGE_SIZE. Link: https://lkml.kernel.org/r/20231109210608.2252323-7-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reported-by: Ryusuke Konishi Cc: Hannes Reinecke Cc: Luis Chamberlain Cc: Pankaj Raghav Signed-off-by: Andrew Morton --- fs/buffer.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) (limited to 'fs/buffer.c') diff --git a/fs/buffer.c b/fs/buffer.c index d7526432664353..9f47841501949a 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2075,27 +2075,24 @@ iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh, int __block_write_begin_int(struct folio *folio, loff_t pos, unsigned len, get_block_t *get_block, const struct iomap *iomap) { - unsigned from = pos & (PAGE_SIZE - 1); - unsigned to = from + len; + size_t from = offset_in_folio(folio, pos); + size_t to = from + len; struct inode *inode = folio->mapping->host; - unsigned block_start, block_end; + size_t block_start, block_end; sector_t block; int err = 0; - unsigned blocksize, bbits; + size_t blocksize; struct buffer_head *bh, *head, *wait[2], **wait_bh=wait; BUG_ON(!folio_test_locked(folio)); - BUG_ON(from > PAGE_SIZE); - BUG_ON(to > PAGE_SIZE); + BUG_ON(to > folio_size(folio)); BUG_ON(from > to); head = folio_create_buffers(folio, inode, 0); blocksize = head->b_size; - bbits = block_size_bits(blocksize); - - block = (sector_t)folio->index << (PAGE_SHIFT - bbits); + block = div_u64(folio_pos(folio), blocksize); - for(bh = head, block_start = 0; bh != head || !block_start; + for (bh = head, block_start = 0; bh != head || !block_start; block++, block_start=block_end, bh = bh->b_this_page) { block_end = block_start + blocksize; if (block_end <= from || block_start >= to) { -- cgit 1.2.3-korg From fa399c3112344fa420944e99cd529d679411ebe6 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 9 Nov 2023 21:06:08 +0000 Subject: buffer: fix more functions for block size > PAGE_SIZE Both __block_write_full_folio() and block_read_full_folio() assumed that block size <= PAGE_SIZE. Replace the shift with a divide, which is probably cheaper than first calculating the shift. That lets us remove block_size_bits() as these were the last callers. Link: https://lkml.kernel.org/r/20231109210608.2252323-8-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Hannes Reinecke Cc: Luis Chamberlain Cc: Pankaj Raghav Cc: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/buffer.c | 27 ++++++--------------------- 1 file changed, 6 insertions(+), 21 deletions(-) (limited to 'fs/buffer.c') diff --git a/fs/buffer.c b/fs/buffer.c index 9f47841501949a..3a8c8322ed2862 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1742,19 +1742,6 @@ void clean_bdev_aliases(struct block_device *bdev, sector_t block, sector_t len) } EXPORT_SYMBOL(clean_bdev_aliases); -/* - * Size is a power-of-two in the range 512..PAGE_SIZE, - * and the case we care about most is PAGE_SIZE. - * - * So this *could* possibly be written with those - * constraints in mind (relevant mostly if some - * architecture has a slow bit-scan instruction) - */ -static inline int block_size_bits(unsigned int blocksize) -{ - return ilog2(blocksize); -} - static struct buffer_head *folio_create_buffers(struct folio *folio, struct inode *inode, unsigned int b_state) @@ -1807,7 +1794,7 @@ int __block_write_full_folio(struct inode *inode, struct folio *folio, sector_t block; sector_t last_block; struct buffer_head *bh, *head; - unsigned int blocksize, bbits; + size_t blocksize; int nr_underway = 0; blk_opf_t write_flags = wbc_to_write_flags(wbc); @@ -1826,10 +1813,9 @@ int __block_write_full_folio(struct inode *inode, struct folio *folio, bh = head; blocksize = bh->b_size; - bbits = block_size_bits(blocksize); - block = (sector_t)folio->index << (PAGE_SHIFT - bbits); - last_block = (i_size_read(inode) - 1) >> bbits; + block = div_u64(folio_pos(folio), blocksize); + last_block = div_u64(i_size_read(inode) - 1, blocksize); /* * Get all the dirty buffers mapped to disk addresses and @@ -2355,7 +2341,7 @@ int block_read_full_folio(struct folio *folio, get_block_t *get_block) struct inode *inode = folio->mapping->host; sector_t iblock, lblock; struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE]; - unsigned int blocksize, bbits; + size_t blocksize; int nr, i; int fully_mapped = 1; bool page_error = false; @@ -2369,10 +2355,9 @@ int block_read_full_folio(struct folio *folio, get_block_t *get_block) head = folio_create_buffers(folio, inode, 0); blocksize = head->b_size; - bbits = block_size_bits(blocksize); - iblock = (sector_t)folio->index << (PAGE_SHIFT - bbits); - lblock = (limit+blocksize-1) >> bbits; + iblock = div_u64(folio_pos(folio), blocksize); + lblock = div_u64(limit + blocksize - 1, blocksize); bh = head; nr = 0; i = 0; -- cgit 1.2.3-korg From 17bf23a981be9c6629198a76940c777eb5c8c521 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 15 Dec 2023 20:02:44 +0000 Subject: fs: convert block_write_full_page to block_write_full_folio Convert the function to be compatible with writepage_t so that it can be passed to write_cache_pages() by blkdev. This removes a call to compound_head(). We can also remove the function export as both callers are built-in. Link: https://lkml.kernel.org/r/20231215200245.748418-14-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Reviewed-by: Jens Axboe Signed-off-by: Andrew Morton --- block/fops.c | 21 ++++++++++++++++++--- fs/buffer.c | 16 +++++++--------- fs/ext4/page-io.c | 2 +- fs/gfs2/aops.c | 4 ++-- fs/mpage.c | 2 +- fs/ntfs/aops.c | 4 ++-- fs/ocfs2/alloc.c | 2 +- fs/ocfs2/file.c | 2 +- include/linux/buffer_head.h | 4 ++-- 9 files changed, 35 insertions(+), 22 deletions(-) (limited to 'fs/buffer.c') diff --git a/block/fops.c b/block/fops.c index 0bdad1e8d51457..0cf8cf72cdfa10 100644 --- a/block/fops.c +++ b/block/fops.c @@ -410,9 +410,24 @@ static int blkdev_get_block(struct inode *inode, sector_t iblock, return 0; } -static int blkdev_writepage(struct page *page, struct writeback_control *wbc) +/* + * We cannot call mpage_writepages() as it does not take the buffer lock. + * We must use block_write_full_folio() directly which holds the buffer + * lock. The buffer lock provides the synchronisation with writeback + * that filesystems rely on when they use the blockdev's mapping. + */ +static int blkdev_writepages(struct address_space *mapping, + struct writeback_control *wbc) { - return block_write_full_page(page, blkdev_get_block, wbc); + struct blk_plug plug; + int err; + + blk_start_plug(&plug); + err = write_cache_pages(mapping, wbc, block_write_full_folio, + blkdev_get_block); + blk_finish_plug(&plug); + + return err; } static int blkdev_read_folio(struct file *file, struct folio *folio) @@ -449,7 +464,7 @@ const struct address_space_operations def_blk_aops = { .invalidate_folio = block_invalidate_folio, .read_folio = blkdev_read_folio, .readahead = blkdev_readahead, - .writepage = blkdev_writepage, + .writepages = blkdev_writepages, .write_begin = blkdev_write_begin, .write_end = blkdev_write_end, .migrate_folio = buffer_migrate_folio_norefs, diff --git a/fs/buffer.c b/fs/buffer.c index 3a8c8322ed2862..c838b4a3100921 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -372,7 +372,7 @@ static void end_buffer_async_read_io(struct buffer_head *bh, int uptodate) } /* - * Completion handler for block_write_full_page() - pages which are unlocked + * Completion handler for block_write_full_folio() - pages which are unlocked * during I/O, and which have PageWriteback cleared upon I/O completion. */ void end_buffer_async_write(struct buffer_head *bh, int uptodate) @@ -1771,18 +1771,18 @@ static struct buffer_head *folio_create_buffers(struct folio *folio, */ /* - * While block_write_full_page is writing back the dirty buffers under + * While block_write_full_folio is writing back the dirty buffers under * the page lock, whoever dirtied the buffers may decide to clean them * again at any time. We handle that by only looking at the buffer * state inside lock_buffer(). * - * If block_write_full_page() is called for regular writeback + * If block_write_full_folio() is called for regular writeback * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a * locked buffer. This only can happen if someone has written the buffer * directly, with submit_bh(). At the address_space level PageWriteback * prevents this contention from occurring. * - * If block_write_full_page() is called with wbc->sync_mode == + * If block_write_full_folio() is called with wbc->sync_mode == * WB_SYNC_ALL, the writes are posted using REQ_SYNC; this * causes the writes to be flagged as synchronous writes. */ @@ -1829,7 +1829,7 @@ int __block_write_full_folio(struct inode *inode, struct folio *folio, * truncate in progress. */ /* - * The buffer was zeroed by block_write_full_page() + * The buffer was zeroed by block_write_full_folio() */ clear_buffer_dirty(bh); set_buffer_uptodate(bh); @@ -2696,10 +2696,9 @@ EXPORT_SYMBOL(block_truncate_page); /* * The generic ->writepage function for buffer-backed address_spaces */ -int block_write_full_page(struct page *page, get_block_t *get_block, - struct writeback_control *wbc) +int block_write_full_folio(struct folio *folio, struct writeback_control *wbc, + void *get_block) { - struct folio *folio = page_folio(page); struct inode * const inode = folio->mapping->host; loff_t i_size = i_size_read(inode); @@ -2726,7 +2725,6 @@ int block_write_full_page(struct page *page, get_block_t *get_block, return __block_write_full_folio(inode, folio, get_block, wbc, end_buffer_async_write); } -EXPORT_SYMBOL(block_write_full_page); sector_t generic_block_bmap(struct address_space *mapping, sector_t block, get_block_t *get_block) diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index dfdd7e5cf03890..312bc68133574d 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -444,7 +444,7 @@ int ext4_bio_write_folio(struct ext4_io_submit *io, struct folio *folio, folio_clear_error(folio); /* - * Comments copied from block_write_full_page: + * Comments copied from block_write_full_folio: * * The folio straddles i_size. It must be zeroed out on each and every * writepage invocation because it may be mmapped. "A file is mapped diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 5cffb079b87cbf..f986cd032b7699 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -82,11 +82,11 @@ static int gfs2_get_block_noalloc(struct inode *inode, sector_t lblock, } /** - * gfs2_write_jdata_folio - gfs2 jdata-specific version of block_write_full_page + * gfs2_write_jdata_folio - gfs2 jdata-specific version of block_write_full_folio * @folio: The folio to write * @wbc: The writeback control * - * This is the same as calling block_write_full_page, but it also + * This is the same as calling block_write_full_folio, but it also * writes pages outside of i_size */ static int gfs2_write_jdata_folio(struct folio *folio, diff --git a/fs/mpage.c b/fs/mpage.c index d4963f3d8051c7..738882e0766d08 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -642,7 +642,7 @@ static int __mpage_writepage(struct folio *folio, struct writeback_control *wbc, /* * The caller has a ref on the inode, so *mapping is stable */ - ret = block_write_full_page(&folio->page, mpd->get_block, wbc); + ret = block_write_full_folio(folio, wbc, mpd->get_block); mapping_set_error(mapping, ret); out: mpd->bio = bio; diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c index 70479ce915e8f1..6c414957e2c24a 100644 --- a/fs/ntfs/aops.c +++ b/fs/ntfs/aops.c @@ -1304,7 +1304,7 @@ static int ntfs_write_mst_block(struct page *page, * page cleaned. The VM has already locked the page and marked it clean. * * For non-resident attributes, ntfs_writepage() writes the @page by calling - * the ntfs version of the generic block_write_full_page() function, + * the ntfs version of the generic block_write_full_folio() function, * ntfs_write_block(), which in turn if necessary creates and writes the * buffers associated with the page asynchronously. * @@ -1314,7 +1314,7 @@ static int ntfs_write_mst_block(struct page *page, * vfs inode dirty code path for the inode the mft record belongs to or via the * vm page dirty code path for the page the mft record is in. * - * Based on ntfs_read_folio() and fs/buffer.c::block_write_full_page(). + * Based on ntfs_read_folio() and fs/buffer.c::block_write_full_folio(). * * Return 0 on success and -errno on error. */ diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 91b32b2377acc9..ea9127ba320844 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -6934,7 +6934,7 @@ static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end, * nonzero data on subsequent file extends. * * We need to call this before i_size is updated on the inode because - * otherwise block_write_full_page() will skip writeout of pages past + * otherwise block_write_full_folio() will skip writeout of pages past * i_size. */ int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle, diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 94e2a12444420e..8b6d15010703b3 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -818,7 +818,7 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, /* * fs-writeback will release the dirty pages without page lock * whose offset are over inode size, the release happens at - * block_write_full_page(). + * block_write_full_folio(). */ i_size_write(inode, abs_to); inode->i_blocks = ocfs2_inode_sector_count(inode); diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 94f6161eb45eba..396b2adf24bf1a 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -252,8 +252,8 @@ void __bh_read_batch(int nr, struct buffer_head *bhs[], * address_spaces. */ void block_invalidate_folio(struct folio *folio, size_t offset, size_t length); -int block_write_full_page(struct page *page, get_block_t *get_block, - struct writeback_control *wbc); +int block_write_full_folio(struct folio *folio, struct writeback_control *wbc, + void *get_block); int __block_write_full_folio(struct inode *inode, struct folio *folio, get_block_t *get_block, struct writeback_control *wbc, bh_end_io_t *handler); -- cgit 1.2.3-korg From 14059f66a959c760467ea2041e165f412845bcb8 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 15 Dec 2023 20:02:45 +0000 Subject: fs: remove the bh_end_io argument from __block_write_full_folio All callers are passing end_buffer_async_write as this argument, so we can hardcode references to it within __block_write_full_folio(). That lets us make end_buffer_async_write() static. Link: https://lkml.kernel.org/r/20231215200245.748418-15-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Jens Axboe Reviewed-by: Christoph Hellwig Signed-off-by: Andrew Morton --- fs/buffer.c | 22 ++++++++++------------ fs/gfs2/aops.c | 2 +- include/linux/buffer_head.h | 4 +--- 3 files changed, 12 insertions(+), 16 deletions(-) (limited to 'fs/buffer.c') diff --git a/fs/buffer.c b/fs/buffer.c index c838b4a3100921..19548369bc6cab 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -372,10 +372,10 @@ static void end_buffer_async_read_io(struct buffer_head *bh, int uptodate) } /* - * Completion handler for block_write_full_folio() - pages which are unlocked - * during I/O, and which have PageWriteback cleared upon I/O completion. + * Completion handler for block_write_full_folio() - folios which are unlocked + * during I/O, and which have the writeback flag cleared upon I/O completion. */ -void end_buffer_async_write(struct buffer_head *bh, int uptodate) +static void end_buffer_async_write(struct buffer_head *bh, int uptodate) { unsigned long flags; struct buffer_head *first; @@ -415,7 +415,6 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate) spin_unlock_irqrestore(&first->b_uptodate_lock, flags); return; } -EXPORT_SYMBOL(end_buffer_async_write); /* * If a page's buffers are under async readin (end_buffer_async_read @@ -1787,8 +1786,7 @@ static struct buffer_head *folio_create_buffers(struct folio *folio, * causes the writes to be flagged as synchronous writes. */ int __block_write_full_folio(struct inode *inode, struct folio *folio, - get_block_t *get_block, struct writeback_control *wbc, - bh_end_io_t *handler) + get_block_t *get_block, struct writeback_control *wbc) { int err; sector_t block; @@ -1867,7 +1865,8 @@ int __block_write_full_folio(struct inode *inode, struct folio *folio, continue; } if (test_clear_buffer_dirty(bh)) { - mark_buffer_async_write_endio(bh, handler); + mark_buffer_async_write_endio(bh, + end_buffer_async_write); } else { unlock_buffer(bh); } @@ -1920,7 +1919,8 @@ int __block_write_full_folio(struct inode *inode, struct folio *folio, if (buffer_mapped(bh) && buffer_dirty(bh) && !buffer_delay(bh)) { lock_buffer(bh); - mark_buffer_async_write_endio(bh, handler); + mark_buffer_async_write_endio(bh, + end_buffer_async_write); } else { /* * The buffer may have been set dirty during @@ -2704,8 +2704,7 @@ int block_write_full_folio(struct folio *folio, struct writeback_control *wbc, /* Is the folio fully inside i_size? */ if (folio_pos(folio) + folio_size(folio) <= i_size) - return __block_write_full_folio(inode, folio, get_block, wbc, - end_buffer_async_write); + return __block_write_full_folio(inode, folio, get_block, wbc); /* Is the folio fully outside i_size? (truncate in progress) */ if (folio_pos(folio) >= i_size) { @@ -2722,8 +2721,7 @@ int block_write_full_folio(struct folio *folio, struct writeback_control *wbc, */ folio_zero_segment(folio, offset_in_folio(folio, i_size), folio_size(folio)); - return __block_write_full_folio(inode, folio, get_block, wbc, - end_buffer_async_write); + return __block_write_full_folio(inode, folio, get_block, wbc); } sector_t generic_block_bmap(struct address_space *mapping, sector_t block, diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index f986cd032b7699..9914d7f54f7d3e 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -108,7 +108,7 @@ static int gfs2_write_jdata_folio(struct folio *folio, folio_size(folio)); return __block_write_full_folio(inode, folio, gfs2_get_block_noalloc, - wbc, end_buffer_async_write); + wbc); } /** diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 396b2adf24bf1a..d78454a4dd1f0f 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -205,7 +205,6 @@ struct buffer_head *create_empty_buffers(struct folio *folio, unsigned long blocksize, unsigned long b_state); void end_buffer_read_sync(struct buffer_head *bh, int uptodate); void end_buffer_write_sync(struct buffer_head *bh, int uptodate); -void end_buffer_async_write(struct buffer_head *bh, int uptodate); /* Things to do with buffers at mapping->private_list */ void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode); @@ -255,8 +254,7 @@ void block_invalidate_folio(struct folio *folio, size_t offset, size_t length); int block_write_full_folio(struct folio *folio, struct writeback_control *wbc, void *get_block); int __block_write_full_folio(struct inode *inode, struct folio *folio, - get_block_t *get_block, struct writeback_control *wbc, - bh_end_io_t *handler); + get_block_t *get_block, struct writeback_control *wbc); int block_read_full_folio(struct folio *, get_block_t *); bool block_is_partially_uptodate(struct folio *, size_t from, size_t count); int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len, -- cgit 1.2.3-korg From bcd30d4cd937e8e15c3986358c5e601135475ce1 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 1 Jan 2024 09:38:48 +0000 Subject: buffer: fix unintended successful return If try_to_free_buffers() succeeded and then folio_alloc_buffers() failed, grow_dev_folio() would return success. This would be incorrect; memory allocation failure is supposed to result in a failure. It's a harmless bug; the caller will simply go around the loop one more time and grow_dev_folio() will correctly return a failure that time. But it was an unintended change and looks like a more serious bug than it is. While I'm in here, improve the commentary about why we return success even though we failed. Link: https://lkml.kernel.org/r/20240101093848.2017115-1-willy@infradead.org Fixes: 6d840a18773f ("buffer: return bool from grow_dev_folio()") Signed-off-by: Matthew Wilcox (Oracle) Reported-by: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/buffer.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'fs/buffer.c') diff --git a/fs/buffer.c b/fs/buffer.c index 19548369bc6cab..5c29850e4781a4 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1028,8 +1028,8 @@ static sector_t folio_init_buffers(struct folio *folio, * * This is used purely for blockdev mappings. * - * Returns false if we have a 'permanent' failure. Returns true if - * we succeeded, or the caller should retry. + * Returns false if we have a failure which cannot be cured by retrying + * without sleeping. Returns true if we succeeded, or the caller should retry. */ static bool grow_dev_folio(struct block_device *bdev, sector_t block, pgoff_t index, unsigned size, gfp_t gfp) @@ -1051,10 +1051,17 @@ static bool grow_dev_folio(struct block_device *bdev, sector_t block, goto unlock; } - /* Caller should retry if this call fails */ - end_block = ~0ULL; - if (!try_to_free_buffers(folio)) + /* + * Retrying may succeed; for example the folio may finish + * writeback, or buffers may be cleaned. This should not + * happen very often; maybe we have old buffers attached to + * this blockdev's page cache and we're trying to change + * the block size? + */ + if (!try_to_free_buffers(folio)) { + end_block = ~0ULL; goto unlock; + } } bh = folio_alloc_buffers(folio, size, gfp | __GFP_ACCOUNT); -- cgit 1.2.3-korg