aboutsummaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
authorAlistair Popple <apopple@nvidia.com>2025-02-28 14:31:14 +1100
committerAndrew Morton <akpm@linux-foundation.org>2025-03-17 22:06:41 -0700
commit38607c62b34b46317c46d5baf1df03ac6e48a1c6 (patch)
tree8c0d14aa6284b39fee26c611469272e399b5e54c /mm
parent653d7825c149932f254e0cd22153ccc945e7e545 (diff)
downloadtip-38607c62b34b.tar.gz
fs/dax: properly refcount fs dax pages
Currently fs dax pages are considered free when the refcount drops to one and their refcounts are not increased when mapped via PTEs or decreased when unmapped. This requires special logic in mm paths to detect that these pages should not be properly refcounted, and to detect when the refcount drops to one instead of zero. On the other hand get_user_pages(), etc. will properly refcount fs dax pages by taking a reference and dropping it when the page is unpinned. Tracking this special behaviour requires extra PTE bits (eg. pte_devmap) and introduces rules that are potentially confusing and specific to FS DAX pages. To fix this, and to possibly allow removal of the special PTE bits in future, convert the fs dax page refcounts to be zero based and instead take a reference on the page each time it is mapped as is currently the case for normal pages. This may also allow a future clean-up to remove the pgmap refcounting that is currently done in mm/gup.c. Link: https://lkml.kernel.org/r/c7d886ad7468a20452ef6e0ddab6cfe220874e7c.1740713401.git-series.apopple@nvidia.com Signed-off-by: Alistair Popple <apopple@nvidia.com> Reviewed-by: Dan Williams <dan.j.williams@intel.com> Tested-by: Alison Schofield <alison.schofield@intel.com> Acked-by: David Hildenbrand <david@redhat.com> Cc: Alexander Gordeev <agordeev@linux.ibm.com> Cc: Asahi Lina <lina@asahilina.net> Cc: Balbir Singh <balbirs@nvidia.com> Cc: Bjorn Helgaas <bhelgaas@google.com> Cc: Catalin Marinas <catalin.marinas@arm.com> Cc: Christian Borntraeger <borntraeger@linux.ibm.com> Cc: Christoph Hellwig <hch@lst.de> Cc: Chunyan Zhang <zhang.lyra@gmail.com> Cc: "Darrick J. Wong" <djwong@kernel.org> Cc: Dave Chinner <david@fromorbit.com> Cc: Dave Hansen <dave.hansen@linux.intel.com> Cc: Dave Jiang <dave.jiang@intel.com> Cc: Gerald Schaefer <gerald.schaefer@linux.ibm.com> Cc: Heiko Carstens <hca@linux.ibm.com> Cc: Huacai Chen <chenhuacai@kernel.org> Cc: Ira Weiny <ira.weiny@intel.com> Cc: Jan Kara <jack@suse.cz> Cc: Jason Gunthorpe <jgg@nvidia.com> Cc: Jason Gunthorpe <jgg@ziepe.ca> Cc: John Hubbard <jhubbard@nvidia.com> Cc: linmiaohe <linmiaohe@huawei.com> Cc: Logan Gunthorpe <logang@deltatee.com> Cc: Matthew Wilcow (Oracle) <willy@infradead.org> Cc: Michael "Camp Drill Sergeant" Ellerman <mpe@ellerman.id.au> Cc: Nicholas Piggin <npiggin@gmail.com> Cc: Peter Xu <peterx@redhat.com> Cc: Sven Schnelle <svens@linux.ibm.com> Cc: Ted Ts'o <tytso@mit.edu> Cc: Vasily Gorbik <gor@linux.ibm.com> Cc: Vishal Verma <vishal.l.verma@intel.com> Cc: Vivek Goyal <vgoyal@redhat.com> Cc: WANG Xuerui <kernel@xen0n.name> Cc: Will Deacon <will@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Diffstat (limited to 'mm')
-rw-r--r--mm/gup.c9
-rw-r--r--mm/huge_memory.c6
-rw-r--r--mm/internal.h2
-rw-r--r--mm/memory-failure.c6
-rw-r--r--mm/memory.c6
-rw-r--r--mm/memremap.c47
-rw-r--r--mm/mm_init.c9
-rw-r--r--mm/swap.c2
8 files changed, 40 insertions, 47 deletions
diff --git a/mm/gup.c b/mm/gup.c
index e5d6454df41d74..e5040657870ea0 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -96,8 +96,7 @@ retry:
* belongs to this folio.
*/
if (unlikely(page_folio(page) != folio)) {
- if (!put_devmap_managed_folio_refs(folio, refs))
- folio_put_refs(folio, refs);
+ folio_put_refs(folio, refs);
goto retry;
}
@@ -116,8 +115,7 @@ static void gup_put_folio(struct folio *folio, int refs, unsigned int flags)
refs *= GUP_PIN_COUNTING_BIAS;
}
- if (!put_devmap_managed_folio_refs(folio, refs))
- folio_put_refs(folio, refs);
+ folio_put_refs(folio, refs);
}
/**
@@ -565,8 +563,7 @@ static struct folio *try_grab_folio_fast(struct page *page, int refs,
*/
if (unlikely((flags & FOLL_LONGTERM) &&
!folio_is_longterm_pinnable(folio))) {
- if (!put_devmap_managed_folio_refs(folio, refs))
- folio_put_refs(folio, refs);
+ folio_put_refs(folio, refs);
return NULL;
}
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index e6f4189c1c947e..9a15fd3453ff6b 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2225,7 +2225,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
tlb->fullmm);
arch_check_zapped_pmd(vma, orig_pmd);
tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
- if (vma_is_special_huge(vma)) {
+ if (!vma_is_dax(vma) && vma_is_special_huge(vma)) {
if (arch_needs_pgtable_deposit())
zap_deposited_table(tlb->mm, pmd);
spin_unlock(ptl);
@@ -2882,13 +2882,15 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
*/
if (arch_needs_pgtable_deposit())
zap_deposited_table(mm, pmd);
- if (vma_is_special_huge(vma))
+ if (!vma_is_dax(vma) && vma_is_special_huge(vma))
return;
if (unlikely(is_pmd_migration_entry(old_pmd))) {
swp_entry_t entry;
entry = pmd_to_swp_entry(old_pmd);
folio = pfn_swap_entry_folio(entry);
+ } else if (is_huge_zero_pmd(old_pmd)) {
+ return;
} else {
page = pmd_page(old_pmd);
folio = page_folio(page);
diff --git a/mm/internal.h b/mm/internal.h
index 70fa96e61c7680..aa30282a774ae2 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -737,8 +737,6 @@ static inline void prep_compound_tail(struct page *head, int tail_idx)
set_page_private(p, 0);
}
-extern void prep_compound_page(struct page *page, unsigned int order);
-
void post_alloc_hook(struct page *page, unsigned int order, gfp_t gfp_flags);
extern bool free_pages_prepare(struct page *page, unsigned int order);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 327e02fdc029da..6257c7f5e941d8 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -419,18 +419,18 @@ static unsigned long dev_pagemap_mapping_shift(struct vm_area_struct *vma,
pud = pud_offset(p4d, address);
if (!pud_present(*pud))
return 0;
- if (pud_devmap(*pud))
+ if (pud_trans_huge(*pud))
return PUD_SHIFT;
pmd = pmd_offset(pud, address);
if (!pmd_present(*pmd))
return 0;
- if (pmd_devmap(*pmd))
+ if (pmd_trans_huge(*pmd))
return PMD_SHIFT;
pte = pte_offset_map(pmd, address);
if (!pte)
return 0;
ptent = ptep_get(pte);
- if (pte_present(ptent) && pte_devmap(ptent))
+ if (pte_present(ptent))
ret = PAGE_SHIFT;
pte_unmap(pte);
return ret;
diff --git a/mm/memory.c b/mm/memory.c
index f09b4a1d09a8bf..8d1ea1dd6b526d 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3848,13 +3848,15 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) {
/*
* VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a
- * VM_PFNMAP VMA.
+ * VM_PFNMAP VMA. FS DAX also wants ops->pfn_mkwrite called.
*
* We should not cow pages in a shared writeable mapping.
* Just mark the pages writable and/or call ops->pfn_mkwrite.
*/
- if (!vmf->page)
+ if (!vmf->page || is_fsdax_page(vmf->page)) {
+ vmf->page = NULL;
return wp_pfn_shared(vmf);
+ }
return wp_page_shared(vmf, folio);
}
diff --git a/mm/memremap.c b/mm/memremap.c
index 68099af9df4cdf..9a8879bf1ae423 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -458,8 +458,13 @@ EXPORT_SYMBOL_GPL(get_dev_pagemap);
void free_zone_device_folio(struct folio *folio)
{
- if (WARN_ON_ONCE(!folio->pgmap->ops ||
- !folio->pgmap->ops->page_free))
+ struct dev_pagemap *pgmap = folio->pgmap;
+
+ if (WARN_ON_ONCE(!pgmap->ops))
+ return;
+
+ if (WARN_ON_ONCE(pgmap->type != MEMORY_DEVICE_FS_DAX &&
+ !pgmap->ops->page_free))
return;
mem_cgroup_uncharge(folio);
@@ -484,26 +489,36 @@ void free_zone_device_folio(struct folio *folio)
* For other types of ZONE_DEVICE pages, migration is either
* handled differently or not done at all, so there is no need
* to clear folio->mapping.
+ *
+ * FS DAX pages clear the mapping when the folio->share count hits
+ * zero which indicating the page has been removed from the file
+ * system mapping.
*/
- folio->mapping = NULL;
- folio->pgmap->ops->page_free(folio_page(folio, 0));
+ if (pgmap->type != MEMORY_DEVICE_FS_DAX)
+ folio->mapping = NULL;
- switch (folio->pgmap->type) {
+ switch (pgmap->type) {
case MEMORY_DEVICE_PRIVATE:
case MEMORY_DEVICE_COHERENT:
- put_dev_pagemap(folio->pgmap);
+ pgmap->ops->page_free(folio_page(folio, 0));
+ put_dev_pagemap(pgmap);
break;
- case MEMORY_DEVICE_FS_DAX:
case MEMORY_DEVICE_GENERIC:
/*
* Reset the refcount to 1 to prepare for handing out the page
* again.
*/
+ pgmap->ops->page_free(folio_page(folio, 0));
folio_set_count(folio, 1);
break;
+ case MEMORY_DEVICE_FS_DAX:
+ wake_up_var(&folio->page);
+ break;
+
case MEMORY_DEVICE_PCI_P2PDMA:
+ pgmap->ops->page_free(folio_page(folio, 0));
break;
}
}
@@ -519,21 +534,3 @@ void zone_device_page_init(struct page *page)
lock_page(page);
}
EXPORT_SYMBOL_GPL(zone_device_page_init);
-
-#ifdef CONFIG_FS_DAX
-bool __put_devmap_managed_folio_refs(struct folio *folio, int refs)
-{
- if (folio->pgmap->type != MEMORY_DEVICE_FS_DAX)
- return false;
-
- /*
- * fsdax page refcounts are 1-based, rather than 0-based: if
- * refcount is 1, then the page is free and the refcount is
- * stable because nobody holds a reference on the page.
- */
- if (folio_ref_sub_return(folio, refs) == 1)
- wake_up_var(&folio->_refcount);
- return true;
-}
-EXPORT_SYMBOL(__put_devmap_managed_folio_refs);
-#endif /* CONFIG_FS_DAX */
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 73e97ce95f58a6..133640a93d1da8 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -1026,23 +1026,22 @@ static void __ref __init_zone_device_page(struct page *page, unsigned long pfn,
}
/*
- * ZONE_DEVICE pages other than MEMORY_TYPE_GENERIC and
- * MEMORY_TYPE_FS_DAX pages are released directly to the driver page
- * allocator which will set the page count to 1 when allocating the
- * page.
+ * ZONE_DEVICE pages other than MEMORY_TYPE_GENERIC are released
+ * directly to the driver page allocator which will set the page count
+ * to 1 when allocating the page.
*
* MEMORY_TYPE_GENERIC and MEMORY_TYPE_FS_DAX pages automatically have
* their refcount reset to one whenever they are freed (ie. after
* their refcount drops to 0).
*/
switch (pgmap->type) {
+ case MEMORY_DEVICE_FS_DAX:
case MEMORY_DEVICE_PRIVATE:
case MEMORY_DEVICE_COHERENT:
case MEMORY_DEVICE_PCI_P2PDMA:
set_page_count(page, 0);
break;
- case MEMORY_DEVICE_FS_DAX:
case MEMORY_DEVICE_GENERIC:
break;
}
diff --git a/mm/swap.c b/mm/swap.c
index fc8281ef42415d..7523b65d8caa64 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -956,8 +956,6 @@ void folios_put_refs(struct folio_batch *folios, unsigned int *refs)
unlock_page_lruvec_irqrestore(lruvec, flags);
lruvec = NULL;
}
- if (put_devmap_managed_folio_refs(folio, nr_refs))
- continue;
if (folio_ref_sub_and_test(folio, nr_refs))
free_zone_device_folio(folio);
continue;