 mm/internal.h |   3
 mm/madvise.c  | 110
 mm/pagewalk.c |  17
 3 files changed, 94 insertions(+), 36 deletions(-)
diff --git a/mm/internal.h b/mm/internal.h
index ba471b8f36fca9..2bad3971813b75 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1652,6 +1652,9 @@ static inline void accept_page(struct page *page)
int walk_page_range_mm_unsafe(struct mm_struct *mm, unsigned long start,
unsigned long end, const struct mm_walk_ops *ops,
void *private);
+int walk_page_range_vma_unsafe(struct vm_area_struct *vma, unsigned long start,
+ unsigned long end, const struct mm_walk_ops *ops,
+ void *private);
int walk_page_range_debug(struct mm_struct *mm, unsigned long start,
unsigned long end, const struct mm_walk_ops *ops,
pgd_t *pgd, void *private);
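The new walk_page_range_vma_unsafe() mirrors walk_page_range_mm_unsafe(): "unsafe" means the caller may pass ops that the checked walkers reject via check_ops_safe(), which in this series notably covers ops with an .install_pte callback (the guard-install ops below use the _unsafe walkers; the guard-remove ops, which lack .install_pte, use the checked one). A minimal sketch of such a caller follows; the callback and ops are illustrative only, not part of this patch, with the .install_pte signature mirrored from guard_install_set_pte() below:

	/* Illustrative only: mm-internal code that installs PTEs must use
	 * an _unsafe entry point, as the safe walkers reject .install_pte. */
	static int demo_install_pte(unsigned long addr, unsigned long next,
				    pte_t *ptep, struct mm_walk *walk)
	{
		/* Decide what to install at addr; returning 0 continues. */
		return 0;
	}

	static int demo_walk_vma(struct vm_area_struct *vma)
	{
		const struct mm_walk_ops ops = {
			.install_pte = demo_install_pte,
			.walk_lock = PGWALK_RDLOCK, /* mmap read lock held */
		};

		return walk_page_range_vma_unsafe(vma, vma->vm_start,
						  vma->vm_end, &ops, NULL);
	}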
diff --git a/mm/madvise.c b/mm/madvise.c
index 7ed5bedb8f8ef9..2a165e9beb5bb0 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -1122,18 +1122,17 @@ static int guard_install_set_pte(unsigned long addr, unsigned long next,
return 0;
}
-static const struct mm_walk_ops guard_install_walk_ops = {
- .pud_entry = guard_install_pud_entry,
- .pmd_entry = guard_install_pmd_entry,
- .pte_entry = guard_install_pte_entry,
- .install_pte = guard_install_set_pte,
- .walk_lock = PGWALK_RDLOCK,
-};
-
static long madvise_guard_install(struct madvise_behavior *madv_behavior)
{
struct vm_area_struct *vma = madv_behavior->vma;
struct madvise_behavior_range *range = &madv_behavior->range;
+ struct mm_walk_ops walk_ops = {
+ .pud_entry = guard_install_pud_entry,
+ .pmd_entry = guard_install_pmd_entry,
+ .pte_entry = guard_install_pte_entry,
+ .install_pte = guard_install_set_pte,
+ .walk_lock = get_walk_lock(madv_behavior->lock_mode),
+ };
long err;
int i;
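get_walk_lock() itself is not shown in this diff; a sketch of the assumed mapping from madvise lock mode to pagewalk lock annotation (the enum and constant names are taken from context, so treat them as assumptions):

	/* Sketch: PGWALK_VMA_RDLOCK_VERIFY has the walker assert that the
	 * VMA read lock is already held; PGWALK_RDLOCK covers the case
	 * where the mmap read lock is held instead. */
	static enum page_walk_lock get_walk_lock(enum madvise_lock_mode mode)
	{
		switch (mode) {
		case MADVISE_VMA_READ_LOCK:
			return PGWALK_VMA_RDLOCK_VERIFY;
		default:
			return PGWALK_RDLOCK;
		}
	}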
@@ -1150,8 +1149,14 @@ static long madvise_guard_install(struct madvise_behavior *madv_behavior)
/*
* If anonymous and we are establishing page tables the VMA ought to
* have an anon_vma associated with it.
+ *
+ * We will hold the mmap read lock if this is necessary; this is
+ * checked as part of the VMA lock logic.
*/
if (vma_is_anonymous(vma)) {
+ VM_WARN_ON_ONCE(!vma->anon_vma &&
+ madv_behavior->lock_mode != MADVISE_MMAP_READ_LOCK);
+
err = anon_vma_prepare(vma);
if (err)
return err;
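For context, anon_vma_prepare() (the inline from include/linux/rmap.h) only enters its mmap-lock-requiring slow path when vma->anon_vma is unset, which is precisely the case is_vma_lock_sufficient() below excludes from the VMA-lock path:

	static inline int anon_vma_prepare(struct vm_area_struct *vma)
	{
		if (likely(vma->anon_vma))
			return 0;

		return __anon_vma_prepare(vma);	/* needs the mmap lock */
	}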
@@ -1159,12 +1164,14 @@ static long madvise_guard_install(struct madvise_behavior *madv_behavior)
/*
* Optimistically try to install the guard marker pages first. If any
- * non-guard pages are encountered, give up and zap the range before
- * trying again.
+ * non-guard pages or THPs are encountered, give up and zap the range
+ * before trying again.
*
* We try a few times before giving up and releasing back to userland to
- * loop around, releasing locks in the process to avoid contention. This
- * would only happen if there was a great many racing page faults.
+ * loop around, releasing locks in the process to avoid contention.
+ *
+ * This would only happen due to races with e.g. page faults or
+ * khugepaged.
*
* In most cases we should simply install the guard markers immediately
* with no zap or looping.
@@ -1173,8 +1180,13 @@ static long madvise_guard_install(struct madvise_behavior *madv_behavior)
unsigned long nr_pages = 0;
/* Returns < 0 on error, == 0 if success, > 0 if zap needed. */
- err = walk_page_range_mm_unsafe(vma->vm_mm, range->start,
- range->end, &guard_install_walk_ops, &nr_pages);
+ if (madv_behavior->lock_mode == MADVISE_VMA_READ_LOCK)
+ err = walk_page_range_vma_unsafe(madv_behavior->vma,
+ range->start, range->end, &walk_ops,
+ &nr_pages);
+ else
+ err = walk_page_range_mm_unsafe(vma->vm_mm, range->start,
+ range->end, &walk_ops, &nr_pages);
if (err < 0)
return err;
@@ -1195,8 +1207,7 @@ static long madvise_guard_install(struct madvise_behavior *madv_behavior)
}
/*
- * We were unable to install the guard pages due to being raced by page
- * faults. This should not happen ordinarily. We return to userspace and
+ * We were unable to install the guard pages, so return to userspace and
* immediately retry, relieving lock contention.
*/
return restart_syscall();
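Condensed, the retry strategy described in the comments above has the following shape; this is a sketch only, with walk() standing in for the two walker calls above and MAX_MADVISE_GUARD_RETRIES assumed from context rather than shown in this diff:

	for (i = 0; i < MAX_MADVISE_GUARD_RETRIES; i++) {
		unsigned long nr_pages = 0;

		/* < 0 error, == 0 success, > 0 existing entries found. */
		err = walk(vma, range, &nr_pages);
		if (err <= 0)
			return err;

		/* Raced with e.g. page faults or khugepaged: zap, retry. */
		zap_page_range_single(vma, range->start,
				      range->end - range->start, NULL);
	}
	return restart_syscall();	/* transparently retry the syscall */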
@@ -1240,17 +1251,16 @@ static int guard_remove_pte_entry(pte_t *pte, unsigned long addr,
return 0;
}
-static const struct mm_walk_ops guard_remove_walk_ops = {
- .pud_entry = guard_remove_pud_entry,
- .pmd_entry = guard_remove_pmd_entry,
- .pte_entry = guard_remove_pte_entry,
- .walk_lock = PGWALK_RDLOCK,
-};
-
static long madvise_guard_remove(struct madvise_behavior *madv_behavior)
{
struct vm_area_struct *vma = madv_behavior->vma;
struct madvise_behavior_range *range = &madv_behavior->range;
+ struct mm_walk_ops walk_ops = {
+ .pud_entry = guard_remove_pud_entry,
+ .pmd_entry = guard_remove_pmd_entry,
+ .pte_entry = guard_remove_pte_entry,
+ .walk_lock = get_walk_lock(madv_behavior->lock_mode),
+ };
/*
* We're ok with removing guards in mlock()'d ranges, as this is a
@@ -1260,7 +1270,7 @@ static long madvise_guard_remove(struct madvise_behavior *madv_behavior)
return -EINVAL;
return walk_page_range_vma(vma, range->start, range->end,
- &guard_remove_walk_ops, NULL);
+ &walk_ops, NULL);
}
#ifdef CONFIG_64BIT
@@ -1573,6 +1583,47 @@ static bool process_madvise_remote_valid(int behavior)
}
}
+/* Does this operation invoke anon_vma_prepare()? */
+static bool prepares_anon_vma(int behavior)
+{
+ switch (behavior) {
+ case MADV_GUARD_INSTALL:
+ return true;
+ default:
+ return false;
+ }
+}
+
+/*
+ * We have acquired a VMA read lock; now that we have a VMA to examine,
+ * is it valid to madvise it under the VMA read lock alone?
+ */
+static bool is_vma_lock_sufficient(struct vm_area_struct *vma,
+ struct madvise_behavior *madv_behavior)
+{
+ /* Must span only a single VMA. */
+ if (madv_behavior->range.end > vma->vm_end)
+ return false;
+ /* Remote processes unsupported. */
+ if (current->mm != vma->vm_mm)
+ return false;
+ /* Userfaultfd unsupported. */
+ if (userfaultfd_armed(vma))
+ return false;
+ /*
+ * anon_vma_prepare() explicitly requires an mmap lock for
+ * serialisation, so we cannot use a VMA lock in this case.
+ *
+ * Note that we might race with anon_vma being set; however that only
+ * makes this check overly paranoid, which is safe.
+ */
+ if (vma_is_anonymous(vma) &&
+ prepares_anon_vma(madv_behavior->behavior) && !vma->anon_vma)
+ return false;
+
+ return true;
+}
+
/*
* Try to acquire a VMA read lock if possible.
*
@@ -1594,15 +1645,12 @@ static bool try_vma_read_lock(struct madvise_behavior *madv_behavior)
vma = lock_vma_under_rcu(mm, madv_behavior->range.start);
if (!vma)
goto take_mmap_read_lock;
- /*
- * Must span only a single VMA; uffd and remote processes are
- * unsupported.
- */
- if (madv_behavior->range.end > vma->vm_end || current->mm != mm ||
- userfaultfd_armed(vma)) {
+
+ if (!is_vma_lock_sufficient(vma, madv_behavior)) {
vma_end_read(vma);
goto take_mmap_read_lock;
}
+
madv_behavior->vma = vma;
return true;
@@ -1715,9 +1763,9 @@ static enum madvise_lock_mode get_lock_mode(struct madvise_behavior *madv_behavi
case MADV_POPULATE_READ:
case MADV_POPULATE_WRITE:
case MADV_COLLAPSE:
+ return MADVISE_MMAP_READ_LOCK;
case MADV_GUARD_INSTALL:
case MADV_GUARD_REMOVE:
- return MADVISE_MMAP_READ_LOCK;
case MADV_DONTNEED:
case MADV_DONTNEED_LOCKED:
case MADV_FREE:
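Note the get_lock_mode() hunk moves MADV_GUARD_INSTALL/MADV_GUARD_REMOVE out of the MADVISE_MMAP_READ_LOCK group and into the group below it; the diff context implies, though does not show, that this group returns MADVISE_VMA_READ_LOCK, giving roughly:

	case MADV_POPULATE_READ:
	case MADV_POPULATE_WRITE:
	case MADV_COLLAPSE:
		return MADVISE_MMAP_READ_LOCK;
	case MADV_GUARD_INSTALL:
	case MADV_GUARD_REMOVE:
	case MADV_DONTNEED:
	case MADV_DONTNEED_LOCKED:
	case MADV_FREE:
		return MADVISE_VMA_READ_LOCK;	/* inferred from context */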
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 30ea959bf38c4c..8a29b7237bc688 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -694,9 +694,8 @@ int walk_page_range_debug(struct mm_struct *mm, unsigned long start,
return walk_pgd_range(start, end, &walk);
}
-int walk_page_range_vma(struct vm_area_struct *vma, unsigned long start,
- unsigned long end, const struct mm_walk_ops *ops,
- void *private)
+int walk_page_range_vma_unsafe(struct vm_area_struct *vma, unsigned long start,
+ unsigned long end, const struct mm_walk_ops *ops, void *private)
{
struct mm_walk walk = {
.ops = ops,
@@ -709,14 +708,22 @@ int walk_page_range_vma(struct vm_area_struct *vma, unsigned long start,
return -EINVAL;
if (start < vma->vm_start || end > vma->vm_end)
return -EINVAL;
- if (!check_ops_safe(ops))
- return -EINVAL;
process_mm_walk_lock(walk.mm, ops->walk_lock);
process_vma_walk_lock(vma, ops->walk_lock);
return __walk_page_range(start, end, &walk);
}
+int walk_page_range_vma(struct vm_area_struct *vma, unsigned long start,
+ unsigned long end, const struct mm_walk_ops *ops,
+ void *private)
+{
+ if (!check_ops_safe(ops))
+ return -EINVAL;
+
+ return walk_page_range_vma_unsafe(vma, start, end, ops, private);
+}
+
int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops,
void *private)
{
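From userspace the change is transparent. A guard-region round trip that can now be served under the VMA read lock looks like this; the MADV_* values are the uapi ones, defined locally only in case of older headers:

	#include <stdlib.h>
	#include <sys/mman.h>
	#include <unistd.h>

	#ifndef MADV_GUARD_INSTALL
	#define MADV_GUARD_INSTALL 102
	#define MADV_GUARD_REMOVE  103
	#endif

	int main(void)
	{
		long psz = sysconf(_SC_PAGESIZE);
		char *buf = mmap(NULL, 4 * psz, PROT_READ | PROT_WRITE,
				 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

		if (buf == MAP_FAILED)
			return EXIT_FAILURE;

		/* Guard the second page; touching it now raises SIGSEGV. */
		if (madvise(buf + psz, psz, MADV_GUARD_INSTALL))
			return EXIT_FAILURE;

		/* Remove the guard markers; the page is usable again. */
		if (madvise(buf + psz, psz, MADV_GUARD_REMOVE))
			return EXIT_FAILURE;

		munmap(buf, 4 * psz);
		return EXIT_SUCCESS;
	}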