diff --git a/mm/madvise.c b/mm/madvise.c
index f543ef45f6a4..7d78d4b5fb18 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -48,38 +48,19 @@ struct madvise_walk_private {
 	bool pageout;
 };
 
+enum madvise_lock_mode {
+	MADVISE_NO_LOCK,
+	MADVISE_MMAP_READ_LOCK,
+	MADVISE_MMAP_WRITE_LOCK,
+	MADVISE_VMA_READ_LOCK,
+};
+
 struct madvise_behavior {
 	int behavior;
 	struct mmu_gather *tlb;
+	enum madvise_lock_mode lock_mode;
 };
 
-/*
- * Any behaviour which results in changes to the vma->vm_flags needs to
- * take mmap_lock for writing. Others, which simply traverse vmas, need
- * to only take it for reading.
- */
-static int madvise_need_mmap_write(int behavior)
-{
-	switch (behavior) {
-	case MADV_REMOVE:
-	case MADV_WILLNEED:
-	case MADV_DONTNEED:
-	case MADV_DONTNEED_LOCKED:
-	case MADV_COLD:
-	case MADV_PAGEOUT:
-	case MADV_FREE:
-	case MADV_POPULATE_READ:
-	case MADV_POPULATE_WRITE:
-	case MADV_COLLAPSE:
-	case MADV_GUARD_INSTALL:
-	case MADV_GUARD_REMOVE:
-		return 0;
-	default:
-		/* be safe, default to 1. list exceptions explicitly */
-		return 1;
-	}
-}
-
 #ifdef CONFIG_ANON_VMA_NAME
 struct anon_vma_name *anon_vma_name_alloc(const char *name)
 {
@@ -1339,6 +1320,8 @@ static int madvise_vma_behavior(struct vm_area_struct *vma,
 		return madvise_guard_remove(vma, prev, start, end);
 	}
 
+	/* We cannot provide prev in this lock mode. */
+	VM_WARN_ON_ONCE(arg->lock_mode == MADVISE_VMA_READ_LOCK);
 	anon_name = anon_vma_name(vma);
 	anon_vma_name_get(anon_name);
 	error = madvise_update_vma(vma, prev, start, end, new_flags,
@@ -1488,6 +1471,44 @@ static bool process_madvise_remote_valid(int behavior)
 	}
 }
 
+/*
+ * Try to acquire a VMA read lock if possible.
+ *
+ * We only support this lock over a single VMA, which the input range must
+ * span either partially or fully.
+ *
+ * This function always returns with an appropriate lock held. If a VMA read
+ * lock could be acquired, we return the locked VMA.
+ *
+ * If a VMA read lock could not be acquired, we return NULL and expect caller to
+ * fallback to mmap lock behaviour.
+ */
+static struct vm_area_struct *try_vma_read_lock(struct mm_struct *mm,
+		struct madvise_behavior *madv_behavior,
+		unsigned long start, unsigned long end)
+{
+	struct vm_area_struct *vma;
+
+	vma = lock_vma_under_rcu(mm, start);
+	if (!vma)
+		goto take_mmap_read_lock;
+	/*
+	 * Must span only a single VMA; uffd and remote processes are
+	 * unsupported.
+	 */
+	if (end > vma->vm_end || current->mm != mm ||
+	    userfaultfd_armed(vma)) {
+		vma_end_read(vma);
+		goto take_mmap_read_lock;
+	}
+	return vma;
+
+take_mmap_read_lock:
+	mmap_read_lock(mm);
+	madv_behavior->lock_mode = MADVISE_MMAP_READ_LOCK;
+	return NULL;
+}
+
 /*
  * Walk the vmas in range [start,end), and call the visit function on each one.
  * The visit function will get start and end parameters that cover the overlap
@@ -1498,7 +1519,8 @@ static bool process_madvise_remote_valid(int behavior)
  */
 static
 int madvise_walk_vmas(struct mm_struct *mm, unsigned long start,
-		unsigned long end, void *arg,
+		unsigned long end, struct madvise_behavior *madv_behavior,
+		void *arg,
 		int (*visit)(struct vm_area_struct *vma,
 			     struct vm_area_struct **prev, unsigned long start,
 			     unsigned long end, void *arg))
@@ -1507,6 +1529,21 @@ int madvise_walk_vmas(struct mm_struct *mm, unsigned long start,
 	struct vm_area_struct *prev;
 	unsigned long tmp;
 	int unmapped_error = 0;
+	int error;
+
+	/*
+	 * If VMA read lock is supported, apply madvise to a single VMA
+	 * tentatively, avoiding walking VMAs.
+	 */
+	if (madv_behavior && madv_behavior->lock_mode == MADVISE_VMA_READ_LOCK) {
+		vma = try_vma_read_lock(mm, madv_behavior, start, end);
+		if (vma) {
+			prev = vma;
+			error = visit(vma, &prev, start, end, arg);
+			vma_end_read(vma);
+			return error;
+		}
+	}
 
 	/*
 	 * If the interval [start,end) covers some unmapped address
@@ -1518,8 +1555,6 @@ int madvise_walk_vmas(struct mm_struct *mm, unsigned long start,
 		prev = vma;
 
 	for (;;) {
-		int error;
-
 		/* Still start < end. */
 		if (!vma)
 			return -ENOMEM;
@@ -1600,34 +1635,86 @@ int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
 	if (end == start)
 		return 0;
 
-	return madvise_walk_vmas(mm, start, end, anon_name,
+	return madvise_walk_vmas(mm, start, end, NULL, anon_name,
 				 madvise_vma_anon_name);
 }
 #endif /* CONFIG_ANON_VMA_NAME */
 
-static int madvise_lock(struct mm_struct *mm, int behavior)
-{
-	if (is_memory_failure(behavior))
-		return 0;
-
-	if (madvise_need_mmap_write(behavior)) {
+/*
+ * Any behaviour which results in changes to the vma->vm_flags needs to
+ * take mmap_lock for writing. Others, which simply traverse vmas, need
+ * to only take it for reading.
+ */
+static enum madvise_lock_mode get_lock_mode(struct madvise_behavior *madv_behavior)
+{
+	int behavior = madv_behavior->behavior;
+
+	if (is_memory_failure(behavior))
+		return MADVISE_NO_LOCK;
+
+	switch (behavior) {
+	case MADV_REMOVE:
+	case MADV_WILLNEED:
+	case MADV_COLD:
+	case MADV_PAGEOUT:
+	case MADV_FREE:
+	case MADV_POPULATE_READ:
+	case MADV_POPULATE_WRITE:
+	case MADV_COLLAPSE:
+	case MADV_GUARD_INSTALL:
+	case MADV_GUARD_REMOVE:
+		return MADVISE_MMAP_READ_LOCK;
+	case MADV_DONTNEED:
+	case MADV_DONTNEED_LOCKED:
+		return MADVISE_VMA_READ_LOCK;
+	default:
+		return MADVISE_MMAP_WRITE_LOCK;
+	}
+}
+
+static int madvise_lock(struct mm_struct *mm,
+		struct madvise_behavior *madv_behavior)
+{
+	enum madvise_lock_mode lock_mode = get_lock_mode(madv_behavior);
+
+	switch (lock_mode) {
+	case MADVISE_NO_LOCK:
+		break;
+	case MADVISE_MMAP_WRITE_LOCK:
 		if (mmap_write_lock_killable(mm))
 			return -EINTR;
-	} else {
+		break;
+	case MADVISE_MMAP_READ_LOCK:
 		mmap_read_lock(mm);
+		break;
+	case MADVISE_VMA_READ_LOCK:
+		/* We will acquire the lock per-VMA in madvise_walk_vmas(). */
+		break;
 	}
+
+	madv_behavior->lock_mode = lock_mode;
 	return 0;
 }
 
-static void madvise_unlock(struct mm_struct *mm, int behavior)
+static void madvise_unlock(struct mm_struct *mm,
+		struct madvise_behavior *madv_behavior)
 {
-	if (is_memory_failure(behavior))
+	switch (madv_behavior->lock_mode) {
+	case MADVISE_NO_LOCK:
 		return;
-
-	if (madvise_need_mmap_write(behavior))
+	case MADVISE_MMAP_WRITE_LOCK:
 		mmap_write_unlock(mm);
-	else
+		break;
+	case MADVISE_MMAP_READ_LOCK:
 		mmap_read_unlock(mm);
+		break;
+	case MADVISE_VMA_READ_LOCK:
+		/* We will drop the lock per-VMA in madvise_walk_vmas(). */
+		break;
+	}
+
+	madv_behavior->lock_mode = MADVISE_NO_LOCK;
 }
 
 static bool madvise_batch_tlb_flush(int behavior)
@@ -1712,6 +1799,21 @@ static bool is_madvise_populate(int behavior)
 	}
 }
 
+/*
+ * untagged_addr_remote() assumes mmap_lock is already held. On
+ * architectures like x86 and RISC-V, tagging is tricky because each
+ * mm may have a different tagging mask. However, we might only hold
+ * the per-VMA lock (currently only local processes are supported),
+ * so untagged_addr is used to avoid the mmap_lock assertion for
+ * local processes.
+ */
+static inline unsigned long get_untagged_addr(struct mm_struct *mm,
+		unsigned long start)
+{
+	return current->mm == mm ? untagged_addr(start) :
+				   untagged_addr_remote(mm, start);
+}
+
 static int madvise_do_behavior(struct mm_struct *mm,
 		unsigned long start, size_t len_in,
 		struct madvise_behavior *madv_behavior)
@@ -1723,7 +1825,7 @@ static int madvise_do_behavior(struct mm_struct *mm,
 
 	if (is_memory_failure(behavior))
 		return madvise_inject_error(behavior, start, start + len_in);
-	start = untagged_addr_remote(mm, start);
+	start = get_untagged_addr(mm, start);
 	end = start + PAGE_ALIGN(len_in);
 
 	blk_start_plug(&plug);
@@ -1731,7 +1833,7 @@ static int madvise_do_behavior(struct mm_struct *mm,
 	if (is_madvise_populate(behavior))
 		error = madvise_populate(mm, start, end, behavior);
 	else
 		error = madvise_walk_vmas(mm, start, end, madv_behavior,
-				madvise_vma_behavior);
+				madv_behavior, madvise_vma_behavior);
 	blk_finish_plug(&plug);
 	return error;
 }
@@ -1819,13 +1921,13 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh
 
 	if (madvise_should_skip(start, len_in, behavior, &error))
 		return error;
-	error = madvise_lock(mm, behavior);
+	error = madvise_lock(mm, &madv_behavior);
 	if (error)
 		return error;
 	madvise_init_tlb(&madv_behavior, mm);
 	error = madvise_do_behavior(mm, start, len_in, &madv_behavior);
 	madvise_finish_tlb(&madv_behavior);
-	madvise_unlock(mm, behavior);
+	madvise_unlock(mm, &madv_behavior);
 
 	return error;
 }
@@ -1849,7 +1951,7 @@ static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter,
 
 	total_len = iov_iter_count(iter);
 
-	ret = madvise_lock(mm, behavior);
+	ret = madvise_lock(mm, &madv_behavior);
 	if (ret)
 		return ret;
 	madvise_init_tlb(&madv_behavior, mm);
@@ -1882,8 +1984,8 @@ static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter,
 
 			/* Drop and reacquire lock to unwind race. */
 			madvise_finish_tlb(&madv_behavior);
-			madvise_unlock(mm, behavior);
-			ret = madvise_lock(mm, behavior);
+			madvise_unlock(mm, &madv_behavior);
+			ret = madvise_lock(mm, &madv_behavior);
 			if (ret)
 				goto out;
 			madvise_init_tlb(&madv_behavior, mm);
@@ -1894,7 +1996,7 @@ static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter,
 		iov_iter_advance(iter, iter_iov_len(iter));
 	}
 	madvise_finish_tlb(&madv_behavior);
-	madvise_unlock(mm, behavior);
+	madvise_unlock(mm, &madv_behavior);
 
 out:
 	ret = (total_len - iov_iter_count(iter)) ? : ret;
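
Not part of the patch: purely as an illustration of the fast path this change enables, a minimal userspace sketch of the case the new MADVISE_VMA_READ_LOCK mode targets -- a local MADV_DONTNEED over a range that sits inside a single, non-uffd-armed VMA. The file name and program are hypothetical; per try_vma_read_lock() above, remote callers (process_madvise()) and uffd-armed or multi-VMA ranges still fall back to mmap_read_lock().

/* madv_dontneed_example.c -- illustrative only, not part of the patch. */
#include <string.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 16 * 4096;
	/*
	 * One anonymous mapping is one VMA, so the whole range stays
	 * within a single VMA belonging to the current mm.
	 */
	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED)
		return 1;
	memset(p, 0xaa, len);
	/*
	 * Local mm, single VMA, no userfaultfd: with the patch applied,
	 * this MADV_DONTNEED can be served under the VMA read lock
	 * instead of mmap_lock.
	 */
	if (madvise(p, len, MADV_DONTNEED))
		return 1;
	munmap(p, len);
	return 0;
}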