mm, swap: hold a reference during scan and cleanup flag usage

The SWP_SCANNING flag was used as an indicator of whether a device is
being scanned for allocation, and to prevent swapoff.  Combined with
SWP_WRITEOK, the two flags work as a set of barriers for a clean swapoff:

1. Swapoff clears SWP_WRITEOK; allocation requests will see
   ~SWP_WRITEOK and abort, as this check is serialized by si->lock.
2. Swapoff unuses all allocated entries.
3. Swapoff waits for the SWP_SCANNING flag to be cleared, so ongoing
   allocations will stop, preventing UAF.
4. Now swapoff can free everything safely.

This makes the allocation path hard-depend on si->lock: every
allocation has to acquire si->lock first, to set SWP_SCANNING and to
check SWP_WRITEOK.

This commit removes the flag, and instead uses the existing per-CPU
refcount to prevent UAF in step 3.  The refcount serves this purpose
well without any dependency on si->lock, and it scales very well too:
just hold a reference during the whole scan and allocation process.
Swapoff kills the counter and waits for all holders to drop their
references.
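
On the allocation side this is the standard percpu_ref tryget/put
lifecycle.  A minimal sketch of that side, assuming a hypothetical
helper name swap_alloc_pinned() (percpu_ref_tryget_live(),
percpu_ref_put() and the si->users field are the real kernel API; the
actual allocation logic is elided):

	/* Sketch only: pin si->users across the whole scan. */
	static bool swap_alloc_pinned(struct swap_info_struct *si)
	{
		/* Fails once swapoff has called percpu_ref_kill(&si->users). */
		if (!percpu_ref_tryget_live(&si->users))
			return false;

		/* ... scan clusters and allocate entries; si stays valid ... */

		percpu_ref_put(&si->users);	/* drop the pin once done */
		return true;
	}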

And to prevent any allocation from happening after step 1, so that the
unuse in step 2 can ensure all slots are freed, swapoff acquires and
releases the ci->lock of each cluster one by one (the new
wait_for_allocation() helper in the diff below), ensuring all
allocations see ~SWP_WRITEOK and abort.

This way these dependencies on si->lock are gone.  Worth noting, we
can't kill the refcount as the first step of swapoff, since the unuse
process has to acquire the refcount itself.
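
Condensed, the resulting swapoff ordering looks roughly like this (a
sketch only, with locking, error handling and the slot-cache handling
elided; wait_for_allocation() is the helper added by this patch, the
other calls are the existing swapoff code, with p the swap_info_struct
being disabled):

	p->flags &= ~SWP_WRITEOK;	/* step 1: refuse new allocations */
	wait_for_allocation(p);		/* cycle each ci->lock so in-flight
					 * allocations see ~SWP_WRITEOK */
	err = try_to_unuse(p->type);	/* step 2: needs the refcount alive,
					 * so the kill must come after */
	percpu_ref_kill(&p->users);	/* step 3: forbid new references */
	wait_for_completion(&p->comp);	/* ... and wait for existing ones */
	/* step 4: free everything safely */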

Link: https://lkml.kernel.org/r/20250113175732.48099-8-ryncsn@gmail.com
Signed-off-by: Kairui Song <kasong@tencent.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Barry Song <v-songbaohua@oppo.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: "Huang, Ying" <ying.huang@linux.alibaba.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Kalesh Singh <kaleshsingh@google.com>
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Yosry Ahmed <yosryahmed@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -219,7 +219,6 @@ enum {
 	SWP_STABLE_WRITES = (1 << 11),	/* no overwrite PG_writeback pages */
 	SWP_SYNCHRONOUS_IO = (1 << 12),	/* synchronous IO is efficient */
 					/* add others here before... */
-	SWP_SCANNING	= (1 << 14),	/* refcount in scan_swap_map */
 };
 
 #define SWAP_CLUSTER_MAX 32UL

--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -658,6 +658,8 @@ static bool cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster
 {
 	unsigned int nr_pages = 1 << order;
 
+	lockdep_assert_held(&ci->lock);
+
 	if (!(si->flags & SWP_WRITEOK))
 		return false;
 
@@ -1059,8 +1061,6 @@ static int cluster_alloc_swap(struct swap_info_struct *si,
 {
 	int n_ret = 0;
 
-	si->flags += SWP_SCANNING;
-
 	while (n_ret < nr) {
 		unsigned long offset = cluster_alloc_swap_entry(si, order, usage);
 
@@ -1069,8 +1069,6 @@ static int cluster_alloc_swap(struct swap_info_struct *si,
 		slots[n_ret++] = swp_entry(si->type, offset);
 	}
 
-	si->flags -= SWP_SCANNING;
-
 	return n_ret;
 }
@@ -1112,6 +1110,22 @@ static int scan_swap_map_slots(struct swap_info_struct *si,
 	return cluster_alloc_swap(si, usage, nr, slots, order);
 }
 
+static bool get_swap_device_info(struct swap_info_struct *si)
+{
+	if (!percpu_ref_tryget_live(&si->users))
+		return false;
+	/*
+	 * Guarantee that si->users is checked before accessing other
+	 * fields of swap_info_struct, and that si->flags (SWP_WRITEOK)
+	 * is up to date.
+	 *
+	 * Paired with the spin_unlock() after setup_swap_info() in
+	 * enable_swap_info(), and smp_wmb() in swapoff.
+	 */
+	smp_rmb();
+	return true;
+}
+
 int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_order)
 {
 	int order = swap_entry_order(entry_order);
@@ -1139,13 +1153,16 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_order)
 		/* requeue si to after same-priority siblings */
 		plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]);
 		spin_unlock(&swap_avail_lock);
-		spin_lock(&si->lock);
-		n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,
-				n_goal, swp_entries, order);
-		spin_unlock(&si->lock);
-		if (n_ret || size > 1)
-			goto check_out;
-		cond_resched();
+		if (get_swap_device_info(si)) {
+			spin_lock(&si->lock);
+			n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,
+					n_goal, swp_entries, order);
+			spin_unlock(&si->lock);
+			put_swap_device(si);
+			if (n_ret || size > 1)
+				goto check_out;
+			cond_resched();
+		}
 
 		spin_lock(&swap_avail_lock);
 		/*
@@ -1296,16 +1313,8 @@ struct swap_info_struct *get_swap_device(swp_entry_t entry)
 	si = swp_swap_info(entry);
 	if (!si)
 		goto bad_nofile;
-	if (!percpu_ref_tryget_live(&si->users))
+	if (!get_swap_device_info(si))
 		goto out;
-	/*
-	 * Guarantee the si->users are checked before accessing other
-	 * fields of swap_info_struct.
-	 *
-	 * Paired with the spin_unlock() after setup_swap_info() in
-	 * enable_swap_info().
-	 */
-	smp_rmb();
 	offset = swp_offset(entry);
 	if (offset >= si->max)
 		goto put_out;
@@ -1785,10 +1794,13 @@ swp_entry_t get_swap_page_of_type(int type)
 		goto fail;
 
 	/* This is called for allocating swap entry, not cache */
-	spin_lock(&si->lock);
-	if ((si->flags & SWP_WRITEOK) && scan_swap_map_slots(si, 1, 1, &entry, 0))
-		atomic_long_dec(&nr_swap_pages);
-	spin_unlock(&si->lock);
+	if (get_swap_device_info(si)) {
+		spin_lock(&si->lock);
+		if ((si->flags & SWP_WRITEOK) && scan_swap_map_slots(si, 1, 1, &entry, 0))
+			atomic_long_dec(&nr_swap_pages);
+		spin_unlock(&si->lock);
+		put_swap_device(si);
+	}
 fail:
 	return entry;
 }
@@ -2562,6 +2574,25 @@ bool has_usable_swap(void)
 	return ret;
 }
 
+/*
+ * Called after clearing SWP_WRITEOK; taking and dropping every
+ * cluster lock ensures cluster_alloc_range() sees the updated
+ * flags, so there will be no more allocations.
+ */
+static void wait_for_allocation(struct swap_info_struct *si)
+{
+	unsigned long offset;
+	unsigned long end = ALIGN(si->max, SWAPFILE_CLUSTER);
+	struct swap_cluster_info *ci;
+
+	BUG_ON(si->flags & SWP_WRITEOK);
+
+	for (offset = 0; offset < end; offset += SWAPFILE_CLUSTER) {
+		ci = lock_cluster(si, offset);
+		unlock_cluster(ci);
+	}
+}
+
 SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 {
 	struct swap_info_struct *p = NULL;
@@ -2632,6 +2663,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 	spin_unlock(&p->lock);
 	spin_unlock(&swap_lock);
 
+	wait_for_allocation(p);
+
 	disable_swap_slots_cache_lock();
 	set_current_oom_origin();
@@ -2674,15 +2707,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 	spin_lock(&p->lock);
 	drain_mmlist();
 
-	/* wait for anyone still in scan_swap_map_slots */
-	while (p->flags >= SWP_SCANNING) {
-		spin_unlock(&p->lock);
-		spin_unlock(&swap_lock);
-		schedule_timeout_uninterruptible(1);
-		spin_lock(&swap_lock);
-		spin_lock(&p->lock);
-	}
-
 	swap_file = p->swap_file;
 	p->swap_file = NULL;
 	p->max = 0;