mirror of https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git
synced 2025-08-16 17:48:47 +00:00
mm, swap: hold a reference during scan and cleanup flag usage
The flag SWP_SCANNING was used as an indicator of whether a device is being scanned for allocation, and prevents swapoff. Combined with SWP_WRITEOK, they work as a set of barriers for a clean swapoff:

1. Swapoff clears SWP_WRITEOK; allocation requests will see ~SWP_WRITEOK and abort, as this is serialized by si->lock.
2. Swapoff unuses all allocated entries.
3. Swapoff waits for the SWP_SCANNING flag to be cleared, so ongoing allocations will stop, preventing UAF.
4. Now swapoff can free everything safely.

This gives the allocation path a hard dependency on si->lock: allocations always have to acquire si->lock first to set SWP_SCANNING and check SWP_WRITEOK.

This commit removes the flag and instead uses the existing per-CPU refcount to prevent UAF in step 3. The refcount serves this purpose well without any dependency on si->lock, and scales very well too: just hold a reference during the whole scan and allocation process, and have swapoff kill and wait for the counter.

To prevent any allocation from happening after step 1, so that the unuse in step 2 can ensure all slots are free, swapoff acquires the ci->lock of each cluster one by one; this guarantees all allocations see ~SWP_WRITEOK and abort. With that, these dependencies on si->lock are gone.

It's worth noting the refcount cannot be killed as the first step of swapoff, because the unuse process itself has to acquire the refcount.

Link: https://lkml.kernel.org/r/20250113175732.48099-8-ryncsn@gmail.com
Signed-off-by: Kairui Song <kasong@tencent.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Barry Song <v-songbaohua@oppo.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: "Huang, Ying" <ying.huang@linux.alibaba.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Kalesh Singh <kaleshsingh@google.com>
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Yosry Ahmed <yosryahmed@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
This commit is contained in:
parent b228386cf2
commit 9a0ddeb798
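For illustration, here is a minimal sketch of the lifecycle pattern the message describes, built on the kernel's percpu_ref and completion APIs. The my_dev structure, the MY_WRITEOK flag, and all my_dev_* helpers are hypothetical names used only for this sketch; the real patch implements the same idea with si->users, si->comp, SWP_WRITEOK and get_swap_device_info()/put_swap_device(), and additionally publishes step 1 under each cluster's ci->lock (see the diff below):

#include <linux/percpu-refcount.h>
#include <linux/completion.h>
#include <linux/container_of.h>
#include <linux/gfp.h>

#define MY_WRITEOK	0x1UL		/* hypothetical stand-in for SWP_WRITEOK */

struct my_dev {
	struct percpu_ref users;	/* pinned across every scan/allocation */
	struct completion comp;		/* fires when the last reference drops */
	unsigned long flags;
};

/* Release callback: runs once the ref is killed and all users are gone. */
static void my_dev_release(struct percpu_ref *ref)
{
	struct my_dev *dev = container_of(ref, struct my_dev, users);

	complete(&dev->comp);
}

static int my_dev_init(struct my_dev *dev)
{
	dev->flags = MY_WRITEOK;
	init_completion(&dev->comp);
	return percpu_ref_init(&dev->users, my_dev_release, 0, GFP_KERNEL);
}

/* Allocation side: hold a reference for the whole scan, no big lock needed. */
static bool my_dev_get(struct my_dev *dev)
{
	/* Fails once teardown has called percpu_ref_kill(). */
	return percpu_ref_tryget_live(&dev->users);
}

static void my_dev_put(struct my_dev *dev)
{
	percpu_ref_put(&dev->users);
}

/* Teardown side, mirroring steps 1-4 of the barrier scheme above. */
static void my_dev_teardown(struct my_dev *dev)
{
	dev->flags &= ~MY_WRITEOK;	/* step 1: allocators see this and abort */

	/*
	 * Step 2 (unusing allocated entries) would run here; it may still
	 * take temporary references, which is why the refcount cannot be
	 * killed as the first step.
	 */

	percpu_ref_kill(&dev->users);	 /* step 3: refuse new references... */
	wait_for_completion(&dev->comp); /* ...and wait out in-flight scanners */

	/* step 4: all users are gone, safe to free everything. */
}

In the actual diff, get_swap_device_info() additionally issues an smp_rmb() after the tryget so the SWP_WRITEOK read cannot be reordered before the liveness check, and wait_for_allocation() takes and releases each cluster's ci->lock so that allocators already holding a reference still observe ~SWP_WRITEOK under their cluster lock.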
include/linux/swap.h
@@ -219,7 +219,6 @@ enum {
 	SWP_STABLE_WRITES = (1 << 11),	/* no overwrite PG_writeback pages */
 	SWP_SYNCHRONOUS_IO = (1 << 12),	/* synchronous IO is efficient */
 	/* add others here before... */
-	SWP_SCANNING	= (1 << 14),	/* refcount in scan_swap_map */
 };
 
 #define SWAP_CLUSTER_MAX 32UL
mm/swapfile.c
@@ -658,6 +658,8 @@ static bool cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster
 {
 	unsigned int nr_pages = 1 << order;
 
+	lockdep_assert_held(&ci->lock);
+
 	if (!(si->flags & SWP_WRITEOK))
 		return false;
 
@@ -1059,8 +1061,6 @@ static int cluster_alloc_swap(struct swap_info_struct *si,
 {
 	int n_ret = 0;
 
-	si->flags += SWP_SCANNING;
-
 	while (n_ret < nr) {
 		unsigned long offset = cluster_alloc_swap_entry(si, order, usage);
 
@@ -1069,8 +1069,6 @@ static int cluster_alloc_swap(struct swap_info_struct *si,
 		slots[n_ret++] = swp_entry(si->type, offset);
 	}
 
-	si->flags -= SWP_SCANNING;
-
 	return n_ret;
 }
 
@@ -1112,6 +1110,22 @@ static int scan_swap_map_slots(struct swap_info_struct *si,
 	return cluster_alloc_swap(si, usage, nr, slots, order);
 }
 
+static bool get_swap_device_info(struct swap_info_struct *si)
+{
+	if (!percpu_ref_tryget_live(&si->users))
+		return false;
+	/*
+	 * Guarantee the si->users are checked before accessing other
+	 * fields of swap_info_struct, and si->flags (SWP_WRITEOK) is
+	 * up to date.
+	 *
+	 * Paired with the spin_unlock() after setup_swap_info() in
+	 * enable_swap_info(), and smp_wmb() in swapoff.
+	 */
+	smp_rmb();
+	return true;
+}
+
 int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_order)
 {
 	int order = swap_entry_order(entry_order);
@@ -1139,13 +1153,16 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_order)
 		/* requeue si to after same-priority siblings */
 		plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]);
 		spin_unlock(&swap_avail_lock);
-		spin_lock(&si->lock);
-		n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,
-				n_goal, swp_entries, order);
-		spin_unlock(&si->lock);
-		if (n_ret || size > 1)
-			goto check_out;
-		cond_resched();
+		if (get_swap_device_info(si)) {
+			spin_lock(&si->lock);
+			n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,
+					n_goal, swp_entries, order);
+			spin_unlock(&si->lock);
+			put_swap_device(si);
+			if (n_ret || size > 1)
+				goto check_out;
+			cond_resched();
+		}
 
 		spin_lock(&swap_avail_lock);
 		/*
@@ -1296,16 +1313,8 @@ struct swap_info_struct *get_swap_device(swp_entry_t entry)
 	si = swp_swap_info(entry);
 	if (!si)
 		goto bad_nofile;
-	if (!percpu_ref_tryget_live(&si->users))
+	if (!get_swap_device_info(si))
 		goto out;
-	/*
-	 * Guarantee the si->users are checked before accessing other
-	 * fields of swap_info_struct.
-	 *
-	 * Paired with the spin_unlock() after setup_swap_info() in
-	 * enable_swap_info().
-	 */
-	smp_rmb();
 	offset = swp_offset(entry);
 	if (offset >= si->max)
 		goto put_out;
@@ -1785,10 +1794,13 @@ swp_entry_t get_swap_page_of_type(int type)
 		goto fail;
 
 	/* This is called for allocating swap entry, not cache */
-	spin_lock(&si->lock);
-	if ((si->flags & SWP_WRITEOK) && scan_swap_map_slots(si, 1, 1, &entry, 0))
-		atomic_long_dec(&nr_swap_pages);
-	spin_unlock(&si->lock);
+	if (get_swap_device_info(si)) {
+		spin_lock(&si->lock);
+		if ((si->flags & SWP_WRITEOK) && scan_swap_map_slots(si, 1, 1, &entry, 0))
+			atomic_long_dec(&nr_swap_pages);
+		spin_unlock(&si->lock);
+		put_swap_device(si);
+	}
 fail:
 	return entry;
 }
@@ -2562,6 +2574,25 @@ bool has_usable_swap(void)
 	return ret;
 }
 
+/*
+ * Called after clearing SWP_WRITEOK, ensures cluster_alloc_range
+ * sees the updated flags, so there will be no more allocations.
+ */
+static void wait_for_allocation(struct swap_info_struct *si)
+{
+	unsigned long offset;
+	unsigned long end = ALIGN(si->max, SWAPFILE_CLUSTER);
+	struct swap_cluster_info *ci;
+
+	BUG_ON(si->flags & SWP_WRITEOK);
+
+	for (offset = 0; offset < end; offset += SWAPFILE_CLUSTER) {
+		ci = lock_cluster(si, offset);
+		unlock_cluster(ci);
+	}
+}
+
 SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 {
 	struct swap_info_struct *p = NULL;
@@ -2632,6 +2663,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 	spin_unlock(&p->lock);
 	spin_unlock(&swap_lock);
 
+	wait_for_allocation(p);
+
 	disable_swap_slots_cache_lock();
 
 	set_current_oom_origin();
@@ -2674,15 +2707,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 	spin_lock(&p->lock);
 	drain_mmlist();
 
-	/* wait for anyone still in scan_swap_map_slots */
-	while (p->flags >= SWP_SCANNING) {
-		spin_unlock(&p->lock);
-		spin_unlock(&swap_lock);
-		schedule_timeout_uninterruptible(1);
-		spin_lock(&swap_lock);
-		spin_lock(&p->lock);
-	}
-
 	swap_file = p->swap_file;
 	p->swap_file = NULL;
 	p->max = 0;