diff --git a/mm/swap.h b/mm/swap.h
index fe5c20922082..8d8efdf1297a 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -36,7 +36,7 @@ struct swap_cluster_info {
         u16 count;
         u8 flags;
         u8 order;
-        atomic_long_t *table;        /* Swap table entries, see mm/swap_table.h */
+        atomic_long_t __rcu *table;  /* Swap table entries, see mm/swap_table.h */
         struct list_head list;
 };
diff --git a/mm/swap_state.c b/mm/swap_state.c
index a1478cbff384..b13e9c4baa90 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -91,8 +91,8 @@ struct folio *swap_cache_get_folio(swp_entry_t entry)
         struct folio *folio;

         for (;;) {
-                swp_tb = __swap_table_get(__swap_entry_to_cluster(entry),
-                                          swp_cluster_offset(entry));
+                swp_tb = swap_table_get(__swap_entry_to_cluster(entry),
+                                        swp_cluster_offset(entry));
                 if (!swp_tb_is_folio(swp_tb))
                         return NULL;
                 folio = swp_tb_to_folio(swp_tb);
@@ -115,11 +115,10 @@ void *swap_cache_get_shadow(swp_entry_t entry)
 {
         unsigned long swp_tb;

-        swp_tb = __swap_table_get(__swap_entry_to_cluster(entry),
-                                  swp_cluster_offset(entry));
+        swp_tb = swap_table_get(__swap_entry_to_cluster(entry),
+                                swp_cluster_offset(entry));
         if (swp_tb_is_shadow(swp_tb))
                 return swp_tb_to_shadow(swp_tb);
-
         return NULL;
 }
diff --git a/mm/swap_table.h b/mm/swap_table.h
index e1f7cc009701..52254e455304 100644
--- a/mm/swap_table.h
+++ b/mm/swap_table.h
@@ -2,8 +2,15 @@
 #ifndef _MM_SWAP_TABLE_H
 #define _MM_SWAP_TABLE_H

+#include <linux/rcupdate.h>
+#include <linux/atomic.h>
 #include "swap.h"

+/* The swap table of a cluster: a flat array of swap table entries */
+struct swap_table {
+        atomic_long_t entries[SWAPFILE_CLUSTER];
+};
+
 /*
  * A swap table entry represents the status of a swap slot on a swap
  * (physical or virtual) device. The swap table in each cluster is a
@@ -76,22 +83,46 @@ static inline void *swp_tb_to_shadow(unsigned long swp_tb)
 static inline void __swap_table_set(struct swap_cluster_info *ci,
                                     unsigned int off, unsigned long swp_tb)
 {
+        atomic_long_t *table = rcu_dereference_protected(ci->table, true);
+
+        lockdep_assert_held(&ci->lock);
         VM_WARN_ON_ONCE(off >= SWAPFILE_CLUSTER);
-        atomic_long_set(&ci->table[off], swp_tb);
+        atomic_long_set(&table[off], swp_tb);
 }

 static inline unsigned long __swap_table_xchg(struct swap_cluster_info *ci,
                                               unsigned int off, unsigned long swp_tb)
 {
+        atomic_long_t *table = rcu_dereference_protected(ci->table, true);
+
+        lockdep_assert_held(&ci->lock);
         VM_WARN_ON_ONCE(off >= SWAPFILE_CLUSTER);
         /* Ordering is guaranteed by cluster lock, relax */
-        return atomic_long_xchg_relaxed(&ci->table[off], swp_tb);
+        return atomic_long_xchg_relaxed(&table[off], swp_tb);
 }

 static inline unsigned long __swap_table_get(struct swap_cluster_info *ci,
                                              unsigned int off)
 {
+        atomic_long_t *table;
+
         VM_WARN_ON_ONCE(off >= SWAPFILE_CLUSTER);
-        return atomic_long_read(&ci->table[off]);
+        table = rcu_dereference_check(ci->table, lockdep_is_held(&ci->lock));
+
+        return atomic_long_read(&table[off]);
+}
+
+static inline unsigned long swap_table_get(struct swap_cluster_info *ci,
+                                           unsigned int off)
+{
+        atomic_long_t *table;
+        unsigned long swp_tb;
+
+        rcu_read_lock();
+        table = rcu_dereference(ci->table);
+        swp_tb = table ? atomic_long_read(&table[off]) : null_to_swp_tb();
+        rcu_read_unlock();
+
+        return swp_tb;
 }

 #endif
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 314c5c10d3bd..094e3e75849f 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -59,6 +59,9 @@ static void swap_entries_free(struct swap_info_struct *si,
 static void swap_range_alloc(struct swap_info_struct *si,
                              unsigned int nr_entries);
 static bool folio_swapcache_freeable(struct folio *folio);
+static void move_cluster(struct swap_info_struct *si,
+                         struct swap_cluster_info *ci, struct list_head *list,
+                         enum swap_cluster_flags new_flags);

 static DEFINE_SPINLOCK(swap_lock);
 static unsigned int nr_swapfiles;
@@ -105,6 +108,8 @@ static DEFINE_SPINLOCK(swap_avail_lock);

 struct swap_info_struct *swap_info[MAX_SWAPFILES];

+static struct kmem_cache *swap_table_cachep;
+
 static DEFINE_MUTEX(swapon_mutex);

 static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait);
@@ -401,10 +406,17 @@ static inline bool cluster_is_discard(struct swap_cluster_info *info)
         return info->flags == CLUSTER_FLAG_DISCARD;
 }

+static inline bool cluster_table_is_alloced(struct swap_cluster_info *ci)
+{
+        return rcu_dereference_protected(ci->table, lockdep_is_held(&ci->lock));
+}
+
 static inline bool cluster_is_usable(struct swap_cluster_info *ci, int order)
 {
         if (unlikely(ci->flags > CLUSTER_FLAG_USABLE))
                 return false;
+        if (!cluster_table_is_alloced(ci))
+                return false;
         if (!order)
                 return true;
         return cluster_is_empty(ci) || order == ci->order;
@@ -422,32 +434,90 @@ static inline unsigned int cluster_offset(struct swap_info_struct *si,
         return cluster_index(si, ci) * SWAPFILE_CLUSTER;
 }

-static int swap_cluster_alloc_table(struct swap_cluster_info *ci)
-{
-        WARN_ON(ci->table);
-        ci->table = kzalloc(sizeof(unsigned long) * SWAPFILE_CLUSTER, GFP_KERNEL);
-        if (!ci->table)
-                return -ENOMEM;
-        return 0;
-}
-
 static void swap_cluster_free_table(struct swap_cluster_info *ci)
 {
         unsigned int ci_off;
-        unsigned long swp_tb;
+        struct swap_table *table;

-        if (!ci->table)
-                return;
+        /* Only an empty cluster's table is allowed to be freed */
+        lockdep_assert_held(&ci->lock);
+        VM_WARN_ON_ONCE(!cluster_is_empty(ci));
+        for (ci_off = 0; ci_off < SWAPFILE_CLUSTER; ci_off++)
+                VM_WARN_ON_ONCE(!swp_tb_is_null(__swap_table_get(ci, ci_off)));
+        table = (void *)rcu_dereference_protected(ci->table, true);
+        rcu_assign_pointer(ci->table, NULL);

-        for (ci_off = 0; ci_off < SWAPFILE_CLUSTER; ci_off++) {
-                swp_tb = __swap_table_get(ci, ci_off);
-                if (!swp_tb_is_null(swp_tb))
-                        pr_err_once("swap: unclean swap space on swapoff: 0x%lx",
-                                    swp_tb);
+        kmem_cache_free(swap_table_cachep, table);
+}
+
+/*
+ * Allocate the swap table for one cluster. Attempt an atomic allocation
+ * first, then fall back to a sleeping allocation.
+ */
+static struct swap_cluster_info *
+swap_cluster_alloc_table(struct swap_info_struct *si,
+                         struct swap_cluster_info *ci)
+{
+        struct swap_table *table;
+
+        /*
+         * Only cluster isolation from the allocator does table allocation.
+         * The swap allocator uses percpu clusters and holds the local lock.
+         */
+        lockdep_assert_held(&ci->lock);
+        lockdep_assert_held(&this_cpu_ptr(&percpu_swap_cluster)->lock);
+
+        /* The cluster must be free and was just isolated from the free list. */
+        VM_WARN_ON_ONCE(ci->flags || !cluster_is_empty(ci));
+
+        table = kmem_cache_zalloc(swap_table_cachep,
+                                  __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN);
+        if (table) {
+                rcu_assign_pointer(ci->table, table);
+                return ci;
         }
-        kfree(ci->table);
-        ci->table = NULL;
+
+        /*
+         * Try a sleeping allocation. Each isolated free cluster may cause
+         * at most one sleeping allocation, and there is only a limited
+         * number of isolated clusters, so the potential recursion is limited.
+         */
+        spin_unlock(&ci->lock);
+        if (!(si->flags & SWP_SOLIDSTATE))
+                spin_unlock(&si->global_cluster_lock);
+        local_unlock(&percpu_swap_cluster.lock);
+
+        table = kmem_cache_zalloc(swap_table_cachep,
+                                  __GFP_HIGH | __GFP_NOMEMALLOC | GFP_KERNEL);
+
+        /*
+         * Back in atomic context. We might have migrated to a new CPU with a
+         * usable percpu cluster, but just keep using the isolated cluster to
+         * make things easier. Migration indicates a slight change of workload,
+         * so using a new free cluster might not be a bad idea; the worst that
+         * ignoring the percpu cluster can cause is fragmentation, which is
+         * acceptable since this fallback and race is rare.
+         */
+        local_lock(&percpu_swap_cluster.lock);
+        if (!(si->flags & SWP_SOLIDSTATE))
+                spin_lock(&si->global_cluster_lock);
+        spin_lock(&ci->lock);
+
+        /* Nothing except this helper should touch a dangling empty cluster. */
+        if (WARN_ON_ONCE(cluster_table_is_alloced(ci))) {
+                if (table)
+                        kmem_cache_free(swap_table_cachep, table);
+                return ci;
+        }
+
+        if (!table) {
+                move_cluster(si, ci, &si->free_clusters, CLUSTER_FLAG_FREE);
+                spin_unlock(&ci->lock);
+                return NULL;
+        }
+
+        rcu_assign_pointer(ci->table, table);
+        return ci;
 }

 static void move_cluster(struct swap_info_struct *si,
@@ -479,7 +549,7 @@ static void swap_cluster_schedule_discard(struct swap_info_struct *si,

 static void __free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci)
 {
-        lockdep_assert_held(&ci->lock);
+        swap_cluster_free_table(ci);
         move_cluster(si, ci, &si->free_clusters, CLUSTER_FLAG_FREE);
         ci->order = 0;
 }
@@ -494,15 +564,11 @@ static void __free_cluster(struct swap_info_struct *si, struct swap_cluster_info
  * this returns NULL for an non-empty list.
  */
 static struct swap_cluster_info *isolate_lock_cluster(
-                struct swap_info_struct *si, struct list_head *list)
+                struct swap_info_struct *si, struct list_head *list, int order)
 {
-        struct swap_cluster_info *ci, *ret = NULL;
+        struct swap_cluster_info *ci, *found = NULL;

         spin_lock(&si->lock);
-
-        if (unlikely(!(si->flags & SWP_WRITEOK)))
-                goto out;
-
         list_for_each_entry(ci, list, list) {
                 if (!spin_trylock(&ci->lock))
                         continue;
@@ -514,13 +580,19 @@ static struct swap_cluster_info *isolate_lock_cluster(

                 list_del(&ci->list);
                 ci->flags = CLUSTER_FLAG_NONE;
-                ret = ci;
+                found = ci;
                 break;
         }
-out:
         spin_unlock(&si->lock);

-        return ret;
+        if (found && !cluster_table_is_alloced(found)) {
+                /* Only an empty free cluster's swap table can be freed. */
+                VM_WARN_ON_ONCE(list != &si->free_clusters);
+                VM_WARN_ON_ONCE(!cluster_is_empty(found));
+                return swap_cluster_alloc_table(si, found);
+        }
+
+        return found;
 }

 /*
@@ -653,17 +725,27 @@ static void relocate_cluster(struct swap_info_struct *si,
  * added to free cluster list and its usage counter will be increased by 1.
  * Only used for initialization.
  */
-static void inc_cluster_info_page(struct swap_info_struct *si,
+static int inc_cluster_info_page(struct swap_info_struct *si,
         struct swap_cluster_info *cluster_info, unsigned long page_nr)
 {
         unsigned long idx = page_nr / SWAPFILE_CLUSTER;
+        struct swap_table *table;
         struct swap_cluster_info *ci;

         ci = cluster_info + idx;
+        if (!ci->table) {
+                table = kmem_cache_zalloc(swap_table_cachep, GFP_KERNEL);
+                if (!table)
+                        return -ENOMEM;
+                rcu_assign_pointer(ci->table, table);
+        }
+
         ci->count++;

         VM_BUG_ON(ci->count > SWAPFILE_CLUSTER);
         VM_BUG_ON(ci->flags);
+
+        return 0;
 }

 static bool cluster_reclaim_range(struct swap_info_struct *si,
@@ -845,7 +927,7 @@ static unsigned int alloc_swap_scan_list(struct swap_info_struct *si,
         unsigned int found = SWAP_ENTRY_INVALID;

         do {
-                struct swap_cluster_info *ci = isolate_lock_cluster(si, list);
+                struct swap_cluster_info *ci = isolate_lock_cluster(si, list, order);
                 unsigned long offset;

                 if (!ci)
@@ -870,7 +952,7 @@ static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force)
         if (force)
                 to_scan = swap_usage_in_pages(si) / SWAPFILE_CLUSTER;

-        while ((ci = isolate_lock_cluster(si, &si->full_clusters))) {
+        while ((ci = isolate_lock_cluster(si, &si->full_clusters, 0))) {
                 offset = cluster_offset(si, ci);
                 end = min(si->max, offset + SWAPFILE_CLUSTER);
                 to_scan--;
@@ -1018,6 +1100,7 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o
 done:
         if (!(si->flags & SWP_SOLIDSTATE))
                 spin_unlock(&si->global_cluster_lock);
+
         return found;
 }

@@ -1885,7 +1968,13 @@ swp_entry_t get_swap_page_of_type(int type)
         /* This is called for allocating swap entry, not cache */
         if (get_swap_device_info(si)) {
                 if (si->flags & SWP_WRITEOK) {
+                        /*
+                         * Grab the local lock to be compliant with
+                         * swap table allocation.
+                         */
+                        local_lock(&percpu_swap_cluster.lock);
                         offset = cluster_alloc_swap_entry(si, 0, 1);
+                        local_unlock(&percpu_swap_cluster.lock);
                         if (offset) {
                                 entry = swp_entry(si->type, offset);
                                 atomic_long_dec(&nr_swap_pages);
@@ -2679,12 +2768,21 @@ static void wait_for_allocation(struct swap_info_struct *si)
 static void free_cluster_info(struct swap_cluster_info *cluster_info,
                               unsigned long maxpages)
 {
+        struct swap_cluster_info *ci;
         int i, nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);

         if (!cluster_info)
                 return;
-        for (i = 0; i < nr_clusters; i++)
-                swap_cluster_free_table(&cluster_info[i]);
+        for (i = 0; i < nr_clusters; i++) {
+                ci = cluster_info + i;
+                /* A cluster holding bad page marks still has its table allocated */
+                spin_lock(&ci->lock);
+                if (rcu_dereference_protected(ci->table, true)) {
+                        ci->count = 0;
+                        swap_cluster_free_table(ci);
+                }
+                spin_unlock(&ci->lock);
+        }
         kvfree(cluster_info);
 }

@@ -2720,6 +2818,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
         struct address_space *mapping;
         struct inode *inode;
         struct filename *pathname;
+        unsigned int maxpages;
         int err, found = 0;

         if (!capable(CAP_SYS_ADMIN))
@@ -2826,8 +2925,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
         p->swap_map = NULL;
         zeromap = p->zeromap;
         p->zeromap = NULL;
+        maxpages = p->max;
         cluster_info = p->cluster_info;
-        free_cluster_info(cluster_info, p->max);
         p->max = 0;
         p->cluster_info = NULL;
         spin_unlock(&p->lock);
@@ -2839,6 +2938,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
         p->global_cluster = NULL;
         vfree(swap_map);
         kvfree(zeromap);
+        free_cluster_info(cluster_info, maxpages);
         /* Destroy swap account information */
         swap_cgroup_swapoff(p->type);

@@ -3217,11 +3317,8 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si,
         if (!cluster_info)
                 goto err;

-        for (i = 0; i < nr_clusters; i++) {
+        for (i = 0; i < nr_clusters; i++)
                 spin_lock_init(&cluster_info[i].lock);
-                if (swap_cluster_alloc_table(&cluster_info[i]))
-                        goto err_free;
-        }

         if (!(si->flags & SWP_SOLIDSTATE)) {
                 si->global_cluster = kmalloc(sizeof(*si->global_cluster),
@@ -3240,16 +3337,23 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si,
          * See setup_swap_map(): header page, bad pages,
          * and the EOF part of the last cluster.
          */
-        inc_cluster_info_page(si, cluster_info, 0);
+        err = inc_cluster_info_page(si, cluster_info, 0);
+        if (err)
+                goto err;
         for (i = 0; i < swap_header->info.nr_badpages; i++) {
                 unsigned int page_nr = swap_header->info.badpages[i];

                 if (page_nr >= maxpages)
                         continue;
-                inc_cluster_info_page(si, cluster_info, page_nr);
+                err = inc_cluster_info_page(si, cluster_info, page_nr);
+                if (err)
+                        goto err;
+        }
+        for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++) {
+                err = inc_cluster_info_page(si, cluster_info, i);
+                if (err)
+                        goto err;
         }
-        for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++)
-                inc_cluster_info_page(si, cluster_info, i);

         INIT_LIST_HEAD(&si->free_clusters);
         INIT_LIST_HEAD(&si->full_clusters);
@@ -3963,6 +4067,15 @@ static int __init swapfile_init(void)

         swapfile_maximum_size = arch_max_swapfile_size();

+        /*
+         * Once a cluster is freed, its swap table content is read
+         * only, and all swap cache readers (swap_cache_*) verify
+         * the content before use. So it's safe to use an RCU slab here.
+         */
+        swap_table_cachep = kmem_cache_create("swap_table",
+                                              sizeof(struct swap_table),
+                                              0, SLAB_PANIC | SLAB_TYPESAFE_BY_RCU, NULL);
+
 #ifdef CONFIG_MIGRATION
         if (swapfile_maximum_size >= (1UL << SWP_MIG_TOTAL_BITS))
                 swap_migration_ad_supported = true;
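
The swapfile_init() hunk above is what makes the lockless swap_table_get() reader safe: swap tables come from a SLAB_TYPESAFE_BY_RCU cache, so a table observed under rcu_read_lock() may already have been freed and recycled, but its memory remains a valid struct swap_table, and readers are expected to re-validate whatever they loaded (as swap_cache_get_folio() does before trusting a folio). The fragment below is a minimal illustrative sketch of that reader pattern, not code from this series; my_table, my_cache, my_owner_table and my_table_read() are hypothetical names.

#include <linux/slab.h>
#include <linux/rcupdate.h>
#include <linux/atomic.h>

/*
 * Illustrative sketch only (not part of the patch): the reader-side pattern
 * that SLAB_TYPESAFE_BY_RCU enables. The object may be freed and recycled
 * while we hold rcu_read_lock(), so the value read here is only a hint and
 * must be re-checked under the owner's lock before it is acted upon.
 */
struct my_table {
        atomic_long_t entries[64];
};

static struct kmem_cache *my_cache;
static struct my_table __rcu *my_owner_table;

static int __init my_cache_init(void)
{
        /*
         * Freed objects may be reused for new my_table instances, but their
         * memory is never handed back to the page allocator while RCU
         * readers may still be dereferencing them.
         */
        my_cache = kmem_cache_create("my_table", sizeof(struct my_table),
                                     0, SLAB_TYPESAFE_BY_RCU, NULL);
        return my_cache ? 0 : -ENOMEM;
}

static unsigned long my_table_read(unsigned int off)
{
        struct my_table *t;
        unsigned long val = 0;

        rcu_read_lock();
        t = rcu_dereference(my_owner_table);
        if (t)
                val = atomic_long_read(&t->entries[off]);
        rcu_read_unlock();

        /*
         * 'val' may be stale or come from a recycled table; callers must
         * re-validate it (e.g. under the owner's lock) before relying on it.
         */
        return val;
}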