xfs: cache open zone in inode->i_private

The MRU cache for open zones is unfortunately still not ideal, as it can
time out pretty easily when heavy I/O to hard disks uses up most or all
open zones.  One option would be to just increase the timeout, but while
looking into that I realized we're better off caching the open zone
indefinitely, as there is no real downside to that once we don't hold a
reference to the cached open zone.

So switch the open zone to RCU freeing, and then stash the last used
open zone in inode->i_private.  This significantly reduces fragmentation
for workloads that write to HDDs through many open files, by keeping
each file's I/O localized to its zone.

Fixes: 4e4d520755 ("xfs: add the zoned space allocator")
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Hans Holmberg <hans.holmberg@wdc.com>
Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Tested-by: Damien Le Moal <dlemoal@kernel.org>
Signed-off-by: Carlos Maiolino <cem@kernel.org>
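
The heart of the change is the lifetime protocol for the pointer cached in
i_private: lookups take a reference with atomic_inc_not_zero() under
rcu_read_lock(), updates publish the new zone with xchg() and drop the
reference pinned by the old pointer, and the final put defers the actual
free by an RCU grace period so concurrent lookups never touch freed memory.
A minimal kernel-style sketch of that pattern follows; struct cached_obj
and the helper names are illustrative stand-ins, not code from the patch:

	#include <linux/atomic.h>
	#include <linux/rcupdate.h>
	#include <linux/slab.h>

	/* Illustrative stand-in for struct xfs_open_zone. */
	struct cached_obj {
		atomic_t	ref;
		struct rcu_head	rcu;
	};

	static void cached_obj_free_rcu(struct rcu_head *head)
	{
		kfree(container_of(head, struct cached_obj, rcu));
	}

	static void cached_obj_put(struct cached_obj *obj)
	{
		/* Defer the actual free until current RCU readers are done. */
		if (atomic_dec_and_test(&obj->ref))
			call_rcu(&obj->rcu, cached_obj_free_rcu);
	}

	/* Lookup: only take a reference if the object is still live. */
	static struct cached_obj *cached_obj_get(struct cached_obj **slot)
	{
		struct cached_obj *obj;

		rcu_read_lock();
		obj = READ_ONCE(*slot);
		if (obj && !atomic_inc_not_zero(&obj->ref))
			obj = NULL;	/* lost the race against the final put */
		rcu_read_unlock();
		return obj;
	}

	/* Update: publish a new object, drop the reference the slot held. */
	static void cached_obj_set(struct cached_obj **slot,
			struct cached_obj *obj)
	{
		struct cached_obj *old;

		atomic_inc(&obj->ref);
		old = xchg(slot, obj);
		if (old)
			cached_obj_put(old);
	}

At teardown the patch additionally calls rcu_barrier() to wait for any
still-pending free callbacks, since those drop the rtgroup references that
must be released before the zone information is torn down.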

--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h

@@ -236,7 +236,6 @@ typedef struct xfs_mount {
 	bool			m_update_sb;	/* sb needs update in mount */
 	unsigned int		m_max_open_zones;
 	unsigned int		m_zonegc_low_space;
-	struct xfs_mru_cache	*m_zone_cache;	/* Inode to open zone cache */
 
 	/* max_atomic_write mount option value */
 	unsigned long long	m_awu_max_bytes;

--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c

@@ -786,6 +786,12 @@ xfs_fs_evict_inode(
 	truncate_inode_pages_final(&inode->i_data);
 	clear_inode(inode);
+
+	if (IS_ENABLED(CONFIG_XFS_RT) &&
+	    S_ISREG(inode->i_mode) && inode->i_private) {
+		xfs_open_zone_put(inode->i_private);
+		inode->i_private = NULL;
+	}
 }
 
 static void

--- a/fs/xfs/xfs_zone_alloc.c
+++ b/fs/xfs/xfs_zone_alloc.c

@@ -26,14 +26,22 @@
 #include "xfs_trace.h"
-#include "xfs_mru_cache.h"
+
+static void
+xfs_open_zone_free_rcu(
+	struct callback_head	*cb)
+{
+	struct xfs_open_zone	*oz = container_of(cb, typeof(*oz), oz_rcu);
+
+	xfs_rtgroup_rele(oz->oz_rtg);
+	kfree(oz);
+}
 
 void
 xfs_open_zone_put(
 	struct xfs_open_zone	*oz)
 {
-	if (atomic_dec_and_test(&oz->oz_ref)) {
-		xfs_rtgroup_rele(oz->oz_rtg);
-		kfree(oz);
-	}
+	if (atomic_dec_and_test(&oz->oz_ref))
+		call_rcu(&oz->oz_rcu, xfs_open_zone_free_rcu);
 }
 
 static inline uint32_t
@@ -756,98 +764,55 @@ xfs_mark_rtg_boundary(
 		ioend->io_flags |= IOMAP_IOEND_BOUNDARY;
 }
 
-/*
- * Cache the last zone written to for an inode so that it is considered first
- * for subsequent writes.
- */
-struct xfs_zone_cache_item {
-	struct xfs_mru_cache_elem	mru;
-	struct xfs_open_zone		*oz;
-};
-
-static inline struct xfs_zone_cache_item *
-xfs_zone_cache_item(struct xfs_mru_cache_elem *mru)
-{
-	return container_of(mru, struct xfs_zone_cache_item, mru);
-}
-
-static void
-xfs_zone_cache_free_func(
-	void				*data,
-	struct xfs_mru_cache_elem	*mru)
-{
-	struct xfs_zone_cache_item	*item = xfs_zone_cache_item(mru);
-
-	xfs_open_zone_put(item->oz);
-	kfree(item);
-}
-
 /*
  * Check if we have a cached last open zone available for the inode and
  * if yes return a reference to it.
  */
 static struct xfs_open_zone *
-xfs_cached_zone(
-	struct xfs_mount		*mp,
-	struct xfs_inode		*ip)
+xfs_get_cached_zone(
+	struct xfs_inode		*ip)
 {
-	struct xfs_mru_cache_elem	*mru;
-	struct xfs_open_zone		*oz;
+	struct xfs_open_zone		*oz;
 
-	mru = xfs_mru_cache_lookup(mp->m_zone_cache, ip->i_ino);
-	if (!mru)
-		return NULL;
-	oz = xfs_zone_cache_item(mru)->oz;
+	rcu_read_lock();
+	oz = VFS_I(ip)->i_private;
 	if (oz) {
 		/*
 		 * GC only steals open zones at mount time, so no GC zones
 		 * should end up in the cache.
 		 */
 		ASSERT(!oz->oz_is_gc);
-		ASSERT(atomic_read(&oz->oz_ref) > 0);
-		atomic_inc(&oz->oz_ref);
+		if (!atomic_inc_not_zero(&oz->oz_ref))
+			oz = NULL;
 	}
-	xfs_mru_cache_done(mp->m_zone_cache);
+	rcu_read_unlock();
 
 	return oz;
 }
 
 /*
- * Update the last used zone cache for a given inode.
+ * Stash our zone in the inode so that it is reused for future allocations.
  *
- * The caller must have a reference on the open zone.
+ * The open_zone structure will be pinned until either the inode is freed or
+ * until the cached open zone is replaced with a different one because the
+ * current one was full when we tried to use it.  This means we keep any
+ * open zone around forever as long as any inode that used it for the last
+ * write is cached, which slightly increases the memory use of cached inodes
+ * that were ever written to, but significantly simplifies the cached zone
+ * lookup.  Because the open_zone is clearly marked as full when all data
+ * in the underlying RTG was written, the caching is always safe.
  */
 static void
-xfs_zone_cache_create_association(
-	struct xfs_inode		*ip,
-	struct xfs_open_zone		*oz)
+xfs_set_cached_zone(
+	struct xfs_inode		*ip,
+	struct xfs_open_zone		*oz)
 {
-	struct xfs_mount		*mp = ip->i_mount;
-	struct xfs_zone_cache_item	*item = NULL;
-	struct xfs_mru_cache_elem	*mru;
+	struct xfs_open_zone		*old_oz;
 
 	ASSERT(atomic_read(&oz->oz_ref) > 0);
 	atomic_inc(&oz->oz_ref);
 
-	mru = xfs_mru_cache_lookup(mp->m_zone_cache, ip->i_ino);
-	if (mru) {
-		/*
-		 * If we have an association already, update it to point to the
-		 * new zone.
-		 */
-		item = xfs_zone_cache_item(mru);
-		xfs_open_zone_put(item->oz);
-		item->oz = oz;
-		xfs_mru_cache_done(mp->m_zone_cache);
-		return;
-	}
-
-	item = kmalloc(sizeof(*item), GFP_KERNEL);
-	if (!item) {
-		xfs_open_zone_put(oz);
-		return;
-	}
-	item->oz = oz;
-	xfs_mru_cache_insert(mp->m_zone_cache, ip->i_ino, &item->mru);
+	old_oz = xchg(&VFS_I(ip)->i_private, oz);
+	if (old_oz)
+		xfs_open_zone_put(old_oz);
 }
 
 static void
@@ -891,15 +856,14 @@ xfs_zone_alloc_and_submit(
 	 * the inode is still associated with a zone and use that if so.
 	 */
 	if (!*oz)
-		*oz = xfs_cached_zone(mp, ip);
+		*oz = xfs_get_cached_zone(ip);
 
 	if (!*oz) {
 select_zone:
 		*oz = xfs_select_zone(mp, write_hint, pack_tight);
 		if (!*oz)
 			goto out_error;
-
-		xfs_zone_cache_create_association(ip, *oz);
+		xfs_set_cached_zone(ip, *oz);
 	}
 
 	alloc_len = xfs_zone_alloc_blocks(*oz, XFS_B_TO_FSB(mp, ioend->io_size),
@@ -977,6 +941,12 @@ xfs_free_open_zones(
 		xfs_open_zone_put(oz);
 	}
 	spin_unlock(&zi->zi_open_zones_lock);
+
+	/*
+	 * Wait for all open zones to be freed so that they drop the group
+	 * references:
+	 */
+	rcu_barrier();
 }
 
 struct xfs_init_zones {
@@ -1290,14 +1260,6 @@ xfs_mount_zones(
 	error = xfs_zone_gc_mount(mp);
 	if (error)
 		goto out_free_zone_info;
 
-	/*
-	 * Set up a mru cache to track inode to open zone for data placement
-	 * purposes. The magic values for group count and life time is the
-	 * same as the defaults for file streams, which seems sane enough.
-	 */
-	xfs_mru_cache_create(&mp->m_zone_cache, mp,
-			5000, 10, xfs_zone_cache_free_func);
-
 	return 0;
 
 out_free_zone_info:
@@ -1311,5 +1273,4 @@ xfs_unmount_zones(
 {
 	xfs_zone_gc_unmount(mp);
 	xfs_free_zone_info(mp->m_zone_info);
-	xfs_mru_cache_destroy(mp->m_zone_cache);
 }

--- a/fs/xfs/xfs_zone_priv.h
+++ b/fs/xfs/xfs_zone_priv.h

@@ -44,6 +44,8 @@ struct xfs_open_zone {
 	 * the life time of an open zone.
 	 */
 	struct xfs_rtgroup	*oz_rtg;
+
+	struct rcu_head		oz_rcu;
 };
 
 /*