Merge branch 'xfs-6.15-merge' into for-next

XFS code for 6.15 to be merged into linux-next

Signed-off-by: Carlos Maiolino <cem@kernel.org>
Carlos Maiolino committed 2025-03-10 10:35:39 +01:00
92 changed files with 6404 additions and 1400 deletions

Documentation/filesystems/iomap/design.rst

@@ -246,6 +246,10 @@ The fields are as follows:
* **IOMAP_F_PRIVATE**: Starting with this value, the upper bits can
be set by the filesystem for its own purposes.
* **IOMAP_F_ANON_WRITE**: Indicates that (write) I/O does not have a target
block assigned to it yet and the file system will do that in the bio
submission handler, splitting the I/O as needed.
These flags can be set by iomap itself during file operations.
The filesystem should supply an ``->iomap_end`` function if it needs
to observe these flags:
@@ -352,6 +356,11 @@ operations:
``IOMAP_NOWAIT`` is often set on behalf of ``IOCB_NOWAIT`` or
``RWF_NOWAIT``.
* ``IOMAP_DONTCACHE`` is set when the caller wishes to perform a
buffered file I/O and would like the kernel to drop the pagecache
after the I/O completes, if it isn't already being used by another
thread.
If it is necessary to read existing file contents from a `different
<https://lore.kernel.org/all/20191008071527.29304-9-hch@lst.de/>`_
device or address range on a device, the filesystem should return that
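
Taken together, the two flags added above can be illustrated with a hypothetical ``->iomap_begin``; this is a sketch only, not code from this series, and the choice of ``IOMAP_NULL_ADDR`` for a not-yet-assigned block is an assumption of the example:

static int example_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
		unsigned flags, struct iomap *iomap, struct iomap *srcmap)
{
	/*
	 * IOMAP_DONTCACHE needs no work here: iomap itself passes
	 * FGP_DONTCACHE when grabbing folios, so the pagecache is dropped
	 * once the buffered I/O completes.
	 */

	iomap->type = IOMAP_MAPPED;
	iomap->offset = pos;
	iomap->length = length;
	iomap->addr = IOMAP_NULL_ADDR;		/* no target block yet (assumed) */
	iomap->flags = IOMAP_F_ANON_WRITE;	/* block picked at bio submission */
	iomap->bdev = inode->i_sb->s_bdev;
	return 0;
}

Note that the generic code refuses to submit bios for such anonymous mappings on its own: the buffered writeback path warns unless the filesystem supplies ``->submit_ioend``, and the direct I/O path warns unless ``->submit_io`` is provided (see the WARN_ON_ONCE hunks in fs/iomap/buffered-io.c and fs/iomap/direct-io.c below).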

Documentation/filesystems/iomap/operations.rst

@@ -131,6 +131,8 @@ These ``struct kiocb`` flags are significant for buffered I/O with iomap:
* ``IOCB_NOWAIT``: Turns on ``IOMAP_NOWAIT``.
* ``IOCB_DONTCACHE``: Turns on ``IOMAP_DONTCACHE``.
Internal per-Folio State
------------------------
@@ -283,7 +285,7 @@ The ``ops`` structure must be specified and is as follows:
struct iomap_writeback_ops {
int (*map_blocks)(struct iomap_writepage_ctx *wpc, struct inode *inode,
loff_t offset, unsigned len);
int (*prepare_ioend)(struct iomap_ioend *ioend, int status);
int (*submit_ioend)(struct iomap_writepage_ctx *wpc, int status);
void (*discard_folio)(struct folio *folio, loff_t pos);
};
@@ -306,13 +308,12 @@ The fields are as follows:
purpose.
This function must be supplied by the filesystem.
- ``prepare_ioend``: Enables filesystems to transform the writeback
ioend or perform any other preparatory work before the writeback I/O
is submitted.
- ``submit_ioend``: Allows the file systems to hook into writeback bio
submission.
This might include pre-write space accounting updates, or installing
a custom ``->bi_end_io`` function for internal purposes, such as
deferring the ioend completion to a workqueue to run metadata update
transactions from process context.
transactions from process context before submitting the bio.
This function is optional.
- ``discard_folio``: iomap calls this function after ``->map_blocks``
@@ -341,7 +342,7 @@ This can happen in interrupt or process context, depending on the
storage device.
Filesystems that need to update internal bookkeeping (e.g. unwritten
extent conversions) should provide a ``->prepare_ioend`` function to
extent conversions) should provide a ``->submit_ioend`` function to
set ``struct iomap_end::bio::bi_end_io`` to its own function.
This function should call ``iomap_finish_ioends`` after finishing its
own work (e.g. unwritten extent conversion).
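
A minimal sketch of the new hook, assuming the rest of the writeback machinery is in place (the example_* names, including the ``->map_blocks`` implementation and the custom ``bi_end_io``, are hypothetical):

static int example_submit_ioend(struct iomap_writepage_ctx *wpc, int status)
{
	if (status)
		return status;	/* iomap errors out the ioend's bio for us */

	/*
	 * Take over completion: example_writeback_end_bio() is expected to
	 * record bio->bi_status in ioend->io_error and defer the rest to a
	 * workqueue that finishes by calling iomap_finish_ioends() from
	 * process context.
	 */
	wpc->ioend->io_bio.bi_end_io = example_writeback_end_bio;
	submit_bio(&wpc->ioend->io_bio);
	return 0;
}

static const struct iomap_writeback_ops example_writeback_ops = {
	.map_blocks	= example_map_blocks,
	.submit_ioend	= example_submit_ioend,
};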
@@ -513,8 +514,8 @@ IOMAP_WRITE`` with any combination of the following enhancements:
if the mapping is unwritten and the filesystem cannot handle zeroing
the unaligned regions without exposing stale contents.
* ``IOMAP_ATOMIC``: This write is being issued with torn-write
protection.
* ``IOMAP_ATOMIC_HW``: This write is being issued with torn-write
protection based on HW-offload support.
Only a single bio can be created for the write, and the write must
not be split into multiple I/O requests, i.e. flag REQ_ATOMIC must be
set.
@@ -525,8 +526,20 @@ IOMAP_WRITE`` with any combination of the following enhancements:
conversion or copy on write), all updates for the entire file range
must be committed atomically as well.
Only one space mapping is allowed per untorn write.
Untorn writes must be aligned to, and must not be longer than, a
single file block.
Untorn writes may be longer than a single file block. In all cases,
the mapping start disk block must have at least the same alignment as
the write offset.
* ``IOMAP_ATOMIC_SW``: This write is being issued with torn-write
protection via a software mechanism provided by the filesystem.
All the disk block alignment and single bio restrictions which apply
to IOMAP_ATOMIC_HW do not apply here.
SW-based untorn writes would typically be used as a fallback when
HW-based untorn writes may not be issued, e.g. the range of the write
covers multiple extents, meaning that it is not possible to issue
a single bio.
All filesystem metadata updates for the entire file range must be
committed atomically as well.
Callers commonly hold ``i_rwsem`` in shared or exclusive mode before
calling this function.
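
For context, a caller requests torn-write protection with ``RWF_ATOMIC``; whether the HW or SW mechanism services it is decided by the filesystem and iomap, not by the application. A minimal userspace sketch, assuming a kernel and headers new enough to define ``RWF_ATOMIC`` and an O_DIRECT file descriptor on a filesystem that advertises untorn-write support via statx:

#define _GNU_SOURCE
#include <sys/uio.h>
#include <linux/fs.h>	/* RWF_ATOMIC, if the libc headers lack it */

static ssize_t write_untorn(int fd, const void *buf, size_t len, off_t off)
{
	struct iovec iov = { .iov_base = (void *)buf, .iov_len = len };

	/* Either all of [off, off + len) reaches the media, or none of it. */
	return pwritev2(fd, &iov, 1, off, RWF_ATOMIC);
}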

fs/dax.c

@@ -1258,7 +1258,7 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
}
#endif /* CONFIG_FS_DAX_PMD */
static s64 dax_unshare_iter(struct iomap_iter *iter)
static int dax_unshare_iter(struct iomap_iter *iter)
{
struct iomap *iomap = &iter->iomap;
const struct iomap *srcmap = iomap_iter_srcmap(iter);
@@ -1266,11 +1266,11 @@ static s64 dax_unshare_iter(struct iomap_iter *iter)
u64 copy_len = iomap_length(iter);
u32 mod;
int id = 0;
s64 ret = 0;
s64 ret;
void *daddr = NULL, *saddr = NULL;
if (!iomap_want_unshare_iter(iter))
return iomap_length(iter);
return iomap_iter_advance_full(iter);
/*
* Extend the file range to be aligned to fsblock/pagesize, because
@@ -1300,14 +1300,14 @@ static s64 dax_unshare_iter(struct iomap_iter *iter)
if (ret < 0)
goto out_unlock;
if (copy_mc_to_kernel(daddr, saddr, copy_len) == 0)
ret = iomap_length(iter);
else
if (copy_mc_to_kernel(daddr, saddr, copy_len) != 0)
ret = -EIO;
out_unlock:
dax_read_unlock(id);
return dax_mem2blk_err(ret);
if (ret < 0)
return dax_mem2blk_err(ret);
return iomap_iter_advance_full(iter);
}
int dax_file_unshare(struct inode *inode, loff_t pos, loff_t len,
@@ -1326,7 +1326,7 @@ int dax_file_unshare(struct inode *inode, loff_t pos, loff_t len,
iter.len = min(len, size - pos);
while ((ret = iomap_iter(&iter, ops)) > 0)
iter.processed = dax_unshare_iter(&iter);
iter.status = dax_unshare_iter(&iter);
return ret;
}
EXPORT_SYMBOL_GPL(dax_file_unshare);
@@ -1354,17 +1354,16 @@ static int dax_memzero(struct iomap_iter *iter, loff_t pos, size_t size)
return ret;
}
static s64 dax_zero_iter(struct iomap_iter *iter, bool *did_zero)
static int dax_zero_iter(struct iomap_iter *iter, bool *did_zero)
{
const struct iomap *iomap = &iter->iomap;
const struct iomap *srcmap = iomap_iter_srcmap(iter);
loff_t pos = iter->pos;
u64 length = iomap_length(iter);
s64 written = 0;
int ret;
/* already zeroed? we're done. */
if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN)
return length;
return iomap_iter_advance(iter, &length);
/*
* invalidate the pages whose sharing state is to be changed
@@ -1372,33 +1371,35 @@ static s64 dax_zero_iter(struct iomap_iter *iter, bool *did_zero)
*/
if (iomap->flags & IOMAP_F_SHARED)
invalidate_inode_pages2_range(iter->inode->i_mapping,
pos >> PAGE_SHIFT,
(pos + length - 1) >> PAGE_SHIFT);
iter->pos >> PAGE_SHIFT,
(iter->pos + length - 1) >> PAGE_SHIFT);
do {
loff_t pos = iter->pos;
unsigned offset = offset_in_page(pos);
unsigned size = min_t(u64, PAGE_SIZE - offset, length);
pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
long rc;
int id;
length = min_t(u64, PAGE_SIZE - offset, length);
id = dax_read_lock();
if (IS_ALIGNED(pos, PAGE_SIZE) && size == PAGE_SIZE)
rc = dax_zero_page_range(iomap->dax_dev, pgoff, 1);
if (IS_ALIGNED(pos, PAGE_SIZE) && length == PAGE_SIZE)
ret = dax_zero_page_range(iomap->dax_dev, pgoff, 1);
else
rc = dax_memzero(iter, pos, size);
ret = dax_memzero(iter, pos, length);
dax_read_unlock(id);
if (rc < 0)
return rc;
pos += size;
length -= size;
written += size;
if (ret < 0)
return ret;
ret = iomap_iter_advance(iter, &length);
if (ret)
return ret;
} while (length > 0);
if (did_zero)
*did_zero = true;
return written;
return ret;
}
int dax_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
@@ -1413,7 +1414,7 @@ int dax_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
int ret;
while ((ret = iomap_iter(&iter, ops)) > 0)
iter.processed = dax_zero_iter(&iter, did_zero);
iter.status = dax_zero_iter(&iter, did_zero);
return ret;
}
EXPORT_SYMBOL_GPL(dax_zero_range);
@@ -1431,8 +1432,7 @@ int dax_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
}
EXPORT_SYMBOL_GPL(dax_truncate_page);
static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
struct iov_iter *iter)
static int dax_iomap_iter(struct iomap_iter *iomi, struct iov_iter *iter)
{
const struct iomap *iomap = &iomi->iomap;
const struct iomap *srcmap = iomap_iter_srcmap(iomi);
@@ -1451,8 +1451,10 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
if (pos >= end)
return 0;
if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN)
return iov_iter_zero(min(length, end - pos), iter);
if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN) {
done = iov_iter_zero(min(length, end - pos), iter);
return iomap_iter_advance(iomi, &done);
}
}
/*
@@ -1485,7 +1487,7 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
}
id = dax_read_lock();
while (pos < end) {
while ((pos = iomi->pos) < end) {
unsigned offset = pos & (PAGE_SIZE - 1);
const size_t size = ALIGN(length + offset, PAGE_SIZE);
pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
@@ -1535,18 +1537,16 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
xfer = dax_copy_to_iter(dax_dev, pgoff, kaddr,
map_len, iter);
pos += xfer;
length -= xfer;
done += xfer;
if (xfer == 0)
length = xfer;
ret = iomap_iter_advance(iomi, &length);
if (!ret && xfer == 0)
ret = -EFAULT;
if (xfer < map_len)
break;
}
dax_read_unlock(id);
return done ? done : ret;
return ret;
}
/**
@@ -1586,7 +1586,7 @@ dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
iomi.flags |= IOMAP_NOWAIT;
while ((ret = iomap_iter(&iomi, ops)) > 0)
iomi.processed = dax_iomap_iter(&iomi, iter);
iomi.status = dax_iomap_iter(&iomi, iter);
done = iomi.pos - iocb->ki_pos;
iocb->ki_pos = iomi.pos;
@@ -1757,7 +1757,7 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
while ((error = iomap_iter(&iter, ops)) > 0) {
if (WARN_ON_ONCE(iomap_length(&iter) < PAGE_SIZE)) {
iter.processed = -EIO; /* fs corruption? */
iter.status = -EIO; /* fs corruption? */
continue;
}
@@ -1769,8 +1769,10 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
ret |= VM_FAULT_MAJOR;
}
if (!(ret & VM_FAULT_ERROR))
iter.processed = PAGE_SIZE;
if (!(ret & VM_FAULT_ERROR)) {
u64 length = PAGE_SIZE;
iter.status = iomap_iter_advance(&iter, &length);
}
}
if (iomap_errp)
@@ -1883,8 +1885,10 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
continue; /* actually breaks out of the loop */
ret = dax_fault_iter(vmf, &iter, pfnp, &xas, &entry, true);
if (ret != VM_FAULT_FALLBACK)
iter.processed = PMD_SIZE;
if (ret != VM_FAULT_FALLBACK) {
u64 length = PMD_SIZE;
iter.status = iomap_iter_advance(&iter, &length);
}
}
unlock_entry:
@@ -1999,12 +2003,13 @@ vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf, unsigned int order,
}
EXPORT_SYMBOL_GPL(dax_finish_sync_fault);
static loff_t dax_range_compare_iter(struct iomap_iter *it_src,
static int dax_range_compare_iter(struct iomap_iter *it_src,
struct iomap_iter *it_dest, u64 len, bool *same)
{
const struct iomap *smap = &it_src->iomap;
const struct iomap *dmap = &it_dest->iomap;
loff_t pos1 = it_src->pos, pos2 = it_dest->pos;
u64 dest_len;
void *saddr, *daddr;
int id, ret;
@@ -2012,7 +2017,7 @@ static loff_t dax_range_compare_iter(struct iomap_iter *it_src,
if (smap->type == IOMAP_HOLE && dmap->type == IOMAP_HOLE) {
*same = true;
return len;
goto advance;
}
if (smap->type == IOMAP_HOLE || dmap->type == IOMAP_HOLE) {
@@ -2035,7 +2040,13 @@ static loff_t dax_range_compare_iter(struct iomap_iter *it_src,
if (!*same)
len = 0;
dax_read_unlock(id);
return len;
advance:
dest_len = len;
ret = iomap_iter_advance(it_src, &len);
if (!ret)
ret = iomap_iter_advance(it_dest, &dest_len);
return ret;
out_unlock:
dax_read_unlock(id);
@@ -2058,15 +2069,15 @@ int dax_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
.len = len,
.flags = IOMAP_DAX,
};
int ret, compared = 0;
int ret, status;
while ((ret = iomap_iter(&src_iter, ops)) > 0 &&
(ret = iomap_iter(&dst_iter, ops)) > 0) {
compared = dax_range_compare_iter(&src_iter, &dst_iter,
status = dax_range_compare_iter(&src_iter, &dst_iter,
min(src_iter.len, dst_iter.len), same);
if (compared < 0)
if (status < 0)
return ret;
src_iter.processed = dst_iter.processed = compared;
src_iter.status = dst_iter.status = status;
}
return ret;
}

fs/ext4/inode.c

@@ -3467,7 +3467,7 @@ static inline bool ext4_want_directio_fallback(unsigned flags, ssize_t written)
return false;
/* atomic writes are all-or-nothing */
if (flags & IOMAP_ATOMIC)
if (flags & IOMAP_ATOMIC_HW)
return false;
/* can only try again if we wrote nothing */

fs/gfs2/bmap.c

@@ -1300,7 +1300,8 @@ static int gfs2_block_zero_range(struct inode *inode, loff_t from,
unsigned int length)
{
BUG_ON(current->journal_info);
return iomap_zero_range(inode, from, length, NULL, &gfs2_iomap_ops);
return iomap_zero_range(inode, from, length, NULL, &gfs2_iomap_ops,
NULL);
}
#define GFS2_JTRUNC_REVOKES 8192

fs/iomap/Makefile

@@ -12,6 +12,7 @@ iomap-y += trace.o \
iter.o
iomap-$(CONFIG_BLOCK) += buffered-io.o \
direct-io.o \
ioend.o \
fiemap.o \
seek.o
iomap-$(CONFIG_SWAP) += swapfile.o

fs/iomap/buffered-io.c

@@ -12,17 +12,15 @@
#include <linux/buffer_head.h>
#include <linux/dax.h>
#include <linux/writeback.h>
#include <linux/list_sort.h>
#include <linux/swap.h>
#include <linux/bio.h>
#include <linux/sched/signal.h>
#include <linux/migrate.h>
#include "internal.h"
#include "trace.h"
#include "../internal.h"
#define IOEND_BATCH_SIZE 4096
/*
* Structure allocated for each folio to track per-block uptodate, dirty state
* and I/O completions.
@@ -40,8 +38,6 @@ struct iomap_folio_state {
unsigned long state[];
};
static struct bio_set iomap_ioend_bioset;
static inline bool ifs_is_fully_uptodate(struct folio *folio,
struct iomap_folio_state *ifs)
{
@@ -366,15 +362,14 @@ static inline bool iomap_block_needs_zeroing(const struct iomap_iter *iter,
pos >= i_size_read(iter->inode);
}
static loff_t iomap_readpage_iter(const struct iomap_iter *iter,
struct iomap_readpage_ctx *ctx, loff_t offset)
static int iomap_readpage_iter(struct iomap_iter *iter,
struct iomap_readpage_ctx *ctx)
{
const struct iomap *iomap = &iter->iomap;
loff_t pos = iter->pos + offset;
loff_t length = iomap_length(iter) - offset;
loff_t pos = iter->pos;
loff_t length = iomap_length(iter);
struct folio *folio = ctx->cur_folio;
struct iomap_folio_state *ifs;
loff_t orig_pos = pos;
size_t poff, plen;
sector_t sector;
@@ -438,25 +433,22 @@ static loff_t iomap_readpage_iter(const struct iomap_iter *iter,
* we can skip trailing ones as they will be handled in the next
* iteration.
*/
return pos - orig_pos + plen;
length = pos - iter->pos + plen;
return iomap_iter_advance(iter, &length);
}
static loff_t iomap_read_folio_iter(const struct iomap_iter *iter,
static int iomap_read_folio_iter(struct iomap_iter *iter,
struct iomap_readpage_ctx *ctx)
{
struct folio *folio = ctx->cur_folio;
size_t offset = offset_in_folio(folio, iter->pos);
loff_t length = min_t(loff_t, folio_size(folio) - offset,
iomap_length(iter));
loff_t done, ret;
int ret;
for (done = 0; done < length; done += ret) {
ret = iomap_readpage_iter(iter, ctx, done);
if (ret <= 0)
while (iomap_length(iter)) {
ret = iomap_readpage_iter(iter, ctx);
if (ret)
return ret;
}
return done;
return 0;
}
int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops)
@@ -474,7 +466,7 @@ int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops)
trace_iomap_readpage(iter.inode, 1);
while ((ret = iomap_iter(&iter, ops)) > 0)
iter.processed = iomap_read_folio_iter(&iter, &ctx);
iter.status = iomap_read_folio_iter(&iter, &ctx);
if (ctx.bio) {
submit_bio(ctx.bio);
@@ -493,15 +485,14 @@ int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops)
}
EXPORT_SYMBOL_GPL(iomap_read_folio);
static loff_t iomap_readahead_iter(const struct iomap_iter *iter,
static int iomap_readahead_iter(struct iomap_iter *iter,
struct iomap_readpage_ctx *ctx)
{
loff_t length = iomap_length(iter);
loff_t done, ret;
int ret;
for (done = 0; done < length; done += ret) {
while (iomap_length(iter)) {
if (ctx->cur_folio &&
offset_in_folio(ctx->cur_folio, iter->pos + done) == 0) {
offset_in_folio(ctx->cur_folio, iter->pos) == 0) {
if (!ctx->cur_folio_in_bio)
folio_unlock(ctx->cur_folio);
ctx->cur_folio = NULL;
@@ -510,12 +501,12 @@ static loff_t iomap_readahead_iter(const struct iomap_iter *iter,
ctx->cur_folio = readahead_folio(ctx->rac);
ctx->cur_folio_in_bio = false;
}
ret = iomap_readpage_iter(iter, ctx, done);
if (ret <= 0)
ret = iomap_readpage_iter(iter, ctx);
if (ret)
return ret;
}
return done;
return 0;
}
/**
@@ -547,7 +538,7 @@ void iomap_readahead(struct readahead_control *rac, const struct iomap_ops *ops)
trace_iomap_readahead(rac->mapping->host, readahead_count(rac));
while (iomap_iter(&iter, ops) > 0)
iter.processed = iomap_readahead_iter(&iter, &ctx);
iter.status = iomap_readahead_iter(&iter, &ctx);
if (ctx.bio)
submit_bio(ctx.bio);
@@ -603,6 +594,8 @@ struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos, size_t len)
if (iter->flags & IOMAP_NOWAIT)
fgp |= FGP_NOWAIT;
if (iter->flags & IOMAP_DONTCACHE)
fgp |= FGP_DONTCACHE;
fgp |= fgf_set_order(len);
return __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT,
@@ -907,12 +900,10 @@ static bool iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len,
return __iomap_write_end(iter->inode, pos, len, copied, folio);
}
static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
static int iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
{
loff_t length = iomap_length(iter);
loff_t pos = iter->pos;
ssize_t total_written = 0;
long status = 0;
int status = 0;
struct address_space *mapping = iter->inode->i_mapping;
size_t chunk = mapping_max_folio_size(mapping);
unsigned int bdp_flags = (iter->flags & IOMAP_NOWAIT) ? BDP_ASYNC : 0;
@@ -923,7 +914,8 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
size_t offset; /* Offset into folio */
size_t bytes; /* Bytes to write to folio */
size_t copied; /* Bytes copied from user */
size_t written; /* Bytes have been written */
u64 written; /* Bytes have been written */
loff_t pos = iter->pos;
bytes = iov_iter_count(i);
retry:
@@ -934,8 +926,8 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
if (unlikely(status))
break;
if (bytes > length)
bytes = length;
if (bytes > iomap_length(iter))
bytes = iomap_length(iter);
/*
* Bring in the user page that we'll copy from _first_.
@@ -1006,17 +998,12 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
goto retry;
}
} else {
pos += written;
total_written += written;
length -= written;
iomap_iter_advance(iter, &written);
}
} while (iov_iter_count(i) && length);
} while (iov_iter_count(i) && iomap_length(iter));
if (status == -EAGAIN) {
iov_iter_revert(i, total_written);
return -EAGAIN;
}
return total_written ? total_written : status;
return total_written ? 0 : status;
}
ssize_t
@@ -1034,9 +1021,11 @@ iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i,
if (iocb->ki_flags & IOCB_NOWAIT)
iter.flags |= IOMAP_NOWAIT;
if (iocb->ki_flags & IOCB_DONTCACHE)
iter.flags |= IOMAP_DONTCACHE;
while ((ret = iomap_iter(&iter, ops)) > 0)
iter.processed = iomap_write_iter(&iter, i);
iter.status = iomap_write_iter(&iter, i);
if (unlikely(iter.pos == iocb->ki_pos))
return ret;
@@ -1270,23 +1259,22 @@ void iomap_write_delalloc_release(struct inode *inode, loff_t start_byte,
}
EXPORT_SYMBOL_GPL(iomap_write_delalloc_release);
static loff_t iomap_unshare_iter(struct iomap_iter *iter)
static int iomap_unshare_iter(struct iomap_iter *iter)
{
struct iomap *iomap = &iter->iomap;
loff_t pos = iter->pos;
loff_t length = iomap_length(iter);
loff_t written = 0;
u64 bytes = iomap_length(iter);
int status;
if (!iomap_want_unshare_iter(iter))
return length;
return iomap_iter_advance(iter, &bytes);
do {
struct folio *folio;
int status;
size_t offset;
size_t bytes = min_t(u64, SIZE_MAX, length);
loff_t pos = iter->pos;
bool ret;
bytes = min_t(u64, SIZE_MAX, bytes);
status = iomap_write_begin(iter, pos, bytes, &folio);
if (unlikely(status))
return status;
@@ -1304,14 +1292,14 @@ static loff_t iomap_unshare_iter(struct iomap_iter *iter)
cond_resched();
pos += bytes;
written += bytes;
length -= bytes;
balance_dirty_pages_ratelimited(iter->inode->i_mapping);
} while (length > 0);
return written;
status = iomap_iter_advance(iter, &bytes);
if (status)
break;
} while (bytes > 0);
return status;
}
int
@@ -1331,7 +1319,7 @@ iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len,
iter.len = min(len, size - pos);
while ((ret = iomap_iter(&iter, ops)) > 0)
iter.processed = iomap_unshare_iter(&iter);
iter.status = iomap_unshare_iter(&iter);
return ret;
}
EXPORT_SYMBOL_GPL(iomap_file_unshare);
@@ -1350,19 +1338,18 @@ static inline int iomap_zero_iter_flush_and_stale(struct iomap_iter *i)
return filemap_write_and_wait_range(mapping, i->pos, end);
}
static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero)
static int iomap_zero_iter(struct iomap_iter *iter, bool *did_zero)
{
loff_t pos = iter->pos;
loff_t length = iomap_length(iter);
loff_t written = 0;
u64 bytes = iomap_length(iter);
int status;
do {
struct folio *folio;
int status;
size_t offset;
size_t bytes = min_t(u64, SIZE_MAX, length);
loff_t pos = iter->pos;
bool ret;
bytes = min_t(u64, SIZE_MAX, bytes);
status = iomap_write_begin(iter, pos, bytes, &folio);
if (status)
return status;
@@ -1383,25 +1370,26 @@ static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero)
if (WARN_ON_ONCE(!ret))
return -EIO;
pos += bytes;
length -= bytes;
written += bytes;
} while (length > 0);
status = iomap_iter_advance(iter, &bytes);
if (status)
break;
} while (bytes > 0);
if (did_zero)
*did_zero = true;
return written;
return status;
}
int
iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
const struct iomap_ops *ops)
const struct iomap_ops *ops, void *private)
{
struct iomap_iter iter = {
.inode = inode,
.pos = pos,
.len = len,
.flags = IOMAP_ZERO,
.private = private,
};
struct address_space *mapping = inode->i_mapping;
unsigned int blocksize = i_blocksize(inode);
@@ -1424,7 +1412,7 @@ iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
filemap_range_needs_writeback(mapping, pos, pos + plen - 1)) {
iter.len = plen;
while ((ret = iomap_iter(&iter, ops)) > 0)
iter.processed = iomap_zero_iter(&iter, did_zero);
iter.status = iomap_zero_iter(&iter, did_zero);
iter.len = len - (iter.pos - pos);
if (ret || !iter.len)
@@ -1443,17 +1431,19 @@ iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
if (srcmap->type == IOMAP_HOLE ||
srcmap->type == IOMAP_UNWRITTEN) {
loff_t proc = iomap_length(&iter);
s64 status;
if (range_dirty) {
range_dirty = false;
proc = iomap_zero_iter_flush_and_stale(&iter);
status = iomap_zero_iter_flush_and_stale(&iter);
} else {
status = iomap_iter_advance_full(&iter);
}
iter.processed = proc;
iter.status = status;
continue;
}
iter.processed = iomap_zero_iter(&iter, did_zero);
iter.status = iomap_zero_iter(&iter, did_zero);
}
return ret;
}
@@ -1461,7 +1451,7 @@ EXPORT_SYMBOL_GPL(iomap_zero_range);
int
iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
const struct iomap_ops *ops)
const struct iomap_ops *ops, void *private)
{
unsigned int blocksize = i_blocksize(inode);
unsigned int off = pos & (blocksize - 1);
@@ -1469,11 +1459,12 @@ iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
/* Block boundary? Nothing to do */
if (!off)
return 0;
return iomap_zero_range(inode, pos, blocksize - off, did_zero, ops);
return iomap_zero_range(inode, pos, blocksize - off, did_zero, ops,
private);
}
EXPORT_SYMBOL_GPL(iomap_truncate_page);
static loff_t iomap_folio_mkwrite_iter(struct iomap_iter *iter,
static int iomap_folio_mkwrite_iter(struct iomap_iter *iter,
struct folio *folio)
{
loff_t length = iomap_length(iter);
@@ -1490,14 +1481,16 @@ static loff_t iomap_folio_mkwrite_iter(struct iomap_iter *iter,
folio_mark_dirty(folio);
}
return length;
return iomap_iter_advance(iter, &length);
}
vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops)
vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops,
void *private)
{
struct iomap_iter iter = {
.inode = file_inode(vmf->vma->vm_file),
.flags = IOMAP_WRITE | IOMAP_FAULT,
.private = private,
};
struct folio *folio = page_folio(vmf->page);
ssize_t ret;
@@ -1509,7 +1502,7 @@ vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops)
iter.pos = folio_pos(folio);
iter.len = ret;
while ((ret = iomap_iter(&iter, ops)) > 0)
iter.processed = iomap_folio_mkwrite_iter(&iter, folio);
iter.status = iomap_folio_mkwrite_iter(&iter, folio);
if (ret < 0)
goto out_unlock;
@@ -1538,16 +1531,15 @@ static void iomap_finish_folio_write(struct inode *inode, struct folio *folio,
* state, release holds on bios, and finally free up memory. Do not use the
* ioend after this.
*/
static u32
iomap_finish_ioend(struct iomap_ioend *ioend, int error)
u32 iomap_finish_ioend_buffered(struct iomap_ioend *ioend)
{
struct inode *inode = ioend->io_inode;
struct bio *bio = &ioend->io_bio;
struct folio_iter fi;
u32 folio_count = 0;
if (error) {
mapping_set_error(inode->i_mapping, error);
if (ioend->io_error) {
mapping_set_error(inode->i_mapping, ioend->io_error);
if (!bio_flagged(bio, BIO_QUIET)) {
pr_err_ratelimited(
"%s: writeback error on inode %lu, offset %lld, sector %llu",
@@ -1566,116 +1558,16 @@ iomap_finish_ioend(struct iomap_ioend *ioend, int error)
return folio_count;
}
/*
* Ioend completion routine for merged bios. This can only be called from task
* contexts as merged ioends can be of unbound length. Hence we have to break up
* the writeback completions into manageable chunks to avoid long scheduler
* holdoffs. We aim to keep scheduler holdoffs down below 10ms so that we get
* good batch processing throughput without creating adverse scheduler latency
* conditions.
*/
void
iomap_finish_ioends(struct iomap_ioend *ioend, int error)
{
struct list_head tmp;
u32 completions;
might_sleep();
list_replace_init(&ioend->io_list, &tmp);
completions = iomap_finish_ioend(ioend, error);
while (!list_empty(&tmp)) {
if (completions > IOEND_BATCH_SIZE * 8) {
cond_resched();
completions = 0;
}
ioend = list_first_entry(&tmp, struct iomap_ioend, io_list);
list_del_init(&ioend->io_list);
completions += iomap_finish_ioend(ioend, error);
}
}
EXPORT_SYMBOL_GPL(iomap_finish_ioends);
/*
* We can merge two adjacent ioends if they have the same set of work to do.
*/
static bool
iomap_ioend_can_merge(struct iomap_ioend *ioend, struct iomap_ioend *next)
{
if (ioend->io_bio.bi_status != next->io_bio.bi_status)
return false;
if (next->io_flags & IOMAP_F_BOUNDARY)
return false;
if ((ioend->io_flags & IOMAP_F_SHARED) ^
(next->io_flags & IOMAP_F_SHARED))
return false;
if ((ioend->io_type == IOMAP_UNWRITTEN) ^
(next->io_type == IOMAP_UNWRITTEN))
return false;
if (ioend->io_offset + ioend->io_size != next->io_offset)
return false;
/*
* Do not merge physically discontiguous ioends. The filesystem
* completion functions will have to iterate the physical
* discontiguities even if we merge the ioends at a logical level, so
* we don't gain anything by merging physical discontiguities here.
*
* We cannot use bio->bi_iter.bi_sector here as it is modified during
* submission so does not point to the start sector of the bio at
* completion.
*/
if (ioend->io_sector + (ioend->io_size >> 9) != next->io_sector)
return false;
return true;
}
void
iomap_ioend_try_merge(struct iomap_ioend *ioend, struct list_head *more_ioends)
{
struct iomap_ioend *next;
INIT_LIST_HEAD(&ioend->io_list);
while ((next = list_first_entry_or_null(more_ioends, struct iomap_ioend,
io_list))) {
if (!iomap_ioend_can_merge(ioend, next))
break;
list_move_tail(&next->io_list, &ioend->io_list);
ioend->io_size += next->io_size;
}
}
EXPORT_SYMBOL_GPL(iomap_ioend_try_merge);
static int
iomap_ioend_compare(void *priv, const struct list_head *a,
const struct list_head *b)
{
struct iomap_ioend *ia = container_of(a, struct iomap_ioend, io_list);
struct iomap_ioend *ib = container_of(b, struct iomap_ioend, io_list);
if (ia->io_offset < ib->io_offset)
return -1;
if (ia->io_offset > ib->io_offset)
return 1;
return 0;
}
void
iomap_sort_ioends(struct list_head *ioend_list)
{
list_sort(NULL, ioend_list, iomap_ioend_compare);
}
EXPORT_SYMBOL_GPL(iomap_sort_ioends);
static void iomap_writepage_end_bio(struct bio *bio)
{
iomap_finish_ioend(iomap_ioend_from_bio(bio),
blk_status_to_errno(bio->bi_status));
struct iomap_ioend *ioend = iomap_ioend_from_bio(bio);
ioend->io_error = blk_status_to_errno(bio->bi_status);
iomap_finish_ioend_buffered(ioend);
}
/*
* Submit the final bio for an ioend.
* Submit an ioend.
*
* If @error is non-zero, it means that we have a situation where some part of
* the submission process has failed after we've marked pages for writeback.
@@ -1694,14 +1586,18 @@ static int iomap_submit_ioend(struct iomap_writepage_ctx *wpc, int error)
* failure happened so that the file system end I/O handler gets called
* to clean up.
*/
if (wpc->ops->prepare_ioend)
error = wpc->ops->prepare_ioend(wpc->ioend, error);
if (wpc->ops->submit_ioend) {
error = wpc->ops->submit_ioend(wpc, error);
} else {
if (WARN_ON_ONCE(wpc->iomap.flags & IOMAP_F_ANON_WRITE))
error = -EIO;
if (!error)
submit_bio(&wpc->ioend->io_bio);
}
if (error) {
wpc->ioend->io_bio.bi_status = errno_to_blk_status(error);
bio_endio(&wpc->ioend->io_bio);
} else {
submit_bio(&wpc->ioend->io_bio);
}
wpc->ioend = NULL;
@@ -1709,9 +1605,9 @@ static int iomap_submit_ioend(struct iomap_writepage_ctx *wpc, int error)
}
static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc,
struct writeback_control *wbc, struct inode *inode, loff_t pos)
struct writeback_control *wbc, struct inode *inode, loff_t pos,
u16 ioend_flags)
{
struct iomap_ioend *ioend;
struct bio *bio;
bio = bio_alloc_bioset(wpc->iomap.bdev, BIO_MAX_VECS,
@@ -1719,36 +1615,24 @@ static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc,
GFP_NOFS, &iomap_ioend_bioset);
bio->bi_iter.bi_sector = iomap_sector(&wpc->iomap, pos);
bio->bi_end_io = iomap_writepage_end_bio;
wbc_init_bio(wbc, bio);
bio->bi_write_hint = inode->i_write_hint;
ioend = iomap_ioend_from_bio(bio);
INIT_LIST_HEAD(&ioend->io_list);
ioend->io_type = wpc->iomap.type;
ioend->io_flags = wpc->iomap.flags;
if (pos > wpc->iomap.offset)
wpc->iomap.flags &= ~IOMAP_F_BOUNDARY;
ioend->io_inode = inode;
ioend->io_size = 0;
ioend->io_offset = pos;
ioend->io_sector = bio->bi_iter.bi_sector;
wbc_init_bio(wbc, bio);
wpc->nr_folios = 0;
return ioend;
return iomap_init_ioend(inode, bio, pos, ioend_flags);
}
static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos)
static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos,
u16 ioend_flags)
{
if (wpc->iomap.offset == pos && (wpc->iomap.flags & IOMAP_F_BOUNDARY))
if (ioend_flags & IOMAP_IOEND_BOUNDARY)
return false;
if ((wpc->iomap.flags & IOMAP_F_SHARED) !=
(wpc->ioend->io_flags & IOMAP_F_SHARED))
return false;
if (wpc->iomap.type != wpc->ioend->io_type)
if ((ioend_flags & IOMAP_IOEND_NOMERGE_FLAGS) !=
(wpc->ioend->io_flags & IOMAP_IOEND_NOMERGE_FLAGS))
return false;
if (pos != wpc->ioend->io_offset + wpc->ioend->io_size)
return false;
if (iomap_sector(&wpc->iomap, pos) !=
if (!(wpc->iomap.flags & IOMAP_F_ANON_WRITE) &&
iomap_sector(&wpc->iomap, pos) !=
bio_end_sector(&wpc->ioend->io_bio))
return false;
/*
@@ -1779,14 +1663,23 @@ static int iomap_add_to_ioend(struct iomap_writepage_ctx *wpc,
{
struct iomap_folio_state *ifs = folio->private;
size_t poff = offset_in_folio(folio, pos);
unsigned int ioend_flags = 0;
int error;
if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, pos)) {
if (wpc->iomap.type == IOMAP_UNWRITTEN)
ioend_flags |= IOMAP_IOEND_UNWRITTEN;
if (wpc->iomap.flags & IOMAP_F_SHARED)
ioend_flags |= IOMAP_IOEND_SHARED;
if (pos == wpc->iomap.offset && (wpc->iomap.flags & IOMAP_F_BOUNDARY))
ioend_flags |= IOMAP_IOEND_BOUNDARY;
if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, pos, ioend_flags)) {
new_ioend:
error = iomap_submit_ioend(wpc, 0);
if (error)
return error;
wpc->ioend = iomap_alloc_ioend(wpc, wbc, inode, pos);
wpc->ioend = iomap_alloc_ioend(wpc, wbc, inode, pos,
ioend_flags);
}
if (!bio_add_folio(&wpc->ioend->io_bio, folio, len, poff))
@@ -2062,11 +1955,3 @@ iomap_writepages(struct address_space *mapping, struct writeback_control *wbc,
return iomap_submit_ioend(wpc, error);
}
EXPORT_SYMBOL_GPL(iomap_writepages);
static int __init iomap_buffered_init(void)
{
return bioset_init(&iomap_ioend_bioset, 4 * (PAGE_SIZE / SECTOR_SIZE),
offsetof(struct iomap_ioend, io_bio),
BIOSET_NEED_BVECS);
}
fs_initcall(iomap_buffered_init);

fs/iomap/direct-io.c

@@ -1,7 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2010 Red Hat, Inc.
* Copyright (c) 2016-2021 Christoph Hellwig.
* Copyright (c) 2016-2025 Christoph Hellwig.
*/
#include <linux/module.h>
#include <linux/compiler.h>
@@ -12,6 +12,7 @@
#include <linux/backing-dev.h>
#include <linux/uio.h>
#include <linux/task_io_accounting_ops.h>
#include "internal.h"
#include "trace.h"
#include "../internal.h"
@@ -20,6 +21,7 @@
* Private flags for iomap_dio, must not overlap with the public ones in
* iomap.h:
*/
#define IOMAP_DIO_NO_INVALIDATE (1U << 25)
#define IOMAP_DIO_CALLER_COMP (1U << 26)
#define IOMAP_DIO_INLINE_COMP (1U << 27)
#define IOMAP_DIO_WRITE_THROUGH (1U << 28)
@@ -81,10 +83,12 @@ static void iomap_dio_submit_bio(const struct iomap_iter *iter,
WRITE_ONCE(iocb->private, bio);
}
if (dio->dops && dio->dops->submit_io)
if (dio->dops && dio->dops->submit_io) {
dio->dops->submit_io(iter, bio, pos);
else
} else {
WARN_ON_ONCE(iter->iomap.flags & IOMAP_F_ANON_WRITE);
submit_bio(bio);
}
}
ssize_t iomap_dio_complete(struct iomap_dio *dio)
@@ -117,7 +121,8 @@ ssize_t iomap_dio_complete(struct iomap_dio *dio)
* ->end_io() when necessary, otherwise a racing buffer read would cache
* zeros from unwritten extents.
*/
if (!dio->error && dio->size && (dio->flags & IOMAP_DIO_WRITE))
if (!dio->error && dio->size && (dio->flags & IOMAP_DIO_WRITE) &&
!(dio->flags & IOMAP_DIO_NO_INVALIDATE))
kiocb_invalidate_post_direct_write(iocb, dio->size);
inode_dio_end(file_inode(iocb->ki_filp));
@@ -163,43 +168,31 @@ static inline void iomap_dio_set_error(struct iomap_dio *dio, int ret)
cmpxchg(&dio->error, 0, ret);
}
void iomap_dio_bio_end_io(struct bio *bio)
/*
* Called when dio->ref reaches zero from an I/O completion.
*/
static void iomap_dio_done(struct iomap_dio *dio)
{
struct iomap_dio *dio = bio->bi_private;
bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY);
struct kiocb *iocb = dio->iocb;
if (bio->bi_status)
iomap_dio_set_error(dio, blk_status_to_errno(bio->bi_status));
if (!atomic_dec_and_test(&dio->ref))
goto release_bio;
/*
* Synchronous dio, task itself will handle any completion work
* that needs after IO. All we need to do is wake the task.
*/
if (dio->wait_for_completion) {
/*
* Synchronous I/O, task itself will handle any completion work
* that needs after IO. All we need to do is wake the task.
*/
struct task_struct *waiter = dio->submit.waiter;
WRITE_ONCE(dio->submit.waiter, NULL);
blk_wake_io_task(waiter);
goto release_bio;
}
/*
* Flagged with IOMAP_DIO_INLINE_COMP, we can complete it inline
*/
if (dio->flags & IOMAP_DIO_INLINE_COMP) {
} else if (dio->flags & IOMAP_DIO_INLINE_COMP) {
WRITE_ONCE(iocb->private, NULL);
iomap_dio_complete_work(&dio->aio.work);
goto release_bio;
}
/*
* If this dio is flagged with IOMAP_DIO_CALLER_COMP, then schedule
* our completion that way to avoid an async punt to a workqueue.
*/
if (dio->flags & IOMAP_DIO_CALLER_COMP) {
} else if (dio->flags & IOMAP_DIO_CALLER_COMP) {
/*
* If this dio is flagged with IOMAP_DIO_CALLER_COMP, then
* schedule our completion that way to avoid an async punt to a
* workqueue.
*/
/* only polled IO cares about private cleared */
iocb->private = dio;
iocb->dio_complete = iomap_dio_deferred_complete;
@@ -217,19 +210,31 @@ void iomap_dio_bio_end_io(struct bio *bio)
* issuer.
*/
iocb->ki_complete(iocb, 0);
goto release_bio;
}
} else {
struct inode *inode = file_inode(iocb->ki_filp);
/*
* Async DIO completion that requires filesystem level
* completion work gets punted to a work queue to complete as
* the operation may require more IO to be issued to finalise
* filesystem metadata changes or guarantee data integrity.
*/
INIT_WORK(&dio->aio.work, iomap_dio_complete_work);
queue_work(inode->i_sb->s_dio_done_wq, &dio->aio.work);
}
}
void iomap_dio_bio_end_io(struct bio *bio)
{
struct iomap_dio *dio = bio->bi_private;
bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY);
if (bio->bi_status)
iomap_dio_set_error(dio, blk_status_to_errno(bio->bi_status));
if (atomic_dec_and_test(&dio->ref))
iomap_dio_done(dio);
/*
* Async DIO completion that requires filesystem level completion work
* gets punted to a work queue to complete as the operation may require
* more IO to be issued to finalise filesystem metadata changes or
* guarantee data integrity.
*/
INIT_WORK(&dio->aio.work, iomap_dio_complete_work);
queue_work(file_inode(iocb->ki_filp)->i_sb->s_dio_done_wq,
&dio->aio.work);
release_bio:
if (should_dirty) {
bio_check_pages_dirty(bio);
} else {
@@ -239,6 +244,47 @@ void iomap_dio_bio_end_io(struct bio *bio)
}
EXPORT_SYMBOL_GPL(iomap_dio_bio_end_io);
u32 iomap_finish_ioend_direct(struct iomap_ioend *ioend)
{
struct iomap_dio *dio = ioend->io_bio.bi_private;
bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY);
u32 vec_count = ioend->io_bio.bi_vcnt;
if (ioend->io_error)
iomap_dio_set_error(dio, ioend->io_error);
if (atomic_dec_and_test(&dio->ref)) {
/*
* Try to avoid another context switch for the completion given
* that we are already called from the ioend completion
* workqueue, but never invalidate pages from this thread to
* avoid deadlocks with buffered I/O completions. Tough luck if
* you hit the tiny race with someone dirtying the range now
* between this check and the actual completion.
*/
if (!dio->iocb->ki_filp->f_mapping->nrpages) {
dio->flags |= IOMAP_DIO_INLINE_COMP;
dio->flags |= IOMAP_DIO_NO_INVALIDATE;
}
dio->flags &= ~IOMAP_DIO_CALLER_COMP;
iomap_dio_done(dio);
}
if (should_dirty) {
bio_check_pages_dirty(&ioend->io_bio);
} else {
bio_release_pages(&ioend->io_bio, false);
bio_put(&ioend->io_bio);
}
/*
* Return the number of bvecs completed as even direct I/O completions
* do significant per-folio work and we'll still want to give up the
* CPU after a lot of completions.
*/
return vec_count;
}
static int iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio,
loff_t pos, unsigned len)
{
@@ -271,7 +317,7 @@ static int iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio,
* clearing the WRITE_THROUGH flag in the dio request.
*/
static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio,
const struct iomap *iomap, bool use_fua, bool atomic)
const struct iomap *iomap, bool use_fua, bool atomic_hw)
{
blk_opf_t opflags = REQ_SYNC | REQ_IDLE;
@@ -283,30 +329,29 @@ static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio,
opflags |= REQ_FUA;
else
dio->flags &= ~IOMAP_DIO_WRITE_THROUGH;
if (atomic)
if (atomic_hw)
opflags |= REQ_ATOMIC;
return opflags;
}
static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
struct iomap_dio *dio)
static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
{
const struct iomap *iomap = &iter->iomap;
struct inode *inode = iter->inode;
unsigned int fs_block_size = i_blocksize(inode), pad;
bool atomic_hw = iter->flags & IOMAP_ATOMIC_HW;
const loff_t length = iomap_length(iter);
bool atomic = iter->flags & IOMAP_ATOMIC;
loff_t pos = iter->pos;
blk_opf_t bio_opf;
struct bio *bio;
bool need_zeroout = false;
bool use_fua = false;
int nr_pages, ret = 0;
size_t copied = 0;
u64 copied = 0;
size_t orig_count;
if (atomic && length != fs_block_size)
if (atomic_hw && length != iter->len)
return -EINVAL;
if ((pos | length) & (bdev_logical_block_size(iomap->bdev) - 1) ||
@@ -383,7 +428,7 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
goto out;
}
bio_opf = iomap_dio_bio_opflags(dio, iomap, use_fua, atomic);
bio_opf = iomap_dio_bio_opflags(dio, iomap, use_fua, atomic_hw);
nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter, BIO_MAX_VECS);
do {
@@ -416,7 +461,7 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
}
n = bio->bi_iter.bi_size;
if (WARN_ON_ONCE(atomic && n != length)) {
if (WARN_ON_ONCE(atomic_hw && n != length)) {
/*
* This bio should have covered the complete length,
* which it doesn't, so error. We may need to zero out
@@ -467,30 +512,28 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
/* Undo iter limitation to current extent */
iov_iter_reexpand(dio->submit.iter, orig_count - copied);
if (copied)
return copied;
return iomap_iter_advance(iter, &copied);
return ret;
}
static loff_t iomap_dio_hole_iter(const struct iomap_iter *iter,
struct iomap_dio *dio)
static int iomap_dio_hole_iter(struct iomap_iter *iter, struct iomap_dio *dio)
{
loff_t length = iov_iter_zero(iomap_length(iter), dio->submit.iter);
dio->size += length;
if (!length)
return -EFAULT;
return length;
return iomap_iter_advance(iter, &length);
}
static loff_t iomap_dio_inline_iter(const struct iomap_iter *iomi,
struct iomap_dio *dio)
static int iomap_dio_inline_iter(struct iomap_iter *iomi, struct iomap_dio *dio)
{
const struct iomap *iomap = &iomi->iomap;
struct iov_iter *iter = dio->submit.iter;
void *inline_data = iomap_inline_data(iomap, iomi->pos);
loff_t length = iomap_length(iomi);
loff_t pos = iomi->pos;
size_t copied;
u64 copied;
if (WARN_ON_ONCE(!iomap_inline_data_valid(iomap)))
return -EIO;
@@ -512,11 +555,10 @@ static loff_t iomap_dio_inline_iter(const struct iomap_iter *iomi,
dio->size += copied;
if (!copied)
return -EFAULT;
return copied;
return iomap_iter_advance(iomi, &copied);
}
static loff_t iomap_dio_iter(const struct iomap_iter *iter,
struct iomap_dio *dio)
static int iomap_dio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
{
switch (iter->iomap.type) {
case IOMAP_HOLE:
@@ -610,9 +652,6 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
if (iocb->ki_flags & IOCB_NOWAIT)
iomi.flags |= IOMAP_NOWAIT;
if (iocb->ki_flags & IOCB_ATOMIC)
iomi.flags |= IOMAP_ATOMIC;
if (iov_iter_rw(iter) == READ) {
/* reads can always complete inline */
dio->flags |= IOMAP_DIO_INLINE_COMP;
@@ -647,6 +686,11 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
iomi.flags |= IOMAP_OVERWRITE_ONLY;
}
if (dio_flags & IOMAP_DIO_ATOMIC_SW)
iomi.flags |= IOMAP_ATOMIC_SW;
else if (iocb->ki_flags & IOCB_ATOMIC)
iomi.flags |= IOMAP_ATOMIC_HW;
/* for data sync or sync, we need sync completion processing */
if (iocb_is_dsync(iocb)) {
dio->flags |= IOMAP_DIO_NEED_SYNC;
@@ -700,7 +744,7 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
blk_start_plug(&plug);
while ((ret = iomap_iter(&iomi, ops)) > 0) {
iomi.processed = iomap_dio_iter(&iomi, dio);
iomi.status = iomap_dio_iter(&iomi, dio);
/*
* We can only poll for single bio I/Os.

fs/iomap/fiemap.c

@@ -39,24 +39,23 @@ static int iomap_to_fiemap(struct fiemap_extent_info *fi,
iomap->length, flags);
}
static loff_t iomap_fiemap_iter(const struct iomap_iter *iter,
static int iomap_fiemap_iter(struct iomap_iter *iter,
struct fiemap_extent_info *fi, struct iomap *prev)
{
int ret;
if (iter->iomap.type == IOMAP_HOLE)
return iomap_length(iter);
goto advance;
ret = iomap_to_fiemap(fi, prev, 0);
*prev = iter->iomap;
switch (ret) {
case 0: /* success */
return iomap_length(iter);
case 1: /* extent array full */
return 0;
default: /* error */
if (ret < 0)
return ret;
}
if (ret == 1) /* extent array full */
return 0;
advance:
return iomap_iter_advance_full(iter);
}
int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi,
@@ -78,7 +77,7 @@ int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi,
return ret;
while ((ret = iomap_iter(&iter, ops)) > 0)
iter.processed = iomap_fiemap_iter(&iter, fi, &prev);
iter.status = iomap_fiemap_iter(&iter, fi, &prev);
if (prev.type != IOMAP_HOLE) {
ret = iomap_to_fiemap(fi, &prev, FIEMAP_EXTENT_LAST);
@@ -114,7 +113,7 @@ iomap_bmap(struct address_space *mapping, sector_t bno,
while ((ret = iomap_iter(&iter, ops)) > 0) {
if (iter.iomap.type == IOMAP_MAPPED)
bno = iomap_sector(&iter.iomap, iter.pos) >> blkshift;
/* leave iter.processed unset to abort loop */
/* leave iter.status unset to abort loop */
}
if (ret)
return 0;

fs/iomap/internal.h (new file)

@@ -0,0 +1,10 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _IOMAP_INTERNAL_H
#define _IOMAP_INTERNAL_H 1
#define IOEND_BATCH_SIZE 4096
u32 iomap_finish_ioend_buffered(struct iomap_ioend *ioend);
u32 iomap_finish_ioend_direct(struct iomap_ioend *ioend);
#endif /* _IOMAP_INTERNAL_H */

fs/iomap/ioend.c (new file)

@@ -0,0 +1,216 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (c) 2024-2025 Christoph Hellwig.
*/
#include <linux/iomap.h>
#include <linux/list_sort.h>
#include "internal.h"
struct bio_set iomap_ioend_bioset;
EXPORT_SYMBOL_GPL(iomap_ioend_bioset);
struct iomap_ioend *iomap_init_ioend(struct inode *inode,
struct bio *bio, loff_t file_offset, u16 ioend_flags)
{
struct iomap_ioend *ioend = iomap_ioend_from_bio(bio);
atomic_set(&ioend->io_remaining, 1);
ioend->io_error = 0;
ioend->io_parent = NULL;
INIT_LIST_HEAD(&ioend->io_list);
ioend->io_flags = ioend_flags;
ioend->io_inode = inode;
ioend->io_offset = file_offset;
ioend->io_size = bio->bi_iter.bi_size;
ioend->io_sector = bio->bi_iter.bi_sector;
ioend->io_private = NULL;
return ioend;
}
EXPORT_SYMBOL_GPL(iomap_init_ioend);
static u32 iomap_finish_ioend(struct iomap_ioend *ioend, int error)
{
if (ioend->io_parent) {
struct bio *bio = &ioend->io_bio;
ioend = ioend->io_parent;
bio_put(bio);
}
if (error)
cmpxchg(&ioend->io_error, 0, error);
if (!atomic_dec_and_test(&ioend->io_remaining))
return 0;
if (ioend->io_flags & IOMAP_IOEND_DIRECT)
return iomap_finish_ioend_direct(ioend);
return iomap_finish_ioend_buffered(ioend);
}
/*
* Ioend completion routine for merged bios. This can only be called from task
* contexts as merged ioends can be of unbound length. Hence we have to break up
* the writeback completions into manageable chunks to avoid long scheduler
* holdoffs. We aim to keep scheduler holdoffs down below 10ms so that we get
* good batch processing throughput without creating adverse scheduler latency
* conditions.
*/
void iomap_finish_ioends(struct iomap_ioend *ioend, int error)
{
struct list_head tmp;
u32 completions;
might_sleep();
list_replace_init(&ioend->io_list, &tmp);
completions = iomap_finish_ioend(ioend, error);
while (!list_empty(&tmp)) {
if (completions > IOEND_BATCH_SIZE * 8) {
cond_resched();
completions = 0;
}
ioend = list_first_entry(&tmp, struct iomap_ioend, io_list);
list_del_init(&ioend->io_list);
completions += iomap_finish_ioend(ioend, error);
}
}
EXPORT_SYMBOL_GPL(iomap_finish_ioends);
/*
* We can merge two adjacent ioends if they have the same set of work to do.
*/
static bool iomap_ioend_can_merge(struct iomap_ioend *ioend,
struct iomap_ioend *next)
{
if (ioend->io_bio.bi_status != next->io_bio.bi_status)
return false;
if (next->io_flags & IOMAP_IOEND_BOUNDARY)
return false;
if ((ioend->io_flags & IOMAP_IOEND_NOMERGE_FLAGS) !=
(next->io_flags & IOMAP_IOEND_NOMERGE_FLAGS))
return false;
if (ioend->io_offset + ioend->io_size != next->io_offset)
return false;
/*
* Do not merge physically discontiguous ioends. The filesystem
* completion functions will have to iterate the physical
* discontiguities even if we merge the ioends at a logical level, so
* we don't gain anything by merging physical discontiguities here.
*
* We cannot use bio->bi_iter.bi_sector here as it is modified during
* submission so does not point to the start sector of the bio at
* completion.
*/
if (ioend->io_sector + (ioend->io_size >> SECTOR_SHIFT) !=
next->io_sector)
return false;
return true;
}
void iomap_ioend_try_merge(struct iomap_ioend *ioend,
struct list_head *more_ioends)
{
struct iomap_ioend *next;
INIT_LIST_HEAD(&ioend->io_list);
while ((next = list_first_entry_or_null(more_ioends, struct iomap_ioend,
io_list))) {
if (!iomap_ioend_can_merge(ioend, next))
break;
list_move_tail(&next->io_list, &ioend->io_list);
ioend->io_size += next->io_size;
}
}
EXPORT_SYMBOL_GPL(iomap_ioend_try_merge);
static int iomap_ioend_compare(void *priv, const struct list_head *a,
const struct list_head *b)
{
struct iomap_ioend *ia = container_of(a, struct iomap_ioend, io_list);
struct iomap_ioend *ib = container_of(b, struct iomap_ioend, io_list);
if (ia->io_offset < ib->io_offset)
return -1;
if (ia->io_offset > ib->io_offset)
return 1;
return 0;
}
void iomap_sort_ioends(struct list_head *ioend_list)
{
list_sort(NULL, ioend_list, iomap_ioend_compare);
}
EXPORT_SYMBOL_GPL(iomap_sort_ioends);
/*
* Split up to the first @max_len bytes from @ioend if the ioend covers more
* than @max_len bytes.
*
* If @is_append is set, the split will be based on the hardware limits for
* REQ_OP_ZONE_APPEND commands and can be less than @max_len if the hardware
* limits don't allow the entire @max_len length.
*
* The bio embedded into @ioend must be a REQ_OP_WRITE because the block layer
* does not allow splitting REQ_OP_ZONE_APPEND bios. The file systems has to
* switch the operation after this call, but before submitting the bio.
*/
struct iomap_ioend *iomap_split_ioend(struct iomap_ioend *ioend,
unsigned int max_len, bool is_append)
{
struct bio *bio = &ioend->io_bio;
struct iomap_ioend *split_ioend;
unsigned int nr_segs;
int sector_offset;
struct bio *split;
if (is_append) {
struct queue_limits *lim = bdev_limits(bio->bi_bdev);
max_len = min(max_len,
lim->max_zone_append_sectors << SECTOR_SHIFT);
sector_offset = bio_split_rw_at(bio, lim, &nr_segs, max_len);
if (unlikely(sector_offset < 0))
return ERR_PTR(sector_offset);
if (!sector_offset)
return NULL;
} else {
if (bio->bi_iter.bi_size <= max_len)
return NULL;
sector_offset = max_len >> SECTOR_SHIFT;
}
/* ensure the split ioend is still block size aligned */
sector_offset = ALIGN_DOWN(sector_offset << SECTOR_SHIFT,
i_blocksize(ioend->io_inode)) >> SECTOR_SHIFT;
split = bio_split(bio, sector_offset, GFP_NOFS, &iomap_ioend_bioset);
if (IS_ERR(split))
return ERR_CAST(split);
split->bi_private = bio->bi_private;
split->bi_end_io = bio->bi_end_io;
split_ioend = iomap_init_ioend(ioend->io_inode, split, ioend->io_offset,
ioend->io_flags);
split_ioend->io_parent = ioend;
atomic_inc(&ioend->io_remaining);
ioend->io_offset += split_ioend->io_size;
ioend->io_size -= split_ioend->io_size;
split_ioend->io_sector = ioend->io_sector;
if (!is_append)
ioend->io_sector += (split_ioend->io_size >> SECTOR_SHIFT);
return split_ioend;
}
EXPORT_SYMBOL_GPL(iomap_split_ioend);
static int __init iomap_ioend_init(void)
{
return bioset_init(&iomap_ioend_bioset, 4 * (PAGE_SIZE / SECTOR_SIZE),
offsetof(struct iomap_ioend, io_bio),
BIOSET_NEED_BVECS);
}
fs_initcall(iomap_ioend_init);
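
A hypothetical sketch of how a zone-append based filesystem might consume iomap_split_ioend() from its ``->submit_ioend`` hook; the names and the submission policy are illustrative, and a real implementation would also install its own ``->bi_end_io`` so that completions funnel through iomap_finish_ioends(), which understands the io_parent/io_remaining linkage the split creates:

static int example_zoned_submit_ioend(struct iomap_writepage_ctx *wpc,
		int status)
{
	struct iomap_ioend *ioend = wpc->ioend;
	struct iomap_ioend *split;

	if (status)
		return status;

	/* Peel off leading chunks that fit the device's zone-append limit. */
	while ((split = iomap_split_ioend(ioend, UINT_MAX, true))) {
		if (IS_ERR(split))
			return PTR_ERR(split);
		split->io_bio.bi_opf &= ~REQ_OP_MASK;
		split->io_bio.bi_opf |= REQ_OP_ZONE_APPEND;
		submit_bio(&split->io_bio);
	}

	/* The remainder now fits within the limits as well. */
	ioend->io_bio.bi_opf &= ~REQ_OP_MASK;
	ioend->io_bio.bi_opf |= REQ_OP_ZONE_APPEND;
	submit_bio(&ioend->io_bio);
	return 0;
}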

fs/iomap/iter.c

@@ -7,40 +7,25 @@
#include <linux/iomap.h>
#include "trace.h"
/*
* Advance to the next range we need to map.
*
* If the iomap is marked IOMAP_F_STALE, it means the existing map was not fully
* processed - it was aborted because the extent the iomap spanned may have been
* changed during the operation. In this case, the iteration behaviour is to
* remap the unprocessed range of the iter, and that means we may need to remap
* even when we've made no progress (i.e. iter->processed = 0). Hence the
* "finished iterating" case needs to distinguish between
* (processed = 0) meaning we are done and (processed = 0 && stale) meaning we
* need to remap the entire remaining range.
*/
static inline int iomap_iter_advance(struct iomap_iter *iter)
static inline void iomap_iter_reset_iomap(struct iomap_iter *iter)
{
bool stale = iter->iomap.flags & IOMAP_F_STALE;
int ret = 1;
/* handle the previous iteration (if any) */
if (iter->iomap.length) {
if (iter->processed < 0)
return iter->processed;
if (WARN_ON_ONCE(iter->processed > iomap_length(iter)))
return -EIO;
iter->pos += iter->processed;
iter->len -= iter->processed;
if (!iter->len || (!iter->processed && !stale))
ret = 0;
}
/* clear the per iteration state */
iter->processed = 0;
iter->status = 0;
memset(&iter->iomap, 0, sizeof(iter->iomap));
memset(&iter->srcmap, 0, sizeof(iter->srcmap));
return ret;
}
/*
* Advance the current iterator position and output the length remaining for the
* current mapping.
*/
int iomap_iter_advance(struct iomap_iter *iter, u64 *count)
{
if (WARN_ON_ONCE(*count > iomap_length(iter)))
return -EIO;
iter->pos += *count;
iter->len -= *count;
*count = iomap_length(iter);
return 0;
}
static inline void iomap_iter_done(struct iomap_iter *iter)
@@ -50,6 +35,8 @@ static inline void iomap_iter_done(struct iomap_iter *iter)
WARN_ON_ONCE(iter->iomap.offset + iter->iomap.length <= iter->pos);
WARN_ON_ONCE(iter->iomap.flags & IOMAP_F_STALE);
iter->iter_start_pos = iter->pos;
trace_iomap_iter_dstmap(iter->inode, &iter->iomap);
if (iter->srcmap.type != IOMAP_HOLE)
trace_iomap_iter_srcmap(iter->inode, &iter->srcmap);
@@ -67,26 +54,58 @@ static inline void iomap_iter_done(struct iomap_iter *iter)
* function must be called in a loop that continues as long it returns a
* positive value. If 0 or a negative value is returned, the caller must not
* return to the loop body. Within a loop body, there are two ways to break out
* of the loop body: leave @iter.processed unchanged, or set it to a negative
* of the loop body: leave @iter.status unchanged, or set it to a negative
* errno.
*/
int iomap_iter(struct iomap_iter *iter, const struct iomap_ops *ops)
{
bool stale = iter->iomap.flags & IOMAP_F_STALE;
ssize_t advanced;
u64 olen;
int ret;
if (iter->iomap.length && ops->iomap_end) {
ret = ops->iomap_end(iter->inode, iter->pos, iomap_length(iter),
iter->processed > 0 ? iter->processed : 0,
iter->flags, &iter->iomap);
if (ret < 0 && !iter->processed)
trace_iomap_iter(iter, ops, _RET_IP_);
if (!iter->iomap.length)
goto begin;
/*
* Calculate how far the iter was advanced and the original length bytes
* for ->iomap_end().
*/
advanced = iter->pos - iter->iter_start_pos;
olen = iter->len + advanced;
if (ops->iomap_end) {
ret = ops->iomap_end(iter->inode, iter->iter_start_pos,
iomap_length_trim(iter, iter->iter_start_pos,
olen),
advanced, iter->flags, &iter->iomap);
if (ret < 0 && !advanced)
return ret;
}
trace_iomap_iter(iter, ops, _RET_IP_);
ret = iomap_iter_advance(iter);
/* detect old return semantics where this would advance */
if (WARN_ON_ONCE(iter->status > 0))
iter->status = -EIO;
/*
* Use iter->len to determine whether to continue onto the next mapping.
* Explicitly terminate on error status or if the current iter has not
* advanced at all (i.e. no work was done for some reason) unless the
* mapping has been marked stale and needs to be reprocessed.
*/
if (iter->status < 0)
ret = iter->status;
else if (iter->len == 0 || (!advanced && !stale))
ret = 0;
else
ret = 1;
iomap_iter_reset_iomap(iter);
if (ret <= 0)
return ret;
begin:
ret = ops->iomap_begin(iter->inode, iter->pos, iter->len, iter->flags,
&iter->iomap, &iter->srcmap);
if (ret < 0)
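
The conversion pattern applied throughout this series can be summarised with a hedged sketch: the per-mapping helper now returns an errno-style status and advances the iterator itself, instead of returning a byte count in iter.processed. example_iter() and example_op() are invented names standing in for any of the real helpers:

static int example_iter(struct iomap_iter *iter)
{
	u64 count = iomap_length(iter);
	int ret;

	ret = example_op(iter, &count);	/* may trim count to what it handled */
	if (ret)
		return ret;
	return iomap_iter_advance(iter, &count);
}

static int example_apply(struct inode *inode, loff_t pos, loff_t len,
		const struct iomap_ops *ops)
{
	struct iomap_iter iter = {
		.inode	= inode,
		.pos	= pos,
		.len	= len,
	};
	int ret;

	while ((ret = iomap_iter(&iter, ops)) > 0)
		iter.status = example_iter(&iter);
	return ret;
}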

fs/iomap/seek.c

@@ -10,7 +10,7 @@
#include <linux/pagemap.h>
#include <linux/pagevec.h>
static loff_t iomap_seek_hole_iter(const struct iomap_iter *iter,
static int iomap_seek_hole_iter(struct iomap_iter *iter,
loff_t *hole_pos)
{
loff_t length = iomap_length(iter);
@@ -20,13 +20,13 @@ static loff_t iomap_seek_hole_iter(const struct iomap_iter *iter,
*hole_pos = mapping_seek_hole_data(iter->inode->i_mapping,
iter->pos, iter->pos + length, SEEK_HOLE);
if (*hole_pos == iter->pos + length)
return length;
return iomap_iter_advance(iter, &length);
return 0;
case IOMAP_HOLE:
*hole_pos = iter->pos;
return 0;
default:
return length;
return iomap_iter_advance(iter, &length);
}
}
@@ -47,7 +47,7 @@ iomap_seek_hole(struct inode *inode, loff_t pos, const struct iomap_ops *ops)
iter.len = size - pos;
while ((ret = iomap_iter(&iter, ops)) > 0)
iter.processed = iomap_seek_hole_iter(&iter, &pos);
iter.status = iomap_seek_hole_iter(&iter, &pos);
if (ret < 0)
return ret;
if (iter.len) /* found hole before EOF */
@@ -56,19 +56,19 @@ iomap_seek_hole(struct inode *inode, loff_t pos, const struct iomap_ops *ops)
}
EXPORT_SYMBOL_GPL(iomap_seek_hole);
static loff_t iomap_seek_data_iter(const struct iomap_iter *iter,
static int iomap_seek_data_iter(struct iomap_iter *iter,
loff_t *hole_pos)
{
loff_t length = iomap_length(iter);
switch (iter->iomap.type) {
case IOMAP_HOLE:
return length;
return iomap_iter_advance(iter, &length);
case IOMAP_UNWRITTEN:
*hole_pos = mapping_seek_hole_data(iter->inode->i_mapping,
iter->pos, iter->pos + length, SEEK_DATA);
if (*hole_pos < 0)
return length;
return iomap_iter_advance(iter, &length);
return 0;
default:
*hole_pos = iter->pos;
@@ -93,7 +93,7 @@ iomap_seek_data(struct inode *inode, loff_t pos, const struct iomap_ops *ops)
iter.len = size - pos;
while ((ret = iomap_iter(&iter, ops)) > 0)
iter.processed = iomap_seek_data_iter(&iter, &pos);
iter.status = iomap_seek_data_iter(&iter, &pos);
if (ret < 0)
return ret;
if (iter.len) /* found data before EOF */

View File

@@ -94,7 +94,7 @@ static int iomap_swapfile_fail(struct iomap_swapfile_info *isi, const char *str)
* swap only cares about contiguous page-aligned physical extents and makes no
* distinction between written and unwritten extents.
*/
static loff_t iomap_swapfile_iter(const struct iomap_iter *iter,
static int iomap_swapfile_iter(struct iomap_iter *iter,
struct iomap *iomap, struct iomap_swapfile_info *isi)
{
switch (iomap->type) {
@@ -132,7 +132,8 @@ static loff_t iomap_swapfile_iter(const struct iomap_iter *iter,
return error;
memcpy(&isi->iomap, iomap, sizeof(isi->iomap));
}
return iomap_length(iter);
return iomap_iter_advance_full(iter);
}
/*
@@ -166,7 +167,7 @@ int iomap_swapfile_activate(struct swap_info_struct *sis,
return ret;
while ((ret = iomap_iter(&iter, ops)) > 0)
iter.processed = iomap_swapfile_iter(&iter, &iter.iomap, &isi);
iter.status = iomap_swapfile_iter(&iter, &iter.iomap, &isi);
if (ret < 0)
return ret;

View File

@@ -99,7 +99,7 @@ DEFINE_RANGE_EVENT(iomap_dio_rw_queued);
{ IOMAP_FAULT, "FAULT" }, \
{ IOMAP_DIRECT, "DIRECT" }, \
{ IOMAP_NOWAIT, "NOWAIT" }, \
{ IOMAP_ATOMIC, "ATOMIC" }
{ IOMAP_ATOMIC_HW, "ATOMIC_HW" }
#define IOMAP_F_FLAGS_STRINGS \
{ IOMAP_F_NEW, "NEW" }, \
@@ -207,7 +207,7 @@ TRACE_EVENT(iomap_iter,
__field(u64, ino)
__field(loff_t, pos)
__field(u64, length)
__field(s64, processed)
__field(int, status)
__field(unsigned int, flags)
__field(const void *, ops)
__field(unsigned long, caller)
@@ -217,17 +217,17 @@ TRACE_EVENT(iomap_iter,
__entry->ino = iter->inode->i_ino;
__entry->pos = iter->pos;
__entry->length = iomap_length(iter);
__entry->processed = iter->processed;
__entry->status = iter->status;
__entry->flags = iter->flags;
__entry->ops = ops;
__entry->caller = caller;
),
TP_printk("dev %d:%d ino 0x%llx pos 0x%llx length 0x%llx processed %lld flags %s (0x%x) ops %ps caller %pS",
TP_printk("dev %d:%d ino 0x%llx pos 0x%llx length 0x%llx status %d flags %s (0x%x) ops %ps caller %pS",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__entry->pos,
__entry->length,
__entry->processed,
__entry->status,
__print_flags(__entry->flags, "|", IOMAP_FLAGS_STRINGS),
__entry->flags,
__entry->ops,

View File

@@ -64,6 +64,7 @@ xfs-y += $(addprefix libxfs/, \
xfs-$(CONFIG_XFS_RT) += $(addprefix libxfs/, \
xfs_rtbitmap.o \
xfs_rtgroup.o \
xfs_zones.o \
)
# highlevel code
@@ -136,7 +137,11 @@ xfs-$(CONFIG_XFS_QUOTA) += xfs_dquot.o \
xfs_quotaops.o
# xfs_rtbitmap is shared with libxfs
xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o
xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o \
xfs_zone_alloc.o \
xfs_zone_gc.o \
xfs_zone_info.o \
xfs_zone_space_resv.o
xfs-$(CONFIG_XFS_POSIX_ACL) += xfs_acl.o
xfs-$(CONFIG_SYSCTL) += xfs_sysctl.o

View File

@@ -34,13 +34,13 @@
#include "xfs_ag.h"
#include "xfs_ag_resv.h"
#include "xfs_refcount.h"
#include "xfs_icache.h"
#include "xfs_iomap.h"
#include "xfs_health.h"
#include "xfs_bmap_item.h"
#include "xfs_symlink_remote.h"
#include "xfs_inode_util.h"
#include "xfs_rtgroup.h"
#include "xfs_zone_alloc.h"
struct kmem_cache *xfs_bmap_intent_cache;
@@ -171,18 +171,16 @@ xfs_bmbt_update(
* Compute the worst-case number of indirect blocks that will be used
* for ip's delayed extent of length "len".
*/
STATIC xfs_filblks_t
xfs_filblks_t
xfs_bmap_worst_indlen(
xfs_inode_t *ip, /* incore inode pointer */
xfs_filblks_t len) /* delayed extent length */
struct xfs_inode *ip, /* incore inode pointer */
xfs_filblks_t len) /* delayed extent length */
{
int level; /* btree level number */
int maxrecs; /* maximum record count at this level */
xfs_mount_t *mp; /* mount structure */
xfs_filblks_t rval; /* return value */
struct xfs_mount *mp = ip->i_mount;
int maxrecs = mp->m_bmap_dmxr[0];
int level;
xfs_filblks_t rval;
mp = ip->i_mount;
maxrecs = mp->m_bmap_dmxr[0];
for (level = 0, rval = 0;
level < XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK);
level++) {
@@ -2571,146 +2569,6 @@ xfs_bmap_add_extent_unwritten_real(
#undef PREV
}
/*
* Convert a hole to a delayed allocation.
*/
STATIC void
xfs_bmap_add_extent_hole_delay(
xfs_inode_t *ip, /* incore inode pointer */
int whichfork,
struct xfs_iext_cursor *icur,
xfs_bmbt_irec_t *new) /* new data to add to file extents */
{
struct xfs_ifork *ifp; /* inode fork pointer */
xfs_bmbt_irec_t left; /* left neighbor extent entry */
xfs_filblks_t newlen=0; /* new indirect size */
xfs_filblks_t oldlen=0; /* old indirect size */
xfs_bmbt_irec_t right; /* right neighbor extent entry */
uint32_t state = xfs_bmap_fork_to_state(whichfork);
xfs_filblks_t temp; /* temp for indirect calculations */
ifp = xfs_ifork_ptr(ip, whichfork);
ASSERT(isnullstartblock(new->br_startblock));
/*
* Check and set flags if this segment has a left neighbor
*/
if (xfs_iext_peek_prev_extent(ifp, icur, &left)) {
state |= BMAP_LEFT_VALID;
if (isnullstartblock(left.br_startblock))
state |= BMAP_LEFT_DELAY;
}
/*
* Check and set flags if the current (right) segment exists.
* If it doesn't exist, we're converting the hole at end-of-file.
*/
if (xfs_iext_get_extent(ifp, icur, &right)) {
state |= BMAP_RIGHT_VALID;
if (isnullstartblock(right.br_startblock))
state |= BMAP_RIGHT_DELAY;
}
/*
* Set contiguity flags on the left and right neighbors.
* Don't let extents get too large, even if the pieces are contiguous.
*/
if ((state & BMAP_LEFT_VALID) && (state & BMAP_LEFT_DELAY) &&
left.br_startoff + left.br_blockcount == new->br_startoff &&
left.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN)
state |= BMAP_LEFT_CONTIG;
if ((state & BMAP_RIGHT_VALID) && (state & BMAP_RIGHT_DELAY) &&
new->br_startoff + new->br_blockcount == right.br_startoff &&
new->br_blockcount + right.br_blockcount <= XFS_MAX_BMBT_EXTLEN &&
(!(state & BMAP_LEFT_CONTIG) ||
(left.br_blockcount + new->br_blockcount +
right.br_blockcount <= XFS_MAX_BMBT_EXTLEN)))
state |= BMAP_RIGHT_CONTIG;
/*
* Switch out based on the contiguity flags.
*/
switch (state & (BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG)) {
case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
/*
* New allocation is contiguous with delayed allocations
* on the left and on the right.
* Merge all three into a single extent record.
*/
temp = left.br_blockcount + new->br_blockcount +
right.br_blockcount;
oldlen = startblockval(left.br_startblock) +
startblockval(new->br_startblock) +
startblockval(right.br_startblock);
newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
oldlen);
left.br_startblock = nullstartblock(newlen);
left.br_blockcount = temp;
xfs_iext_remove(ip, icur, state);
xfs_iext_prev(ifp, icur);
xfs_iext_update_extent(ip, state, icur, &left);
break;
case BMAP_LEFT_CONTIG:
/*
* New allocation is contiguous with a delayed allocation
* on the left.
* Merge the new allocation with the left neighbor.
*/
temp = left.br_blockcount + new->br_blockcount;
oldlen = startblockval(left.br_startblock) +
startblockval(new->br_startblock);
newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
oldlen);
left.br_blockcount = temp;
left.br_startblock = nullstartblock(newlen);
xfs_iext_prev(ifp, icur);
xfs_iext_update_extent(ip, state, icur, &left);
break;
case BMAP_RIGHT_CONTIG:
/*
* New allocation is contiguous with a delayed allocation
* on the right.
* Merge the new allocation with the right neighbor.
*/
temp = new->br_blockcount + right.br_blockcount;
oldlen = startblockval(new->br_startblock) +
startblockval(right.br_startblock);
newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
oldlen);
right.br_startoff = new->br_startoff;
right.br_startblock = nullstartblock(newlen);
right.br_blockcount = temp;
xfs_iext_update_extent(ip, state, icur, &right);
break;
case 0:
/*
* New allocation is not contiguous with another
* delayed allocation.
* Insert a new entry.
*/
oldlen = newlen = 0;
xfs_iext_insert(ip, icur, new, state);
break;
}
if (oldlen != newlen) {
ASSERT(oldlen > newlen);
xfs_add_fdblocks(ip->i_mount, oldlen - newlen);
/*
* Nothing to do for disk quota accounting here.
*/
xfs_mod_delalloc(ip, 0, (int64_t)newlen - oldlen);
}
}
/*
* Convert a hole to a real allocation.
*/
@@ -4039,144 +3897,6 @@ xfs_bmapi_read(
return 0;
}
/*
* Add a delayed allocation extent to an inode. Blocks are reserved from the
* global pool and the extent inserted into the inode in-core extent tree.
*
* On entry, got refers to the first extent beyond the offset of the extent to
* allocate or eof is specified if no such extent exists. On return, got refers
* to the extent record that was inserted to the inode fork.
*
* Note that the allocated extent may have been merged with contiguous extents
* during insertion into the inode fork. Thus, got does not reflect the current
* state of the inode fork on return. If necessary, the caller can use lastx to
* look up the updated record in the inode fork.
*/
int
xfs_bmapi_reserve_delalloc(
struct xfs_inode *ip,
int whichfork,
xfs_fileoff_t off,
xfs_filblks_t len,
xfs_filblks_t prealloc,
struct xfs_bmbt_irec *got,
struct xfs_iext_cursor *icur,
int eof)
{
struct xfs_mount *mp = ip->i_mount;
struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
xfs_extlen_t alen;
xfs_extlen_t indlen;
uint64_t fdblocks;
int error;
xfs_fileoff_t aoff;
bool use_cowextszhint =
whichfork == XFS_COW_FORK && !prealloc;
retry:
/*
* Cap the alloc length. Keep track of prealloc so we know whether to
* tag the inode before we return.
*/
aoff = off;
alen = XFS_FILBLKS_MIN(len + prealloc, XFS_MAX_BMBT_EXTLEN);
if (!eof)
alen = XFS_FILBLKS_MIN(alen, got->br_startoff - aoff);
if (prealloc && alen >= len)
prealloc = alen - len;
/*
* If we're targeting the COW fork but aren't creating a speculative
* posteof preallocation, try to expand the reservation to align with
* the COW extent size hint if there's sufficient free space.
*
* Unlike the data fork, the CoW cancellation functions will free all
* the reservations at inactivation, so we don't require that every
* delalloc reservation have a dirty pagecache.
*/
if (use_cowextszhint) {
struct xfs_bmbt_irec prev;
xfs_extlen_t extsz = xfs_get_cowextsz_hint(ip);
if (!xfs_iext_peek_prev_extent(ifp, icur, &prev))
prev.br_startoff = NULLFILEOFF;
error = xfs_bmap_extsize_align(mp, got, &prev, extsz, 0, eof,
1, 0, &aoff, &alen);
ASSERT(!error);
}
/*
* Make a transaction-less quota reservation for delayed allocation
* blocks. This number gets adjusted later. We return if we haven't
* allocated blocks already inside this loop.
*/
error = xfs_quota_reserve_blkres(ip, alen);
if (error)
goto out;
/*
* Split changing sb for alen and indlen since they could be coming
* from different places.
*/
indlen = (xfs_extlen_t)xfs_bmap_worst_indlen(ip, alen);
ASSERT(indlen > 0);
fdblocks = indlen;
if (XFS_IS_REALTIME_INODE(ip)) {
error = xfs_dec_frextents(mp, xfs_blen_to_rtbxlen(mp, alen));
if (error)
goto out_unreserve_quota;
} else {
fdblocks += alen;
}
error = xfs_dec_fdblocks(mp, fdblocks, false);
if (error)
goto out_unreserve_frextents;
ip->i_delayed_blks += alen;
xfs_mod_delalloc(ip, alen, indlen);
got->br_startoff = aoff;
got->br_startblock = nullstartblock(indlen);
got->br_blockcount = alen;
got->br_state = XFS_EXT_NORM;
xfs_bmap_add_extent_hole_delay(ip, whichfork, icur, got);
/*
* Tag the inode if blocks were preallocated. Note that COW fork
* preallocation can occur at the start or end of the extent, even when
* prealloc == 0, so we must also check the aligned offset and length.
*/
if (whichfork == XFS_DATA_FORK && prealloc)
xfs_inode_set_eofblocks_tag(ip);
if (whichfork == XFS_COW_FORK && (prealloc || aoff < off || alen > len))
xfs_inode_set_cowblocks_tag(ip);
return 0;
out_unreserve_frextents:
if (XFS_IS_REALTIME_INODE(ip))
xfs_add_frextents(mp, xfs_blen_to_rtbxlen(mp, alen));
out_unreserve_quota:
if (XFS_IS_QUOTA_ON(mp))
xfs_quota_unreserve_blkres(ip, alen);
out:
if (error == -ENOSPC || error == -EDQUOT) {
trace_xfs_delalloc_enospc(ip, off, len);
if (prealloc || use_cowextszhint) {
/* retry without any preallocation */
use_cowextszhint = false;
prealloc = 0;
goto retry;
}
}
return error;
}
static int
xfs_bmapi_allocate(
struct xfs_bmalloca *bma)
@@ -4948,7 +4668,8 @@ xfs_bmap_del_extent_delay(
int whichfork,
struct xfs_iext_cursor *icur,
struct xfs_bmbt_irec *got,
struct xfs_bmbt_irec *del)
struct xfs_bmbt_irec *del,
uint32_t bflags) /* bmapi flags */
{
struct xfs_mount *mp = ip->i_mount;
struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
@@ -5068,10 +4789,18 @@ xfs_bmap_del_extent_delay(
da_diff = da_old - da_new;
fdblocks = da_diff;
if (isrt)
xfs_add_frextents(mp, xfs_blen_to_rtbxlen(mp, del->br_blockcount));
else
if (bflags & XFS_BMAPI_REMAP) {
;
} else if (isrt) {
xfs_rtbxlen_t rtxlen;
rtxlen = xfs_blen_to_rtbxlen(mp, del->br_blockcount);
if (xfs_is_zoned_inode(ip))
xfs_zoned_add_available(mp, rtxlen);
xfs_add_frextents(mp, rtxlen);
} else {
fdblocks += del->br_blockcount;
}
xfs_add_fdblocks(mp, fdblocks);
xfs_mod_delalloc(ip, -(int64_t)del->br_blockcount, -da_diff);
@@ -5670,7 +5399,8 @@ __xfs_bunmapi(
delete:
if (wasdel) {
xfs_bmap_del_extent_delay(ip, whichfork, &icur, &got, &del);
xfs_bmap_del_extent_delay(ip, whichfork, &icur, &got,
&del, flags);
} else {
error = xfs_bmap_del_extent_real(ip, tp, &icur, cur,
&del, &tmp_logflags, whichfork,

View File

@@ -204,7 +204,7 @@ int xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip,
xfs_extnum_t nexts, int *done);
void xfs_bmap_del_extent_delay(struct xfs_inode *ip, int whichfork,
struct xfs_iext_cursor *cur, struct xfs_bmbt_irec *got,
struct xfs_bmbt_irec *del);
struct xfs_bmbt_irec *del, uint32_t bflags);
void xfs_bmap_del_extent_cow(struct xfs_inode *ip,
struct xfs_iext_cursor *cur, struct xfs_bmbt_irec *got,
struct xfs_bmbt_irec *del);
@@ -219,10 +219,6 @@ int xfs_bmap_insert_extents(struct xfs_trans *tp, struct xfs_inode *ip,
bool *done, xfs_fileoff_t stop_fsb);
int xfs_bmap_split_extent(struct xfs_trans *tp, struct xfs_inode *ip,
xfs_fileoff_t split_offset);
int xfs_bmapi_reserve_delalloc(struct xfs_inode *ip, int whichfork,
xfs_fileoff_t off, xfs_filblks_t len, xfs_filblks_t prealloc,
struct xfs_bmbt_irec *got, struct xfs_iext_cursor *cur,
int eof);
int xfs_bmapi_convert_delalloc(struct xfs_inode *ip, int whichfork,
xfs_off_t offset, struct iomap *iomap, unsigned int *seq);
int xfs_bmap_add_extent_unwritten_real(struct xfs_trans *tp,
@@ -233,6 +229,7 @@ xfs_extlen_t xfs_bmapi_minleft(struct xfs_trans *tp, struct xfs_inode *ip,
int fork);
int xfs_bmap_btalloc_low_space(struct xfs_bmalloca *ap,
struct xfs_alloc_arg *args);
xfs_filblks_t xfs_bmap_worst_indlen(struct xfs_inode *ip, xfs_filblks_t len);
enum xfs_bmap_intent_type {
XFS_BMAP_MAP = 1,

View File

@@ -178,9 +178,10 @@ typedef struct xfs_sb {
xfs_rgnumber_t sb_rgcount; /* number of realtime groups */
xfs_rtxlen_t sb_rgextents; /* size of a realtime group in rtx */
uint8_t sb_rgblklog; /* rt group number shift */
uint8_t sb_pad[7]; /* zeroes */
xfs_rfsblock_t sb_rtstart; /* start of internal RT section (FSB) */
xfs_filblks_t sb_rtreserved; /* reserved (zoned) RT blocks */
/* must be padded to 64 bit alignment */
} xfs_sb_t;
@@ -270,9 +271,10 @@ struct xfs_dsb {
__be64 sb_metadirino; /* metadata directory tree root */
__be32 sb_rgcount; /* # of realtime groups */
__be32 sb_rgextents; /* size of rtgroup in rtx */
__u8 sb_rgblklog; /* rt group number shift */
__u8 sb_pad[7]; /* zeroes */
__be64 sb_rtstart; /* start of internal RT section (FSB) */
__be64 sb_rtreserved; /* reserved (zoned) RT blocks */
/*
* The size of this structure must be padded to 64 bit alignment.
@@ -395,6 +397,9 @@ xfs_sb_has_ro_compat_feature(
#define XFS_SB_FEAT_INCOMPAT_EXCHRANGE (1 << 6) /* exchangerange supported */
#define XFS_SB_FEAT_INCOMPAT_PARENT (1 << 7) /* parent pointers */
#define XFS_SB_FEAT_INCOMPAT_METADIR (1 << 8) /* metadata dir tree */
#define XFS_SB_FEAT_INCOMPAT_ZONED (1 << 9) /* zoned RT allocator */
#define XFS_SB_FEAT_INCOMPAT_ZONE_GAPS (1 << 10) /* RTGs have LBA gaps */
#define XFS_SB_FEAT_INCOMPAT_ALL \
(XFS_SB_FEAT_INCOMPAT_FTYPE | \
XFS_SB_FEAT_INCOMPAT_SPINODES | \
@@ -404,7 +409,9 @@ xfs_sb_has_ro_compat_feature(
XFS_SB_FEAT_INCOMPAT_NREXT64 | \
XFS_SB_FEAT_INCOMPAT_EXCHRANGE | \
XFS_SB_FEAT_INCOMPAT_PARENT | \
XFS_SB_FEAT_INCOMPAT_METADIR)
XFS_SB_FEAT_INCOMPAT_METADIR | \
XFS_SB_FEAT_INCOMPAT_ZONED | \
XFS_SB_FEAT_INCOMPAT_ZONE_GAPS)
#define XFS_SB_FEAT_INCOMPAT_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_ALL
static inline bool
@@ -952,7 +959,12 @@ struct xfs_dinode {
__be64 di_changecount; /* number of attribute changes */
__be64 di_lsn; /* flush sequence */
__be64 di_flags2; /* more random flags */
__be32 di_cowextsize; /* basic cow extent size for file */
union {
/* basic cow extent size for (regular) file */
__be32 di_cowextsize;
/* used blocks in RTG for (zoned) rtrmap inode */
__be32 di_used_blocks;
};
__u8 di_pad2[12]; /* more padding for future expansion */
/* fields only written to during inode creation */

View File

@@ -189,7 +189,9 @@ struct xfs_fsop_geom {
uint32_t checked; /* o: checked fs & rt metadata */
__u32 rgextents; /* rt extents in a realtime group */
__u32 rgcount; /* number of realtime groups */
__u64 reserved[16]; /* reserved space */
__u64 rtstart; /* start of internal rt section */
__u64 rtreserved; /* RT (zoned) reserved blocks */
__u64 reserved[14]; /* reserved space */
};
#define XFS_FSOP_GEOM_SICK_COUNTERS (1 << 0) /* summary counters */
@@ -247,6 +249,7 @@ typedef struct xfs_fsop_resblks {
#define XFS_FSOP_GEOM_FLAGS_EXCHANGE_RANGE (1 << 24) /* exchange range */
#define XFS_FSOP_GEOM_FLAGS_PARENT (1 << 25) /* linux parent pointers */
#define XFS_FSOP_GEOM_FLAGS_METADIR (1 << 26) /* metadata directories */
#define XFS_FSOP_GEOM_FLAGS_ZONED (1 << 27) /* zoned rt device */
/*
* Minimum and maximum sizes need for growth checks.
@@ -1079,6 +1082,15 @@ struct xfs_rtgroup_geometry {
#define XFS_IOC_COMMIT_RANGE _IOW ('X', 131, struct xfs_commit_range)
/* XFS_IOC_GETFSUUID ---------- deprecated 140 */
/*
* Devices supported by a single XFS file system. Reported in fsmaps fmr_device
* when using internal RT devices.
*/
enum xfs_device {
XFS_DEV_DATA = 1,
XFS_DEV_LOG = 2,
XFS_DEV_RT = 3,
};
#ifndef HAVE_BBMACROS
/*

View File

@@ -19,10 +19,23 @@ struct xfs_group {
#ifdef __KERNEL__
/* -- kernel only structures below this line -- */
/*
* Track freed but not yet committed extents.
*/
struct xfs_extent_busy_tree *xg_busy_extents;
union {
/*
* For perags and non-zoned RT groups:
* Track freed but not yet committed extents.
*/
struct xfs_extent_busy_tree *xg_busy_extents;
/*
* For zoned RT groups:
* List of groups that need a zone reset.
*
* The zonegc code forces a log flush of the rtrmap inode before
* resetting the write pointer, so there is no need for
* individual busy extent tracking.
*/
struct xfs_group *xg_next_reset;
};
/*
* Bitsets of per-ag metadata that have been checked and/or are sick.
@@ -107,9 +120,15 @@ xfs_gbno_to_daddr(
xfs_agblock_t gbno)
{
struct xfs_mount *mp = xg->xg_mount;
uint32_t blocks = mp->m_groups[xg->xg_type].blocks;
struct xfs_groups *g = &mp->m_groups[xg->xg_type];
xfs_fsblock_t fsbno;
return XFS_FSB_TO_BB(mp, (xfs_fsblock_t)xg->xg_gno * blocks + gbno);
if (g->has_daddr_gaps)
fsbno = xfs_gbno_to_fsb(xg, gbno);
else
fsbno = (xfs_fsblock_t)xg->xg_gno * g->blocks + gbno;
return XFS_FSB_TO_BB(mp, g->start_fsb + fsbno);
}
static inline uint32_t

View File

@@ -1927,7 +1927,7 @@ xfs_dialloc(
* that we can immediately allocate, but then we allow allocation on the
* second pass if we fail to find an AG with free inodes in it.
*/
if (percpu_counter_read_positive(&mp->m_fdblocks) <
if (xfs_estimate_freecounter(mp, XC_FREE_BLOCKS) <
mp->m_low_space[XFS_LOWSP_1_PCNT]) {
ok_alloc = false;
low_space = true;

View File

@@ -252,7 +252,10 @@ xfs_inode_from_disk(
be64_to_cpu(from->di_changecount));
ip->i_crtime = xfs_inode_from_disk_ts(from, from->di_crtime);
ip->i_diflags2 = be64_to_cpu(from->di_flags2);
/* also covers the di_used_blocks union arm: */
ip->i_cowextsize = be32_to_cpu(from->di_cowextsize);
BUILD_BUG_ON(sizeof(from->di_cowextsize) !=
sizeof(from->di_used_blocks));
}
error = xfs_iformat_data_fork(ip, from);
@@ -349,6 +352,7 @@ xfs_inode_to_disk(
to->di_changecount = cpu_to_be64(inode_peek_iversion(inode));
to->di_crtime = xfs_inode_to_disk_ts(ip, ip->i_crtime);
to->di_flags2 = cpu_to_be64(ip->i_diflags2);
/* also covers the di_used_blocks union arm: */
to->di_cowextsize = cpu_to_be32(ip->i_cowextsize);
to->di_ino = cpu_to_be64(ip->i_ino);
to->di_lsn = cpu_to_be64(lsn);
@@ -752,11 +756,18 @@ xfs_dinode_verify(
!xfs_has_rtreflink(mp))
return __this_address;
/* COW extent size hint validation */
fa = xfs_inode_validate_cowextsize(mp, be32_to_cpu(dip->di_cowextsize),
mode, flags, flags2);
if (fa)
return fa;
if (xfs_has_zoned(mp) &&
dip->di_metatype == cpu_to_be16(XFS_METAFILE_RTRMAP)) {
if (be32_to_cpu(dip->di_used_blocks) > mp->m_sb.sb_rgextents)
return __this_address;
} else {
/* COW extent size hint validation */
fa = xfs_inode_validate_cowextsize(mp,
be32_to_cpu(dip->di_cowextsize),
mode, flags, flags2);
if (fa)
return fa;
}
/* bigtime iflag can only happen on bigtime filesystems */
if (xfs_dinode_has_bigtime(dip) &&

View File

@@ -322,6 +322,7 @@ xfs_inode_init(
if (xfs_has_v3inodes(mp)) {
inode_set_iversion(inode, 1);
/* also covers the di_used_blocks union arm: */
ip->i_cowextsize = 0;
times |= XFS_ICHGTIME_CREATE;
}

View File

@@ -475,7 +475,12 @@ struct xfs_log_dinode {
xfs_lsn_t di_lsn;
uint64_t di_flags2; /* more random flags */
uint32_t di_cowextsize; /* basic cow extent size for file */
union {
/* basic cow extent size for (regular) file */
uint32_t di_cowextsize;
/* used blocks in RTG for (zoned) rtrmap inode */
uint32_t di_used_blocks;
};
uint8_t di_pad2[12]; /* more padding for future expansion */
/* fields only written to during inode creation */

View File

@@ -21,6 +21,9 @@
#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_alloc.h"
#include "xfs_rtgroup.h"
#include "xfs_rtrmap_btree.h"
#include "xfs_rtrefcount_btree.h"
static const struct {
enum xfs_metafile_type mtype;
@@ -74,12 +77,11 @@ xfs_metafile_clear_iflag(
}
/*
* Is the amount of space that could be allocated towards a given metadata
* file at or beneath a certain threshold?
* Is the metafile reservation at or beneath a certain threshold?
*/
static inline bool
xfs_metafile_resv_can_cover(
struct xfs_inode *ip,
struct xfs_mount *mp,
int64_t rhs)
{
/*
@@ -88,43 +90,38 @@ xfs_metafile_resv_can_cover(
* global free block count. Take care of the first case to avoid
* touching the per-cpu counter.
*/
if (ip->i_delayed_blks >= rhs)
if (mp->m_metafile_resv_avail >= rhs)
return true;
/*
* There aren't enough blocks left in the inode's reservation, but it
* isn't critical unless there also isn't enough free space.
*/
return __percpu_counter_compare(&ip->i_mount->m_fdblocks,
rhs - ip->i_delayed_blks, 2048) >= 0;
return xfs_compare_freecounter(mp, XC_FREE_BLOCKS,
rhs - mp->m_metafile_resv_avail, 2048) >= 0;
}
/*
* Is this metadata file critically low on blocks? For now we'll define that
* as the number of blocks we can get our hands on being less than 10% of what
* we reserved or less than some arbitrary number (maximum btree height).
* Is the metafile reservation critically low on blocks? For now we'll define
* that as the number of blocks we can get our hands on being less than 10% of
* what we reserved or less than some arbitrary number (maximum btree height).
*/
bool
xfs_metafile_resv_critical(
struct xfs_inode *ip)
struct xfs_mount *mp)
{
uint64_t asked_low_water;
ASSERT(xfs_has_metadir(mp));
if (!ip)
return false;
trace_xfs_metafile_resv_critical(mp, 0);
ASSERT(xfs_is_metadir_inode(ip));
trace_xfs_metafile_resv_critical(ip, 0);
if (!xfs_metafile_resv_can_cover(ip, ip->i_mount->m_rtbtree_maxlevels))
if (!xfs_metafile_resv_can_cover(mp, mp->m_rtbtree_maxlevels))
return true;
asked_low_water = div_u64(ip->i_meta_resv_asked, 10);
if (!xfs_metafile_resv_can_cover(ip, asked_low_water))
if (!xfs_metafile_resv_can_cover(mp,
div_u64(mp->m_metafile_resv_target, 10)))
return true;
return XFS_TEST_ERROR(false, ip->i_mount,
XFS_ERRTAG_METAFILE_RESV_CRITICAL);
return XFS_TEST_ERROR(false, mp, XFS_ERRTAG_METAFILE_RESV_CRITICAL);
}
/* Allocate a block from the metadata file's reservation. */
@@ -133,22 +130,24 @@ xfs_metafile_resv_alloc_space(
struct xfs_inode *ip,
struct xfs_alloc_arg *args)
{
struct xfs_mount *mp = ip->i_mount;
int64_t len = args->len;
ASSERT(xfs_is_metadir_inode(ip));
ASSERT(args->resv == XFS_AG_RESV_METAFILE);
trace_xfs_metafile_resv_alloc_space(ip, args->len);
trace_xfs_metafile_resv_alloc_space(mp, args->len);
/*
* Allocate the blocks from the metadata inode's block reservation
* and update the ondisk sb counter.
*/
if (ip->i_delayed_blks > 0) {
mutex_lock(&mp->m_metafile_resv_lock);
if (mp->m_metafile_resv_avail > 0) {
int64_t from_resv;
from_resv = min_t(int64_t, len, ip->i_delayed_blks);
ip->i_delayed_blks -= from_resv;
from_resv = min_t(int64_t, len, mp->m_metafile_resv_avail);
mp->m_metafile_resv_avail -= from_resv;
xfs_mod_delalloc(ip, 0, -from_resv);
xfs_trans_mod_sb(args->tp, XFS_TRANS_SB_RES_FDBLOCKS,
-from_resv);
@@ -175,6 +174,9 @@ xfs_metafile_resv_alloc_space(
xfs_trans_mod_sb(args->tp, field, -len);
}
mp->m_metafile_resv_used += args->len;
mutex_unlock(&mp->m_metafile_resv_lock);
ip->i_nblocks += args->len;
xfs_trans_log_inode(args->tp, ip, XFS_ILOG_CORE);
}
@@ -186,26 +188,33 @@ xfs_metafile_resv_free_space(
struct xfs_trans *tp,
xfs_filblks_t len)
{
struct xfs_mount *mp = ip->i_mount;
int64_t to_resv;
ASSERT(xfs_is_metadir_inode(ip));
trace_xfs_metafile_resv_free_space(ip, len);
trace_xfs_metafile_resv_free_space(mp, len);
ip->i_nblocks -= len;
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
mutex_lock(&mp->m_metafile_resv_lock);
mp->m_metafile_resv_used -= len;
/*
* Add the freed blocks back into the inode's delalloc reservation
* until it reaches the maximum size. Update the ondisk fdblocks only.
*/
to_resv = ip->i_meta_resv_asked - (ip->i_nblocks + ip->i_delayed_blks);
to_resv = mp->m_metafile_resv_target -
(mp->m_metafile_resv_used + mp->m_metafile_resv_avail);
if (to_resv > 0) {
to_resv = min_t(int64_t, to_resv, len);
ip->i_delayed_blks += to_resv;
mp->m_metafile_resv_avail += to_resv;
xfs_mod_delalloc(ip, 0, to_resv);
xfs_trans_mod_sb(tp, XFS_TRANS_SB_RES_FDBLOCKS, to_resv);
len -= to_resv;
}
mutex_unlock(&mp->m_metafile_resv_lock);
/*
* Everything else goes back to the filesystem, so update the in-core
@@ -215,61 +224,99 @@ xfs_metafile_resv_free_space(
xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, len);
}
/* Release a metadata file's space reservation. */
void
xfs_metafile_resv_free(
struct xfs_inode *ip)
static void
__xfs_metafile_resv_free(
struct xfs_mount *mp)
{
/* Non-btree metadata inodes don't need space reservations. */
if (!ip || !ip->i_meta_resv_asked)
return;
ASSERT(xfs_is_metadir_inode(ip));
trace_xfs_metafile_resv_free(ip, 0);
if (ip->i_delayed_blks) {
xfs_mod_delalloc(ip, 0, -ip->i_delayed_blks);
xfs_add_fdblocks(ip->i_mount, ip->i_delayed_blks);
ip->i_delayed_blks = 0;
if (mp->m_metafile_resv_avail) {
xfs_mod_sb_delalloc(mp, -(int64_t)mp->m_metafile_resv_avail);
xfs_add_fdblocks(mp, mp->m_metafile_resv_avail);
}
ip->i_meta_resv_asked = 0;
mp->m_metafile_resv_avail = 0;
mp->m_metafile_resv_used = 0;
mp->m_metafile_resv_target = 0;
}
/* Set up a metadata file's space reservation. */
/* Release unused metafile space reservation. */
void
xfs_metafile_resv_free(
struct xfs_mount *mp)
{
if (!xfs_has_metadir(mp))
return;
trace_xfs_metafile_resv_free(mp, 0);
mutex_lock(&mp->m_metafile_resv_lock);
__xfs_metafile_resv_free(mp);
mutex_unlock(&mp->m_metafile_resv_lock);
}
/* Set up a metafile space reservation. */
int
xfs_metafile_resv_init(
struct xfs_inode *ip,
xfs_filblks_t ask)
struct xfs_mount *mp)
{
struct xfs_rtgroup *rtg = NULL;
xfs_filblks_t used = 0, target = 0;
xfs_filblks_t hidden_space;
xfs_filblks_t used;
int error;
xfs_rfsblock_t dblocks_avail = mp->m_sb.sb_dblocks / 4;
int error = 0;
if (!ip || ip->i_meta_resv_asked > 0)
if (!xfs_has_metadir(mp))
return 0;
ASSERT(xfs_is_metadir_inode(ip));
/*
* Free any previous reservation to have a clean slate.
*/
mutex_lock(&mp->m_metafile_resv_lock);
__xfs_metafile_resv_free(mp);
/*
* Space taken by all other metadata btrees are accounted on-disk as
* Currently the only btree metafiles that require reservations are the
* rtrmap and the rtrefcount. Anything new will have to be added here
* as well.
*/
while ((rtg = xfs_rtgroup_next(mp, rtg))) {
if (xfs_has_rtrmapbt(mp)) {
used += rtg_rmap(rtg)->i_nblocks;
target += xfs_rtrmapbt_calc_reserves(mp);
}
if (xfs_has_rtreflink(mp)) {
used += rtg_refcount(rtg)->i_nblocks;
target += xfs_rtrefcountbt_calc_reserves(mp);
}
}
if (!target)
goto out_unlock;
/*
* Space taken by the per-AG metadata btrees is accounted on-disk as
* used space. We therefore only hide the space that is reserved but
* not used by the trees.
*/
used = ip->i_nblocks;
if (used > ask)
ask = used;
hidden_space = ask - used;
if (used > target)
target = used;
else if (target > dblocks_avail)
target = dblocks_avail;
hidden_space = target - used;
error = xfs_dec_fdblocks(ip->i_mount, hidden_space, true);
error = xfs_dec_fdblocks(mp, hidden_space, true);
if (error) {
trace_xfs_metafile_resv_init_error(ip, error, _RET_IP_);
return error;
trace_xfs_metafile_resv_init_error(mp, 0);
goto out_unlock;
}
xfs_mod_delalloc(ip, 0, hidden_space);
ip->i_delayed_blks = hidden_space;
ip->i_meta_resv_asked = ask;
xfs_mod_sb_delalloc(mp, hidden_space);
trace_xfs_metafile_resv_init(ip, ask);
return 0;
mp->m_metafile_resv_target = target;
mp->m_metafile_resv_used = used;
mp->m_metafile_resv_avail = hidden_space;
trace_xfs_metafile_resv_init(mp, target);
out_unlock:
mutex_unlock(&mp->m_metafile_resv_lock);
return error;
}
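
For illustration only, a sketch (not part of the patch) of the invariant the new per-mount fields maintain: m_metafile_resv_target is the worst-case rtrmap/rtrefcount reserve summed over all rtgroups, m_metafile_resv_used is what those btrees already own on disk, and m_metafile_resv_avail is the remainder hidden from free space, all serialized by m_metafile_resv_lock. The helper name below is hypothetical.

/*
 * How far the reservation is from its target: positive means it is short by
 * that many blocks, negative means excess that repair can give back (the
 * negation of "delta" in xrep_reset_metafile_resv()).
 */
static inline int64_t metafile_resv_shortfall(struct xfs_mount *mp)
{
	return (int64_t)mp->m_metafile_resv_target -
	       (int64_t)(mp->m_metafile_resv_used + mp->m_metafile_resv_avail);
}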

View File

@@ -26,13 +26,13 @@ void xfs_metafile_clear_iflag(struct xfs_trans *tp, struct xfs_inode *ip);
/* Space reservations for metadata inodes. */
struct xfs_alloc_arg;
bool xfs_metafile_resv_critical(struct xfs_inode *ip);
bool xfs_metafile_resv_critical(struct xfs_mount *mp);
void xfs_metafile_resv_alloc_space(struct xfs_inode *ip,
struct xfs_alloc_arg *args);
void xfs_metafile_resv_free_space(struct xfs_inode *ip, struct xfs_trans *tp,
xfs_filblks_t len);
void xfs_metafile_resv_free(struct xfs_inode *ip);
int xfs_metafile_resv_init(struct xfs_inode *ip, xfs_filblks_t ask);
void xfs_metafile_resv_free(struct xfs_mount *mp);
int xfs_metafile_resv_init(struct xfs_mount *mp);
/* Code specific to kernel/userspace; must be provided externally. */

View File

@@ -233,8 +233,8 @@ xfs_check_ondisk_structs(void)
16299260424LL);
/* superblock field checks we got from xfs/122 */
XFS_CHECK_STRUCT_SIZE(struct xfs_dsb, 288);
XFS_CHECK_STRUCT_SIZE(struct xfs_sb, 288);
XFS_CHECK_STRUCT_SIZE(struct xfs_dsb, 304);
XFS_CHECK_STRUCT_SIZE(struct xfs_sb, 304);
XFS_CHECK_SB_OFFSET(sb_magicnum, 0);
XFS_CHECK_SB_OFFSET(sb_blocksize, 4);
XFS_CHECK_SB_OFFSET(sb_dblocks, 8);
@@ -295,6 +295,8 @@ xfs_check_ondisk_structs(void)
XFS_CHECK_SB_OFFSET(sb_rgextents, 276);
XFS_CHECK_SB_OFFSET(sb_rgblklog, 280);
XFS_CHECK_SB_OFFSET(sb_pad, 281);
XFS_CHECK_SB_OFFSET(sb_rtstart, 288);
XFS_CHECK_SB_OFFSET(sb_rtreserved, 296);
}
#endif /* __XFS_ONDISK_H */

View File

@@ -1123,6 +1123,7 @@ xfs_rtfree_blocks(
xfs_extlen_t mod;
int error;
ASSERT(!xfs_has_zoned(mp));
ASSERT(rtlen <= XFS_MAX_BMBT_EXTLEN);
mod = xfs_blen_to_rtxoff(mp, rtlen);
@@ -1174,6 +1175,9 @@ xfs_rtalloc_query_range(
end = min(end, rtg->rtg_extents - 1);
if (xfs_has_zoned(mp))
return -EINVAL;
/* Iterate the bitmap, looking for discrepancies. */
while (start <= end) {
struct xfs_rtalloc_rec rec;
@@ -1268,6 +1272,8 @@ xfs_rtbitmap_blockcount_len(
struct xfs_mount *mp,
xfs_rtbxlen_t rtextents)
{
if (xfs_has_zoned(mp))
return 0;
return howmany_64(rtextents, xfs_rtbitmap_rtx_per_rbmblock(mp));
}
@@ -1308,6 +1314,11 @@ xfs_rtsummary_blockcount(
xfs_rtbxlen_t rextents = xfs_rtbitmap_bitcount(mp);
unsigned long long rsumwords;
if (xfs_has_zoned(mp)) {
*rsumlevels = 0;
return 0;
}
*rsumlevels = xfs_compute_rextslog(rextents) + 1;
rsumwords = xfs_rtbitmap_blockcount_len(mp, rextents) * (*rsumlevels);
return howmany_64(rsumwords, mp->m_blockwsize);

View File

@@ -194,15 +194,17 @@ xfs_rtgroup_lock(
ASSERT(!(rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) ||
!(rtglock_flags & XFS_RTGLOCK_BITMAP));
if (rtglock_flags & XFS_RTGLOCK_BITMAP) {
/*
* Lock both realtime free space metadata inodes for a freespace
* update.
*/
xfs_ilock(rtg_bitmap(rtg), XFS_ILOCK_EXCL);
xfs_ilock(rtg_summary(rtg), XFS_ILOCK_EXCL);
} else if (rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) {
xfs_ilock(rtg_bitmap(rtg), XFS_ILOCK_SHARED);
if (!xfs_has_zoned(rtg_mount(rtg))) {
if (rtglock_flags & XFS_RTGLOCK_BITMAP) {
/*
* Lock both realtime free space metadata inodes for a
* freespace update.
*/
xfs_ilock(rtg_bitmap(rtg), XFS_ILOCK_EXCL);
xfs_ilock(rtg_summary(rtg), XFS_ILOCK_EXCL);
} else if (rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) {
xfs_ilock(rtg_bitmap(rtg), XFS_ILOCK_SHARED);
}
}
if ((rtglock_flags & XFS_RTGLOCK_RMAP) && rtg_rmap(rtg))
@@ -228,11 +230,13 @@ xfs_rtgroup_unlock(
if ((rtglock_flags & XFS_RTGLOCK_RMAP) && rtg_rmap(rtg))
xfs_iunlock(rtg_rmap(rtg), XFS_ILOCK_EXCL);
if (rtglock_flags & XFS_RTGLOCK_BITMAP) {
xfs_iunlock(rtg_summary(rtg), XFS_ILOCK_EXCL);
xfs_iunlock(rtg_bitmap(rtg), XFS_ILOCK_EXCL);
} else if (rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) {
xfs_iunlock(rtg_bitmap(rtg), XFS_ILOCK_SHARED);
if (!xfs_has_zoned(rtg_mount(rtg))) {
if (rtglock_flags & XFS_RTGLOCK_BITMAP) {
xfs_iunlock(rtg_summary(rtg), XFS_ILOCK_EXCL);
xfs_iunlock(rtg_bitmap(rtg), XFS_ILOCK_EXCL);
} else if (rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) {
xfs_iunlock(rtg_bitmap(rtg), XFS_ILOCK_SHARED);
}
}
}
@@ -249,7 +253,8 @@ xfs_rtgroup_trans_join(
ASSERT(!(rtglock_flags & ~XFS_RTGLOCK_ALL_FLAGS));
ASSERT(!(rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED));
if (rtglock_flags & XFS_RTGLOCK_BITMAP) {
if (!xfs_has_zoned(rtg_mount(rtg)) &&
(rtglock_flags & XFS_RTGLOCK_BITMAP)) {
xfs_trans_ijoin(tp, rtg_bitmap(rtg), XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, rtg_summary(rtg), XFS_ILOCK_EXCL);
}
@@ -270,7 +275,7 @@ xfs_rtgroup_get_geometry(
/* Fill out form. */
memset(rgeo, 0, sizeof(*rgeo));
rgeo->rg_number = rtg_rgno(rtg);
rgeo->rg_length = rtg_group(rtg)->xg_block_count;
rgeo->rg_length = rtg_blocks(rtg);
xfs_rtgroup_geom_health(rtg, rgeo);
return 0;
}
@@ -354,6 +359,7 @@ static const struct xfs_rtginode_ops xfs_rtginode_ops[XFS_RTGI_MAX] = {
.sick = XFS_SICK_RG_BITMAP,
.fmt_mask = (1U << XFS_DINODE_FMT_EXTENTS) |
(1U << XFS_DINODE_FMT_BTREE),
.enabled = xfs_has_nonzoned,
.create = xfs_rtbitmap_create,
},
[XFS_RTGI_SUMMARY] = {
@@ -362,6 +368,7 @@ static const struct xfs_rtginode_ops xfs_rtginode_ops[XFS_RTGI_MAX] = {
.sick = XFS_SICK_RG_SUMMARY,
.fmt_mask = (1U << XFS_DINODE_FMT_EXTENTS) |
(1U << XFS_DINODE_FMT_BTREE),
.enabled = xfs_has_nonzoned,
.create = xfs_rtsummary_create,
},
[XFS_RTGI_RMAP] = {

View File

@@ -37,15 +37,33 @@ struct xfs_rtgroup {
xfs_rtxnum_t rtg_extents;
/*
* Cache of rt summary level per bitmap block with the invariant that
* rtg_rsum_cache[bbno] > the maximum i for which rsum[i][bbno] != 0,
* or 0 if rsum[i][bbno] == 0 for all i.
*
* For bitmap based RT devices this points to a cache of rt summary
* level per bitmap block with the invariant that rtg_rsum_cache[bbno]
* > the maximum i for which rsum[i][bbno] != 0, or 0 if
* rsum[i][bbno] == 0 for all i.
* Reads and writes are serialized by the rsumip inode lock.
*
* For zoned RT devices this points to the open zone structure for
* a group that is open for writers, or is NULL.
*/
uint8_t *rtg_rsum_cache;
union {
uint8_t *rtg_rsum_cache;
struct xfs_open_zone *rtg_open_zone;
};
};
/*
* For zoned RT devices this is set on groups that have no written blocks
* and can be picked by the allocator for opening.
*/
#define XFS_RTG_FREE XA_MARK_0
/*
* For zoned RT devices this is set on groups that are fully written and that
* have unused blocks. Used by the garbage collection to pick targets.
*/
#define XFS_RTG_RECLAIMABLE XA_MARK_1
static inline struct xfs_rtgroup *to_rtg(struct xfs_group *xg)
{
return container_of(xg, struct xfs_rtgroup, rtg_group);
@@ -66,6 +84,11 @@ static inline xfs_rgnumber_t rtg_rgno(const struct xfs_rtgroup *rtg)
return rtg->rtg_group.xg_gno;
}
static inline xfs_rgblock_t rtg_blocks(const struct xfs_rtgroup *rtg)
{
return rtg->rtg_group.xg_block_count;
}
static inline struct xfs_inode *rtg_bitmap(const struct xfs_rtgroup *rtg)
{
return rtg->rtg_inodes[XFS_RTGI_BITMAP];
@@ -222,10 +245,14 @@ xfs_rtb_to_daddr(
xfs_rtblock_t rtbno)
{
struct xfs_groups *g = &mp->m_groups[XG_TYPE_RTG];
xfs_rgnumber_t rgno = xfs_rtb_to_rgno(mp, rtbno);
uint64_t start_bno = (xfs_rtblock_t)rgno * g->blocks;
return XFS_FSB_TO_BB(mp, start_bno + (rtbno & g->blkmask));
if (xfs_has_rtgroups(mp) && !g->has_daddr_gaps) {
xfs_rgnumber_t rgno = xfs_rtb_to_rgno(mp, rtbno);
rtbno = (xfs_rtblock_t)rgno * g->blocks + (rtbno & g->blkmask);
}
return XFS_FSB_TO_BB(mp, g->start_fsb + rtbno);
}
static inline xfs_rtblock_t
@@ -233,10 +260,11 @@ xfs_daddr_to_rtb(
struct xfs_mount *mp,
xfs_daddr_t daddr)
{
xfs_rfsblock_t bno = XFS_BB_TO_FSBT(mp, daddr);
struct xfs_groups *g = &mp->m_groups[XG_TYPE_RTG];
xfs_rfsblock_t bno;
if (xfs_has_rtgroups(mp)) {
struct xfs_groups *g = &mp->m_groups[XG_TYPE_RTG];
bno = XFS_BB_TO_FSBT(mp, daddr) - g->start_fsb;
if (xfs_has_rtgroups(mp) && !g->has_daddr_gaps) {
xfs_rgnumber_t rgno;
uint32_t rgbno;

View File

@@ -1033,3 +1033,22 @@ xfs_rtrmapbt_init_rtsb(
xfs_btree_del_cursor(cur, error);
return error;
}
/*
* Return the highest rgbno currently tracked by the rmap for this rtg.
*/
xfs_rgblock_t
xfs_rtrmap_highest_rgbno(
struct xfs_rtgroup *rtg)
{
struct xfs_btree_block *block = rtg_rmap(rtg)->i_df.if_broot;
union xfs_btree_key key = {};
struct xfs_btree_cur *cur;
if (block->bb_numrecs == 0)
return NULLRGBLOCK;
cur = xfs_rtrmapbt_init_cursor(NULL, rtg);
xfs_btree_get_keys(cur, block, &key);
xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
return be32_to_cpu(key.__rmap_bigkey[1].rm_startblock);
}

View File

@@ -207,4 +207,6 @@ struct xfs_btree_cur *xfs_rtrmapbt_mem_cursor(struct xfs_rtgroup *rtg,
int xfs_rtrmapbt_mem_init(struct xfs_mount *mp, struct xfbtree *xfbtree,
struct xfs_buftarg *btp, xfs_rgnumber_t rgno);
xfs_rgblock_t xfs_rtrmap_highest_rgbno(struct xfs_rtgroup *rtg);
#endif /* __XFS_RTRMAP_BTREE_H__ */

View File

@@ -30,6 +30,7 @@
#include "xfs_rtgroup.h"
#include "xfs_rtrmap_btree.h"
#include "xfs_rtrefcount_btree.h"
#include "xfs_rtbitmap.h"
/*
* Physical superblock buffer manipulations. Shared with libxfs in userspace.
@@ -185,6 +186,8 @@ xfs_sb_version_to_features(
features |= XFS_FEAT_PARENT;
if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR)
features |= XFS_FEAT_METADIR;
if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED)
features |= XFS_FEAT_ZONED;
return features;
}
@@ -266,6 +269,9 @@ static uint64_t
xfs_expected_rbmblocks(
struct xfs_sb *sbp)
{
if (xfs_sb_is_v5(sbp) &&
(sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED))
return 0;
return howmany_64(xfs_extents_per_rbm(sbp),
NBBY * xfs_rtbmblock_size(sbp));
}
@@ -275,9 +281,15 @@ bool
xfs_validate_rt_geometry(
struct xfs_sb *sbp)
{
if (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE ||
sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE)
return false;
if (xfs_sb_is_v5(sbp) &&
(sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED)) {
if (sbp->sb_rextsize != 1)
return false;
} else {
if (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE ||
sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE)
return false;
}
if (sbp->sb_rblocks == 0) {
if (sbp->sb_rextents != 0 || sbp->sb_rbmblocks != 0 ||
@@ -435,6 +447,34 @@ xfs_validate_sb_rtgroups(
return 0;
}
static int
xfs_validate_sb_zoned(
struct xfs_mount *mp,
struct xfs_sb *sbp)
{
if (sbp->sb_frextents != 0) {
xfs_warn(mp,
"sb_frextents must be zero for zoned file systems.");
return -EINVAL;
}
if (sbp->sb_rtstart && sbp->sb_rtstart < sbp->sb_dblocks) {
xfs_warn(mp,
"sb_rtstart (%lld) overlaps sb_dblocks (%lld).",
sbp->sb_rtstart, sbp->sb_dblocks);
return -EINVAL;
}
if (sbp->sb_rtreserved && sbp->sb_rtreserved >= sbp->sb_rblocks) {
xfs_warn(mp,
"sb_rtreserved (%lld) larger than sb_rblocks (%lld).",
sbp->sb_rtreserved, sbp->sb_rblocks);
return -EINVAL;
}
return 0;
}
/* Check the validity of the SB. */
STATIC int
xfs_validate_sb_common(
@@ -523,6 +563,11 @@ xfs_validate_sb_common(
if (error)
return error;
}
if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED) {
error = xfs_validate_sb_zoned(mp, sbp);
if (error)
return error;
}
} else if (sbp->sb_qflags & (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD |
XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD)) {
xfs_notice(mp,
@@ -835,6 +880,14 @@ __xfs_sb_from_disk(
to->sb_rgcount = 1;
to->sb_rgextents = 0;
}
if (to->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED) {
to->sb_rtstart = be64_to_cpu(from->sb_rtstart);
to->sb_rtreserved = be64_to_cpu(from->sb_rtreserved);
} else {
to->sb_rtstart = 0;
to->sb_rtreserved = 0;
}
}
void
@@ -1001,6 +1054,11 @@ xfs_sb_to_disk(
to->sb_rbmino = cpu_to_be64(0);
to->sb_rsumino = cpu_to_be64(0);
}
if (from->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED) {
to->sb_rtstart = cpu_to_be64(from->sb_rtstart);
to->sb_rtreserved = cpu_to_be64(from->sb_rtreserved);
}
}
/*
@@ -1146,6 +1204,10 @@ xfs_sb_mount_rextsize(
rgs->blocks = sbp->sb_rgextents * sbp->sb_rextsize;
rgs->blklog = mp->m_sb.sb_rgblklog;
rgs->blkmask = xfs_mask32lo(mp->m_sb.sb_rgblklog);
rgs->start_fsb = mp->m_sb.sb_rtstart;
if (xfs_sb_has_incompat_feature(sbp,
XFS_SB_FEAT_INCOMPAT_ZONE_GAPS))
rgs->has_daddr_gaps = true;
} else {
rgs->blocks = 0;
rgs->blklog = 0;
@@ -1265,8 +1327,7 @@ xfs_log_sb(
mp->m_sb.sb_ifree = min_t(uint64_t,
percpu_counter_sum_positive(&mp->m_ifree),
mp->m_sb.sb_icount);
mp->m_sb.sb_fdblocks =
percpu_counter_sum_positive(&mp->m_fdblocks);
mp->m_sb.sb_fdblocks = xfs_sum_freecounter(mp, XC_FREE_BLOCKS);
}
/*
@@ -1275,9 +1336,10 @@ xfs_log_sb(
* we handle nearly-lockless reservations, so we must use the _positive
* variant here to avoid writing out nonsense frextents.
*/
if (xfs_has_rtgroups(mp))
if (xfs_has_rtgroups(mp) && !xfs_has_zoned(mp)) {
mp->m_sb.sb_frextents =
percpu_counter_sum_positive(&mp->m_frextents);
xfs_sum_freecounter(mp, XC_FREE_RTEXTENTS);
}
xfs_sb_to_disk(bp->b_addr, &mp->m_sb);
xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF);
@@ -1510,6 +1572,8 @@ xfs_fs_geometry(
geo->flags |= XFS_FSOP_GEOM_FLAGS_EXCHANGE_RANGE;
if (xfs_has_metadir(mp))
geo->flags |= XFS_FSOP_GEOM_FLAGS_METADIR;
if (xfs_has_zoned(mp))
geo->flags |= XFS_FSOP_GEOM_FLAGS_ZONED;
geo->rtsectsize = sbp->sb_blocksize;
geo->dirblocksize = xfs_dir2_dirblock_bytes(sbp);
@@ -1530,6 +1594,10 @@ xfs_fs_geometry(
geo->rgcount = sbp->sb_rgcount;
geo->rgextents = sbp->sb_rgextents;
}
if (xfs_has_zoned(mp)) {
geo->rtstart = sbp->sb_rtstart;
geo->rtreserved = sbp->sb_rtreserved;
}
}
/* Read a secondary superblock. */

View File

@@ -233,6 +233,34 @@ enum xfs_group_type {
{ XG_TYPE_AG, "ag" }, \
{ XG_TYPE_RTG, "rtg" }
enum xfs_free_counter {
/*
* Number of free blocks on the data device.
*/
XC_FREE_BLOCKS,
/*
* Number of free RT extents on the RT device.
*/
XC_FREE_RTEXTENTS,
/*
* Number of RT extents available for use.
*
* This counter only exists for zoned RT devices and indicates the number
* of RT extents that can be directly used by writes. XC_FREE_RTEXTENTS
* also includes blocks that have been written previously and freed, but
* sit in an rtgroup that still needs a zone reset.
*/
XC_FREE_RTAVAILABLE,
XC_FREE_NR,
};
#define XFS_FREECOUNTER_STR \
{ XC_FREE_BLOCKS, "blocks" }, \
{ XC_FREE_RTEXTENTS, "rtextents" }, \
{ XC_FREE_RTAVAILABLE, "rtavailable" }
/*
* Type verifier functions
*/
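
For illustration only, a few representative uses of the new counter IDs, copied from call sites elsewhere in this commit (the helper signatures are inferred from those call sites, not from a header shown here):

/* xfs_dialloc(): cheap low-space estimate for the data device */
if (xfs_estimate_freecounter(mp, XC_FREE_BLOCKS) <
    mp->m_low_space[XFS_LOWSP_1_PCNT])
	ok_alloc = false;

/* xfs_log_sb(): precise sum when writing back the superblock */
mp->m_sb.sb_fdblocks = xfs_sum_freecounter(mp, XC_FREE_BLOCKS);

/* scrub/repair: raw sums and direct sets of one counter */
fdblocks = xfs_sum_freecounter_raw(mp, XC_FREE_BLOCKS);
xfs_set_freecounter(mp, XC_FREE_RTEXTENTS,
		fsc->frextents - fsc->frextents_delayed);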

fs/xfs/libxfs/xfs_zones.c (new file, 186 lines)
View File

@@ -0,0 +1,186 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (c) 2023-2025 Christoph Hellwig.
* Copyright (c) 2024-2025, Western Digital Corporation or its affiliates.
*/
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_rtgroup.h"
#include "xfs_zones.h"
static bool
xfs_zone_validate_empty(
struct blk_zone *zone,
struct xfs_rtgroup *rtg,
xfs_rgblock_t *write_pointer)
{
struct xfs_mount *mp = rtg_mount(rtg);
if (rtg_rmap(rtg)->i_used_blocks > 0) {
xfs_warn(mp, "empty zone %u has non-zero used counter (0x%x).",
rtg_rgno(rtg), rtg_rmap(rtg)->i_used_blocks);
return false;
}
*write_pointer = 0;
return true;
}
static bool
xfs_zone_validate_wp(
struct blk_zone *zone,
struct xfs_rtgroup *rtg,
xfs_rgblock_t *write_pointer)
{
struct xfs_mount *mp = rtg_mount(rtg);
xfs_rtblock_t wp_fsb = xfs_daddr_to_rtb(mp, zone->wp);
if (rtg_rmap(rtg)->i_used_blocks > rtg->rtg_extents) {
xfs_warn(mp, "zone %u has too large used counter (0x%x).",
rtg_rgno(rtg), rtg_rmap(rtg)->i_used_blocks);
return false;
}
if (xfs_rtb_to_rgno(mp, wp_fsb) != rtg_rgno(rtg)) {
xfs_warn(mp, "zone %u write pointer (0x%llx) outside of zone.",
rtg_rgno(rtg), wp_fsb);
return false;
}
*write_pointer = xfs_rtb_to_rgbno(mp, wp_fsb);
if (*write_pointer >= rtg->rtg_extents) {
xfs_warn(mp, "zone %u has invalid write pointer (0x%x).",
rtg_rgno(rtg), *write_pointer);
return false;
}
return true;
}
static bool
xfs_zone_validate_full(
struct blk_zone *zone,
struct xfs_rtgroup *rtg,
xfs_rgblock_t *write_pointer)
{
struct xfs_mount *mp = rtg_mount(rtg);
if (rtg_rmap(rtg)->i_used_blocks > rtg->rtg_extents) {
xfs_warn(mp, "zone %u has too large used counter (0x%x).",
rtg_rgno(rtg), rtg_rmap(rtg)->i_used_blocks);
return false;
}
*write_pointer = rtg->rtg_extents;
return true;
}
static bool
xfs_zone_validate_seq(
struct blk_zone *zone,
struct xfs_rtgroup *rtg,
xfs_rgblock_t *write_pointer)
{
struct xfs_mount *mp = rtg_mount(rtg);
switch (zone->cond) {
case BLK_ZONE_COND_EMPTY:
return xfs_zone_validate_empty(zone, rtg, write_pointer);
case BLK_ZONE_COND_IMP_OPEN:
case BLK_ZONE_COND_EXP_OPEN:
case BLK_ZONE_COND_CLOSED:
return xfs_zone_validate_wp(zone, rtg, write_pointer);
case BLK_ZONE_COND_FULL:
return xfs_zone_validate_full(zone, rtg, write_pointer);
case BLK_ZONE_COND_NOT_WP:
case BLK_ZONE_COND_OFFLINE:
case BLK_ZONE_COND_READONLY:
xfs_warn(mp, "zone %u has unsupported zone condition 0x%x.",
rtg_rgno(rtg), zone->cond);
return false;
default:
xfs_warn(mp, "zone %u has unknown zone condition 0x%x.",
rtg_rgno(rtg), zone->cond);
return false;
}
}
static bool
xfs_zone_validate_conv(
struct blk_zone *zone,
struct xfs_rtgroup *rtg)
{
struct xfs_mount *mp = rtg_mount(rtg);
switch (zone->cond) {
case BLK_ZONE_COND_NOT_WP:
return true;
default:
xfs_warn(mp,
"conventional zone %u has unsupported zone condition 0x%x.",
rtg_rgno(rtg), zone->cond);
return false;
}
}
bool
xfs_zone_validate(
struct blk_zone *zone,
struct xfs_rtgroup *rtg,
xfs_rgblock_t *write_pointer)
{
struct xfs_mount *mp = rtg_mount(rtg);
struct xfs_groups *g = &mp->m_groups[XG_TYPE_RTG];
uint32_t expected_size;
/*
* Check that the zone capacity matches the rtgroup size stored in the
* superblock. Note that all zones including the last one must have a
* uniform capacity.
*/
if (XFS_BB_TO_FSB(mp, zone->capacity) != g->blocks) {
xfs_warn(mp,
"zone %u capacity (0x%llx) does not match RT group size (0x%x).",
rtg_rgno(rtg), XFS_BB_TO_FSB(mp, zone->capacity),
g->blocks);
return false;
}
if (g->has_daddr_gaps) {
expected_size = 1 << g->blklog;
} else {
if (zone->len != zone->capacity) {
xfs_warn(mp,
"zone %u has capacity != size ((0x%llx vs 0x%llx)",
rtg_rgno(rtg),
XFS_BB_TO_FSB(mp, zone->len),
XFS_BB_TO_FSB(mp, zone->capacity));
return false;
}
expected_size = g->blocks;
}
if (XFS_BB_TO_FSB(mp, zone->len) != expected_size) {
xfs_warn(mp,
"zone %u length (0x%llx) does match geometry (0x%x).",
rtg_rgno(rtg), XFS_BB_TO_FSB(mp, zone->len),
expected_size);
}
switch (zone->type) {
case BLK_ZONE_TYPE_CONVENTIONAL:
return xfs_zone_validate_conv(zone, rtg);
case BLK_ZONE_TYPE_SEQWRITE_REQ:
return xfs_zone_validate_seq(zone, rtg, write_pointer);
default:
xfs_warn(mp, "zoned %u has unsupported type 0x%x.",
rtg_rgno(rtg), zone->type);
return false;
}
}
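
For illustration only, a sketch (not part of this file) of how a caller is expected to use xfs_zone_validate() while walking the reported zones; the rtgroup lookup and the surrounding zone-report iteration are assumed to live in the zone allocator code added elsewhere in this series:

xfs_rgblock_t write_pointer;

if (!xfs_zone_validate(zone, rtg, &write_pointer))
	return -EFSCORRUPTED;
/* ... record write_pointer as the rtgroup's current fill level ... */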

fs/xfs/libxfs/xfs_zones.h (new file, 35 lines)
View File

@@ -0,0 +1,35 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LIBXFS_ZONES_H
#define _LIBXFS_ZONES_H
struct xfs_rtgroup;
/*
* In order to guarantee forward progress for GC we need to reserve at least
* two zones: one that will be used for moving data into and one spare zone
* making sure that we have enough space to relocate a nearly-full zone.
* To allow for slightly sloppy accounting for when we need to reserve the
* second zone, we actually reserve three as that is easier than doing fully
* accurate bookkeeping.
*/
#define XFS_GC_ZONES 3U
/*
* In addition we need two zones for user writes, one open zone for writing
* and one to still have available blocks without resetting the open zone
* when data in the open zone has been freed.
*/
#define XFS_RESERVED_ZONES (XFS_GC_ZONES + 1)
#define XFS_MIN_ZONES (XFS_RESERVED_ZONES + 1)
/*
* Always keep one zone out of the general open zone pool to allow for GC to
* happen while other writers are waiting for free space.
*/
#define XFS_OPEN_GC_ZONES 1U
#define XFS_MIN_OPEN_ZONES (XFS_OPEN_GC_ZONES + 1U)
bool xfs_zone_validate(struct blk_zone *zone, struct xfs_rtgroup *rtg,
xfs_rgblock_t *write_pointer);
#endif /* _LIBXFS_ZONES_H */
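
As a quick check of the arithmetic these macros encode, they work out to the following values (illustrative static_assert lines, not part of the header):

static_assert(XFS_RESERVED_ZONES == 4);	/* 3 GC zones + 1 user-write zone */
static_assert(XFS_MIN_ZONES == 5);	/* reserved zones + 1 open user-write zone */
static_assert(XFS_MIN_OPEN_ZONES == 2);	/* 1 open GC zone + 1 for user writes */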

View File

@@ -69,6 +69,8 @@ STATIC size_t
xchk_superblock_ondisk_size(
struct xfs_mount *mp)
{
if (xfs_has_zoned(mp))
return offsetofend(struct xfs_dsb, sb_rtreserved);
if (xfs_has_metadir(mp))
return offsetofend(struct xfs_dsb, sb_pad);
if (xfs_has_metauuid(mp))

View File

@@ -1038,8 +1038,8 @@ xchk_bmap(
switch (whichfork) {
case XFS_COW_FORK:
/* No CoW forks on non-reflink filesystems. */
if (!xfs_has_reflink(mp)) {
/* No CoW forks on filesystems that don't support out of place writes. */
if (!xfs_has_reflink(mp) && !xfs_has_zoned(mp)) {
xchk_ino_set_corrupt(sc, sc->ip->i_ino);
return 0;
}

View File

@@ -350,7 +350,7 @@ xchk_fscount_aggregate_agcounts(
* The global incore space reservation is taken from the incore
* counters, so leave that out of the computation.
*/
fsc->fdblocks -= mp->m_resblks_avail;
fsc->fdblocks -= mp->m_free[XC_FREE_BLOCKS].res_avail;
/*
* Delayed allocation reservations are taken out of the incore counters
@@ -413,7 +413,13 @@ xchk_fscount_count_frextents(
fsc->frextents = 0;
fsc->frextents_delayed = 0;
if (!xfs_has_realtime(mp))
/*
* Don't bother verifying and repairing the fs counters for zoned file
* systems as they don't track an on-disk frextents count, and the
* in-memory percpu counter also includes reservations.
*/
if (!xfs_has_realtime(mp) || xfs_has_zoned(mp))
return 0;
while ((rtg = xfs_rtgroup_next(mp, rtg))) {
@@ -513,8 +519,8 @@ xchk_fscounters(
/* Snapshot the percpu counters. */
icount = percpu_counter_sum(&mp->m_icount);
ifree = percpu_counter_sum(&mp->m_ifree);
fdblocks = percpu_counter_sum(&mp->m_fdblocks);
frextents = percpu_counter_sum(&mp->m_frextents);
fdblocks = xfs_sum_freecounter_raw(mp, XC_FREE_BLOCKS);
frextents = xfs_sum_freecounter_raw(mp, XC_FREE_RTEXTENTS);
/* No negative values, please! */
if (icount < 0 || ifree < 0)
@@ -589,15 +595,17 @@ xchk_fscounters(
try_again = true;
}
if (!xchk_fscount_within_range(sc, fdblocks, &mp->m_fdblocks,
fsc->fdblocks)) {
if (!xchk_fscount_within_range(sc, fdblocks,
&mp->m_free[XC_FREE_BLOCKS].count, fsc->fdblocks)) {
if (fsc->frozen)
xchk_set_corrupt(sc);
else
try_again = true;
}
if (!xchk_fscount_within_range(sc, frextents, &mp->m_frextents,
if (!xfs_has_zoned(mp) &&
!xchk_fscount_within_range(sc, frextents,
&mp->m_free[XC_FREE_RTEXTENTS].count,
fsc->frextents - fsc->frextents_delayed)) {
if (fsc->frozen)
xchk_set_corrupt(sc);

View File

@@ -64,7 +64,7 @@ xrep_fscounters(
percpu_counter_set(&mp->m_icount, fsc->icount);
percpu_counter_set(&mp->m_ifree, fsc->ifree);
percpu_counter_set(&mp->m_fdblocks, fsc->fdblocks);
xfs_set_freecounter(mp, XC_FREE_BLOCKS, fsc->fdblocks);
/*
* Online repair is only supported on v5 file systems, which require
@@ -74,10 +74,12 @@ xrep_fscounters(
* track of the delalloc reservations separately, as they are
* subtracted from m_frextents, but not included in sb_frextents.
*/
percpu_counter_set(&mp->m_frextents,
fsc->frextents - fsc->frextents_delayed);
if (!xfs_has_rtgroups(mp))
mp->m_sb.sb_frextents = fsc->frextents;
if (!xfs_has_zoned(mp)) {
xfs_set_freecounter(mp, XC_FREE_RTEXTENTS,
fsc->frextents - fsc->frextents_delayed);
if (!xfs_has_rtgroups(mp))
mp->m_sb.sb_frextents = fsc->frextents;
}
return 0;
}

View File

@@ -273,6 +273,13 @@ xchk_inode_cowextsize(
xfs_failaddr_t fa;
uint32_t value = be32_to_cpu(dip->di_cowextsize);
/*
* The used block counter for rtrmap is checked and repaired elsewhere.
*/
if (xfs_has_zoned(sc->mp) &&
dip->di_metatype == cpu_to_be16(XFS_METAFILE_RTRMAP))
return;
fa = xfs_inode_validate_cowextsize(sc->mp, value, mode, flags, flags2);
if (fa)
xchk_ino_set_corrupt(sc, ino);

View File

@@ -710,7 +710,9 @@ xrep_dinode_extsize_hints(
XFS_DIFLAG_EXTSZINHERIT);
}
if (dip->di_version < 3)
if (dip->di_version < 3 ||
(xfs_has_zoned(sc->mp) &&
dip->di_metatype == cpu_to_be16(XFS_METAFILE_RTRMAP)))
return;
fa = xfs_inode_validate_cowextsize(mp, be32_to_cpu(dip->di_cowextsize),

View File

@@ -62,7 +62,7 @@ xrep_newbt_estimate_slack(
free = sc->sa.pag->pagf_freeblks;
sz = xfs_ag_block_count(sc->mp, pag_agno(sc->sa.pag));
} else {
free = percpu_counter_sum(&sc->mp->m_fdblocks);
free = xfs_sum_freecounter_raw(sc->mp, XC_FREE_BLOCKS);
sz = sc->mp->m_sb.sb_dblocks;
}

View File

@@ -935,10 +935,13 @@ xrep_reap_metadir_fsblocks(
if (error)
return error;
if (xreap_dirty(&rs))
return xrep_defer_finish(sc);
if (xreap_dirty(&rs)) {
error = xrep_defer_finish(sc);
if (error)
return error;
}
return 0;
return xrep_reset_metafile_resv(sc);
}
/*

View File

@@ -43,6 +43,7 @@
#include "xfs_rtalloc.h"
#include "xfs_metafile.h"
#include "xfs_rtrefcount_btree.h"
#include "xfs_zone_alloc.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
@@ -1050,7 +1051,13 @@ xrep_require_rtext_inuse(
xfs_rtxnum_t startrtx;
xfs_rtxnum_t endrtx;
bool is_free = false;
int error;
int error = 0;
if (xfs_has_zoned(mp)) {
if (!xfs_zone_rgbno_is_valid(sc->sr.rtg, rgbno + len - 1))
return -EFSCORRUPTED;
return 0;
}
startrtx = xfs_rgbno_to_rtx(mp, rgbno);
endrtx = xfs_rgbno_to_rtx(mp, rgbno + len - 1);
@@ -1386,11 +1393,12 @@ int
xrep_reset_metafile_resv(
struct xfs_scrub *sc)
{
struct xfs_inode *ip = sc->ip;
struct xfs_mount *mp = sc->mp;
int64_t delta;
int error;
delta = ip->i_nblocks + ip->i_delayed_blks - ip->i_meta_resv_asked;
delta = mp->m_metafile_resv_used + mp->m_metafile_resv_avail -
mp->m_metafile_resv_target;
if (delta == 0)
return 0;
@@ -1401,11 +1409,11 @@ xrep_reset_metafile_resv(
if (delta > 0) {
int64_t give_back;
give_back = min_t(uint64_t, delta, ip->i_delayed_blks);
give_back = min_t(uint64_t, delta, mp->m_metafile_resv_avail);
if (give_back > 0) {
xfs_mod_delalloc(ip, 0, -give_back);
xfs_add_fdblocks(ip->i_mount, give_back);
ip->i_delayed_blks -= give_back;
xfs_mod_sb_delalloc(mp, -give_back);
xfs_add_fdblocks(mp, give_back);
mp->m_metafile_resv_avail -= give_back;
}
return 0;
@@ -1413,24 +1421,23 @@ xrep_reset_metafile_resv(
/*
* Not enough reservation; try to take some blocks from the filesystem
* to the metadata inode. @delta is negative here, so invert the sign.
* to the metabtree reservation.
*/
delta = -delta;
error = xfs_dec_fdblocks(sc->mp, delta, true);
delta = -delta; /* delta is negative here, so invert the sign. */
error = xfs_dec_fdblocks(mp, delta, true);
while (error == -ENOSPC) {
delta--;
if (delta == 0) {
xfs_warn(sc->mp,
"Insufficient free space to reset space reservation for inode 0x%llx after repair.",
ip->i_ino);
"Insufficient free space to reset metabtree reservation after repair.");
return 0;
}
error = xfs_dec_fdblocks(sc->mp, delta, true);
error = xfs_dec_fdblocks(mp, delta, true);
}
if (error)
return error;
xfs_mod_delalloc(ip, 0, delta);
ip->i_delayed_blks += delta;
xfs_mod_sb_delalloc(mp, delta);
mp->m_metafile_resv_avail += delta;
return 0;
}

View File

@@ -21,6 +21,7 @@
#include "xfs_rmap.h"
#include "xfs_rtrmap_btree.h"
#include "xfs_exchmaps.h"
#include "xfs_zone_alloc.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/repair.h"
@@ -272,7 +273,6 @@ xchk_xref_is_used_rt_space(
xfs_extlen_t len)
{
struct xfs_rtgroup *rtg = sc->sr.rtg;
struct xfs_inode *rbmip = rtg_bitmap(rtg);
xfs_rtxnum_t startext;
xfs_rtxnum_t endext;
bool is_free;
@@ -281,6 +281,13 @@ xchk_xref_is_used_rt_space(
if (xchk_skip_xref(sc->sm))
return;
if (xfs_has_zoned(sc->mp)) {
if (!xfs_zone_rgbno_is_valid(rtg,
xfs_rtb_to_rgbno(sc->mp, rtbno) + len - 1))
xchk_ino_xref_set_corrupt(sc, rtg_rmap(rtg)->i_ino);
return;
}
startext = xfs_rtb_to_rtx(sc->mp, rtbno);
endext = xfs_rtb_to_rtx(sc->mp, rtbno + len - 1);
error = xfs_rtalloc_extent_is_free(rtg, sc->tp, startext,
@@ -288,5 +295,5 @@ xchk_xref_is_used_rt_space(
if (!xchk_should_check_xref(sc, &error, NULL))
return;
if (is_free)
xchk_ino_xref_set_corrupt(sc, rbmip->i_ino);
xchk_ino_xref_set_corrupt(sc, rtg_bitmap(rtg)->i_ino);
}

View File

@@ -697,32 +697,6 @@ xrep_rtrefc_build_new_tree(
return error;
}
/*
* Now that we've logged the roots of the new btrees, invalidate all of the
* old blocks and free them.
*/
STATIC int
xrep_rtrefc_remove_old_tree(
struct xrep_rtrefc *rr)
{
int error;
/*
* Free all the extents that were allocated to the former rtrefcountbt
* and aren't cross-linked with something else.
*/
error = xrep_reap_metadir_fsblocks(rr->sc,
&rr->old_rtrefcountbt_blocks);
if (error)
return error;
/*
* Ensure the proper reservation for the rtrefcount inode so that we
* don't fail to expand the btree.
*/
return xrep_reset_metafile_resv(rr->sc);
}
/* Rebuild the rt refcount btree. */
int
xrep_rtrefcountbt(
@@ -769,8 +743,12 @@ xrep_rtrefcountbt(
if (error)
goto out_bitmap;
/* Kill the old tree. */
error = xrep_rtrefc_remove_old_tree(rr);
/*
* Free all the extents that were allocated to the former rtrefcountbt
* and aren't cross-linked with something else.
*/
error = xrep_reap_metadir_fsblocks(rr->sc,
&rr->old_rtrefcountbt_blocks);
if (error)
goto out_bitmap;

View File

@@ -810,28 +810,6 @@ xrep_rtrmap_build_new_tree(
/* Reaping the old btree. */
/* Reap the old rtrmapbt blocks. */
STATIC int
xrep_rtrmap_remove_old_tree(
struct xrep_rtrmap *rr)
{
int error;
/*
* Free all the extents that were allocated to the former rtrmapbt and
* aren't cross-linked with something else.
*/
error = xrep_reap_metadir_fsblocks(rr->sc, &rr->old_rtrmapbt_blocks);
if (error)
return error;
/*
* Ensure the proper reservation for the rtrmap inode so that we don't
* fail to expand the new btree.
*/
return xrep_reset_metafile_resv(rr->sc);
}
static inline bool
xrep_rtrmapbt_want_live_update(
struct xchk_iscan *iscan,
@@ -995,8 +973,11 @@ xrep_rtrmapbt(
if (error)
goto out_records;
/* Kill the old tree. */
error = xrep_rtrmap_remove_old_tree(rr);
/*
* Free all the extents that were allocated to the former rtrmapbt and
* aren't cross-linked with something else.
*/
error = xrep_reap_metadir_fsblocks(rr->sc, &rr->old_rtrmapbt_blocks);
if (error)
goto out_records;

View File

@@ -399,12 +399,14 @@ static const struct xchk_meta_ops meta_scrub_ops[] = {
},
[XFS_SCRUB_TYPE_RTBITMAP] = { /* realtime bitmap */
.type = ST_RTGROUP,
.has = xfs_has_nonzoned,
.setup = xchk_setup_rtbitmap,
.scrub = xchk_rtbitmap,
.repair = xrep_rtbitmap,
},
[XFS_SCRUB_TYPE_RTSUM] = { /* realtime summary */
.type = ST_RTGROUP,
.has = xfs_has_nonzoned,
.setup = xchk_setup_rtsummary,
.scrub = xchk_rtsummary,
.repair = xrep_rtsummary,

View File

@@ -1,7 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (c) 2000-2005 Silicon Graphics, Inc.
* Copyright (c) 2016-2018 Christoph Hellwig.
* Copyright (c) 2016-2025 Christoph Hellwig.
* All Rights Reserved.
*/
#include "xfs.h"
@@ -20,6 +20,8 @@
#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_icache.h"
#include "xfs_zone_alloc.h"
#include "xfs_rtgroup.h"
struct xfs_writepage_ctx {
struct iomap_writepage_ctx ctx;
@@ -77,6 +79,26 @@ xfs_setfilesize(
return xfs_trans_commit(tp);
}
static void
xfs_ioend_put_open_zones(
struct iomap_ioend *ioend)
{
struct iomap_ioend *tmp;
/*
* Put the open zone for all ioends merged into this one (if any).
*/
list_for_each_entry(tmp, &ioend->io_list, io_list)
xfs_open_zone_put(tmp->io_private);
/*
* The main ioend might not have an open zone if the submission failed
* before xfs_zone_alloc_and_submit got called.
*/
if (ioend->io_private)
xfs_open_zone_put(ioend->io_private);
}
/*
* IO write completion.
*/
@@ -86,6 +108,7 @@ xfs_end_ioend(
{
struct xfs_inode *ip = XFS_I(ioend->io_inode);
struct xfs_mount *mp = ip->i_mount;
bool is_zoned = xfs_is_zoned_inode(ip);
xfs_off_t offset = ioend->io_offset;
size_t size = ioend->io_size;
unsigned int nofs_flag;
@@ -115,10 +138,11 @@ xfs_end_ioend(
*/
error = blk_status_to_errno(ioend->io_bio.bi_status);
if (unlikely(error)) {
if (ioend->io_flags & IOMAP_F_SHARED) {
if (ioend->io_flags & IOMAP_IOEND_SHARED) {
ASSERT(!is_zoned);
xfs_reflink_cancel_cow_range(ip, offset, size, true);
xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, offset,
offset + size);
offset + size, NULL);
}
goto done;
}
@@ -126,14 +150,21 @@ xfs_end_ioend(
/*
* Success: commit the COW or unwritten blocks if needed.
*/
if (ioend->io_flags & IOMAP_F_SHARED)
if (is_zoned)
error = xfs_zoned_end_io(ip, offset, size, ioend->io_sector,
ioend->io_private, NULLFSBLOCK);
else if (ioend->io_flags & IOMAP_IOEND_SHARED)
error = xfs_reflink_end_cow(ip, offset, size);
else if (ioend->io_type == IOMAP_UNWRITTEN)
else if (ioend->io_flags & IOMAP_IOEND_UNWRITTEN)
error = xfs_iomap_write_unwritten(ip, offset, size, false);
if (!error && xfs_ioend_is_append(ioend))
if (!error &&
!(ioend->io_flags & IOMAP_IOEND_DIRECT) &&
xfs_ioend_is_append(ioend))
error = xfs_setfilesize(ip, offset, size);
done:
if (is_zoned)
xfs_ioend_put_open_zones(ioend);
iomap_finish_ioends(ioend, error);
memalloc_nofs_restore(nofs_flag);
}
@@ -176,17 +207,27 @@ xfs_end_io(
}
}
STATIC void
void
xfs_end_bio(
struct bio *bio)
{
struct iomap_ioend *ioend = iomap_ioend_from_bio(bio);
struct xfs_inode *ip = XFS_I(ioend->io_inode);
struct xfs_mount *mp = ip->i_mount;
unsigned long flags;
/*
* For appends, record the actually written block number and set the
* boundary flag if needed.
*/
if (IS_ENABLED(CONFIG_XFS_RT) && bio_is_zone_append(bio)) {
ioend->io_sector = bio->bi_iter.bi_sector;
xfs_mark_rtg_boundary(ioend);
}
spin_lock_irqsave(&ip->i_ioend_lock, flags);
if (list_empty(&ip->i_ioend_list))
WARN_ON_ONCE(!queue_work(ip->i_mount->m_unwritten_workqueue,
WARN_ON_ONCE(!queue_work(mp->m_unwritten_workqueue,
&ip->i_ioend_work));
list_add_tail(&ioend->io_list, &ip->i_ioend_list);
spin_unlock_irqrestore(&ip->i_ioend_lock, flags);
@@ -396,10 +437,11 @@ xfs_map_blocks(
}
static int
xfs_prepare_ioend(
struct iomap_ioend *ioend,
xfs_submit_ioend(
struct iomap_writepage_ctx *wpc,
int status)
{
struct iomap_ioend *ioend = wpc->ioend;
unsigned int nofs_flag;
/*
@@ -410,7 +452,7 @@ xfs_prepare_ioend(
nofs_flag = memalloc_nofs_save();
/* Convert CoW extents to regular */
if (!status && (ioend->io_flags & IOMAP_F_SHARED)) {
if (!status && (ioend->io_flags & IOMAP_IOEND_SHARED)) {
status = xfs_reflink_convert_cow(XFS_I(ioend->io_inode),
ioend->io_offset, ioend->io_size);
}
@@ -418,10 +460,14 @@ xfs_prepare_ioend(
memalloc_nofs_restore(nofs_flag);
/* send ioends that might require a transaction to the completion wq */
if (xfs_ioend_is_append(ioend) || ioend->io_type == IOMAP_UNWRITTEN ||
(ioend->io_flags & IOMAP_F_SHARED))
if (xfs_ioend_is_append(ioend) ||
(ioend->io_flags & (IOMAP_IOEND_UNWRITTEN | IOMAP_IOEND_SHARED)))
ioend->io_bio.bi_end_io = xfs_end_bio;
return status;
if (status)
return status;
submit_bio(&ioend->io_bio);
return 0;
}
/*
@@ -458,12 +504,107 @@ xfs_discard_folio(
* folio itself and not the start offset that is passed in.
*/
xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, pos,
folio_pos(folio) + folio_size(folio));
folio_pos(folio) + folio_size(folio), NULL);
}
static const struct iomap_writeback_ops xfs_writeback_ops = {
.map_blocks = xfs_map_blocks,
.prepare_ioend = xfs_prepare_ioend,
.submit_ioend = xfs_submit_ioend,
.discard_folio = xfs_discard_folio,
};
struct xfs_zoned_writepage_ctx {
struct iomap_writepage_ctx ctx;
struct xfs_open_zone *open_zone;
};
static inline struct xfs_zoned_writepage_ctx *
XFS_ZWPC(struct iomap_writepage_ctx *ctx)
{
return container_of(ctx, struct xfs_zoned_writepage_ctx, ctx);
}
static int
xfs_zoned_map_blocks(
struct iomap_writepage_ctx *wpc,
struct inode *inode,
loff_t offset,
unsigned int len)
{
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + len);
xfs_filblks_t count_fsb;
struct xfs_bmbt_irec imap, del;
struct xfs_iext_cursor icur;
if (xfs_is_shutdown(mp))
return -EIO;
XFS_ERRORTAG_DELAY(mp, XFS_ERRTAG_WB_DELAY_MS);
/*
* All dirty data must be covered by delalloc extents. But truncate can
* remove delalloc extents underneath us or reduce their size.
* Returning a hole tells iomap to not write back any data from this
* range, which is the right thing to do in that case.
*
* Otherwise just tell iomap to treat ranges previously covered by a
* delalloc extent as mapped. The actual block allocation will be done
* just before submitting the bio.
*
* This implies we never map outside folios that are locked or marked
* as under writeback, and thus there is no need to check the fork sequence
* count here.
*/
xfs_ilock(ip, XFS_ILOCK_EXCL);
if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &imap))
imap.br_startoff = end_fsb; /* fake a hole past EOF */
if (imap.br_startoff > offset_fsb) {
imap.br_blockcount = imap.br_startoff - offset_fsb;
imap.br_startoff = offset_fsb;
imap.br_startblock = HOLESTARTBLOCK;
imap.br_state = XFS_EXT_NORM;
xfs_iunlock(ip, XFS_ILOCK_EXCL);
xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0, 0, 0);
return 0;
}
end_fsb = min(end_fsb, imap.br_startoff + imap.br_blockcount);
count_fsb = end_fsb - offset_fsb;
del = imap;
xfs_trim_extent(&del, offset_fsb, count_fsb);
xfs_bmap_del_extent_delay(ip, XFS_COW_FORK, &icur, &imap, &del,
XFS_BMAPI_REMAP);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
wpc->iomap.type = IOMAP_MAPPED;
wpc->iomap.flags = IOMAP_F_DIRTY;
wpc->iomap.bdev = mp->m_rtdev_targp->bt_bdev;
wpc->iomap.offset = offset;
wpc->iomap.length = XFS_FSB_TO_B(mp, count_fsb);
wpc->iomap.flags = IOMAP_F_ANON_WRITE;
trace_xfs_zoned_map_blocks(ip, offset, wpc->iomap.length);
return 0;
}
static int
xfs_zoned_submit_ioend(
struct iomap_writepage_ctx *wpc,
int status)
{
wpc->ioend->io_bio.bi_end_io = xfs_end_bio;
if (status)
return status;
xfs_zone_alloc_and_submit(wpc->ioend, &XFS_ZWPC(wpc)->open_zone);
return 0;
}
static const struct iomap_writeback_ops xfs_zoned_writeback_ops = {
.map_blocks = xfs_zoned_map_blocks,
.submit_ioend = xfs_zoned_submit_ioend,
.discard_folio = xfs_discard_folio,
};
@@ -472,10 +613,25 @@ xfs_vm_writepages(
struct address_space *mapping,
struct writeback_control *wbc)
{
struct xfs_writepage_ctx wpc = { };
struct xfs_inode *ip = XFS_I(mapping->host);
xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
return iomap_writepages(mapping, wbc, &wpc.ctx, &xfs_writeback_ops);
xfs_iflags_clear(ip, XFS_ITRUNCATED);
if (xfs_is_zoned_inode(ip)) {
struct xfs_zoned_writepage_ctx xc = { };
int error;
error = iomap_writepages(mapping, wbc, &xc.ctx,
&xfs_zoned_writeback_ops);
if (xc.open_zone)
xfs_open_zone_put(xc.open_zone);
return error;
} else {
struct xfs_writepage_ctx wpc = { };
return iomap_writepages(mapping, wbc, &wpc.ctx,
&xfs_writeback_ops);
}
}
STATIC int

View File

@@ -9,6 +9,7 @@
extern const struct address_space_operations xfs_address_space_operations;
extern const struct address_space_operations xfs_dax_aops;
int xfs_setfilesize(struct xfs_inode *ip, xfs_off_t offset, size_t size);
int xfs_setfilesize(struct xfs_inode *ip, xfs_off_t offset, size_t size);
void xfs_end_bio(struct bio *bio);
#endif /* __XFS_AOPS_H__ */

View File

@@ -30,6 +30,7 @@
#include "xfs_reflink.h"
#include "xfs_rtbitmap.h"
#include "xfs_rtgroup.h"
#include "xfs_zone_alloc.h"
/* Kernel only BMAP related definitions and functions */
@@ -436,7 +437,8 @@ xfs_bmap_punch_delalloc_range(
struct xfs_inode *ip,
int whichfork,
xfs_off_t start_byte,
xfs_off_t end_byte)
xfs_off_t end_byte,
struct xfs_zone_alloc_ctx *ac)
{
struct xfs_mount *mp = ip->i_mount;
struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
@@ -467,7 +469,21 @@ xfs_bmap_punch_delalloc_range(
continue;
}
xfs_bmap_del_extent_delay(ip, whichfork, &icur, &got, &del);
if (xfs_is_zoned_inode(ip) && ac) {
/*
* In a zoned buffered write context we need to return
* the punched delalloc allocations to the allocation
* context. This allows reusing them in the following
* iomap iterations.
*/
xfs_bmap_del_extent_delay(ip, whichfork, &icur, &got,
&del, XFS_BMAPI_REMAP);
ac->reserved_blocks += del.br_blockcount;
} else {
xfs_bmap_del_extent_delay(ip, whichfork, &icur, &got,
&del, 0);
}
if (!xfs_iext_get_extent(ifp, &icur, &got))
break;
}
@@ -582,7 +598,7 @@ xfs_free_eofblocks(
if (ip->i_delayed_blks) {
xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK,
round_up(XFS_ISIZE(ip), mp->m_sb.sb_blocksize),
LLONG_MAX);
LLONG_MAX, NULL);
}
xfs_inode_clear_eofblocks_tag(ip);
return 0;
@@ -825,7 +841,8 @@ int
xfs_free_file_space(
struct xfs_inode *ip,
xfs_off_t offset,
xfs_off_t len)
xfs_off_t len,
struct xfs_zone_alloc_ctx *ac)
{
struct xfs_mount *mp = ip->i_mount;
xfs_fileoff_t startoffset_fsb;
@@ -880,7 +897,7 @@ xfs_free_file_space(
return 0;
if (offset + len > XFS_ISIZE(ip))
len = XFS_ISIZE(ip) - offset;
error = xfs_zero_range(ip, offset, len, NULL);
error = xfs_zero_range(ip, offset, len, ac, NULL);
if (error)
return error;
@@ -968,7 +985,8 @@ int
xfs_collapse_file_space(
struct xfs_inode *ip,
xfs_off_t offset,
xfs_off_t len)
xfs_off_t len,
struct xfs_zone_alloc_ctx *ac)
{
struct xfs_mount *mp = ip->i_mount;
struct xfs_trans *tp;
@@ -981,7 +999,7 @@ xfs_collapse_file_space(
trace_xfs_collapse_file_space(ip);
error = xfs_free_file_space(ip, offset, len);
error = xfs_free_file_space(ip, offset, len, ac);
if (error)
return error;

View File

@@ -15,6 +15,7 @@ struct xfs_inode;
struct xfs_mount;
struct xfs_trans;
struct xfs_bmalloca;
struct xfs_zone_alloc_ctx;
#ifdef CONFIG_XFS_RT
int xfs_bmap_rtalloc(struct xfs_bmalloca *ap);
@@ -31,7 +32,8 @@ xfs_bmap_rtalloc(struct xfs_bmalloca *ap)
#endif /* CONFIG_XFS_RT */
void xfs_bmap_punch_delalloc_range(struct xfs_inode *ip, int whichfork,
xfs_off_t start_byte, xfs_off_t end_byte);
xfs_off_t start_byte, xfs_off_t end_byte,
struct xfs_zone_alloc_ctx *ac);
struct kgetbmap {
__s64 bmv_offset; /* file offset of segment in blocks */
@@ -54,13 +56,13 @@ int xfs_bmap_last_extent(struct xfs_trans *tp, struct xfs_inode *ip,
/* preallocation and hole punch interface */
int xfs_alloc_file_space(struct xfs_inode *ip, xfs_off_t offset,
xfs_off_t len);
xfs_off_t len);
int xfs_free_file_space(struct xfs_inode *ip, xfs_off_t offset,
xfs_off_t len);
xfs_off_t len, struct xfs_zone_alloc_ctx *ac);
int xfs_collapse_file_space(struct xfs_inode *, xfs_off_t offset,
xfs_off_t len);
xfs_off_t len, struct xfs_zone_alloc_ctx *ac);
int xfs_insert_file_space(struct xfs_inode *, xfs_off_t offset,
xfs_off_t len);
xfs_off_t len);
/* EOF block manipulation functions */
bool xfs_can_free_eofblocks(struct xfs_inode *ip);

View File

@@ -844,7 +844,8 @@ xfs_ioc_trim(
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
if (mp->m_rtdev_targp &&
if (mp->m_rtdev_targp && !xfs_has_zoned(mp) &&
bdev_max_discard_sectors(mp->m_rtdev_targp->bt_bdev))
rt_bdev = mp->m_rtdev_targp->bt_bdev;
if (!bdev_max_discard_sectors(mp->m_ddev_targp->bt_bdev) && !rt_bdev)

View File

@@ -671,7 +671,7 @@ xfs_extent_busy_wait_all(
while ((pag = xfs_perag_next(mp, pag)))
xfs_extent_busy_wait_group(pag_group(pag));
if (xfs_has_rtgroups(mp))
if (xfs_has_rtgroups(mp) && !xfs_has_zoned(mp))
while ((rtg = xfs_rtgroup_next(mp, rtg)))
xfs_extent_busy_wait_group(rtg_group(rtg));
}

View File

@@ -29,6 +29,7 @@
#include "xfs_inode.h"
#include "xfs_rtbitmap.h"
#include "xfs_rtgroup.h"
#include "xfs_zone_alloc.h"
struct kmem_cache *xfs_efi_cache;
struct kmem_cache *xfs_efd_cache;
@@ -767,21 +768,35 @@ xfs_rtextent_free_finish_item(
trace_xfs_extent_free_deferred(mp, xefi);
if (!(xefi->xefi_flags & XFS_EFI_CANCELLED)) {
if (*rtgp != to_rtg(xefi->xefi_group)) {
*rtgp = to_rtg(xefi->xefi_group);
xfs_rtgroup_lock(*rtgp, XFS_RTGLOCK_BITMAP);
xfs_rtgroup_trans_join(tp, *rtgp,
XFS_RTGLOCK_BITMAP);
}
error = xfs_rtfree_blocks(tp, *rtgp,
xefi->xefi_startblock, xefi->xefi_blockcount);
if (xefi->xefi_flags & XFS_EFI_CANCELLED)
goto done;
if (*rtgp != to_rtg(xefi->xefi_group)) {
unsigned int lock_flags;
if (xfs_has_zoned(mp))
lock_flags = XFS_RTGLOCK_RMAP;
else
lock_flags = XFS_RTGLOCK_BITMAP;
*rtgp = to_rtg(xefi->xefi_group);
xfs_rtgroup_lock(*rtgp, lock_flags);
xfs_rtgroup_trans_join(tp, *rtgp, lock_flags);
}
if (xfs_has_zoned(mp)) {
error = xfs_zone_free_blocks(tp, *rtgp, xefi->xefi_startblock,
xefi->xefi_blockcount);
} else {
error = xfs_rtfree_blocks(tp, *rtgp, xefi->xefi_startblock,
xefi->xefi_blockcount);
}
if (error == -EAGAIN) {
xfs_efd_from_efi(efdp);
return error;
}
done:
xfs_efd_add_extent(efdp, xefi);
xfs_extent_free_cancel_item(item);
return error;

View File

@@ -25,6 +25,8 @@
#include "xfs_iomap.h"
#include "xfs_reflink.h"
#include "xfs_file.h"
#include "xfs_aops.h"
#include "xfs_zone_alloc.h"
#include <linux/dax.h>
#include <linux/falloc.h>
@@ -150,7 +152,7 @@ xfs_file_fsync(
* ensure newly written file data make it to disk before logging the new
* inode size in case of an extending write.
*/
if (XFS_IS_REALTIME_INODE(ip))
if (XFS_IS_REALTIME_INODE(ip) && mp->m_rtdev_targp != mp->m_ddev_targp)
error = blkdev_issue_flush(mp->m_rtdev_targp->bt_bdev);
else if (mp->m_logdev_targp != mp->m_ddev_targp)
error = blkdev_issue_flush(mp->m_ddev_targp->bt_bdev);
@@ -360,7 +362,8 @@ xfs_file_write_zero_eof(
struct iov_iter *from,
unsigned int *iolock,
size_t count,
bool *drained_dio)
bool *drained_dio,
struct xfs_zone_alloc_ctx *ac)
{
struct xfs_inode *ip = XFS_I(iocb->ki_filp->f_mapping->host);
loff_t isize;
@@ -414,7 +417,7 @@ xfs_file_write_zero_eof(
trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize);
xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, NULL);
error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, ac, NULL);
xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
return error;
@@ -431,7 +434,8 @@ STATIC ssize_t
xfs_file_write_checks(
struct kiocb *iocb,
struct iov_iter *from,
unsigned int *iolock)
unsigned int *iolock,
struct xfs_zone_alloc_ctx *ac)
{
struct inode *inode = iocb->ki_filp->f_mapping->host;
size_t count = iov_iter_count(from);
@@ -481,7 +485,7 @@ xfs_file_write_checks(
*/
if (iocb->ki_pos > i_size_read(inode)) {
error = xfs_file_write_zero_eof(iocb, from, iolock, count,
&drained_dio);
&drained_dio, ac);
if (error == 1)
goto restart;
if (error)
@@ -491,6 +495,48 @@ xfs_file_write_checks(
return kiocb_modified(iocb);
}
static ssize_t
xfs_zoned_write_space_reserve(
struct xfs_inode *ip,
struct kiocb *iocb,
struct iov_iter *from,
unsigned int flags,
struct xfs_zone_alloc_ctx *ac)
{
loff_t count = iov_iter_count(from);
int error;
if (iocb->ki_flags & IOCB_NOWAIT)
flags |= XFS_ZR_NOWAIT;
/*
* Check the rlimit and LFS boundary first so that we don't over-reserve
* by possibly a lot.
*
* The generic write path will redo this check later, and it might have
* changed by then. If it got expanded we'll stick to our earlier
* smaller limit, and if it is decreased the new smaller limit will be
* used and our extra space reservation will be returned after finishing
* the write.
*/
error = generic_write_check_limits(iocb->ki_filp, iocb->ki_pos, &count);
if (error)
return error;
/*
* Sloppily round up count to file system blocks.
*
* This will often reserve an extra block, but that avoids having to look
* at the start offset, which isn't stable for O_APPEND until taking the
* iolock. Also we need to reserve a block each for zeroing the old
* EOF block and the new start block if they are unaligned.
*
* Any remaining block will be returned after the write.
*/
return xfs_zoned_space_reserve(ip,
XFS_B_TO_FSB(ip->i_mount, count) + 1 + 2, flags, ac);
}
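To make the sizing comment above concrete, here is a small self-contained sketch of the reservation arithmetic, assuming a 4096-byte filesystem block. The round-up division is an approximation standing in for XFS_B_TO_FSB(); the function name is illustrative only.

#include <stdint.h>
#include <stdio.h>

/* Toy model of the block count reserved for a zoned write of 'count' bytes. */
static uint64_t zoned_write_reservation(uint64_t count, unsigned int blksz)
{
	uint64_t blocks = (count + blksz - 1) / blksz;	/* round up to blocks */

	/*
	 * +1 because the start offset (and thus a possibly straddled extra
	 * block) isn't known for O_APPEND until the iolock is taken;
	 * +2 for zeroing the old EOF block and the new start block if they
	 * turn out to be unaligned.
	 */
	return blocks + 1 + 2;
}

int main(void)
{
	/* A 10000-byte write reserves ceil(10000/4096) + 3 = 6 blocks. */
	printf("%llu blocks\n",
	       (unsigned long long)zoned_write_reservation(10000, 4096));
	return 0;
}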
static int
xfs_dio_write_end_io(
struct kiocb *iocb,
@@ -503,6 +549,9 @@ xfs_dio_write_end_io(
loff_t offset = iocb->ki_pos;
unsigned int nofs_flag;
ASSERT(!xfs_is_zoned_inode(ip) ||
!(flags & (IOMAP_DIO_UNWRITTEN | IOMAP_DIO_COW)));
trace_xfs_end_io_direct_write(ip, offset, size);
if (xfs_is_shutdown(ip->i_mount))
@@ -582,14 +631,51 @@ static const struct iomap_dio_ops xfs_dio_write_ops = {
.end_io = xfs_dio_write_end_io,
};
static void
xfs_dio_zoned_submit_io(
const struct iomap_iter *iter,
struct bio *bio,
loff_t file_offset)
{
struct xfs_mount *mp = XFS_I(iter->inode)->i_mount;
struct xfs_zone_alloc_ctx *ac = iter->private;
xfs_filblks_t count_fsb;
struct iomap_ioend *ioend;
count_fsb = XFS_B_TO_FSB(mp, bio->bi_iter.bi_size);
if (count_fsb > ac->reserved_blocks) {
xfs_err(mp,
"allocation (%lld) larger than reservation (%lld).",
count_fsb, ac->reserved_blocks);
xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
bio_io_error(bio);
return;
}
ac->reserved_blocks -= count_fsb;
bio->bi_end_io = xfs_end_bio;
ioend = iomap_init_ioend(iter->inode, bio, file_offset,
IOMAP_IOEND_DIRECT);
xfs_zone_alloc_and_submit(ioend, &ac->open_zone);
}
static const struct iomap_dio_ops xfs_dio_zoned_write_ops = {
.bio_set = &iomap_ioend_bioset,
.submit_io = xfs_dio_zoned_submit_io,
.end_io = xfs_dio_write_end_io,
};
/*
* Handle block aligned direct I/O writes
* Handle block aligned direct I/O writes.
*/
static noinline ssize_t
xfs_file_dio_write_aligned(
struct xfs_inode *ip,
struct kiocb *iocb,
struct iov_iter *from)
struct iov_iter *from,
const struct iomap_ops *ops,
const struct iomap_dio_ops *dops,
struct xfs_zone_alloc_ctx *ac)
{
unsigned int iolock = XFS_IOLOCK_SHARED;
ssize_t ret;
@@ -597,7 +683,7 @@ xfs_file_dio_write_aligned(
ret = xfs_ilock_iocb_for_write(iocb, &iolock);
if (ret)
return ret;
ret = xfs_file_write_checks(iocb, from, &iolock);
ret = xfs_file_write_checks(iocb, from, &iolock, ac);
if (ret)
goto out_unlock;
@@ -611,11 +697,31 @@ xfs_file_dio_write_aligned(
iolock = XFS_IOLOCK_SHARED;
}
trace_xfs_file_direct_write(iocb, from);
ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
&xfs_dio_write_ops, 0, NULL, 0);
ret = iomap_dio_rw(iocb, from, ops, dops, 0, ac, 0);
out_unlock:
if (iolock)
xfs_iunlock(ip, iolock);
xfs_iunlock(ip, iolock);
return ret;
}
/*
* Handle block aligned direct I/O writes to zoned devices.
*/
static noinline ssize_t
xfs_file_dio_write_zoned(
struct xfs_inode *ip,
struct kiocb *iocb,
struct iov_iter *from)
{
struct xfs_zone_alloc_ctx ac = { };
ssize_t ret;
ret = xfs_zoned_write_space_reserve(ip, iocb, from, 0, &ac);
if (ret < 0)
return ret;
ret = xfs_file_dio_write_aligned(ip, iocb, from,
&xfs_zoned_direct_write_iomap_ops,
&xfs_dio_zoned_write_ops, &ac);
xfs_zoned_space_unreserve(ip, &ac);
return ret;
}
@@ -675,7 +781,7 @@ xfs_file_dio_write_unaligned(
goto out_unlock;
}
ret = xfs_file_write_checks(iocb, from, &iolock);
ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
if (ret)
goto out_unlock;
@@ -721,9 +827,21 @@ xfs_file_dio_write(
/* direct I/O must be aligned to device logical sector size */
if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
return -EINVAL;
if ((iocb->ki_pos | count) & ip->i_mount->m_blockmask)
/*
* For always COW inodes we also must check the alignment of each
* individual iovec segment, as they could end up with different
* I/Os due to the way bio_iov_iter_get_pages works, and we'd
* then overwrite an already written block.
*/
if (((iocb->ki_pos | count) & ip->i_mount->m_blockmask) ||
(xfs_is_always_cow_inode(ip) &&
(iov_iter_alignment(from) & ip->i_mount->m_blockmask)))
return xfs_file_dio_write_unaligned(ip, iocb, from);
return xfs_file_dio_write_aligned(ip, iocb, from);
if (xfs_is_zoned_inode(ip))
return xfs_file_dio_write_zoned(ip, iocb, from);
return xfs_file_dio_write_aligned(ip, iocb, from,
&xfs_direct_write_iomap_ops, &xfs_dio_write_ops, NULL);
}
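The dispatch above hinges on a bitmask trick: OR-ing the position and length and masking with (blocksize - 1) is non-zero exactly when either value is not block aligned. A tiny self-contained sketch, assuming a 4096-byte block; the strings are illustrative, not kernel messages.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t blockmask = 4096 - 1;
	uint64_t pos = 8192, count = 6000;

	/* 8192 is aligned, 6000 is not, so the OR has low bits set. */
	if ((pos | count) & blockmask)
		printf("unaligned: take the sub-block DIO path\n");
	else
		printf("aligned: take the aligned DIO path\n");
	return 0;
}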
static noinline ssize_t
@@ -740,7 +858,7 @@ xfs_file_dax_write(
ret = xfs_ilock_iocb(iocb, iolock);
if (ret)
return ret;
ret = xfs_file_write_checks(iocb, from, &iolock);
ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
if (ret)
goto out;
@@ -784,7 +902,7 @@ xfs_file_buffered_write(
if (ret)
return ret;
ret = xfs_file_write_checks(iocb, from, &iolock);
ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
if (ret)
goto out;
@@ -831,6 +949,67 @@ xfs_file_buffered_write(
return ret;
}
STATIC ssize_t
xfs_file_buffered_write_zoned(
struct kiocb *iocb,
struct iov_iter *from)
{
struct xfs_inode *ip = XFS_I(iocb->ki_filp->f_mapping->host);
struct xfs_mount *mp = ip->i_mount;
unsigned int iolock = XFS_IOLOCK_EXCL;
bool cleared_space = false;
struct xfs_zone_alloc_ctx ac = { };
ssize_t ret;
ret = xfs_zoned_write_space_reserve(ip, iocb, from, XFS_ZR_GREEDY, &ac);
if (ret < 0)
return ret;
ret = xfs_ilock_iocb(iocb, iolock);
if (ret)
goto out_unreserve;
ret = xfs_file_write_checks(iocb, from, &iolock, &ac);
if (ret)
goto out_unlock;
/*
* Truncate the iter to the length that we were actually able to
* allocate blocks for. This needs to happen after
* xfs_file_write_checks, because that assigns ki_pos for O_APPEND
* writes.
*/
iov_iter_truncate(from,
XFS_FSB_TO_B(mp, ac.reserved_blocks) -
(iocb->ki_pos & mp->m_blockmask));
if (!iov_iter_count(from))
goto out_unlock;
retry:
trace_xfs_file_buffered_write(iocb, from);
ret = iomap_file_buffered_write(iocb, from,
&xfs_buffered_write_iomap_ops, &ac);
if (ret == -ENOSPC && !cleared_space) {
/*
* Kick off writeback to convert delalloc space and release the
* usually too pessimistic indirect block reservations.
*/
xfs_flush_inodes(mp);
cleared_space = true;
goto retry;
}
out_unlock:
xfs_iunlock(ip, iolock);
out_unreserve:
xfs_zoned_space_unreserve(ip, &ac);
if (ret > 0) {
XFS_STATS_ADD(mp, xs_write_bytes, ret);
ret = generic_write_sync(iocb, ret);
}
return ret;
}
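The iov_iter_truncate() length computed above is simply the reserved blocks converted to bytes, minus the portion of the first block already consumed by the in-block offset of ki_pos. A worked example as a self-contained sketch, assuming a 4096-byte block; the variable names mirror the code above but are plain integers here.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t blksz = 4096, blockmask = blksz - 1;
	uint64_t reserved_blocks = 3;
	uint64_t ki_pos = 5000;

	uint64_t budget = reserved_blocks * blksz - (ki_pos & blockmask);

	/* 3 * 4096 - (5000 % 4096) = 12288 - 904 = 11384 writable bytes */
	printf("budget = %llu bytes\n", (unsigned long long)budget);
	return 0;
}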
STATIC ssize_t
xfs_file_write_iter(
struct kiocb *iocb,
@@ -878,6 +1057,8 @@ xfs_file_write_iter(
return ret;
}
if (xfs_is_zoned_inode(ip))
return xfs_file_buffered_write_zoned(iocb, from);
return xfs_file_buffered_write(iocb, from);
}
@@ -932,7 +1113,8 @@ static int
xfs_falloc_collapse_range(
struct file *file,
loff_t offset,
loff_t len)
loff_t len,
struct xfs_zone_alloc_ctx *ac)
{
struct inode *inode = file_inode(file);
loff_t new_size = i_size_read(inode) - len;
@@ -948,7 +1130,7 @@ xfs_falloc_collapse_range(
if (offset + len >= i_size_read(inode))
return -EINVAL;
error = xfs_collapse_file_space(XFS_I(inode), offset, len);
error = xfs_collapse_file_space(XFS_I(inode), offset, len, ac);
if (error)
return error;
return xfs_falloc_setsize(file, new_size);
@@ -1004,7 +1186,8 @@ xfs_falloc_zero_range(
struct file *file,
int mode,
loff_t offset,
loff_t len)
loff_t len,
struct xfs_zone_alloc_ctx *ac)
{
struct inode *inode = file_inode(file);
unsigned int blksize = i_blocksize(inode);
@@ -1017,7 +1200,7 @@ xfs_falloc_zero_range(
if (error)
return error;
error = xfs_free_file_space(XFS_I(inode), offset, len);
error = xfs_free_file_space(XFS_I(inode), offset, len, ac);
if (error)
return error;
@@ -1088,22 +1271,18 @@ xfs_falloc_allocate_range(
FALLOC_FL_INSERT_RANGE | FALLOC_FL_UNSHARE_RANGE)
STATIC long
xfs_file_fallocate(
__xfs_file_fallocate(
struct file *file,
int mode,
loff_t offset,
loff_t len)
loff_t len,
struct xfs_zone_alloc_ctx *ac)
{
struct inode *inode = file_inode(file);
struct xfs_inode *ip = XFS_I(inode);
long error;
uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
if (!S_ISREG(inode->i_mode))
return -EINVAL;
if (mode & ~XFS_FALLOC_FL_SUPPORTED)
return -EOPNOTSUPP;
xfs_ilock(ip, iolock);
error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP);
if (error)
@@ -1124,16 +1303,16 @@ xfs_file_fallocate(
switch (mode & FALLOC_FL_MODE_MASK) {
case FALLOC_FL_PUNCH_HOLE:
error = xfs_free_file_space(ip, offset, len);
error = xfs_free_file_space(ip, offset, len, ac);
break;
case FALLOC_FL_COLLAPSE_RANGE:
error = xfs_falloc_collapse_range(file, offset, len);
error = xfs_falloc_collapse_range(file, offset, len, ac);
break;
case FALLOC_FL_INSERT_RANGE:
error = xfs_falloc_insert_range(file, offset, len);
break;
case FALLOC_FL_ZERO_RANGE:
error = xfs_falloc_zero_range(file, mode, offset, len);
error = xfs_falloc_zero_range(file, mode, offset, len, ac);
break;
case FALLOC_FL_UNSHARE_RANGE:
error = xfs_falloc_unshare_range(file, mode, offset, len);
@@ -1154,6 +1333,54 @@ xfs_file_fallocate(
return error;
}
static long
xfs_file_zoned_fallocate(
struct file *file,
int mode,
loff_t offset,
loff_t len)
{
struct xfs_zone_alloc_ctx ac = { };
struct xfs_inode *ip = XFS_I(file_inode(file));
int error;
error = xfs_zoned_space_reserve(ip, 2, XFS_ZR_RESERVED, &ac);
if (error)
return error;
error = __xfs_file_fallocate(file, mode, offset, len, &ac);
xfs_zoned_space_unreserve(ip, &ac);
return error;
}
static long
xfs_file_fallocate(
struct file *file,
int mode,
loff_t offset,
loff_t len)
{
struct inode *inode = file_inode(file);
if (!S_ISREG(inode->i_mode))
return -EINVAL;
if (mode & ~XFS_FALLOC_FL_SUPPORTED)
return -EOPNOTSUPP;
/*
* For zoned file systems, zeroing the first and last block of a hole
* punch requires allocating a new block to rewrite the remaining data
* and new zeroes out of place. Get a reservation for those before
* taking the iolock. Dip into the reserved pool because we are
* expected to be able to punch a hole even on a completely full
* file system.
*/
if (xfs_is_zoned_inode(XFS_I(inode)) &&
(mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE |
FALLOC_FL_COLLAPSE_RANGE)))
return xfs_file_zoned_fallocate(file, mode, offset, len);
return __xfs_file_fallocate(file, mode, offset, len, NULL);
}
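Why the zoned fallocate path above reserves 2 blocks: a hole punch or zero range that is unaligned at either end has to rewrite the surviving bytes of the first and/or last block out of place, so at most one block is needed per end. A minimal sketch of that counting, assuming a 4096-byte block; the helper is hypothetical, not the kernel function.

#include <stdint.h>
#include <stdio.h>

static unsigned int punch_rewrite_blocks(uint64_t offset, uint64_t len,
					 unsigned int blksz)
{
	unsigned int need = 0;

	if (offset % blksz)		/* partial head block survives */
		need++;
	if ((offset + len) % blksz)	/* partial tail block survives */
		need++;
	return need;
}

int main(void)
{
	/*
	 * Punching [6000, 10000) with 4096-byte blocks leaves data in the
	 * head block [4096, 8192) and the tail block [8192, 12288), so both
	 * need an out-of-place rewrite: 2 blocks, matching the reservation.
	 */
	printf("%u blocks\n", punch_rewrite_blocks(6000, 4000, 4096));
	return 0;
}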
STATIC int
xfs_file_fadvise(
struct file *file,
@@ -1347,15 +1574,22 @@ xfs_file_release(
* blocks. This avoids open/read/close workloads from removing EOF
* blocks that other writers depend upon to reduce fragmentation.
*
* Inodes on the zoned RT device never have preallocations, so skip
* taking the locks below.
*/
if (!inode->i_nlink ||
!(file->f_mode & FMODE_WRITE) ||
(ip->i_diflags & XFS_DIFLAG_APPEND) ||
xfs_is_zoned_inode(ip))
return 0;
/*
* If we can't get the iolock just skip truncating the blocks past EOF
* because we could deadlock with the mmap_lock otherwise. We'll get
* another chance to drop them once the last reference to the inode is
* dropped, so we'll never leak blocks permanently.
*/
if (inode->i_nlink &&
(file->f_mode & FMODE_WRITE) &&
!(ip->i_diflags & XFS_DIFLAG_APPEND) &&
!xfs_iflags_test(ip, XFS_EOFBLOCKS_RELEASED) &&
if (!xfs_iflags_test(ip, XFS_EOFBLOCKS_RELEASED) &&
xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
if (xfs_can_free_eofblocks(ip) &&
!xfs_iflags_test_and_set(ip, XFS_EOFBLOCKS_RELEASED))
@@ -1472,9 +1706,10 @@ xfs_dax_read_fault(
* i_lock (XFS - extent map serialisation)
*/
static vm_fault_t
xfs_write_fault(
__xfs_write_fault(
struct vm_fault *vmf,
unsigned int order)
unsigned int order,
struct xfs_zone_alloc_ctx *ac)
{
struct inode *inode = file_inode(vmf->vma->vm_file);
struct xfs_inode *ip = XFS_I(inode);
@@ -1511,13 +1746,50 @@ xfs_write_fault(
if (IS_DAX(inode))
ret = xfs_dax_fault_locked(vmf, order, true);
else
ret = iomap_page_mkwrite(vmf, &xfs_buffered_write_iomap_ops);
ret = iomap_page_mkwrite(vmf, &xfs_buffered_write_iomap_ops,
ac);
xfs_iunlock(ip, lock_mode);
sb_end_pagefault(inode->i_sb);
return ret;
}
static vm_fault_t
xfs_write_fault_zoned(
struct vm_fault *vmf,
unsigned int order)
{
struct xfs_inode *ip = XFS_I(file_inode(vmf->vma->vm_file));
unsigned int len = folio_size(page_folio(vmf->page));
struct xfs_zone_alloc_ctx ac = { };
int error;
vm_fault_t ret;
/*
* This could over-allocate as it doesn't check for truncation.
*
* But as the overallocation is limited to less than a folio and will be
* released instantly, that's just fine.
*/
error = xfs_zoned_space_reserve(ip, XFS_B_TO_FSB(ip->i_mount, len), 0,
&ac);
if (error < 0)
return vmf_fs_error(error);
ret = __xfs_write_fault(vmf, order, &ac);
xfs_zoned_space_unreserve(ip, &ac);
return ret;
}
static vm_fault_t
xfs_write_fault(
struct vm_fault *vmf,
unsigned int order)
{
if (xfs_is_zoned_inode(XFS_I(file_inode(vmf->vma->vm_file))))
return xfs_write_fault_zoned(vmf, order);
return __xfs_write_fault(vmf, order, NULL);
}
static inline bool
xfs_is_write_fault(
struct vm_fault *vmf)
@@ -1626,7 +1898,8 @@ const struct file_operations xfs_file_operations = {
.fadvise = xfs_file_fadvise,
.remap_file_range = xfs_file_remap_range,
.fop_flags = FOP_MMAP_SYNC | FOP_BUFFER_RASYNC |
FOP_BUFFER_WASYNC | FOP_DIO_PARALLEL_WRITE,
FOP_BUFFER_WASYNC | FOP_DIO_PARALLEL_WRITE |
FOP_DONTCACHE,
};
const struct file_operations xfs_dir_file_operations = {

View File

@@ -879,17 +879,39 @@ xfs_getfsmap_rtdev_rmapbt(
struct xfs_mount *mp = tp->t_mountp;
struct xfs_rtgroup *rtg = NULL;
struct xfs_btree_cur *bt_cur = NULL;
xfs_daddr_t rtstart_daddr;
xfs_rtblock_t start_rtb;
xfs_rtblock_t end_rtb;
xfs_rgnumber_t start_rg, end_rg;
uint64_t eofs;
int error = 0;
eofs = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks);
eofs = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rtstart + mp->m_sb.sb_rblocks);
if (keys[0].fmr_physical >= eofs)
return 0;
start_rtb = xfs_daddr_to_rtb(mp, keys[0].fmr_physical);
end_rtb = xfs_daddr_to_rtb(mp, min(eofs - 1, keys[1].fmr_physical));
rtstart_daddr = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rtstart);
if (keys[0].fmr_physical < rtstart_daddr) {
struct xfs_fsmap_irec frec = {
.owner = XFS_RMAP_OWN_FS,
.len_daddr = rtstart_daddr,
};
/* Adjust the low key if we are continuing from where we left off. */
if (keys[0].fmr_length > 0) {
info->low_daddr = keys[0].fmr_physical + keys[0].fmr_length;
return 0;
}
/* Fabricate an rmap entry for space occupied by the data dev */
error = xfs_getfsmap_helper(tp, info, &frec);
if (error)
return error;
}
start_rtb = xfs_daddr_to_rtb(mp, rtstart_daddr + keys[0].fmr_physical);
end_rtb = xfs_daddr_to_rtb(mp, rtstart_daddr +
min(eofs - 1, keys[1].fmr_physical));
info->missing_owner = XFS_FMR_OWN_FREE;
@@ -1004,22 +1026,40 @@ xfs_getfsmap_rtdev_rmapbt(
}
#endif /* CONFIG_XFS_RT */
static uint32_t
xfs_getfsmap_device(
struct xfs_mount *mp,
enum xfs_device dev)
{
if (mp->m_sb.sb_rtstart)
return dev;
switch (dev) {
case XFS_DEV_DATA:
return new_encode_dev(mp->m_ddev_targp->bt_dev);
case XFS_DEV_LOG:
return new_encode_dev(mp->m_logdev_targp->bt_dev);
case XFS_DEV_RT:
if (!mp->m_rtdev_targp)
break;
return new_encode_dev(mp->m_rtdev_targp->bt_dev);
}
return -1;
}
/* Do we recognize the device? */
STATIC bool
xfs_getfsmap_is_valid_device(
struct xfs_mount *mp,
struct xfs_fsmap *fm)
{
if (fm->fmr_device == 0 || fm->fmr_device == UINT_MAX ||
fm->fmr_device == new_encode_dev(mp->m_ddev_targp->bt_dev))
return true;
if (mp->m_logdev_targp &&
fm->fmr_device == new_encode_dev(mp->m_logdev_targp->bt_dev))
return true;
if (mp->m_rtdev_targp &&
fm->fmr_device == new_encode_dev(mp->m_rtdev_targp->bt_dev))
return true;
return false;
return fm->fmr_device == 0 ||
fm->fmr_device == UINT_MAX ||
fm->fmr_device == xfs_getfsmap_device(mp, XFS_DEV_DATA) ||
fm->fmr_device == xfs_getfsmap_device(mp, XFS_DEV_LOG) ||
(mp->m_rtdev_targp &&
fm->fmr_device == xfs_getfsmap_device(mp, XFS_DEV_RT));
}
/* Ensure that the low key is less than the high key. */
@@ -1126,7 +1166,7 @@ xfs_getfsmap(
/* Set up our device handlers. */
memset(handlers, 0, sizeof(handlers));
handlers[0].nr_sectors = XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks);
handlers[0].dev = new_encode_dev(mp->m_ddev_targp->bt_dev);
handlers[0].dev = xfs_getfsmap_device(mp, XFS_DEV_DATA);
if (use_rmap)
handlers[0].fn = xfs_getfsmap_datadev_rmapbt;
else
@@ -1134,13 +1174,17 @@ xfs_getfsmap(
if (mp->m_logdev_targp != mp->m_ddev_targp) {
handlers[1].nr_sectors = XFS_FSB_TO_BB(mp,
mp->m_sb.sb_logblocks);
handlers[1].dev = new_encode_dev(mp->m_logdev_targp->bt_dev);
handlers[1].dev = xfs_getfsmap_device(mp, XFS_DEV_LOG);
handlers[1].fn = xfs_getfsmap_logdev;
}
#ifdef CONFIG_XFS_RT
if (mp->m_rtdev_targp) {
/*
* For zoned file systems there is no rtbitmap, so only support fsmap
* if the caller is privileged enough to use the full rmap version.
*/
if (mp->m_rtdev_targp && (use_rmap || !xfs_has_zoned(mp))) {
handlers[2].nr_sectors = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks);
handlers[2].dev = new_encode_dev(mp->m_rtdev_targp->bt_dev);
handlers[2].dev = xfs_getfsmap_device(mp, XFS_DEV_RT);
if (use_rmap)
handlers[2].fn = xfs_getfsmap_rtdev_rmapbt;
else
@@ -1230,7 +1274,13 @@ xfs_getfsmap(
if (tp)
xfs_trans_cancel(tp);
head->fmh_oflags = FMH_OF_DEV_T;
/*
* For an internal RT device we need to report different synthetic devices
* for a single physical device, and thus can't report the actual dev_t.
*/
if (!mp->m_sb.sb_rtstart)
head->fmh_oflags = FMH_OF_DEV_T;
return error;
}

View File

@@ -24,6 +24,7 @@
#include "xfs_rtalloc.h"
#include "xfs_rtrmap_btree.h"
#include "xfs_rtrefcount_btree.h"
#include "xfs_metafile.h"
/*
* Write new AG headers to disk. Non-transactional, but need to be
@@ -307,6 +308,10 @@ xfs_growfs_data(
if (!mutex_trylock(&mp->m_growlock))
return -EWOULDBLOCK;
/* we can't grow the data section when an internal RT section exists */
if (in->newblocks != mp->m_sb.sb_dblocks && mp->m_sb.sb_rtstart)
return -EINVAL;
/* update imaxpct separately to the physical grow of the filesystem */
if (in->imaxpct != mp->m_sb.sb_imax_pct) {
error = xfs_growfs_imaxpct(mp, in->imaxpct);
@@ -366,6 +371,7 @@ xfs_growfs_log(
int
xfs_reserve_blocks(
struct xfs_mount *mp,
enum xfs_free_counter ctr,
uint64_t request)
{
int64_t lcounter, delta;
@@ -373,6 +379,8 @@ xfs_reserve_blocks(
int64_t free;
int error = 0;
ASSERT(ctr < XC_FREE_NR);
/*
* With per-cpu counters, this becomes an interesting problem. we need
* to work out if we are freeing or allocation blocks first, then we can
@@ -391,16 +399,16 @@ xfs_reserve_blocks(
* counters directly since we shouldn't have any problems unreserving
* space.
*/
if (mp->m_resblks > request) {
lcounter = mp->m_resblks_avail - request;
if (mp->m_free[ctr].res_total > request) {
lcounter = mp->m_free[ctr].res_avail - request;
if (lcounter > 0) { /* release unused blocks */
fdblks_delta = lcounter;
mp->m_resblks_avail -= lcounter;
mp->m_free[ctr].res_avail -= lcounter;
}
mp->m_resblks = request;
mp->m_free[ctr].res_total = request;
if (fdblks_delta) {
spin_unlock(&mp->m_sb_lock);
xfs_add_fdblocks(mp, fdblks_delta);
xfs_add_freecounter(mp, ctr, fdblks_delta);
spin_lock(&mp->m_sb_lock);
}
@@ -409,7 +417,7 @@ xfs_reserve_blocks(
/*
* If the request is larger than the current reservation, reserve the
* blocks before we update the reserve counters. Sample m_fdblocks and
* blocks before we update the reserve counters. Sample m_free and
* perform a partial reservation if the request exceeds free space.
*
* The code below estimates how many blocks it can request from
@@ -419,10 +427,10 @@ xfs_reserve_blocks(
* space to fill it because mod_fdblocks will refill an undersized
* reserve when it can.
*/
free = percpu_counter_sum(&mp->m_fdblocks) -
xfs_fdblocks_unavailable(mp);
delta = request - mp->m_resblks;
mp->m_resblks = request;
free = xfs_sum_freecounter_raw(mp, ctr) -
xfs_freecounter_unavailable(mp, ctr);
delta = request - mp->m_free[ctr].res_total;
mp->m_free[ctr].res_total = request;
if (delta > 0 && free > 0) {
/*
* We'll either succeed in getting space from the free block
@@ -436,9 +444,9 @@ xfs_reserve_blocks(
*/
fdblks_delta = min(free, delta);
spin_unlock(&mp->m_sb_lock);
error = xfs_dec_fdblocks(mp, fdblks_delta, 0);
error = xfs_dec_freecounter(mp, ctr, fdblks_delta, 0);
if (!error)
xfs_add_fdblocks(mp, fdblks_delta);
xfs_add_freecounter(mp, ctr, fdblks_delta);
spin_lock(&mp->m_sb_lock);
}
out:
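A minimal sketch of the partial-reservation step shown in this hunk, with plain integers standing in for the free counter and the reserve fields; only the min(free, delta) arithmetic is illustrated, not the locking or per-cpu counter handling.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	int64_t res_total = 100;	/* current reserve size */
	int64_t request = 300;		/* newly requested reserve size */
	int64_t free = 150;		/* free blocks minus unavailable ones */

	int64_t delta = request - res_total;		  /* 200 wanted */
	int64_t fdblks_delta = free < delta ? free : delta; /* 150 granted */

	printf("reserve grows by %lld of the %lld requested blocks\n",
	       (long long)fdblks_delta, (long long)delta);
	return 0;
}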
@@ -558,15 +566,13 @@ xfs_fs_reserve_ag_blocks(
return error;
}
if (xfs_has_realtime(mp)) {
err2 = xfs_rt_resv_init(mp);
if (err2 && err2 != -ENOSPC) {
xfs_warn(mp,
"Error %d reserving realtime metadata reserve pool.", err2);
xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
}
err2 = xfs_metafile_resv_init(mp);
if (err2 && err2 != -ENOSPC) {
xfs_warn(mp,
"Error %d reserving realtime metadata reserve pool.", err2);
xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
if (err2 && !error)
if (!error)
error = err2;
}
@@ -582,9 +588,7 @@ xfs_fs_unreserve_ag_blocks(
{
struct xfs_perag *pag = NULL;
if (xfs_has_realtime(mp))
xfs_rt_resv_free(mp);
xfs_metafile_resv_free(mp);
while ((pag = xfs_perag_next(mp, pag)))
xfs_ag_resv_free(pag);
}

View File

@@ -8,7 +8,8 @@
int xfs_growfs_data(struct xfs_mount *mp, struct xfs_growfs_data *in);
int xfs_growfs_log(struct xfs_mount *mp, struct xfs_growfs_log *in);
int xfs_reserve_blocks(struct xfs_mount *mp, uint64_t request);
int xfs_reserve_blocks(struct xfs_mount *mp, enum xfs_free_counter cnt,
uint64_t request);
int xfs_fs_goingdown(struct xfs_mount *mp, uint32_t inflags);
int xfs_fs_reserve_ag_blocks(struct xfs_mount *mp);

View File

@@ -2073,10 +2073,10 @@ xfs_inodegc_want_queue_rt_file(
{
struct xfs_mount *mp = ip->i_mount;
if (!XFS_IS_REALTIME_INODE(ip))
if (!XFS_IS_REALTIME_INODE(ip) || xfs_has_zoned(mp))
return false;
if (__percpu_counter_compare(&mp->m_frextents,
if (xfs_compare_freecounter(mp, XC_FREE_RTEXTENTS,
mp->m_low_rtexts[XFS_LOWSP_5_PCNT],
XFS_FDBLOCKS_BATCH) < 0)
return true;
@@ -2104,7 +2104,7 @@ xfs_inodegc_want_queue_work(
if (items > mp->m_ino_geo.inodes_per_cluster)
return true;
if (__percpu_counter_compare(&mp->m_fdblocks,
if (xfs_compare_freecounter(mp, XC_FREE_BLOCKS,
mp->m_low_space[XFS_LOWSP_5_PCNT],
XFS_FDBLOCKS_BATCH) < 0)
return true;

View File

@@ -3074,5 +3074,6 @@ bool
xfs_is_always_cow_inode(
const struct xfs_inode *ip)
{
return ip->i_mount->m_always_cow && xfs_has_reflink(ip->i_mount);
return xfs_is_zoned_inode(ip) ||
(ip->i_mount->m_always_cow && xfs_has_reflink(ip->i_mount));
}

View File

@@ -25,19 +25,9 @@ struct xfs_dquot;
typedef struct xfs_inode {
/* Inode linking and identification information. */
struct xfs_mount *i_mount; /* fs mount struct ptr */
union {
struct {
struct xfs_dquot *i_udquot; /* user dquot */
struct xfs_dquot *i_gdquot; /* group dquot */
struct xfs_dquot *i_pdquot; /* project dquot */
};
/*
* Space that has been set aside to accomodate expansions of a
* metadata btree rooted in this file.
*/
uint64_t i_meta_resv_asked;
};
struct xfs_dquot *i_udquot; /* user dquot */
struct xfs_dquot *i_gdquot; /* group dquot */
struct xfs_dquot *i_pdquot; /* project dquot */
/* Inode location stuff */
xfs_ino_t i_ino; /* inode number (agno/agino)*/
@@ -69,8 +59,13 @@ typedef struct xfs_inode {
xfs_rfsblock_t i_nblocks; /* # of direct & btree blocks */
prid_t i_projid; /* owner's project id */
xfs_extlen_t i_extsize; /* basic/minimum extent size */
/* cowextsize is only used for v3 inodes, flushiter for v1/2 */
/*
* i_used_blocks is used for zoned rtrmap inodes,
* i_cowextsize is used for other v3 inodes,
* i_flushiter for v1/2 inodes
*/
union {
uint32_t i_used_blocks; /* used blocks in RTG */
xfs_extlen_t i_cowextsize; /* basic cow extent size */
uint16_t i_flushiter; /* incremented on flush */
};
@@ -309,6 +304,11 @@ static inline bool xfs_is_internal_inode(const struct xfs_inode *ip)
xfs_is_quota_inode(&mp->m_sb, ip->i_ino);
}
static inline bool xfs_is_zoned_inode(const struct xfs_inode *ip)
{
return xfs_has_zoned(ip->i_mount) && XFS_IS_REALTIME_INODE(ip);
}
bool xfs_is_always_cow_inode(const struct xfs_inode *ip);
static inline bool xfs_is_cow_inode(const struct xfs_inode *ip)

View File

@@ -596,6 +596,7 @@ xfs_inode_to_log_dinode(
to->di_changecount = inode_peek_iversion(inode);
to->di_crtime = xfs_inode_to_log_dinode_ts(ip, ip->i_crtime);
to->di_flags2 = ip->i_diflags2;
/* also covers the di_used_blocks union arm: */
to->di_cowextsize = ip->i_cowextsize;
to->di_ino = ip->i_ino;
to->di_lsn = lsn;

View File

@@ -203,6 +203,7 @@ xfs_log_dinode_to_disk(
to->di_crtime = xfs_log_dinode_to_disk_ts(from,
from->di_crtime);
to->di_flags2 = cpu_to_be64(from->di_flags2);
/* also covers the di_used_blocks union arm: */
to->di_cowextsize = cpu_to_be32(from->di_cowextsize);
to->di_ino = cpu_to_be64(from->di_ino);
to->di_lsn = cpu_to_be64(lsn);

View File

@@ -1131,15 +1131,15 @@ xfs_ioctl_getset_resblocks(
error = mnt_want_write_file(filp);
if (error)
return error;
error = xfs_reserve_blocks(mp, fsop.resblks);
error = xfs_reserve_blocks(mp, XC_FREE_BLOCKS, fsop.resblks);
mnt_drop_write_file(filp);
if (error)
return error;
}
spin_lock(&mp->m_sb_lock);
fsop.resblks = mp->m_resblks;
fsop.resblks_avail = mp->m_resblks_avail;
fsop.resblks = mp->m_free[XC_FREE_BLOCKS].res_total;
fsop.resblks_avail = mp->m_free[XC_FREE_BLOCKS].res_avail;
spin_unlock(&mp->m_sb_lock);
if (copy_to_user(arg, &fsop, sizeof(fsop)))
@@ -1155,9 +1155,9 @@ xfs_ioctl_fs_counts(
struct xfs_fsop_counts out = {
.allocino = percpu_counter_read_positive(&mp->m_icount),
.freeino = percpu_counter_read_positive(&mp->m_ifree),
.freedata = percpu_counter_read_positive(&mp->m_fdblocks) -
xfs_fdblocks_unavailable(mp),
.freertx = percpu_counter_read_positive(&mp->m_frextents),
.freedata = xfs_estimate_freecounter(mp, XC_FREE_BLOCKS) -
xfs_freecounter_unavailable(mp, XC_FREE_BLOCKS),
.freertx = xfs_estimate_freecounter(mp, XC_FREE_RTEXTENTS),
};
if (copy_to_user(uarg, &out, sizeof(out)))

View File

@@ -30,6 +30,8 @@
#include "xfs_reflink.h"
#include "xfs_health.h"
#include "xfs_rtbitmap.h"
#include "xfs_icache.h"
#include "xfs_zone_alloc.h"
#define XFS_ALLOC_ALIGN(mp, off) \
(((off) >> mp->m_allocsize_log) << mp->m_allocsize_log)
@@ -431,13 +433,14 @@ xfs_quota_calc_throttle(
static int64_t
xfs_iomap_freesp(
struct percpu_counter *counter,
struct xfs_mount *mp,
unsigned int idx,
uint64_t low_space[XFS_LOWSP_MAX],
int *shift)
{
int64_t freesp;
freesp = percpu_counter_read_positive(counter);
freesp = xfs_estimate_freecounter(mp, idx);
if (freesp < low_space[XFS_LOWSP_5_PCNT]) {
*shift = 2;
if (freesp < low_space[XFS_LOWSP_4_PCNT])
@@ -536,10 +539,10 @@ xfs_iomap_prealloc_size(
if (unlikely(XFS_IS_REALTIME_INODE(ip)))
freesp = xfs_rtbxlen_to_blen(mp,
xfs_iomap_freesp(&mp->m_frextents,
xfs_iomap_freesp(mp, XC_FREE_RTEXTENTS,
mp->m_low_rtexts, &shift));
else
freesp = xfs_iomap_freesp(&mp->m_fdblocks, mp->m_low_space,
freesp = xfs_iomap_freesp(mp, XC_FREE_BLOCKS, mp->m_low_space,
&shift);
/*
@@ -962,6 +965,59 @@ const struct iomap_ops xfs_direct_write_iomap_ops = {
.iomap_begin = xfs_direct_write_iomap_begin,
};
#ifdef CONFIG_XFS_RT
/*
* This is really simple. The space has already been reserved before taking the
* IOLOCK; the actual block allocation is done just before submitting the bio
* and only recorded in the extent map on I/O completion.
*/
static int
xfs_zoned_direct_write_iomap_begin(
struct inode *inode,
loff_t offset,
loff_t length,
unsigned flags,
struct iomap *iomap,
struct iomap *srcmap)
{
struct xfs_inode *ip = XFS_I(inode);
int error;
ASSERT(!(flags & IOMAP_OVERWRITE_ONLY));
/*
* Needs to be pushed down into the allocator so that only writes into
* a single zone can be supported.
*/
if (flags & IOMAP_NOWAIT)
return -EAGAIN;
/*
* Ensure the extent list is in memory so that we don't have to
* read it from the I/O completion handler.
*/
if (xfs_need_iread_extents(&ip->i_df)) {
xfs_ilock(ip, XFS_ILOCK_EXCL);
error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
if (error)
return error;
}
iomap->type = IOMAP_MAPPED;
iomap->flags = IOMAP_F_DIRTY;
iomap->bdev = ip->i_mount->m_rtdev_targp->bt_bdev;
iomap->offset = offset;
iomap->length = length;
iomap->flags = IOMAP_F_ANON_WRITE;
return 0;
}
const struct iomap_ops xfs_zoned_direct_write_iomap_ops = {
.iomap_begin = xfs_zoned_direct_write_iomap_begin,
};
#endif /* CONFIG_XFS_RT */
static int
xfs_dax_write_iomap_end(
struct inode *inode,
@@ -987,6 +1043,455 @@ const struct iomap_ops xfs_dax_write_iomap_ops = {
.iomap_end = xfs_dax_write_iomap_end,
};
/*
* Convert a hole to a delayed allocation.
*/
static void
xfs_bmap_add_extent_hole_delay(
struct xfs_inode *ip, /* incore inode pointer */
int whichfork,
struct xfs_iext_cursor *icur,
struct xfs_bmbt_irec *new) /* new data to add to file extents */
{
struct xfs_ifork *ifp; /* inode fork pointer */
xfs_bmbt_irec_t left; /* left neighbor extent entry */
xfs_filblks_t newlen=0; /* new indirect size */
xfs_filblks_t oldlen=0; /* old indirect size */
xfs_bmbt_irec_t right; /* right neighbor extent entry */
uint32_t state = xfs_bmap_fork_to_state(whichfork);
xfs_filblks_t temp; /* temp for indirect calculations */
ifp = xfs_ifork_ptr(ip, whichfork);
ASSERT(isnullstartblock(new->br_startblock));
/*
* Check and set flags if this segment has a left neighbor
*/
if (xfs_iext_peek_prev_extent(ifp, icur, &left)) {
state |= BMAP_LEFT_VALID;
if (isnullstartblock(left.br_startblock))
state |= BMAP_LEFT_DELAY;
}
/*
* Check and set flags if the current (right) segment exists.
* If it doesn't exist, we're converting the hole at end-of-file.
*/
if (xfs_iext_get_extent(ifp, icur, &right)) {
state |= BMAP_RIGHT_VALID;
if (isnullstartblock(right.br_startblock))
state |= BMAP_RIGHT_DELAY;
}
/*
* Set contiguity flags on the left and right neighbors.
* Don't let extents get too large, even if the pieces are contiguous.
*/
if ((state & BMAP_LEFT_VALID) && (state & BMAP_LEFT_DELAY) &&
left.br_startoff + left.br_blockcount == new->br_startoff &&
left.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN)
state |= BMAP_LEFT_CONTIG;
if ((state & BMAP_RIGHT_VALID) && (state & BMAP_RIGHT_DELAY) &&
new->br_startoff + new->br_blockcount == right.br_startoff &&
new->br_blockcount + right.br_blockcount <= XFS_MAX_BMBT_EXTLEN &&
(!(state & BMAP_LEFT_CONTIG) ||
(left.br_blockcount + new->br_blockcount +
right.br_blockcount <= XFS_MAX_BMBT_EXTLEN)))
state |= BMAP_RIGHT_CONTIG;
/*
* Switch out based on the contiguity flags.
*/
switch (state & (BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG)) {
case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
/*
* New allocation is contiguous with delayed allocations
* on the left and on the right.
* Merge all three into a single extent record.
*/
temp = left.br_blockcount + new->br_blockcount +
right.br_blockcount;
oldlen = startblockval(left.br_startblock) +
startblockval(new->br_startblock) +
startblockval(right.br_startblock);
newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
oldlen);
left.br_startblock = nullstartblock(newlen);
left.br_blockcount = temp;
xfs_iext_remove(ip, icur, state);
xfs_iext_prev(ifp, icur);
xfs_iext_update_extent(ip, state, icur, &left);
break;
case BMAP_LEFT_CONTIG:
/*
* New allocation is contiguous with a delayed allocation
* on the left.
* Merge the new allocation with the left neighbor.
*/
temp = left.br_blockcount + new->br_blockcount;
oldlen = startblockval(left.br_startblock) +
startblockval(new->br_startblock);
newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
oldlen);
left.br_blockcount = temp;
left.br_startblock = nullstartblock(newlen);
xfs_iext_prev(ifp, icur);
xfs_iext_update_extent(ip, state, icur, &left);
break;
case BMAP_RIGHT_CONTIG:
/*
* New allocation is contiguous with a delayed allocation
* on the right.
* Merge the new allocation with the right neighbor.
*/
temp = new->br_blockcount + right.br_blockcount;
oldlen = startblockval(new->br_startblock) +
startblockval(right.br_startblock);
newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
oldlen);
right.br_startoff = new->br_startoff;
right.br_startblock = nullstartblock(newlen);
right.br_blockcount = temp;
xfs_iext_update_extent(ip, state, icur, &right);
break;
case 0:
/*
* New allocation is not contiguous with another
* delayed allocation.
* Insert a new entry.
*/
oldlen = newlen = 0;
xfs_iext_insert(ip, icur, new, state);
break;
}
if (oldlen != newlen) {
ASSERT(oldlen > newlen);
xfs_add_fdblocks(ip->i_mount, oldlen - newlen);
/*
* Nothing to do for disk quota accounting here.
*/
xfs_mod_delalloc(ip, 0, (int64_t)newlen - oldlen);
}
}
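The fdblocks adjustment at the end of the merge above follows from the fact that a merged delalloc extent never needs more worst-case indirect blocks than its pieces had reserved, so the surplus is returned. A self-contained sketch of that accounting; worst_indlen() here is a toy stand-in for xfs_bmap_worst_indlen(), not the real calculation.

#include <stdint.h>
#include <stdio.h>

/* Crude toy model: one indirect block per 128 data blocks, rounded up. */
static uint64_t worst_indlen(uint64_t blockcount)
{
	return (blockcount + 127) / 128;
}

int main(void)
{
	uint64_t left_len = 100, new_len = 60, right_len = 80;
	uint64_t oldlen = worst_indlen(left_len) + worst_indlen(new_len) +
			  worst_indlen(right_len);	/* 1 + 1 + 1 = 3 */
	uint64_t merged = left_len + new_len + right_len;	/* 240 blocks */
	uint64_t newlen = worst_indlen(merged);			/* 2 */

	if (newlen > oldlen)
		newlen = oldlen;	/* like XFS_FILBLKS_MIN(worst, oldlen) */

	/* The 1 surplus indirect block goes back to the free-space counter. */
	printf("returned to fdblocks: %llu indirect blocks\n",
	       (unsigned long long)(oldlen - newlen));
	return 0;
}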
/*
* Add a delayed allocation extent to an inode. Blocks are reserved from the
* global pool and the extent inserted into the inode in-core extent tree.
*
* On entry, got refers to the first extent beyond the offset of the extent to
* allocate or eof is specified if no such extent exists. On return, got refers
* to the extent record that was inserted to the inode fork.
*
* Note that the allocated extent may have been merged with contiguous extents
* during insertion into the inode fork. Thus, got does not reflect the current
* state of the inode fork on return. If necessary, the caller can use lastx to
* look up the updated record in the inode fork.
*/
static int
xfs_bmapi_reserve_delalloc(
struct xfs_inode *ip,
int whichfork,
xfs_fileoff_t off,
xfs_filblks_t len,
xfs_filblks_t prealloc,
struct xfs_bmbt_irec *got,
struct xfs_iext_cursor *icur,
int eof)
{
struct xfs_mount *mp = ip->i_mount;
struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
xfs_extlen_t alen;
xfs_extlen_t indlen;
uint64_t fdblocks;
int error;
xfs_fileoff_t aoff;
bool use_cowextszhint =
whichfork == XFS_COW_FORK && !prealloc;
retry:
/*
* Cap the alloc length. Keep track of prealloc so we know whether to
* tag the inode before we return.
*/
aoff = off;
alen = XFS_FILBLKS_MIN(len + prealloc, XFS_MAX_BMBT_EXTLEN);
if (!eof)
alen = XFS_FILBLKS_MIN(alen, got->br_startoff - aoff);
if (prealloc && alen >= len)
prealloc = alen - len;
/*
* If we're targeting the COW fork but aren't creating a speculative
* posteof preallocation, try to expand the reservation to align with
* the COW extent size hint if there's sufficient free space.
*
* Unlike the data fork, the CoW cancellation functions will free all
* the reservations at inactivation, so we don't require that every
* delalloc reservation have a dirty pagecache.
*/
if (use_cowextszhint) {
struct xfs_bmbt_irec prev;
xfs_extlen_t extsz = xfs_get_cowextsz_hint(ip);
if (!xfs_iext_peek_prev_extent(ifp, icur, &prev))
prev.br_startoff = NULLFILEOFF;
error = xfs_bmap_extsize_align(mp, got, &prev, extsz, 0, eof,
1, 0, &aoff, &alen);
ASSERT(!error);
}
/*
* Make a transaction-less quota reservation for delayed allocation
* blocks. This number gets adjusted later. We return if we haven't
* allocated blocks already inside this loop.
*/
error = xfs_quota_reserve_blkres(ip, alen);
if (error)
goto out;
/*
* Split changing sb for alen and indlen since they could be coming
* from different places.
*/
indlen = (xfs_extlen_t)xfs_bmap_worst_indlen(ip, alen);
ASSERT(indlen > 0);
fdblocks = indlen;
if (XFS_IS_REALTIME_INODE(ip)) {
ASSERT(!xfs_is_zoned_inode(ip));
error = xfs_dec_frextents(mp, xfs_blen_to_rtbxlen(mp, alen));
if (error)
goto out_unreserve_quota;
} else {
fdblocks += alen;
}
error = xfs_dec_fdblocks(mp, fdblocks, false);
if (error)
goto out_unreserve_frextents;
ip->i_delayed_blks += alen;
xfs_mod_delalloc(ip, alen, indlen);
got->br_startoff = aoff;
got->br_startblock = nullstartblock(indlen);
got->br_blockcount = alen;
got->br_state = XFS_EXT_NORM;
xfs_bmap_add_extent_hole_delay(ip, whichfork, icur, got);
/*
* Tag the inode if blocks were preallocated. Note that COW fork
* preallocation can occur at the start or end of the extent, even when
* prealloc == 0, so we must also check the aligned offset and length.
*/
if (whichfork == XFS_DATA_FORK && prealloc)
xfs_inode_set_eofblocks_tag(ip);
if (whichfork == XFS_COW_FORK && (prealloc || aoff < off || alen > len))
xfs_inode_set_cowblocks_tag(ip);
return 0;
out_unreserve_frextents:
if (XFS_IS_REALTIME_INODE(ip))
xfs_add_frextents(mp, xfs_blen_to_rtbxlen(mp, alen));
out_unreserve_quota:
if (XFS_IS_QUOTA_ON(mp))
xfs_quota_unreserve_blkres(ip, alen);
out:
if (error == -ENOSPC || error == -EDQUOT) {
trace_xfs_delalloc_enospc(ip, off, len);
if (prealloc || use_cowextszhint) {
/* retry without any preallocation */
use_cowextszhint = false;
prealloc = 0;
goto retry;
}
}
return error;
}
static int
xfs_zoned_buffered_write_iomap_begin(
struct inode *inode,
loff_t offset,
loff_t count,
unsigned flags,
struct iomap *iomap,
struct iomap *srcmap)
{
struct iomap_iter *iter =
container_of(iomap, struct iomap_iter, iomap);
struct xfs_zone_alloc_ctx *ac = iter->private;
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
xfs_fileoff_t end_fsb = xfs_iomap_end_fsb(mp, offset, count);
u16 iomap_flags = IOMAP_F_SHARED;
unsigned int lockmode = XFS_ILOCK_EXCL;
xfs_filblks_t count_fsb;
xfs_extlen_t indlen;
struct xfs_bmbt_irec got;
struct xfs_iext_cursor icur;
int error = 0;
ASSERT(!xfs_get_extsz_hint(ip));
ASSERT(!(flags & IOMAP_UNSHARE));
ASSERT(ac);
if (xfs_is_shutdown(mp))
return -EIO;
error = xfs_qm_dqattach(ip);
if (error)
return error;
error = xfs_ilock_for_iomap(ip, flags, &lockmode);
if (error)
return error;
if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(&ip->i_df)) ||
XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
xfs_bmap_mark_sick(ip, XFS_DATA_FORK);
error = -EFSCORRUPTED;
goto out_unlock;
}
XFS_STATS_INC(mp, xs_blk_mapw);
error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
if (error)
goto out_unlock;
/*
* For zeroing operations check if there is any data to zero first.
*
* For regular writes we always need to allocate new blocks, but need to
* provide the source mapping when the range is unaligned to support
* read-modify-write of the whole block in the page cache.
*
* In either case we need to limit the reported range to the boundaries
* of the source map in the data fork.
*/
if (!IS_ALIGNED(offset, mp->m_sb.sb_blocksize) ||
!IS_ALIGNED(offset + count, mp->m_sb.sb_blocksize) ||
(flags & IOMAP_ZERO)) {
struct xfs_bmbt_irec smap;
struct xfs_iext_cursor scur;
if (!xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &scur,
&smap))
smap.br_startoff = end_fsb; /* fake hole until EOF */
if (smap.br_startoff > offset_fsb) {
/*
* We never need to allocate blocks for zeroing a hole.
*/
if (flags & IOMAP_ZERO) {
xfs_hole_to_iomap(ip, iomap, offset_fsb,
smap.br_startoff);
goto out_unlock;
}
end_fsb = min(end_fsb, smap.br_startoff);
} else {
end_fsb = min(end_fsb,
smap.br_startoff + smap.br_blockcount);
xfs_trim_extent(&smap, offset_fsb,
end_fsb - offset_fsb);
error = xfs_bmbt_to_iomap(ip, srcmap, &smap, flags, 0,
xfs_iomap_inode_sequence(ip, 0));
if (error)
goto out_unlock;
}
}
if (!ip->i_cowfp)
xfs_ifork_init_cow(ip);
if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got))
got.br_startoff = end_fsb;
if (got.br_startoff <= offset_fsb) {
trace_xfs_reflink_cow_found(ip, &got);
goto done;
}
/*
* Cap the maximum length to keep the chunks of work done here somewhat
* symmetric with the work writeback does.
*/
end_fsb = min(end_fsb, got.br_startoff);
count_fsb = min3(end_fsb - offset_fsb, XFS_MAX_BMBT_EXTLEN,
XFS_B_TO_FSB(mp, 1024 * PAGE_SIZE));
/*
* The block reservation is supposed to cover all blocks that the
* operation could possibly write, but there is a nasty corner case
* where blocks could be stolen from underneath us:
*
* 1) while this thread iterates over a larger buffered write,
* 2) another thread is causing a write fault that calls into
* ->page_mkwrite in the range this thread writes to, using up the
* delalloc reservation created by a previous call to this function.
* 3) another thread does direct I/O on the range that the write fault
* happened on, which causes writeback of the dirty data.
* 4) this then sets the stale flag, which cuts the current iomap
* iteration short, causing the new call to ->iomap_begin that gets
* us here again, but now without a sufficient reservation.
*
* This is a very unusual I/O pattern, and nothing but generic/095 is
* known to hit it. There's not really much we can do here, so turn this
* into a short write.
*/
if (count_fsb > ac->reserved_blocks) {
xfs_warn_ratelimited(mp,
"Short write on ino 0x%llx comm %.20s due to three-way race with write fault and direct I/O",
ip->i_ino, current->comm);
count_fsb = ac->reserved_blocks;
if (!count_fsb) {
error = -EIO;
goto out_unlock;
}
}
error = xfs_quota_reserve_blkres(ip, count_fsb);
if (error)
goto out_unlock;
indlen = xfs_bmap_worst_indlen(ip, count_fsb);
error = xfs_dec_fdblocks(mp, indlen, false);
if (error)
goto out_unlock;
ip->i_delayed_blks += count_fsb;
xfs_mod_delalloc(ip, count_fsb, indlen);
got.br_startoff = offset_fsb;
got.br_startblock = nullstartblock(indlen);
got.br_blockcount = count_fsb;
got.br_state = XFS_EXT_NORM;
xfs_bmap_add_extent_hole_delay(ip, XFS_COW_FORK, &icur, &got);
ac->reserved_blocks -= count_fsb;
iomap_flags |= IOMAP_F_NEW;
trace_xfs_iomap_alloc(ip, offset, XFS_FSB_TO_B(mp, count_fsb),
XFS_COW_FORK, &got);
done:
error = xfs_bmbt_to_iomap(ip, iomap, &got, flags, iomap_flags,
xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED));
out_unlock:
xfs_iunlock(ip, lockmode);
return error;
}
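The short-write fallback described in the race comment above boils down to clamping the request to whatever reservation is left. A hedged standalone sketch follows; the names are illustrative, and unlike the kernel code it consumes the reservation in the same step rather than after the delalloc extent is added.

#include <errno.h>
#include <stdint.h>

/*
 * Clamp a requested block count to the remaining reservation and fail only
 * when nothing is left (illustrative model, not the kernel API).
 */
static int clamp_to_reservation(uint64_t *count_fsb, uint64_t *reserved_blocks)
{
	if (*count_fsb > *reserved_blocks) {
		*count_fsb = *reserved_blocks;
		if (*count_fsb == 0)
			return -EIO;	/* reservation fully consumed */
	}
	*reserved_blocks -= *count_fsb;	/* consume what we hand out */
	return 0;
}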
static int
xfs_buffered_write_iomap_begin(
struct inode *inode,
@@ -1013,6 +1518,10 @@ xfs_buffered_write_iomap_begin(
if (xfs_is_shutdown(mp))
return -EIO;
if (xfs_is_zoned_inode(ip))
return xfs_zoned_buffered_write_iomap_begin(inode, offset,
count, flags, iomap, srcmap);
/* we can't use delayed allocations when using extent size hints */
if (xfs_get_extsz_hint(ip))
return xfs_direct_write_iomap_begin(inode, offset, count,
@@ -1245,10 +1754,13 @@ xfs_buffered_write_delalloc_punch(
loff_t length,
struct iomap *iomap)
{
struct iomap_iter *iter =
container_of(iomap, struct iomap_iter, iomap);
xfs_bmap_punch_delalloc_range(XFS_I(inode),
(iomap->flags & IOMAP_F_SHARED) ?
XFS_COW_FORK : XFS_DATA_FORK,
offset, offset + length);
offset, offset + length, iter->private);
}
static int
@@ -1485,6 +1997,7 @@ xfs_zero_range(
struct xfs_inode *ip,
loff_t pos,
loff_t len,
struct xfs_zone_alloc_ctx *ac,
bool *did_zero)
{
struct inode *inode = VFS_I(ip);
@@ -1495,13 +2008,14 @@ xfs_zero_range(
return dax_zero_range(inode, pos, len, did_zero,
&xfs_dax_write_iomap_ops);
return iomap_zero_range(inode, pos, len, did_zero,
&xfs_buffered_write_iomap_ops);
&xfs_buffered_write_iomap_ops, ac);
}
int
xfs_truncate_page(
struct xfs_inode *ip,
loff_t pos,
struct xfs_zone_alloc_ctx *ac,
bool *did_zero)
{
struct inode *inode = VFS_I(ip);
@@ -1510,5 +2024,5 @@ xfs_truncate_page(
return dax_truncate_page(inode, pos, did_zero,
&xfs_dax_write_iomap_ops);
return iomap_truncate_page(inode, pos, did_zero,
&xfs_buffered_write_iomap_ops);
&xfs_buffered_write_iomap_ops, ac);
}

View File

@@ -10,6 +10,7 @@
struct xfs_inode;
struct xfs_bmbt_irec;
struct xfs_zone_alloc_ctx;
int xfs_iomap_write_direct(struct xfs_inode *ip, xfs_fileoff_t offset_fsb,
xfs_fileoff_t count_fsb, unsigned int flags,
@@ -24,8 +25,9 @@ int xfs_bmbt_to_iomap(struct xfs_inode *ip, struct iomap *iomap,
u16 iomap_flags, u64 sequence_cookie);
int xfs_zero_range(struct xfs_inode *ip, loff_t pos, loff_t len,
bool *did_zero);
int xfs_truncate_page(struct xfs_inode *ip, loff_t pos, bool *did_zero);
struct xfs_zone_alloc_ctx *ac, bool *did_zero);
int xfs_truncate_page(struct xfs_inode *ip, loff_t pos,
struct xfs_zone_alloc_ctx *ac, bool *did_zero);
static inline xfs_filblks_t
xfs_aligned_fsb_count(
@@ -49,6 +51,7 @@ xfs_aligned_fsb_count(
extern const struct iomap_ops xfs_buffered_write_iomap_ops;
extern const struct iomap_ops xfs_direct_write_iomap_ops;
extern const struct iomap_ops xfs_zoned_direct_write_iomap_ops;
extern const struct iomap_ops xfs_read_iomap_ops;
extern const struct iomap_ops xfs_seek_iomap_ops;
extern const struct iomap_ops xfs_xattr_iomap_ops;

View File

@@ -29,6 +29,7 @@
#include "xfs_xattr.h"
#include "xfs_file.h"
#include "xfs_bmap.h"
#include "xfs_zone_alloc.h"
#include <linux/posix_acl.h>
#include <linux/security.h>
@@ -854,6 +855,7 @@ xfs_setattr_size(
uint lock_flags = 0;
uint resblks = 0;
bool did_zeroing = false;
struct xfs_zone_alloc_ctx ac = { };
xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL);
ASSERT(S_ISREG(inode->i_mode));
@@ -889,6 +891,28 @@ xfs_setattr_size(
*/
inode_dio_wait(inode);
/*
* Normally xfs_zoned_space_reserve is supposed to be called outside the
* IOLOCK. For truncate we can't do that since ->setattr is called with
* it already held by the VFS. So for now chicken out and try to
* allocate space under it.
*
* To avoid deadlocks this means we can't block waiting for space, which
* can lead to spurious -ENOSPC if there are no directly available
* blocks. We mitigate this a bit by allowing zeroing to dip into the
* reserved pool, but eventually the VFS calling convention needs to
* change.
*/
if (xfs_is_zoned_inode(ip)) {
error = xfs_zoned_space_reserve(ip, 1,
XFS_ZR_NOWAIT | XFS_ZR_RESERVED, &ac);
if (error) {
if (error == -EAGAIN)
return -ENOSPC;
return error;
}
}
/*
* File data changes must be complete before we start the transaction to
* modify the inode. This needs to be done before joining the inode to
@@ -902,11 +926,14 @@ xfs_setattr_size(
if (newsize > oldsize) {
trace_xfs_zero_eof(ip, oldsize, newsize - oldsize);
error = xfs_zero_range(ip, oldsize, newsize - oldsize,
&did_zeroing);
&ac, &did_zeroing);
} else {
error = xfs_truncate_page(ip, newsize, &did_zeroing);
error = xfs_truncate_page(ip, newsize, &ac, &did_zeroing);
}
if (xfs_is_zoned_inode(ip))
xfs_zoned_space_unreserve(ip, &ac);
if (error)
return error;

View File

@@ -20,6 +20,7 @@
#include "xfs_sysfs.h"
#include "xfs_sb.h"
#include "xfs_health.h"
#include "xfs_zone_alloc.h"
struct kmem_cache *xfs_log_ticket_cache;
@@ -3540,6 +3541,9 @@ xlog_force_shutdown(
spin_unlock(&log->l_icloglock);
wake_up_var(&log->l_opstate);
if (IS_ENABLED(CONFIG_XFS_RT) && xfs_has_zoned(log->l_mp))
xfs_zoned_wake_all(log->l_mp);
return log_error;
}

View File

@@ -173,6 +173,10 @@ xfs_warn_experimental(
.opstate = XFS_OPSTATE_WARNED_METADIR,
.name = "metadata directory tree",
},
[XFS_EXPERIMENTAL_ZONED] = {
.opstate = XFS_OPSTATE_WARNED_ZONED,
.name = "zoned RT device",
},
};
ASSERT(feat >= 0 && feat < XFS_EXPERIMENTAL_MAX);
BUILD_BUG_ON(ARRAY_SIZE(features) != XFS_EXPERIMENTAL_MAX);

View File

@@ -99,6 +99,7 @@ enum xfs_experimental_feat {
XFS_EXPERIMENTAL_EXCHRANGE,
XFS_EXPERIMENTAL_PPTR,
XFS_EXPERIMENTAL_METADIR,
XFS_EXPERIMENTAL_ZONED,
XFS_EXPERIMENTAL_MAX,
};

View File

@@ -40,6 +40,7 @@
#include "xfs_rtrmap_btree.h"
#include "xfs_rtrefcount_btree.h"
#include "scrub/stats.h"
#include "xfs_zone_alloc.h"
static DEFINE_MUTEX(xfs_uuid_table_mutex);
static int xfs_uuid_table_size;
@@ -461,22 +462,38 @@ xfs_mount_reset_sbqflags(
return xfs_sync_sb(mp, false);
}
uint64_t
xfs_default_resblks(xfs_mount_t *mp)
{
uint64_t resblks;
static const char *const xfs_free_pool_name[] = {
[XC_FREE_BLOCKS] = "free blocks",
[XC_FREE_RTEXTENTS] = "free rt extents",
[XC_FREE_RTAVAILABLE] = "available rt extents",
};
/*
* We default to 5% or 8192 fsbs of space reserved, whichever is
* smaller. This is intended to cover concurrent allocation
* transactions when we initially hit enospc. These each require a 4
* block reservation. Hence by default we cover roughly 2000 concurrent
* allocation reservations.
*/
resblks = mp->m_sb.sb_dblocks;
do_div(resblks, 20);
resblks = min_t(uint64_t, resblks, 8192);
return resblks;
uint64_t
xfs_default_resblks(
struct xfs_mount *mp,
enum xfs_free_counter ctr)
{
switch (ctr) {
case XC_FREE_BLOCKS:
/*
* Default to 5% or 8192 FSBs of space reserved, whichever is
* smaller.
*
* This is intended to cover concurrent allocation transactions
* when we initially hit ENOSPC. These each require a 4 block
* reservation. Hence by default we cover roughly 2000
* concurrent allocation reservations.
*/
return min(div_u64(mp->m_sb.sb_dblocks, 20), 8192ULL);
case XC_FREE_RTEXTENTS:
case XC_FREE_RTAVAILABLE:
if (IS_ENABLED(CONFIG_XFS_RT) && xfs_has_zoned(mp))
return xfs_zoned_default_resblks(mp, ctr);
return 0;
default:
ASSERT(0);
return 0;
}
}
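As a quick sanity check of the XC_FREE_BLOCKS default above, here is a standalone userspace model of the 5%-capped-at-8192 calculation; plain C with illustrative names, not the kernel helper.

#include <stdint.h>
#include <stdio.h>

/* Model of "5% of data blocks, capped at 8192 FSBs". */
static uint64_t default_resblks_model(uint64_t dblocks)
{
	uint64_t five_percent = dblocks / 20;

	return five_percent < 8192 ? five_percent : 8192;
}

int main(void)
{
	/* A 1 TiB data device with 4 KiB blocks has 268435456 blocks. */
	printf("%llu\n",
	       (unsigned long long)default_resblks_model(268435456ULL));
	return 0;	/* prints 8192: 5% is far larger than the cap */
}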
/* Ensure the summary counts are correct. */
@@ -543,7 +560,7 @@ xfs_check_summary_counts(
* If we're mounting the rt volume after recovering the log, recompute
* frextents from the rtbitmap file to fix the inconsistency.
*/
if (xfs_has_realtime(mp) && !xfs_is_clean(mp)) {
if (xfs_has_realtime(mp) && !xfs_has_zoned(mp) && !xfs_is_clean(mp)) {
error = xfs_rtalloc_reinit_frextents(mp);
if (error)
return error;
@@ -678,6 +695,7 @@ xfs_mountfs(
uint quotamount = 0;
uint quotaflags = 0;
int error = 0;
int i;
xfs_sb_mount_common(mp, sbp);
@@ -747,27 +765,15 @@ xfs_mountfs(
/* enable fail_at_unmount as default */
mp->m_fail_unmount = true;
super_set_sysfs_name_id(mp->m_super);
error = xfs_sysfs_init(&mp->m_kobj, &xfs_mp_ktype,
NULL, mp->m_super->s_id);
if (error)
goto out;
error = xfs_sysfs_init(&mp->m_stats.xs_kobj, &xfs_stats_ktype,
&mp->m_kobj, "stats");
if (error)
goto out_remove_sysfs;
xchk_stats_register(mp->m_scrub_stats, mp->m_debugfs);
error = xfs_error_sysfs_init(mp);
error = xfs_mount_sysfs_init(mp);
if (error)
goto out_remove_scrub_stats;
xchk_stats_register(mp->m_scrub_stats, mp->m_debugfs);
error = xfs_errortag_init(mp);
if (error)
goto out_remove_error_sysfs;
goto out_remove_sysfs;
error = xfs_uuid_mount(mp);
if (error)
@@ -1031,6 +1037,12 @@ xfs_mountfs(
if (xfs_is_readonly(mp) && !xfs_has_norecovery(mp))
xfs_log_clean(mp);
if (xfs_has_zoned(mp)) {
error = xfs_mount_zones(mp);
if (error)
goto out_rtunmount;
}
/*
* Complete the quota initialisation, post-log-replay component.
*/
@@ -1046,22 +1058,28 @@ xfs_mountfs(
* privileged transactions. This is needed so that transaction
* space required for critical operations can dip into this pool
* when at ENOSPC. This is needed for operations like create with
* attr, unwritten extent conversion at ENOSPC, etc. Data allocations
* are not allowed to use this reserved space.
* attr, unwritten extent conversion at ENOSPC, garbage collection,
* etc. Data allocations are not allowed to use this reserved space.
*
* This may drive us straight to ENOSPC on mount, but that implies
* we were already there on the last unmount. Warn if this occurs.
*/
if (!xfs_is_readonly(mp)) {
error = xfs_reserve_blocks(mp, xfs_default_resblks(mp));
if (error)
xfs_warn(mp,
"Unable to allocate reserve blocks. Continuing without reserve pool.");
for (i = 0; i < XC_FREE_NR; i++) {
error = xfs_reserve_blocks(mp, i,
xfs_default_resblks(mp, i));
if (error)
xfs_warn(mp,
"Unable to allocate reserve blocks. Continuing without reserve pool for %s.",
xfs_free_pool_name[i]);
}
/* Reserve AG blocks for future btree expansion. */
error = xfs_fs_reserve_ag_blocks(mp);
if (error && error != -ENOSPC)
goto out_agresv;
xfs_zone_gc_start(mp);
}
return 0;
@@ -1069,6 +1087,8 @@ xfs_mountfs(
out_agresv:
xfs_fs_unreserve_ag_blocks(mp);
xfs_qm_unmount_quotas(mp);
if (xfs_has_zoned(mp))
xfs_unmount_zones(mp);
out_rtunmount:
xfs_rtunmount_inodes(mp);
out_rele_rip:
@@ -1116,13 +1136,10 @@ xfs_mountfs(
xfs_uuid_unmount(mp);
out_remove_errortag:
xfs_errortag_del(mp);
out_remove_error_sysfs:
xfs_error_sysfs_del(mp);
out_remove_sysfs:
xfs_mount_sysfs_del(mp);
out_remove_scrub_stats:
xchk_stats_unregister(mp->m_scrub_stats);
xfs_sysfs_del(&mp->m_stats.xs_kobj);
out_remove_sysfs:
xfs_sysfs_del(&mp->m_kobj);
out:
return error;
}
@@ -1148,8 +1165,12 @@ xfs_unmountfs(
xfs_inodegc_flush(mp);
xfs_blockgc_stop(mp);
if (!test_bit(XFS_OPSTATE_READONLY, &mp->m_opstate))
xfs_zone_gc_stop(mp);
xfs_fs_unreserve_ag_blocks(mp);
xfs_qm_unmount_quotas(mp);
if (xfs_has_zoned(mp))
xfs_unmount_zones(mp);
xfs_rtunmount_inodes(mp);
xfs_irele(mp->m_rootip);
if (mp->m_metadirip)
@@ -1173,7 +1194,7 @@ xfs_unmountfs(
* we only ever apply deltas to the superblock and hence the incore
* value does not matter....
*/
error = xfs_reserve_blocks(mp, 0);
error = xfs_reserve_blocks(mp, XC_FREE_BLOCKS, 0);
if (error)
xfs_warn(mp, "Unable to free reserved block pool. "
"Freespace may not be correct on next mount.");
@@ -1195,10 +1216,8 @@ xfs_unmountfs(
xfs_free_rtgroups(mp, 0, mp->m_sb.sb_rgcount);
xfs_free_perag_range(mp, 0, mp->m_sb.sb_agcount);
xfs_errortag_del(mp);
xfs_error_sysfs_del(mp);
xchk_stats_unregister(mp->m_scrub_stats);
xfs_sysfs_del(&mp->m_stats.xs_kobj);
xfs_sysfs_del(&mp->m_kobj);
xfs_mount_sysfs_del(mp);
}
/*
@@ -1220,52 +1239,67 @@ xfs_fs_writable(
return true;
}
/*
* Estimate the amount of free space that is not available to userspace and is
* not explicitly reserved from the incore fdblocks. This includes:
*
* - The minimum number of blocks needed to support splitting a bmap btree
* - The blocks currently in use by the freespace btrees because they record
* the actual blocks that will fill per-AG metadata space reservations
*/
uint64_t
xfs_freecounter_unavailable(
struct xfs_mount *mp,
enum xfs_free_counter ctr)
{
if (ctr != XC_FREE_BLOCKS)
return 0;
return mp->m_alloc_set_aside + atomic64_read(&mp->m_allocbt_blks);
}
void
xfs_add_freecounter(
struct xfs_mount *mp,
struct percpu_counter *counter,
enum xfs_free_counter ctr,
uint64_t delta)
{
bool has_resv_pool = (counter == &mp->m_fdblocks);
struct xfs_freecounter *counter = &mp->m_free[ctr];
uint64_t res_used;
/*
* If the reserve pool is depleted, put blocks back into it first.
* Most of the time the pool is full.
*/
if (!has_resv_pool || mp->m_resblks == mp->m_resblks_avail) {
percpu_counter_add(counter, delta);
if (likely(counter->res_avail == counter->res_total)) {
percpu_counter_add(&counter->count, delta);
return;
}
spin_lock(&mp->m_sb_lock);
res_used = mp->m_resblks - mp->m_resblks_avail;
res_used = counter->res_total - counter->res_avail;
if (res_used > delta) {
mp->m_resblks_avail += delta;
counter->res_avail += delta;
} else {
delta -= res_used;
mp->m_resblks_avail = mp->m_resblks;
percpu_counter_add(counter, delta);
counter->res_avail = counter->res_total;
percpu_counter_add(&counter->count, delta);
}
spin_unlock(&mp->m_sb_lock);
}
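A minimal userspace model of the refill order implemented above, assuming a single caller and no locking; field and function names are illustrative, not the kernel structures.

#include <stdint.h>

struct free_counter_model {
	uint64_t count;      /* blocks available for general use */
	uint64_t res_total;  /* size of the reserve pool */
	uint64_t res_avail;  /* how much of the pool is currently filled */
};

/* Returned blocks top up the reserve pool before the general counter. */
static void model_add_blocks(struct free_counter_model *c, uint64_t delta)
{
	uint64_t res_used = c->res_total - c->res_avail;

	if (res_used == 0) {
		/* Pool is full: everything goes to the general counter. */
		c->count += delta;
	} else if (res_used >= delta) {
		/* Pool absorbs the whole return. */
		c->res_avail += delta;
	} else {
		/* Top up the pool; the remainder becomes generally usable. */
		c->res_avail = c->res_total;
		c->count += delta - res_used;
	}
}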
/* Adjust in-core free blocks or RT extents. */
int
xfs_dec_freecounter(
struct xfs_mount *mp,
struct percpu_counter *counter,
enum xfs_free_counter ctr,
uint64_t delta,
bool rsvd)
{
int64_t lcounter;
uint64_t set_aside = 0;
struct xfs_freecounter *counter = &mp->m_free[ctr];
s32 batch;
bool has_resv_pool;
ASSERT(counter == &mp->m_fdblocks || counter == &mp->m_frextents);
has_resv_pool = (counter == &mp->m_fdblocks);
if (rsvd)
ASSERT(has_resv_pool);
ASSERT(ctr < XC_FREE_NR);
/*
* Taking blocks away, need to be more accurate the closer we
@@ -1275,7 +1309,7 @@ xfs_dec_freecounter(
* then make everything serialise as we are real close to
* ENOSPC.
*/
if (__percpu_counter_compare(counter, 2 * XFS_FDBLOCKS_BATCH,
if (__percpu_counter_compare(&counter->count, 2 * XFS_FDBLOCKS_BATCH,
XFS_FDBLOCKS_BATCH) < 0)
batch = 1;
else
@@ -1292,34 +1326,34 @@ xfs_dec_freecounter(
* problems (i.e. transaction abort, pagecache discards, etc.) than
* slightly premature -ENOSPC.
*/
if (has_resv_pool)
set_aside = xfs_fdblocks_unavailable(mp);
percpu_counter_add_batch(counter, -((int64_t)delta), batch);
if (__percpu_counter_compare(counter, set_aside,
XFS_FDBLOCKS_BATCH) >= 0) {
/* we had space! */
return 0;
}
/*
* lock up the sb for dipping into reserves before releasing the space
* that took us to ENOSPC.
*/
spin_lock(&mp->m_sb_lock);
percpu_counter_add(counter, delta);
if (!has_resv_pool || !rsvd)
goto fdblocks_enospc;
lcounter = (long long)mp->m_resblks_avail - delta;
if (lcounter >= 0) {
mp->m_resblks_avail = lcounter;
spin_unlock(&mp->m_sb_lock);
return 0;
}
xfs_warn_once(mp,
percpu_counter_add_batch(&counter->count, -((int64_t)delta), batch);
if (__percpu_counter_compare(&counter->count,
xfs_freecounter_unavailable(mp, ctr),
XFS_FDBLOCKS_BATCH) < 0) {
/*
* Lock up the sb for dipping into reserves before releasing the
* space that took us to ENOSPC.
*/
spin_lock(&mp->m_sb_lock);
percpu_counter_add(&counter->count, delta);
if (!rsvd)
goto fdblocks_enospc;
if (delta > counter->res_avail) {
if (ctr == XC_FREE_BLOCKS)
xfs_warn_once(mp,
"Reserve blocks depleted! Consider increasing reserve pool size.");
goto fdblocks_enospc;
}
counter->res_avail -= delta;
trace_xfs_freecounter_reserved(mp, ctr, delta, _RET_IP_);
spin_unlock(&mp->m_sb_lock);
}
/* we had space! */
return 0;
fdblocks_enospc:
trace_xfs_freecounter_enospc(mp, ctr, delta, _RET_IP_);
spin_unlock(&mp->m_sb_lock);
return -ENOSPC;
}
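And the matching sketch for the allocation side: subtract optimistically, undo if that drops below the set-aside, then dip into the reserve pool only when the caller is allowed to. This is a simplification; the kernel code additionally batches per-CPU updates and warns once on pool depletion.

#include <errno.h>
#include <stdbool.h>
#include <stdint.h>

struct counter_model {
	int64_t  count;        /* blocks available for general use */
	uint64_t res_avail;    /* filled portion of the reserve pool */
	uint64_t unavailable;  /* blocks that must always stay free */
};

static int model_dec_blocks(struct counter_model *c, uint64_t delta, bool rsvd)
{
	c->count -= (int64_t)delta;
	if (c->count >= (int64_t)c->unavailable)
		return 0;		/* we had space */

	c->count += (int64_t)delta;	/* undo the optimistic subtraction */
	if (rsvd && c->res_avail >= delta) {
		c->res_avail -= delta;	/* dip into the reserve pool */
		return 0;
	}
	return -ENOSPC;
}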

View File

@@ -97,12 +97,42 @@ struct xfs_groups {
*/
uint8_t blklog;
/*
* Zoned devices can have gaps between the usable capacity of a zone and
* the end of the zone in the LBA/daddr address space. In other words, the
* hardware equivalent to the RT groups already takes care of the power of 2
* alignment for us. In this case the sparse FSB/RTB address space maps
* 1:1 to the device address space.
*/
bool has_daddr_gaps;
/*
* Mask to extract the group-relative block number from a FSB.
* For a pre-rtgroups filesystem we pretend to have one very large
* rtgroup, so this mask must be 64-bit.
*/
uint64_t blkmask;
/*
* Start of the first group in the device. This is used to support a
* RT device following the data device on the same block device for
* SMR hard drives.
*/
xfs_fsblock_t start_fsb;
};
struct xfs_freecounter {
/* free blocks for general use: */
struct percpu_counter count;
/* total reserved blocks: */
uint64_t res_total;
/* available reserved blocks: */
uint64_t res_avail;
/* reserved blks @ remount,ro: */
uint64_t res_saved;
};
/*
@@ -198,6 +228,7 @@ typedef struct xfs_mount {
bool m_fail_unmount;
bool m_finobt_nores; /* no per-AG finobt resv. */
bool m_update_sb; /* sb needs update in mount */
unsigned int m_max_open_zones;
/*
* Bitsets of per-fs metadata that have been checked and/or are sick.
@@ -222,8 +253,8 @@ typedef struct xfs_mount {
spinlock_t ____cacheline_aligned m_sb_lock; /* sb counter lock */
struct percpu_counter m_icount; /* allocated inodes counter */
struct percpu_counter m_ifree; /* free inodes counter */
struct percpu_counter m_fdblocks; /* free block counter */
struct percpu_counter m_frextents; /* free rt extent counter */
struct xfs_freecounter m_free[XC_FREE_NR];
/*
* Count of data device blocks reserved for delayed allocations,
@@ -245,10 +276,8 @@ typedef struct xfs_mount {
atomic64_t m_allocbt_blks;
struct xfs_groups m_groups[XG_TYPE_MAX];
uint64_t m_resblks; /* total reserved blocks */
uint64_t m_resblks_avail;/* available reserved blocks */
uint64_t m_resblks_save; /* reserved blks @ remount,ro */
struct delayed_work m_reclaim_work; /* background inode reclaim */
struct xfs_zone_info *m_zone_info; /* zone allocator information */
struct dentry *m_debugfs; /* debugfs parent */
struct xfs_kobj m_kobj;
struct xfs_kobj m_error_kobj;
@@ -258,10 +287,16 @@ typedef struct xfs_mount {
#ifdef CONFIG_XFS_ONLINE_SCRUB_STATS
struct xchk_stats *m_scrub_stats;
#endif
struct xfs_kobj m_zoned_kobj;
xfs_agnumber_t m_agfrotor; /* last ag where space found */
atomic_t m_agirotor; /* last ag dir inode alloced */
atomic_t m_rtgrotor; /* last rtgroup rtpicked */
struct mutex m_metafile_resv_lock;
uint64_t m_metafile_resv_target;
uint64_t m_metafile_resv_used;
uint64_t m_metafile_resv_avail;
/* Memory shrinker to throttle and reprioritize inodegc */
struct shrinker *m_inodegc_shrinker;
/*
@@ -336,8 +371,10 @@ typedef struct xfs_mount {
#define XFS_FEAT_NREXT64 (1ULL << 26) /* large extent counters */
#define XFS_FEAT_EXCHANGE_RANGE (1ULL << 27) /* exchange range */
#define XFS_FEAT_METADIR (1ULL << 28) /* metadata directory tree */
#define XFS_FEAT_ZONED (1ULL << 29) /* zoned RT device */
/* Mount features */
#define XFS_FEAT_NOLIFETIME (1ULL << 47) /* disable lifetime hints */
#define XFS_FEAT_NOATTR2 (1ULL << 48) /* disable attr2 creation */
#define XFS_FEAT_NOALIGN (1ULL << 49) /* ignore alignment */
#define XFS_FEAT_ALLOCSIZE (1ULL << 50) /* user specified allocation size */
@@ -392,6 +429,8 @@ __XFS_HAS_FEAT(needsrepair, NEEDSREPAIR)
__XFS_HAS_FEAT(large_extent_counts, NREXT64)
__XFS_HAS_FEAT(exchange_range, EXCHANGE_RANGE)
__XFS_HAS_FEAT(metadir, METADIR)
__XFS_HAS_FEAT(zoned, ZONED)
__XFS_HAS_FEAT(nolifetime, NOLIFETIME)
static inline bool xfs_has_rtgroups(const struct xfs_mount *mp)
{
@@ -402,7 +441,9 @@ static inline bool xfs_has_rtgroups(const struct xfs_mount *mp)
static inline bool xfs_has_rtsb(const struct xfs_mount *mp)
{
/* all rtgroups filesystems with an rt section have an rtsb */
return xfs_has_rtgroups(mp) && xfs_has_realtime(mp);
return xfs_has_rtgroups(mp) &&
xfs_has_realtime(mp) &&
!xfs_has_zoned(mp);
}
static inline bool xfs_has_rtrmapbt(const struct xfs_mount *mp)
@@ -417,6 +458,11 @@ static inline bool xfs_has_rtreflink(const struct xfs_mount *mp)
xfs_has_reflink(mp);
}
static inline bool xfs_has_nonzoned(const struct xfs_mount *mp)
{
return !xfs_has_zoned(mp);
}
/*
* Some features are always on for v5 file systems, allow the compiler to
* eliminate dead code when building without v4 support.
@@ -520,6 +566,10 @@ __XFS_HAS_FEAT(nouuid, NOUUID)
#define XFS_OPSTATE_WARNED_METADIR 17
/* Filesystem should use qflags to determine quotaon status */
#define XFS_OPSTATE_RESUMING_QUOTAON 18
/* Kernel has logged a warning about zoned RT device being used on this fs. */
#define XFS_OPSTATE_WARNED_ZONED 19
/* (Zoned) GC is in progress */
#define XFS_OPSTATE_ZONEGC_RUNNING 20
#define __XFS_IS_OPSTATE(name, NAME) \
static inline bool xfs_is_ ## name (struct xfs_mount *mp) \
@@ -564,6 +614,7 @@ static inline bool xfs_clear_resuming_quotaon(struct xfs_mount *mp)
#endif /* CONFIG_XFS_QUOTA */
__XFS_IS_OPSTATE(done_with_log_incompat, UNSET_LOG_INCOMPAT)
__XFS_IS_OPSTATE(using_logged_xattrs, USE_LARP)
__XFS_IS_OPSTATE(zonegc_running, ZONEGC_RUNNING)
static inline bool
xfs_should_warn(struct xfs_mount *mp, long nr)
@@ -633,7 +684,8 @@ xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d)
}
extern void xfs_uuid_table_free(void);
extern uint64_t xfs_default_resblks(xfs_mount_t *mp);
uint64_t xfs_default_resblks(struct xfs_mount *mp,
enum xfs_free_counter ctr);
extern int xfs_mountfs(xfs_mount_t *mp);
extern void xfs_unmountfs(xfs_mount_t *);
@@ -646,45 +698,74 @@ extern void xfs_unmountfs(xfs_mount_t *);
*/
#define XFS_FDBLOCKS_BATCH 1024
uint64_t xfs_freecounter_unavailable(struct xfs_mount *mp,
enum xfs_free_counter ctr);
/*
* Estimate the amount of free space that is not available to userspace and is
* not explicitly reserved from the incore fdblocks. This includes:
*
* - The minimum number of blocks needed to support splitting a bmap btree
* - The blocks currently in use by the freespace btrees because they record
* the actual blocks that will fill per-AG metadata space reservations
* Sum up the freecounter, but never return negative values.
*/
static inline uint64_t
xfs_fdblocks_unavailable(
struct xfs_mount *mp)
static inline s64 xfs_sum_freecounter(struct xfs_mount *mp,
enum xfs_free_counter ctr)
{
return mp->m_alloc_set_aside + atomic64_read(&mp->m_allocbt_blks);
return percpu_counter_sum_positive(&mp->m_free[ctr].count);
}
int xfs_dec_freecounter(struct xfs_mount *mp, struct percpu_counter *counter,
/*
* Same as above, but does return negative values. Mostly useful for
* special cases like repair and tracing.
*/
static inline s64 xfs_sum_freecounter_raw(struct xfs_mount *mp,
enum xfs_free_counter ctr)
{
return percpu_counter_sum(&mp->m_free[ctr].count);
}
/*
* This just provides an estimate without the cpu-local updates; use
* xfs_sum_freecounter for the exact value.
*/
static inline s64 xfs_estimate_freecounter(struct xfs_mount *mp,
enum xfs_free_counter ctr)
{
return percpu_counter_read_positive(&mp->m_free[ctr].count);
}
static inline int xfs_compare_freecounter(struct xfs_mount *mp,
enum xfs_free_counter ctr, s64 rhs, s32 batch)
{
return __percpu_counter_compare(&mp->m_free[ctr].count, rhs, batch);
}
static inline void xfs_set_freecounter(struct xfs_mount *mp,
enum xfs_free_counter ctr, uint64_t val)
{
percpu_counter_set(&mp->m_free[ctr].count, val);
}
int xfs_dec_freecounter(struct xfs_mount *mp, enum xfs_free_counter ctr,
uint64_t delta, bool rsvd);
void xfs_add_freecounter(struct xfs_mount *mp, struct percpu_counter *counter,
void xfs_add_freecounter(struct xfs_mount *mp, enum xfs_free_counter ctr,
uint64_t delta);
static inline int xfs_dec_fdblocks(struct xfs_mount *mp, uint64_t delta,
bool reserved)
{
return xfs_dec_freecounter(mp, &mp->m_fdblocks, delta, reserved);
return xfs_dec_freecounter(mp, XC_FREE_BLOCKS, delta, reserved);
}
static inline void xfs_add_fdblocks(struct xfs_mount *mp, uint64_t delta)
{
xfs_add_freecounter(mp, &mp->m_fdblocks, delta);
xfs_add_freecounter(mp, XC_FREE_BLOCKS, delta);
}
static inline int xfs_dec_frextents(struct xfs_mount *mp, uint64_t delta)
{
return xfs_dec_freecounter(mp, &mp->m_frextents, delta, false);
return xfs_dec_freecounter(mp, XC_FREE_RTEXTENTS, delta, false);
}
static inline void xfs_add_frextents(struct xfs_mount *mp, uint64_t delta)
{
xfs_add_freecounter(mp, &mp->m_frextents, delta);
xfs_add_freecounter(mp, XC_FREE_RTEXTENTS, delta);
}
extern int xfs_readsb(xfs_mount_t *, int);
@@ -706,5 +787,9 @@ int xfs_add_incompat_log_feature(struct xfs_mount *mp, uint32_t feature);
bool xfs_clear_incompat_log_features(struct xfs_mount *mp);
void xfs_mod_delalloc(struct xfs_inode *ip, int64_t data_delta,
int64_t ind_delta);
static inline void xfs_mod_sb_delalloc(struct xfs_mount *mp, int64_t delta)
{
percpu_counter_add(&mp->m_delalloc_blks, delta);
}
#endif /* __XFS_MOUNT_H__ */

View File

@@ -1711,7 +1711,8 @@ xfs_qm_mount_quotas(
* immediately. We only support rtquota if rtgroups are enabled to
* avoid problems with older kernels.
*/
if (mp->m_sb.sb_rextents && !xfs_has_rtgroups(mp)) {
if (mp->m_sb.sb_rextents &&
(!xfs_has_rtgroups(mp) || xfs_has_zoned(mp))) {
xfs_notice(mp, "Cannot turn on quotas for realtime filesystem");
mp->m_qflags = 0;
goto write_changes;

View File

@@ -235,7 +235,7 @@ xfs_reflink_trim_around_shared(
int error = 0;
/* Holes, unwritten, and delalloc extents cannot be shared */
if (!xfs_is_cow_inode(ip) || !xfs_bmap_is_written_extent(irec)) {
if (!xfs_is_reflink_inode(ip) || !xfs_bmap_is_written_extent(irec)) {
*shared = false;
return 0;
}
@@ -651,7 +651,7 @@ xfs_reflink_cancel_cow_blocks(
if (isnullstartblock(del.br_startblock)) {
xfs_bmap_del_extent_delay(ip, XFS_COW_FORK, &icur, &got,
&del);
&del, 0);
} else if (del.br_state == XFS_EXT_UNWRITTEN || cancel_real) {
ASSERT((*tpp)->t_highest_agno == NULLAGNUMBER);
@@ -1207,15 +1207,9 @@ xfs_reflink_ag_has_free_space(
if (!xfs_has_rmapbt(mp))
return 0;
if (XFS_IS_REALTIME_INODE(ip)) {
struct xfs_rtgroup *rtg;
xfs_rgnumber_t rgno;
rgno = xfs_rtb_to_rgno(mp, fsb);
rtg = xfs_rtgroup_get(mp, rgno);
if (xfs_metafile_resv_critical(rtg_rmap(rtg)))
error = -ENOSPC;
xfs_rtgroup_put(rtg);
return error;
if (xfs_metafile_resv_critical(mp))
return -ENOSPC;
return 0;
}
agno = XFS_FSB_TO_AGNO(mp, fsb);
@@ -1538,7 +1532,7 @@ xfs_reflink_zero_posteof(
return 0;
trace_xfs_zero_eof(ip, isize, pos - isize);
return xfs_zero_range(ip, isize, pos - isize, NULL);
return xfs_zero_range(ip, isize, pos - isize, NULL, NULL);
}
/*

View File

@@ -33,6 +33,7 @@
#include "xfs_trace.h"
#include "xfs_rtrefcount_btree.h"
#include "xfs_reflink.h"
#include "xfs_zone_alloc.h"
/*
* Return whether there are any free extents in the size range given
@@ -663,7 +664,8 @@ xfs_rtunmount_rtg(
for (i = 0; i < XFS_RTGI_MAX; i++)
xfs_rtginode_irele(&rtg->rtg_inodes[i]);
kvfree(rtg->rtg_rsum_cache);
if (!xfs_has_zoned(rtg_mount(rtg)))
kvfree(rtg->rtg_rsum_cache);
}
static int
@@ -858,6 +860,84 @@ xfs_growfs_rt_init_rtsb(
return error;
}
static void
xfs_growfs_rt_sb_fields(
struct xfs_trans *tp,
const struct xfs_mount *nmp)
{
struct xfs_mount *mp = tp->t_mountp;
if (nmp->m_sb.sb_rextsize != mp->m_sb.sb_rextsize)
xfs_trans_mod_sb(tp, XFS_TRANS_SB_REXTSIZE,
nmp->m_sb.sb_rextsize - mp->m_sb.sb_rextsize);
if (nmp->m_sb.sb_rbmblocks != mp->m_sb.sb_rbmblocks)
xfs_trans_mod_sb(tp, XFS_TRANS_SB_RBMBLOCKS,
nmp->m_sb.sb_rbmblocks - mp->m_sb.sb_rbmblocks);
if (nmp->m_sb.sb_rblocks != mp->m_sb.sb_rblocks)
xfs_trans_mod_sb(tp, XFS_TRANS_SB_RBLOCKS,
nmp->m_sb.sb_rblocks - mp->m_sb.sb_rblocks);
if (nmp->m_sb.sb_rextents != mp->m_sb.sb_rextents)
xfs_trans_mod_sb(tp, XFS_TRANS_SB_REXTENTS,
nmp->m_sb.sb_rextents - mp->m_sb.sb_rextents);
if (nmp->m_sb.sb_rextslog != mp->m_sb.sb_rextslog)
xfs_trans_mod_sb(tp, XFS_TRANS_SB_REXTSLOG,
nmp->m_sb.sb_rextslog - mp->m_sb.sb_rextslog);
if (nmp->m_sb.sb_rgcount != mp->m_sb.sb_rgcount)
xfs_trans_mod_sb(tp, XFS_TRANS_SB_RGCOUNT,
nmp->m_sb.sb_rgcount - mp->m_sb.sb_rgcount);
}
static int
xfs_growfs_rt_zoned(
struct xfs_rtgroup *rtg,
xfs_rfsblock_t nrblocks)
{
struct xfs_mount *mp = rtg_mount(rtg);
struct xfs_mount *nmp;
struct xfs_trans *tp;
xfs_rtbxlen_t freed_rtx;
int error;
/*
* Calculate new sb and mount fields for this round. Also ensure the
* rtg_extents value is uptodate as the rtbitmap code relies on it.
*/
nmp = xfs_growfs_rt_alloc_fake_mount(mp, nrblocks,
mp->m_sb.sb_rextsize);
if (!nmp)
return -ENOMEM;
freed_rtx = nmp->m_sb.sb_rextents - mp->m_sb.sb_rextents;
xfs_rtgroup_calc_geometry(nmp, rtg, rtg_rgno(rtg),
nmp->m_sb.sb_rgcount, nmp->m_sb.sb_rextents);
error = xfs_trans_alloc(mp, &M_RES(nmp)->tr_growrtfree, 0, 0, 0, &tp);
if (error)
goto out_free;
xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
xfs_rtgroup_trans_join(tp, rtg, XFS_RTGLOCK_RMAP);
xfs_growfs_rt_sb_fields(tp, nmp);
xfs_trans_mod_sb(tp, XFS_TRANS_SB_FREXTENTS, freed_rtx);
error = xfs_trans_commit(tp);
if (error)
goto out_free;
/*
* Ensure the mount RT feature flag is now set, and compute new
* maxlevels for rt btrees.
*/
mp->m_features |= XFS_FEAT_REALTIME;
xfs_rtrmapbt_compute_maxlevels(mp);
xfs_rtrefcountbt_compute_maxlevels(mp);
xfs_zoned_add_available(mp, freed_rtx);
out_free:
kfree(nmp);
return error;
}
static int
xfs_growfs_rt_bmblock(
struct xfs_rtgroup *rtg,
@@ -943,24 +1023,7 @@ xfs_growfs_rt_bmblock(
/*
* Update superblock fields.
*/
if (nmp->m_sb.sb_rextsize != mp->m_sb.sb_rextsize)
xfs_trans_mod_sb(args.tp, XFS_TRANS_SB_REXTSIZE,
nmp->m_sb.sb_rextsize - mp->m_sb.sb_rextsize);
if (nmp->m_sb.sb_rbmblocks != mp->m_sb.sb_rbmblocks)
xfs_trans_mod_sb(args.tp, XFS_TRANS_SB_RBMBLOCKS,
nmp->m_sb.sb_rbmblocks - mp->m_sb.sb_rbmblocks);
if (nmp->m_sb.sb_rblocks != mp->m_sb.sb_rblocks)
xfs_trans_mod_sb(args.tp, XFS_TRANS_SB_RBLOCKS,
nmp->m_sb.sb_rblocks - mp->m_sb.sb_rblocks);
if (nmp->m_sb.sb_rextents != mp->m_sb.sb_rextents)
xfs_trans_mod_sb(args.tp, XFS_TRANS_SB_REXTENTS,
nmp->m_sb.sb_rextents - mp->m_sb.sb_rextents);
if (nmp->m_sb.sb_rextslog != mp->m_sb.sb_rextslog)
xfs_trans_mod_sb(args.tp, XFS_TRANS_SB_REXTSLOG,
nmp->m_sb.sb_rextslog - mp->m_sb.sb_rextslog);
if (nmp->m_sb.sb_rgcount != mp->m_sb.sb_rgcount)
xfs_trans_mod_sb(args.tp, XFS_TRANS_SB_RGCOUNT,
nmp->m_sb.sb_rgcount - mp->m_sb.sb_rgcount);
xfs_growfs_rt_sb_fields(args.tp, nmp);
/*
* Free the new extent.
@@ -1127,6 +1190,11 @@ xfs_growfs_rtg(
goto out_rele;
}
if (xfs_has_zoned(mp)) {
error = xfs_growfs_rt_zoned(rtg, nrblocks);
goto out_rele;
}
error = xfs_growfs_rt_alloc_blocks(rtg, nrblocks, rextsize, &bmblocks);
if (error)
goto out_rele;
@@ -1146,8 +1214,7 @@ xfs_growfs_rtg(
if (old_rsum_cache)
kvfree(old_rsum_cache);
xfs_rtgroup_rele(rtg);
return 0;
goto out_rele;
out_error:
/*
@@ -1195,6 +1262,22 @@ xfs_growfs_check_rtgeom(
if (min_logfsbs > mp->m_sb.sb_logblocks)
return -EINVAL;
if (xfs_has_zoned(mp)) {
uint32_t gblocks = mp->m_groups[XG_TYPE_RTG].blocks;
uint32_t rem;
if (rextsize != 1)
return -EINVAL;
div_u64_rem(mp->m_sb.sb_rblocks, gblocks, &rem);
if (rem) {
xfs_warn(mp,
"new RT volume size (%lld) not aligned to RT group size (%d)",
mp->m_sb.sb_rblocks, gblocks);
return -EINVAL;
}
}
return 0;
}
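The zoned growfs constraint above reduces to an exact-multiple check: the new RT volume size must be a whole number of RT groups (zones). A trivial standalone helper for illustration, not the kernel code:

#include <stdbool.h>
#include <stdint.h>

static bool rt_size_aligned_to_group(uint64_t rblocks, uint32_t group_blocks)
{
	/* Reject a zero group size and any remainder after division. */
	return group_blocks != 0 && rblocks % group_blocks == 0;
}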
@@ -1248,6 +1331,35 @@ xfs_grow_last_rtg(
mp->m_sb.sb_rgextents;
}
/*
* Read in the last block of the RT device to make sure it is accessible.
*/
static int
xfs_rt_check_size(
struct xfs_mount *mp,
xfs_rfsblock_t last_block)
{
xfs_daddr_t daddr = XFS_FSB_TO_BB(mp, last_block);
struct xfs_buf *bp;
int error;
if (XFS_BB_TO_FSB(mp, daddr) != last_block) {
xfs_warn(mp, "RT device size overflow: %llu != %llu",
XFS_BB_TO_FSB(mp, daddr), last_block);
return -EFBIG;
}
error = xfs_buf_read_uncached(mp->m_rtdev_targp,
XFS_FSB_TO_BB(mp, mp->m_sb.sb_rtstart) + daddr,
XFS_FSB_TO_BB(mp, 1), 0, &bp, NULL);
if (error)
xfs_warn(mp, "cannot read last RT device sector (%lld)",
last_block);
else
xfs_buf_relse(bp);
return error;
}
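The size-overflow test above relies on a lossy round trip between filesystem blocks and 512-byte sectors. The sketch below illustrates the same idea in a standalone program; the 32-bit sector type is only there to make the truncation easy to demonstrate and is not the kernel's xfs_daddr_t, and blocklog stands in for the log2 block size.

#include <stdbool.h>
#include <stdint.h>

static bool last_block_addressable(uint64_t last_block, unsigned int blocklog)
{
	const unsigned int sectlog = 9;		/* 512-byte sectors */
	uint32_t daddr = (uint32_t)(last_block << (blocklog - sectlog));

	/* If converting back does not reproduce the block, it overflowed. */
	return ((uint64_t)daddr >> (blocklog - sectlog)) == last_block;
}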
/*
* Grow the realtime area of the filesystem.
*/
@@ -1259,7 +1371,6 @@ xfs_growfs_rt(
xfs_rgnumber_t old_rgcount = mp->m_sb.sb_rgcount;
xfs_rgnumber_t new_rgcount = 1;
xfs_rgnumber_t rgno;
struct xfs_buf *bp;
xfs_agblock_t old_rextsize = mp->m_sb.sb_rextsize;
int error;
@@ -1302,15 +1413,10 @@ xfs_growfs_rt(
error = xfs_sb_validate_fsb_count(&mp->m_sb, in->newblocks);
if (error)
goto out_unlock;
/*
* Read in the last block of the device, make sure it exists.
*/
error = xfs_buf_read_uncached(mp->m_rtdev_targp,
XFS_FSB_TO_BB(mp, in->newblocks - 1),
XFS_FSB_TO_BB(mp, 1), 0, &bp, NULL);
error = xfs_rt_check_size(mp, in->newblocks - 1);
if (error)
goto out_unlock;
xfs_buf_relse(bp);
/*
* Calculate new parameters. These are the final values to be reached.
@@ -1376,8 +1482,7 @@ xfs_growfs_rt(
error = error2;
/* Reset the rt metadata btree space reservations. */
xfs_rt_resv_free(mp);
error2 = xfs_rt_resv_init(mp);
error2 = xfs_metafile_resv_init(mp);
if (error2 && error2 != -ENOSPC)
error = error2;
}
@@ -1444,10 +1549,6 @@ int /* error */
xfs_rtmount_init(
struct xfs_mount *mp) /* file system mount structure */
{
struct xfs_buf *bp; /* buffer for last block of subvolume */
xfs_daddr_t d; /* address of last block of subvolume */
int error;
if (mp->m_sb.sb_rblocks == 0)
return 0;
if (mp->m_rtdev_targp == NULL) {
@@ -1458,25 +1559,7 @@ xfs_rtmount_init(
mp->m_rsumblocks = xfs_rtsummary_blockcount(mp, &mp->m_rsumlevels);
/*
* Check that the realtime section is an ok size.
*/
d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks);
if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_rblocks) {
xfs_warn(mp, "realtime mount -- %llu != %llu",
(unsigned long long) XFS_BB_TO_FSB(mp, d),
(unsigned long long) mp->m_sb.sb_rblocks);
return -EFBIG;
}
error = xfs_buf_read_uncached(mp->m_rtdev_targp,
d - XFS_FSB_TO_BB(mp, 1),
XFS_FSB_TO_BB(mp, 1), 0, &bp, NULL);
if (error) {
xfs_warn(mp, "realtime device size check failed");
return error;
}
xfs_buf_relse(bp);
return 0;
return xfs_rt_check_size(mp, mp->m_sb.sb_rblocks - 1);
}
static int
@@ -1519,50 +1602,10 @@ xfs_rtalloc_reinit_frextents(
spin_lock(&mp->m_sb_lock);
mp->m_sb.sb_frextents = val;
spin_unlock(&mp->m_sb_lock);
percpu_counter_set(&mp->m_frextents, mp->m_sb.sb_frextents);
xfs_set_freecounter(mp, XC_FREE_RTEXTENTS, mp->m_sb.sb_frextents);
return 0;
}
/* Free space reservations for rt metadata inodes. */
void
xfs_rt_resv_free(
struct xfs_mount *mp)
{
struct xfs_rtgroup *rtg = NULL;
unsigned int i;
while ((rtg = xfs_rtgroup_next(mp, rtg))) {
for (i = 0; i < XFS_RTGI_MAX; i++)
xfs_metafile_resv_free(rtg->rtg_inodes[i]);
}
}
/* Reserve space for rt metadata inodes' space expansion. */
int
xfs_rt_resv_init(
struct xfs_mount *mp)
{
struct xfs_rtgroup *rtg = NULL;
xfs_filblks_t ask;
int error = 0;
while ((rtg = xfs_rtgroup_next(mp, rtg))) {
int err2;
ask = xfs_rtrmapbt_calc_reserves(mp);
err2 = xfs_metafile_resv_init(rtg_rmap(rtg), ask);
if (err2 && !error)
error = err2;
ask = xfs_rtrefcountbt_calc_reserves(mp);
err2 = xfs_metafile_resv_init(rtg_refcount(rtg), ask);
if (err2 && !error)
error = err2;
}
return error;
}
/*
* Read in the bmbt of an rt metadata inode so that we never have to load them
* at runtime. This enables the use of shared ILOCKs for rtbitmap scans. Use
@@ -1613,6 +1656,8 @@ xfs_rtmount_rtg(
}
}
if (xfs_has_zoned(mp))
return 0;
return xfs_alloc_rsum_cache(rtg, mp->m_sb.sb_rbmblocks);
}
@@ -2097,6 +2142,8 @@ xfs_bmap_rtalloc(
ap->datatype & XFS_ALLOC_INITIAL_USER_DATA;
int error;
ASSERT(!xfs_has_zoned(ap->tp->t_mountp));
retry:
error = xfs_rtallocate_align(ap, &ralen, &raminlen, &prod, &noalign);
if (error)

View File

@@ -34,9 +34,6 @@ int /* error */
xfs_rtmount_inodes(
struct xfs_mount *mp); /* file system mount structure */
void xfs_rt_resv_free(struct xfs_mount *mp);
int xfs_rt_resv_init(struct xfs_mount *mp);
/*
* Grow the realtime area of the filesystem.
*/
@@ -65,8 +62,6 @@ xfs_rtmount_init(
}
# define xfs_rtmount_inodes(m) (((mp)->m_sb.sb_rblocks == 0)? 0 : (-ENOSYS))
# define xfs_rtunmount_inodes(m)
# define xfs_rt_resv_free(mp) ((void)0)
# define xfs_rt_resv_init(mp) (0)
static inline int
xfs_growfs_check_rtgeom(const struct xfs_mount *mp,

View File

@@ -46,6 +46,7 @@
#include "xfs_exchmaps_item.h"
#include "xfs_parent.h"
#include "xfs_rtalloc.h"
#include "xfs_zone_alloc.h"
#include "scrub/stats.h"
#include "scrub/rcbag_btree.h"
@@ -109,7 +110,8 @@ enum {
Opt_filestreams, Opt_quota, Opt_noquota, Opt_usrquota, Opt_grpquota,
Opt_prjquota, Opt_uquota, Opt_gquota, Opt_pquota,
Opt_uqnoenforce, Opt_gqnoenforce, Opt_pqnoenforce, Opt_qnoenforce,
Opt_discard, Opt_nodiscard, Opt_dax, Opt_dax_enum,
Opt_discard, Opt_nodiscard, Opt_dax, Opt_dax_enum, Opt_max_open_zones,
Opt_lifetime, Opt_nolifetime,
};
static const struct fs_parameter_spec xfs_fs_parameters[] = {
@@ -154,6 +156,9 @@ static const struct fs_parameter_spec xfs_fs_parameters[] = {
fsparam_flag("nodiscard", Opt_nodiscard),
fsparam_flag("dax", Opt_dax),
fsparam_enum("dax", Opt_dax_enum, dax_param_enums),
fsparam_u32("max_open_zones", Opt_max_open_zones),
fsparam_flag("lifetime", Opt_lifetime),
fsparam_flag("nolifetime", Opt_nolifetime),
{}
};
@@ -182,6 +187,7 @@ xfs_fs_show_options(
{ XFS_FEAT_LARGE_IOSIZE, ",largeio" },
{ XFS_FEAT_DAX_ALWAYS, ",dax=always" },
{ XFS_FEAT_DAX_NEVER, ",dax=never" },
{ XFS_FEAT_NOLIFETIME, ",nolifetime" },
{ 0, NULL }
};
struct xfs_mount *mp = XFS_M(root->d_sb);
@@ -233,6 +239,9 @@ xfs_fs_show_options(
if (!(mp->m_qflags & XFS_ALL_QUOTA_ACCT))
seq_puts(m, ",noquota");
if (mp->m_max_open_zones)
seq_printf(m, ",max_open_zones=%u", mp->m_max_open_zones);
return 0;
}
@@ -533,7 +542,15 @@ xfs_setup_devices(
if (error)
return error;
}
if (mp->m_rtdev_targp) {
if (mp->m_sb.sb_rtstart) {
if (mp->m_rtdev_targp) {
xfs_warn(mp,
"can't use internal and external rtdev at the same time");
return -EINVAL;
}
mp->m_rtdev_targp = mp->m_ddev_targp;
} else if (mp->m_rtname) {
error = xfs_setsize_buftarg(mp->m_rtdev_targp,
mp->m_sb.sb_sectsize);
if (error)
@@ -757,7 +774,7 @@ xfs_mount_free(
{
if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp)
xfs_free_buftarg(mp->m_logdev_targp);
if (mp->m_rtdev_targp)
if (mp->m_rtdev_targp && mp->m_rtdev_targp != mp->m_ddev_targp)
xfs_free_buftarg(mp->m_rtdev_targp);
if (mp->m_ddev_targp)
xfs_free_buftarg(mp->m_ddev_targp);
@@ -814,6 +831,7 @@ xfs_fs_sync_fs(
if (sb->s_writers.frozen == SB_FREEZE_PAGEFAULT) {
xfs_inodegc_stop(mp);
xfs_blockgc_stop(mp);
xfs_zone_gc_stop(mp);
}
return 0;
@@ -834,10 +852,12 @@ xfs_statfs_data(
struct kstatfs *st)
{
int64_t fdblocks =
percpu_counter_sum(&mp->m_fdblocks);
xfs_sum_freecounter(mp, XC_FREE_BLOCKS);
/* make sure st->f_bfree does not underflow */
st->f_bfree = max(0LL, fdblocks - xfs_fdblocks_unavailable(mp));
st->f_bfree = max(0LL,
fdblocks - xfs_freecounter_unavailable(mp, XC_FREE_BLOCKS));
/*
* sb_dblocks can change during growfs, but nothing cares whether the old
* or the new value is reported while the grow is in progress.
@@ -856,8 +876,9 @@ xfs_statfs_rt(
struct kstatfs *st)
{
st->f_bfree = xfs_rtbxlen_to_blen(mp,
percpu_counter_sum_positive(&mp->m_frextents));
st->f_blocks = mp->m_sb.sb_rblocks;
xfs_sum_freecounter(mp, XC_FREE_RTEXTENTS));
st->f_blocks = mp->m_sb.sb_rblocks - xfs_rtbxlen_to_blen(mp,
mp->m_free[XC_FREE_RTEXTENTS].res_total);
}
static void
@@ -922,24 +943,32 @@ xfs_fs_statfs(
}
STATIC void
xfs_save_resvblks(struct xfs_mount *mp)
xfs_save_resvblks(
struct xfs_mount *mp)
{
mp->m_resblks_save = mp->m_resblks;
xfs_reserve_blocks(mp, 0);
enum xfs_free_counter i;
for (i = 0; i < XC_FREE_NR; i++) {
mp->m_free[i].res_saved = mp->m_free[i].res_total;
xfs_reserve_blocks(mp, i, 0);
}
}
STATIC void
xfs_restore_resvblks(struct xfs_mount *mp)
xfs_restore_resvblks(
struct xfs_mount *mp)
{
uint64_t resblks;
uint64_t resblks;
enum xfs_free_counter i;
if (mp->m_resblks_save) {
resblks = mp->m_resblks_save;
mp->m_resblks_save = 0;
} else
resblks = xfs_default_resblks(mp);
xfs_reserve_blocks(mp, resblks);
for (i = 0; i < XC_FREE_NR; i++) {
if (mp->m_free[i].res_saved) {
resblks = mp->m_free[i].res_saved;
mp->m_free[i].res_saved = 0;
} else
resblks = xfs_default_resblks(mp, i);
xfs_reserve_blocks(mp, i, resblks);
}
}
/*
@@ -976,6 +1005,7 @@ xfs_fs_freeze(
if (ret && !xfs_is_readonly(mp)) {
xfs_blockgc_start(mp);
xfs_inodegc_start(mp);
xfs_zone_gc_start(mp);
}
return ret;
@@ -997,6 +1027,7 @@ xfs_fs_unfreeze(
* filesystem.
*/
if (!xfs_is_readonly(mp)) {
xfs_zone_gc_start(mp);
xfs_blockgc_start(mp);
xfs_inodegc_start(mp);
}
@@ -1058,6 +1089,19 @@ xfs_finish_flags(
return -EINVAL;
}
if (!xfs_has_zoned(mp)) {
if (mp->m_max_open_zones) {
xfs_warn(mp,
"max_open_zones mount option only supported on zoned file systems.");
return -EINVAL;
}
if (mp->m_features & XFS_FEAT_NOLIFETIME) {
xfs_warn(mp,
"nolifetime mount option only supported on zoned file systems.");
return -EINVAL;
}
}
return 0;
}
@@ -1065,7 +1109,8 @@ static int
xfs_init_percpu_counters(
struct xfs_mount *mp)
{
int error;
int error;
int i;
error = percpu_counter_init(&mp->m_icount, 0, GFP_KERNEL);
if (error)
@@ -1075,30 +1120,29 @@ xfs_init_percpu_counters(
if (error)
goto free_icount;
error = percpu_counter_init(&mp->m_fdblocks, 0, GFP_KERNEL);
if (error)
goto free_ifree;
error = percpu_counter_init(&mp->m_delalloc_blks, 0, GFP_KERNEL);
if (error)
goto free_fdblocks;
goto free_ifree;
error = percpu_counter_init(&mp->m_delalloc_rtextents, 0, GFP_KERNEL);
if (error)
goto free_delalloc;
error = percpu_counter_init(&mp->m_frextents, 0, GFP_KERNEL);
if (error)
goto free_delalloc_rt;
for (i = 0; i < XC_FREE_NR; i++) {
error = percpu_counter_init(&mp->m_free[i].count, 0,
GFP_KERNEL);
if (error)
goto free_freecounters;
}
return 0;
free_delalloc_rt:
free_freecounters:
while (--i > 0)
percpu_counter_destroy(&mp->m_free[i].count);
percpu_counter_destroy(&mp->m_delalloc_rtextents);
free_delalloc:
percpu_counter_destroy(&mp->m_delalloc_blks);
free_fdblocks:
percpu_counter_destroy(&mp->m_fdblocks);
free_ifree:
percpu_counter_destroy(&mp->m_ifree);
free_icount:
@@ -1112,24 +1156,28 @@ xfs_reinit_percpu_counters(
{
percpu_counter_set(&mp->m_icount, mp->m_sb.sb_icount);
percpu_counter_set(&mp->m_ifree, mp->m_sb.sb_ifree);
percpu_counter_set(&mp->m_fdblocks, mp->m_sb.sb_fdblocks);
percpu_counter_set(&mp->m_frextents, mp->m_sb.sb_frextents);
xfs_set_freecounter(mp, XC_FREE_BLOCKS, mp->m_sb.sb_fdblocks);
if (!xfs_has_zoned(mp))
xfs_set_freecounter(mp, XC_FREE_RTEXTENTS,
mp->m_sb.sb_frextents);
}
static void
xfs_destroy_percpu_counters(
struct xfs_mount *mp)
{
enum xfs_free_counter i;
for (i = 0; i < XC_FREE_NR; i++)
percpu_counter_destroy(&mp->m_free[i].count);
percpu_counter_destroy(&mp->m_icount);
percpu_counter_destroy(&mp->m_ifree);
percpu_counter_destroy(&mp->m_fdblocks);
ASSERT(xfs_is_shutdown(mp) ||
percpu_counter_sum(&mp->m_delalloc_rtextents) == 0);
percpu_counter_destroy(&mp->m_delalloc_rtextents);
ASSERT(xfs_is_shutdown(mp) ||
percpu_counter_sum(&mp->m_delalloc_blks) == 0);
percpu_counter_destroy(&mp->m_delalloc_blks);
percpu_counter_destroy(&mp->m_frextents);
}
static int
@@ -1210,6 +1258,18 @@ xfs_fs_shutdown(
xfs_force_shutdown(XFS_M(sb), SHUTDOWN_DEVICE_REMOVED);
}
static int
xfs_fs_show_stats(
struct seq_file *m,
struct dentry *root)
{
struct xfs_mount *mp = XFS_M(root->d_sb);
if (xfs_has_zoned(mp) && IS_ENABLED(CONFIG_XFS_RT))
xfs_zoned_show_stats(m, mp);
return 0;
}
static const struct super_operations xfs_super_operations = {
.alloc_inode = xfs_fs_alloc_inode,
.destroy_inode = xfs_fs_destroy_inode,
@@ -1224,6 +1284,7 @@ static const struct super_operations xfs_super_operations = {
.nr_cached_objects = xfs_fs_nr_cached_objects,
.free_cached_objects = xfs_fs_free_cached_objects,
.shutdown = xfs_fs_shutdown,
.show_stats = xfs_fs_show_stats,
};
static int
@@ -1436,6 +1497,15 @@ xfs_fs_parse_param(
xfs_fs_warn_deprecated(fc, param, XFS_FEAT_NOATTR2, true);
parsing_mp->m_features |= XFS_FEAT_NOATTR2;
return 0;
case Opt_max_open_zones:
parsing_mp->m_max_open_zones = result.uint_32;
return 0;
case Opt_lifetime:
parsing_mp->m_features &= ~XFS_FEAT_NOLIFETIME;
return 0;
case Opt_nolifetime:
parsing_mp->m_features |= XFS_FEAT_NOLIFETIME;
return 0;
default:
xfs_warn(parsing_mp, "unknown mount option [%s].", param->key);
return -EINVAL;
@@ -1780,8 +1850,17 @@ xfs_fs_fill_super(
mp->m_features &= ~XFS_FEAT_DISCARD;
}
if (xfs_has_metadir(mp))
if (xfs_has_zoned(mp)) {
if (!xfs_has_metadir(mp)) {
xfs_alert(mp,
"metadir feature required for zoned realtime devices.");
error = -EINVAL;
goto out_filestream_unmount;
}
xfs_warn_experimental(mp, XFS_EXPERIMENTAL_ZONED);
} else if (xfs_has_metadir(mp)) {
xfs_warn_experimental(mp, XFS_EXPERIMENTAL_METADIR);
}
if (xfs_has_reflink(mp)) {
if (xfs_has_realtime(mp) &&
@@ -1793,6 +1872,13 @@ xfs_fs_fill_super(
goto out_filestream_unmount;
}
if (xfs_has_zoned(mp)) {
xfs_alert(mp,
"reflink not compatible with zoned RT device!");
error = -EINVAL;
goto out_filestream_unmount;
}
if (xfs_globals.always_cow) {
xfs_info(mp, "using DEBUG-only always_cow mode.");
mp->m_always_cow = true;
@@ -1917,6 +2003,9 @@ xfs_remount_rw(
/* Re-enable the background inode inactivation worker. */
xfs_inodegc_start(mp);
/* Restart zone reclaim */
xfs_zone_gc_start(mp);
return 0;
}
@@ -1961,6 +2050,9 @@ xfs_remount_ro(
*/
xfs_inodegc_stop(mp);
/* Stop zone reclaim */
xfs_zone_gc_stop(mp);
/* Free the per-AG metadata reservation pool. */
xfs_fs_unreserve_ag_blocks(mp);
@@ -2082,6 +2174,7 @@ xfs_init_fs_context(
for (i = 0; i < XG_TYPE_MAX; i++)
xa_init(&mp->m_groups[i].xa);
mutex_init(&mp->m_growlock);
mutex_init(&mp->m_metafile_resv_lock);
INIT_WORK(&mp->m_flush_inodes_work, xfs_flush_inodes_worker);
INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
mp->m_kobj.kobject.kset = xfs_kset;

View File

@@ -13,6 +13,7 @@
#include "xfs_log.h"
#include "xfs_log_priv.h"
#include "xfs_mount.h"
#include "xfs_zones.h"
struct xfs_sysfs_attr {
struct attribute attr;
@@ -69,7 +70,7 @@ static struct attribute *xfs_mp_attrs[] = {
};
ATTRIBUTE_GROUPS(xfs_mp);
const struct kobj_type xfs_mp_ktype = {
static const struct kobj_type xfs_mp_ktype = {
.release = xfs_sysfs_release,
.sysfs_ops = &xfs_sysfs_ops,
.default_groups = xfs_mp_groups,
@@ -701,45 +702,103 @@ xfs_error_sysfs_init_class(
return error;
}
static inline struct xfs_mount *zoned_to_mp(struct kobject *kobj)
{
return container_of(to_kobj(kobj), struct xfs_mount, m_zoned_kobj);
}
static ssize_t
max_open_zones_show(
struct kobject *kobj,
char *buf)
{
/* only report the open zones available for user data */
return sysfs_emit(buf, "%u\n",
zoned_to_mp(kobj)->m_max_open_zones - XFS_OPEN_GC_ZONES);
}
XFS_SYSFS_ATTR_RO(max_open_zones);
static struct attribute *xfs_zoned_attrs[] = {
ATTR_LIST(max_open_zones),
NULL,
};
ATTRIBUTE_GROUPS(xfs_zoned);
static const struct kobj_type xfs_zoned_ktype = {
.release = xfs_sysfs_release,
.sysfs_ops = &xfs_sysfs_ops,
.default_groups = xfs_zoned_groups,
};
int
xfs_error_sysfs_init(
xfs_mount_sysfs_init(
struct xfs_mount *mp)
{
int error;
super_set_sysfs_name_id(mp->m_super);
/* .../xfs/<dev>/ */
error = xfs_sysfs_init(&mp->m_kobj, &xfs_mp_ktype,
NULL, mp->m_super->s_id);
if (error)
return error;
/* .../xfs/<dev>/stats/ */
error = xfs_sysfs_init(&mp->m_stats.xs_kobj, &xfs_stats_ktype,
&mp->m_kobj, "stats");
if (error)
goto out_remove_fsdir;
/* .../xfs/<dev>/error/ */
error = xfs_sysfs_init(&mp->m_error_kobj, &xfs_error_ktype,
&mp->m_kobj, "error");
if (error)
return error;
goto out_remove_stats_dir;
/* .../xfs/<dev>/error/fail_at_unmount */
error = sysfs_create_file(&mp->m_error_kobj.kobject,
ATTR_LIST(fail_at_unmount));
if (error)
goto out_error;
goto out_remove_error_dir;
/* .../xfs/<dev>/error/metadata/ */
error = xfs_error_sysfs_init_class(mp, XFS_ERR_METADATA,
"metadata", &mp->m_error_meta_kobj,
xfs_error_meta_init);
if (error)
goto out_error;
goto out_remove_error_dir;
if (IS_ENABLED(CONFIG_XFS_RT) && xfs_has_zoned(mp)) {
/* .../xfs/<dev>/zoned/ */
error = xfs_sysfs_init(&mp->m_zoned_kobj, &xfs_zoned_ktype,
&mp->m_kobj, "zoned");
if (error)
goto out_remove_error_dir;
}
return 0;
out_error:
out_remove_error_dir:
xfs_sysfs_del(&mp->m_error_kobj);
out_remove_stats_dir:
xfs_sysfs_del(&mp->m_stats.xs_kobj);
out_remove_fsdir:
xfs_sysfs_del(&mp->m_kobj);
return error;
}
void
xfs_error_sysfs_del(
xfs_mount_sysfs_del(
struct xfs_mount *mp)
{
struct xfs_error_cfg *cfg;
int i, j;
if (IS_ENABLED(CONFIG_XFS_RT) && xfs_has_zoned(mp))
xfs_sysfs_del(&mp->m_zoned_kobj);
for (i = 0; i < XFS_ERR_CLASS_MAX; i++) {
for (j = 0; j < XFS_ERR_ERRNO_MAX; j++) {
cfg = &mp->m_error_cfg[i][j];
@@ -749,6 +808,8 @@ xfs_error_sysfs_del(
}
xfs_sysfs_del(&mp->m_error_meta_kobj);
xfs_sysfs_del(&mp->m_error_kobj);
xfs_sysfs_del(&mp->m_stats.xs_kobj);
xfs_sysfs_del(&mp->m_kobj);
}
struct xfs_error_cfg *

View File

@@ -7,7 +7,6 @@
#ifndef __XFS_SYSFS_H__
#define __XFS_SYSFS_H__
extern const struct kobj_type xfs_mp_ktype; /* xfs_mount */
extern const struct kobj_type xfs_dbg_ktype; /* debug */
extern const struct kobj_type xfs_log_ktype; /* xlog */
extern const struct kobj_type xfs_stats_ktype; /* stats */
@@ -53,7 +52,7 @@ xfs_sysfs_del(
wait_for_completion(&kobj->complete);
}
int xfs_error_sysfs_init(struct xfs_mount *mp);
void xfs_error_sysfs_del(struct xfs_mount *mp);
int xfs_mount_sysfs_init(struct xfs_mount *mp);
void xfs_mount_sysfs_del(struct xfs_mount *mp);
#endif /* __XFS_SYSFS_H__ */

View File

@@ -49,6 +49,8 @@
#include "xfs_metafile.h"
#include "xfs_metadir.h"
#include "xfs_rtgroup.h"
#include "xfs_zone_alloc.h"
#include "xfs_zone_priv.h"
/*
* We include this last to have the helpers above available for the trace

View File

@@ -102,6 +102,7 @@ struct xfs_rmap_intent;
struct xfs_refcount_intent;
struct xfs_metadir_update;
struct xfs_rtgroup;
struct xfs_open_zone;
#define XFS_ATTR_FILTER_FLAGS \
{ XFS_ATTR_ROOT, "ROOT" }, \
@@ -265,6 +266,152 @@ DEFINE_GROUP_REF_EVENT(xfs_group_grab);
DEFINE_GROUP_REF_EVENT(xfs_group_grab_next_tag);
DEFINE_GROUP_REF_EVENT(xfs_group_rele);
#ifdef CONFIG_XFS_RT
DECLARE_EVENT_CLASS(xfs_zone_class,
TP_PROTO(struct xfs_rtgroup *rtg),
TP_ARGS(rtg),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_rgnumber_t, rgno)
__field(xfs_rgblock_t, used)
__field(unsigned int, nr_open)
),
TP_fast_assign(
struct xfs_mount *mp = rtg_mount(rtg);
__entry->dev = mp->m_super->s_dev;
__entry->rgno = rtg_rgno(rtg);
__entry->used = rtg_rmap(rtg)->i_used_blocks;
__entry->nr_open = mp->m_zone_info->zi_nr_open_zones;
),
TP_printk("dev %d:%d rgno 0x%x used 0x%x nr_open %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->rgno,
__entry->used,
__entry->nr_open)
);
#define DEFINE_ZONE_EVENT(name) \
DEFINE_EVENT(xfs_zone_class, name, \
TP_PROTO(struct xfs_rtgroup *rtg), \
TP_ARGS(rtg))
DEFINE_ZONE_EVENT(xfs_zone_emptied);
DEFINE_ZONE_EVENT(xfs_zone_full);
DEFINE_ZONE_EVENT(xfs_zone_opened);
DEFINE_ZONE_EVENT(xfs_zone_reset);
DEFINE_ZONE_EVENT(xfs_zone_gc_target_opened);
TRACE_EVENT(xfs_zone_free_blocks,
TP_PROTO(struct xfs_rtgroup *rtg, xfs_rgblock_t rgbno,
xfs_extlen_t len),
TP_ARGS(rtg, rgbno, len),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_rgnumber_t, rgno)
__field(xfs_rgblock_t, used)
__field(xfs_rgblock_t, rgbno)
__field(xfs_extlen_t, len)
),
TP_fast_assign(
__entry->dev = rtg_mount(rtg)->m_super->s_dev;
__entry->rgno = rtg_rgno(rtg);
__entry->used = rtg_rmap(rtg)->i_used_blocks;
__entry->rgbno = rgbno;
__entry->len = len;
),
TP_printk("dev %d:%d rgno 0x%x used 0x%x rgbno 0x%x len 0x%x",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->rgno,
__entry->used,
__entry->rgbno,
__entry->len)
);
DECLARE_EVENT_CLASS(xfs_zone_alloc_class,
TP_PROTO(struct xfs_open_zone *oz, xfs_rgblock_t rgbno,
xfs_extlen_t len),
TP_ARGS(oz, rgbno, len),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_rgnumber_t, rgno)
__field(xfs_rgblock_t, used)
__field(xfs_rgblock_t, written)
__field(xfs_rgblock_t, write_pointer)
__field(xfs_rgblock_t, rgbno)
__field(xfs_extlen_t, len)
),
TP_fast_assign(
__entry->dev = rtg_mount(oz->oz_rtg)->m_super->s_dev;
__entry->rgno = rtg_rgno(oz->oz_rtg);
__entry->used = rtg_rmap(oz->oz_rtg)->i_used_blocks;
__entry->written = oz->oz_written;
__entry->write_pointer = oz->oz_write_pointer;
__entry->rgbno = rgbno;
__entry->len = len;
),
TP_printk("dev %d:%d rgno 0x%x used 0x%x written 0x%x wp 0x%x rgbno 0x%x len 0x%x",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->rgno,
__entry->used,
__entry->written,
__entry->write_pointer,
__entry->rgbno,
__entry->len)
);
#define DEFINE_ZONE_ALLOC_EVENT(name) \
DEFINE_EVENT(xfs_zone_alloc_class, name, \
TP_PROTO(struct xfs_open_zone *oz, xfs_rgblock_t rgbno, \
xfs_extlen_t len), \
TP_ARGS(oz, rgbno, len))
DEFINE_ZONE_ALLOC_EVENT(xfs_zone_record_blocks);
DEFINE_ZONE_ALLOC_EVENT(xfs_zone_alloc_blocks);
TRACE_EVENT(xfs_zone_gc_select_victim,
TP_PROTO(struct xfs_rtgroup *rtg, unsigned int bucket),
TP_ARGS(rtg, bucket),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_rgnumber_t, rgno)
__field(xfs_rgblock_t, used)
__field(unsigned int, bucket)
),
TP_fast_assign(
__entry->dev = rtg_mount(rtg)->m_super->s_dev;
__entry->rgno = rtg_rgno(rtg);
__entry->used = rtg_rmap(rtg)->i_used_blocks;
__entry->bucket = bucket;
),
TP_printk("dev %d:%d rgno 0x%x used 0x%x bucket %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->rgno,
__entry->used,
__entry->bucket)
);
TRACE_EVENT(xfs_zones_mount,
TP_PROTO(struct xfs_mount *mp),
TP_ARGS(mp),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_rgnumber_t, rgcount)
__field(uint32_t, blocks)
__field(unsigned int, max_open_zones)
),
TP_fast_assign(
__entry->dev = mp->m_super->s_dev;
__entry->rgcount = mp->m_sb.sb_rgcount;
__entry->blocks = mp->m_groups[XG_TYPE_RTG].blocks;
__entry->max_open_zones = mp->m_max_open_zones;
),
TP_printk("dev %d:%d zoned %u blocks_per_zone %u, max_open %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->rgcount,
__entry->blocks,
__entry->max_open_zones)
);
#endif /* CONFIG_XFS_RT */
TRACE_EVENT(xfs_inodegc_worker,
TP_PROTO(struct xfs_mount *mp, unsigned int shrinker_hits),
TP_ARGS(mp, shrinker_hits),
@@ -1596,6 +1743,7 @@ DEFINE_SIMPLE_IO_EVENT(xfs_end_io_direct_write);
DEFINE_SIMPLE_IO_EVENT(xfs_end_io_direct_write_unwritten);
DEFINE_SIMPLE_IO_EVENT(xfs_end_io_direct_write_append);
DEFINE_SIMPLE_IO_EVENT(xfs_file_splice_read);
DEFINE_SIMPLE_IO_EVENT(xfs_zoned_map_blocks);
DECLARE_EVENT_CLASS(xfs_itrunc_class,
TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size),
@@ -3983,6 +4131,7 @@ DEFINE_SIMPLE_IO_EVENT(xfs_reflink_cancel_cow_range);
DEFINE_SIMPLE_IO_EVENT(xfs_reflink_end_cow);
DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap_from);
DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap_to);
DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap_skip);
DEFINE_INODE_ERROR_EVENT(xfs_reflink_cancel_cow_range_error);
DEFINE_INODE_ERROR_EVENT(xfs_reflink_end_cow_error);
@@ -5606,11 +5755,10 @@ DEFINE_METADIR_EVENT(xfs_metadir_lookup);
/* metadata inode space reservations */
DECLARE_EVENT_CLASS(xfs_metafile_resv_class,
TP_PROTO(struct xfs_inode *ip, xfs_filblks_t len),
TP_ARGS(ip, len),
TP_PROTO(struct xfs_mount *mp, xfs_filblks_t len),
TP_ARGS(mp, len),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_ino_t, ino)
__field(unsigned long long, freeblks)
__field(unsigned long long, reserved)
__field(unsigned long long, asked)
@@ -5618,19 +5766,15 @@ DECLARE_EVENT_CLASS(xfs_metafile_resv_class,
__field(unsigned long long, len)
),
TP_fast_assign(
struct xfs_mount *mp = ip->i_mount;
__entry->dev = mp->m_super->s_dev;
__entry->ino = ip->i_ino;
__entry->freeblks = percpu_counter_sum(&mp->m_fdblocks);
__entry->reserved = ip->i_delayed_blks;
__entry->asked = ip->i_meta_resv_asked;
__entry->used = ip->i_nblocks;
__entry->freeblks = xfs_sum_freecounter_raw(mp, XC_FREE_BLOCKS);
__entry->reserved = mp->m_metafile_resv_avail;
__entry->asked = mp->m_metafile_resv_target;
__entry->used = mp->m_metafile_resv_used;
__entry->len = len;
),
TP_printk("dev %d:%d ino 0x%llx freeblks %llu resv %llu ask %llu used %llu len %llu",
TP_printk("dev %d:%d freeblks %llu resv %llu ask %llu used %llu len %llu",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__entry->freeblks,
__entry->reserved,
__entry->asked,
@@ -5639,14 +5783,14 @@ DECLARE_EVENT_CLASS(xfs_metafile_resv_class,
)
#define DEFINE_METAFILE_RESV_EVENT(name) \
DEFINE_EVENT(xfs_metafile_resv_class, name, \
TP_PROTO(struct xfs_inode *ip, xfs_filblks_t len), \
TP_ARGS(ip, len))
TP_PROTO(struct xfs_mount *mp, xfs_filblks_t len), \
TP_ARGS(mp, len))
DEFINE_METAFILE_RESV_EVENT(xfs_metafile_resv_init);
DEFINE_METAFILE_RESV_EVENT(xfs_metafile_resv_free);
DEFINE_METAFILE_RESV_EVENT(xfs_metafile_resv_alloc_space);
DEFINE_METAFILE_RESV_EVENT(xfs_metafile_resv_free_space);
DEFINE_METAFILE_RESV_EVENT(xfs_metafile_resv_critical);
DEFINE_INODE_ERROR_EVENT(xfs_metafile_resv_init_error);
DEFINE_METAFILE_RESV_EVENT(xfs_metafile_resv_init_error);
#ifdef CONFIG_XFS_RT
TRACE_EVENT(xfs_growfs_check_rtgeom,
@@ -5669,6 +5813,46 @@ TRACE_EVENT(xfs_growfs_check_rtgeom,
);
#endif /* CONFIG_XFS_RT */
TRACE_DEFINE_ENUM(XC_FREE_BLOCKS);
TRACE_DEFINE_ENUM(XC_FREE_RTEXTENTS);
TRACE_DEFINE_ENUM(XC_FREE_RTAVAILABLE);
DECLARE_EVENT_CLASS(xfs_freeblocks_resv_class,
TP_PROTO(struct xfs_mount *mp, enum xfs_free_counter ctr,
uint64_t delta, unsigned long caller_ip),
TP_ARGS(mp, ctr, delta, caller_ip),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(enum xfs_free_counter, ctr)
__field(uint64_t, delta)
__field(uint64_t, avail)
__field(uint64_t, total)
__field(unsigned long, caller_ip)
),
TP_fast_assign(
__entry->dev = mp->m_super->s_dev;
__entry->ctr = ctr;
__entry->delta = delta;
__entry->avail = mp->m_free[ctr].res_avail;
__entry->total = mp->m_free[ctr].res_total;
__entry->caller_ip = caller_ip;
),
TP_printk("dev %d:%d ctr %s delta %llu avail %llu total %llu caller %pS",
MAJOR(__entry->dev), MINOR(__entry->dev),
__print_symbolic(__entry->ctr, XFS_FREECOUNTER_STR),
__entry->delta,
__entry->avail,
__entry->total,
(char *)__entry->caller_ip)
)
#define DEFINE_FREEBLOCKS_RESV_EVENT(name) \
DEFINE_EVENT(xfs_freeblocks_resv_class, name, \
TP_PROTO(struct xfs_mount *mp, enum xfs_free_counter ctr, \
uint64_t delta, unsigned long caller_ip), \
TP_ARGS(mp, ctr, delta, caller_ip))
DEFINE_FREEBLOCKS_RESV_EVENT(xfs_freecounter_reserved);
DEFINE_FREEBLOCKS_RESV_EVENT(xfs_freecounter_enospc);
#endif /* _TRACE_XFS_H */
#undef TRACE_INCLUDE_PATH

fs/xfs/xfs_zone_alloc.c (new file, 1211 lines)

File diff suppressed because it is too large

fs/xfs/xfs_zone_alloc.h (new file, 70 lines)
View File

@@ -0,0 +1,70 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _XFS_ZONE_ALLOC_H
#define _XFS_ZONE_ALLOC_H
struct iomap_ioend;
struct xfs_open_zone;
struct xfs_zone_alloc_ctx {
struct xfs_open_zone *open_zone;
xfs_filblks_t reserved_blocks;
};
/*
* Grab any available space, even if it is less than what the caller asked for.
*/
#define XFS_ZR_GREEDY (1U << 0)
/*
* Only grab instantly available space, don't wait or GC.
*/
#define XFS_ZR_NOWAIT (1U << 1)
/*
* Dip into the reserved pool.
*/
#define XFS_ZR_RESERVED (1U << 2)
int xfs_zoned_space_reserve(struct xfs_inode *ip, xfs_filblks_t count_fsb,
unsigned int flags, struct xfs_zone_alloc_ctx *ac);
void xfs_zoned_space_unreserve(struct xfs_inode *ip,
struct xfs_zone_alloc_ctx *ac);
void xfs_zoned_add_available(struct xfs_mount *mp, xfs_filblks_t count_fsb);
void xfs_zone_alloc_and_submit(struct iomap_ioend *ioend,
struct xfs_open_zone **oz);
int xfs_zone_free_blocks(struct xfs_trans *tp, struct xfs_rtgroup *rtg,
xfs_fsblock_t fsbno, xfs_filblks_t len);
int xfs_zoned_end_io(struct xfs_inode *ip, xfs_off_t offset, xfs_off_t count,
xfs_daddr_t daddr, struct xfs_open_zone *oz,
xfs_fsblock_t old_startblock);
void xfs_open_zone_put(struct xfs_open_zone *oz);
void xfs_zoned_wake_all(struct xfs_mount *mp);
bool xfs_zone_rgbno_is_valid(struct xfs_rtgroup *rtg, xfs_rgnumber_t rgbno);
void xfs_mark_rtg_boundary(struct iomap_ioend *ioend);
uint64_t xfs_zoned_default_resblks(struct xfs_mount *mp,
enum xfs_free_counter ctr);
void xfs_zoned_show_stats(struct seq_file *m, struct xfs_mount *mp);
#ifdef CONFIG_XFS_RT
int xfs_mount_zones(struct xfs_mount *mp);
void xfs_unmount_zones(struct xfs_mount *mp);
void xfs_zone_gc_start(struct xfs_mount *mp);
void xfs_zone_gc_stop(struct xfs_mount *mp);
#else
static inline int xfs_mount_zones(struct xfs_mount *mp)
{
return -EIO;
}
static inline void xfs_unmount_zones(struct xfs_mount *mp)
{
}
static inline void xfs_zone_gc_start(struct xfs_mount *mp)
{
}
static inline void xfs_zone_gc_stop(struct xfs_mount *mp)
{
}
#endif /* CONFIG_XFS_RT */
#endif /* _XFS_ZONE_ALLOC_H */
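
As a quick orientation for the API declared above, here is a hedged caller sketch showing how the reservation entry points combine; the function name, flag choice and error handling are illustrative assumptions, not code from this series.

/* Hypothetical caller sketch, not part of this series. */
static int example_reserve_for_write(struct xfs_inode *ip,
		xfs_filblks_t count_fsb, bool nowait)
{
	struct xfs_zone_alloc_ctx ac = { };
	int error;

	/* XFS_ZR_NOWAIT fails with -EAGAIN instead of waiting for GC. */
	error = xfs_zoned_space_reserve(ip, count_fsb,
			nowait ? XFS_ZR_NOWAIT : XFS_ZR_GREEDY, &ac);
	if (error)
		return error;

	/* ... perform the zoned write using @ac here ... */

	xfs_zoned_space_unreserve(ip, &ac);
	return 0;
}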

fs/xfs/xfs_zone_gc.c (new file, 1165 lines)

File diff suppressed because it is too large

fs/xfs/xfs_zone_info.c (new file, 105 lines)
View File

@@ -0,0 +1,105 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (c) 2023-2025 Christoph Hellwig.
* Copyright (c) 2024-2025, Western Digital Corporation or its affiliates.
*/
#include "xfs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_rtgroup.h"
#include "xfs_zone_alloc.h"
#include "xfs_zone_priv.h"
static const char xfs_write_hint_shorthand[6][16] = {
"NOT_SET", "NONE", "SHORT", "MEDIUM", "LONG", "EXTREME"};
static inline const char *
xfs_write_hint_to_str(
uint8_t write_hint)
{
if (write_hint > WRITE_LIFE_EXTREME)
return "UNKNOWN";
return xfs_write_hint_shorthand[write_hint];
}
static void
xfs_show_open_zone(
struct seq_file *m,
struct xfs_open_zone *oz)
{
seq_printf(m, "\t zone %d, wp %u, written %u, used %u, hint %s\n",
rtg_rgno(oz->oz_rtg),
oz->oz_write_pointer, oz->oz_written,
rtg_rmap(oz->oz_rtg)->i_used_blocks,
xfs_write_hint_to_str(oz->oz_write_hint));
}
static void
xfs_show_full_zone_used_distribution(
struct seq_file *m,
struct xfs_mount *mp)
{
struct xfs_zone_info *zi = mp->m_zone_info;
unsigned int reclaimable = 0, full, i;
spin_lock(&zi->zi_used_buckets_lock);
for (i = 0; i < XFS_ZONE_USED_BUCKETS; i++) {
unsigned int entries = zi->zi_used_bucket_entries[i];
seq_printf(m, "\t %2u..%2u%%: %u\n",
i * (100 / XFS_ZONE_USED_BUCKETS),
(i + 1) * (100 / XFS_ZONE_USED_BUCKETS) - 1,
entries);
reclaimable += entries;
}
spin_unlock(&zi->zi_used_buckets_lock);
full = mp->m_sb.sb_rgcount;
if (zi->zi_open_gc_zone)
full--;
full -= zi->zi_nr_open_zones;
full -= atomic_read(&zi->zi_nr_free_zones);
full -= reclaimable;
seq_printf(m, "\t 100%%: %u\n", full);
}
void
xfs_zoned_show_stats(
struct seq_file *m,
struct xfs_mount *mp)
{
struct xfs_zone_info *zi = mp->m_zone_info;
struct xfs_open_zone *oz;
seq_puts(m, "\n");
seq_printf(m, "\tuser free RT blocks: %lld\n",
xfs_sum_freecounter(mp, XC_FREE_RTEXTENTS));
seq_printf(m, "\treserved free RT blocks: %lld\n",
mp->m_free[XC_FREE_RTEXTENTS].res_avail);
seq_printf(m, "\tuser available RT blocks: %lld\n",
xfs_sum_freecounter(mp, XC_FREE_RTAVAILABLE));
seq_printf(m, "\treserved available RT blocks: %lld\n",
mp->m_free[XC_FREE_RTAVAILABLE].res_avail);
seq_printf(m, "\tRT reservations required: %d\n",
!list_empty_careful(&zi->zi_reclaim_reservations));
seq_printf(m, "\tRT GC required: %d\n",
xfs_zoned_need_gc(mp));
seq_printf(m, "\tfree zones: %d\n", atomic_read(&zi->zi_nr_free_zones));
seq_puts(m, "\topen zones:\n");
spin_lock(&zi->zi_open_zones_lock);
list_for_each_entry(oz, &zi->zi_open_zones, oz_entry)
xfs_show_open_zone(m, oz);
if (zi->zi_open_gc_zone) {
seq_puts(m, "\topen gc zone:\n");
xfs_show_open_zone(m, zi->zi_open_gc_zone);
}
spin_unlock(&zi->zi_open_zones_lock);
seq_puts(m, "\tused blocks distribution (fully written zones):\n");
xfs_show_full_zone_used_distribution(m, mp);
}
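
For example (illustrative numbers only): with sb_rgcount = 128, one open GC zone, three regular open zones, ten free zones and thirty zones counted in the reclaimable buckets, xfs_show_full_zone_used_distribution() above reports 128 - 1 - 3 - 10 - 30 = 84 fully written zones on the "100%" line.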

fs/xfs/xfs_zone_priv.h (new file, 119 lines)
View File

@@ -0,0 +1,119 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _XFS_ZONE_PRIV_H
#define _XFS_ZONE_PRIV_H
struct xfs_open_zone {
/*
* Entry in the open zone list and refcount. Protected by
* zi_open_zones_lock in struct xfs_zone_info.
*/
struct list_head oz_entry;
atomic_t oz_ref;
/*
* oz_write_pointer is the write pointer at which space is handed out
* for conventional zones, or simply the count of blocks handed out
* so far for sequential write required zones, and is protected by
* oz_alloc_lock.
*/
spinlock_t oz_alloc_lock;
xfs_rgblock_t oz_write_pointer;
/*
* oz_written is the number of blocks for which we've received a
* write completion. oz_written must always be <= oz_write_pointer
* and is protected by the ILOCK of the rmap inode.
*/
xfs_rgblock_t oz_written;
/*
* Write hint (data temperature) assigned to this zone, or
* WRITE_LIFE_NOT_SET if none was set.
*/
enum rw_hint oz_write_hint;
/*
* Is this open zone used for garbage collection? There can only be a
* single open GC zone, which is pointed to by zi_open_gc_zone in
* struct xfs_zone_info. Constant over the lifetime of an open zone.
*/
bool oz_is_gc;
/*
* Pointer to the RT group structure for this open zone. Constant over
* the lifetime of an open zone.
*/
struct xfs_rtgroup *oz_rtg;
};
/*
* Number of bitmap buckets to track reclaimable zones. There are 10 buckets
* so that each 10% of the usable capacity gets its own bucket and GC only
* has to walk the bitmaps of the lesser-used zones if there are any.
*/
#define XFS_ZONE_USED_BUCKETS 10u
struct xfs_zone_info {
/*
* List of pending space reservations:
*/
spinlock_t zi_reservation_lock;
struct list_head zi_reclaim_reservations;
/*
* List and number of open zones:
*/
spinlock_t zi_open_zones_lock;
struct list_head zi_open_zones;
unsigned int zi_nr_open_zones;
/*
* Free zone search cursor and number of free zones:
*/
unsigned long zi_free_zone_cursor;
atomic_t zi_nr_free_zones;
/*
* Wait queue to wait for free zones or open zone resources to become
* available:
*/
wait_queue_head_t zi_zone_wait;
/*
* Pointer to the GC thread, and the current open zone used by GC
* (if any).
*
* zi_open_gc_zone is mostly private to the GC thread, but can be read
* for debugging from other threads, in which case zi_open_zones_lock
* must be taken to access it.
*/
struct task_struct *zi_gc_thread;
struct xfs_open_zone *zi_open_gc_zone;
/*
* List of zones that need a reset:
*/
spinlock_t zi_reset_list_lock;
struct xfs_group *zi_reset_list;
/*
* A set of bitmaps to bucket-sort reclaimable zones by used blocks, to help
* garbage collection quickly find the best candidate for reclaim.
*/
spinlock_t zi_used_buckets_lock;
unsigned int zi_used_bucket_entries[XFS_ZONE_USED_BUCKETS];
unsigned long *zi_used_bucket_bitmap[XFS_ZONE_USED_BUCKETS];
};
struct xfs_open_zone *xfs_open_zone(struct xfs_mount *mp,
enum rw_hint write_hint, bool is_gc);
int xfs_zone_gc_reset_sync(struct xfs_rtgroup *rtg);
bool xfs_zoned_need_gc(struct xfs_mount *mp);
int xfs_zone_gc_mount(struct xfs_mount *mp);
void xfs_zone_gc_unmount(struct xfs_mount *mp);
void xfs_zoned_resv_wake_all(struct xfs_mount *mp);
#endif /* _XFS_ZONE_PRIV_H */
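
To make the XFS_ZONE_USED_BUCKETS comment above concrete, here is a rough sketch of how a zone's used-block count could be mapped to a 10%-wide bucket; the helper name and the rounding are assumptions for illustration, not the in-tree implementation.

/* Illustrative only: map used blocks to one of the ten usage buckets. */
static unsigned int example_zone_used_bucket(unsigned int used_blocks,
		unsigned int blocks_per_zone)
{
	unsigned int bucket = used_blocks * XFS_ZONE_USED_BUCKETS /
			blocks_per_zone;

	/* a completely full zone still lands in the last bucket */
	return min(bucket, XFS_ZONE_USED_BUCKETS - 1);
}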

View File

@@ -0,0 +1,253 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (c) 2023-2025 Christoph Hellwig.
* Copyright (c) 2024-2025, Western Digital Corporation or its affiliates.
*/
#include "xfs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_rtbitmap.h"
#include "xfs_zone_alloc.h"
#include "xfs_zone_priv.h"
#include "xfs_zones.h"
/*
* Note: the zoned allocator does not support an rtextsize > 1, so this code
* and the allocator itself use file system blocks interchangeably with
* realtime extents without doing the otherwise required conversions.
*/
/*
* Per-task space reservation.
*
* Tasks that need to wait for GC to free up space allocate one of these
* on-stack and add it to the per-mount zi_reclaim_reservations list.
* The GC thread will then wake the tasks in order when space becomes available.
*/
struct xfs_zone_reservation {
struct list_head entry;
struct task_struct *task;
xfs_filblks_t count_fsb;
};
/*
* Calculate the number of reserved blocks.
*
* XC_FREE_RTEXTENTS counts the user-available capacity up to which the file
* system can be filled, while XC_FREE_RTAVAILABLE counts the blocks instantly
* available for writes without waiting for GC.
*
* For XC_FREE_RTAVAILABLE only the smaller reservation required for GC and
* block zeroing is excluded from the user capacity, while XC_FREE_RTEXTENTS
* is further restricted by at least one zone as well as the optional
* persistently reserved blocks. This allows the allocator to run more
* smoothly by not always triggering GC.
*/
uint64_t
xfs_zoned_default_resblks(
struct xfs_mount *mp,
enum xfs_free_counter ctr)
{
switch (ctr) {
case XC_FREE_RTEXTENTS:
return (uint64_t)XFS_RESERVED_ZONES *
mp->m_groups[XG_TYPE_RTG].blocks +
mp->m_sb.sb_rtreserved;
case XC_FREE_RTAVAILABLE:
return (uint64_t)XFS_GC_ZONES *
mp->m_groups[XG_TYPE_RTG].blocks;
default:
ASSERT(0);
return 0;
}
}
void
xfs_zoned_resv_wake_all(
struct xfs_mount *mp)
{
struct xfs_zone_info *zi = mp->m_zone_info;
struct xfs_zone_reservation *reservation;
spin_lock(&zi->zi_reservation_lock);
list_for_each_entry(reservation, &zi->zi_reclaim_reservations, entry)
wake_up_process(reservation->task);
spin_unlock(&zi->zi_reservation_lock);
}
void
xfs_zoned_add_available(
struct xfs_mount *mp,
xfs_filblks_t count_fsb)
{
struct xfs_zone_info *zi = mp->m_zone_info;
struct xfs_zone_reservation *reservation;
if (list_empty_careful(&zi->zi_reclaim_reservations)) {
xfs_add_freecounter(mp, XC_FREE_RTAVAILABLE, count_fsb);
return;
}
spin_lock(&zi->zi_reservation_lock);
xfs_add_freecounter(mp, XC_FREE_RTAVAILABLE, count_fsb);
count_fsb = xfs_sum_freecounter(mp, XC_FREE_RTAVAILABLE);
list_for_each_entry(reservation, &zi->zi_reclaim_reservations, entry) {
if (reservation->count_fsb > count_fsb)
break;
wake_up_process(reservation->task);
count_fsb -= reservation->count_fsb;
}
spin_unlock(&zi->zi_reservation_lock);
}
static int
xfs_zoned_space_wait_error(
struct xfs_mount *mp)
{
if (xfs_is_shutdown(mp))
return -EIO;
if (fatal_signal_pending(current))
return -EINTR;
return 0;
}
static int
xfs_zoned_reserve_available(
struct xfs_inode *ip,
xfs_filblks_t count_fsb,
unsigned int flags)
{
struct xfs_mount *mp = ip->i_mount;
struct xfs_zone_info *zi = mp->m_zone_info;
struct xfs_zone_reservation reservation = {
.task = current,
.count_fsb = count_fsb,
};
int error;
/*
* If there are no waiters, try to directly grab the available blocks
* from the percpu counter.
*
* If the caller wants to dip into the reserved pool, also bypass the
* wait list. This relies on the fact that we have a very generously
* sized reserved pool that always has enough space. If the reserved
* allocations fail, we're in trouble.
*/
if (likely(list_empty_careful(&zi->zi_reclaim_reservations) ||
(flags & XFS_ZR_RESERVED))) {
error = xfs_dec_freecounter(mp, XC_FREE_RTAVAILABLE, count_fsb,
flags & XFS_ZR_RESERVED);
if (error != -ENOSPC)
return error;
}
if (flags & XFS_ZR_NOWAIT)
return -EAGAIN;
spin_lock(&zi->zi_reservation_lock);
list_add_tail(&reservation.entry, &zi->zi_reclaim_reservations);
while ((error = xfs_zoned_space_wait_error(mp)) == 0) {
set_current_state(TASK_KILLABLE);
error = xfs_dec_freecounter(mp, XC_FREE_RTAVAILABLE, count_fsb,
flags & XFS_ZR_RESERVED);
if (error != -ENOSPC)
break;
/*
* If there is no reclaimable group left and we aren't still
* processing a pending GC request, give up as we're fully out
* of space.
*/
if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_RECLAIMABLE) &&
!xfs_is_zonegc_running(mp))
break;
spin_unlock(&zi->zi_reservation_lock);
schedule();
spin_lock(&zi->zi_reservation_lock);
}
list_del(&reservation.entry);
spin_unlock(&zi->zi_reservation_lock);
__set_current_state(TASK_RUNNING);
return error;
}
/*
* Implement greedy space allocation for short writes by trying to grab all
* that is left after locking out other threads from trying to do the same.
*
* This isn't exactly optimal and can hopefully be replaced by a proper
* percpu_counter primitive one day.
*/
static int
xfs_zoned_reserve_extents_greedy(
struct xfs_inode *ip,
xfs_filblks_t *count_fsb,
unsigned int flags)
{
struct xfs_mount *mp = ip->i_mount;
struct xfs_zone_info *zi = mp->m_zone_info;
s64 len = *count_fsb;
int error = -ENOSPC;
spin_lock(&zi->zi_reservation_lock);
len = min(len, xfs_sum_freecounter(mp, XC_FREE_RTEXTENTS));
if (len > 0) {
*count_fsb = len;
error = xfs_dec_freecounter(mp, XC_FREE_RTEXTENTS, *count_fsb,
flags & XFS_ZR_RESERVED);
}
spin_unlock(&zi->zi_reservation_lock);
return error;
}
int
xfs_zoned_space_reserve(
struct xfs_inode *ip,
xfs_filblks_t count_fsb,
unsigned int flags,
struct xfs_zone_alloc_ctx *ac)
{
struct xfs_mount *mp = ip->i_mount;
int error;
ASSERT(ac->reserved_blocks == 0);
ASSERT(ac->open_zone == NULL);
error = xfs_dec_freecounter(mp, XC_FREE_RTEXTENTS, count_fsb,
flags & XFS_ZR_RESERVED);
if (error == -ENOSPC && (flags & XFS_ZR_GREEDY) && count_fsb > 1)
error = xfs_zoned_reserve_extents_greedy(ip, &count_fsb, flags);
if (error)
return error;
error = xfs_zoned_reserve_available(ip, count_fsb, flags);
if (error) {
xfs_add_freecounter(mp, XC_FREE_RTEXTENTS, count_fsb);
return error;
}
ac->reserved_blocks = count_fsb;
return 0;
}
void
xfs_zoned_space_unreserve(
struct xfs_inode *ip,
struct xfs_zone_alloc_ctx *ac)
{
if (ac->reserved_blocks > 0) {
struct xfs_mount *mp = ip->i_mount;
xfs_zoned_add_available(mp, ac->reserved_blocks);
xfs_add_freecounter(mp, XC_FREE_RTEXTENTS, ac->reserved_blocks);
}
if (ac->open_zone)
xfs_open_zone_put(ac->open_zone);
}

View File

@@ -299,7 +299,7 @@ static vm_fault_t zonefs_filemap_page_mkwrite(struct vm_fault *vmf)
/* Serialize against truncates */
filemap_invalidate_lock_shared(inode->i_mapping);
ret = iomap_page_mkwrite(vmf, &zonefs_write_iomap_ops);
ret = iomap_page_mkwrite(vmf, &zonefs_write_iomap_ops, NULL);
filemap_invalidate_unlock_shared(inode->i_mapping);
sb_end_pagefault(inode->i_sb);

View File

@@ -56,6 +56,10 @@ struct vm_fault;
*
* IOMAP_F_BOUNDARY indicates that I/O and I/O completions for this iomap must
* never be merged with the mapping before it.
*
* IOMAP_F_ANON_WRITE indicates that (write) I/O does not have a target block
* assigned to it yet and the file system will do that in the bio submission
* handler, splitting the I/O as needed.
*/
#define IOMAP_F_NEW (1U << 0)
#define IOMAP_F_DIRTY (1U << 1)
@@ -68,6 +72,7 @@ struct vm_fault;
#endif /* CONFIG_BUFFER_HEAD */
#define IOMAP_F_XATTR (1U << 5)
#define IOMAP_F_BOUNDARY (1U << 6)
#define IOMAP_F_ANON_WRITE (1U << 7)
/*
* Flags set by the core iomap code during operations:
@@ -111,6 +116,8 @@ struct iomap {
static inline sector_t iomap_sector(const struct iomap *iomap, loff_t pos)
{
if (iomap->flags & IOMAP_F_ANON_WRITE)
return U64_MAX; /* invalid */
return (iomap->addr + pos - iomap->offset) >> SECTOR_SHIFT;
}
@@ -182,7 +189,9 @@ struct iomap_folio_ops {
#else
#define IOMAP_DAX 0
#endif /* CONFIG_FS_DAX */
#define IOMAP_ATOMIC (1 << 9)
#define IOMAP_ATOMIC_HW (1 << 9) /* HW-based torn-write protection */
#define IOMAP_DONTCACHE (1 << 10)
#define IOMAP_ATOMIC_SW (1 << 11) /* SW-based torn-write protection */
struct iomap_ops {
/*
@@ -211,8 +220,10 @@ struct iomap_ops {
* calls to iomap_iter(). Treat as read-only in the body.
* @len: The remaining length of the file segment we're operating on.
* It is updated at the same time as @pos.
* @processed: The number of bytes processed by the body in the most recent
* iteration, or a negative errno. 0 causes the iteration to stop.
* @iter_start_pos: The original start pos for the current iomap. Used for
* incremental iter advance.
* @status: Status of the most recent iteration. Zero on success or a negative
* errno on error.
* @flags: Zero or more of the iomap_begin flags above.
* @iomap: Map describing the I/O iteration
* @srcmap: Source map for COW operations
@@ -221,7 +232,8 @@ struct iomap_iter {
struct inode *inode;
loff_t pos;
u64 len;
s64 processed;
loff_t iter_start_pos;
int status;
unsigned flags;
struct iomap iomap;
struct iomap srcmap;
@@ -229,6 +241,26 @@ struct iomap_iter {
};
int iomap_iter(struct iomap_iter *iter, const struct iomap_ops *ops);
int iomap_iter_advance(struct iomap_iter *iter, u64 *count);
/**
* iomap_length_trim - trimmed length of the current iomap iteration
* @iter: iteration structure
* @pos: File position to trim from.
* @len: Length of the mapping to trim to.
*
* Returns a trimmed length that the operation applies to for the current
* iteration.
*/
static inline u64 iomap_length_trim(const struct iomap_iter *iter, loff_t pos,
u64 len)
{
u64 end = iter->iomap.offset + iter->iomap.length;
if (iter->srcmap.type != IOMAP_HOLE)
end = min(end, iter->srcmap.offset + iter->srcmap.length);
return min(len, end - pos);
}
/**
* iomap_length - length of the current iomap iteration
@@ -238,11 +270,17 @@ int iomap_iter(struct iomap_iter *iter, const struct iomap_ops *ops);
*/
static inline u64 iomap_length(const struct iomap_iter *iter)
{
u64 end = iter->iomap.offset + iter->iomap.length;
return iomap_length_trim(iter, iter->pos, iter->len);
}
if (iter->srcmap.type != IOMAP_HOLE)
end = min(end, iter->srcmap.offset + iter->srcmap.length);
return min(iter->len, end - iter->pos);
/**
* iomap_iter_advance_full - advance by the full length of current map
*/
static inline int iomap_iter_advance_full(struct iomap_iter *iter)
{
u64 length = iomap_length(iter);
return iomap_iter_advance(iter, &length);
}
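
As a reading aid for the reworked iterator above, the sketch below shows the shape of a caller under the status/advance model; the example_* names are placeholders and not part of iomap.

/*
 * Illustrative only: an operation body that consumes the current mapping
 * and advances the iterator, plus the driving loop.
 */
static int example_op_iter(struct iomap_iter *iter)
{
	u64 length = iomap_length(iter);	/* trimmed to the current mapping */

	/* ... operate on [iter->pos, iter->pos + length) here ... */

	return iomap_iter_advance(iter, &length);
}

static int example_op(struct inode *inode, loff_t pos, u64 len,
		const struct iomap_ops *ops)
{
	struct iomap_iter iter = {
		.inode	= inode,
		.pos	= pos,
		.len	= len,
	};
	int ret;

	while ((ret = iomap_iter(&iter, ops)) > 0)
		iter.status = example_op_iter(&iter);
	return ret;
}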
/**
@@ -306,12 +344,11 @@ bool iomap_dirty_folio(struct address_space *mapping, struct folio *folio);
int iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len,
const struct iomap_ops *ops);
int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len,
bool *did_zero, const struct iomap_ops *ops);
bool *did_zero, const struct iomap_ops *ops, void *private);
int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
const struct iomap_ops *ops);
vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf,
const struct iomap_ops *ops);
const struct iomap_ops *ops, void *private);
vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops,
void *private);
typedef void (*iomap_punch_t)(struct inode *inode, loff_t offset, loff_t length,
struct iomap *iomap);
void iomap_write_delalloc_release(struct inode *inode, loff_t start_byte,
@@ -327,17 +364,43 @@ loff_t iomap_seek_data(struct inode *inode, loff_t offset,
sector_t iomap_bmap(struct address_space *mapping, sector_t bno,
const struct iomap_ops *ops);
/*
* Flags for iomap_ioend->io_flags.
*/
/* shared COW extent */
#define IOMAP_IOEND_SHARED (1U << 0)
/* unwritten extent */
#define IOMAP_IOEND_UNWRITTEN (1U << 1)
/* don't merge into previous ioend */
#define IOMAP_IOEND_BOUNDARY (1U << 2)
/* is direct I/O */
#define IOMAP_IOEND_DIRECT (1U << 3)
/*
* Flags that, if set on either ioend, prevent the merge of two ioends.
* (IOMAP_IOEND_BOUNDARY also prevents merges, but only one-way)
*/
#define IOMAP_IOEND_NOMERGE_FLAGS \
(IOMAP_IOEND_SHARED | IOMAP_IOEND_UNWRITTEN | IOMAP_IOEND_DIRECT)
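
A small sketch of the kind of check these flags enable; the in-tree merge logic lives in iomap_ioend_try_merge() and may apply further conditions, so treat this as illustration only.

/* Illustrative only: may two adjacent ioends be merged? */
static bool example_ioends_mergeable(const struct iomap_ioend *prev,
		const struct iomap_ioend *next)
{
	if ((prev->io_flags | next->io_flags) & IOMAP_IOEND_NOMERGE_FLAGS)
		return false;
	/* BOUNDARY only blocks merging into the previous ioend */
	if (next->io_flags & IOMAP_IOEND_BOUNDARY)
		return false;
	return prev->io_offset + prev->io_size == next->io_offset;
}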
/*
* Structure for writeback I/O completions.
*
* File systems implementing ->submit_ioend (for buffered I/O) or ->submit_io
* (for direct I/O) can split a bio generated by iomap. In that case the parent
* ioend it was split from is recorded in ioend->io_parent.
*/
struct iomap_ioend {
struct list_head io_list; /* next ioend in chain */
u16 io_type;
u16 io_flags; /* IOMAP_F_* */
u16 io_flags; /* IOMAP_IOEND_* */
struct inode *io_inode; /* file being written to */
size_t io_size; /* size of data within eof */
size_t io_size; /* size of the extent */
atomic_t io_remaining; /* completion defer count */
int io_error; /* stashed away status */
struct iomap_ioend *io_parent; /* parent for completions */
loff_t io_offset; /* offset in the file */
sector_t io_sector; /* start sector of ioend */
void *io_private; /* file system private data */
struct bio io_bio; /* MUST BE LAST! */
};
@@ -362,12 +425,14 @@ struct iomap_writeback_ops {
loff_t offset, unsigned len);
/*
* Optional, allows the file systems to perform actions just before
* submitting the bio and/or override the bio end_io handler for complex
* operations like copy on write extent manipulation or unwritten extent
* conversions.
* Optional, allows the file systems to hook into bio submission,
* including overriding the bi_end_io handler.
*
* Returns 0 if the bio was successfully submitted, or a negative
* error code if status was non-zero or another error happened and
* the bio could not be submitted.
*/
int (*prepare_ioend)(struct iomap_ioend *ioend, int status);
int (*submit_ioend)(struct iomap_writepage_ctx *wpc, int status);
/*
* Optional, allows the file system to discard state on a page where
@@ -383,6 +448,10 @@ struct iomap_writepage_ctx {
u32 nr_folios; /* folios added to the ioend */
};
struct iomap_ioend *iomap_init_ioend(struct inode *inode, struct bio *bio,
loff_t file_offset, u16 ioend_flags);
struct iomap_ioend *iomap_split_ioend(struct iomap_ioend *ioend,
unsigned int max_len, bool is_append);
void iomap_finish_ioends(struct iomap_ioend *ioend, int error);
void iomap_ioend_try_merge(struct iomap_ioend *ioend,
struct list_head *more_ioends);
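
Tying the ->submit_ioend contract above together, this is a hedged sketch of the minimal shape of a submission handler; a real filesystem could additionally assign the final disk address for IOMAP_F_ANON_WRITE mappings or install its own bi_end_io here. The example_* name is invented for illustration and does not correspond to any in-tree filesystem.

/* Illustrative only: the minimal contract of a ->submit_ioend handler. */
static int example_submit_ioend(struct iomap_writepage_ctx *wpc, int status)
{
	/*
	 * Hook point: resolve the target block or override bi_end_io on
	 * wpc->ioend->io_bio before submission, if needed.
	 */
	if (status)
		return status;	/* writeback already failed; do not submit */
	submit_bio(&wpc->ioend->io_bio);
	return 0;
}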
@@ -434,6 +503,11 @@ struct iomap_dio_ops {
*/
#define IOMAP_DIO_PARTIAL (1 << 2)
/*
* Use software-based torn-write protection.
*/
#define IOMAP_DIO_ATOMIC_SW (1 << 3)
ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
unsigned int dio_flags, void *private, size_t done_before);
@@ -454,4 +528,6 @@ int iomap_swapfile_activate(struct swap_info_struct *sis,
# define iomap_swapfile_activate(sis, swapfile, pagespan, ops) (-EIO)
#endif /* CONFIG_SWAP */
extern struct bio_set iomap_ioend_bioset;
#endif /* LINUX_IOMAP_H */