Merge branch 'xfs-6.15-merge' into for-next

XFS code for 6.15 to be merged into linux-next

Signed-off-by: Carlos Maiolino <cem@kernel.org>
@@ -246,6 +246,10 @@ The fields are as follows:
 * **IOMAP_F_PRIVATE**: Starting with this value, the upper bits can
   be set by the filesystem for its own purposes.

+* **IOMAP_F_ANON_WRITE**: Indicates that (write) I/O does not have a target
+  block assigned to it yet and the file system will do that in the bio
+  submission handler, splitting the I/O as needed.
+
 These flags can be set by iomap itself during file operations.
 The filesystem should supply an ``->iomap_end`` function if it needs
 to observe these flags:
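As an aside (not part of this merge), an ``->iomap_end`` handler that observes such flags might look like the following sketch; the example_* names are hypothetical:

static int example_iomap_end(struct inode *inode, loff_t pos, loff_t length,
		ssize_t written, unsigned flags, struct iomap *iomap)
{
	/* IOMAP_F_SIZE_CHANGED is one of the flags iomap itself may set */
	if (iomap->flags & IOMAP_F_SIZE_CHANGED)
		example_sync_ondisk_size(inode);	/* hypothetical helper */
	return 0;
}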
@@ -352,6 +356,11 @@ operations:
 ``IOMAP_NOWAIT`` is often set on behalf of ``IOCB_NOWAIT`` or
 ``RWF_NOWAIT``.

+* ``IOMAP_DONTCACHE`` is set when the caller wishes to perform a
+  buffered file I/O and would like the kernel to drop the pagecache
+  after the I/O completes, if it isn't already being used by another
+  thread.
+
 If it is necessary to read existing file contents from a `different
 <https://lore.kernel.org/all/20191008071527.29304-9-hch@lst.de/>`_
 device or address range on a device, the filesystem should return that
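For orientation (not part of this diff), a userspace caller opts into this behaviour per-I/O; a minimal sketch, assuming a kernel and headers that expose RWF_DONTCACHE:

#define _GNU_SOURCE
#include <sys/uio.h>

static ssize_t write_dropbehind(int fd, const void *buf, size_t len, off_t off)
{
	struct iovec iov = { .iov_base = (void *)buf, .iov_len = len };

	/* buffered write; the kernel drops the pagecache once it completes */
	return pwritev2(fd, &iov, 1, off, RWF_DONTCACHE);
}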
@@ -131,6 +131,8 @@ These ``struct kiocb`` flags are significant for buffered I/O with iomap:

 * ``IOCB_NOWAIT``: Turns on ``IOMAP_NOWAIT``.

+* ``IOCB_DONTCACHE``: Turns on ``IOMAP_DONTCACHE``.
+
 Internal per-Folio State
 ------------------------
@@ -283,7 +285,7 @@ The ``ops`` structure must be specified and is as follows:

 struct iomap_writeback_ops {
     int (*map_blocks)(struct iomap_writepage_ctx *wpc, struct inode *inode,
                       loff_t offset, unsigned len);
-    int (*prepare_ioend)(struct iomap_ioend *ioend, int status);
+    int (*submit_ioend)(struct iomap_writepage_ctx *wpc, int status);
     void (*discard_folio)(struct folio *folio, loff_t pos);
 };
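A filesystem wires these up in its ops table; a minimal sketch (the example_* callbacks are hypothetical, not from this diff):

static const struct iomap_writeback_ops example_writeback_ops = {
	.map_blocks	= example_map_blocks,
	.submit_ioend	= example_submit_ioend,		/* optional */
	.discard_folio	= example_discard_folio,	/* optional */
};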
@@ -306,13 +308,12 @@ The fields are as follows:
   purpose.
   This function must be supplied by the filesystem.

-- ``prepare_ioend``: Enables filesystems to transform the writeback
-  ioend or perform any other preparatory work before the writeback I/O
-  is submitted.
+- ``submit_ioend``: Allows the file systems to hook into writeback bio
+  submission.
   This might include pre-write space accounting updates, or installing
   a custom ``->bi_end_io`` function for internal purposes, such as
   deferring the ioend completion to a workqueue to run metadata update
-  transactions from process context.
+  transactions from process context before submitting the bio.
   This function is optional.

 - ``discard_folio``: iomap calls this function after ``->map_blocks``
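A hedged sketch of such a ``->submit_ioend`` hook, per the description above: install a custom completion handler so metadata updates run from process context, then submit the bio itself (all example_* names are hypothetical):

static int example_submit_ioend(struct iomap_writepage_ctx *wpc, int status)
{
	struct iomap_ioend *ioend = wpc->ioend;

	if (status)
		return status;	/* iomap fails the bio with this error */
	/* defer ioend completion to a workqueue for transaction context */
	ioend->io_bio.bi_end_io = example_writeback_end_bio;
	submit_bio(&ioend->io_bio);
	return 0;
}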
@@ -341,7 +342,7 @@ This can happen in interrupt or process context, depending on the
 storage device.

 Filesystems that need to update internal bookkeeping (e.g. unwritten
-extent conversions) should provide a ``->prepare_ioend`` function to
+extent conversions) should provide a ``->submit_ioend`` function to
 set ``struct iomap_ioend::io_bio::bi_end_io`` to its own function.
 This function should call ``iomap_finish_ioends`` after finishing its
 own work (e.g. unwritten extent conversion).
@@ -513,8 +514,8 @@ IOMAP_WRITE`` with any combination of the following enhancements:
   if the mapping is unwritten and the filesystem cannot handle zeroing
   the unaligned regions without exposing stale contents.

-* ``IOMAP_ATOMIC``: This write is being issued with torn-write
-  protection.
+* ``IOMAP_ATOMIC_HW``: This write is being issued with torn-write
+  protection based on HW-offload support.
   Only a single bio can be created for the write, and the write must
   not be split into multiple I/O requests, i.e. flag REQ_ATOMIC must be
   set.
@@ -525,8 +526,20 @@ IOMAP_WRITE`` with any combination of the following enhancements:
   conversion or copy on write), all updates for the entire file range
   must be committed atomically as well.
   Only one space mapping is allowed per untorn write.
-  Untorn writes must be aligned to, and must not be longer than, a
-  single file block.
+  Untorn writes may be longer than a single file block. In all cases,
+  the mapping start disk block must have at least the same alignment as
+  the write offset.
+
+* ``IOMAP_ATOMIC_SW``: This write is being issued with torn-write
+  protection via a software mechanism provided by the filesystem.
+  All the disk block alignment and single bio restrictions which apply
+  to IOMAP_ATOMIC_HW do not apply here.
+  SW-based untorn writes would typically be used as a fallback when
+  HW-based untorn writes may not be issued, e.g. the range of the write
+  covers multiple extents, meaning that it is not possible to issue
+  a single bio.
+  All filesystem metadata updates for the entire file range must be
+  committed atomically as well.

 Callers commonly hold ``i_rwsem`` in shared or exclusive mode before
 calling this function.
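From the caller's side (not part of this diff), untorn writes are requested per-I/O; a sketch assuming a file whose filesystem and device advertise atomic write limits via statx():

#define _GNU_SOURCE
#include <sys/uio.h>

static ssize_t write_untorn(int fd, const void *buf, size_t len, off_t off)
{
	struct iovec iov = { .iov_base = (void *)buf, .iov_len = len };

	/* fails if len/off violate the advertised atomic write limits */
	return pwritev2(fd, &iov, 1, off, RWF_ATOMIC);
}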
fs/dax.c (111 changed lines)
@@ -1258,7 +1258,7 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
 }
 #endif /* CONFIG_FS_DAX_PMD */

-static s64 dax_unshare_iter(struct iomap_iter *iter)
+static int dax_unshare_iter(struct iomap_iter *iter)
 {
 	struct iomap *iomap = &iter->iomap;
 	const struct iomap *srcmap = iomap_iter_srcmap(iter);
@@ -1266,11 +1266,11 @@ static int dax_unshare_iter(struct iomap_iter *iter)
 	u64 copy_len = iomap_length(iter);
 	u32 mod;
 	int id = 0;
-	s64 ret = 0;
+	s64 ret;
 	void *daddr = NULL, *saddr = NULL;

 	if (!iomap_want_unshare_iter(iter))
-		return iomap_length(iter);
+		return iomap_iter_advance_full(iter);

 	/*
 	 * Extend the file range to be aligned to fsblock/pagesize, because
@@ -1300,14 +1300,14 @@ static int dax_unshare_iter(struct iomap_iter *iter)
 	if (ret < 0)
 		goto out_unlock;

-	if (copy_mc_to_kernel(daddr, saddr, copy_len) == 0)
-		ret = iomap_length(iter);
-	else
+	if (copy_mc_to_kernel(daddr, saddr, copy_len) != 0)
 		ret = -EIO;

 out_unlock:
 	dax_read_unlock(id);
-	return dax_mem2blk_err(ret);
+	if (ret < 0)
+		return dax_mem2blk_err(ret);
+	return iomap_iter_advance_full(iter);
 }

 int dax_file_unshare(struct inode *inode, loff_t pos, loff_t len,
@@ -1326,7 +1326,7 @@ int dax_file_unshare(struct inode *inode, loff_t pos, loff_t len,

 	iter.len = min(len, size - pos);
 	while ((ret = iomap_iter(&iter, ops)) > 0)
-		iter.processed = dax_unshare_iter(&iter);
+		iter.status = dax_unshare_iter(&iter);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(dax_file_unshare);
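The conversion visible here and throughout the rest of this merge follows one pattern: iteration helpers now advance the iterator themselves with iomap_iter_advance() and store an errno-style int in iter.status, instead of returning a byte count via iter.processed. A minimal sketch of the new contract (the example_* names are hypothetical):

static int example_iter(struct iomap_iter *iter)
{
	u64 length = iomap_length(iter);
	int ret;

	ret = example_process(iter, length);	/* hypothetical work */
	if (ret)
		return ret;
	/* consume what was processed; a partial advance leaves the
	 * remainder for the next pass of the iomap_iter() loop */
	return iomap_iter_advance(iter, &length);
}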
@@ -1354,17 +1354,16 @@ static int dax_memzero(struct iomap_iter *iter, loff_t pos, size_t size)
 	return ret;
 }

-static s64 dax_zero_iter(struct iomap_iter *iter, bool *did_zero)
+static int dax_zero_iter(struct iomap_iter *iter, bool *did_zero)
 {
 	const struct iomap *iomap = &iter->iomap;
 	const struct iomap *srcmap = iomap_iter_srcmap(iter);
-	loff_t pos = iter->pos;
 	u64 length = iomap_length(iter);
-	s64 written = 0;
+	int ret;

 	/* already zeroed? we're done. */
 	if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN)
-		return length;
+		return iomap_iter_advance(iter, &length);

 	/*
 	 * invalidate the pages whose sharing state is to be changed
@@ -1372,33 +1371,35 @@ static int dax_zero_iter(struct iomap_iter *iter, bool *did_zero)
 	 */
 	if (iomap->flags & IOMAP_F_SHARED)
 		invalidate_inode_pages2_range(iter->inode->i_mapping,
-				pos >> PAGE_SHIFT,
-				(pos + length - 1) >> PAGE_SHIFT);
+				iter->pos >> PAGE_SHIFT,
+				(iter->pos + length - 1) >> PAGE_SHIFT);

 	do {
+		loff_t pos = iter->pos;
 		unsigned offset = offset_in_page(pos);
-		unsigned size = min_t(u64, PAGE_SIZE - offset, length);
 		pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
-		long rc;
 		int id;

+		length = min_t(u64, PAGE_SIZE - offset, length);
+
 		id = dax_read_lock();
-		if (IS_ALIGNED(pos, PAGE_SIZE) && size == PAGE_SIZE)
-			rc = dax_zero_page_range(iomap->dax_dev, pgoff, 1);
+		if (IS_ALIGNED(pos, PAGE_SIZE) && length == PAGE_SIZE)
+			ret = dax_zero_page_range(iomap->dax_dev, pgoff, 1);
 		else
-			rc = dax_memzero(iter, pos, size);
+			ret = dax_memzero(iter, pos, length);
 		dax_read_unlock(id);

-		if (rc < 0)
-			return rc;
-		pos += size;
-		length -= size;
-		written += size;
+		if (ret < 0)
+			return ret;
+
+		ret = iomap_iter_advance(iter, &length);
+		if (ret)
+			return ret;
 	} while (length > 0);

 	if (did_zero)
 		*did_zero = true;
-	return written;
+	return ret;
 }

 int dax_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
@@ -1413,7 +1414,7 @@ int dax_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
 	int ret;

 	while ((ret = iomap_iter(&iter, ops)) > 0)
-		iter.processed = dax_zero_iter(&iter, did_zero);
+		iter.status = dax_zero_iter(&iter, did_zero);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(dax_zero_range);
@@ -1431,8 +1432,7 @@ int dax_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
 }
 EXPORT_SYMBOL_GPL(dax_truncate_page);

-static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
-		struct iov_iter *iter)
+static int dax_iomap_iter(struct iomap_iter *iomi, struct iov_iter *iter)
 {
 	const struct iomap *iomap = &iomi->iomap;
 	const struct iomap *srcmap = iomap_iter_srcmap(iomi);
@@ -1451,8 +1451,10 @@ static int dax_iomap_iter(struct iomap_iter *iomi, struct iov_iter *iter)
 		if (pos >= end)
 			return 0;

-		if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN)
-			return iov_iter_zero(min(length, end - pos), iter);
+		if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN) {
+			done = iov_iter_zero(min(length, end - pos), iter);
+			return iomap_iter_advance(iomi, &done);
+		}
 	}

 	/*
@@ -1485,7 +1487,7 @@ static int dax_iomap_iter(struct iomap_iter *iomi, struct iov_iter *iter)
 	}

 	id = dax_read_lock();
-	while (pos < end) {
+	while ((pos = iomi->pos) < end) {
 		unsigned offset = pos & (PAGE_SIZE - 1);
 		const size_t size = ALIGN(length + offset, PAGE_SIZE);
 		pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
@@ -1535,18 +1537,16 @@ static int dax_iomap_iter(struct iomap_iter *iomi, struct iov_iter *iter)
 			xfer = dax_copy_to_iter(dax_dev, pgoff, kaddr,
 					map_len, iter);

-		pos += xfer;
-		length -= xfer;
-		done += xfer;
-
-		if (xfer == 0)
+		length = xfer;
+		ret = iomap_iter_advance(iomi, &length);
+		if (!ret && xfer == 0)
 			ret = -EFAULT;
 		if (xfer < map_len)
 			break;
 	}
 	dax_read_unlock(id);

-	return done ? done : ret;
+	return ret;
 }

 /**
@@ -1586,7 +1586,7 @@ dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
 		iomi.flags |= IOMAP_NOWAIT;

 	while ((ret = iomap_iter(&iomi, ops)) > 0)
-		iomi.processed = dax_iomap_iter(&iomi, iter);
+		iomi.status = dax_iomap_iter(&iomi, iter);

 	done = iomi.pos - iocb->ki_pos;
 	iocb->ki_pos = iomi.pos;
@@ -1757,7 +1757,7 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,

 	while ((error = iomap_iter(&iter, ops)) > 0) {
 		if (WARN_ON_ONCE(iomap_length(&iter) < PAGE_SIZE)) {
-			iter.processed = -EIO;	/* fs corruption? */
+			iter.status = -EIO;	/* fs corruption? */
 			continue;
 		}

@@ -1769,8 +1769,10 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
 			ret |= VM_FAULT_MAJOR;
 		}

-		if (!(ret & VM_FAULT_ERROR))
-			iter.processed = PAGE_SIZE;
+		if (!(ret & VM_FAULT_ERROR)) {
+			u64 length = PAGE_SIZE;
+			iter.status = iomap_iter_advance(&iter, &length);
+		}
 	}

 	if (iomap_errp)
@@ -1883,8 +1885,10 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
 			continue; /* actually breaks out of the loop */

 		ret = dax_fault_iter(vmf, &iter, pfnp, &xas, &entry, true);
-		if (ret != VM_FAULT_FALLBACK)
-			iter.processed = PMD_SIZE;
+		if (ret != VM_FAULT_FALLBACK) {
+			u64 length = PMD_SIZE;
+			iter.status = iomap_iter_advance(&iter, &length);
+		}
 	}

 unlock_entry:
@@ -1999,12 +2003,13 @@ vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf, unsigned int order,
 }
 EXPORT_SYMBOL_GPL(dax_finish_sync_fault);

-static loff_t dax_range_compare_iter(struct iomap_iter *it_src,
+static int dax_range_compare_iter(struct iomap_iter *it_src,
 		struct iomap_iter *it_dest, u64 len, bool *same)
 {
 	const struct iomap *smap = &it_src->iomap;
 	const struct iomap *dmap = &it_dest->iomap;
 	loff_t pos1 = it_src->pos, pos2 = it_dest->pos;
+	u64 dest_len;
 	void *saddr, *daddr;
 	int id, ret;

@@ -2012,7 +2017,7 @@ static int dax_range_compare_iter(struct iomap_iter *it_src,

 	if (smap->type == IOMAP_HOLE && dmap->type == IOMAP_HOLE) {
 		*same = true;
-		return len;
+		goto advance;
 	}

 	if (smap->type == IOMAP_HOLE || dmap->type == IOMAP_HOLE) {
@@ -2035,7 +2040,13 @@ static int dax_range_compare_iter(struct iomap_iter *it_src,
 	if (!*same)
 		len = 0;
 	dax_read_unlock(id);
-	return len;
+
+advance:
+	dest_len = len;
+	ret = iomap_iter_advance(it_src, &len);
+	if (!ret)
+		ret = iomap_iter_advance(it_dest, &dest_len);
+	return ret;

 out_unlock:
 	dax_read_unlock(id);
@@ -2058,15 +2069,15 @@ int dax_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
 		.len = len,
 		.flags = IOMAP_DAX,
 	};
-	int ret, compared = 0;
+	int ret, status;

 	while ((ret = iomap_iter(&src_iter, ops)) > 0 &&
 	       (ret = iomap_iter(&dst_iter, ops)) > 0) {
-		compared = dax_range_compare_iter(&src_iter, &dst_iter,
+		status = dax_range_compare_iter(&src_iter, &dst_iter,
 				min(src_iter.len, dst_iter.len), same);
-		if (compared < 0)
+		if (status < 0)
 			return ret;
-		src_iter.processed = dst_iter.processed = compared;
+		src_iter.status = dst_iter.status = status;
 	}
 	return ret;
 }
@@ -3467,7 +3467,7 @@ static inline bool ext4_want_directio_fallback(unsigned flags, ssize_t written)
 		return false;

 	/* atomic writes are all-or-nothing */
-	if (flags & IOMAP_ATOMIC)
+	if (flags & IOMAP_ATOMIC_HW)
 		return false;

 	/* can only try again if we wrote nothing */
@@ -1300,7 +1300,8 @@ static int gfs2_block_zero_range(struct inode *inode, loff_t from,
 		unsigned int length)
 {
 	BUG_ON(current->journal_info);
-	return iomap_zero_range(inode, from, length, NULL, &gfs2_iomap_ops);
+	return iomap_zero_range(inode, from, length, NULL, &gfs2_iomap_ops,
+			NULL);
 }

 #define GFS2_JTRUNC_REVOKES 8192
@@ -12,6 +12,7 @@ iomap-y += trace.o \
 	   iter.o
 iomap-$(CONFIG_BLOCK)	+= buffered-io.o \
 			   direct-io.o \
+			   ioend.o \
 			   fiemap.o \
 			   seek.o
 iomap-$(CONFIG_SWAP)	+= swapfile.o
@@ -12,17 +12,15 @@
 #include <linux/buffer_head.h>
 #include <linux/dax.h>
 #include <linux/writeback.h>
-#include <linux/list_sort.h>
 #include <linux/swap.h>
 #include <linux/bio.h>
 #include <linux/sched/signal.h>
 #include <linux/migrate.h>
+#include "internal.h"
 #include "trace.h"

 #include "../internal.h"

-#define IOEND_BATCH_SIZE	4096
-
 /*
  * Structure allocated for each folio to track per-block uptodate, dirty state
  * and I/O completions.
@@ -40,8 +38,6 @@ struct iomap_folio_state {
 	unsigned long	state[];
 };

-static struct bio_set iomap_ioend_bioset;
-
 static inline bool ifs_is_fully_uptodate(struct folio *folio,
 		struct iomap_folio_state *ifs)
 {
@@ -366,15 +362,14 @@ static inline bool iomap_block_needs_zeroing(const struct iomap_iter *iter,
 	       pos >= i_size_read(iter->inode);
 }

-static loff_t iomap_readpage_iter(const struct iomap_iter *iter,
-		struct iomap_readpage_ctx *ctx, loff_t offset)
+static int iomap_readpage_iter(struct iomap_iter *iter,
+		struct iomap_readpage_ctx *ctx)
 {
 	const struct iomap *iomap = &iter->iomap;
-	loff_t pos = iter->pos + offset;
-	loff_t length = iomap_length(iter) - offset;
+	loff_t pos = iter->pos;
+	loff_t length = iomap_length(iter);
 	struct folio *folio = ctx->cur_folio;
 	struct iomap_folio_state *ifs;
-	loff_t orig_pos = pos;
 	size_t poff, plen;
 	sector_t sector;
@@ -438,25 +433,22 @@ static int iomap_readpage_iter(struct iomap_iter *iter,
 	 * we can skip trailing ones as they will be handled in the next
 	 * iteration.
 	 */
-	return pos - orig_pos + plen;
+	length = pos - iter->pos + plen;
+	return iomap_iter_advance(iter, &length);
 }

-static loff_t iomap_read_folio_iter(const struct iomap_iter *iter,
+static int iomap_read_folio_iter(struct iomap_iter *iter,
 		struct iomap_readpage_ctx *ctx)
 {
-	struct folio *folio = ctx->cur_folio;
-	size_t offset = offset_in_folio(folio, iter->pos);
-	loff_t length = min_t(loff_t, folio_size(folio) - offset,
-			      iomap_length(iter));
-	loff_t done, ret;
+	int ret;

-	for (done = 0; done < length; done += ret) {
-		ret = iomap_readpage_iter(iter, ctx, done);
-		if (ret <= 0)
+	while (iomap_length(iter)) {
+		ret = iomap_readpage_iter(iter, ctx);
+		if (ret)
 			return ret;
 	}

-	return done;
+	return 0;
 }

 int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops)
@@ -474,7 +466,7 @@ int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops)
 	trace_iomap_readpage(iter.inode, 1);

 	while ((ret = iomap_iter(&iter, ops)) > 0)
-		iter.processed = iomap_read_folio_iter(&iter, &ctx);
+		iter.status = iomap_read_folio_iter(&iter, &ctx);

 	if (ctx.bio) {
 		submit_bio(ctx.bio);
@@ -493,15 +485,14 @@ int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops)
 }
 EXPORT_SYMBOL_GPL(iomap_read_folio);

-static loff_t iomap_readahead_iter(const struct iomap_iter *iter,
+static int iomap_readahead_iter(struct iomap_iter *iter,
 		struct iomap_readpage_ctx *ctx)
 {
-	loff_t length = iomap_length(iter);
-	loff_t done, ret;
+	int ret;

-	for (done = 0; done < length; done += ret) {
+	while (iomap_length(iter)) {
 		if (ctx->cur_folio &&
-		    offset_in_folio(ctx->cur_folio, iter->pos + done) == 0) {
+		    offset_in_folio(ctx->cur_folio, iter->pos) == 0) {
 			if (!ctx->cur_folio_in_bio)
 				folio_unlock(ctx->cur_folio);
 			ctx->cur_folio = NULL;
@@ -510,12 +501,12 @@ static int iomap_readahead_iter(struct iomap_iter *iter,
 			ctx->cur_folio = readahead_folio(ctx->rac);
 			ctx->cur_folio_in_bio = false;
 		}
-		ret = iomap_readpage_iter(iter, ctx, done);
-		if (ret <= 0)
+		ret = iomap_readpage_iter(iter, ctx);
+		if (ret)
 			return ret;
 	}

-	return done;
+	return 0;
 }

 /**
@@ -547,7 +538,7 @@ void iomap_readahead(struct readahead_control *rac, const struct iomap_ops *ops)
 	trace_iomap_readahead(rac->mapping->host, readahead_count(rac));

 	while (iomap_iter(&iter, ops) > 0)
-		iter.processed = iomap_readahead_iter(&iter, &ctx);
+		iter.status = iomap_readahead_iter(&iter, &ctx);

 	if (ctx.bio)
 		submit_bio(ctx.bio);
@@ -603,6 +594,8 @@ struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos, size_t len)

 	if (iter->flags & IOMAP_NOWAIT)
 		fgp |= FGP_NOWAIT;
+	if (iter->flags & IOMAP_DONTCACHE)
+		fgp |= FGP_DONTCACHE;
 	fgp |= fgf_set_order(len);

 	return __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT,
@@ -907,12 +900,10 @@ static bool iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len,
 	return __iomap_write_end(iter->inode, pos, len, copied, folio);
 }

-static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
+static int iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
 {
-	loff_t length = iomap_length(iter);
-	loff_t pos = iter->pos;
 	ssize_t total_written = 0;
-	long status = 0;
+	int status = 0;
 	struct address_space *mapping = iter->inode->i_mapping;
 	size_t chunk = mapping_max_folio_size(mapping);
 	unsigned int bdp_flags = (iter->flags & IOMAP_NOWAIT) ? BDP_ASYNC : 0;
@@ -923,7 +914,8 @@ static int iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
 		size_t offset;		/* Offset into folio */
 		size_t bytes;		/* Bytes to write to folio */
 		size_t copied;		/* Bytes copied from user */
-		size_t written;		/* Bytes have been written */
+		u64 written;		/* Bytes have been written */
+		loff_t pos = iter->pos;

 		bytes = iov_iter_count(i);
 retry:
@@ -934,8 +926,8 @@ static int iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
 		if (unlikely(status))
 			break;

-		if (bytes > length)
-			bytes = length;
+		if (bytes > iomap_length(iter))
+			bytes = iomap_length(iter);

 		/*
 		 * Bring in the user page that we'll copy from _first_.
@@ -1006,17 +998,12 @@ static int iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
 				goto retry;
 			}
 		} else {
-			pos += written;
 			total_written += written;
-			length -= written;
+			iomap_iter_advance(iter, &written);
 		}
-	} while (iov_iter_count(i) && length);
+	} while (iov_iter_count(i) && iomap_length(iter));

 	if (status == -EAGAIN) {
 		iov_iter_revert(i, total_written);
 		return -EAGAIN;
 	}
-	return total_written ? total_written : status;
+	return total_written ? 0 : status;
 }

 ssize_t
@@ -1034,9 +1021,11 @@ iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i,

 	if (iocb->ki_flags & IOCB_NOWAIT)
 		iter.flags |= IOMAP_NOWAIT;
+	if (iocb->ki_flags & IOCB_DONTCACHE)
+		iter.flags |= IOMAP_DONTCACHE;

 	while ((ret = iomap_iter(&iter, ops)) > 0)
-		iter.processed = iomap_write_iter(&iter, i);
+		iter.status = iomap_write_iter(&iter, i);

 	if (unlikely(iter.pos == iocb->ki_pos))
 		return ret;
@@ -1270,23 +1259,22 @@ void iomap_write_delalloc_release(struct inode *inode, loff_t start_byte,
 }
 EXPORT_SYMBOL_GPL(iomap_write_delalloc_release);

-static loff_t iomap_unshare_iter(struct iomap_iter *iter)
+static int iomap_unshare_iter(struct iomap_iter *iter)
 {
 	struct iomap *iomap = &iter->iomap;
-	loff_t pos = iter->pos;
-	loff_t length = iomap_length(iter);
-	loff_t written = 0;
+	u64 bytes = iomap_length(iter);
+	int status;

 	if (!iomap_want_unshare_iter(iter))
-		return length;
+		return iomap_iter_advance(iter, &bytes);

 	do {
 		struct folio *folio;
-		int status;
 		size_t offset;
-		size_t bytes = min_t(u64, SIZE_MAX, length);
+		loff_t pos = iter->pos;
 		bool ret;

+		bytes = min_t(u64, SIZE_MAX, bytes);
 		status = iomap_write_begin(iter, pos, bytes, &folio);
 		if (unlikely(status))
 			return status;
@@ -1304,14 +1292,14 @@ static int iomap_unshare_iter(struct iomap_iter *iter)

 		cond_resched();

-		pos += bytes;
-		written += bytes;
-		length -= bytes;
-
 		balance_dirty_pages_ratelimited(iter->inode->i_mapping);
-	} while (length > 0);

-	return written;
+		status = iomap_iter_advance(iter, &bytes);
+		if (status)
+			break;
+	} while (bytes > 0);
+
+	return status;
 }

 int
@@ -1331,7 +1319,7 @@ iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len,

 	iter.len = min(len, size - pos);
 	while ((ret = iomap_iter(&iter, ops)) > 0)
-		iter.processed = iomap_unshare_iter(&iter);
+		iter.status = iomap_unshare_iter(&iter);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(iomap_file_unshare);
@@ -1350,19 +1338,18 @@ static inline int iomap_zero_iter_flush_and_stale(struct iomap_iter *i)
 	return filemap_write_and_wait_range(mapping, i->pos, end);
 }

-static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero)
+static int iomap_zero_iter(struct iomap_iter *iter, bool *did_zero)
 {
-	loff_t pos = iter->pos;
-	loff_t length = iomap_length(iter);
-	loff_t written = 0;
+	u64 bytes = iomap_length(iter);
+	int status;

 	do {
 		struct folio *folio;
-		int status;
 		size_t offset;
-		size_t bytes = min_t(u64, SIZE_MAX, length);
+		loff_t pos = iter->pos;
 		bool ret;

+		bytes = min_t(u64, SIZE_MAX, bytes);
 		status = iomap_write_begin(iter, pos, bytes, &folio);
 		if (status)
 			return status;
@@ -1383,25 +1370,26 @@ static int iomap_zero_iter(struct iomap_iter *iter, bool *did_zero)
 		if (WARN_ON_ONCE(!ret))
 			return -EIO;

-		pos += bytes;
-		length -= bytes;
-		written += bytes;
-	} while (length > 0);
+		status = iomap_iter_advance(iter, &bytes);
+		if (status)
+			break;
+	} while (bytes > 0);

 	if (did_zero)
 		*did_zero = true;
-	return written;
+	return status;
 }

 int
 iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
-		const struct iomap_ops *ops)
+		const struct iomap_ops *ops, void *private)
 {
 	struct iomap_iter iter = {
 		.inode		= inode,
 		.pos		= pos,
 		.len		= len,
 		.flags		= IOMAP_ZERO,
+		.private	= private,
 	};
 	struct address_space *mapping = inode->i_mapping;
 	unsigned int blocksize = i_blocksize(inode);
@@ -1424,7 +1412,7 @@ iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
 	    filemap_range_needs_writeback(mapping, pos, pos + plen - 1)) {
 		iter.len = plen;
 		while ((ret = iomap_iter(&iter, ops)) > 0)
-			iter.processed = iomap_zero_iter(&iter, did_zero);
+			iter.status = iomap_zero_iter(&iter, did_zero);

 		iter.len = len - (iter.pos - pos);
 		if (ret || !iter.len)
@@ -1443,17 +1431,19 @@ iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,

 		if (srcmap->type == IOMAP_HOLE ||
 		    srcmap->type == IOMAP_UNWRITTEN) {
-			loff_t proc = iomap_length(&iter);
+			s64 status;

 			if (range_dirty) {
 				range_dirty = false;
-				proc = iomap_zero_iter_flush_and_stale(&iter);
+				status = iomap_zero_iter_flush_and_stale(&iter);
+			} else {
+				status = iomap_iter_advance_full(&iter);
 			}
-			iter.processed = proc;
+			iter.status = status;
 			continue;
 		}

-		iter.processed = iomap_zero_iter(&iter, did_zero);
+		iter.status = iomap_zero_iter(&iter, did_zero);
 	}
 	return ret;
 }
@@ -1461,7 +1451,7 @@ EXPORT_SYMBOL_GPL(iomap_zero_range);

 int
 iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
-		const struct iomap_ops *ops)
+		const struct iomap_ops *ops, void *private)
 {
 	unsigned int blocksize = i_blocksize(inode);
 	unsigned int off = pos & (blocksize - 1);
@@ -1469,11 +1459,12 @@ iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
 	/* Block boundary? Nothing to do */
 	if (!off)
 		return 0;
-	return iomap_zero_range(inode, pos, blocksize - off, did_zero, ops);
+	return iomap_zero_range(inode, pos, blocksize - off, did_zero, ops,
+			private);
 }
 EXPORT_SYMBOL_GPL(iomap_truncate_page);
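The new ``private`` argument seen here is stashed in iter.private so per-call state can reach the folio hooks, which receive the iter; a hedged sketch only, with all example_* names hypothetical:

static struct folio *example_get_folio(struct iomap_iter *iter, loff_t pos,
		unsigned len)
{
	struct example_zero_ctx *ctx = iter->private;	/* from the caller */

	if (ctx)
		example_note_zeroed_range(ctx, pos, len);	/* hypothetical */
	return iomap_get_folio(iter, pos, len);
}

/* caller side: iomap_zero_range(inode, pos, len, NULL, &example_ops, &ctx); */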
-static loff_t iomap_folio_mkwrite_iter(struct iomap_iter *iter,
+static int iomap_folio_mkwrite_iter(struct iomap_iter *iter,
 		struct folio *folio)
 {
 	loff_t length = iomap_length(iter);
@@ -1490,14 +1481,16 @@ static int iomap_folio_mkwrite_iter(struct iomap_iter *iter,
 		folio_mark_dirty(folio);
 	}

-	return length;
+	return iomap_iter_advance(iter, &length);
 }

-vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops)
+vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops,
+		void *private)
 {
 	struct iomap_iter iter = {
 		.inode		= file_inode(vmf->vma->vm_file),
 		.flags		= IOMAP_WRITE | IOMAP_FAULT,
+		.private	= private,
 	};
 	struct folio *folio = page_folio(vmf->page);
 	ssize_t ret;
@@ -1509,7 +1502,7 @@ vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops,
 	iter.pos = folio_pos(folio);
 	iter.len = ret;
 	while ((ret = iomap_iter(&iter, ops)) > 0)
-		iter.processed = iomap_folio_mkwrite_iter(&iter, folio);
+		iter.status = iomap_folio_mkwrite_iter(&iter, folio);

 	if (ret < 0)
 		goto out_unlock;
@@ -1538,16 +1531,15 @@ static void iomap_finish_folio_write(struct inode *inode, struct folio *folio,
 * state, release holds on bios, and finally free up memory.  Do not use the
 * ioend after this.
 */
-static u32
-iomap_finish_ioend(struct iomap_ioend *ioend, int error)
+u32 iomap_finish_ioend_buffered(struct iomap_ioend *ioend)
 {
 	struct inode *inode = ioend->io_inode;
 	struct bio *bio = &ioend->io_bio;
 	struct folio_iter fi;
 	u32 folio_count = 0;

-	if (error) {
-		mapping_set_error(inode->i_mapping, error);
+	if (ioend->io_error) {
+		mapping_set_error(inode->i_mapping, ioend->io_error);
 		if (!bio_flagged(bio, BIO_QUIET)) {
 			pr_err_ratelimited(
 "%s: writeback error on inode %lu, offset %lld, sector %llu",
@@ -1566,116 +1558,16 @@ u32 iomap_finish_ioend_buffered(struct iomap_ioend *ioend)
 	return folio_count;
 }

-/*
- * Ioend completion routine for merged bios. This can only be called from task
- * contexts as merged ioends can be of unbound length. Hence we have to break up
- * the writeback completions into manageable chunks to avoid long scheduler
- * holdoffs. We aim to keep scheduler holdoffs down below 10ms so that we get
- * good batch processing throughput without creating adverse scheduler latency
- * conditions.
- */
-void
-iomap_finish_ioends(struct iomap_ioend *ioend, int error)
-{
-	struct list_head tmp;
-	u32 completions;
-
-	might_sleep();
-
-	list_replace_init(&ioend->io_list, &tmp);
-	completions = iomap_finish_ioend(ioend, error);
-
-	while (!list_empty(&tmp)) {
-		if (completions > IOEND_BATCH_SIZE * 8) {
-			cond_resched();
-			completions = 0;
-		}
-		ioend = list_first_entry(&tmp, struct iomap_ioend, io_list);
-		list_del_init(&ioend->io_list);
-		completions += iomap_finish_ioend(ioend, error);
-	}
-}
-EXPORT_SYMBOL_GPL(iomap_finish_ioends);
-
-/*
- * We can merge two adjacent ioends if they have the same set of work to do.
- */
-static bool
-iomap_ioend_can_merge(struct iomap_ioend *ioend, struct iomap_ioend *next)
-{
-	if (ioend->io_bio.bi_status != next->io_bio.bi_status)
-		return false;
-	if (next->io_flags & IOMAP_F_BOUNDARY)
-		return false;
-	if ((ioend->io_flags & IOMAP_F_SHARED) ^
-	    (next->io_flags & IOMAP_F_SHARED))
-		return false;
-	if ((ioend->io_type == IOMAP_UNWRITTEN) ^
-	    (next->io_type == IOMAP_UNWRITTEN))
-		return false;
-	if (ioend->io_offset + ioend->io_size != next->io_offset)
-		return false;
-	/*
-	 * Do not merge physically discontiguous ioends. The filesystem
-	 * completion functions will have to iterate the physical
-	 * discontiguities even if we merge the ioends at a logical level, so
-	 * we don't gain anything by merging physical discontiguities here.
-	 *
-	 * We cannot use bio->bi_iter.bi_sector here as it is modified during
-	 * submission so does not point to the start sector of the bio at
-	 * completion.
-	 */
-	if (ioend->io_sector + (ioend->io_size >> 9) != next->io_sector)
-		return false;
-	return true;
-}
-
-void
-iomap_ioend_try_merge(struct iomap_ioend *ioend, struct list_head *more_ioends)
-{
-	struct iomap_ioend *next;
-
-	INIT_LIST_HEAD(&ioend->io_list);
-
-	while ((next = list_first_entry_or_null(more_ioends, struct iomap_ioend,
-			io_list))) {
-		if (!iomap_ioend_can_merge(ioend, next))
-			break;
-		list_move_tail(&next->io_list, &ioend->io_list);
-		ioend->io_size += next->io_size;
-	}
-}
-EXPORT_SYMBOL_GPL(iomap_ioend_try_merge);
-
-static int
-iomap_ioend_compare(void *priv, const struct list_head *a,
-		const struct list_head *b)
-{
-	struct iomap_ioend *ia = container_of(a, struct iomap_ioend, io_list);
-	struct iomap_ioend *ib = container_of(b, struct iomap_ioend, io_list);
-
-	if (ia->io_offset < ib->io_offset)
-		return -1;
-	if (ia->io_offset > ib->io_offset)
-		return 1;
-	return 0;
-}
-
-void
-iomap_sort_ioends(struct list_head *ioend_list)
-{
-	list_sort(NULL, ioend_list, iomap_ioend_compare);
-}
-EXPORT_SYMBOL_GPL(iomap_sort_ioends);
-
 static void iomap_writepage_end_bio(struct bio *bio)
 {
-	iomap_finish_ioend(iomap_ioend_from_bio(bio),
-			blk_status_to_errno(bio->bi_status));
+	struct iomap_ioend *ioend = iomap_ioend_from_bio(bio);
+
+	ioend->io_error = blk_status_to_errno(bio->bi_status);
+	iomap_finish_ioend_buffered(ioend);
 }
 /*
- * Submit the final bio for an ioend.
+ * Submit an ioend.
 *
 * If @error is non-zero, it means that we have a situation where some part of
 * the submission process has failed after we've marked pages for writeback.
@@ -1694,14 +1586,18 @@ static int iomap_submit_ioend(struct iomap_writepage_ctx *wpc, int error)
 	 * failure happened so that the file system end I/O handler gets called
 	 * to clean up.
 	 */
-	if (wpc->ops->prepare_ioend)
-		error = wpc->ops->prepare_ioend(wpc->ioend, error);
+	if (wpc->ops->submit_ioend) {
+		error = wpc->ops->submit_ioend(wpc, error);
+	} else {
+		if (WARN_ON_ONCE(wpc->iomap.flags & IOMAP_F_ANON_WRITE))
+			error = -EIO;
+		if (!error)
+			submit_bio(&wpc->ioend->io_bio);
+	}
+
 	if (error) {
 		wpc->ioend->io_bio.bi_status = errno_to_blk_status(error);
 		bio_endio(&wpc->ioend->io_bio);
-	} else {
-		submit_bio(&wpc->ioend->io_bio);
 	}

 	wpc->ioend = NULL;
@@ -1709,9 +1605,9 @@ static int iomap_submit_ioend(struct iomap_writepage_ctx *wpc, int error)
 }

 static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc,
-		struct writeback_control *wbc, struct inode *inode, loff_t pos)
+		struct writeback_control *wbc, struct inode *inode, loff_t pos,
+		u16 ioend_flags)
 {
-	struct iomap_ioend *ioend;
 	struct bio *bio;

 	bio = bio_alloc_bioset(wpc->iomap.bdev, BIO_MAX_VECS,
@@ -1719,36 +1615,24 @@ static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc,
 			       GFP_NOFS, &iomap_ioend_bioset);
 	bio->bi_iter.bi_sector = iomap_sector(&wpc->iomap, pos);
 	bio->bi_end_io = iomap_writepage_end_bio;
-	wbc_init_bio(wbc, bio);
 	bio->bi_write_hint = inode->i_write_hint;

-	ioend = iomap_ioend_from_bio(bio);
-	INIT_LIST_HEAD(&ioend->io_list);
-	ioend->io_type = wpc->iomap.type;
-	ioend->io_flags = wpc->iomap.flags;
 	if (pos > wpc->iomap.offset)
 		wpc->iomap.flags &= ~IOMAP_F_BOUNDARY;
-	ioend->io_inode = inode;
-	ioend->io_size = 0;
-	ioend->io_offset = pos;
-	ioend->io_sector = bio->bi_iter.bi_sector;

+	wbc_init_bio(wbc, bio);
 	wpc->nr_folios = 0;
-	return ioend;
+	return iomap_init_ioend(inode, bio, pos, ioend_flags);
 }
-static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos)
+static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos,
+		u16 ioend_flags)
 {
-	if (wpc->iomap.offset == pos && (wpc->iomap.flags & IOMAP_F_BOUNDARY))
+	if (ioend_flags & IOMAP_IOEND_BOUNDARY)
 		return false;
-	if ((wpc->iomap.flags & IOMAP_F_SHARED) !=
-	    (wpc->ioend->io_flags & IOMAP_F_SHARED))
-		return false;
-	if (wpc->iomap.type != wpc->ioend->io_type)
+	if ((ioend_flags & IOMAP_IOEND_NOMERGE_FLAGS) !=
+	    (wpc->ioend->io_flags & IOMAP_IOEND_NOMERGE_FLAGS))
 		return false;
 	if (pos != wpc->ioend->io_offset + wpc->ioend->io_size)
 		return false;
-	if (iomap_sector(&wpc->iomap, pos) !=
+	if (!(wpc->iomap.flags & IOMAP_F_ANON_WRITE) &&
+	    iomap_sector(&wpc->iomap, pos) !=
 	    bio_end_sector(&wpc->ioend->io_bio))
 		return false;
 	/*
@@ -1779,14 +1663,23 @@ static int iomap_add_to_ioend(struct iomap_writepage_ctx *wpc,
 {
 	struct iomap_folio_state *ifs = folio->private;
 	size_t poff = offset_in_folio(folio, pos);
+	unsigned int ioend_flags = 0;
 	int error;

-	if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, pos)) {
+	if (wpc->iomap.type == IOMAP_UNWRITTEN)
+		ioend_flags |= IOMAP_IOEND_UNWRITTEN;
+	if (wpc->iomap.flags & IOMAP_F_SHARED)
+		ioend_flags |= IOMAP_IOEND_SHARED;
+	if (pos == wpc->iomap.offset && (wpc->iomap.flags & IOMAP_F_BOUNDARY))
+		ioend_flags |= IOMAP_IOEND_BOUNDARY;
+
+	if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, pos, ioend_flags)) {
 new_ioend:
 		error = iomap_submit_ioend(wpc, 0);
 		if (error)
 			return error;
-		wpc->ioend = iomap_alloc_ioend(wpc, wbc, inode, pos);
+		wpc->ioend = iomap_alloc_ioend(wpc, wbc, inode, pos,
+				ioend_flags);
 	}

 	if (!bio_add_folio(&wpc->ioend->io_bio, folio, len, poff))
@@ -2062,11 +1955,3 @@ iomap_writepages(struct address_space *mapping, struct writeback_control *wbc,
 	return iomap_submit_ioend(wpc, error);
 }
 EXPORT_SYMBOL_GPL(iomap_writepages);
-
-static int __init iomap_buffered_init(void)
-{
-	return bioset_init(&iomap_ioend_bioset, 4 * (PAGE_SIZE / SECTOR_SIZE),
-			   offsetof(struct iomap_ioend, io_bio),
-			   BIOSET_NEED_BVECS);
-}
-fs_initcall(iomap_buffered_init);
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 /*
 * Copyright (C) 2010 Red Hat, Inc.
- * Copyright (c) 2016-2021 Christoph Hellwig.
+ * Copyright (c) 2016-2025 Christoph Hellwig.
 */
 #include <linux/module.h>
 #include <linux/compiler.h>
@@ -12,6 +12,7 @@
 #include <linux/backing-dev.h>
 #include <linux/uio.h>
 #include <linux/task_io_accounting_ops.h>
+#include "internal.h"
 #include "trace.h"

 #include "../internal.h"
@@ -20,6 +21,7 @@
 * Private flags for iomap_dio, must not overlap with the public ones in
 * iomap.h:
 */
+#define IOMAP_DIO_NO_INVALIDATE	(1U << 25)
 #define IOMAP_DIO_CALLER_COMP	(1U << 26)
 #define IOMAP_DIO_INLINE_COMP	(1U << 27)
 #define IOMAP_DIO_WRITE_THROUGH	(1U << 28)
@@ -81,10 +83,12 @@ static void iomap_dio_submit_bio(const struct iomap_iter *iter,
 		WRITE_ONCE(iocb->private, bio);
 	}

-	if (dio->dops && dio->dops->submit_io)
+	if (dio->dops && dio->dops->submit_io) {
 		dio->dops->submit_io(iter, bio, pos);
-	else
+	} else {
+		WARN_ON_ONCE(iter->iomap.flags & IOMAP_F_ANON_WRITE);
 		submit_bio(bio);
+	}
 }

 ssize_t iomap_dio_complete(struct iomap_dio *dio)
@@ -117,7 +121,8 @@ ssize_t iomap_dio_complete(struct iomap_dio *dio)
 	 * ->end_io() when necessary, otherwise a racing buffer read would cache
 	 * zeros from unwritten extents.
 	 */
-	if (!dio->error && dio->size && (dio->flags & IOMAP_DIO_WRITE))
+	if (!dio->error && dio->size && (dio->flags & IOMAP_DIO_WRITE) &&
+	    !(dio->flags & IOMAP_DIO_NO_INVALIDATE))
 		kiocb_invalidate_post_direct_write(iocb, dio->size);

 	inode_dio_end(file_inode(iocb->ki_filp));
@@ -163,43 +168,31 @@ static inline void iomap_dio_set_error(struct iomap_dio *dio, int ret)
 	cmpxchg(&dio->error, 0, ret);
 }

-void iomap_dio_bio_end_io(struct bio *bio)
+/*
+ * Called when dio->ref reaches zero from an I/O completion.
+ */
+static void iomap_dio_done(struct iomap_dio *dio)
 {
-	struct iomap_dio *dio = bio->bi_private;
-	bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY);
 	struct kiocb *iocb = dio->iocb;

-	if (bio->bi_status)
-		iomap_dio_set_error(dio, blk_status_to_errno(bio->bi_status));
-	if (!atomic_dec_and_test(&dio->ref))
-		goto release_bio;
-
-	/*
-	 * Synchronous dio, task itself will handle any completion work
-	 * that needs after IO. All we need to do is wake the task.
-	 */
 	if (dio->wait_for_completion) {
+		/*
+		 * Synchronous I/O, task itself will handle any completion work
+		 * that needs after IO. All we need to do is wake the task.
+		 */
 		struct task_struct *waiter = dio->submit.waiter;

 		WRITE_ONCE(dio->submit.waiter, NULL);
 		blk_wake_io_task(waiter);
-		goto release_bio;
-	}

-	/*
-	 * Flagged with IOMAP_DIO_INLINE_COMP, we can complete it inline
-	 */
-	if (dio->flags & IOMAP_DIO_INLINE_COMP) {
+	} else if (dio->flags & IOMAP_DIO_INLINE_COMP) {
 		WRITE_ONCE(iocb->private, NULL);
 		iomap_dio_complete_work(&dio->aio.work);
-		goto release_bio;
-	}

-	/*
-	 * If this dio is flagged with IOMAP_DIO_CALLER_COMP, then schedule
-	 * our completion that way to avoid an async punt to a workqueue.
-	 */
-	if (dio->flags & IOMAP_DIO_CALLER_COMP) {
+	} else if (dio->flags & IOMAP_DIO_CALLER_COMP) {
+		/*
+		 * If this dio is flagged with IOMAP_DIO_CALLER_COMP, then
+		 * schedule our completion that way to avoid an async punt to a
+		 * workqueue.
+		 */
 		/* only polled IO cares about private cleared */
 		iocb->private = dio;
 		iocb->dio_complete = iomap_dio_deferred_complete;
@@ -217,19 +210,31 @@ void iomap_dio_bio_end_io(struct bio *bio)
 		 * issuer.
 		 */
 		iocb->ki_complete(iocb, 0);
-		goto release_bio;
-	}
+	} else {
+		struct inode *inode = file_inode(iocb->ki_filp);

+		/*
+		 * Async DIO completion that requires filesystem level
+		 * completion work gets punted to a work queue to complete as
+		 * the operation may require more IO to be issued to finalise
+		 * filesystem metadata changes or guarantee data integrity.
+		 */
+		INIT_WORK(&dio->aio.work, iomap_dio_complete_work);
+		queue_work(inode->i_sb->s_dio_done_wq, &dio->aio.work);
+	}
+}
+
+void iomap_dio_bio_end_io(struct bio *bio)
+{
+	struct iomap_dio *dio = bio->bi_private;
+	bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY);
+
+	if (bio->bi_status)
+		iomap_dio_set_error(dio, blk_status_to_errno(bio->bi_status));
+
+	if (atomic_dec_and_test(&dio->ref))
+		iomap_dio_done(dio);
+
-	/*
-	 * Async DIO completion that requires filesystem level completion work
-	 * gets punted to a work queue to complete as the operation may require
-	 * more IO to be issued to finalise filesystem metadata changes or
-	 * guarantee data integrity.
-	 */
-	INIT_WORK(&dio->aio.work, iomap_dio_complete_work);
-	queue_work(file_inode(iocb->ki_filp)->i_sb->s_dio_done_wq,
-			&dio->aio.work);
-release_bio:
 	if (should_dirty) {
 		bio_check_pages_dirty(bio);
 	} else {
@@ -239,6 +244,47 @@ void iomap_dio_bio_end_io(struct bio *bio)
 }
 EXPORT_SYMBOL_GPL(iomap_dio_bio_end_io);

+u32 iomap_finish_ioend_direct(struct iomap_ioend *ioend)
+{
+	struct iomap_dio *dio = ioend->io_bio.bi_private;
+	bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY);
+	u32 vec_count = ioend->io_bio.bi_vcnt;
+
+	if (ioend->io_error)
+		iomap_dio_set_error(dio, ioend->io_error);
+
+	if (atomic_dec_and_test(&dio->ref)) {
+		/*
+		 * Try to avoid another context switch for the completion given
+		 * that we are already called from the ioend completion
+		 * workqueue, but never invalidate pages from this thread to
+		 * avoid deadlocks with buffered I/O completions.  Tough luck if
+		 * you hit the tiny race with someone dirtying the range now
+		 * between this check and the actual completion.
+		 */
+		if (!dio->iocb->ki_filp->f_mapping->nrpages) {
+			dio->flags |= IOMAP_DIO_INLINE_COMP;
+			dio->flags |= IOMAP_DIO_NO_INVALIDATE;
+		}
+		dio->flags &= ~IOMAP_DIO_CALLER_COMP;
+		iomap_dio_done(dio);
+	}
+
+	if (should_dirty) {
+		bio_check_pages_dirty(&ioend->io_bio);
+	} else {
+		bio_release_pages(&ioend->io_bio, false);
+		bio_put(&ioend->io_bio);
+	}
+
+	/*
+	 * Return the number of bvecs completed as even direct I/O completions
+	 * do significant per-folio work and we'll still want to give up the
+	 * CPU after a lot of completions.
+	 */
+	return vec_count;
+}
+
 static int iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio,
 		loff_t pos, unsigned len)
 {
@@ -271,7 +317,7 @@ static int iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio,
 * clearing the WRITE_THROUGH flag in the dio request.
 */
 static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio,
-		const struct iomap *iomap, bool use_fua, bool atomic)
+		const struct iomap *iomap, bool use_fua, bool atomic_hw)
 {
 	blk_opf_t opflags = REQ_SYNC | REQ_IDLE;
@@ -283,30 +329,29 @@ static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio,
 		opflags |= REQ_FUA;
 	else
 		dio->flags &= ~IOMAP_DIO_WRITE_THROUGH;
-	if (atomic)
+	if (atomic_hw)
 		opflags |= REQ_ATOMIC;

 	return opflags;
 }

-static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
-		struct iomap_dio *dio)
+static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
 {
 	const struct iomap *iomap = &iter->iomap;
 	struct inode *inode = iter->inode;
 	unsigned int fs_block_size = i_blocksize(inode), pad;
+	bool atomic_hw = iter->flags & IOMAP_ATOMIC_HW;
 	const loff_t length = iomap_length(iter);
-	bool atomic = iter->flags & IOMAP_ATOMIC;
 	loff_t pos = iter->pos;
 	blk_opf_t bio_opf;
 	struct bio *bio;
 	bool need_zeroout = false;
 	bool use_fua = false;
 	int nr_pages, ret = 0;
-	size_t copied = 0;
+	u64 copied = 0;
 	size_t orig_count;

-	if (atomic && length != fs_block_size)
+	if (atomic_hw && length != iter->len)
 		return -EINVAL;

 	if ((pos | length) & (bdev_logical_block_size(iomap->bdev) - 1) ||
@@ -383,7 +428,7 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
 		goto out;
 	}

-	bio_opf = iomap_dio_bio_opflags(dio, iomap, use_fua, atomic);
+	bio_opf = iomap_dio_bio_opflags(dio, iomap, use_fua, atomic_hw);

 	nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter, BIO_MAX_VECS);
 	do {
@@ -416,7 +461,7 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
 		}

 		n = bio->bi_iter.bi_size;
-		if (WARN_ON_ONCE(atomic && n != length)) {
+		if (WARN_ON_ONCE(atomic_hw && n != length)) {
 			/*
 			 * This bio should have covered the complete length,
 			 * which it doesn't, so error. We may need to zero out
@@ -467,30 +512,28 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
 	/* Undo iter limitation to current extent */
 	iov_iter_reexpand(dio->submit.iter, orig_count - copied);
 	if (copied)
-		return copied;
+		return iomap_iter_advance(iter, &copied);
 	return ret;
 }

-static loff_t iomap_dio_hole_iter(const struct iomap_iter *iter,
-		struct iomap_dio *dio)
+static int iomap_dio_hole_iter(struct iomap_iter *iter, struct iomap_dio *dio)
 {
 	loff_t length = iov_iter_zero(iomap_length(iter), dio->submit.iter);

 	dio->size += length;
 	if (!length)
 		return -EFAULT;
-	return length;
+	return iomap_iter_advance(iter, &length);
 }

-static loff_t iomap_dio_inline_iter(const struct iomap_iter *iomi,
-		struct iomap_dio *dio)
+static int iomap_dio_inline_iter(struct iomap_iter *iomi, struct iomap_dio *dio)
 {
 	const struct iomap *iomap = &iomi->iomap;
 	struct iov_iter *iter = dio->submit.iter;
 	void *inline_data = iomap_inline_data(iomap, iomi->pos);
 	loff_t length = iomap_length(iomi);
 	loff_t pos = iomi->pos;
-	size_t copied;
+	u64 copied;

 	if (WARN_ON_ONCE(!iomap_inline_data_valid(iomap)))
 		return -EIO;
@@ -512,11 +555,10 @@ static int iomap_dio_inline_iter(struct iomap_iter *iomi, struct iomap_dio *dio)
 	dio->size += copied;
 	if (!copied)
 		return -EFAULT;
-	return copied;
+	return iomap_iter_advance(iomi, &copied);
 }

-static loff_t iomap_dio_iter(const struct iomap_iter *iter,
-		struct iomap_dio *dio)
+static int iomap_dio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
 {
 	switch (iter->iomap.type) {
 	case IOMAP_HOLE:
@@ -610,9 +652,6 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 	if (iocb->ki_flags & IOCB_NOWAIT)
 		iomi.flags |= IOMAP_NOWAIT;

-	if (iocb->ki_flags & IOCB_ATOMIC)
-		iomi.flags |= IOMAP_ATOMIC;
-
 	if (iov_iter_rw(iter) == READ) {
 		/* reads can always complete inline */
 		dio->flags |= IOMAP_DIO_INLINE_COMP;
@@ -647,6 +686,11 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 			iomi.flags |= IOMAP_OVERWRITE_ONLY;
 	}

+	if (dio_flags & IOMAP_DIO_ATOMIC_SW)
+		iomi.flags |= IOMAP_ATOMIC_SW;
+	else if (iocb->ki_flags & IOCB_ATOMIC)
+		iomi.flags |= IOMAP_ATOMIC_HW;
+
 	/* for data sync or sync, we need sync completion processing */
 	if (iocb_is_dsync(iocb)) {
 		dio->flags |= IOMAP_DIO_NEED_SYNC;
@@ -700,7 +744,7 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,

 	blk_start_plug(&plug);
 	while ((ret = iomap_iter(&iomi, ops)) > 0) {
-		iomi.processed = iomap_dio_iter(&iomi, dio);
+		iomi.status = iomap_dio_iter(&iomi, dio);

 		/*
 		 * We can only poll for single bio I/Os.
@@ -39,24 +39,23 @@ static int iomap_to_fiemap(struct fiemap_extent_info *fi,
 			iomap->length, flags);
 }

-static loff_t iomap_fiemap_iter(const struct iomap_iter *iter,
+static int iomap_fiemap_iter(struct iomap_iter *iter,
 		struct fiemap_extent_info *fi, struct iomap *prev)
 {
 	int ret;

 	if (iter->iomap.type == IOMAP_HOLE)
-		return iomap_length(iter);
+		goto advance;

 	ret = iomap_to_fiemap(fi, prev, 0);
 	*prev = iter->iomap;
-	switch (ret) {
-	case 0:		/* success */
-		return iomap_length(iter);
-	case 1:		/* extent array full */
-		return 0;
-	default:	/* error */
+	if (ret < 0)
 		return ret;
-	}
+	if (ret == 1)	/* extent array full */
+		return 0;
+
+advance:
+	return iomap_iter_advance_full(iter);
 }

 int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi,
@@ -78,7 +77,7 @@ int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi,
 		return ret;

 	while ((ret = iomap_iter(&iter, ops)) > 0)
-		iter.processed = iomap_fiemap_iter(&iter, fi, &prev);
+		iter.status = iomap_fiemap_iter(&iter, fi, &prev);

 	if (prev.type != IOMAP_HOLE) {
 		ret = iomap_to_fiemap(fi, &prev, FIEMAP_EXTENT_LAST);
@@ -114,7 +113,7 @@ iomap_bmap(struct address_space *mapping, sector_t bno,
 	while ((ret = iomap_iter(&iter, ops)) > 0) {
 		if (iter.iomap.type == IOMAP_MAPPED)
 			bno = iomap_sector(&iter.iomap, iter.pos) >> blkshift;
-		/* leave iter.processed unset to abort loop */
+		/* leave iter.status unset to abort loop */
 	}
 	if (ret)
 		return 0;
fs/iomap/internal.h (new file, 10 lines)
@@ -0,0 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _IOMAP_INTERNAL_H
+#define _IOMAP_INTERNAL_H 1
+
+#define IOEND_BATCH_SIZE	4096
+
+u32 iomap_finish_ioend_buffered(struct iomap_ioend *ioend);
+u32 iomap_finish_ioend_direct(struct iomap_ioend *ioend);
+
+#endif /* _IOMAP_INTERNAL_H */
fs/iomap/ioend.c (new file, 216 lines)
@@ -0,0 +1,216 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2024-2025 Christoph Hellwig.
+ */
+#include <linux/iomap.h>
+#include <linux/list_sort.h>
+#include "internal.h"
+
+struct bio_set iomap_ioend_bioset;
+EXPORT_SYMBOL_GPL(iomap_ioend_bioset);
+
+struct iomap_ioend *iomap_init_ioend(struct inode *inode,
+		struct bio *bio, loff_t file_offset, u16 ioend_flags)
+{
+	struct iomap_ioend *ioend = iomap_ioend_from_bio(bio);
+
+	atomic_set(&ioend->io_remaining, 1);
+	ioend->io_error = 0;
+	ioend->io_parent = NULL;
+	INIT_LIST_HEAD(&ioend->io_list);
+	ioend->io_flags = ioend_flags;
+	ioend->io_inode = inode;
+	ioend->io_offset = file_offset;
+	ioend->io_size = bio->bi_iter.bi_size;
+	ioend->io_sector = bio->bi_iter.bi_sector;
+	ioend->io_private = NULL;
+	return ioend;
+}
+EXPORT_SYMBOL_GPL(iomap_init_ioend);
+
+static u32 iomap_finish_ioend(struct iomap_ioend *ioend, int error)
+{
+	if (ioend->io_parent) {
+		struct bio *bio = &ioend->io_bio;
+
+		ioend = ioend->io_parent;
+		bio_put(bio);
+	}
+
+	if (error)
+		cmpxchg(&ioend->io_error, 0, error);
+
+	if (!atomic_dec_and_test(&ioend->io_remaining))
+		return 0;
+	if (ioend->io_flags & IOMAP_IOEND_DIRECT)
+		return iomap_finish_ioend_direct(ioend);
+	return iomap_finish_ioend_buffered(ioend);
+}
+
+/*
+ * Ioend completion routine for merged bios. This can only be called from task
+ * contexts as merged ioends can be of unbound length. Hence we have to break up
+ * the writeback completions into manageable chunks to avoid long scheduler
+ * holdoffs. We aim to keep scheduler holdoffs down below 10ms so that we get
+ * good batch processing throughput without creating adverse scheduler latency
+ * conditions.
+ */
+void iomap_finish_ioends(struct iomap_ioend *ioend, int error)
+{
+	struct list_head tmp;
+	u32 completions;
+
+	might_sleep();
+
+	list_replace_init(&ioend->io_list, &tmp);
+	completions = iomap_finish_ioend(ioend, error);
+
+	while (!list_empty(&tmp)) {
+		if (completions > IOEND_BATCH_SIZE * 8) {
+			cond_resched();
+			completions = 0;
+		}
+		ioend = list_first_entry(&tmp, struct iomap_ioend, io_list);
+		list_del_init(&ioend->io_list);
+		completions += iomap_finish_ioend(ioend, error);
+	}
+}
+EXPORT_SYMBOL_GPL(iomap_finish_ioends);
+
+/*
+ * We can merge two adjacent ioends if they have the same set of work to do.
+ */
+static bool iomap_ioend_can_merge(struct iomap_ioend *ioend,
+		struct iomap_ioend *next)
+{
+	if (ioend->io_bio.bi_status != next->io_bio.bi_status)
+		return false;
+	if (next->io_flags & IOMAP_IOEND_BOUNDARY)
+		return false;
+	if ((ioend->io_flags & IOMAP_IOEND_NOMERGE_FLAGS) !=
+	    (next->io_flags & IOMAP_IOEND_NOMERGE_FLAGS))
+		return false;
+	if (ioend->io_offset + ioend->io_size != next->io_offset)
+		return false;
+	/*
+	 * Do not merge physically discontiguous ioends. The filesystem
+	 * completion functions will have to iterate the physical
+	 * discontiguities even if we merge the ioends at a logical level, so
+	 * we don't gain anything by merging physical discontiguities here.
+	 *
+	 * We cannot use bio->bi_iter.bi_sector here as it is modified during
+	 * submission so does not point to the start sector of the bio at
+	 * completion.
+	 */
+	if (ioend->io_sector + (ioend->io_size >> SECTOR_SHIFT) !=
+	    next->io_sector)
+		return false;
+	return true;
+}
+
+void iomap_ioend_try_merge(struct iomap_ioend *ioend,
+		struct list_head *more_ioends)
+{
+	struct iomap_ioend *next;
+
+	INIT_LIST_HEAD(&ioend->io_list);
+
+	while ((next = list_first_entry_or_null(more_ioends, struct iomap_ioend,
+			io_list))) {
+		if (!iomap_ioend_can_merge(ioend, next))
+			break;
+		list_move_tail(&next->io_list, &ioend->io_list);
+		ioend->io_size += next->io_size;
+	}
+}
+EXPORT_SYMBOL_GPL(iomap_ioend_try_merge);
+
+static int iomap_ioend_compare(void *priv, const struct list_head *a,
|
||||
const struct list_head *b)
|
||||
{
|
||||
struct iomap_ioend *ia = container_of(a, struct iomap_ioend, io_list);
|
||||
struct iomap_ioend *ib = container_of(b, struct iomap_ioend, io_list);
|
||||
|
||||
if (ia->io_offset < ib->io_offset)
|
||||
return -1;
|
||||
if (ia->io_offset > ib->io_offset)
|
||||
return 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
void iomap_sort_ioends(struct list_head *ioend_list)
|
||||
{
|
||||
list_sort(NULL, ioend_list, iomap_ioend_compare);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(iomap_sort_ioends);
|
||||
|
||||
/*
|
||||
* Split up to the first @max_len bytes from @ioend if the ioend covers more
|
||||
* than @max_len bytes.
|
||||
*
|
||||
* If @is_append is set, the split will be based on the hardware limits for
|
||||
* REQ_OP_ZONE_APPEND commands and can be less than @max_len if the hardware
|
||||
* limits don't allow the entire @max_len length.
|
||||
*
|
||||
* The bio embedded into @ioend must be a REQ_OP_WRITE because the block layer
|
||||
* does not allow splitting REQ_OP_ZONE_APPEND bios. The file systems has to
|
||||
* switch the operation after this call, but before submitting the bio.
|
||||
*/
|
||||
struct iomap_ioend *iomap_split_ioend(struct iomap_ioend *ioend,
|
||||
unsigned int max_len, bool is_append)
|
||||
{
|
||||
struct bio *bio = &ioend->io_bio;
|
||||
struct iomap_ioend *split_ioend;
|
||||
unsigned int nr_segs;
|
||||
int sector_offset;
|
||||
struct bio *split;
|
||||
|
||||
if (is_append) {
|
||||
struct queue_limits *lim = bdev_limits(bio->bi_bdev);
|
||||
|
||||
max_len = min(max_len,
|
||||
lim->max_zone_append_sectors << SECTOR_SHIFT);
|
||||
|
||||
sector_offset = bio_split_rw_at(bio, lim, &nr_segs, max_len);
|
||||
if (unlikely(sector_offset < 0))
|
||||
return ERR_PTR(sector_offset);
|
||||
if (!sector_offset)
|
||||
return NULL;
|
||||
} else {
|
||||
if (bio->bi_iter.bi_size <= max_len)
|
||||
return NULL;
|
||||
sector_offset = max_len >> SECTOR_SHIFT;
|
||||
}
|
||||
|
||||
/* ensure the split ioend is still block size aligned */
|
||||
sector_offset = ALIGN_DOWN(sector_offset << SECTOR_SHIFT,
|
||||
i_blocksize(ioend->io_inode)) >> SECTOR_SHIFT;
|
||||
|
||||
split = bio_split(bio, sector_offset, GFP_NOFS, &iomap_ioend_bioset);
|
||||
if (IS_ERR(split))
|
||||
return ERR_CAST(split);
|
||||
split->bi_private = bio->bi_private;
|
||||
split->bi_end_io = bio->bi_end_io;
|
||||
|
||||
split_ioend = iomap_init_ioend(ioend->io_inode, split, ioend->io_offset,
|
||||
ioend->io_flags);
|
||||
split_ioend->io_parent = ioend;
|
||||
|
||||
atomic_inc(&ioend->io_remaining);
|
||||
ioend->io_offset += split_ioend->io_size;
|
||||
ioend->io_size -= split_ioend->io_size;
|
||||
|
||||
split_ioend->io_sector = ioend->io_sector;
|
||||
if (!is_append)
|
||||
ioend->io_sector += (split_ioend->io_size >> SECTOR_SHIFT);
|
||||
return split_ioend;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(iomap_split_ioend);
|
||||
|
||||
static int __init iomap_ioend_init(void)
|
||||
{
|
||||
return bioset_init(&iomap_ioend_bioset, 4 * (PAGE_SIZE / SECTOR_SIZE),
|
||||
offsetof(struct iomap_ioend, io_bio),
|
||||
BIOSET_NEED_BVECS);
|
||||
}
|
||||
fs_initcall(iomap_ioend_init);
|
||||
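The sort/merge/finish helpers above are meant to be driven from a filesystem's writeback completion path. A minimal sketch of such a worker, assuming a hypothetical my_fs_end_ioends() called with a locally collected list of completed ioends; only the iomap_* and list_* calls are real:

	static void my_fs_end_ioends(struct list_head *completed, int error)
	{
		struct iomap_ioend *ioend;

		/* Sort by file offset so adjacent ioends become neighbours. */
		iomap_sort_ioends(completed);

		while ((ioend = list_first_entry_or_null(completed,
				struct iomap_ioend, io_list))) {
			list_del_init(&ioend->io_list);

			/* Fold mergeable neighbours onto ioend->io_list... */
			iomap_ioend_try_merge(ioend, completed);

			/* ...and complete the batch, with resched breaks. */
			iomap_finish_ioends(ioend, error);
		}
	}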
@@ -7,40 +7,25 @@
#include <linux/iomap.h>
#include "trace.h"

/*
 * Advance to the next range we need to map.
 *
 * If the iomap is marked IOMAP_F_STALE, it means the existing map was not fully
 * processed - it was aborted because the extent the iomap spanned may have been
 * changed during the operation. In this case, the iteration behaviour is to
 * remap the unprocessed range of the iter, and that means we may need to remap
 * even when we've made no progress (i.e. iter->processed = 0). Hence the
 * "finished iterating" case needs to distinguish between
 * (processed = 0) meaning we are done and (processed = 0 && stale) meaning we
 * need to remap the entire remaining range.
 */
static inline int iomap_iter_advance(struct iomap_iter *iter)
static inline void iomap_iter_reset_iomap(struct iomap_iter *iter)
{
	bool stale = iter->iomap.flags & IOMAP_F_STALE;
	int ret = 1;

	/* handle the previous iteration (if any) */
	if (iter->iomap.length) {
		if (iter->processed < 0)
			return iter->processed;
		if (WARN_ON_ONCE(iter->processed > iomap_length(iter)))
			return -EIO;
		iter->pos += iter->processed;
		iter->len -= iter->processed;
		if (!iter->len || (!iter->processed && !stale))
			ret = 0;
	}

	/* clear the per iteration state */
	iter->processed = 0;
	iter->status = 0;
	memset(&iter->iomap, 0, sizeof(iter->iomap));
	memset(&iter->srcmap, 0, sizeof(iter->srcmap));
	return ret;
}

/*
 * Advance the current iterator position and output the length remaining for the
 * current mapping.
 */
int iomap_iter_advance(struct iomap_iter *iter, u64 *count)
{
	if (WARN_ON_ONCE(*count > iomap_length(iter)))
		return -EIO;
	iter->pos += *count;
	iter->len -= *count;
	*count = iomap_length(iter);
	return 0;
}

static inline void iomap_iter_done(struct iomap_iter *iter)

@@ -50,6 +35,8 @@ static inline void iomap_iter_done(struct iomap_iter *iter)
	WARN_ON_ONCE(iter->iomap.offset + iter->iomap.length <= iter->pos);
	WARN_ON_ONCE(iter->iomap.flags & IOMAP_F_STALE);

	iter->iter_start_pos = iter->pos;

	trace_iomap_iter_dstmap(iter->inode, &iter->iomap);
	if (iter->srcmap.type != IOMAP_HOLE)
		trace_iomap_iter_srcmap(iter->inode, &iter->srcmap);

@@ -67,26 +54,58 @@ static inline void iomap_iter_done(struct iomap_iter *iter)
 * function must be called in a loop that continues as long it returns a
 * positive value. If 0 or a negative value is returned, the caller must not
 * return to the loop body. Within a loop body, there are two ways to break out
 * of the loop body: leave @iter.processed unchanged, or set it to a negative
 * of the loop body: leave @iter.status unchanged, or set it to a negative
 * errno.
 */
int iomap_iter(struct iomap_iter *iter, const struct iomap_ops *ops)
{
	bool stale = iter->iomap.flags & IOMAP_F_STALE;
	ssize_t advanced;
	u64 olen;
	int ret;

	if (iter->iomap.length && ops->iomap_end) {
		ret = ops->iomap_end(iter->inode, iter->pos, iomap_length(iter),
				iter->processed > 0 ? iter->processed : 0,
				iter->flags, &iter->iomap);
		if (ret < 0 && !iter->processed)
	trace_iomap_iter(iter, ops, _RET_IP_);

	if (!iter->iomap.length)
		goto begin;

	/*
	 * Calculate how far the iter was advanced and the original length bytes
	 * for ->iomap_end().
	 */
	advanced = iter->pos - iter->iter_start_pos;
	olen = iter->len + advanced;

	if (ops->iomap_end) {
		ret = ops->iomap_end(iter->inode, iter->iter_start_pos,
				iomap_length_trim(iter, iter->iter_start_pos,
						  olen),
				advanced, iter->flags, &iter->iomap);
		if (ret < 0 && !advanced)
			return ret;
	}

	trace_iomap_iter(iter, ops, _RET_IP_);
	ret = iomap_iter_advance(iter);
	/* detect old return semantics where this would advance */
	if (WARN_ON_ONCE(iter->status > 0))
		iter->status = -EIO;

	/*
	 * Use iter->len to determine whether to continue onto the next mapping.
	 * Explicitly terminate on error status or if the current iter has not
	 * advanced at all (i.e. no work was done for some reason) unless the
	 * mapping has been marked stale and needs to be reprocessed.
	 */
	if (iter->status < 0)
		ret = iter->status;
	else if (iter->len == 0 || (!advanced && !stale))
		ret = 0;
	else
		ret = 1;
	iomap_iter_reset_iomap(iter);
	if (ret <= 0)
		return ret;

begin:
	ret = ops->iomap_begin(iter->inode, iter->pos, iter->len, iter->flags,
			       &iter->iomap, &iter->srcmap);
	if (ret < 0)

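Taken together, the iter.c changes replace the byte-count protocol (iter->processed) with an explicit status-plus-advance protocol. A minimal sketch of a caller under the new contract, with hypothetical my_fs_* names; only iomap_iter(), iomap_iter_advance() and iomap_length() are real:

	static int my_fs_scan_iter(struct iomap_iter *iter)
	{
		u64 count = iomap_length(iter);

		/* ... act on [iter->pos, iter->pos + count) ... */

		/* Consume the mapping; 0 reports success to iomap_iter(). */
		return iomap_iter_advance(iter, &count);
	}

	int my_fs_scan(struct inode *inode, loff_t len,
			const struct iomap_ops *ops)
	{
		struct iomap_iter iter = {
			.inode	= inode,
			.len	= len,
		};
		int ret;

		while ((ret = iomap_iter(&iter, ops)) > 0)
			iter.status = my_fs_scan_iter(&iter);
		return ret;
	}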
@@ -10,7 +10,7 @@
#include <linux/pagemap.h>
#include <linux/pagevec.h>

static loff_t iomap_seek_hole_iter(const struct iomap_iter *iter,
static int iomap_seek_hole_iter(struct iomap_iter *iter,
		loff_t *hole_pos)
{
	loff_t length = iomap_length(iter);
@@ -20,13 +20,13 @@ static loff_t iomap_seek_hole_iter(const struct iomap_iter *iter,
		*hole_pos = mapping_seek_hole_data(iter->inode->i_mapping,
				iter->pos, iter->pos + length, SEEK_HOLE);
		if (*hole_pos == iter->pos + length)
			return length;
			return iomap_iter_advance(iter, &length);
		return 0;
	case IOMAP_HOLE:
		*hole_pos = iter->pos;
		return 0;
	default:
		return length;
		return iomap_iter_advance(iter, &length);
	}
}

@@ -47,7 +47,7 @@ iomap_seek_hole(struct inode *inode, loff_t pos, const struct iomap_ops *ops)

	iter.len = size - pos;
	while ((ret = iomap_iter(&iter, ops)) > 0)
		iter.processed = iomap_seek_hole_iter(&iter, &pos);
		iter.status = iomap_seek_hole_iter(&iter, &pos);
	if (ret < 0)
		return ret;
	if (iter.len) /* found hole before EOF */
@@ -56,19 +56,19 @@ iomap_seek_hole(struct inode *inode, loff_t pos, const struct iomap_ops *ops)
}
EXPORT_SYMBOL_GPL(iomap_seek_hole);

static loff_t iomap_seek_data_iter(const struct iomap_iter *iter,
static int iomap_seek_data_iter(struct iomap_iter *iter,
		loff_t *hole_pos)
{
	loff_t length = iomap_length(iter);

	switch (iter->iomap.type) {
	case IOMAP_HOLE:
		return length;
		return iomap_iter_advance(iter, &length);
	case IOMAP_UNWRITTEN:
		*hole_pos = mapping_seek_hole_data(iter->inode->i_mapping,
				iter->pos, iter->pos + length, SEEK_DATA);
		if (*hole_pos < 0)
			return length;
			return iomap_iter_advance(iter, &length);
		return 0;
	default:
		*hole_pos = iter->pos;
@@ -93,7 +93,7 @@ iomap_seek_data(struct inode *inode, loff_t pos, const struct iomap_ops *ops)

	iter.len = size - pos;
	while ((ret = iomap_iter(&iter, ops)) > 0)
		iter.processed = iomap_seek_data_iter(&iter, &pos);
		iter.status = iomap_seek_data_iter(&iter, &pos);
	if (ret < 0)
		return ret;
	if (iter.len) /* found data before EOF */

@@ -94,7 +94,7 @@ static int iomap_swapfile_fail(struct iomap_swapfile_info *isi, const char *str)
 * swap only cares about contiguous page-aligned physical extents and makes no
 * distinction between written and unwritten extents.
 */
static loff_t iomap_swapfile_iter(const struct iomap_iter *iter,
static int iomap_swapfile_iter(struct iomap_iter *iter,
		struct iomap *iomap, struct iomap_swapfile_info *isi)
{
	switch (iomap->type) {
@@ -132,7 +132,8 @@ static loff_t iomap_swapfile_iter(const struct iomap_iter *iter,
			return error;
		memcpy(&isi->iomap, iomap, sizeof(isi->iomap));
	}
	return iomap_length(iter);

	return iomap_iter_advance_full(iter);
}

/*
@@ -166,7 +167,7 @@ int iomap_swapfile_activate(struct swap_info_struct *sis,
		return ret;

	while ((ret = iomap_iter(&iter, ops)) > 0)
		iter.processed = iomap_swapfile_iter(&iter, &iter.iomap, &isi);
		iter.status = iomap_swapfile_iter(&iter, &iter.iomap, &isi);
	if (ret < 0)
		return ret;

@@ -99,7 +99,7 @@ DEFINE_RANGE_EVENT(iomap_dio_rw_queued);
	{ IOMAP_FAULT,		"FAULT" }, \
	{ IOMAP_DIRECT,		"DIRECT" }, \
	{ IOMAP_NOWAIT,		"NOWAIT" }, \
	{ IOMAP_ATOMIC,		"ATOMIC" }
	{ IOMAP_ATOMIC_HW,	"ATOMIC_HW" }

#define IOMAP_F_FLAGS_STRINGS \
	{ IOMAP_F_NEW,		"NEW" }, \
@@ -207,7 +207,7 @@ TRACE_EVENT(iomap_iter,
		__field(u64, ino)
		__field(loff_t, pos)
		__field(u64, length)
		__field(s64, processed)
		__field(int, status)
		__field(unsigned int, flags)
		__field(const void *, ops)
		__field(unsigned long, caller)
@@ -217,17 +217,17 @@ TRACE_EVENT(iomap_iter,
		__entry->ino = iter->inode->i_ino;
		__entry->pos = iter->pos;
		__entry->length = iomap_length(iter);
		__entry->processed = iter->processed;
		__entry->status = iter->status;
		__entry->flags = iter->flags;
		__entry->ops = ops;
		__entry->caller = caller;
	),
	TP_printk("dev %d:%d ino 0x%llx pos 0x%llx length 0x%llx processed %lld flags %s (0x%x) ops %ps caller %pS",
	TP_printk("dev %d:%d ino 0x%llx pos 0x%llx length 0x%llx status %d flags %s (0x%x) ops %ps caller %pS",
		  MAJOR(__entry->dev), MINOR(__entry->dev),
		  __entry->ino,
		  __entry->pos,
		  __entry->length,
		  __entry->processed,
		  __entry->status,
		  __print_flags(__entry->flags, "|", IOMAP_FLAGS_STRINGS),
		  __entry->flags,
		  __entry->ops,

@@ -64,6 +64,7 @@ xfs-y += $(addprefix libxfs/, \
xfs-$(CONFIG_XFS_RT) += $(addprefix libxfs/, \
				   xfs_rtbitmap.o \
				   xfs_rtgroup.o \
				   xfs_zones.o \
				   )

# highlevel code
@@ -136,7 +137,11 @@ xfs-$(CONFIG_XFS_QUOTA) += xfs_dquot.o \
				   xfs_quotaops.o

# xfs_rtbitmap is shared with libxfs
xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o
xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o \
			xfs_zone_alloc.o \
			xfs_zone_gc.o \
			xfs_zone_info.o \
			xfs_zone_space_resv.o

xfs-$(CONFIG_XFS_POSIX_ACL) += xfs_acl.o
xfs-$(CONFIG_SYSCTL) += xfs_sysctl.o

@@ -34,13 +34,13 @@
#include "xfs_ag.h"
#include "xfs_ag_resv.h"
#include "xfs_refcount.h"
#include "xfs_icache.h"
#include "xfs_iomap.h"
#include "xfs_health.h"
#include "xfs_bmap_item.h"
#include "xfs_symlink_remote.h"
#include "xfs_inode_util.h"
#include "xfs_rtgroup.h"
#include "xfs_zone_alloc.h"

struct kmem_cache *xfs_bmap_intent_cache;

@@ -171,18 +171,16 @@ xfs_bmbt_update(
 * Compute the worst-case number of indirect blocks that will be used
 * for ip's delayed extent of length "len".
 */
STATIC xfs_filblks_t
xfs_filblks_t
xfs_bmap_worst_indlen(
	xfs_inode_t	*ip,		/* incore inode pointer */
	xfs_filblks_t	len)		/* delayed extent length */
	struct xfs_inode	*ip,	/* incore inode pointer */
	xfs_filblks_t		len)	/* delayed extent length */
{
	int		level;		/* btree level number */
	int		maxrecs;	/* maximum record count at this level */
	xfs_mount_t	*mp;		/* mount structure */
	xfs_filblks_t	rval;		/* return value */
	struct xfs_mount	*mp = ip->i_mount;
	int			maxrecs = mp->m_bmap_dmxr[0];
	int			level;
	xfs_filblks_t		rval;

	mp = ip->i_mount;
	maxrecs = mp->m_bmap_dmxr[0];
	for (level = 0, rval = 0;
	     level < XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK);
	     level++) {
@@ -2571,146 +2569,6 @@ xfs_bmap_add_extent_unwritten_real(
#undef	PREV
}

/*
 * Convert a hole to a delayed allocation.
 */
STATIC void
xfs_bmap_add_extent_hole_delay(
	xfs_inode_t		*ip,	/* incore inode pointer */
	int			whichfork,
	struct xfs_iext_cursor	*icur,
	xfs_bmbt_irec_t		*new)	/* new data to add to file extents */
{
	struct xfs_ifork	*ifp;	/* inode fork pointer */
	xfs_bmbt_irec_t		left;	/* left neighbor extent entry */
	xfs_filblks_t		newlen=0;	/* new indirect size */
	xfs_filblks_t		oldlen=0;	/* old indirect size */
	xfs_bmbt_irec_t		right;	/* right neighbor extent entry */
	uint32_t		state = xfs_bmap_fork_to_state(whichfork);
	xfs_filblks_t		temp;	/* temp for indirect calculations */

	ifp = xfs_ifork_ptr(ip, whichfork);
	ASSERT(isnullstartblock(new->br_startblock));

	/*
	 * Check and set flags if this segment has a left neighbor
	 */
	if (xfs_iext_peek_prev_extent(ifp, icur, &left)) {
		state |= BMAP_LEFT_VALID;
		if (isnullstartblock(left.br_startblock))
			state |= BMAP_LEFT_DELAY;
	}

	/*
	 * Check and set flags if the current (right) segment exists.
	 * If it doesn't exist, we're converting the hole at end-of-file.
	 */
	if (xfs_iext_get_extent(ifp, icur, &right)) {
		state |= BMAP_RIGHT_VALID;
		if (isnullstartblock(right.br_startblock))
			state |= BMAP_RIGHT_DELAY;
	}

	/*
	 * Set contiguity flags on the left and right neighbors.
	 * Don't let extents get too large, even if the pieces are contiguous.
	 */
	if ((state & BMAP_LEFT_VALID) && (state & BMAP_LEFT_DELAY) &&
	    left.br_startoff + left.br_blockcount == new->br_startoff &&
	    left.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN)
		state |= BMAP_LEFT_CONTIG;

	if ((state & BMAP_RIGHT_VALID) && (state & BMAP_RIGHT_DELAY) &&
	    new->br_startoff + new->br_blockcount == right.br_startoff &&
	    new->br_blockcount + right.br_blockcount <= XFS_MAX_BMBT_EXTLEN &&
	    (!(state & BMAP_LEFT_CONTIG) ||
	     (left.br_blockcount + new->br_blockcount +
	      right.br_blockcount <= XFS_MAX_BMBT_EXTLEN)))
		state |= BMAP_RIGHT_CONTIG;

	/*
	 * Switch out based on the contiguity flags.
	 */
	switch (state & (BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG)) {
	case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
		/*
		 * New allocation is contiguous with delayed allocations
		 * on the left and on the right.
		 * Merge all three into a single extent record.
		 */
		temp = left.br_blockcount + new->br_blockcount +
			right.br_blockcount;

		oldlen = startblockval(left.br_startblock) +
			startblockval(new->br_startblock) +
			startblockval(right.br_startblock);
		newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
					 oldlen);
		left.br_startblock = nullstartblock(newlen);
		left.br_blockcount = temp;

		xfs_iext_remove(ip, icur, state);
		xfs_iext_prev(ifp, icur);
		xfs_iext_update_extent(ip, state, icur, &left);
		break;

	case BMAP_LEFT_CONTIG:
		/*
		 * New allocation is contiguous with a delayed allocation
		 * on the left.
		 * Merge the new allocation with the left neighbor.
		 */
		temp = left.br_blockcount + new->br_blockcount;

		oldlen = startblockval(left.br_startblock) +
			startblockval(new->br_startblock);
		newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
					 oldlen);
		left.br_blockcount = temp;
		left.br_startblock = nullstartblock(newlen);

		xfs_iext_prev(ifp, icur);
		xfs_iext_update_extent(ip, state, icur, &left);
		break;

	case BMAP_RIGHT_CONTIG:
		/*
		 * New allocation is contiguous with a delayed allocation
		 * on the right.
		 * Merge the new allocation with the right neighbor.
		 */
		temp = new->br_blockcount + right.br_blockcount;
		oldlen = startblockval(new->br_startblock) +
			startblockval(right.br_startblock);
		newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
					 oldlen);
		right.br_startoff = new->br_startoff;
		right.br_startblock = nullstartblock(newlen);
		right.br_blockcount = temp;
		xfs_iext_update_extent(ip, state, icur, &right);
		break;

	case 0:
		/*
		 * New allocation is not contiguous with another
		 * delayed allocation.
		 * Insert a new entry.
		 */
		oldlen = newlen = 0;
		xfs_iext_insert(ip, icur, new, state);
		break;
	}
	if (oldlen != newlen) {
		ASSERT(oldlen > newlen);
		xfs_add_fdblocks(ip->i_mount, oldlen - newlen);

		/*
		 * Nothing to do for disk quota accounting here.
		 */
		xfs_mod_delalloc(ip, 0, (int64_t)newlen - oldlen);
	}
}

/*
 * Convert a hole to a real allocation.
 */
@@ -4039,144 +3897,6 @@ xfs_bmapi_read(
	return 0;
}

/*
 * Add a delayed allocation extent to an inode. Blocks are reserved from the
 * global pool and the extent inserted into the inode in-core extent tree.
 *
 * On entry, got refers to the first extent beyond the offset of the extent to
 * allocate or eof is specified if no such extent exists. On return, got refers
 * to the extent record that was inserted to the inode fork.
 *
 * Note that the allocated extent may have been merged with contiguous extents
 * during insertion into the inode fork. Thus, got does not reflect the current
 * state of the inode fork on return. If necessary, the caller can use lastx to
 * look up the updated record in the inode fork.
 */
int
xfs_bmapi_reserve_delalloc(
	struct xfs_inode	*ip,
	int			whichfork,
	xfs_fileoff_t		off,
	xfs_filblks_t		len,
	xfs_filblks_t		prealloc,
	struct xfs_bmbt_irec	*got,
	struct xfs_iext_cursor	*icur,
	int			eof)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, whichfork);
	xfs_extlen_t		alen;
	xfs_extlen_t		indlen;
	uint64_t		fdblocks;
	int			error;
	xfs_fileoff_t		aoff;
	bool			use_cowextszhint =
					whichfork == XFS_COW_FORK && !prealloc;

retry:
	/*
	 * Cap the alloc length. Keep track of prealloc so we know whether to
	 * tag the inode before we return.
	 */
	aoff = off;
	alen = XFS_FILBLKS_MIN(len + prealloc, XFS_MAX_BMBT_EXTLEN);
	if (!eof)
		alen = XFS_FILBLKS_MIN(alen, got->br_startoff - aoff);
	if (prealloc && alen >= len)
		prealloc = alen - len;

	/*
	 * If we're targetting the COW fork but aren't creating a speculative
	 * posteof preallocation, try to expand the reservation to align with
	 * the COW extent size hint if there's sufficient free space.
	 *
	 * Unlike the data fork, the CoW cancellation functions will free all
	 * the reservations at inactivation, so we don't require that every
	 * delalloc reservation have a dirty pagecache.
	 */
	if (use_cowextszhint) {
		struct xfs_bmbt_irec	prev;
		xfs_extlen_t		extsz = xfs_get_cowextsz_hint(ip);

		if (!xfs_iext_peek_prev_extent(ifp, icur, &prev))
			prev.br_startoff = NULLFILEOFF;

		error = xfs_bmap_extsize_align(mp, got, &prev, extsz, 0, eof,
					       1, 0, &aoff, &alen);
		ASSERT(!error);
	}

	/*
	 * Make a transaction-less quota reservation for delayed allocation
	 * blocks. This number gets adjusted later. We return if we haven't
	 * allocated blocks already inside this loop.
	 */
	error = xfs_quota_reserve_blkres(ip, alen);
	if (error)
		goto out;

	/*
	 * Split changing sb for alen and indlen since they could be coming
	 * from different places.
	 */
	indlen = (xfs_extlen_t)xfs_bmap_worst_indlen(ip, alen);
	ASSERT(indlen > 0);

	fdblocks = indlen;
	if (XFS_IS_REALTIME_INODE(ip)) {
		error = xfs_dec_frextents(mp, xfs_blen_to_rtbxlen(mp, alen));
		if (error)
			goto out_unreserve_quota;
	} else {
		fdblocks += alen;
	}

	error = xfs_dec_fdblocks(mp, fdblocks, false);
	if (error)
		goto out_unreserve_frextents;

	ip->i_delayed_blks += alen;
	xfs_mod_delalloc(ip, alen, indlen);

	got->br_startoff = aoff;
	got->br_startblock = nullstartblock(indlen);
	got->br_blockcount = alen;
	got->br_state = XFS_EXT_NORM;

	xfs_bmap_add_extent_hole_delay(ip, whichfork, icur, got);

	/*
	 * Tag the inode if blocks were preallocated. Note that COW fork
	 * preallocation can occur at the start or end of the extent, even when
	 * prealloc == 0, so we must also check the aligned offset and length.
	 */
	if (whichfork == XFS_DATA_FORK && prealloc)
		xfs_inode_set_eofblocks_tag(ip);
	if (whichfork == XFS_COW_FORK && (prealloc || aoff < off || alen > len))
		xfs_inode_set_cowblocks_tag(ip);

	return 0;

out_unreserve_frextents:
	if (XFS_IS_REALTIME_INODE(ip))
		xfs_add_frextents(mp, xfs_blen_to_rtbxlen(mp, alen));
out_unreserve_quota:
	if (XFS_IS_QUOTA_ON(mp))
		xfs_quota_unreserve_blkres(ip, alen);
out:
	if (error == -ENOSPC || error == -EDQUOT) {
		trace_xfs_delalloc_enospc(ip, off, len);

		if (prealloc || use_cowextszhint) {
			/* retry without any preallocation */
			use_cowextszhint = false;
			prealloc = 0;
			goto retry;
		}
	}
	return error;
}

static int
xfs_bmapi_allocate(
	struct xfs_bmalloca	*bma)
@@ -4948,7 +4668,8 @@ xfs_bmap_del_extent_delay(
	int			whichfork,
	struct xfs_iext_cursor	*icur,
	struct xfs_bmbt_irec	*got,
	struct xfs_bmbt_irec	*del)
	struct xfs_bmbt_irec	*del,
	uint32_t		bflags)	/* bmapi flags */
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, whichfork);
@@ -5068,10 +4789,18 @@ xfs_bmap_del_extent_delay(
	da_diff = da_old - da_new;
	fdblocks = da_diff;

	if (isrt)
		xfs_add_frextents(mp, xfs_blen_to_rtbxlen(mp, del->br_blockcount));
	else
	if (bflags & XFS_BMAPI_REMAP) {
		;
	} else if (isrt) {
		xfs_rtbxlen_t	rtxlen;

		rtxlen = xfs_blen_to_rtbxlen(mp, del->br_blockcount);
		if (xfs_is_zoned_inode(ip))
			xfs_zoned_add_available(mp, rtxlen);
		xfs_add_frextents(mp, rtxlen);
	} else {
		fdblocks += del->br_blockcount;
	}

	xfs_add_fdblocks(mp, fdblocks);
	xfs_mod_delalloc(ip, -(int64_t)del->br_blockcount, -da_diff);
@@ -5670,7 +5399,8 @@ __xfs_bunmapi(

delete:
		if (wasdel) {
			xfs_bmap_del_extent_delay(ip, whichfork, &icur, &got, &del);
			xfs_bmap_del_extent_delay(ip, whichfork, &icur, &got,
					&del, flags);
		} else {
			error = xfs_bmap_del_extent_real(ip, tp, &icur, cur,
					&del, &tmp_logflags, whichfork,

@@ -204,7 +204,7 @@ int xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip,
		xfs_extnum_t nexts, int *done);
void xfs_bmap_del_extent_delay(struct xfs_inode *ip, int whichfork,
		struct xfs_iext_cursor *cur, struct xfs_bmbt_irec *got,
		struct xfs_bmbt_irec *del);
		struct xfs_bmbt_irec *del, uint32_t bflags);
void xfs_bmap_del_extent_cow(struct xfs_inode *ip,
		struct xfs_iext_cursor *cur, struct xfs_bmbt_irec *got,
		struct xfs_bmbt_irec *del);
@@ -219,10 +219,6 @@ int xfs_bmap_insert_extents(struct xfs_trans *tp, struct xfs_inode *ip,
		bool *done, xfs_fileoff_t stop_fsb);
int xfs_bmap_split_extent(struct xfs_trans *tp, struct xfs_inode *ip,
		xfs_fileoff_t split_offset);
int xfs_bmapi_reserve_delalloc(struct xfs_inode *ip, int whichfork,
		xfs_fileoff_t off, xfs_filblks_t len, xfs_filblks_t prealloc,
		struct xfs_bmbt_irec *got, struct xfs_iext_cursor *cur,
		int eof);
int xfs_bmapi_convert_delalloc(struct xfs_inode *ip, int whichfork,
		xfs_off_t offset, struct iomap *iomap, unsigned int *seq);
int xfs_bmap_add_extent_unwritten_real(struct xfs_trans *tp,
@@ -233,6 +229,7 @@ xfs_extlen_t xfs_bmapi_minleft(struct xfs_trans *tp, struct xfs_inode *ip,
		int fork);
int xfs_bmap_btalloc_low_space(struct xfs_bmalloca *ap,
		struct xfs_alloc_arg *args);
xfs_filblks_t xfs_bmap_worst_indlen(struct xfs_inode *ip, xfs_filblks_t len);

enum xfs_bmap_intent_type {
	XFS_BMAP_MAP = 1,

@@ -178,9 +178,10 @@ typedef struct xfs_sb {

	xfs_rgnumber_t	sb_rgcount;	/* number of realtime groups */
	xfs_rtxlen_t	sb_rgextents;	/* size of a realtime group in rtx */

	uint8_t		sb_rgblklog;	/* rt group number shift */
	uint8_t		sb_pad[7];	/* zeroes */
	xfs_rfsblock_t	sb_rtstart;	/* start of internal RT section (FSB) */
	xfs_filblks_t	sb_rtreserved;	/* reserved (zoned) RT blocks */

	/* must be padded to 64 bit alignment */
} xfs_sb_t;
@@ -270,9 +271,10 @@ struct xfs_dsb {
	__be64		sb_metadirino;	/* metadata directory tree root */
	__be32		sb_rgcount;	/* # of realtime groups */
	__be32		sb_rgextents;	/* size of rtgroup in rtx */

	__u8		sb_rgblklog;	/* rt group number shift */
	__u8		sb_pad[7];	/* zeroes */
	__be64		sb_rtstart;	/* start of internal RT section (FSB) */
	__be64		sb_rtreserved;	/* reserved (zoned) RT blocks */

	/*
	 * The size of this structure must be padded to 64 bit alignment.
@@ -395,6 +397,9 @@ xfs_sb_has_ro_compat_feature(
#define XFS_SB_FEAT_INCOMPAT_EXCHRANGE	(1 << 6)  /* exchangerange supported */
#define XFS_SB_FEAT_INCOMPAT_PARENT	(1 << 7)  /* parent pointers */
#define XFS_SB_FEAT_INCOMPAT_METADIR	(1 << 8)  /* metadata dir tree */
#define XFS_SB_FEAT_INCOMPAT_ZONED	(1 << 9)  /* zoned RT allocator */
#define XFS_SB_FEAT_INCOMPAT_ZONE_GAPS	(1 << 10) /* RTGs have LBA gaps */

#define XFS_SB_FEAT_INCOMPAT_ALL \
	(XFS_SB_FEAT_INCOMPAT_FTYPE | \
	 XFS_SB_FEAT_INCOMPAT_SPINODES | \
@@ -404,7 +409,9 @@ xfs_sb_has_ro_compat_feature(
	 XFS_SB_FEAT_INCOMPAT_NREXT64 | \
	 XFS_SB_FEAT_INCOMPAT_EXCHRANGE | \
	 XFS_SB_FEAT_INCOMPAT_PARENT | \
	 XFS_SB_FEAT_INCOMPAT_METADIR)
	 XFS_SB_FEAT_INCOMPAT_METADIR | \
	 XFS_SB_FEAT_INCOMPAT_ZONED | \
	 XFS_SB_FEAT_INCOMPAT_ZONE_GAPS)

#define XFS_SB_FEAT_INCOMPAT_UNKNOWN	~XFS_SB_FEAT_INCOMPAT_ALL
static inline bool
@@ -952,7 +959,12 @@ struct xfs_dinode {
	__be64		di_changecount;	/* number of attribute changes */
	__be64		di_lsn;		/* flush sequence */
	__be64		di_flags2;	/* more random flags */
	__be32		di_cowextsize;	/* basic cow extent size for file */
	union {
		/* basic cow extent size for (regular) file */
		__be32	di_cowextsize;
		/* used blocks in RTG for (zoned) rtrmap inode */
		__be32	di_used_blocks;
	};
	__u8		di_pad2[12];	/* more padding for future expansion */

	/* fields only written to during inode creation */

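For illustration, a hedged sketch of how the new incompat bits would be consumed, assuming the existing xfs_sb_has_incompat_feature() helper; only the flag names are taken from the hunk above:

	/* Assumed helper; the flag values come from the definitions above. */
	static inline bool my_sb_is_zoned(struct xfs_sb *sbp)
	{
		return xfs_sb_has_incompat_feature(sbp,
				XFS_SB_FEAT_INCOMPAT_ZONED);
	}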
@@ -189,7 +189,9 @@ struct xfs_fsop_geom {
	uint32_t	checked;	/* o: checked fs & rt metadata */
	__u32		rgextents;	/* rt extents in a realtime group */
	__u32		rgcount;	/* number of realtime groups */
	__u64		reserved[16];	/* reserved space */
	__u64		rtstart;	/* start of internal rt section */
	__u64		rtreserved;	/* RT (zoned) reserved blocks */
	__u64		reserved[14];	/* reserved space */
};

#define XFS_FSOP_GEOM_SICK_COUNTERS	(1 << 0)  /* summary counters */
@@ -247,6 +249,7 @@ typedef struct xfs_fsop_resblks {
#define XFS_FSOP_GEOM_FLAGS_EXCHANGE_RANGE	(1 << 24) /* exchange range */
#define XFS_FSOP_GEOM_FLAGS_PARENT		(1 << 25) /* linux parent pointers */
#define XFS_FSOP_GEOM_FLAGS_METADIR		(1 << 26) /* metadata directories */
#define XFS_FSOP_GEOM_FLAGS_ZONED		(1 << 27) /* zoned rt device */

/*
 * Minimum and maximum sizes need for growth checks.
@@ -1079,6 +1082,15 @@ struct xfs_rtgroup_geometry {
#define XFS_IOC_COMMIT_RANGE	_IOW ('X', 131, struct xfs_commit_range)
/*	XFS_IOC_GETFSUUID ---------- deprecated 140	 */

/*
 * Devices supported by a single XFS file system. Reported in fsmaps fmr_device
 * when using internal RT devices.
 */
enum xfs_device {
	XFS_DEV_DATA	= 1,
	XFS_DEV_LOG	= 2,
	XFS_DEV_RT	= 3,
};

#ifndef HAVE_BBMACROS
/*

@@ -19,10 +19,23 @@ struct xfs_group {
#ifdef __KERNEL__
	/* -- kernel only structures below this line -- */

	/*
	 * Track freed but not yet committed extents.
	 */
	struct xfs_extent_busy_tree *xg_busy_extents;
	union {
		/*
		 * For perags and non-zoned RT groups:
		 * Track freed but not yet committed extents.
		 */
		struct xfs_extent_busy_tree *xg_busy_extents;

		/*
		 * For zoned RT groups:
		 * List of groups that need a zone reset.
		 *
		 * The zonegc code forces a log flush of the rtrmap inode before
		 * resetting the write pointer, so there is no need for
		 * individual busy extent tracking.
		 */
		struct xfs_group *xg_next_reset;
	};

	/*
	 * Bitsets of per-ag metadata that have been checked and/or are sick.
@@ -107,9 +120,15 @@ xfs_gbno_to_daddr(
	xfs_agblock_t		gbno)
{
	struct xfs_mount	*mp = xg->xg_mount;
	uint32_t		blocks = mp->m_groups[xg->xg_type].blocks;
	struct xfs_groups	*g = &mp->m_groups[xg->xg_type];
	xfs_fsblock_t		fsbno;

	return XFS_FSB_TO_BB(mp, (xfs_fsblock_t)xg->xg_gno * blocks + gbno);
	if (g->has_daddr_gaps)
		fsbno = xfs_gbno_to_fsb(xg, gbno);
	else
		fsbno = (xfs_fsblock_t)xg->xg_gno * g->blocks + gbno;

	return XFS_FSB_TO_BB(mp, g->start_fsb + fsbno);
}

static inline uint32_t

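To make the two layouts concrete, a commented restatement of the translation above (a sketch, not a replacement; xfs_gbno_to_fsb() is assumed to exist as referenced in the hunk):

	static inline xfs_daddr_t my_gbno_to_daddr(struct xfs_group *xg,
			xfs_agblock_t gbno)
	{
		struct xfs_mount	*mp = xg->xg_mount;
		struct xfs_groups	*g = &mp->m_groups[xg->xg_type];
		xfs_fsblock_t		fsbno;

		if (g->has_daddr_gaps)
			/* zoned: groups sit at a fixed LBA stride with gaps */
			fsbno = xfs_gbno_to_fsb(xg, gbno);
		else
			/* dense: groups are packed back to back */
			fsbno = (xfs_fsblock_t)xg->xg_gno * g->blocks + gbno;

		/* both forms are offset by the start of the section */
		return XFS_FSB_TO_BB(mp, g->start_fsb + fsbno);
	}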
@@ -1927,7 +1927,7 @@ xfs_dialloc(
	 * that we can immediately allocate, but then we allow allocation on the
	 * second pass if we fail to find an AG with free inodes in it.
	 */
	if (percpu_counter_read_positive(&mp->m_fdblocks) <
	if (xfs_estimate_freecounter(mp, XC_FREE_BLOCKS) <
			mp->m_low_space[XFS_LOWSP_1_PCNT]) {
		ok_alloc = false;
		low_space = true;

@@ -252,7 +252,10 @@ xfs_inode_from_disk(
				be64_to_cpu(from->di_changecount));
		ip->i_crtime = xfs_inode_from_disk_ts(from, from->di_crtime);
		ip->i_diflags2 = be64_to_cpu(from->di_flags2);
		/* also covers the di_used_blocks union arm: */
		ip->i_cowextsize = be32_to_cpu(from->di_cowextsize);
		BUILD_BUG_ON(sizeof(from->di_cowextsize) !=
			     sizeof(from->di_used_blocks));
	}

	error = xfs_iformat_data_fork(ip, from);
@@ -349,6 +352,7 @@ xfs_inode_to_disk(
		to->di_changecount = cpu_to_be64(inode_peek_iversion(inode));
		to->di_crtime = xfs_inode_to_disk_ts(ip, ip->i_crtime);
		to->di_flags2 = cpu_to_be64(ip->i_diflags2);
		/* also covers the di_used_blocks union arm: */
		to->di_cowextsize = cpu_to_be32(ip->i_cowextsize);
		to->di_ino = cpu_to_be64(ip->i_ino);
		to->di_lsn = cpu_to_be64(lsn);
@@ -752,11 +756,18 @@ xfs_dinode_verify(
	    !xfs_has_rtreflink(mp))
		return __this_address;

	/* COW extent size hint validation */
	fa = xfs_inode_validate_cowextsize(mp, be32_to_cpu(dip->di_cowextsize),
			mode, flags, flags2);
	if (fa)
		return fa;
	if (xfs_has_zoned(mp) &&
	    dip->di_metatype == cpu_to_be16(XFS_METAFILE_RTRMAP)) {
		if (be32_to_cpu(dip->di_used_blocks) > mp->m_sb.sb_rgextents)
			return __this_address;
	} else {
		/* COW extent size hint validation */
		fa = xfs_inode_validate_cowextsize(mp,
				be32_to_cpu(dip->di_cowextsize),
				mode, flags, flags2);
		if (fa)
			return fa;
	}

	/* bigtime iflag can only happen on bigtime filesystems */
	if (xfs_dinode_has_bigtime(dip) &&

@@ -322,6 +322,7 @@ xfs_inode_init(

	if (xfs_has_v3inodes(mp)) {
		inode_set_iversion(inode, 1);
		/* also covers the di_used_blocks union arm: */
		ip->i_cowextsize = 0;
		times |= XFS_ICHGTIME_CREATE;
	}

@@ -475,7 +475,12 @@ struct xfs_log_dinode {
	xfs_lsn_t	di_lsn;

	uint64_t	di_flags2;	/* more random flags */
	uint32_t	di_cowextsize;	/* basic cow extent size for file */
	union {
		/* basic cow extent size for (regular) file */
		uint32_t	di_cowextsize;
		/* used blocks in RTG for (zoned) rtrmap inode */
		uint32_t	di_used_blocks;
	};
	uint8_t		di_pad2[12];	/* more padding for future expansion */

	/* fields only written to during inode creation */

@@ -21,6 +21,9 @@
#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_alloc.h"
#include "xfs_rtgroup.h"
#include "xfs_rtrmap_btree.h"
#include "xfs_rtrefcount_btree.h"

static const struct {
	enum xfs_metafile_type	mtype;
@@ -74,12 +77,11 @@ xfs_metafile_clear_iflag(
}

/*
 * Is the amount of space that could be allocated towards a given metadata
 * file at or beneath a certain threshold?
 * Is the metafile reservations at or beneath a certain threshold?
 */
static inline bool
xfs_metafile_resv_can_cover(
	struct xfs_inode	*ip,
	struct xfs_mount	*mp,
	int64_t			rhs)
{
	/*
@@ -88,43 +90,38 @@ xfs_metafile_resv_can_cover(
	 * global free block count. Take care of the first case to avoid
	 * touching the per-cpu counter.
	 */
	if (ip->i_delayed_blks >= rhs)
	if (mp->m_metafile_resv_avail >= rhs)
		return true;

	/*
	 * There aren't enough blocks left in the inode's reservation, but it
	 * isn't critical unless there also isn't enough free space.
	 */
	return __percpu_counter_compare(&ip->i_mount->m_fdblocks,
			rhs - ip->i_delayed_blks, 2048) >= 0;
	return xfs_compare_freecounter(mp, XC_FREE_BLOCKS,
			rhs - mp->m_metafile_resv_avail, 2048) >= 0;
}

/*
 * Is this metadata file critically low on blocks? For now we'll define that
 * as the number of blocks we can get our hands on being less than 10% of what
 * we reserved or less than some arbitrary number (maximum btree height).
 * Is the metafile reservation critically low on blocks? For now we'll define
 * that as the number of blocks we can get our hands on being less than 10% of
 * what we reserved or less than some arbitrary number (maximum btree height).
 */
bool
xfs_metafile_resv_critical(
	struct xfs_inode	*ip)
	struct xfs_mount	*mp)
{
	uint64_t		asked_low_water;
	ASSERT(xfs_has_metadir(mp));

	if (!ip)
		return false;
	trace_xfs_metafile_resv_critical(mp, 0);

	ASSERT(xfs_is_metadir_inode(ip));
	trace_xfs_metafile_resv_critical(ip, 0);

	if (!xfs_metafile_resv_can_cover(ip, ip->i_mount->m_rtbtree_maxlevels))
	if (!xfs_metafile_resv_can_cover(mp, mp->m_rtbtree_maxlevels))
		return true;

	asked_low_water = div_u64(ip->i_meta_resv_asked, 10);
	if (!xfs_metafile_resv_can_cover(ip, asked_low_water))
	if (!xfs_metafile_resv_can_cover(mp,
			div_u64(mp->m_metafile_resv_target, 10)))
		return true;

	return XFS_TEST_ERROR(false, ip->i_mount,
			XFS_ERRTAG_METAFILE_RESV_CRITICAL);
	return XFS_TEST_ERROR(false, mp, XFS_ERRTAG_METAFILE_RESV_CRITICAL);
}

/* Allocate a block from the metadata file's reservation. */
@@ -133,22 +130,24 @@ xfs_metafile_resv_alloc_space(
	struct xfs_inode	*ip,
	struct xfs_alloc_arg	*args)
{
	struct xfs_mount	*mp = ip->i_mount;
	int64_t			len = args->len;

	ASSERT(xfs_is_metadir_inode(ip));
	ASSERT(args->resv == XFS_AG_RESV_METAFILE);

	trace_xfs_metafile_resv_alloc_space(ip, args->len);
	trace_xfs_metafile_resv_alloc_space(mp, args->len);

	/*
	 * Allocate the blocks from the metadata inode's block reservation
	 * and update the ondisk sb counter.
	 */
	if (ip->i_delayed_blks > 0) {
	mutex_lock(&mp->m_metafile_resv_lock);
	if (mp->m_metafile_resv_avail > 0) {
		int64_t		from_resv;

		from_resv = min_t(int64_t, len, ip->i_delayed_blks);
		ip->i_delayed_blks -= from_resv;
		from_resv = min_t(int64_t, len, mp->m_metafile_resv_avail);
		mp->m_metafile_resv_avail -= from_resv;
		xfs_mod_delalloc(ip, 0, -from_resv);
		xfs_trans_mod_sb(args->tp, XFS_TRANS_SB_RES_FDBLOCKS,
				-from_resv);
@@ -175,6 +174,9 @@ xfs_metafile_resv_alloc_space(
		xfs_trans_mod_sb(args->tp, field, -len);
	}

	mp->m_metafile_resv_used += args->len;
	mutex_unlock(&mp->m_metafile_resv_lock);

	ip->i_nblocks += args->len;
	xfs_trans_log_inode(args->tp, ip, XFS_ILOG_CORE);
}
@@ -186,26 +188,33 @@ xfs_metafile_resv_free_space(
	struct xfs_trans	*tp,
	xfs_filblks_t		len)
{
	struct xfs_mount	*mp = ip->i_mount;
	int64_t			to_resv;

	ASSERT(xfs_is_metadir_inode(ip));
	trace_xfs_metafile_resv_free_space(ip, len);

	trace_xfs_metafile_resv_free_space(mp, len);

	ip->i_nblocks -= len;
	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);

	mutex_lock(&mp->m_metafile_resv_lock);
	mp->m_metafile_resv_used -= len;

	/*
	 * Add the freed blocks back into the inode's delalloc reservation
	 * until it reaches the maximum size. Update the ondisk fdblocks only.
	 */
	to_resv = ip->i_meta_resv_asked - (ip->i_nblocks + ip->i_delayed_blks);
	to_resv = mp->m_metafile_resv_target -
		(mp->m_metafile_resv_used + mp->m_metafile_resv_avail);
	if (to_resv > 0) {
		to_resv = min_t(int64_t, to_resv, len);
		ip->i_delayed_blks += to_resv;
		mp->m_metafile_resv_avail += to_resv;
		xfs_mod_delalloc(ip, 0, to_resv);
		xfs_trans_mod_sb(tp, XFS_TRANS_SB_RES_FDBLOCKS, to_resv);
		len -= to_resv;
	}
	mutex_unlock(&mp->m_metafile_resv_lock);

	/*
	 * Everything else goes back to the filesystem, so update the in-core
@@ -215,61 +224,99 @@ xfs_metafile_resv_free_space(
	xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, len);
}

/* Release a metadata file's space reservation. */
void
xfs_metafile_resv_free(
	struct xfs_inode	*ip)
static void
__xfs_metafile_resv_free(
	struct xfs_mount	*mp)
{
	/* Non-btree metadata inodes don't need space reservations. */
	if (!ip || !ip->i_meta_resv_asked)
		return;

	ASSERT(xfs_is_metadir_inode(ip));
	trace_xfs_metafile_resv_free(ip, 0);

	if (ip->i_delayed_blks) {
		xfs_mod_delalloc(ip, 0, -ip->i_delayed_blks);
		xfs_add_fdblocks(ip->i_mount, ip->i_delayed_blks);
		ip->i_delayed_blks = 0;
	if (mp->m_metafile_resv_avail) {
		xfs_mod_sb_delalloc(mp, -(int64_t)mp->m_metafile_resv_avail);
		xfs_add_fdblocks(mp, mp->m_metafile_resv_avail);
	}
	ip->i_meta_resv_asked = 0;
	mp->m_metafile_resv_avail = 0;
	mp->m_metafile_resv_used = 0;
	mp->m_metafile_resv_target = 0;
}

/* Set up a metadata file's space reservation. */
/* Release unused metafile space reservation. */
void
xfs_metafile_resv_free(
	struct xfs_mount	*mp)
{
	if (!xfs_has_metadir(mp))
		return;

	trace_xfs_metafile_resv_free(mp, 0);

	mutex_lock(&mp->m_metafile_resv_lock);
	__xfs_metafile_resv_free(mp);
	mutex_unlock(&mp->m_metafile_resv_lock);
}

/* Set up a metafile space reservation. */
int
xfs_metafile_resv_init(
	struct xfs_inode	*ip,
	xfs_filblks_t		ask)
	struct xfs_mount	*mp)
{
	struct xfs_rtgroup	*rtg = NULL;
	xfs_filblks_t		used = 0, target = 0;
	xfs_filblks_t		hidden_space;
	xfs_filblks_t		used;
	int			error;
	xfs_rfsblock_t		dblocks_avail = mp->m_sb.sb_dblocks / 4;
	int			error = 0;

	if (!ip || ip->i_meta_resv_asked > 0)
	if (!xfs_has_metadir(mp))
		return 0;

	ASSERT(xfs_is_metadir_inode(ip));
	/*
	 * Free any previous reservation to have a clean slate.
	 */
	mutex_lock(&mp->m_metafile_resv_lock);
	__xfs_metafile_resv_free(mp);

	/*
	 * Space taken by all other metadata btrees are accounted on-disk as
	 * Currently the only btree metafiles that require reservations are the
	 * rtrmap and the rtrefcount. Anything new will have to be added here
	 * as well.
	 */
	while ((rtg = xfs_rtgroup_next(mp, rtg))) {
		if (xfs_has_rtrmapbt(mp)) {
			used += rtg_rmap(rtg)->i_nblocks;
			target += xfs_rtrmapbt_calc_reserves(mp);
		}
		if (xfs_has_rtreflink(mp)) {
			used += rtg_refcount(rtg)->i_nblocks;
			target += xfs_rtrefcountbt_calc_reserves(mp);
		}
	}

	if (!target)
		goto out_unlock;

	/*
	 * Space taken by the per-AG metadata btrees are accounted on-disk as
	 * used space. We therefore only hide the space that is reserved but
	 * not used by the trees.
	 */
	used = ip->i_nblocks;
	if (used > ask)
		ask = used;
	hidden_space = ask - used;
	if (used > target)
		target = used;
	else if (target > dblocks_avail)
		target = dblocks_avail;
	hidden_space = target - used;

	error = xfs_dec_fdblocks(ip->i_mount, hidden_space, true);
	error = xfs_dec_fdblocks(mp, hidden_space, true);
	if (error) {
		trace_xfs_metafile_resv_init_error(ip, error, _RET_IP_);
		return error;
		trace_xfs_metafile_resv_init_error(mp, 0);
		goto out_unlock;
	}

	xfs_mod_delalloc(ip, 0, hidden_space);
	ip->i_delayed_blks = hidden_space;
	ip->i_meta_resv_asked = ask;
	xfs_mod_sb_delalloc(mp, hidden_space);

	trace_xfs_metafile_resv_init(ip, ask);
	return 0;
	mp->m_metafile_resv_target = target;
	mp->m_metafile_resv_used = used;
	mp->m_metafile_resv_avail = hidden_space;

	trace_xfs_metafile_resv_init(mp, target);

out_unlock:
	mutex_unlock(&mp->m_metafile_resv_lock);
	return error;
}

@@ -26,13 +26,13 @@ void xfs_metafile_clear_iflag(struct xfs_trans *tp, struct xfs_inode *ip);
/* Space reservations for metadata inodes. */
struct xfs_alloc_arg;

bool xfs_metafile_resv_critical(struct xfs_inode *ip);
bool xfs_metafile_resv_critical(struct xfs_mount *mp);
void xfs_metafile_resv_alloc_space(struct xfs_inode *ip,
		struct xfs_alloc_arg *args);
void xfs_metafile_resv_free_space(struct xfs_inode *ip, struct xfs_trans *tp,
		xfs_filblks_t len);
void xfs_metafile_resv_free(struct xfs_inode *ip);
int xfs_metafile_resv_init(struct xfs_inode *ip, xfs_filblks_t ask);
void xfs_metafile_resv_free(struct xfs_mount *mp);
int xfs_metafile_resv_init(struct xfs_mount *mp);

/* Code specific to kernel/userspace; must be provided externally. */

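A sketch of the mount-wide lifecycle implied by the new prototypes, with hypothetical call sites; only the xfs_metafile_resv_* calls (and xfs_warn()) are real:

	static int my_mount_metafile_resv(struct xfs_mount *mp)
	{
		int	error;

		/* Sizes one reservation for all rtgroup btree metafiles. */
		error = xfs_metafile_resv_init(mp);
		if (error)
			return error;

		if (xfs_metafile_resv_critical(mp))
			xfs_warn(mp, "metafile reservation critically low");
		return 0;
	}

	static void my_unmount_metafile_resv(struct xfs_mount *mp)
	{
		/* Returns any unused reserved blocks to fdblocks. */
		xfs_metafile_resv_free(mp);
	}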
@@ -233,8 +233,8 @@ xfs_check_ondisk_structs(void)
			16299260424LL);

	/* superblock field checks we got from xfs/122 */
	XFS_CHECK_STRUCT_SIZE(struct xfs_dsb,		288);
	XFS_CHECK_STRUCT_SIZE(struct xfs_sb,		288);
	XFS_CHECK_STRUCT_SIZE(struct xfs_dsb,		304);
	XFS_CHECK_STRUCT_SIZE(struct xfs_sb,		304);
	XFS_CHECK_SB_OFFSET(sb_magicnum,		0);
	XFS_CHECK_SB_OFFSET(sb_blocksize,		4);
	XFS_CHECK_SB_OFFSET(sb_dblocks,			8);
@@ -295,6 +295,8 @@ xfs_check_ondisk_structs(void)
	XFS_CHECK_SB_OFFSET(sb_rgextents,		276);
	XFS_CHECK_SB_OFFSET(sb_rgblklog,		280);
	XFS_CHECK_SB_OFFSET(sb_pad,			281);
	XFS_CHECK_SB_OFFSET(sb_rtstart,			288);
	XFS_CHECK_SB_OFFSET(sb_rtreserved,		296);
}

#endif /* __XFS_ONDISK_H */

@@ -1123,6 +1123,7 @@ xfs_rtfree_blocks(
	xfs_extlen_t		mod;
	int			error;

	ASSERT(!xfs_has_zoned(mp));
	ASSERT(rtlen <= XFS_MAX_BMBT_EXTLEN);

	mod = xfs_blen_to_rtxoff(mp, rtlen);
@@ -1174,6 +1175,9 @@ xfs_rtalloc_query_range(

	end = min(end, rtg->rtg_extents - 1);

	if (xfs_has_zoned(mp))
		return -EINVAL;

	/* Iterate the bitmap, looking for discrepancies. */
	while (start <= end) {
		struct xfs_rtalloc_rec rec;
@@ -1268,6 +1272,8 @@ xfs_rtbitmap_blockcount_len(
	struct xfs_mount	*mp,
	xfs_rtbxlen_t		rtextents)
{
	if (xfs_has_zoned(mp))
		return 0;
	return howmany_64(rtextents, xfs_rtbitmap_rtx_per_rbmblock(mp));
}

@@ -1308,6 +1314,11 @@ xfs_rtsummary_blockcount(
	xfs_rtbxlen_t		rextents = xfs_rtbitmap_bitcount(mp);
	unsigned long long	rsumwords;

	if (xfs_has_zoned(mp)) {
		*rsumlevels = 0;
		return 0;
	}

	*rsumlevels = xfs_compute_rextslog(rextents) + 1;
	rsumwords = xfs_rtbitmap_blockcount_len(mp, rextents) * (*rsumlevels);
	return howmany_64(rsumwords, mp->m_blockwsize);

@@ -194,15 +194,17 @@ xfs_rtgroup_lock(
	ASSERT(!(rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) ||
	       !(rtglock_flags & XFS_RTGLOCK_BITMAP));

	if (rtglock_flags & XFS_RTGLOCK_BITMAP) {
		/*
		 * Lock both realtime free space metadata inodes for a freespace
		 * update.
		 */
		xfs_ilock(rtg_bitmap(rtg), XFS_ILOCK_EXCL);
		xfs_ilock(rtg_summary(rtg), XFS_ILOCK_EXCL);
	} else if (rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) {
		xfs_ilock(rtg_bitmap(rtg), XFS_ILOCK_SHARED);
	if (!xfs_has_zoned(rtg_mount(rtg))) {
		if (rtglock_flags & XFS_RTGLOCK_BITMAP) {
			/*
			 * Lock both realtime free space metadata inodes for a
			 * freespace update.
			 */
			xfs_ilock(rtg_bitmap(rtg), XFS_ILOCK_EXCL);
			xfs_ilock(rtg_summary(rtg), XFS_ILOCK_EXCL);
		} else if (rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) {
			xfs_ilock(rtg_bitmap(rtg), XFS_ILOCK_SHARED);
		}
	}

	if ((rtglock_flags & XFS_RTGLOCK_RMAP) && rtg_rmap(rtg))
@@ -228,11 +230,13 @@ xfs_rtgroup_unlock(
	if ((rtglock_flags & XFS_RTGLOCK_RMAP) && rtg_rmap(rtg))
		xfs_iunlock(rtg_rmap(rtg), XFS_ILOCK_EXCL);

	if (rtglock_flags & XFS_RTGLOCK_BITMAP) {
		xfs_iunlock(rtg_summary(rtg), XFS_ILOCK_EXCL);
		xfs_iunlock(rtg_bitmap(rtg), XFS_ILOCK_EXCL);
	} else if (rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) {
		xfs_iunlock(rtg_bitmap(rtg), XFS_ILOCK_SHARED);
	if (!xfs_has_zoned(rtg_mount(rtg))) {
		if (rtglock_flags & XFS_RTGLOCK_BITMAP) {
			xfs_iunlock(rtg_summary(rtg), XFS_ILOCK_EXCL);
			xfs_iunlock(rtg_bitmap(rtg), XFS_ILOCK_EXCL);
		} else if (rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) {
			xfs_iunlock(rtg_bitmap(rtg), XFS_ILOCK_SHARED);
		}
	}
}

@@ -249,7 +253,8 @@ xfs_rtgroup_trans_join(
	ASSERT(!(rtglock_flags & ~XFS_RTGLOCK_ALL_FLAGS));
	ASSERT(!(rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED));

	if (rtglock_flags & XFS_RTGLOCK_BITMAP) {
	if (!xfs_has_zoned(rtg_mount(rtg)) &&
	    (rtglock_flags & XFS_RTGLOCK_BITMAP)) {
		xfs_trans_ijoin(tp, rtg_bitmap(rtg), XFS_ILOCK_EXCL);
		xfs_trans_ijoin(tp, rtg_summary(rtg), XFS_ILOCK_EXCL);
	}
@@ -270,7 +275,7 @@ xfs_rtgroup_get_geometry(
	/* Fill out form. */
	memset(rgeo, 0, sizeof(*rgeo));
	rgeo->rg_number = rtg_rgno(rtg);
	rgeo->rg_length = rtg_group(rtg)->xg_block_count;
	rgeo->rg_length = rtg_blocks(rtg);
	xfs_rtgroup_geom_health(rtg, rgeo);
	return 0;
}
@@ -354,6 +359,7 @@ static const struct xfs_rtginode_ops xfs_rtginode_ops[XFS_RTGI_MAX] = {
		.sick		= XFS_SICK_RG_BITMAP,
		.fmt_mask	= (1U << XFS_DINODE_FMT_EXTENTS) |
				  (1U << XFS_DINODE_FMT_BTREE),
		.enabled	= xfs_has_nonzoned,
		.create		= xfs_rtbitmap_create,
	},
	[XFS_RTGI_SUMMARY] = {
@@ -362,6 +368,7 @@ static const struct xfs_rtginode_ops xfs_rtginode_ops[XFS_RTGI_MAX] = {
		.sick		= XFS_SICK_RG_SUMMARY,
		.fmt_mask	= (1U << XFS_DINODE_FMT_EXTENTS) |
				  (1U << XFS_DINODE_FMT_BTREE),
		.enabled	= xfs_has_nonzoned,
		.create		= xfs_rtsummary_create,
	},
	[XFS_RTGI_RMAP] = {

@@ -37,15 +37,33 @@ struct xfs_rtgroup {
|
||||
xfs_rtxnum_t rtg_extents;
|
||||
|
||||
/*
|
||||
* Cache of rt summary level per bitmap block with the invariant that
|
||||
* rtg_rsum_cache[bbno] > the maximum i for which rsum[i][bbno] != 0,
|
||||
* or 0 if rsum[i][bbno] == 0 for all i.
|
||||
*
|
||||
* For bitmap based RT devices this points to a cache of rt summary
|
||||
* level per bitmap block with the invariant that rtg_rsum_cache[bbno]
|
||||
* > the maximum i for which rsum[i][bbno] != 0, or 0 if
|
||||
* rsum[i][bbno] == 0 for all i.
|
||||
* Reads and writes are serialized by the rsumip inode lock.
|
||||
*
|
||||
* For zoned RT devices this points to the open zone structure for
|
||||
* a group that is open for writers, or is NULL.
|
||||
*/
|
||||
uint8_t *rtg_rsum_cache;
|
||||
union {
|
||||
uint8_t *rtg_rsum_cache;
|
||||
struct xfs_open_zone *rtg_open_zone;
|
||||
};
|
||||
};
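
Because a mount is either zoned or bitmap-based, the two pointers can never be live at the same time, which is what makes overlaying them in a union safe. An illustrative accessor (not part of the patch) keyed off the mount feature flag:

static inline void *example_rtg_private(struct xfs_rtgroup *rtg)
{
	if (xfs_has_zoned(rtg_mount(rtg)))
		return rtg->rtg_open_zone;	/* zoned: open zone state */
	return rtg->rtg_rsum_cache;		/* bitmap: rsum level cache */
}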

/*
* For zoned RT devices this is set on groups that have no written blocks
* and can be picked by the allocator for opening.
*/
#define XFS_RTG_FREE XA_MARK_0

/*
* For zoned RT devices this is set on groups that are fully written and that
* have unused blocks. Used by the garbage collection to pick targets.
*/
#define XFS_RTG_RECLAIMABLE XA_MARK_1
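
XA_MARK_0 and XA_MARK_1 are per-entry tag bits in the xarray holding the groups, so the allocator and GC can find free or reclaimable zones without walking every group. A hypothetical scan using the plain xarray API (the real XFS lookup helpers may differ):

static struct xfs_rtgroup *example_pick_free_zone(struct xarray *groups)
{
	struct xfs_rtgroup *rtg;
	unsigned long index;

	xa_for_each_marked(groups, index, rtg, XFS_RTG_FREE)
		return rtg;	/* first fully empty group wins */
	return NULL;
}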

static inline struct xfs_rtgroup *to_rtg(struct xfs_group *xg)
{
return container_of(xg, struct xfs_rtgroup, rtg_group);
@@ -66,6 +84,11 @@ static inline xfs_rgnumber_t rtg_rgno(const struct xfs_rtgroup *rtg)
return rtg->rtg_group.xg_gno;
}

static inline xfs_rgblock_t rtg_blocks(const struct xfs_rtgroup *rtg)
{
return rtg->rtg_group.xg_block_count;
}

static inline struct xfs_inode *rtg_bitmap(const struct xfs_rtgroup *rtg)
{
return rtg->rtg_inodes[XFS_RTGI_BITMAP];
@@ -222,10 +245,14 @@ xfs_rtb_to_daddr(
xfs_rtblock_t rtbno)
{
struct xfs_groups *g = &mp->m_groups[XG_TYPE_RTG];
xfs_rgnumber_t rgno = xfs_rtb_to_rgno(mp, rtbno);
uint64_t start_bno = (xfs_rtblock_t)rgno * g->blocks;

return XFS_FSB_TO_BB(mp, start_bno + (rtbno & g->blkmask));
if (xfs_has_rtgroups(mp) && !g->has_daddr_gaps) {
xfs_rgnumber_t rgno = xfs_rtb_to_rgno(mp, rtbno);

rtbno = (xfs_rtblock_t)rgno * g->blocks + (rtbno & g->blkmask);
}

return XFS_FSB_TO_BB(mp, g->start_fsb + rtbno);
}
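
The rewritten helper translates the sparse, power-of-two spaced rt block number into a dense address unless the device really has gaps between zones. A worked example with assumed geometry: g->blocks = 100 usable blocks per group, g->blklog = 7 (groups spaced 128 blocks apart in rtbno space), start_fsb = 0. Then rtbno 135 (group 1, offset 7) becomes 1 * 100 + 7 = 107 before the XFS_FSB_TO_BB() conversion; with has_daddr_gaps set, 135 would be used unchanged.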

static inline xfs_rtblock_t
@@ -233,10 +260,11 @@ xfs_daddr_to_rtb(
struct xfs_mount *mp,
xfs_daddr_t daddr)
{
xfs_rfsblock_t bno = XFS_BB_TO_FSBT(mp, daddr);
struct xfs_groups *g = &mp->m_groups[XG_TYPE_RTG];
xfs_rfsblock_t bno;

if (xfs_has_rtgroups(mp)) {
struct xfs_groups *g = &mp->m_groups[XG_TYPE_RTG];
bno = XFS_BB_TO_FSBT(mp, daddr) - g->start_fsb;
if (xfs_has_rtgroups(mp) && !g->has_daddr_gaps) {
xfs_rgnumber_t rgno;
uint32_t rgbno;

@@ -1033,3 +1033,22 @@ xfs_rtrmapbt_init_rtsb(
xfs_btree_del_cursor(cur, error);
return error;
}

/*
* Return the highest rgbno currently tracked by the rmap for this rtg.
*/
xfs_rgblock_t
xfs_rtrmap_highest_rgbno(
struct xfs_rtgroup *rtg)
{
struct xfs_btree_block *block = rtg_rmap(rtg)->i_df.if_broot;
union xfs_btree_key key = {};
struct xfs_btree_cur *cur;

if (block->bb_numrecs == 0)
return NULLRGBLOCK;
cur = xfs_rtrmapbt_init_cursor(NULL, rtg);
xfs_btree_get_keys(cur, block, &key);
xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
return be32_to_cpu(key.__rmap_bigkey[1].rm_startblock);
}

@@ -207,4 +207,6 @@ struct xfs_btree_cur *xfs_rtrmapbt_mem_cursor(struct xfs_rtgroup *rtg,
int xfs_rtrmapbt_mem_init(struct xfs_mount *mp, struct xfbtree *xfbtree,
struct xfs_buftarg *btp, xfs_rgnumber_t rgno);

xfs_rgblock_t xfs_rtrmap_highest_rgbno(struct xfs_rtgroup *rtg);

#endif /* __XFS_RTRMAP_BTREE_H__ */

@@ -30,6 +30,7 @@
#include "xfs_rtgroup.h"
#include "xfs_rtrmap_btree.h"
#include "xfs_rtrefcount_btree.h"
#include "xfs_rtbitmap.h"

/*
* Physical superblock buffer manipulations. Shared with libxfs in userspace.
@@ -185,6 +186,8 @@ xfs_sb_version_to_features(
features |= XFS_FEAT_PARENT;
if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR)
features |= XFS_FEAT_METADIR;
if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED)
features |= XFS_FEAT_ZONED;

return features;
}
@@ -266,6 +269,9 @@ static uint64_t
xfs_expected_rbmblocks(
struct xfs_sb *sbp)
{
if (xfs_sb_is_v5(sbp) &&
(sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED))
return 0;
return howmany_64(xfs_extents_per_rbm(sbp),
NBBY * xfs_rtbmblock_size(sbp));
}
@@ -275,9 +281,15 @@ bool
xfs_validate_rt_geometry(
struct xfs_sb *sbp)
{
if (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE ||
sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE)
return false;
if (xfs_sb_is_v5(sbp) &&
(sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED)) {
if (sbp->sb_rextsize != 1)
return false;
} else {
if (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE ||
sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE)
return false;
}

if (sbp->sb_rblocks == 0) {
if (sbp->sb_rextents != 0 || sbp->sb_rbmblocks != 0 ||
@@ -435,6 +447,34 @@ xfs_validate_sb_rtgroups(
return 0;
}

static int
xfs_validate_sb_zoned(
struct xfs_mount *mp,
struct xfs_sb *sbp)
{
if (sbp->sb_frextents != 0) {
xfs_warn(mp,
"sb_frextents must be zero for zoned file systems.");
return -EINVAL;
}

if (sbp->sb_rtstart && sbp->sb_rtstart < sbp->sb_dblocks) {
xfs_warn(mp,
"sb_rtstart (%lld) overlaps sb_dblocks (%lld).",
sbp->sb_rtstart, sbp->sb_dblocks);
return -EINVAL;
}

if (sbp->sb_rtreserved && sbp->sb_rtreserved >= sbp->sb_rblocks) {
xfs_warn(mp,
"sb_rtreserved (%lld) larger than sb_rblocks (%lld).",
sbp->sb_rtreserved, sbp->sb_rblocks);
return -EINVAL;
}

return 0;
}

/* Check the validity of the SB. */
STATIC int
xfs_validate_sb_common(
@@ -523,6 +563,11 @@ xfs_validate_sb_common(
if (error)
return error;
}
if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED) {
error = xfs_validate_sb_zoned(mp, sbp);
if (error)
return error;
}
} else if (sbp->sb_qflags & (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD |
XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD)) {
xfs_notice(mp,
@@ -835,6 +880,14 @@ __xfs_sb_from_disk(
to->sb_rgcount = 1;
to->sb_rgextents = 0;
}

if (to->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED) {
to->sb_rtstart = be64_to_cpu(from->sb_rtstart);
to->sb_rtreserved = be64_to_cpu(from->sb_rtreserved);
} else {
to->sb_rtstart = 0;
to->sb_rtreserved = 0;
}
}

void
@@ -1001,6 +1054,11 @@ xfs_sb_to_disk(
to->sb_rbmino = cpu_to_be64(0);
to->sb_rsumino = cpu_to_be64(0);
}

if (from->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED) {
to->sb_rtstart = cpu_to_be64(from->sb_rtstart);
to->sb_rtreserved = cpu_to_be64(from->sb_rtreserved);
}
}

/*
@@ -1146,6 +1204,10 @@ xfs_sb_mount_rextsize(
rgs->blocks = sbp->sb_rgextents * sbp->sb_rextsize;
rgs->blklog = mp->m_sb.sb_rgblklog;
rgs->blkmask = xfs_mask32lo(mp->m_sb.sb_rgblklog);
rgs->start_fsb = mp->m_sb.sb_rtstart;
if (xfs_sb_has_incompat_feature(sbp,
XFS_SB_FEAT_INCOMPAT_ZONE_GAPS))
rgs->has_daddr_gaps = true;
} else {
rgs->blocks = 0;
rgs->blklog = 0;
@@ -1265,8 +1327,7 @@ xfs_log_sb(
mp->m_sb.sb_ifree = min_t(uint64_t,
percpu_counter_sum_positive(&mp->m_ifree),
mp->m_sb.sb_icount);
mp->m_sb.sb_fdblocks =
percpu_counter_sum_positive(&mp->m_fdblocks);
mp->m_sb.sb_fdblocks = xfs_sum_freecounter(mp, XC_FREE_BLOCKS);
}

/*
@@ -1275,9 +1336,10 @@ xfs_log_sb(
* we handle nearly-lockless reservations, so we must use the _positive
* variant here to avoid writing out nonsense frextents.
*/
if (xfs_has_rtgroups(mp))
if (xfs_has_rtgroups(mp) && !xfs_has_zoned(mp)) {
mp->m_sb.sb_frextents =
percpu_counter_sum_positive(&mp->m_frextents);
xfs_sum_freecounter(mp, XC_FREE_RTEXTENTS);
}

xfs_sb_to_disk(bp->b_addr, &mp->m_sb);
xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF);
@@ -1510,6 +1572,8 @@ xfs_fs_geometry(
geo->flags |= XFS_FSOP_GEOM_FLAGS_EXCHANGE_RANGE;
if (xfs_has_metadir(mp))
geo->flags |= XFS_FSOP_GEOM_FLAGS_METADIR;
if (xfs_has_zoned(mp))
geo->flags |= XFS_FSOP_GEOM_FLAGS_ZONED;
geo->rtsectsize = sbp->sb_blocksize;
geo->dirblocksize = xfs_dir2_dirblock_bytes(sbp);

@@ -1530,6 +1594,10 @@ xfs_fs_geometry(
geo->rgcount = sbp->sb_rgcount;
geo->rgextents = sbp->sb_rgextents;
}
if (xfs_has_zoned(mp)) {
geo->rtstart = sbp->sb_rtstart;
geo->rtreserved = sbp->sb_rtreserved;
}
}

/* Read a secondary superblock. */

@@ -233,6 +233,34 @@ enum xfs_group_type {
{ XG_TYPE_AG, "ag" }, \
{ XG_TYPE_RTG, "rtg" }

enum xfs_free_counter {
/*
* Number of free blocks on the data device.
*/
XC_FREE_BLOCKS,

/*
* Number of free RT extents on the RT device.
*/
XC_FREE_RTEXTENTS,

/*
* Number of RT extents available for use.
*
* This counter only exists for zoned RT devices and indicates the number
* of RT extents that can be directly used by writes. XC_FREE_RTEXTENTS
* also includes blocks that have been written previously and freed, but
* sit in a rtgroup that still needs a zone reset.
*/
XC_FREE_RTAVAILABLE,
XC_FREE_NR,
};

#define XFS_FREECOUNTER_STR \
{ XC_FREE_BLOCKS, "blocks" }, \
{ XC_FREE_RTEXTENTS, "rtextents" }, \
{ XC_FREE_RTAVAILABLE, "rtavailable" }
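
The relationship implied by the comments above: on a zoned mount every directly writable extent is also free, but not every free extent is writable until its zone is reset, so XC_FREE_RTAVAILABLE can never exceed XC_FREE_RTEXTENTS. A sketch of that invariant as an assertion (illustrative only; xfs_sum_freecounter is the summing helper used elsewhere in this series):

static inline void example_check_rt_counters(struct xfs_mount *mp)
{
	ASSERT(xfs_sum_freecounter(mp, XC_FREE_RTAVAILABLE) <=
	       xfs_sum_freecounter(mp, XC_FREE_RTEXTENTS));
}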

/*
* Type verifier functions
*/

fs/xfs/libxfs/xfs_zones.c (new file, 186 lines)
@@ -0,0 +1,186 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (c) 2023-2025 Christoph Hellwig.
* Copyright (c) 2024-2025, Western Digital Corporation or its affiliates.
*/
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_rtgroup.h"
#include "xfs_zones.h"

static bool
xfs_zone_validate_empty(
struct blk_zone *zone,
struct xfs_rtgroup *rtg,
xfs_rgblock_t *write_pointer)
{
struct xfs_mount *mp = rtg_mount(rtg);

if (rtg_rmap(rtg)->i_used_blocks > 0) {
xfs_warn(mp, "empty zone %u has non-zero used counter (0x%x).",
rtg_rgno(rtg), rtg_rmap(rtg)->i_used_blocks);
return false;
}

*write_pointer = 0;
return true;
}

static bool
xfs_zone_validate_wp(
struct blk_zone *zone,
struct xfs_rtgroup *rtg,
xfs_rgblock_t *write_pointer)
{
struct xfs_mount *mp = rtg_mount(rtg);
xfs_rtblock_t wp_fsb = xfs_daddr_to_rtb(mp, zone->wp);

if (rtg_rmap(rtg)->i_used_blocks > rtg->rtg_extents) {
xfs_warn(mp, "zone %u has too large used counter (0x%x).",
rtg_rgno(rtg), rtg_rmap(rtg)->i_used_blocks);
return false;
}

if (xfs_rtb_to_rgno(mp, wp_fsb) != rtg_rgno(rtg)) {
xfs_warn(mp, "zone %u write pointer (0x%llx) outside of zone.",
rtg_rgno(rtg), wp_fsb);
return false;
}

*write_pointer = xfs_rtb_to_rgbno(mp, wp_fsb);
if (*write_pointer >= rtg->rtg_extents) {
xfs_warn(mp, "zone %u has invalid write pointer (0x%x).",
rtg_rgno(rtg), *write_pointer);
return false;
}

return true;
}

static bool
xfs_zone_validate_full(
struct blk_zone *zone,
struct xfs_rtgroup *rtg,
xfs_rgblock_t *write_pointer)
{
struct xfs_mount *mp = rtg_mount(rtg);

if (rtg_rmap(rtg)->i_used_blocks > rtg->rtg_extents) {
xfs_warn(mp, "zone %u has too large used counter (0x%x).",
rtg_rgno(rtg), rtg_rmap(rtg)->i_used_blocks);
return false;
}

*write_pointer = rtg->rtg_extents;
return true;
}

static bool
xfs_zone_validate_seq(
struct blk_zone *zone,
struct xfs_rtgroup *rtg,
xfs_rgblock_t *write_pointer)
{
struct xfs_mount *mp = rtg_mount(rtg);

switch (zone->cond) {
case BLK_ZONE_COND_EMPTY:
return xfs_zone_validate_empty(zone, rtg, write_pointer);
case BLK_ZONE_COND_IMP_OPEN:
case BLK_ZONE_COND_EXP_OPEN:
case BLK_ZONE_COND_CLOSED:
return xfs_zone_validate_wp(zone, rtg, write_pointer);
case BLK_ZONE_COND_FULL:
return xfs_zone_validate_full(zone, rtg, write_pointer);
case BLK_ZONE_COND_NOT_WP:
case BLK_ZONE_COND_OFFLINE:
case BLK_ZONE_COND_READONLY:
xfs_warn(mp, "zone %u has unsupported zone condition 0x%x.",
rtg_rgno(rtg), zone->cond);
return false;
default:
xfs_warn(mp, "zone %u has unknown zone condition 0x%x.",
rtg_rgno(rtg), zone->cond);
return false;
}
}

static bool
xfs_zone_validate_conv(
struct blk_zone *zone,
struct xfs_rtgroup *rtg)
{
struct xfs_mount *mp = rtg_mount(rtg);

switch (zone->cond) {
case BLK_ZONE_COND_NOT_WP:
return true;
default:
xfs_warn(mp,
"conventional zone %u has unsupported zone condition 0x%x.",
rtg_rgno(rtg), zone->cond);
return false;
}
}

bool
xfs_zone_validate(
struct blk_zone *zone,
struct xfs_rtgroup *rtg,
xfs_rgblock_t *write_pointer)
{
struct xfs_mount *mp = rtg_mount(rtg);
struct xfs_groups *g = &mp->m_groups[XG_TYPE_RTG];
uint32_t expected_size;

/*
* Check that the zone capacity matches the rtgroup size stored in the
* superblock. Note that all zones including the last one must have a
* uniform capacity.
*/
if (XFS_BB_TO_FSB(mp, zone->capacity) != g->blocks) {
xfs_warn(mp,
"zone %u capacity (0x%llx) does not match RT group size (0x%x).",
rtg_rgno(rtg), XFS_BB_TO_FSB(mp, zone->capacity),
g->blocks);
return false;
}

if (g->has_daddr_gaps) {
expected_size = 1 << g->blklog;
} else {
if (zone->len != zone->capacity) {
xfs_warn(mp,
"zone %u has capacity != size (0x%llx vs 0x%llx).",
rtg_rgno(rtg),
XFS_BB_TO_FSB(mp, zone->len),
XFS_BB_TO_FSB(mp, zone->capacity));
return false;
}
expected_size = g->blocks;
}

if (XFS_BB_TO_FSB(mp, zone->len) != expected_size) {
xfs_warn(mp,
"zone %u length (0x%llx) does not match geometry (0x%x).",
rtg_rgno(rtg), XFS_BB_TO_FSB(mp, zone->len),
expected_size);
return false;
}

switch (zone->type) {
case BLK_ZONE_TYPE_CONVENTIONAL:
return xfs_zone_validate_conv(zone, rtg);
case BLK_ZONE_TYPE_SEQWRITE_REQ:
return xfs_zone_validate_seq(zone, rtg, write_pointer);
default:
xfs_warn(mp, "zone %u has unsupported type 0x%x.",
rtg_rgno(rtg), zone->type);
return false;
}
}

fs/xfs/libxfs/xfs_zones.h (new file, 35 lines)
@@ -0,0 +1,35 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LIBXFS_ZONES_H
#define _LIBXFS_ZONES_H

struct xfs_rtgroup;

/*
* In order to guarantee forward progress for GC we need to reserve at least
* two zones: one that will be used for moving data into and one spare zone
* making sure that we have enough space to relocate a nearly-full zone.
* To allow for slightly sloppy accounting for when we need to reserve the
* second zone, we actually reserve three as that is easier than doing fully
* accurate bookkeeping.
*/
#define XFS_GC_ZONES 3U

/*
* In addition we need two zones for user writes, one open zone for writing
* and one to still have available blocks without resetting the open zone
* when data in the open zone has been freed.
*/
#define XFS_RESERVED_ZONES (XFS_GC_ZONES + 1)
#define XFS_MIN_ZONES (XFS_RESERVED_ZONES + 1)

/*
* Always keep one zone out of the general open zone pool to allow for GC to
* happen while other writers are waiting for free space.
*/
#define XFS_OPEN_GC_ZONES 1U
#define XFS_MIN_OPEN_ZONES (XFS_OPEN_GC_ZONES + 1U)
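
Spelled out, the derived constants come to XFS_RESERVED_ZONES = 3 + 1 = 4, XFS_MIN_ZONES = 5, and XFS_MIN_OPEN_ZONES = 2. A compile-time sketch of those sums, using the kernel's static_assert (a hypothetical check, not part of the header):

static_assert(XFS_RESERVED_ZONES == 4, "3 GC zones + 1 user write zone");
static_assert(XFS_MIN_ZONES == 5, "reserved pool plus one usable zone");
static_assert(XFS_MIN_OPEN_ZONES == 2, "1 open GC zone + 1 user open zone");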

bool xfs_zone_validate(struct blk_zone *zone, struct xfs_rtgroup *rtg,
xfs_rgblock_t *write_pointer);

#endif /* _LIBXFS_ZONES_H */
@@ -69,6 +69,8 @@ STATIC size_t
xchk_superblock_ondisk_size(
struct xfs_mount *mp)
{
if (xfs_has_zoned(mp))
return offsetofend(struct xfs_dsb, sb_rtreserved);
if (xfs_has_metadir(mp))
return offsetofend(struct xfs_dsb, sb_pad);
if (xfs_has_metauuid(mp))

@@ -1038,8 +1038,8 @@ xchk_bmap(

switch (whichfork) {
case XFS_COW_FORK:
/* No CoW forks on non-reflink filesystems. */
if (!xfs_has_reflink(mp)) {
/* No CoW forks on filesystems that don't support out of place writes. */
if (!xfs_has_reflink(mp) && !xfs_has_zoned(mp)) {
xchk_ino_set_corrupt(sc, sc->ip->i_ino);
return 0;
}

@@ -350,7 +350,7 @@ xchk_fscount_aggregate_agcounts(
* The global incore space reservation is taken from the incore
* counters, so leave that out of the computation.
*/
fsc->fdblocks -= mp->m_resblks_avail;
fsc->fdblocks -= mp->m_free[XC_FREE_BLOCKS].res_avail;

/*
* Delayed allocation reservations are taken out of the incore counters
@@ -413,7 +413,13 @@ xchk_fscount_count_frextents(

fsc->frextents = 0;
fsc->frextents_delayed = 0;
if (!xfs_has_realtime(mp))

/*
* Don't bother verifying and repairing the fs counters for zoned file
* systems as they don't track an on-disk frextents count, and the
* in-memory percpu counter also includes reservations.
*/
if (!xfs_has_realtime(mp) || xfs_has_zoned(mp))
return 0;

while ((rtg = xfs_rtgroup_next(mp, rtg))) {
@@ -513,8 +519,8 @@ xchk_fscounters(
/* Snapshot the percpu counters. */
icount = percpu_counter_sum(&mp->m_icount);
ifree = percpu_counter_sum(&mp->m_ifree);
fdblocks = percpu_counter_sum(&mp->m_fdblocks);
frextents = percpu_counter_sum(&mp->m_frextents);
fdblocks = xfs_sum_freecounter_raw(mp, XC_FREE_BLOCKS);
frextents = xfs_sum_freecounter_raw(mp, XC_FREE_RTEXTENTS);

/* No negative values, please! */
if (icount < 0 || ifree < 0)
@@ -589,15 +595,17 @@ xchk_fscounters(
try_again = true;
}

if (!xchk_fscount_within_range(sc, fdblocks, &mp->m_fdblocks,
fsc->fdblocks)) {
if (!xchk_fscount_within_range(sc, fdblocks,
&mp->m_free[XC_FREE_BLOCKS].count, fsc->fdblocks)) {
if (fsc->frozen)
xchk_set_corrupt(sc);
else
try_again = true;
}

if (!xchk_fscount_within_range(sc, frextents, &mp->m_frextents,
if (!xfs_has_zoned(mp) &&
!xchk_fscount_within_range(sc, frextents,
&mp->m_free[XC_FREE_RTEXTENTS].count,
fsc->frextents - fsc->frextents_delayed)) {
if (fsc->frozen)
xchk_set_corrupt(sc);

@@ -64,7 +64,7 @@ xrep_fscounters(

percpu_counter_set(&mp->m_icount, fsc->icount);
percpu_counter_set(&mp->m_ifree, fsc->ifree);
percpu_counter_set(&mp->m_fdblocks, fsc->fdblocks);
xfs_set_freecounter(mp, XC_FREE_BLOCKS, fsc->fdblocks);

/*
* Online repair is only supported on v5 file systems, which require
@@ -74,10 +74,12 @@ xrep_fscounters(
* track of the delalloc reservations separately, as they are
* subtracted from m_frextents, but not included in sb_frextents.
*/
percpu_counter_set(&mp->m_frextents,
fsc->frextents - fsc->frextents_delayed);
if (!xfs_has_rtgroups(mp))
mp->m_sb.sb_frextents = fsc->frextents;
if (!xfs_has_zoned(mp)) {
xfs_set_freecounter(mp, XC_FREE_RTEXTENTS,
fsc->frextents - fsc->frextents_delayed);
if (!xfs_has_rtgroups(mp))
mp->m_sb.sb_frextents = fsc->frextents;
}

return 0;
}

@@ -273,6 +273,13 @@ xchk_inode_cowextsize(
xfs_failaddr_t fa;
uint32_t value = be32_to_cpu(dip->di_cowextsize);

/*
* The used block counter for rtrmap is checked and repaired elsewhere.
*/
if (xfs_has_zoned(sc->mp) &&
dip->di_metatype == cpu_to_be16(XFS_METAFILE_RTRMAP))
return;

fa = xfs_inode_validate_cowextsize(sc->mp, value, mode, flags, flags2);
if (fa)
xchk_ino_set_corrupt(sc, ino);

@@ -710,7 +710,9 @@ xrep_dinode_extsize_hints(
XFS_DIFLAG_EXTSZINHERIT);
}

if (dip->di_version < 3)
if (dip->di_version < 3 ||
(xfs_has_zoned(sc->mp) &&
dip->di_metatype == cpu_to_be16(XFS_METAFILE_RTRMAP)))
return;

fa = xfs_inode_validate_cowextsize(mp, be32_to_cpu(dip->di_cowextsize),

@@ -62,7 +62,7 @@ xrep_newbt_estimate_slack(
free = sc->sa.pag->pagf_freeblks;
sz = xfs_ag_block_count(sc->mp, pag_agno(sc->sa.pag));
} else {
free = percpu_counter_sum(&sc->mp->m_fdblocks);
free = xfs_sum_freecounter_raw(sc->mp, XC_FREE_BLOCKS);
sz = sc->mp->m_sb.sb_dblocks;
}

@@ -935,10 +935,13 @@ xrep_reap_metadir_fsblocks(
if (error)
return error;

if (xreap_dirty(&rs))
return xrep_defer_finish(sc);
if (xreap_dirty(&rs)) {
error = xrep_defer_finish(sc);
if (error)
return error;
}

return 0;
return xrep_reset_metafile_resv(sc);
}

/*

@@ -43,6 +43,7 @@
#include "xfs_rtalloc.h"
#include "xfs_metafile.h"
#include "xfs_rtrefcount_btree.h"
#include "xfs_zone_alloc.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
@@ -1050,7 +1051,13 @@ xrep_require_rtext_inuse(
xfs_rtxnum_t startrtx;
xfs_rtxnum_t endrtx;
bool is_free = false;
int error;
int error = 0;

if (xfs_has_zoned(mp)) {
if (!xfs_zone_rgbno_is_valid(sc->sr.rtg, rgbno + len - 1))
return -EFSCORRUPTED;
return 0;
}

startrtx = xfs_rgbno_to_rtx(mp, rgbno);
endrtx = xfs_rgbno_to_rtx(mp, rgbno + len - 1);
@@ -1386,11 +1393,12 @@ int
xrep_reset_metafile_resv(
struct xfs_scrub *sc)
{
struct xfs_inode *ip = sc->ip;
struct xfs_mount *mp = sc->mp;
int64_t delta;
int error;

delta = ip->i_nblocks + ip->i_delayed_blks - ip->i_meta_resv_asked;
delta = mp->m_metafile_resv_used + mp->m_metafile_resv_avail -
mp->m_metafile_resv_target;
if (delta == 0)
return 0;

@@ -1401,11 +1409,11 @@ xrep_reset_metafile_resv(
if (delta > 0) {
int64_t give_back;

give_back = min_t(uint64_t, delta, ip->i_delayed_blks);
give_back = min_t(uint64_t, delta, mp->m_metafile_resv_avail);
if (give_back > 0) {
xfs_mod_delalloc(ip, 0, -give_back);
xfs_add_fdblocks(ip->i_mount, give_back);
ip->i_delayed_blks -= give_back;
xfs_mod_sb_delalloc(mp, -give_back);
xfs_add_fdblocks(mp, give_back);
mp->m_metafile_resv_avail -= give_back;
}

return 0;
@@ -1413,24 +1421,23 @@ xrep_reset_metafile_resv(

/*
* Not enough reservation; try to take some blocks from the filesystem
* to the metadata inode. @delta is negative here, so invert the sign.
* to the metabtree reservation.
*/
delta = -delta;
error = xfs_dec_fdblocks(sc->mp, delta, true);
delta = -delta; /* delta is negative here, so invert the sign. */
error = xfs_dec_fdblocks(mp, delta, true);
while (error == -ENOSPC) {
delta--;
if (delta == 0) {
xfs_warn(sc->mp,
"Insufficient free space to reset space reservation for inode 0x%llx after repair.",
ip->i_ino);
"Insufficient free space to reset metabtree reservation after repair.");
return 0;
}
error = xfs_dec_fdblocks(sc->mp, delta, true);
error = xfs_dec_fdblocks(mp, delta, true);
}
if (error)
return error;

xfs_mod_delalloc(ip, 0, delta);
ip->i_delayed_blks += delta;
xfs_mod_sb_delalloc(mp, delta);
mp->m_metafile_resv_avail += delta;
return 0;
}
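
A worked example of the new delta computation, with assumed numbers: target = 100 blocks, used = 40, avail = 70 gives delta = 40 + 70 - 100 = +10, so up to 10 blocks (capped at avail) are handed back to the free block counter; with avail = 50 the delta is -10 and the loop instead tries to pull 10 blocks (or as many as ENOSPC allows) out of fdblocks into the reservation.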

@@ -21,6 +21,7 @@
#include "xfs_rmap.h"
#include "xfs_rtrmap_btree.h"
#include "xfs_exchmaps.h"
#include "xfs_zone_alloc.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/repair.h"
@@ -272,7 +273,6 @@ xchk_xref_is_used_rt_space(
xfs_extlen_t len)
{
struct xfs_rtgroup *rtg = sc->sr.rtg;
struct xfs_inode *rbmip = rtg_bitmap(rtg);
xfs_rtxnum_t startext;
xfs_rtxnum_t endext;
bool is_free;
@@ -281,6 +281,13 @@ xchk_xref_is_used_rt_space(
if (xchk_skip_xref(sc->sm))
return;

if (xfs_has_zoned(sc->mp)) {
if (!xfs_zone_rgbno_is_valid(rtg,
xfs_rtb_to_rgbno(sc->mp, rtbno) + len - 1))
xchk_ino_xref_set_corrupt(sc, rtg_rmap(rtg)->i_ino);
return;
}

startext = xfs_rtb_to_rtx(sc->mp, rtbno);
endext = xfs_rtb_to_rtx(sc->mp, rtbno + len - 1);
error = xfs_rtalloc_extent_is_free(rtg, sc->tp, startext,
@@ -288,5 +295,5 @@ xchk_xref_is_used_rt_space(
if (!xchk_should_check_xref(sc, &error, NULL))
return;
if (is_free)
xchk_ino_xref_set_corrupt(sc, rbmip->i_ino);
xchk_ino_xref_set_corrupt(sc, rtg_bitmap(rtg)->i_ino);
}

@@ -697,32 +697,6 @@ xrep_rtrefc_build_new_tree(
return error;
}

/*
* Now that we've logged the roots of the new btrees, invalidate all of the
* old blocks and free them.
*/
STATIC int
xrep_rtrefc_remove_old_tree(
struct xrep_rtrefc *rr)
{
int error;

/*
* Free all the extents that were allocated to the former rtrefcountbt
* and aren't cross-linked with something else.
*/
error = xrep_reap_metadir_fsblocks(rr->sc,
&rr->old_rtrefcountbt_blocks);
if (error)
return error;

/*
* Ensure the proper reservation for the rtrefcount inode so that we
* don't fail to expand the btree.
*/
return xrep_reset_metafile_resv(rr->sc);
}

/* Rebuild the rt refcount btree. */
int
xrep_rtrefcountbt(
@@ -769,8 +743,12 @@ xrep_rtrefcountbt(
if (error)
goto out_bitmap;

/* Kill the old tree. */
error = xrep_rtrefc_remove_old_tree(rr);
/*
* Free all the extents that were allocated to the former rtrefcountbt
* and aren't cross-linked with something else.
*/
error = xrep_reap_metadir_fsblocks(rr->sc,
&rr->old_rtrefcountbt_blocks);
if (error)
goto out_bitmap;

@@ -810,28 +810,6 @@ xrep_rtrmap_build_new_tree(

/* Reaping the old btree. */

/* Reap the old rtrmapbt blocks. */
STATIC int
xrep_rtrmap_remove_old_tree(
struct xrep_rtrmap *rr)
{
int error;

/*
* Free all the extents that were allocated to the former rtrmapbt and
* aren't cross-linked with something else.
*/
error = xrep_reap_metadir_fsblocks(rr->sc, &rr->old_rtrmapbt_blocks);
if (error)
return error;

/*
* Ensure the proper reservation for the rtrmap inode so that we don't
* fail to expand the new btree.
*/
return xrep_reset_metafile_resv(rr->sc);
}

static inline bool
xrep_rtrmapbt_want_live_update(
struct xchk_iscan *iscan,
@@ -995,8 +973,11 @@ xrep_rtrmapbt(
if (error)
goto out_records;

/* Kill the old tree. */
error = xrep_rtrmap_remove_old_tree(rr);
/*
* Free all the extents that were allocated to the former rtrmapbt and
* aren't cross-linked with something else.
*/
error = xrep_reap_metadir_fsblocks(rr->sc, &rr->old_rtrmapbt_blocks);
if (error)
goto out_records;

@@ -399,12 +399,14 @@ static const struct xchk_meta_ops meta_scrub_ops[] = {
},
[XFS_SCRUB_TYPE_RTBITMAP] = { /* realtime bitmap */
.type = ST_RTGROUP,
.has = xfs_has_nonzoned,
.setup = xchk_setup_rtbitmap,
.scrub = xchk_rtbitmap,
.repair = xrep_rtbitmap,
},
[XFS_SCRUB_TYPE_RTSUM] = { /* realtime summary */
.type = ST_RTGROUP,
.has = xfs_has_nonzoned,
.setup = xchk_setup_rtsummary,
.scrub = xchk_rtsummary,
.repair = xrep_rtsummary,

@@ -1,7 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (c) 2000-2005 Silicon Graphics, Inc.
* Copyright (c) 2016-2018 Christoph Hellwig.
* Copyright (c) 2016-2025 Christoph Hellwig.
* All Rights Reserved.
*/
#include "xfs.h"
@@ -20,6 +20,8 @@
#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_icache.h"
#include "xfs_zone_alloc.h"
#include "xfs_rtgroup.h"

struct xfs_writepage_ctx {
struct iomap_writepage_ctx ctx;
@@ -77,6 +79,26 @@ xfs_setfilesize(
return xfs_trans_commit(tp);
}

static void
xfs_ioend_put_open_zones(
struct iomap_ioend *ioend)
{
struct iomap_ioend *tmp;

/*
* Put the open zone for all ioends merged into this one (if any).
*/
list_for_each_entry(tmp, &ioend->io_list, io_list)
xfs_open_zone_put(tmp->io_private);

/*
* The main ioend might not have an open zone if the submission failed
* before xfs_zone_alloc_and_submit got called.
*/
if (ioend->io_private)
xfs_open_zone_put(ioend->io_private);
}

/*
* IO write completion.
*/
@@ -86,6 +108,7 @@ xfs_end_ioend(
{
struct xfs_inode *ip = XFS_I(ioend->io_inode);
struct xfs_mount *mp = ip->i_mount;
bool is_zoned = xfs_is_zoned_inode(ip);
xfs_off_t offset = ioend->io_offset;
size_t size = ioend->io_size;
unsigned int nofs_flag;
@@ -115,10 +138,11 @@ xfs_end_ioend(
*/
error = blk_status_to_errno(ioend->io_bio.bi_status);
if (unlikely(error)) {
if (ioend->io_flags & IOMAP_F_SHARED) {
if (ioend->io_flags & IOMAP_IOEND_SHARED) {
ASSERT(!is_zoned);
xfs_reflink_cancel_cow_range(ip, offset, size, true);
xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, offset,
offset + size);
offset + size, NULL);
}
goto done;
}
@@ -126,14 +150,21 @@ xfs_end_ioend(
/*
* Success: commit the COW or unwritten blocks if needed.
*/
if (ioend->io_flags & IOMAP_F_SHARED)
if (is_zoned)
error = xfs_zoned_end_io(ip, offset, size, ioend->io_sector,
ioend->io_private, NULLFSBLOCK);
else if (ioend->io_flags & IOMAP_IOEND_SHARED)
error = xfs_reflink_end_cow(ip, offset, size);
else if (ioend->io_type == IOMAP_UNWRITTEN)
else if (ioend->io_flags & IOMAP_IOEND_UNWRITTEN)
error = xfs_iomap_write_unwritten(ip, offset, size, false);

if (!error && xfs_ioend_is_append(ioend))
if (!error &&
!(ioend->io_flags & IOMAP_IOEND_DIRECT) &&
xfs_ioend_is_append(ioend))
error = xfs_setfilesize(ip, offset, size);
done:
if (is_zoned)
xfs_ioend_put_open_zones(ioend);
iomap_finish_ioends(ioend, error);
memalloc_nofs_restore(nofs_flag);
}
@@ -176,17 +207,27 @@ xfs_end_io(
}
}

STATIC void
void
xfs_end_bio(
struct bio *bio)
{
struct iomap_ioend *ioend = iomap_ioend_from_bio(bio);
struct xfs_inode *ip = XFS_I(ioend->io_inode);
struct xfs_mount *mp = ip->i_mount;
unsigned long flags;

/*
* For appends record the actually written block number and set the
* boundary flag if needed.
*/
if (IS_ENABLED(CONFIG_XFS_RT) && bio_is_zone_append(bio)) {
ioend->io_sector = bio->bi_iter.bi_sector;
xfs_mark_rtg_boundary(ioend);
}

spin_lock_irqsave(&ip->i_ioend_lock, flags);
if (list_empty(&ip->i_ioend_list))
WARN_ON_ONCE(!queue_work(ip->i_mount->m_unwritten_workqueue,
WARN_ON_ONCE(!queue_work(mp->m_unwritten_workqueue,
&ip->i_ioend_work));
list_add_tail(&ioend->io_list, &ip->i_ioend_list);
spin_unlock_irqrestore(&ip->i_ioend_lock, flags);
@@ -396,10 +437,11 @@ xfs_map_blocks(
}

static int
xfs_prepare_ioend(
struct iomap_ioend *ioend,
xfs_submit_ioend(
struct iomap_writepage_ctx *wpc,
int status)
{
struct iomap_ioend *ioend = wpc->ioend;
unsigned int nofs_flag;

/*
@@ -410,7 +452,7 @@ xfs_prepare_ioend(
nofs_flag = memalloc_nofs_save();

/* Convert CoW extents to regular */
if (!status && (ioend->io_flags & IOMAP_F_SHARED)) {
if (!status && (ioend->io_flags & IOMAP_IOEND_SHARED)) {
status = xfs_reflink_convert_cow(XFS_I(ioend->io_inode),
ioend->io_offset, ioend->io_size);
}
@@ -418,10 +460,14 @@ xfs_prepare_ioend(
memalloc_nofs_restore(nofs_flag);

/* send ioends that might require a transaction to the completion wq */
if (xfs_ioend_is_append(ioend) || ioend->io_type == IOMAP_UNWRITTEN ||
(ioend->io_flags & IOMAP_F_SHARED))
if (xfs_ioend_is_append(ioend) ||
(ioend->io_flags & (IOMAP_IOEND_UNWRITTEN | IOMAP_IOEND_SHARED)))
ioend->io_bio.bi_end_io = xfs_end_bio;
return status;

if (status)
return status;
submit_bio(&ioend->io_bio);
return 0;
}

/*
@@ -458,12 +504,107 @@ xfs_discard_folio(
* folio itself and not the start offset that is passed in.
*/
xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, pos,
folio_pos(folio) + folio_size(folio));
folio_pos(folio) + folio_size(folio), NULL);
}

static const struct iomap_writeback_ops xfs_writeback_ops = {
.map_blocks = xfs_map_blocks,
.prepare_ioend = xfs_prepare_ioend,
.submit_ioend = xfs_submit_ioend,
.discard_folio = xfs_discard_folio,
};

struct xfs_zoned_writepage_ctx {
struct iomap_writepage_ctx ctx;
struct xfs_open_zone *open_zone;
};

static inline struct xfs_zoned_writepage_ctx *
XFS_ZWPC(struct iomap_writepage_ctx *ctx)
{
return container_of(ctx, struct xfs_zoned_writepage_ctx, ctx);
}

static int
xfs_zoned_map_blocks(
struct iomap_writepage_ctx *wpc,
struct inode *inode,
loff_t offset,
unsigned int len)
{
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + len);
xfs_filblks_t count_fsb;
struct xfs_bmbt_irec imap, del;
struct xfs_iext_cursor icur;

if (xfs_is_shutdown(mp))
return -EIO;

XFS_ERRORTAG_DELAY(mp, XFS_ERRTAG_WB_DELAY_MS);

/*
* All dirty data must be covered by delalloc extents. But truncate can
* remove delalloc extents underneath us or reduce their size.
* Returning a hole tells iomap to not write back any data from this
* range, which is the right thing to do in that case.
*
* Otherwise just tell iomap to treat ranges previously covered by a
* delalloc extent as mapped. The actual block allocation will be done
* just before submitting the bio.
*
* This implies we never map outside folios that are locked or marked
* as under writeback, and thus there is no need to check the fork
* sequence count here.
*/
xfs_ilock(ip, XFS_ILOCK_EXCL);
if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &imap))
imap.br_startoff = end_fsb; /* fake a hole past EOF */
if (imap.br_startoff > offset_fsb) {
imap.br_blockcount = imap.br_startoff - offset_fsb;
imap.br_startoff = offset_fsb;
imap.br_startblock = HOLESTARTBLOCK;
imap.br_state = XFS_EXT_NORM;
xfs_iunlock(ip, XFS_ILOCK_EXCL);
xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0, 0, 0);
return 0;
}
end_fsb = min(end_fsb, imap.br_startoff + imap.br_blockcount);
count_fsb = end_fsb - offset_fsb;

del = imap;
xfs_trim_extent(&del, offset_fsb, count_fsb);
xfs_bmap_del_extent_delay(ip, XFS_COW_FORK, &icur, &imap, &del,
XFS_BMAPI_REMAP);
xfs_iunlock(ip, XFS_ILOCK_EXCL);

wpc->iomap.type = IOMAP_MAPPED;
wpc->iomap.flags = IOMAP_F_DIRTY;
wpc->iomap.bdev = mp->m_rtdev_targp->bt_bdev;
wpc->iomap.offset = offset;
wpc->iomap.length = XFS_FSB_TO_B(mp, count_fsb);
wpc->iomap.flags = IOMAP_F_ANON_WRITE;

trace_xfs_zoned_map_blocks(ip, offset, wpc->iomap.length);
return 0;
}

static int
xfs_zoned_submit_ioend(
struct iomap_writepage_ctx *wpc,
int status)
{
wpc->ioend->io_bio.bi_end_io = xfs_end_bio;
if (status)
return status;
xfs_zone_alloc_and_submit(wpc->ioend, &XFS_ZWPC(wpc)->open_zone);
return 0;
}

static const struct iomap_writeback_ops xfs_zoned_writeback_ops = {
.map_blocks = xfs_zoned_map_blocks,
.submit_ioend = xfs_zoned_submit_ioend,
.discard_folio = xfs_discard_folio,
};

@@ -472,10 +613,25 @@ xfs_vm_writepages(
struct address_space *mapping,
struct writeback_control *wbc)
{
struct xfs_writepage_ctx wpc = { };
struct xfs_inode *ip = XFS_I(mapping->host);

xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
return iomap_writepages(mapping, wbc, &wpc.ctx, &xfs_writeback_ops);
xfs_iflags_clear(ip, XFS_ITRUNCATED);

if (xfs_is_zoned_inode(ip)) {
struct xfs_zoned_writepage_ctx xc = { };
int error;

error = iomap_writepages(mapping, wbc, &xc.ctx,
&xfs_zoned_writeback_ops);
if (xc.open_zone)
xfs_open_zone_put(xc.open_zone);
return error;
} else {
struct xfs_writepage_ctx wpc = { };

return iomap_writepages(mapping, wbc, &wpc.ctx,
&xfs_writeback_ops);
}
}

STATIC int

@@ -9,6 +9,7 @@
extern const struct address_space_operations xfs_address_space_operations;
extern const struct address_space_operations xfs_dax_aops;

int xfs_setfilesize(struct xfs_inode *ip, xfs_off_t offset, size_t size);
void xfs_end_bio(struct bio *bio);

#endif /* __XFS_AOPS_H__ */

@@ -30,6 +30,7 @@
#include "xfs_reflink.h"
#include "xfs_rtbitmap.h"
#include "xfs_rtgroup.h"
#include "xfs_zone_alloc.h"

/* Kernel only BMAP related definitions and functions */

@@ -436,7 +437,8 @@ xfs_bmap_punch_delalloc_range(
struct xfs_inode *ip,
int whichfork,
xfs_off_t start_byte,
xfs_off_t end_byte)
xfs_off_t end_byte,
struct xfs_zone_alloc_ctx *ac)
{
struct xfs_mount *mp = ip->i_mount;
struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
@@ -467,7 +469,21 @@ xfs_bmap_punch_delalloc_range(
continue;
}

xfs_bmap_del_extent_delay(ip, whichfork, &icur, &got, &del);
if (xfs_is_zoned_inode(ip) && ac) {
/*
* In a zoned buffered write context we need to return
* the punched delalloc allocations to the allocation
* context. This allows reusing them in the following
* iomap iterations.
*/
xfs_bmap_del_extent_delay(ip, whichfork, &icur, &got,
&del, XFS_BMAPI_REMAP);
ac->reserved_blocks += del.br_blockcount;
} else {
xfs_bmap_del_extent_delay(ip, whichfork, &icur, &got,
&del, 0);
}

if (!xfs_iext_get_extent(ifp, &icur, &got))
break;
}
@@ -582,7 +598,7 @@ xfs_free_eofblocks(
if (ip->i_delayed_blks) {
xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK,
round_up(XFS_ISIZE(ip), mp->m_sb.sb_blocksize),
LLONG_MAX);
LLONG_MAX, NULL);
}
xfs_inode_clear_eofblocks_tag(ip);
return 0;
@@ -825,7 +841,8 @@ int
xfs_free_file_space(
struct xfs_inode *ip,
xfs_off_t offset,
xfs_off_t len)
xfs_off_t len,
struct xfs_zone_alloc_ctx *ac)
{
struct xfs_mount *mp = ip->i_mount;
xfs_fileoff_t startoffset_fsb;
@@ -880,7 +897,7 @@ xfs_free_file_space(
return 0;
if (offset + len > XFS_ISIZE(ip))
len = XFS_ISIZE(ip) - offset;
error = xfs_zero_range(ip, offset, len, NULL);
error = xfs_zero_range(ip, offset, len, ac, NULL);
if (error)
return error;

@@ -968,7 +985,8 @@ int
xfs_collapse_file_space(
struct xfs_inode *ip,
xfs_off_t offset,
xfs_off_t len)
xfs_off_t len,
struct xfs_zone_alloc_ctx *ac)
{
struct xfs_mount *mp = ip->i_mount;
struct xfs_trans *tp;
@@ -981,7 +999,7 @@ xfs_collapse_file_space(

trace_xfs_collapse_file_space(ip);

error = xfs_free_file_space(ip, offset, len);
error = xfs_free_file_space(ip, offset, len, ac);
if (error)
return error;

@@ -15,6 +15,7 @@ struct xfs_inode;
struct xfs_mount;
struct xfs_trans;
struct xfs_bmalloca;
struct xfs_zone_alloc_ctx;

#ifdef CONFIG_XFS_RT
int xfs_bmap_rtalloc(struct xfs_bmalloca *ap);
@@ -31,7 +32,8 @@ xfs_bmap_rtalloc(struct xfs_bmalloca *ap)
#endif /* CONFIG_XFS_RT */

void xfs_bmap_punch_delalloc_range(struct xfs_inode *ip, int whichfork,
xfs_off_t start_byte, xfs_off_t end_byte);
xfs_off_t start_byte, xfs_off_t end_byte,
struct xfs_zone_alloc_ctx *ac);

struct kgetbmap {
__s64 bmv_offset; /* file offset of segment in blocks */
@@ -54,13 +56,13 @@ int xfs_bmap_last_extent(struct xfs_trans *tp, struct xfs_inode *ip,

/* preallocation and hole punch interface */
int xfs_alloc_file_space(struct xfs_inode *ip, xfs_off_t offset,
xfs_off_t len);
int xfs_free_file_space(struct xfs_inode *ip, xfs_off_t offset,
xfs_off_t len);
xfs_off_t len, struct xfs_zone_alloc_ctx *ac);
int xfs_collapse_file_space(struct xfs_inode *, xfs_off_t offset,
xfs_off_t len);
xfs_off_t len, struct xfs_zone_alloc_ctx *ac);
int xfs_insert_file_space(struct xfs_inode *, xfs_off_t offset,
xfs_off_t len);

/* EOF block manipulation functions */
bool xfs_can_free_eofblocks(struct xfs_inode *ip);

@@ -844,7 +844,8 @@ xfs_ioc_trim(

if (!capable(CAP_SYS_ADMIN))
return -EPERM;
if (mp->m_rtdev_targp &&

if (mp->m_rtdev_targp && !xfs_has_zoned(mp) &&
bdev_max_discard_sectors(mp->m_rtdev_targp->bt_bdev))
rt_bdev = mp->m_rtdev_targp->bt_bdev;
if (!bdev_max_discard_sectors(mp->m_ddev_targp->bt_bdev) && !rt_bdev)

@@ -671,7 +671,7 @@ xfs_extent_busy_wait_all(
while ((pag = xfs_perag_next(mp, pag)))
xfs_extent_busy_wait_group(pag_group(pag));

if (xfs_has_rtgroups(mp))
if (xfs_has_rtgroups(mp) && !xfs_has_zoned(mp))
while ((rtg = xfs_rtgroup_next(mp, rtg)))
xfs_extent_busy_wait_group(rtg_group(rtg));
}

@@ -29,6 +29,7 @@
#include "xfs_inode.h"
#include "xfs_rtbitmap.h"
#include "xfs_rtgroup.h"
#include "xfs_zone_alloc.h"

struct kmem_cache *xfs_efi_cache;
struct kmem_cache *xfs_efd_cache;
@@ -767,21 +768,35 @@ xfs_rtextent_free_finish_item(

trace_xfs_extent_free_deferred(mp, xefi);

if (!(xefi->xefi_flags & XFS_EFI_CANCELLED)) {
if (*rtgp != to_rtg(xefi->xefi_group)) {
*rtgp = to_rtg(xefi->xefi_group);
xfs_rtgroup_lock(*rtgp, XFS_RTGLOCK_BITMAP);
xfs_rtgroup_trans_join(tp, *rtgp,
XFS_RTGLOCK_BITMAP);
}
error = xfs_rtfree_blocks(tp, *rtgp,
xefi->xefi_startblock, xefi->xefi_blockcount);
if (xefi->xefi_flags & XFS_EFI_CANCELLED)
goto done;

if (*rtgp != to_rtg(xefi->xefi_group)) {
unsigned int lock_flags;

if (xfs_has_zoned(mp))
lock_flags = XFS_RTGLOCK_RMAP;
else
lock_flags = XFS_RTGLOCK_BITMAP;

*rtgp = to_rtg(xefi->xefi_group);
xfs_rtgroup_lock(*rtgp, lock_flags);
xfs_rtgroup_trans_join(tp, *rtgp, lock_flags);
}

if (xfs_has_zoned(mp)) {
error = xfs_zone_free_blocks(tp, *rtgp, xefi->xefi_startblock,
xefi->xefi_blockcount);
} else {
error = xfs_rtfree_blocks(tp, *rtgp, xefi->xefi_startblock,
xefi->xefi_blockcount);
}

if (error == -EAGAIN) {
xfs_efd_from_efi(efdp);
return error;
}

done:
xfs_efd_add_extent(efdp, xefi);
xfs_extent_free_cancel_item(item);
return error;

@@ -25,6 +25,8 @@
#include "xfs_iomap.h"
#include "xfs_reflink.h"
#include "xfs_file.h"
#include "xfs_aops.h"
#include "xfs_zone_alloc.h"

#include <linux/dax.h>
#include <linux/falloc.h>
@@ -150,7 +152,7 @@ xfs_file_fsync(
* ensure newly written file data make it to disk before logging the new
* inode size in case of an extending write.
*/
if (XFS_IS_REALTIME_INODE(ip))
if (XFS_IS_REALTIME_INODE(ip) && mp->m_rtdev_targp != mp->m_ddev_targp)
error = blkdev_issue_flush(mp->m_rtdev_targp->bt_bdev);
else if (mp->m_logdev_targp != mp->m_ddev_targp)
error = blkdev_issue_flush(mp->m_ddev_targp->bt_bdev);
@@ -360,7 +362,8 @@ xfs_file_write_zero_eof(
struct iov_iter *from,
unsigned int *iolock,
size_t count,
bool *drained_dio)
bool *drained_dio,
struct xfs_zone_alloc_ctx *ac)
{
struct xfs_inode *ip = XFS_I(iocb->ki_filp->f_mapping->host);
loff_t isize;
@@ -414,7 +417,7 @@ xfs_file_write_zero_eof(
trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize);

xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, NULL);
error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, ac, NULL);
xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);

return error;
@@ -431,7 +434,8 @@ STATIC ssize_t
xfs_file_write_checks(
struct kiocb *iocb,
struct iov_iter *from,
unsigned int *iolock)
unsigned int *iolock,
struct xfs_zone_alloc_ctx *ac)
{
struct inode *inode = iocb->ki_filp->f_mapping->host;
size_t count = iov_iter_count(from);
@@ -481,7 +485,7 @@ xfs_file_write_checks(
*/
if (iocb->ki_pos > i_size_read(inode)) {
error = xfs_file_write_zero_eof(iocb, from, iolock, count,
&drained_dio);
&drained_dio, ac);
if (error == 1)
goto restart;
if (error)
@@ -491,6 +495,48 @@ xfs_file_write_checks(
return kiocb_modified(iocb);
}

static ssize_t
xfs_zoned_write_space_reserve(
struct xfs_inode *ip,
struct kiocb *iocb,
struct iov_iter *from,
unsigned int flags,
struct xfs_zone_alloc_ctx *ac)
{
loff_t count = iov_iter_count(from);
int error;

if (iocb->ki_flags & IOCB_NOWAIT)
flags |= XFS_ZR_NOWAIT;

/*
* Check the rlimit and LFS boundary first so that we don't over-reserve
* by possibly a lot.
*
* The generic write path will redo this check later, and it might have
* changed by then. If it got expanded we'll stick to our earlier
* smaller limit, and if it is decreased the new smaller limit will be
* used and our extra space reservation will be returned after finishing
* the write.
*/
error = generic_write_check_limits(iocb->ki_filp, iocb->ki_pos, &count);
if (error)
return error;

/*
* Sloppily round up count to file system blocks.
*
* This will often reserve an extra block, but that avoids having to look
* at the start offset, which isn't stable for O_APPEND until taking the
* iolock. Also we need to reserve a block each for zeroing the old
* EOF block and the new start block if they are unaligned.
*
* Any remaining block will be returned after the write.
*/
return xfs_zoned_space_reserve(ip,
XFS_B_TO_FSB(ip->i_mount, count) + 1 + 2, flags, ac);
}
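
A worked example of the reservation above, assuming 4KiB blocks: a 10000 byte write reserves XFS_B_TO_FSB(10000) = 3 blocks for the data itself, plus 1 for the sloppy rounding of an unaligned start, plus 2 for zeroing the old EOF block and the new start block, for 6 blocks total; anything left over is handed back by xfs_zoned_space_unreserve() after the write.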
|
||||
|
||||
static int
|
||||
xfs_dio_write_end_io(
|
||||
struct kiocb *iocb,
|
||||
@@ -503,6 +549,9 @@ xfs_dio_write_end_io(
|
||||
loff_t offset = iocb->ki_pos;
|
||||
unsigned int nofs_flag;
|
||||
|
||||
ASSERT(!xfs_is_zoned_inode(ip) ||
|
||||
!(flags & (IOMAP_DIO_UNWRITTEN | IOMAP_DIO_COW)));
|
||||
|
||||
trace_xfs_end_io_direct_write(ip, offset, size);
|
||||
|
||||
if (xfs_is_shutdown(ip->i_mount))
|
||||
@@ -582,14 +631,51 @@ static const struct iomap_dio_ops xfs_dio_write_ops = {
|
||||
.end_io = xfs_dio_write_end_io,
|
||||
};
|
||||
|
||||
static void
|
||||
xfs_dio_zoned_submit_io(
|
||||
const struct iomap_iter *iter,
|
||||
struct bio *bio,
|
||||
loff_t file_offset)
|
||||
{
|
||||
struct xfs_mount *mp = XFS_I(iter->inode)->i_mount;
|
||||
struct xfs_zone_alloc_ctx *ac = iter->private;
|
||||
xfs_filblks_t count_fsb;
|
||||
struct iomap_ioend *ioend;
|
||||
|
||||
count_fsb = XFS_B_TO_FSB(mp, bio->bi_iter.bi_size);
|
||||
if (count_fsb > ac->reserved_blocks) {
|
||||
xfs_err(mp,
"allocation (%lld) larger than reservation (%lld).",
count_fsb, ac->reserved_blocks);
xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
bio_io_error(bio);
return;
}
ac->reserved_blocks -= count_fsb;

bio->bi_end_io = xfs_end_bio;
ioend = iomap_init_ioend(iter->inode, bio, file_offset,
IOMAP_IOEND_DIRECT);
xfs_zone_alloc_and_submit(ioend, &ac->open_zone);
}

static const struct iomap_dio_ops xfs_dio_zoned_write_ops = {
.bio_set = &iomap_ioend_bioset,
.submit_io = xfs_dio_zoned_submit_io,
.end_io = xfs_dio_write_end_io,
};

/*
* Handle block aligned direct I/O writes
* Handle block aligned direct I/O writes.
*/
static noinline ssize_t
xfs_file_dio_write_aligned(
struct xfs_inode *ip,
struct kiocb *iocb,
struct iov_iter *from)
struct iov_iter *from,
const struct iomap_ops *ops,
const struct iomap_dio_ops *dops,
struct xfs_zone_alloc_ctx *ac)
{
unsigned int iolock = XFS_IOLOCK_SHARED;
ssize_t ret;
@@ -597,7 +683,7 @@ xfs_file_dio_write_aligned(
ret = xfs_ilock_iocb_for_write(iocb, &iolock);
if (ret)
return ret;
ret = xfs_file_write_checks(iocb, from, &iolock);
ret = xfs_file_write_checks(iocb, from, &iolock, ac);
if (ret)
goto out_unlock;

@@ -611,11 +697,31 @@ xfs_file_dio_write_aligned(
iolock = XFS_IOLOCK_SHARED;
}
trace_xfs_file_direct_write(iocb, from);
ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
&xfs_dio_write_ops, 0, NULL, 0);
ret = iomap_dio_rw(iocb, from, ops, dops, 0, ac, 0);
out_unlock:
if (iolock)
xfs_iunlock(ip, iolock);
xfs_iunlock(ip, iolock);
return ret;
}

/*
* Handle block aligned direct I/O writes to zoned devices.
*/
static noinline ssize_t
xfs_file_dio_write_zoned(
struct xfs_inode *ip,
struct kiocb *iocb,
struct iov_iter *from)
{
struct xfs_zone_alloc_ctx ac = { };
ssize_t ret;

ret = xfs_zoned_write_space_reserve(ip, iocb, from, 0, &ac);
if (ret < 0)
return ret;
ret = xfs_file_dio_write_aligned(ip, iocb, from,
&xfs_zoned_direct_write_iomap_ops,
&xfs_dio_zoned_write_ops, &ac);
xfs_zoned_space_unreserve(ip, &ac);
return ret;
}

@@ -675,7 +781,7 @@ xfs_file_dio_write_unaligned(
goto out_unlock;
}

ret = xfs_file_write_checks(iocb, from, &iolock);
ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
if (ret)
goto out_unlock;

@@ -721,9 +827,21 @@ xfs_file_dio_write(
/* direct I/O must be aligned to device logical sector size */
if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
return -EINVAL;
if ((iocb->ki_pos | count) & ip->i_mount->m_blockmask)

/*
* For always COW inodes we also must check the alignment of each
* individual iovec segment, as they could end up with different
* I/Os due to the way bio_iov_iter_get_pages works, and we'd
* then overwrite an already written block.
*/
if (((iocb->ki_pos | count) & ip->i_mount->m_blockmask) ||
(xfs_is_always_cow_inode(ip) &&
(iov_iter_alignment(from) & ip->i_mount->m_blockmask)))
return xfs_file_dio_write_unaligned(ip, iocb, from);
return xfs_file_dio_write_aligned(ip, iocb, from);
if (xfs_is_zoned_inode(ip))
return xfs_file_dio_write_zoned(ip, iocb, from);
return xfs_file_dio_write_aligned(ip, iocb, from,
&xfs_direct_write_iomap_ops, &xfs_dio_write_ops, NULL);
}
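
The dispatch above reduces to a pure alignment check. As a minimal userspace sketch (the mask values and helper names below are hypothetical, not the kernel's), the decision logic looks like this:

#include <stdbool.h>
#include <stdint.h>

/* Hypothetical geometry: 512-byte sectors, 4k filesystem blocks. */
#define SECTOR_MASK 511u
#define BLOCK_MASK 4095u

enum dio_path { DIO_EINVAL, DIO_UNALIGNED, DIO_ZONED, DIO_ALIGNED };

static enum dio_path classify_dio_write(uint64_t pos, uint64_t count,
        uint64_t iov_align, bool always_cow, bool zoned)
{
        /* direct I/O must be aligned to the device logical sector size */
        if ((pos | count) & SECTOR_MASK)
                return DIO_EINVAL;
        /* sub-block I/O, or misaligned iovecs on always-COW inodes */
        if (((pos | count) & BLOCK_MASK) ||
            (always_cow && (iov_align & BLOCK_MASK)))
                return DIO_UNALIGNED;
        /* zoned inodes reserve space first, then take the aligned path */
        if (zoned)
                return DIO_ZONED;
        return DIO_ALIGNED;
}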

static noinline ssize_t
@@ -740,7 +858,7 @@ xfs_file_dax_write(
ret = xfs_ilock_iocb(iocb, iolock);
if (ret)
return ret;
ret = xfs_file_write_checks(iocb, from, &iolock);
ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
if (ret)
goto out;

@@ -784,7 +902,7 @@ xfs_file_buffered_write(
if (ret)
return ret;

ret = xfs_file_write_checks(iocb, from, &iolock);
ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
if (ret)
goto out;

@@ -831,6 +949,67 @@ xfs_file_buffered_write(
return ret;
}

STATIC ssize_t
xfs_file_buffered_write_zoned(
struct kiocb *iocb,
struct iov_iter *from)
{
struct xfs_inode *ip = XFS_I(iocb->ki_filp->f_mapping->host);
struct xfs_mount *mp = ip->i_mount;
unsigned int iolock = XFS_IOLOCK_EXCL;
bool cleared_space = false;
struct xfs_zone_alloc_ctx ac = { };
ssize_t ret;

ret = xfs_zoned_write_space_reserve(ip, iocb, from, XFS_ZR_GREEDY, &ac);
if (ret < 0)
return ret;

ret = xfs_ilock_iocb(iocb, iolock);
if (ret)
goto out_unreserve;

ret = xfs_file_write_checks(iocb, from, &iolock, &ac);
if (ret)
goto out_unlock;

/*
* Truncate the iter to the length that we were actually able to
* allocate blocks for. This needs to happen after
* xfs_file_write_checks, because that assigns ki_pos for O_APPEND
* writes.
*/
iov_iter_truncate(from,
XFS_FSB_TO_B(mp, ac.reserved_blocks) -
(iocb->ki_pos & mp->m_blockmask));
if (!iov_iter_count(from))
goto out_unlock;

retry:
trace_xfs_file_buffered_write(iocb, from);
ret = iomap_file_buffered_write(iocb, from,
&xfs_buffered_write_iomap_ops, &ac);
if (ret == -ENOSPC && !cleared_space) {
/*
* Kick off writeback to convert delalloc space and release the
* usually too pessimistic indirect block reservations.
*/
xfs_flush_inodes(mp);
cleared_space = true;
goto retry;
}

out_unlock:
xfs_iunlock(ip, iolock);
out_unreserve:
xfs_zoned_space_unreserve(ip, &ac);
if (ret > 0) {
XFS_STATS_ADD(mp, xs_write_bytes, ret);
ret = generic_write_sync(iocb, ret);
}
return ret;
}
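
The iov_iter truncation above caps the write at what the reservation can cover. A small sketch of that arithmetic, assuming a hypothetical 4k block size (the helper name is illustrative):

#include <stdint.h>

#define BLKSZ 4096ull
#define BLOCK_MASK (BLKSZ - 1)

/*
 * Bytes a zoned buffered write may cover: the reserved blocks converted
 * to bytes, minus the unaligned head at ki_pos that shares the first
 * reserved block. Assumes at least one block was reserved.
 */
static uint64_t zoned_write_limit(uint64_t ki_pos, uint64_t reserved_blocks)
{
        return reserved_blocks * BLKSZ - (ki_pos & BLOCK_MASK);
}
/* e.g. ki_pos = 0x1800, 8 blocks reserved: 32768 - 2048 = 30720 bytes */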

STATIC ssize_t
xfs_file_write_iter(
struct kiocb *iocb,
@@ -878,6 +1057,8 @@ xfs_file_write_iter(
return ret;
}

if (xfs_is_zoned_inode(ip))
return xfs_file_buffered_write_zoned(iocb, from);
return xfs_file_buffered_write(iocb, from);
}

@@ -932,7 +1113,8 @@ static int
xfs_falloc_collapse_range(
struct file *file,
loff_t offset,
loff_t len)
loff_t len,
struct xfs_zone_alloc_ctx *ac)
{
struct inode *inode = file_inode(file);
loff_t new_size = i_size_read(inode) - len;
@@ -948,7 +1130,7 @@ xfs_falloc_collapse_range(
if (offset + len >= i_size_read(inode))
return -EINVAL;

error = xfs_collapse_file_space(XFS_I(inode), offset, len);
error = xfs_collapse_file_space(XFS_I(inode), offset, len, ac);
if (error)
return error;
return xfs_falloc_setsize(file, new_size);
@@ -1004,7 +1186,8 @@ xfs_falloc_zero_range(
struct file *file,
int mode,
loff_t offset,
loff_t len)
loff_t len,
struct xfs_zone_alloc_ctx *ac)
{
struct inode *inode = file_inode(file);
unsigned int blksize = i_blocksize(inode);
@@ -1017,7 +1200,7 @@ xfs_falloc_zero_range(
if (error)
return error;

error = xfs_free_file_space(XFS_I(inode), offset, len);
error = xfs_free_file_space(XFS_I(inode), offset, len, ac);
if (error)
return error;

@@ -1088,22 +1271,18 @@ xfs_falloc_allocate_range(
FALLOC_FL_INSERT_RANGE | FALLOC_FL_UNSHARE_RANGE)

STATIC long
xfs_file_fallocate(
__xfs_file_fallocate(
struct file *file,
int mode,
loff_t offset,
loff_t len)
loff_t len,
struct xfs_zone_alloc_ctx *ac)
{
struct inode *inode = file_inode(file);
struct xfs_inode *ip = XFS_I(inode);
long error;
uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;

if (!S_ISREG(inode->i_mode))
return -EINVAL;
if (mode & ~XFS_FALLOC_FL_SUPPORTED)
return -EOPNOTSUPP;

xfs_ilock(ip, iolock);
error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP);
if (error)
@@ -1124,16 +1303,16 @@ xfs_file_fallocate(

switch (mode & FALLOC_FL_MODE_MASK) {
case FALLOC_FL_PUNCH_HOLE:
error = xfs_free_file_space(ip, offset, len);
error = xfs_free_file_space(ip, offset, len, ac);
break;
case FALLOC_FL_COLLAPSE_RANGE:
error = xfs_falloc_collapse_range(file, offset, len);
error = xfs_falloc_collapse_range(file, offset, len, ac);
break;
case FALLOC_FL_INSERT_RANGE:
error = xfs_falloc_insert_range(file, offset, len);
break;
case FALLOC_FL_ZERO_RANGE:
error = xfs_falloc_zero_range(file, mode, offset, len);
error = xfs_falloc_zero_range(file, mode, offset, len, ac);
break;
case FALLOC_FL_UNSHARE_RANGE:
error = xfs_falloc_unshare_range(file, mode, offset, len);
@@ -1154,6 +1333,54 @@ xfs_file_fallocate(
return error;
}

static long
xfs_file_zoned_fallocate(
struct file *file,
int mode,
loff_t offset,
loff_t len)
{
struct xfs_zone_alloc_ctx ac = { };
struct xfs_inode *ip = XFS_I(file_inode(file));
int error;

error = xfs_zoned_space_reserve(ip, 2, XFS_ZR_RESERVED, &ac);
if (error)
return error;
error = __xfs_file_fallocate(file, mode, offset, len, &ac);
xfs_zoned_space_unreserve(ip, &ac);
return error;
}

static long
xfs_file_fallocate(
struct file *file,
int mode,
loff_t offset,
loff_t len)
{
struct inode *inode = file_inode(file);

if (!S_ISREG(inode->i_mode))
return -EINVAL;
if (mode & ~XFS_FALLOC_FL_SUPPORTED)
return -EOPNOTSUPP;

/*
* For zoned file systems, zeroing the first and last block of a hole
* punch requires allocating a new block to rewrite the remaining data
* and new zeroes out of place. Get reservations for those before
* taking the iolock. Dip into the reserved pool because we are
* expected to be able to punch a hole even on a completely full
* file system.
*/
if (xfs_is_zoned_inode(XFS_I(inode)) &&
(mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE |
FALLOC_FL_COLLAPSE_RANGE)))
return xfs_file_zoned_fallocate(file, mode, offset, len);
return __xfs_file_fallocate(file, mode, offset, len, NULL);
}
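
The zoned wrapper above is a reserve/operate/unreserve bracket. A compilable toy version of the pattern (all names here are stand-ins, not the XFS API):

struct alloc_ctx { long reserved; };

static int space_reserve(struct alloc_ctx *ac, long blocks)
{
        ac->reserved = blocks;  /* pretend the reservation always succeeds */
        return 0;
}

static void space_unreserve(struct alloc_ctx *ac)
{
        ac->reserved = 0;
}

/* Run op under a two-block reservation (first + last block of the hole). */
static int fallocate_with_reservation(int (*op)(struct alloc_ctx *))
{
        struct alloc_ctx ac = { 0 };
        int error = space_reserve(&ac, 2);

        if (error)
                return error;
        error = op(&ac);        /* the ctx travels down the call chain */
        space_unreserve(&ac);   /* always released, even on error */
        return error;
}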

STATIC int
xfs_file_fadvise(
struct file *file,
@@ -1347,15 +1574,22 @@ xfs_file_release(
* blocks. This avoids open/read/close workloads from removing EOF
* blocks that other writers depend upon to reduce fragmentation.
*
* Inodes on the zoned RT device never have preallocations, so skip
* taking the locks below.
*/
if (!inode->i_nlink ||
!(file->f_mode & FMODE_WRITE) ||
(ip->i_diflags & XFS_DIFLAG_APPEND) ||
xfs_is_zoned_inode(ip))
return 0;

/*
* If we can't get the iolock just skip truncating the blocks past EOF
* because we could deadlock with the mmap_lock otherwise. We'll get
* another chance to drop them once the last reference to the inode is
* dropped, so we'll never leak blocks permanently.
*/
if (inode->i_nlink &&
(file->f_mode & FMODE_WRITE) &&
!(ip->i_diflags & XFS_DIFLAG_APPEND) &&
!xfs_iflags_test(ip, XFS_EOFBLOCKS_RELEASED) &&
if (!xfs_iflags_test(ip, XFS_EOFBLOCKS_RELEASED) &&
xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
if (xfs_can_free_eofblocks(ip) &&
!xfs_iflags_test_and_set(ip, XFS_EOFBLOCKS_RELEASED))
@@ -1472,9 +1706,10 @@ xfs_dax_read_fault(
* i_lock (XFS - extent map serialisation)
*/
static vm_fault_t
xfs_write_fault(
__xfs_write_fault(
struct vm_fault *vmf,
unsigned int order)
unsigned int order,
struct xfs_zone_alloc_ctx *ac)
{
struct inode *inode = file_inode(vmf->vma->vm_file);
struct xfs_inode *ip = XFS_I(inode);
@@ -1511,13 +1746,50 @@ xfs_write_fault(
if (IS_DAX(inode))
ret = xfs_dax_fault_locked(vmf, order, true);
else
ret = iomap_page_mkwrite(vmf, &xfs_buffered_write_iomap_ops);
ret = iomap_page_mkwrite(vmf, &xfs_buffered_write_iomap_ops,
ac);
xfs_iunlock(ip, lock_mode);

sb_end_pagefault(inode->i_sb);
return ret;
}

static vm_fault_t
xfs_write_fault_zoned(
struct vm_fault *vmf,
unsigned int order)
{
struct xfs_inode *ip = XFS_I(file_inode(vmf->vma->vm_file));
unsigned int len = folio_size(page_folio(vmf->page));
struct xfs_zone_alloc_ctx ac = { };
int error;
vm_fault_t ret;

/*
* This could over-allocate as it doesn't check for truncation.
*
* But as the overallocation is limited to less than a folio and will be
* released instantly that's just fine.
*/
error = xfs_zoned_space_reserve(ip, XFS_B_TO_FSB(ip->i_mount, len), 0,
&ac);
if (error < 0)
return vmf_fs_error(error);
ret = __xfs_write_fault(vmf, order, &ac);
xfs_zoned_space_unreserve(ip, &ac);
return ret;
}

static vm_fault_t
xfs_write_fault(
struct vm_fault *vmf,
unsigned int order)
{
if (xfs_is_zoned_inode(XFS_I(file_inode(vmf->vma->vm_file))))
return xfs_write_fault_zoned(vmf, order);
return __xfs_write_fault(vmf, order, NULL);
}
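
xfs_write_fault_zoned sizes its reservation from the faulting folio. A sketch of the byte-to-block round-up it relies on (the block log value in the example is hypothetical):

#include <stdint.h>

/* Round a byte count up to whole filesystem blocks (1 << blocklog each). */
static uint64_t bytes_to_fsb_roundup(uint64_t bytes, unsigned int blocklog)
{
        return (bytes + (1ull << blocklog) - 1) >> blocklog;
}
/* A 16k folio on a 4k-block fs: bytes_to_fsb_roundup(16384, 12) == 4 */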

static inline bool
xfs_is_write_fault(
struct vm_fault *vmf)
@@ -1626,7 +1898,8 @@ const struct file_operations xfs_file_operations = {
.fadvise = xfs_file_fadvise,
.remap_file_range = xfs_file_remap_range,
.fop_flags = FOP_MMAP_SYNC | FOP_BUFFER_RASYNC |
FOP_BUFFER_WASYNC | FOP_DIO_PARALLEL_WRITE,
FOP_BUFFER_WASYNC | FOP_DIO_PARALLEL_WRITE |
FOP_DONTCACHE,
};

const struct file_operations xfs_dir_file_operations = {

@@ -879,17 +879,39 @@ xfs_getfsmap_rtdev_rmapbt(
struct xfs_mount *mp = tp->t_mountp;
struct xfs_rtgroup *rtg = NULL;
struct xfs_btree_cur *bt_cur = NULL;
xfs_daddr_t rtstart_daddr;
xfs_rtblock_t start_rtb;
xfs_rtblock_t end_rtb;
xfs_rgnumber_t start_rg, end_rg;
uint64_t eofs;
int error = 0;

eofs = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks);
eofs = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rtstart + mp->m_sb.sb_rblocks);
if (keys[0].fmr_physical >= eofs)
return 0;
start_rtb = xfs_daddr_to_rtb(mp, keys[0].fmr_physical);
end_rtb = xfs_daddr_to_rtb(mp, min(eofs - 1, keys[1].fmr_physical));

rtstart_daddr = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rtstart);
if (keys[0].fmr_physical < rtstart_daddr) {
struct xfs_fsmap_irec frec = {
.owner = XFS_RMAP_OWN_FS,
.len_daddr = rtstart_daddr,
};

/* Adjust the low key if we are continuing from where we left off. */
if (keys[0].fmr_length > 0) {
info->low_daddr = keys[0].fmr_physical + keys[0].fmr_length;
return 0;
}

/* Fabricate an rmap entry for space occupied by the data dev */
error = xfs_getfsmap_helper(tp, info, &frec);
if (error)
return error;
}

start_rtb = xfs_daddr_to_rtb(mp, rtstart_daddr + keys[0].fmr_physical);
end_rtb = xfs_daddr_to_rtb(mp, rtstart_daddr +
min(eofs - 1, keys[1].fmr_physical));

info->missing_owner = XFS_FMR_OWN_FREE;

@@ -1004,22 +1026,40 @@ xfs_getfsmap_rtdev_rmapbt(
}
#endif /* CONFIG_XFS_RT */

static uint32_t
xfs_getfsmap_device(
struct xfs_mount *mp,
enum xfs_device dev)
{
if (mp->m_sb.sb_rtstart)
return dev;

switch (dev) {
case XFS_DEV_DATA:
return new_encode_dev(mp->m_ddev_targp->bt_dev);
case XFS_DEV_LOG:
return new_encode_dev(mp->m_logdev_targp->bt_dev);
case XFS_DEV_RT:
if (!mp->m_rtdev_targp)
break;
return new_encode_dev(mp->m_rtdev_targp->bt_dev);
}

return -1;
}
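
Under an internal RT section (sb_rtstart != 0) the function above reports the enum value itself as a synthetic device id; otherwise it falls back to the real dev_t of each target. A userspace mirror of that logic, with stand-in dev_t values (the constants are invented for illustration):

#include <stdbool.h>
#include <stdint.h>

enum fake_xfs_device { DEV_DATA = 1, DEV_LOG = 2, DEV_RT = 3 };

static uint32_t fsmap_device(bool internal_rt, enum fake_xfs_device dev,
        bool have_rtdev)
{
        if (internal_rt)
                return dev;             /* synthetic per-section device id */
        switch (dev) {
        case DEV_DATA:
                return 0x800010;        /* stand-in for the data dev_t */
        case DEV_LOG:
                return 0x800020;        /* stand-in for the log dev_t */
        case DEV_RT:
                if (have_rtdev)
                        return 0x800030; /* stand-in for the rt dev_t */
                break;
        }
        return (uint32_t)-1;
}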

/* Do we recognize the device? */
STATIC bool
xfs_getfsmap_is_valid_device(
struct xfs_mount *mp,
struct xfs_fsmap *fm)
{
if (fm->fmr_device == 0 || fm->fmr_device == UINT_MAX ||
fm->fmr_device == new_encode_dev(mp->m_ddev_targp->bt_dev))
return true;
if (mp->m_logdev_targp &&
fm->fmr_device == new_encode_dev(mp->m_logdev_targp->bt_dev))
return true;
if (mp->m_rtdev_targp &&
fm->fmr_device == new_encode_dev(mp->m_rtdev_targp->bt_dev))
return true;
return false;
return fm->fmr_device == 0 ||
fm->fmr_device == UINT_MAX ||
fm->fmr_device == xfs_getfsmap_device(mp, XFS_DEV_DATA) ||
fm->fmr_device == xfs_getfsmap_device(mp, XFS_DEV_LOG) ||
(mp->m_rtdev_targp &&
fm->fmr_device == xfs_getfsmap_device(mp, XFS_DEV_RT));
}

/* Ensure that the low key is less than the high key. */
@@ -1126,7 +1166,7 @@ xfs_getfsmap(
/* Set up our device handlers. */
memset(handlers, 0, sizeof(handlers));
handlers[0].nr_sectors = XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks);
handlers[0].dev = new_encode_dev(mp->m_ddev_targp->bt_dev);
handlers[0].dev = xfs_getfsmap_device(mp, XFS_DEV_DATA);
if (use_rmap)
handlers[0].fn = xfs_getfsmap_datadev_rmapbt;
else
@@ -1134,13 +1174,17 @@ xfs_getfsmap(
if (mp->m_logdev_targp != mp->m_ddev_targp) {
handlers[1].nr_sectors = XFS_FSB_TO_BB(mp,
mp->m_sb.sb_logblocks);
handlers[1].dev = new_encode_dev(mp->m_logdev_targp->bt_dev);
handlers[1].dev = xfs_getfsmap_device(mp, XFS_DEV_LOG);
handlers[1].fn = xfs_getfsmap_logdev;
}
#ifdef CONFIG_XFS_RT
if (mp->m_rtdev_targp) {
/*
* For zoned file systems there is no rtbitmap, so only support fsmap
* if the caller is privileged enough to use the full rmap version.
*/
if (mp->m_rtdev_targp && (use_rmap || !xfs_has_zoned(mp))) {
handlers[2].nr_sectors = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks);
handlers[2].dev = new_encode_dev(mp->m_rtdev_targp->bt_dev);
handlers[2].dev = xfs_getfsmap_device(mp, XFS_DEV_RT);
if (use_rmap)
handlers[2].fn = xfs_getfsmap_rtdev_rmapbt;
else
@@ -1230,7 +1274,13 @@ xfs_getfsmap(

if (tp)
xfs_trans_cancel(tp);
head->fmh_oflags = FMH_OF_DEV_T;

/*
* For internal RT device we need to report different synthetic devices
* for a single physical device, and thus can't report the actual dev_t.
*/
if (!mp->m_sb.sb_rtstart)
head->fmh_oflags = FMH_OF_DEV_T;
return error;
}


@@ -24,6 +24,7 @@
#include "xfs_rtalloc.h"
#include "xfs_rtrmap_btree.h"
#include "xfs_rtrefcount_btree.h"
#include "xfs_metafile.h"

/*
* Write new AG headers to disk. Non-transactional, but need to be
@@ -307,6 +308,10 @@ xfs_growfs_data(
if (!mutex_trylock(&mp->m_growlock))
return -EWOULDBLOCK;

/* we can't grow the data section when an internal RT section exists */
if (in->newblocks != mp->m_sb.sb_dblocks && mp->m_sb.sb_rtstart)
return -EINVAL;

/* update imaxpct separately to the physical grow of the filesystem */
if (in->imaxpct != mp->m_sb.sb_imax_pct) {
error = xfs_growfs_imaxpct(mp, in->imaxpct);
@@ -366,6 +371,7 @@ xfs_growfs_log(
int
xfs_reserve_blocks(
struct xfs_mount *mp,
enum xfs_free_counter ctr,
uint64_t request)
{
int64_t lcounter, delta;
@@ -373,6 +379,8 @@ xfs_reserve_blocks(
int64_t free;
int error = 0;

ASSERT(ctr < XC_FREE_NR);

/*
* With per-cpu counters, this becomes an interesting problem. We need
* to work out if we are freeing or allocating blocks first, then we can
@@ -391,16 +399,16 @@ xfs_reserve_blocks(
* counters directly since we shouldn't have any problems unreserving
* space.
*/
if (mp->m_resblks > request) {
lcounter = mp->m_resblks_avail - request;
if (mp->m_free[ctr].res_total > request) {
lcounter = mp->m_free[ctr].res_avail - request;
if (lcounter > 0) { /* release unused blocks */
fdblks_delta = lcounter;
mp->m_resblks_avail -= lcounter;
mp->m_free[ctr].res_avail -= lcounter;
}
mp->m_resblks = request;
mp->m_free[ctr].res_total = request;
if (fdblks_delta) {
spin_unlock(&mp->m_sb_lock);
xfs_add_fdblocks(mp, fdblks_delta);
xfs_add_freecounter(mp, ctr, fdblks_delta);
spin_lock(&mp->m_sb_lock);
}

@@ -409,7 +417,7 @@ xfs_reserve_blocks(

/*
* If the request is larger than the current reservation, reserve the
* blocks before we update the reserve counters. Sample m_fdblocks and
* blocks before we update the reserve counters. Sample m_free and
* perform a partial reservation if the request exceeds free space.
*
* The code below estimates how many blocks it can request from
@@ -419,10 +427,10 @@ xfs_reserve_blocks(
* space to fill it because mod_fdblocks will refill an undersized
* reserve when it can.
*/
free = percpu_counter_sum(&mp->m_fdblocks) -
xfs_fdblocks_unavailable(mp);
delta = request - mp->m_resblks;
mp->m_resblks = request;
free = xfs_sum_freecounter_raw(mp, ctr) -
xfs_freecounter_unavailable(mp, ctr);
delta = request - mp->m_free[ctr].res_total;
mp->m_free[ctr].res_total = request;
if (delta > 0 && free > 0) {
/*
* We'll either succeed in getting space from the free block
@@ -436,9 +444,9 @@ xfs_reserve_blocks(
*/
fdblks_delta = min(free, delta);
spin_unlock(&mp->m_sb_lock);
error = xfs_dec_fdblocks(mp, fdblks_delta, 0);
error = xfs_dec_freecounter(mp, ctr, fdblks_delta, 0);
if (!error)
xfs_add_fdblocks(mp, fdblks_delta);
xfs_add_freecounter(mp, ctr, fdblks_delta);
spin_lock(&mp->m_sb_lock);
}
out:
@@ -558,15 +566,13 @@ xfs_fs_reserve_ag_blocks(
return error;
}

if (xfs_has_realtime(mp)) {
err2 = xfs_rt_resv_init(mp);
if (err2 && err2 != -ENOSPC) {
xfs_warn(mp,
"Error %d reserving realtime metadata reserve pool.", err2);
xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
}
err2 = xfs_metafile_resv_init(mp);
if (err2 && err2 != -ENOSPC) {
xfs_warn(mp,
"Error %d reserving realtime metadata reserve pool.", err2);
xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);

if (err2 && !error)
if (!error)
error = err2;
}

@@ -582,9 +588,7 @@ xfs_fs_unreserve_ag_blocks(
{
struct xfs_perag *pag = NULL;

if (xfs_has_realtime(mp))
xfs_rt_resv_free(mp);

xfs_metafile_resv_free(mp);
while ((pag = xfs_perag_next(mp, pag)))
xfs_ag_resv_free(pag);
}

@@ -8,7 +8,8 @@

int xfs_growfs_data(struct xfs_mount *mp, struct xfs_growfs_data *in);
int xfs_growfs_log(struct xfs_mount *mp, struct xfs_growfs_log *in);
int xfs_reserve_blocks(struct xfs_mount *mp, uint64_t request);
int xfs_reserve_blocks(struct xfs_mount *mp, enum xfs_free_counter cnt,
uint64_t request);
int xfs_fs_goingdown(struct xfs_mount *mp, uint32_t inflags);

int xfs_fs_reserve_ag_blocks(struct xfs_mount *mp);

@@ -2073,10 +2073,10 @@ xfs_inodegc_want_queue_rt_file(
{
struct xfs_mount *mp = ip->i_mount;

if (!XFS_IS_REALTIME_INODE(ip))
if (!XFS_IS_REALTIME_INODE(ip) || xfs_has_zoned(mp))
return false;

if (__percpu_counter_compare(&mp->m_frextents,
if (xfs_compare_freecounter(mp, XC_FREE_RTEXTENTS,
mp->m_low_rtexts[XFS_LOWSP_5_PCNT],
XFS_FDBLOCKS_BATCH) < 0)
return true;
@@ -2104,7 +2104,7 @@ xfs_inodegc_want_queue_work(
if (items > mp->m_ino_geo.inodes_per_cluster)
return true;

if (__percpu_counter_compare(&mp->m_fdblocks,
if (xfs_compare_freecounter(mp, XC_FREE_BLOCKS,
mp->m_low_space[XFS_LOWSP_5_PCNT],
XFS_FDBLOCKS_BATCH) < 0)
return true;

@@ -3074,5 +3074,6 @@ bool
xfs_is_always_cow_inode(
const struct xfs_inode *ip)
{
return ip->i_mount->m_always_cow && xfs_has_reflink(ip->i_mount);
return xfs_is_zoned_inode(ip) ||
(ip->i_mount->m_always_cow && xfs_has_reflink(ip->i_mount));
}

@@ -25,19 +25,9 @@ struct xfs_dquot;
typedef struct xfs_inode {
/* Inode linking and identification information. */
struct xfs_mount *i_mount; /* fs mount struct ptr */
union {
struct {
struct xfs_dquot *i_udquot; /* user dquot */
struct xfs_dquot *i_gdquot; /* group dquot */
struct xfs_dquot *i_pdquot; /* project dquot */
};

/*
* Space that has been set aside to accomodate expansions of a
* metadata btree rooted in this file.
*/
uint64_t i_meta_resv_asked;
};
struct xfs_dquot *i_udquot; /* user dquot */
struct xfs_dquot *i_gdquot; /* group dquot */
struct xfs_dquot *i_pdquot; /* project dquot */

/* Inode location stuff */
xfs_ino_t i_ino; /* inode number (agno/agino)*/
@@ -69,8 +59,13 @@ typedef struct xfs_inode {
xfs_rfsblock_t i_nblocks; /* # of direct & btree blocks */
prid_t i_projid; /* owner's project id */
xfs_extlen_t i_extsize; /* basic/minimum extent size */
/* cowextsize is only used for v3 inodes, flushiter for v1/2 */
/*
* i_used_blocks is used for zoned rtrmap inodes,
* i_cowextsize is used for other v3 inodes,
* i_flushiter for v1/2 inodes
*/
union {
uint32_t i_used_blocks; /* used blocks in RTG */
xfs_extlen_t i_cowextsize; /* basic cow extent size */
uint16_t i_flushiter; /* incremented on flush */
};
@@ -309,6 +304,11 @@ static inline bool xfs_is_internal_inode(const struct xfs_inode *ip)
xfs_is_quota_inode(&mp->m_sb, ip->i_ino);
}

static inline bool xfs_is_zoned_inode(const struct xfs_inode *ip)
{
return xfs_has_zoned(ip->i_mount) && XFS_IS_REALTIME_INODE(ip);
}

bool xfs_is_always_cow_inode(const struct xfs_inode *ip);

static inline bool xfs_is_cow_inode(const struct xfs_inode *ip)

@@ -596,6 +596,7 @@ xfs_inode_to_log_dinode(
to->di_changecount = inode_peek_iversion(inode);
to->di_crtime = xfs_inode_to_log_dinode_ts(ip, ip->i_crtime);
to->di_flags2 = ip->i_diflags2;
/* also covers the di_used_blocks union arm: */
to->di_cowextsize = ip->i_cowextsize;
to->di_ino = ip->i_ino;
to->di_lsn = lsn;

@@ -203,6 +203,7 @@ xfs_log_dinode_to_disk(
to->di_crtime = xfs_log_dinode_to_disk_ts(from,
from->di_crtime);
to->di_flags2 = cpu_to_be64(from->di_flags2);
/* also covers the di_used_blocks union arm: */
to->di_cowextsize = cpu_to_be32(from->di_cowextsize);
to->di_ino = cpu_to_be64(from->di_ino);
to->di_lsn = cpu_to_be64(lsn);

@@ -1131,15 +1131,15 @@ xfs_ioctl_getset_resblocks(
error = mnt_want_write_file(filp);
if (error)
return error;
error = xfs_reserve_blocks(mp, fsop.resblks);
error = xfs_reserve_blocks(mp, XC_FREE_BLOCKS, fsop.resblks);
mnt_drop_write_file(filp);
if (error)
return error;
}

spin_lock(&mp->m_sb_lock);
fsop.resblks = mp->m_resblks;
fsop.resblks_avail = mp->m_resblks_avail;
fsop.resblks = mp->m_free[XC_FREE_BLOCKS].res_total;
fsop.resblks_avail = mp->m_free[XC_FREE_BLOCKS].res_avail;
spin_unlock(&mp->m_sb_lock);

if (copy_to_user(arg, &fsop, sizeof(fsop)))
@@ -1155,9 +1155,9 @@ xfs_ioctl_fs_counts(
struct xfs_fsop_counts out = {
.allocino = percpu_counter_read_positive(&mp->m_icount),
.freeino = percpu_counter_read_positive(&mp->m_ifree),
.freedata = percpu_counter_read_positive(&mp->m_fdblocks) -
xfs_fdblocks_unavailable(mp),
.freertx = percpu_counter_read_positive(&mp->m_frextents),
.freedata = xfs_estimate_freecounter(mp, XC_FREE_BLOCKS) -
xfs_freecounter_unavailable(mp, XC_FREE_BLOCKS),
.freertx = xfs_estimate_freecounter(mp, XC_FREE_RTEXTENTS),
};

if (copy_to_user(uarg, &out, sizeof(out)))

@@ -30,6 +30,8 @@
#include "xfs_reflink.h"
#include "xfs_health.h"
#include "xfs_rtbitmap.h"
#include "xfs_icache.h"
#include "xfs_zone_alloc.h"

#define XFS_ALLOC_ALIGN(mp, off) \
(((off) >> mp->m_allocsize_log) << mp->m_allocsize_log)
@@ -431,13 +433,14 @@ xfs_quota_calc_throttle(

static int64_t
xfs_iomap_freesp(
struct percpu_counter *counter,
struct xfs_mount *mp,
unsigned int idx,
uint64_t low_space[XFS_LOWSP_MAX],
int *shift)
{
int64_t freesp;

freesp = percpu_counter_read_positive(counter);
freesp = xfs_estimate_freecounter(mp, idx);
if (freesp < low_space[XFS_LOWSP_5_PCNT]) {
*shift = 2;
if (freesp < low_space[XFS_LOWSP_4_PCNT])
@@ -536,10 +539,10 @@ xfs_iomap_prealloc_size(

if (unlikely(XFS_IS_REALTIME_INODE(ip)))
freesp = xfs_rtbxlen_to_blen(mp,
xfs_iomap_freesp(&mp->m_frextents,
xfs_iomap_freesp(mp, XC_FREE_RTEXTENTS,
mp->m_low_rtexts, &shift));
else
freesp = xfs_iomap_freesp(&mp->m_fdblocks, mp->m_low_space,
freesp = xfs_iomap_freesp(mp, XC_FREE_BLOCKS, mp->m_low_space,
&shift);

/*
@@ -962,6 +965,59 @@ const struct iomap_ops xfs_direct_write_iomap_ops = {
.iomap_begin = xfs_direct_write_iomap_begin,
};

#ifdef CONFIG_XFS_RT
/*
* This is really simple. The space has already been reserved before taking the
* IOLOCK, the actual block allocation is done just before submitting the bio
* and only recorded in the extent map on I/O completion.
*/
static int
xfs_zoned_direct_write_iomap_begin(
struct inode *inode,
loff_t offset,
loff_t length,
unsigned flags,
struct iomap *iomap,
struct iomap *srcmap)
{
struct xfs_inode *ip = XFS_I(inode);
int error;

ASSERT(!(flags & IOMAP_OVERWRITE_ONLY));

/*
* Needs to be pushed down into the allocator so that only writes into
* a single zone can be supported.
*/
if (flags & IOMAP_NOWAIT)
return -EAGAIN;

/*
* Ensure the extent list is in memory so that we don't have to
* read it from the I/O completion handler.
*/
if (xfs_need_iread_extents(&ip->i_df)) {
xfs_ilock(ip, XFS_ILOCK_EXCL);
error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
if (error)
return error;
}

iomap->type = IOMAP_MAPPED;
iomap->flags = IOMAP_F_DIRTY;
iomap->bdev = ip->i_mount->m_rtdev_targp->bt_bdev;
iomap->offset = offset;
iomap->length = length;
iomap->flags = IOMAP_F_ANON_WRITE;
return 0;
}

const struct iomap_ops xfs_zoned_direct_write_iomap_ops = {
.iomap_begin = xfs_zoned_direct_write_iomap_begin,
};
#endif /* CONFIG_XFS_RT */
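
The mapping returned above names no disk blocks; IOMAP_F_ANON_WRITE (described earlier in this series) defers target selection to the bio submission handler. A loose, self-contained model of what such a mapping carries (local stand-in types and flag bits, not the kernel's definitions):

#include <stdint.h>

#define MAP_TYPE_MAPPED 1
#define MAP_F_ANON_WRITE (1u << 0)      /* stand-in flag bit */

struct fake_map {
        int             type;
        unsigned int    flags;
        uint64_t        offset;
        uint64_t        length;
};

/*
 * An anonymous write mapping covers the requested range but assigns no
 * startblock; the bio submission handler is expected to pick the target
 * zone and split the I/O as needed.
 */
static void fill_anon_write_map(struct fake_map *map, uint64_t offset,
        uint64_t length)
{
        map->type = MAP_TYPE_MAPPED;
        map->offset = offset;
        map->length = length;
        map->flags = MAP_F_ANON_WRITE;
}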

static int
xfs_dax_write_iomap_end(
struct inode *inode,
@@ -987,6 +1043,455 @@ const struct iomap_ops xfs_dax_write_iomap_ops = {
.iomap_end = xfs_dax_write_iomap_end,
};

/*
* Convert a hole to a delayed allocation.
*/
static void
xfs_bmap_add_extent_hole_delay(
struct xfs_inode *ip, /* incore inode pointer */
int whichfork,
struct xfs_iext_cursor *icur,
struct xfs_bmbt_irec *new) /* new data to add to file extents */
{
struct xfs_ifork *ifp; /* inode fork pointer */
xfs_bmbt_irec_t left; /* left neighbor extent entry */
xfs_filblks_t newlen=0; /* new indirect size */
xfs_filblks_t oldlen=0; /* old indirect size */
xfs_bmbt_irec_t right; /* right neighbor extent entry */
uint32_t state = xfs_bmap_fork_to_state(whichfork);
xfs_filblks_t temp; /* temp for indirect calculations */

ifp = xfs_ifork_ptr(ip, whichfork);
ASSERT(isnullstartblock(new->br_startblock));

/*
* Check and set flags if this segment has a left neighbor
*/
if (xfs_iext_peek_prev_extent(ifp, icur, &left)) {
state |= BMAP_LEFT_VALID;
if (isnullstartblock(left.br_startblock))
state |= BMAP_LEFT_DELAY;
}

/*
* Check and set flags if the current (right) segment exists.
* If it doesn't exist, we're converting the hole at end-of-file.
*/
if (xfs_iext_get_extent(ifp, icur, &right)) {
state |= BMAP_RIGHT_VALID;
if (isnullstartblock(right.br_startblock))
state |= BMAP_RIGHT_DELAY;
}

/*
* Set contiguity flags on the left and right neighbors.
* Don't let extents get too large, even if the pieces are contiguous.
*/
if ((state & BMAP_LEFT_VALID) && (state & BMAP_LEFT_DELAY) &&
left.br_startoff + left.br_blockcount == new->br_startoff &&
left.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN)
state |= BMAP_LEFT_CONTIG;

if ((state & BMAP_RIGHT_VALID) && (state & BMAP_RIGHT_DELAY) &&
new->br_startoff + new->br_blockcount == right.br_startoff &&
new->br_blockcount + right.br_blockcount <= XFS_MAX_BMBT_EXTLEN &&
(!(state & BMAP_LEFT_CONTIG) ||
(left.br_blockcount + new->br_blockcount +
right.br_blockcount <= XFS_MAX_BMBT_EXTLEN)))
state |= BMAP_RIGHT_CONTIG;

/*
* Switch out based on the contiguity flags.
*/
switch (state & (BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG)) {
case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
/*
* New allocation is contiguous with delayed allocations
* on the left and on the right.
* Merge all three into a single extent record.
*/
temp = left.br_blockcount + new->br_blockcount +
right.br_blockcount;

oldlen = startblockval(left.br_startblock) +
startblockval(new->br_startblock) +
startblockval(right.br_startblock);
newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
oldlen);
left.br_startblock = nullstartblock(newlen);
left.br_blockcount = temp;

xfs_iext_remove(ip, icur, state);
xfs_iext_prev(ifp, icur);
xfs_iext_update_extent(ip, state, icur, &left);
break;

case BMAP_LEFT_CONTIG:
/*
* New allocation is contiguous with a delayed allocation
* on the left.
* Merge the new allocation with the left neighbor.
*/
temp = left.br_blockcount + new->br_blockcount;

oldlen = startblockval(left.br_startblock) +
startblockval(new->br_startblock);
newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
oldlen);
left.br_blockcount = temp;
left.br_startblock = nullstartblock(newlen);

xfs_iext_prev(ifp, icur);
xfs_iext_update_extent(ip, state, icur, &left);
break;

case BMAP_RIGHT_CONTIG:
/*
* New allocation is contiguous with a delayed allocation
* on the right.
* Merge the new allocation with the right neighbor.
*/
temp = new->br_blockcount + right.br_blockcount;
oldlen = startblockval(new->br_startblock) +
startblockval(right.br_startblock);
newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
oldlen);
right.br_startoff = new->br_startoff;
right.br_startblock = nullstartblock(newlen);
right.br_blockcount = temp;
xfs_iext_update_extent(ip, state, icur, &right);
break;

case 0:
/*
* New allocation is not contiguous with another
* delayed allocation.
* Insert a new entry.
*/
oldlen = newlen = 0;
xfs_iext_insert(ip, icur, new, state);
break;
}
if (oldlen != newlen) {
ASSERT(oldlen > newlen);
xfs_add_fdblocks(ip->i_mount, oldlen - newlen);

/*
* Nothing to do for disk quota accounting here.
*/
xfs_mod_delalloc(ip, 0, (int64_t)newlen - oldlen);
}
}
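
Each merge case above recomputes the indirect reservation as min(worst-case estimate, sum of the old reservations) and returns the surplus to the free-space counter. A toy version of that bookkeeping (the worst-case formula here is invented for illustration; the real xfs_bmap_worst_indlen walks btree levels):

#include <stdint.h>

static uint64_t minu64(uint64_t a, uint64_t b) { return a < b ? a : b; }

/* Invented estimate: one indirect block per 256 mapped blocks, rounded up. */
static uint64_t toy_worst_indlen(uint64_t blocks)
{
        return (blocks + 255) / 256;
}

/*
 * Merging delalloc extents never grows the indirect reservation: cap the
 * new estimate by the old total and hand the surplus back to free space.
 */
static uint64_t merged_indlen(uint64_t merged_blocks,
        uint64_t old_indlen_sum, uint64_t *surplus)
{
        uint64_t newlen = minu64(toy_worst_indlen(merged_blocks),
                        old_indlen_sum);

        *surplus = old_indlen_sum - newlen;
        return newlen;
}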

/*
* Add a delayed allocation extent to an inode. Blocks are reserved from the
* global pool and the extent inserted into the inode in-core extent tree.
*
* On entry, got refers to the first extent beyond the offset of the extent to
* allocate or eof is specified if no such extent exists. On return, got refers
* to the extent record that was inserted to the inode fork.
*
* Note that the allocated extent may have been merged with contiguous extents
* during insertion into the inode fork. Thus, got does not reflect the current
* state of the inode fork on return. If necessary, the caller can use lastx to
* look up the updated record in the inode fork.
*/
static int
xfs_bmapi_reserve_delalloc(
struct xfs_inode *ip,
int whichfork,
xfs_fileoff_t off,
xfs_filblks_t len,
xfs_filblks_t prealloc,
struct xfs_bmbt_irec *got,
struct xfs_iext_cursor *icur,
int eof)
{
struct xfs_mount *mp = ip->i_mount;
struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
xfs_extlen_t alen;
xfs_extlen_t indlen;
uint64_t fdblocks;
int error;
xfs_fileoff_t aoff;
bool use_cowextszhint =
whichfork == XFS_COW_FORK && !prealloc;

retry:
/*
* Cap the alloc length. Keep track of prealloc so we know whether to
* tag the inode before we return.
*/
aoff = off;
alen = XFS_FILBLKS_MIN(len + prealloc, XFS_MAX_BMBT_EXTLEN);
if (!eof)
alen = XFS_FILBLKS_MIN(alen, got->br_startoff - aoff);
if (prealloc && alen >= len)
prealloc = alen - len;

/*
* If we're targeting the COW fork but aren't creating a speculative
* posteof preallocation, try to expand the reservation to align with
* the COW extent size hint if there's sufficient free space.
*
* Unlike the data fork, the CoW cancellation functions will free all
* the reservations at inactivation, so we don't require that every
* delalloc reservation have a dirty pagecache.
*/
if (use_cowextszhint) {
struct xfs_bmbt_irec prev;
xfs_extlen_t extsz = xfs_get_cowextsz_hint(ip);

if (!xfs_iext_peek_prev_extent(ifp, icur, &prev))
prev.br_startoff = NULLFILEOFF;

error = xfs_bmap_extsize_align(mp, got, &prev, extsz, 0, eof,
1, 0, &aoff, &alen);
ASSERT(!error);
}

/*
* Make a transaction-less quota reservation for delayed allocation
* blocks. This number gets adjusted later. We return if we haven't
* allocated blocks already inside this loop.
*/
error = xfs_quota_reserve_blkres(ip, alen);
if (error)
goto out;

/*
* Split changing sb for alen and indlen since they could be coming
* from different places.
*/
indlen = (xfs_extlen_t)xfs_bmap_worst_indlen(ip, alen);
ASSERT(indlen > 0);

fdblocks = indlen;
if (XFS_IS_REALTIME_INODE(ip)) {
ASSERT(!xfs_is_zoned_inode(ip));
error = xfs_dec_frextents(mp, xfs_blen_to_rtbxlen(mp, alen));
if (error)
goto out_unreserve_quota;
} else {
fdblocks += alen;
}

error = xfs_dec_fdblocks(mp, fdblocks, false);
if (error)
goto out_unreserve_frextents;

ip->i_delayed_blks += alen;
xfs_mod_delalloc(ip, alen, indlen);

got->br_startoff = aoff;
got->br_startblock = nullstartblock(indlen);
got->br_blockcount = alen;
got->br_state = XFS_EXT_NORM;

xfs_bmap_add_extent_hole_delay(ip, whichfork, icur, got);

/*
* Tag the inode if blocks were preallocated. Note that COW fork
* preallocation can occur at the start or end of the extent, even when
* prealloc == 0, so we must also check the aligned offset and length.
*/
if (whichfork == XFS_DATA_FORK && prealloc)
xfs_inode_set_eofblocks_tag(ip);
if (whichfork == XFS_COW_FORK && (prealloc || aoff < off || alen > len))
xfs_inode_set_cowblocks_tag(ip);

return 0;

out_unreserve_frextents:
if (XFS_IS_REALTIME_INODE(ip))
xfs_add_frextents(mp, xfs_blen_to_rtbxlen(mp, alen));
out_unreserve_quota:
if (XFS_IS_QUOTA_ON(mp))
xfs_quota_unreserve_blkres(ip, alen);
out:
if (error == -ENOSPC || error == -EDQUOT) {
trace_xfs_delalloc_enospc(ip, off, len);

if (prealloc || use_cowextszhint) {
/* retry without any preallocation */
use_cowextszhint = false;
prealloc = 0;
goto retry;
}
}
return error;
}

static int
xfs_zoned_buffered_write_iomap_begin(
struct inode *inode,
loff_t offset,
loff_t count,
unsigned flags,
struct iomap *iomap,
struct iomap *srcmap)
{
struct iomap_iter *iter =
container_of(iomap, struct iomap_iter, iomap);
struct xfs_zone_alloc_ctx *ac = iter->private;
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
xfs_fileoff_t end_fsb = xfs_iomap_end_fsb(mp, offset, count);
u16 iomap_flags = IOMAP_F_SHARED;
unsigned int lockmode = XFS_ILOCK_EXCL;
xfs_filblks_t count_fsb;
xfs_extlen_t indlen;
struct xfs_bmbt_irec got;
struct xfs_iext_cursor icur;
int error = 0;

ASSERT(!xfs_get_extsz_hint(ip));
ASSERT(!(flags & IOMAP_UNSHARE));
ASSERT(ac);

if (xfs_is_shutdown(mp))
return -EIO;

error = xfs_qm_dqattach(ip);
if (error)
return error;

error = xfs_ilock_for_iomap(ip, flags, &lockmode);
if (error)
return error;

if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(&ip->i_df)) ||
XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
xfs_bmap_mark_sick(ip, XFS_DATA_FORK);
error = -EFSCORRUPTED;
goto out_unlock;
}

XFS_STATS_INC(mp, xs_blk_mapw);

error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
if (error)
goto out_unlock;

/*
* For zeroing operations check if there is any data to zero first.
*
* For regular writes we always need to allocate new blocks, but need to
* provide the source mapping when the range is unaligned to support
* read-modify-write of the whole block in the page cache.
*
* In either case we need to limit the reported range to the boundaries
* of the source map in the data fork.
*/
if (!IS_ALIGNED(offset, mp->m_sb.sb_blocksize) ||
!IS_ALIGNED(offset + count, mp->m_sb.sb_blocksize) ||
(flags & IOMAP_ZERO)) {
struct xfs_bmbt_irec smap;
struct xfs_iext_cursor scur;

if (!xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &scur,
&smap))
smap.br_startoff = end_fsb; /* fake hole until EOF */
if (smap.br_startoff > offset_fsb) {
/*
* We never need to allocate blocks for zeroing a hole.
*/
if (flags & IOMAP_ZERO) {
xfs_hole_to_iomap(ip, iomap, offset_fsb,
smap.br_startoff);
goto out_unlock;
}
end_fsb = min(end_fsb, smap.br_startoff);
} else {
end_fsb = min(end_fsb,
smap.br_startoff + smap.br_blockcount);
xfs_trim_extent(&smap, offset_fsb,
end_fsb - offset_fsb);
error = xfs_bmbt_to_iomap(ip, srcmap, &smap, flags, 0,
xfs_iomap_inode_sequence(ip, 0));
if (error)
goto out_unlock;
}
}

if (!ip->i_cowfp)
xfs_ifork_init_cow(ip);

if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got))
got.br_startoff = end_fsb;
if (got.br_startoff <= offset_fsb) {
trace_xfs_reflink_cow_found(ip, &got);
goto done;
}

/*
* Cap the maximum length to keep the chunks of work done here somewhat
* symmetric with the work writeback does.
*/
end_fsb = min(end_fsb, got.br_startoff);
count_fsb = min3(end_fsb - offset_fsb, XFS_MAX_BMBT_EXTLEN,
XFS_B_TO_FSB(mp, 1024 * PAGE_SIZE));

/*
* The block reservation is supposed to cover all blocks that the
* operation could possibly write, but there is a nasty corner case
* where blocks could be stolen from underneath us:
*
* 1) while this thread iterates over a larger buffered write,
* 2) another thread is causing a write fault that calls into
* ->page_mkwrite in range this thread writes to, using up the
* delalloc reservation created by a previous call to this function.
* 3) another thread does direct I/O on the range that the write fault
* happened on, which causes writeback of the dirty data.
* 4) this then sets the stale flag, which cuts the current iomap
* iteration short, causing the new call to ->iomap_begin that gets
* us here again, but now without a sufficient reservation.
*
* This is a very unusual I/O pattern, and nothing but generic/095 is
* known to hit it. There's not really much we can do here, so turn this
* into a short write.
*/
if (count_fsb > ac->reserved_blocks) {
xfs_warn_ratelimited(mp,
"Short write on ino 0x%llx comm %.20s due to three-way race with write fault and direct I/O",
ip->i_ino, current->comm);
count_fsb = ac->reserved_blocks;
if (!count_fsb) {
error = -EIO;
goto out_unlock;
}
}

error = xfs_quota_reserve_blkres(ip, count_fsb);
if (error)
goto out_unlock;

indlen = xfs_bmap_worst_indlen(ip, count_fsb);
error = xfs_dec_fdblocks(mp, indlen, false);
if (error)
goto out_unlock;
ip->i_delayed_blks += count_fsb;
xfs_mod_delalloc(ip, count_fsb, indlen);

got.br_startoff = offset_fsb;
got.br_startblock = nullstartblock(indlen);
got.br_blockcount = count_fsb;
got.br_state = XFS_EXT_NORM;
xfs_bmap_add_extent_hole_delay(ip, XFS_COW_FORK, &icur, &got);
ac->reserved_blocks -= count_fsb;
iomap_flags |= IOMAP_F_NEW;

trace_xfs_iomap_alloc(ip, offset, XFS_FSB_TO_B(mp, count_fsb),
XFS_COW_FORK, &got);
done:
error = xfs_bmbt_to_iomap(ip, iomap, &got, flags, iomap_flags,
xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED));
out_unlock:
xfs_iunlock(ip, lockmode);
return error;
}
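
The short-write fallback above is just a clamp against the remaining reservation. Isolated into a sketch (a zero result models the -EIO case where the reservation was fully stolen):

#include <stdint.h>

/* Clamp a mapping request to the reservation and consume what is used. */
static uint64_t clamp_to_reservation(uint64_t count_fsb,
        uint64_t *reserved_blocks)
{
        if (count_fsb > *reserved_blocks)
                count_fsb = *reserved_blocks;   /* turn into a short write */
        *reserved_blocks -= count_fsb;
        return count_fsb;               /* zero here maps to -EIO above */
}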

static int
xfs_buffered_write_iomap_begin(
struct inode *inode,
@@ -1013,6 +1518,10 @@ xfs_buffered_write_iomap_begin(
if (xfs_is_shutdown(mp))
return -EIO;

if (xfs_is_zoned_inode(ip))
return xfs_zoned_buffered_write_iomap_begin(inode, offset,
count, flags, iomap, srcmap);

/* we can't use delayed allocations when using extent size hints */
if (xfs_get_extsz_hint(ip))
return xfs_direct_write_iomap_begin(inode, offset, count,
@@ -1245,10 +1754,13 @@ xfs_buffered_write_delalloc_punch(
loff_t length,
struct iomap *iomap)
{
struct iomap_iter *iter =
container_of(iomap, struct iomap_iter, iomap);

xfs_bmap_punch_delalloc_range(XFS_I(inode),
(iomap->flags & IOMAP_F_SHARED) ?
XFS_COW_FORK : XFS_DATA_FORK,
offset, offset + length);
offset, offset + length, iter->private);
}

static int
@@ -1485,6 +1997,7 @@ xfs_zero_range(
struct xfs_inode *ip,
loff_t pos,
loff_t len,
struct xfs_zone_alloc_ctx *ac,
bool *did_zero)
{
struct inode *inode = VFS_I(ip);
@@ -1495,13 +2008,14 @@ xfs_zero_range(
return dax_zero_range(inode, pos, len, did_zero,
&xfs_dax_write_iomap_ops);
return iomap_zero_range(inode, pos, len, did_zero,
&xfs_buffered_write_iomap_ops);
&xfs_buffered_write_iomap_ops, ac);
}

int
xfs_truncate_page(
struct xfs_inode *ip,
loff_t pos,
struct xfs_zone_alloc_ctx *ac,
bool *did_zero)
{
struct inode *inode = VFS_I(ip);
@@ -1510,5 +2024,5 @@ xfs_truncate_page(
return dax_truncate_page(inode, pos, did_zero,
&xfs_dax_write_iomap_ops);
return iomap_truncate_page(inode, pos, did_zero,
&xfs_buffered_write_iomap_ops);
&xfs_buffered_write_iomap_ops, ac);
}

@@ -10,6 +10,7 @@

struct xfs_inode;
struct xfs_bmbt_irec;
struct xfs_zone_alloc_ctx;

int xfs_iomap_write_direct(struct xfs_inode *ip, xfs_fileoff_t offset_fsb,
xfs_fileoff_t count_fsb, unsigned int flags,
@@ -24,8 +25,9 @@ int xfs_bmbt_to_iomap(struct xfs_inode *ip, struct iomap *iomap,
u16 iomap_flags, u64 sequence_cookie);

int xfs_zero_range(struct xfs_inode *ip, loff_t pos, loff_t len,
bool *did_zero);
int xfs_truncate_page(struct xfs_inode *ip, loff_t pos, bool *did_zero);
struct xfs_zone_alloc_ctx *ac, bool *did_zero);
int xfs_truncate_page(struct xfs_inode *ip, loff_t pos,
struct xfs_zone_alloc_ctx *ac, bool *did_zero);

static inline xfs_filblks_t
xfs_aligned_fsb_count(
@@ -49,6 +51,7 @@ xfs_aligned_fsb_count(

extern const struct iomap_ops xfs_buffered_write_iomap_ops;
extern const struct iomap_ops xfs_direct_write_iomap_ops;
extern const struct iomap_ops xfs_zoned_direct_write_iomap_ops;
extern const struct iomap_ops xfs_read_iomap_ops;
extern const struct iomap_ops xfs_seek_iomap_ops;
extern const struct iomap_ops xfs_xattr_iomap_ops;

@@ -29,6 +29,7 @@
#include "xfs_xattr.h"
#include "xfs_file.h"
#include "xfs_bmap.h"
#include "xfs_zone_alloc.h"

#include <linux/posix_acl.h>
#include <linux/security.h>
@@ -854,6 +855,7 @@ xfs_setattr_size(
uint lock_flags = 0;
uint resblks = 0;
bool did_zeroing = false;
struct xfs_zone_alloc_ctx ac = { };

xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL);
ASSERT(S_ISREG(inode->i_mode));
@@ -889,6 +891,28 @@ xfs_setattr_size(
*/
inode_dio_wait(inode);

/*
* Normally xfs_zoned_space_reserve is supposed to be called outside the
* IOLOCK. For truncate we can't do that since ->setattr is called with
* it already held by the VFS. So for now chicken out and try to
* allocate space under it.
*
* To avoid deadlocks this means we can't block waiting for space, which
* can lead to spurious -ENOSPC if there are no directly available
* blocks. We mitigate this a bit by allowing zeroing to dip into the
* reserved pool, but eventually the VFS calling convention needs to
* change.
*/
if (xfs_is_zoned_inode(ip)) {
error = xfs_zoned_space_reserve(ip, 1,
XFS_ZR_NOWAIT | XFS_ZR_RESERVED, &ac);
if (error) {
if (error == -EAGAIN)
return -ENOSPC;
return error;
}
}

/*
* File data changes must be complete before we start the transaction to
* modify the inode. This needs to be done before joining the inode to
@@ -902,11 +926,14 @@ xfs_setattr_size(
if (newsize > oldsize) {
trace_xfs_zero_eof(ip, oldsize, newsize - oldsize);
error = xfs_zero_range(ip, oldsize, newsize - oldsize,
&did_zeroing);
&ac, &did_zeroing);
} else {
error = xfs_truncate_page(ip, newsize, &did_zeroing);
error = xfs_truncate_page(ip, newsize, &ac, &did_zeroing);
}

if (xfs_is_zoned_inode(ip))
xfs_zoned_space_unreserve(ip, &ac);

if (error)
return error;


@@ -20,6 +20,7 @@
#include "xfs_sysfs.h"
#include "xfs_sb.h"
#include "xfs_health.h"
#include "xfs_zone_alloc.h"

struct kmem_cache *xfs_log_ticket_cache;

@@ -3540,6 +3541,9 @@ xlog_force_shutdown(
spin_unlock(&log->l_icloglock);

wake_up_var(&log->l_opstate);
if (IS_ENABLED(CONFIG_XFS_RT) && xfs_has_zoned(log->l_mp))
xfs_zoned_wake_all(log->l_mp);

return log_error;
}


@@ -173,6 +173,10 @@ xfs_warn_experimental(
.opstate = XFS_OPSTATE_WARNED_METADIR,
.name = "metadata directory tree",
},
[XFS_EXPERIMENTAL_ZONED] = {
.opstate = XFS_OPSTATE_WARNED_ZONED,
.name = "zoned RT device",
},
};
ASSERT(feat >= 0 && feat < XFS_EXPERIMENTAL_MAX);
BUILD_BUG_ON(ARRAY_SIZE(features) != XFS_EXPERIMENTAL_MAX);

@@ -99,6 +99,7 @@ enum xfs_experimental_feat {
XFS_EXPERIMENTAL_EXCHRANGE,
XFS_EXPERIMENTAL_PPTR,
XFS_EXPERIMENTAL_METADIR,
XFS_EXPERIMENTAL_ZONED,

XFS_EXPERIMENTAL_MAX,
};

@@ -40,6 +40,7 @@
#include "xfs_rtrmap_btree.h"
#include "xfs_rtrefcount_btree.h"
#include "scrub/stats.h"
#include "xfs_zone_alloc.h"

static DEFINE_MUTEX(xfs_uuid_table_mutex);
static int xfs_uuid_table_size;
@@ -461,22 +462,38 @@ xfs_mount_reset_sbqflags(
return xfs_sync_sb(mp, false);
}

uint64_t
xfs_default_resblks(xfs_mount_t *mp)
{
uint64_t resblks;
static const char *const xfs_free_pool_name[] = {
[XC_FREE_BLOCKS] = "free blocks",
[XC_FREE_RTEXTENTS] = "free rt extents",
[XC_FREE_RTAVAILABLE] = "available rt extents",
};

/*
* We default to 5% or 8192 fsbs of space reserved, whichever is
* smaller. This is intended to cover concurrent allocation
* transactions when we initially hit enospc. These each require a 4
* block reservation. Hence by default we cover roughly 2000 concurrent
* allocation reservations.
*/
resblks = mp->m_sb.sb_dblocks;
do_div(resblks, 20);
resblks = min_t(uint64_t, resblks, 8192);
return resblks;
uint64_t
xfs_default_resblks(
struct xfs_mount *mp,
enum xfs_free_counter ctr)
{
switch (ctr) {
case XC_FREE_BLOCKS:
/*
* Default to 5% or 8192 FSBs of space reserved, whichever is
* smaller.
*
* This is intended to cover concurrent allocation transactions
* when we initially hit ENOSPC. These each require a 4 block
* reservation. Hence by default we cover roughly 2000
* concurrent allocation reservations.
*/
return min(div_u64(mp->m_sb.sb_dblocks, 20), 8192ULL);
case XC_FREE_RTEXTENTS:
case XC_FREE_RTAVAILABLE:
if (IS_ENABLED(CONFIG_XFS_RT) && xfs_has_zoned(mp))
return xfs_zoned_default_resblks(mp, ctr);
return 0;
default:
ASSERT(0);
return 0;
}
}
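
The XC_FREE_BLOCKS arm above keeps the historic sizing rule. The same computation as a standalone function, with a worked example:

#include <stdint.h>

/* 5% of the data device or 8192 blocks, whichever is smaller. */
static uint64_t default_resblks(uint64_t dblocks)
{
        uint64_t five_pct = dblocks / 20;

        return five_pct < 8192 ? five_pct : 8192;
}
/* default_resblks(100000) == 5000; default_resblks(1048576) == 8192 */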

/* Ensure the summary counts are correct. */
@@ -543,7 +560,7 @@ xfs_check_summary_counts(
* If we're mounting the rt volume after recovering the log, recompute
* frextents from the rtbitmap file to fix the inconsistency.
*/
if (xfs_has_realtime(mp) && !xfs_is_clean(mp)) {
if (xfs_has_realtime(mp) && !xfs_has_zoned(mp) && !xfs_is_clean(mp)) {
error = xfs_rtalloc_reinit_frextents(mp);
if (error)
return error;
@@ -678,6 +695,7 @@ xfs_mountfs(
uint quotamount = 0;
uint quotaflags = 0;
int error = 0;
int i;

xfs_sb_mount_common(mp, sbp);

@@ -747,27 +765,15 @@ xfs_mountfs(
/* enable fail_at_unmount as default */
mp->m_fail_unmount = true;

super_set_sysfs_name_id(mp->m_super);

error = xfs_sysfs_init(&mp->m_kobj, &xfs_mp_ktype,
NULL, mp->m_super->s_id);
if (error)
goto out;

error = xfs_sysfs_init(&mp->m_stats.xs_kobj, &xfs_stats_ktype,
&mp->m_kobj, "stats");
if (error)
goto out_remove_sysfs;

xchk_stats_register(mp->m_scrub_stats, mp->m_debugfs);

error = xfs_error_sysfs_init(mp);
error = xfs_mount_sysfs_init(mp);
if (error)
goto out_remove_scrub_stats;

xchk_stats_register(mp->m_scrub_stats, mp->m_debugfs);

error = xfs_errortag_init(mp);
if (error)
goto out_remove_error_sysfs;
goto out_remove_sysfs;

error = xfs_uuid_mount(mp);
if (error)
@@ -1031,6 +1037,12 @@ xfs_mountfs(
if (xfs_is_readonly(mp) && !xfs_has_norecovery(mp))
xfs_log_clean(mp);

if (xfs_has_zoned(mp)) {
error = xfs_mount_zones(mp);
if (error)
goto out_rtunmount;
}

/*
* Complete the quota initialisation, post-log-replay component.
*/
@@ -1046,22 +1058,28 @@ xfs_mountfs(
* privileged transactions. This is needed so that transaction
* space required for critical operations can dip into this pool
* when at ENOSPC. This is needed for operations like create with
* attr, unwritten extent conversion at ENOSPC, etc. Data allocations
* are not allowed to use this reserved space.
* attr, unwritten extent conversion at ENOSPC, garbage collection
* etc. Data allocations are not allowed to use this reserved space.
*
* This may drive us straight to ENOSPC on mount, but that implies
* we were already there on the last unmount. Warn if this occurs.
*/
if (!xfs_is_readonly(mp)) {
error = xfs_reserve_blocks(mp, xfs_default_resblks(mp));
if (error)
xfs_warn(mp,
"Unable to allocate reserve blocks. Continuing without reserve pool.");
for (i = 0; i < XC_FREE_NR; i++) {
error = xfs_reserve_blocks(mp, i,
xfs_default_resblks(mp, i));
if (error)
xfs_warn(mp,
"Unable to allocate reserve blocks. Continuing without reserve pool for %s.",
xfs_free_pool_name[i]);
}

/* Reserve AG blocks for future btree expansion. */
error = xfs_fs_reserve_ag_blocks(mp);
if (error && error != -ENOSPC)
goto out_agresv;

xfs_zone_gc_start(mp);
}

return 0;
@@ -1069,6 +1087,8 @@ xfs_mountfs(
out_agresv:
xfs_fs_unreserve_ag_blocks(mp);
xfs_qm_unmount_quotas(mp);
if (xfs_has_zoned(mp))
xfs_unmount_zones(mp);
out_rtunmount:
xfs_rtunmount_inodes(mp);
out_rele_rip:
@@ -1116,13 +1136,10 @@ xfs_mountfs(
xfs_uuid_unmount(mp);
out_remove_errortag:
xfs_errortag_del(mp);
out_remove_error_sysfs:
xfs_error_sysfs_del(mp);
out_remove_sysfs:
xfs_mount_sysfs_del(mp);
out_remove_scrub_stats:
xchk_stats_unregister(mp->m_scrub_stats);
xfs_sysfs_del(&mp->m_stats.xs_kobj);
out_remove_sysfs:
xfs_sysfs_del(&mp->m_kobj);
out:
return error;
}
@@ -1148,8 +1165,12 @@ xfs_unmountfs(
xfs_inodegc_flush(mp);

xfs_blockgc_stop(mp);
if (!test_bit(XFS_OPSTATE_READONLY, &mp->m_opstate))
xfs_zone_gc_stop(mp);
xfs_fs_unreserve_ag_blocks(mp);
xfs_qm_unmount_quotas(mp);
if (xfs_has_zoned(mp))
xfs_unmount_zones(mp);
xfs_rtunmount_inodes(mp);
xfs_irele(mp->m_rootip);
if (mp->m_metadirip)
@@ -1173,7 +1194,7 @@ xfs_unmountfs(
* we only every apply deltas to the superblock and hence the incore
* value does not matter....
*/
error = xfs_reserve_blocks(mp, 0);
error = xfs_reserve_blocks(mp, XC_FREE_BLOCKS, 0);
if (error)
xfs_warn(mp, "Unable to free reserved block pool. "
"Freespace may not be correct on next mount.");
@@ -1195,10 +1216,8 @@ xfs_unmountfs(
xfs_free_rtgroups(mp, 0, mp->m_sb.sb_rgcount);
xfs_free_perag_range(mp, 0, mp->m_sb.sb_agcount);
xfs_errortag_del(mp);
xfs_error_sysfs_del(mp);
xchk_stats_unregister(mp->m_scrub_stats);
xfs_sysfs_del(&mp->m_stats.xs_kobj);
xfs_sysfs_del(&mp->m_kobj);
xfs_mount_sysfs_del(mp);
}

/*
@@ -1220,52 +1239,67 @@ xfs_fs_writable(
return true;
}

/*
* Estimate the amount of free space that is not available to userspace and is
* not explicitly reserved from the incore fdblocks. This includes:
*
* - The minimum number of blocks needed to support splitting a bmap btree
* - The blocks currently in use by the freespace btrees because they record
* the actual blocks that will fill per-AG metadata space reservations
*/
uint64_t
xfs_freecounter_unavailable(
struct xfs_mount *mp,
enum xfs_free_counter ctr)
{
if (ctr != XC_FREE_BLOCKS)
||||
return 0;
|
||||
return mp->m_alloc_set_aside + atomic64_read(&mp->m_allocbt_blks);
|
||||
}
|
||||
|
||||
void
|
||||
xfs_add_freecounter(
|
||||
struct xfs_mount *mp,
|
||||
struct percpu_counter *counter,
|
||||
enum xfs_free_counter ctr,
|
||||
uint64_t delta)
|
||||
{
|
||||
bool has_resv_pool = (counter == &mp->m_fdblocks);
|
||||
struct xfs_freecounter *counter = &mp->m_free[ctr];
|
||||
uint64_t res_used;
|
||||
|
||||
/*
|
||||
* If the reserve pool is depleted, put blocks back into it first.
|
||||
* Most of the time the pool is full.
|
||||
*/
|
||||
if (!has_resv_pool || mp->m_resblks == mp->m_resblks_avail) {
|
||||
percpu_counter_add(counter, delta);
|
||||
if (likely(counter->res_avail == counter->res_total)) {
|
||||
percpu_counter_add(&counter->count, delta);
|
||||
return;
|
||||
}
|
||||
|
||||
spin_lock(&mp->m_sb_lock);
|
||||
res_used = mp->m_resblks - mp->m_resblks_avail;
|
||||
res_used = counter->res_total - counter->res_avail;
|
||||
if (res_used > delta) {
|
||||
mp->m_resblks_avail += delta;
|
||||
counter->res_avail += delta;
|
||||
} else {
|
||||
delta -= res_used;
|
||||
mp->m_resblks_avail = mp->m_resblks;
|
||||
percpu_counter_add(counter, delta);
|
||||
counter->res_avail = counter->res_total;
|
||||
percpu_counter_add(&counter->count, delta);
|
||||
}
|
||||
spin_unlock(&mp->m_sb_lock);
|
||||
}
|
||||
|
||||
|
||||
/* Adjust in-core free blocks or RT extents. */
|
||||
int
|
||||
xfs_dec_freecounter(
|
||||
struct xfs_mount *mp,
|
||||
struct percpu_counter *counter,
|
||||
enum xfs_free_counter ctr,
|
||||
uint64_t delta,
|
||||
bool rsvd)
|
||||
{
|
||||
int64_t lcounter;
|
||||
uint64_t set_aside = 0;
|
||||
struct xfs_freecounter *counter = &mp->m_free[ctr];
|
||||
s32 batch;
|
||||
bool has_resv_pool;
|
||||
|
||||
ASSERT(counter == &mp->m_fdblocks || counter == &mp->m_frextents);
|
||||
has_resv_pool = (counter == &mp->m_fdblocks);
|
||||
if (rsvd)
|
||||
ASSERT(has_resv_pool);
|
||||
ASSERT(ctr < XC_FREE_NR);
|
||||
|
||||
/*
|
||||
* Taking blocks away, need to be more accurate the closer we
|
||||
@@ -1275,7 +1309,7 @@ xfs_dec_freecounter(
|
||||
* then make everything serialise as we are real close to
|
||||
* ENOSPC.
|
||||
*/
|
||||
if (__percpu_counter_compare(counter, 2 * XFS_FDBLOCKS_BATCH,
|
||||
if (__percpu_counter_compare(&counter->count, 2 * XFS_FDBLOCKS_BATCH,
|
||||
XFS_FDBLOCKS_BATCH) < 0)
|
||||
batch = 1;
|
||||
else
|
||||
@@ -1292,34 +1326,34 @@ xfs_dec_freecounter(
|
||||
* problems (i.e. transaction abort, pagecache discards, etc.) than
|
||||
* slightly premature -ENOSPC.
|
||||
*/
|
||||
if (has_resv_pool)
|
||||
set_aside = xfs_fdblocks_unavailable(mp);
|
||||
percpu_counter_add_batch(counter, -((int64_t)delta), batch);
|
||||
if (__percpu_counter_compare(counter, set_aside,
|
||||
XFS_FDBLOCKS_BATCH) >= 0) {
|
||||
/* we had space! */
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* lock up the sb for dipping into reserves before releasing the space
|
||||
* that took us to ENOSPC.
|
||||
*/
|
||||
spin_lock(&mp->m_sb_lock);
|
||||
percpu_counter_add(counter, delta);
|
||||
if (!has_resv_pool || !rsvd)
|
||||
goto fdblocks_enospc;
|
||||
|
||||
lcounter = (long long)mp->m_resblks_avail - delta;
|
||||
if (lcounter >= 0) {
|
||||
mp->m_resblks_avail = lcounter;
|
||||
spin_unlock(&mp->m_sb_lock);
|
||||
return 0;
|
||||
}
|
||||
xfs_warn_once(mp,
|
||||
percpu_counter_add_batch(&counter->count, -((int64_t)delta), batch);
|
||||
if (__percpu_counter_compare(&counter->count,
|
||||
xfs_freecounter_unavailable(mp, ctr),
|
||||
XFS_FDBLOCKS_BATCH) < 0) {
|
||||
/*
|
||||
* Lock up the sb for dipping into reserves before releasing the
|
||||
* space that took us to ENOSPC.
|
||||
*/
|
||||
spin_lock(&mp->m_sb_lock);
|
||||
percpu_counter_add(&counter->count, delta);
|
||||
if (!rsvd)
|
||||
goto fdblocks_enospc;
|
||||
if (delta > counter->res_avail) {
|
||||
if (ctr == XC_FREE_BLOCKS)
|
||||
xfs_warn_once(mp,
|
||||
"Reserve blocks depleted! Consider increasing reserve pool size.");
|
||||
goto fdblocks_enospc;
|
||||
}
|
||||
counter->res_avail -= delta;
|
||||
trace_xfs_freecounter_reserved(mp, ctr, delta, _RET_IP_);
|
||||
spin_unlock(&mp->m_sb_lock);
|
||||
}
|
||||
|
||||
/* we had space! */
|
||||
return 0;
|
||||
|
||||
fdblocks_enospc:
|
||||
trace_xfs_freecounter_enospc(mp, ctr, delta, _RET_IP_);
|
||||
spin_unlock(&mp->m_sb_lock);
|
||||
return -ENOSPC;
|
||||
}
|
||||
|
||||
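/*
 * Editor's sketch, not part of the patch: the decrement path above in
 * miniature, with all names illustrative.  Space comes out of the main
 * percpu counter first; only when that would cross the "unavailable"
 * floor does a caller with rsvd == true dip into the per-counter
 * reserve pool.
 */
#include <stdbool.h>
#include <stdint.h>

struct mini_freecounter {
	int64_t		count;		/* models counter->count */
	uint64_t	res_avail;	/* models counter->res_avail */
};

static int mini_dec(struct mini_freecounter *fc, uint64_t unavailable,
		uint64_t delta, bool rsvd)
{
	fc->count -= (int64_t)delta;
	if (fc->count >= (int64_t)unavailable)
		return 0;			/* we had space */

	fc->count += (int64_t)delta;		/* back out the decrement */
	if (rsvd && fc->res_avail >= delta) {
		fc->res_avail -= delta;		/* dip into the reserve */
		return 0;
	}
	return -1;				/* models -ENOSPC */
}
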
@@ -97,12 +97,42 @@ struct xfs_groups {
*/
uint8_t blklog;

/*
* Zoned devices can have gaps beyond the usable capacity of a zone and
* the end in the LBA/daddr address space. In other words, the hardware
* equivalent to the RT groups already takes care of the power of 2
* alignment for us. In this case the sparse FSB/RTB address space maps
* 1:1 to the device address space.
*/
bool has_daddr_gaps;

/*
* Mask to extract the group-relative block number from a FSB.
* For a pre-rtgroups filesystem we pretend to have one very large
* rtgroup, so this mask must be 64-bit.
*/
uint64_t blkmask;

/*
* Start of the first group in the device. This is used to support a
* RT device following the data device on the same block device for
* SMR hard drives.
*/
xfs_fsblock_t start_fsb;
};

struct xfs_freecounter {
/* free blocks for general use: */
struct percpu_counter count;

/* total reserved blocks: */
uint64_t res_total;

/* available reserved blocks: */
uint64_t res_avail;

/* reserved blks @ remount,ro: */
uint64_t res_saved;
};

/*
@@ -198,6 +228,7 @@ typedef struct xfs_mount {
bool m_fail_unmount;
bool m_finobt_nores; /* no per-AG finobt resv. */
bool m_update_sb; /* sb needs update in mount */
unsigned int m_max_open_zones;

/*
* Bitsets of per-fs metadata that have been checked and/or are sick.
@@ -222,8 +253,8 @@ typedef struct xfs_mount {
spinlock_t ____cacheline_aligned m_sb_lock; /* sb counter lock */
struct percpu_counter m_icount; /* allocated inodes counter */
struct percpu_counter m_ifree; /* free inodes counter */
struct percpu_counter m_fdblocks; /* free block counter */
struct percpu_counter m_frextents; /* free rt extent counter */

struct xfs_freecounter m_free[XC_FREE_NR];

/*
* Count of data device blocks reserved for delayed allocations,
@@ -245,10 +276,8 @@ typedef struct xfs_mount {
atomic64_t m_allocbt_blks;

struct xfs_groups m_groups[XG_TYPE_MAX];
uint64_t m_resblks; /* total reserved blocks */
uint64_t m_resblks_avail;/* available reserved blocks */
uint64_t m_resblks_save; /* reserved blks @ remount,ro */
struct delayed_work m_reclaim_work; /* background inode reclaim */
struct xfs_zone_info *m_zone_info; /* zone allocator information */
struct dentry *m_debugfs; /* debugfs parent */
struct xfs_kobj m_kobj;
struct xfs_kobj m_error_kobj;
@@ -258,10 +287,16 @@ typedef struct xfs_mount {
#ifdef CONFIG_XFS_ONLINE_SCRUB_STATS
struct xchk_stats *m_scrub_stats;
#endif
struct xfs_kobj m_zoned_kobj;
xfs_agnumber_t m_agfrotor; /* last ag where space found */
atomic_t m_agirotor; /* last ag dir inode alloced */
atomic_t m_rtgrotor; /* last rtgroup rtpicked */

struct mutex m_metafile_resv_lock;
uint64_t m_metafile_resv_target;
uint64_t m_metafile_resv_used;
uint64_t m_metafile_resv_avail;

/* Memory shrinker to throttle and reprioritize inodegc */
struct shrinker *m_inodegc_shrinker;
/*
@@ -336,8 +371,10 @@ typedef struct xfs_mount {
#define XFS_FEAT_NREXT64 (1ULL << 26) /* large extent counters */
#define XFS_FEAT_EXCHANGE_RANGE (1ULL << 27) /* exchange range */
#define XFS_FEAT_METADIR (1ULL << 28) /* metadata directory tree */
#define XFS_FEAT_ZONED (1ULL << 29) /* zoned RT device */

/* Mount features */
#define XFS_FEAT_NOLIFETIME (1ULL << 47) /* disable lifetime hints */
#define XFS_FEAT_NOATTR2 (1ULL << 48) /* disable attr2 creation */
#define XFS_FEAT_NOALIGN (1ULL << 49) /* ignore alignment */
#define XFS_FEAT_ALLOCSIZE (1ULL << 50) /* user specified allocation size */
@@ -392,6 +429,8 @@ __XFS_HAS_FEAT(needsrepair, NEEDSREPAIR)
__XFS_HAS_FEAT(large_extent_counts, NREXT64)
__XFS_HAS_FEAT(exchange_range, EXCHANGE_RANGE)
__XFS_HAS_FEAT(metadir, METADIR)
__XFS_HAS_FEAT(zoned, ZONED)
__XFS_HAS_FEAT(nolifetime, NOLIFETIME)

static inline bool xfs_has_rtgroups(const struct xfs_mount *mp)
{
@@ -402,7 +441,9 @@ static inline bool xfs_has_rtgroups(const struct xfs_mount *mp)
static inline bool xfs_has_rtsb(const struct xfs_mount *mp)
{
/* all rtgroups filesystems with an rt section have an rtsb */
return xfs_has_rtgroups(mp) && xfs_has_realtime(mp);
return xfs_has_rtgroups(mp) &&
xfs_has_realtime(mp) &&
!xfs_has_zoned(mp);
}

static inline bool xfs_has_rtrmapbt(const struct xfs_mount *mp)
@@ -417,6 +458,11 @@ static inline bool xfs_has_rtreflink(const struct xfs_mount *mp)
xfs_has_reflink(mp);
}

static inline bool xfs_has_nonzoned(const struct xfs_mount *mp)
{
return !xfs_has_zoned(mp);
}

/*
* Some features are always on for v5 file systems, allow the compiler to
* eliminate dead code when building without v4 support.
@@ -520,6 +566,10 @@ __XFS_HAS_FEAT(nouuid, NOUUID)
#define XFS_OPSTATE_WARNED_METADIR 17
/* Filesystem should use qflags to determine quotaon status */
#define XFS_OPSTATE_RESUMING_QUOTAON 18
/* Kernel has logged a warning about zoned RT device being used on this fs. */
#define XFS_OPSTATE_WARNED_ZONED 19
/* (Zoned) GC is in progress */
#define XFS_OPSTATE_ZONEGC_RUNNING 20

#define __XFS_IS_OPSTATE(name, NAME) \
static inline bool xfs_is_ ## name (struct xfs_mount *mp) \
@@ -564,6 +614,7 @@ static inline bool xfs_clear_resuming_quotaon(struct xfs_mount *mp)
#endif /* CONFIG_XFS_QUOTA */
__XFS_IS_OPSTATE(done_with_log_incompat, UNSET_LOG_INCOMPAT)
__XFS_IS_OPSTATE(using_logged_xattrs, USE_LARP)
__XFS_IS_OPSTATE(zonegc_running, ZONEGC_RUNNING)

static inline bool
xfs_should_warn(struct xfs_mount *mp, long nr)
@@ -633,7 +684,8 @@ xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d)
}

extern void xfs_uuid_table_free(void);
extern uint64_t xfs_default_resblks(xfs_mount_t *mp);
uint64_t xfs_default_resblks(struct xfs_mount *mp,
enum xfs_free_counter ctr);
extern int xfs_mountfs(xfs_mount_t *mp);
extern void xfs_unmountfs(xfs_mount_t *);

@@ -646,45 +698,74 @@ extern void xfs_unmountfs(xfs_mount_t *);
*/
#define XFS_FDBLOCKS_BATCH 1024

uint64_t xfs_freecounter_unavailable(struct xfs_mount *mp,
enum xfs_free_counter ctr);

/*
* Estimate the amount of free space that is not available to userspace and is
* not explicitly reserved from the incore fdblocks. This includes:
*
* - The minimum number of blocks needed to support splitting a bmap btree
* - The blocks currently in use by the freespace btrees because they record
* the actual blocks that will fill per-AG metadata space reservations
* Sum up the freecount, but never return negative values.
*/
static inline uint64_t
xfs_fdblocks_unavailable(
struct xfs_mount *mp)
static inline s64 xfs_sum_freecounter(struct xfs_mount *mp,
enum xfs_free_counter ctr)
{
return mp->m_alloc_set_aside + atomic64_read(&mp->m_allocbt_blks);
return percpu_counter_sum_positive(&mp->m_free[ctr].count);
}

int xfs_dec_freecounter(struct xfs_mount *mp, struct percpu_counter *counter,
/*
* Same as above, but does return negative values. Mostly useful for
* special cases like repair and tracing.
*/
static inline s64 xfs_sum_freecounter_raw(struct xfs_mount *mp,
enum xfs_free_counter ctr)
{
return percpu_counter_sum(&mp->m_free[ctr].count);
}

/*
* This just provides an estimate without the cpu-local updates, use
* xfs_sum_freecounter for the exact value.
*/
static inline s64 xfs_estimate_freecounter(struct xfs_mount *mp,
enum xfs_free_counter ctr)
{
return percpu_counter_read_positive(&mp->m_free[ctr].count);
}

static inline int xfs_compare_freecounter(struct xfs_mount *mp,
enum xfs_free_counter ctr, s64 rhs, s32 batch)
{
return __percpu_counter_compare(&mp->m_free[ctr].count, rhs, batch);
}

static inline void xfs_set_freecounter(struct xfs_mount *mp,
enum xfs_free_counter ctr, uint64_t val)
{
percpu_counter_set(&mp->m_free[ctr].count, val);
}

int xfs_dec_freecounter(struct xfs_mount *mp, enum xfs_free_counter ctr,
uint64_t delta, bool rsvd);
void xfs_add_freecounter(struct xfs_mount *mp, struct percpu_counter *counter,
void xfs_add_freecounter(struct xfs_mount *mp, enum xfs_free_counter ctr,
uint64_t delta);

static inline int xfs_dec_fdblocks(struct xfs_mount *mp, uint64_t delta,
bool reserved)
{
return xfs_dec_freecounter(mp, &mp->m_fdblocks, delta, reserved);
return xfs_dec_freecounter(mp, XC_FREE_BLOCKS, delta, reserved);
}

static inline void xfs_add_fdblocks(struct xfs_mount *mp, uint64_t delta)
{
xfs_add_freecounter(mp, &mp->m_fdblocks, delta);
xfs_add_freecounter(mp, XC_FREE_BLOCKS, delta);
}

static inline int xfs_dec_frextents(struct xfs_mount *mp, uint64_t delta)
{
return xfs_dec_freecounter(mp, &mp->m_frextents, delta, false);
return xfs_dec_freecounter(mp, XC_FREE_RTEXTENTS, delta, false);
}

static inline void xfs_add_frextents(struct xfs_mount *mp, uint64_t delta)
{
xfs_add_freecounter(mp, &mp->m_frextents, delta);
xfs_add_freecounter(mp, XC_FREE_RTEXTENTS, delta);
}

extern int xfs_readsb(xfs_mount_t *, int);
@@ -706,5 +787,9 @@ int xfs_add_incompat_log_feature(struct xfs_mount *mp, uint32_t feature);
bool xfs_clear_incompat_log_features(struct xfs_mount *mp);
void xfs_mod_delalloc(struct xfs_inode *ip, int64_t data_delta,
int64_t ind_delta);
static inline void xfs_mod_sb_delalloc(struct xfs_mount *mp, int64_t delta)
{
percpu_counter_add(&mp->m_delalloc_blks, delta);
}

#endif /* __XFS_MOUNT_H__ */

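/*
 * Editor's note, not part of the patch: with the wrappers above, call
 * sites name a counter by enum rather than by percpu_counter pointer,
 * e.g. (illustrative only):
 *
 *	xfs_dec_fdblocks(mp, len, false);	// was &mp->m_fdblocks
 *	xfs_add_frextents(mp, rtxlen);		// was &mp->m_frextents
 *
 * which is what lets the zoned code add XC_FREE_RTAVAILABLE as a third
 * counter without growing any function signatures further.
 */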
@@ -1711,7 +1711,8 @@ xfs_qm_mount_quotas(
* immediately. We only support rtquota if rtgroups are enabled to
* avoid problems with older kernels.
*/
if (mp->m_sb.sb_rextents && !xfs_has_rtgroups(mp)) {
if (mp->m_sb.sb_rextents &&
(!xfs_has_rtgroups(mp) || xfs_has_zoned(mp))) {
xfs_notice(mp, "Cannot turn on quotas for realtime filesystem");
mp->m_qflags = 0;
goto write_changes;

@@ -235,7 +235,7 @@ xfs_reflink_trim_around_shared(
int error = 0;

/* Holes, unwritten, and delalloc extents cannot be shared */
if (!xfs_is_cow_inode(ip) || !xfs_bmap_is_written_extent(irec)) {
if (!xfs_is_reflink_inode(ip) || !xfs_bmap_is_written_extent(irec)) {
*shared = false;
return 0;
}
@@ -651,7 +651,7 @@ xfs_reflink_cancel_cow_blocks(

if (isnullstartblock(del.br_startblock)) {
xfs_bmap_del_extent_delay(ip, XFS_COW_FORK, &icur, &got,
&del);
&del, 0);
} else if (del.br_state == XFS_EXT_UNWRITTEN || cancel_real) {
ASSERT((*tpp)->t_highest_agno == NULLAGNUMBER);

@@ -1207,15 +1207,9 @@ xfs_reflink_ag_has_free_space(
if (!xfs_has_rmapbt(mp))
return 0;
if (XFS_IS_REALTIME_INODE(ip)) {
struct xfs_rtgroup *rtg;
xfs_rgnumber_t rgno;

rgno = xfs_rtb_to_rgno(mp, fsb);
rtg = xfs_rtgroup_get(mp, rgno);
if (xfs_metafile_resv_critical(rtg_rmap(rtg)))
error = -ENOSPC;
xfs_rtgroup_put(rtg);
return error;
if (xfs_metafile_resv_critical(mp))
return -ENOSPC;
return 0;
}

agno = XFS_FSB_TO_AGNO(mp, fsb);
@@ -1538,7 +1532,7 @@ xfs_reflink_zero_posteof(
return 0;

trace_xfs_zero_eof(ip, isize, pos - isize);
return xfs_zero_range(ip, isize, pos - isize, NULL);
return xfs_zero_range(ip, isize, pos - isize, NULL, NULL);
}

/*

@@ -33,6 +33,7 @@
#include "xfs_trace.h"
#include "xfs_rtrefcount_btree.h"
#include "xfs_reflink.h"
#include "xfs_zone_alloc.h"

/*
* Return whether there are any free extents in the size range given
@@ -663,7 +664,8 @@ xfs_rtunmount_rtg(

for (i = 0; i < XFS_RTGI_MAX; i++)
xfs_rtginode_irele(&rtg->rtg_inodes[i]);
kvfree(rtg->rtg_rsum_cache);
if (!xfs_has_zoned(rtg_mount(rtg)))
kvfree(rtg->rtg_rsum_cache);
}

static int
@@ -858,6 +860,84 @@ xfs_growfs_rt_init_rtsb(
return error;
}

static void
xfs_growfs_rt_sb_fields(
struct xfs_trans *tp,
const struct xfs_mount *nmp)
{
struct xfs_mount *mp = tp->t_mountp;

if (nmp->m_sb.sb_rextsize != mp->m_sb.sb_rextsize)
xfs_trans_mod_sb(tp, XFS_TRANS_SB_REXTSIZE,
nmp->m_sb.sb_rextsize - mp->m_sb.sb_rextsize);
if (nmp->m_sb.sb_rbmblocks != mp->m_sb.sb_rbmblocks)
xfs_trans_mod_sb(tp, XFS_TRANS_SB_RBMBLOCKS,
nmp->m_sb.sb_rbmblocks - mp->m_sb.sb_rbmblocks);
if (nmp->m_sb.sb_rblocks != mp->m_sb.sb_rblocks)
xfs_trans_mod_sb(tp, XFS_TRANS_SB_RBLOCKS,
nmp->m_sb.sb_rblocks - mp->m_sb.sb_rblocks);
if (nmp->m_sb.sb_rextents != mp->m_sb.sb_rextents)
xfs_trans_mod_sb(tp, XFS_TRANS_SB_REXTENTS,
nmp->m_sb.sb_rextents - mp->m_sb.sb_rextents);
if (nmp->m_sb.sb_rextslog != mp->m_sb.sb_rextslog)
xfs_trans_mod_sb(tp, XFS_TRANS_SB_REXTSLOG,
nmp->m_sb.sb_rextslog - mp->m_sb.sb_rextslog);
if (nmp->m_sb.sb_rgcount != mp->m_sb.sb_rgcount)
xfs_trans_mod_sb(tp, XFS_TRANS_SB_RGCOUNT,
nmp->m_sb.sb_rgcount - mp->m_sb.sb_rgcount);
}

static int
xfs_growfs_rt_zoned(
struct xfs_rtgroup *rtg,
xfs_rfsblock_t nrblocks)
{
struct xfs_mount *mp = rtg_mount(rtg);
struct xfs_mount *nmp;
struct xfs_trans *tp;
xfs_rtbxlen_t freed_rtx;
int error;

/*
* Calculate new sb and mount fields for this round. Also ensure the
* rtg_extents value is uptodate as the rtbitmap code relies on it.
*/
nmp = xfs_growfs_rt_alloc_fake_mount(mp, nrblocks,
mp->m_sb.sb_rextsize);
if (!nmp)
return -ENOMEM;
freed_rtx = nmp->m_sb.sb_rextents - mp->m_sb.sb_rextents;

xfs_rtgroup_calc_geometry(nmp, rtg, rtg_rgno(rtg),
nmp->m_sb.sb_rgcount, nmp->m_sb.sb_rextents);

error = xfs_trans_alloc(mp, &M_RES(nmp)->tr_growrtfree, 0, 0, 0, &tp);
if (error)
goto out_free;

xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
xfs_rtgroup_trans_join(tp, rtg, XFS_RTGLOCK_RMAP);

xfs_growfs_rt_sb_fields(tp, nmp);
xfs_trans_mod_sb(tp, XFS_TRANS_SB_FREXTENTS, freed_rtx);

error = xfs_trans_commit(tp);
if (error)
goto out_free;

/*
* Ensure the mount RT feature flag is now set, and compute new
* maxlevels for rt btrees.
*/
mp->m_features |= XFS_FEAT_REALTIME;
xfs_rtrmapbt_compute_maxlevels(mp);
xfs_rtrefcountbt_compute_maxlevels(mp);
xfs_zoned_add_available(mp, freed_rtx);
out_free:
kfree(nmp);
return error;
}

static int
xfs_growfs_rt_bmblock(
struct xfs_rtgroup *rtg,
@@ -943,24 +1023,7 @@ xfs_growfs_rt_bmblock(
/*
* Update superblock fields.
*/
if (nmp->m_sb.sb_rextsize != mp->m_sb.sb_rextsize)
xfs_trans_mod_sb(args.tp, XFS_TRANS_SB_REXTSIZE,
nmp->m_sb.sb_rextsize - mp->m_sb.sb_rextsize);
if (nmp->m_sb.sb_rbmblocks != mp->m_sb.sb_rbmblocks)
xfs_trans_mod_sb(args.tp, XFS_TRANS_SB_RBMBLOCKS,
nmp->m_sb.sb_rbmblocks - mp->m_sb.sb_rbmblocks);
if (nmp->m_sb.sb_rblocks != mp->m_sb.sb_rblocks)
xfs_trans_mod_sb(args.tp, XFS_TRANS_SB_RBLOCKS,
nmp->m_sb.sb_rblocks - mp->m_sb.sb_rblocks);
if (nmp->m_sb.sb_rextents != mp->m_sb.sb_rextents)
xfs_trans_mod_sb(args.tp, XFS_TRANS_SB_REXTENTS,
nmp->m_sb.sb_rextents - mp->m_sb.sb_rextents);
if (nmp->m_sb.sb_rextslog != mp->m_sb.sb_rextslog)
xfs_trans_mod_sb(args.tp, XFS_TRANS_SB_REXTSLOG,
nmp->m_sb.sb_rextslog - mp->m_sb.sb_rextslog);
if (nmp->m_sb.sb_rgcount != mp->m_sb.sb_rgcount)
xfs_trans_mod_sb(args.tp, XFS_TRANS_SB_RGCOUNT,
nmp->m_sb.sb_rgcount - mp->m_sb.sb_rgcount);
xfs_growfs_rt_sb_fields(args.tp, nmp);

/*
* Free the new extent.
@@ -1127,6 +1190,11 @@ xfs_growfs_rtg(
goto out_rele;
}

if (xfs_has_zoned(mp)) {
error = xfs_growfs_rt_zoned(rtg, nrblocks);
goto out_rele;
}

error = xfs_growfs_rt_alloc_blocks(rtg, nrblocks, rextsize, &bmblocks);
if (error)
goto out_rele;
@@ -1146,8 +1214,7 @@ xfs_growfs_rtg(

if (old_rsum_cache)
kvfree(old_rsum_cache);
xfs_rtgroup_rele(rtg);
return 0;
goto out_rele;

out_error:
/*
@@ -1195,6 +1262,22 @@ xfs_growfs_check_rtgeom(

if (min_logfsbs > mp->m_sb.sb_logblocks)
return -EINVAL;

if (xfs_has_zoned(mp)) {
uint32_t gblocks = mp->m_groups[XG_TYPE_RTG].blocks;
uint32_t rem;

if (rextsize != 1)
return -EINVAL;
div_u64_rem(mp->m_sb.sb_rblocks, gblocks, &rem);
if (rem) {
xfs_warn(mp,
"new RT volume size (%lld) not aligned to RT group size (%d)",
mp->m_sb.sb_rblocks, gblocks);
return -EINVAL;
}
}

return 0;
}

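/*
 * Editor's note, not part of the patch: xfs_growfs_rt_sb_fields()
 * factors out the repeated pattern of logging each superblock geometry
 * field as a delta against the fake mount (placeholder field X shown):
 *
 *	if (nmp->m_sb.sb_X != mp->m_sb.sb_X)
 *		xfs_trans_mod_sb(tp, XFS_TRANS_SB_X,
 *				nmp->m_sb.sb_X - mp->m_sb.sb_X);
 *
 * Logging deltas keeps the update transactional, and the shared helper
 * keeps the zoned and bitmap-based growfs paths in sync.
 */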
@@ -1248,6 +1331,35 @@ xfs_grow_last_rtg(
mp->m_sb.sb_rgextents;
}

/*
* Read in the last block of the RT device to make sure it is accessible.
*/
static int
xfs_rt_check_size(
struct xfs_mount *mp,
xfs_rfsblock_t last_block)
{
xfs_daddr_t daddr = XFS_FSB_TO_BB(mp, last_block);
struct xfs_buf *bp;
int error;

if (XFS_BB_TO_FSB(mp, daddr) != last_block) {
xfs_warn(mp, "RT device size overflow: %llu != %llu",
XFS_BB_TO_FSB(mp, daddr), last_block);
return -EFBIG;
}

error = xfs_buf_read_uncached(mp->m_rtdev_targp,
XFS_FSB_TO_BB(mp, mp->m_sb.sb_rtstart) + daddr,
XFS_FSB_TO_BB(mp, 1), 0, &bp, NULL);
if (error)
xfs_warn(mp, "cannot read last RT device sector (%lld)",
last_block);
else
xfs_buf_relse(bp);
return error;
}

/*
* Grow the realtime area of the filesystem.
*/
@@ -1259,7 +1371,6 @@ xfs_growfs_rt(
xfs_rgnumber_t old_rgcount = mp->m_sb.sb_rgcount;
xfs_rgnumber_t new_rgcount = 1;
xfs_rgnumber_t rgno;
struct xfs_buf *bp;
xfs_agblock_t old_rextsize = mp->m_sb.sb_rextsize;
int error;

@@ -1302,15 +1413,10 @@ xfs_growfs_rt(
error = xfs_sb_validate_fsb_count(&mp->m_sb, in->newblocks);
if (error)
goto out_unlock;
/*
* Read in the last block of the device, make sure it exists.
*/
error = xfs_buf_read_uncached(mp->m_rtdev_targp,
XFS_FSB_TO_BB(mp, in->newblocks - 1),
XFS_FSB_TO_BB(mp, 1), 0, &bp, NULL);

error = xfs_rt_check_size(mp, in->newblocks - 1);
if (error)
goto out_unlock;
xfs_buf_relse(bp);

/*
* Calculate new parameters. These are the final values to be reached.
@@ -1376,8 +1482,7 @@ xfs_growfs_rt(
error = error2;

/* Reset the rt metadata btree space reservations. */
xfs_rt_resv_free(mp);
error2 = xfs_rt_resv_init(mp);
error2 = xfs_metafile_resv_init(mp);
if (error2 && error2 != -ENOSPC)
error = error2;
}
@@ -1444,10 +1549,6 @@ int /* error */
xfs_rtmount_init(
struct xfs_mount *mp) /* file system mount structure */
{
struct xfs_buf *bp; /* buffer for last block of subvolume */
xfs_daddr_t d; /* address of last block of subvolume */
int error;

if (mp->m_sb.sb_rblocks == 0)
return 0;
if (mp->m_rtdev_targp == NULL) {
@@ -1458,25 +1559,7 @@ xfs_rtmount_init(

mp->m_rsumblocks = xfs_rtsummary_blockcount(mp, &mp->m_rsumlevels);

/*
* Check that the realtime section is an ok size.
*/
d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks);
if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_rblocks) {
xfs_warn(mp, "realtime mount -- %llu != %llu",
(unsigned long long) XFS_BB_TO_FSB(mp, d),
(unsigned long long) mp->m_sb.sb_rblocks);
return -EFBIG;
}
error = xfs_buf_read_uncached(mp->m_rtdev_targp,
d - XFS_FSB_TO_BB(mp, 1),
XFS_FSB_TO_BB(mp, 1), 0, &bp, NULL);
if (error) {
xfs_warn(mp, "realtime device size check failed");
return error;
}
xfs_buf_relse(bp);
return 0;
return xfs_rt_check_size(mp, mp->m_sb.sb_rblocks - 1);
}

static int
@@ -1519,50 +1602,10 @@ xfs_rtalloc_reinit_frextents(
spin_lock(&mp->m_sb_lock);
mp->m_sb.sb_frextents = val;
spin_unlock(&mp->m_sb_lock);
percpu_counter_set(&mp->m_frextents, mp->m_sb.sb_frextents);
xfs_set_freecounter(mp, XC_FREE_RTEXTENTS, mp->m_sb.sb_frextents);
return 0;
}

/* Free space reservations for rt metadata inodes. */
void
xfs_rt_resv_free(
struct xfs_mount *mp)
{
struct xfs_rtgroup *rtg = NULL;
unsigned int i;

while ((rtg = xfs_rtgroup_next(mp, rtg))) {
for (i = 0; i < XFS_RTGI_MAX; i++)
xfs_metafile_resv_free(rtg->rtg_inodes[i]);
}
}

/* Reserve space for rt metadata inodes' space expansion. */
int
xfs_rt_resv_init(
struct xfs_mount *mp)
{
struct xfs_rtgroup *rtg = NULL;
xfs_filblks_t ask;
int error = 0;

while ((rtg = xfs_rtgroup_next(mp, rtg))) {
int err2;

ask = xfs_rtrmapbt_calc_reserves(mp);
err2 = xfs_metafile_resv_init(rtg_rmap(rtg), ask);
if (err2 && !error)
error = err2;

ask = xfs_rtrefcountbt_calc_reserves(mp);
err2 = xfs_metafile_resv_init(rtg_refcount(rtg), ask);
if (err2 && !error)
error = err2;
}

return error;
}

/*
* Read in the bmbt of an rt metadata inode so that we never have to load them
* at runtime. This enables the use of shared ILOCKs for rtbitmap scans. Use
@@ -1613,6 +1656,8 @@ xfs_rtmount_rtg(
}
}

if (xfs_has_zoned(mp))
return 0;
return xfs_alloc_rsum_cache(rtg, mp->m_sb.sb_rbmblocks);
}

@@ -2097,6 +2142,8 @@ xfs_bmap_rtalloc(
ap->datatype & XFS_ALLOC_INITIAL_USER_DATA;
int error;

ASSERT(!xfs_has_zoned(ap->tp->t_mountp));

retry:
error = xfs_rtallocate_align(ap, &ralen, &raminlen, &prod, &noalign);
if (error)

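/*
 * Editor's sketch, not part of the patch: the overflow test in
 * xfs_rt_check_size() relies on the FSB-to-daddr conversion being
 * lossless for the last block.  Simplified, assuming 512-byte basic
 * blocks:
 */
#include <stdint.h>

static int rt_size_overflows(uint64_t last_block, unsigned int blocklog)
{
	/* FSB -> daddr: multiply by (block size / 512) */
	uint64_t daddr = last_block << (blocklog - 9);

	/* if converting back loses bits, the device is too large */
	return (daddr >> (blocklog - 9)) != last_block;
}
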
@@ -34,9 +34,6 @@ int /* error */
xfs_rtmount_inodes(
struct xfs_mount *mp); /* file system mount structure */

void xfs_rt_resv_free(struct xfs_mount *mp);
int xfs_rt_resv_init(struct xfs_mount *mp);

/*
* Grow the realtime area of the filesystem.
*/
@@ -65,8 +62,6 @@ xfs_rtmount_init(
}
# define xfs_rtmount_inodes(m) (((mp)->m_sb.sb_rblocks == 0)? 0 : (-ENOSYS))
# define xfs_rtunmount_inodes(m)
# define xfs_rt_resv_free(mp) ((void)0)
# define xfs_rt_resv_init(mp) (0)

static inline int
xfs_growfs_check_rtgeom(const struct xfs_mount *mp,

@@ -46,6 +46,7 @@
#include "xfs_exchmaps_item.h"
#include "xfs_parent.h"
#include "xfs_rtalloc.h"
#include "xfs_zone_alloc.h"
#include "scrub/stats.h"
#include "scrub/rcbag_btree.h"

@@ -109,7 +110,8 @@ enum {
Opt_filestreams, Opt_quota, Opt_noquota, Opt_usrquota, Opt_grpquota,
Opt_prjquota, Opt_uquota, Opt_gquota, Opt_pquota,
Opt_uqnoenforce, Opt_gqnoenforce, Opt_pqnoenforce, Opt_qnoenforce,
Opt_discard, Opt_nodiscard, Opt_dax, Opt_dax_enum,
Opt_discard, Opt_nodiscard, Opt_dax, Opt_dax_enum, Opt_max_open_zones,
Opt_lifetime, Opt_nolifetime,
};

static const struct fs_parameter_spec xfs_fs_parameters[] = {
@@ -154,6 +156,9 @@ static const struct fs_parameter_spec xfs_fs_parameters[] = {
fsparam_flag("nodiscard", Opt_nodiscard),
fsparam_flag("dax", Opt_dax),
fsparam_enum("dax", Opt_dax_enum, dax_param_enums),
fsparam_u32("max_open_zones", Opt_max_open_zones),
fsparam_flag("lifetime", Opt_lifetime),
fsparam_flag("nolifetime", Opt_nolifetime),
{}
};

@@ -182,6 +187,7 @@ xfs_fs_show_options(
{ XFS_FEAT_LARGE_IOSIZE, ",largeio" },
{ XFS_FEAT_DAX_ALWAYS, ",dax=always" },
{ XFS_FEAT_DAX_NEVER, ",dax=never" },
{ XFS_FEAT_NOLIFETIME, ",nolifetime" },
{ 0, NULL }
};
struct xfs_mount *mp = XFS_M(root->d_sb);
@@ -233,6 +239,9 @@ xfs_fs_show_options(
if (!(mp->m_qflags & XFS_ALL_QUOTA_ACCT))
seq_puts(m, ",noquota");

if (mp->m_max_open_zones)
seq_printf(m, ",max_open_zones=%u", mp->m_max_open_zones);

return 0;
}

@@ -533,7 +542,15 @@ xfs_setup_devices(
if (error)
return error;
}
if (mp->m_rtdev_targp) {

if (mp->m_sb.sb_rtstart) {
if (mp->m_rtdev_targp) {
xfs_warn(mp,
"can't use internal and external rtdev at the same time");
return -EINVAL;
}
mp->m_rtdev_targp = mp->m_ddev_targp;
} else if (mp->m_rtname) {
error = xfs_setsize_buftarg(mp->m_rtdev_targp,
mp->m_sb.sb_sectsize);
if (error)
@@ -757,7 +774,7 @@ xfs_mount_free(
{
if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp)
xfs_free_buftarg(mp->m_logdev_targp);
if (mp->m_rtdev_targp)
if (mp->m_rtdev_targp && mp->m_rtdev_targp != mp->m_ddev_targp)
xfs_free_buftarg(mp->m_rtdev_targp);
if (mp->m_ddev_targp)
xfs_free_buftarg(mp->m_ddev_targp);
@@ -814,6 +831,7 @@ xfs_fs_sync_fs(
if (sb->s_writers.frozen == SB_FREEZE_PAGEFAULT) {
xfs_inodegc_stop(mp);
xfs_blockgc_stop(mp);
xfs_zone_gc_stop(mp);
}

return 0;
@@ -834,10 +852,12 @@ xfs_statfs_data(
struct kstatfs *st)
{
int64_t fdblocks =
percpu_counter_sum(&mp->m_fdblocks);
xfs_sum_freecounter(mp, XC_FREE_BLOCKS);

/* make sure st->f_bfree does not underflow */
st->f_bfree = max(0LL, fdblocks - xfs_fdblocks_unavailable(mp));
st->f_bfree = max(0LL,
fdblocks - xfs_freecounter_unavailable(mp, XC_FREE_BLOCKS));

/*
* sb_dblocks can change during growfs, but nothing cares about reporting
* the old or new value during growfs.
@@ -856,8 +876,9 @@ xfs_statfs_rt(
struct kstatfs *st)
{
st->f_bfree = xfs_rtbxlen_to_blen(mp,
percpu_counter_sum_positive(&mp->m_frextents));
st->f_blocks = mp->m_sb.sb_rblocks;
xfs_sum_freecounter(mp, XC_FREE_RTEXTENTS));
st->f_blocks = mp->m_sb.sb_rblocks - xfs_rtbxlen_to_blen(mp,
mp->m_free[XC_FREE_RTEXTENTS].res_total);
}

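/*
 * Editor's note, not part of the patch: the statfs math above clamps
 * and adjusts rather than reporting raw counters, roughly:
 *
 *	f_bfree  = max(0, sum(XC_FREE_BLOCKS) - unavailable);
 *	f_blocks = sb_rblocks - to_blocks(res_total);	// RT device
 *
 * The clamp covers transient negative percpu sums, and subtracting
 * res_total keeps the zoned reserve pool from being reported as space
 * userspace could ever fill.
 */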
static void
@@ -922,24 +943,32 @@ xfs_fs_statfs(
}

STATIC void
xfs_save_resvblks(struct xfs_mount *mp)
xfs_save_resvblks(
struct xfs_mount *mp)
{
mp->m_resblks_save = mp->m_resblks;
xfs_reserve_blocks(mp, 0);
enum xfs_free_counter i;

for (i = 0; i < XC_FREE_NR; i++) {
mp->m_free[i].res_saved = mp->m_free[i].res_total;
xfs_reserve_blocks(mp, i, 0);
}
}

STATIC void
xfs_restore_resvblks(struct xfs_mount *mp)
xfs_restore_resvblks(
struct xfs_mount *mp)
{
uint64_t resblks;
uint64_t resblks;
enum xfs_free_counter i;

if (mp->m_resblks_save) {
resblks = mp->m_resblks_save;
mp->m_resblks_save = 0;
} else
resblks = xfs_default_resblks(mp);

xfs_reserve_blocks(mp, resblks);
for (i = 0; i < XC_FREE_NR; i++) {
if (mp->m_free[i].res_saved) {
resblks = mp->m_free[i].res_saved;
mp->m_free[i].res_saved = 0;
} else
resblks = xfs_default_resblks(mp, i);
xfs_reserve_blocks(mp, i, resblks);
}
}

/*
@@ -976,6 +1005,7 @@ xfs_fs_freeze(
if (ret && !xfs_is_readonly(mp)) {
xfs_blockgc_start(mp);
xfs_inodegc_start(mp);
xfs_zone_gc_start(mp);
}

return ret;
@@ -997,6 +1027,7 @@ xfs_fs_unfreeze(
* filesystem.
*/
if (!xfs_is_readonly(mp)) {
xfs_zone_gc_start(mp);
xfs_blockgc_start(mp);
xfs_inodegc_start(mp);
}
@@ -1058,6 +1089,19 @@ xfs_finish_flags(
return -EINVAL;
}

if (!xfs_has_zoned(mp)) {
if (mp->m_max_open_zones) {
xfs_warn(mp,
"max_open_zones mount option only supported on zoned file systems.");
return -EINVAL;
}
if (mp->m_features & XFS_FEAT_NOLIFETIME) {
xfs_warn(mp,
"nolifetime mount option only supported on zoned file systems.");
return -EINVAL;
}
}

return 0;
}

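/*
 * Editor's note, not part of the patch: given the fs_parameter_spec
 * entries earlier in this file and the xfs_finish_flags() checks above,
 * a mount such as
 *
 *	mount -o max_open_zones=32 /dev/nvme0n1 /mnt
 *
 * stores 32 in mp->m_max_open_zones (see the Opt_max_open_zones case
 * further down) and is rejected with -EINVAL unless the filesystem is
 * actually zoned.  The device path here is illustrative only.
 */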
@@ -1065,7 +1109,8 @@ static int
xfs_init_percpu_counters(
struct xfs_mount *mp)
{
int error;
int error;
int i;

error = percpu_counter_init(&mp->m_icount, 0, GFP_KERNEL);
if (error)
@@ -1075,30 +1120,29 @@ xfs_init_percpu_counters(
if (error)
goto free_icount;

error = percpu_counter_init(&mp->m_fdblocks, 0, GFP_KERNEL);
if (error)
goto free_ifree;

error = percpu_counter_init(&mp->m_delalloc_blks, 0, GFP_KERNEL);
if (error)
goto free_fdblocks;
goto free_ifree;

error = percpu_counter_init(&mp->m_delalloc_rtextents, 0, GFP_KERNEL);
if (error)
goto free_delalloc;

error = percpu_counter_init(&mp->m_frextents, 0, GFP_KERNEL);
if (error)
goto free_delalloc_rt;
for (i = 0; i < XC_FREE_NR; i++) {
error = percpu_counter_init(&mp->m_free[i].count, 0,
GFP_KERNEL);
if (error)
goto free_freecounters;
}

return 0;

free_delalloc_rt:
free_freecounters:
while (--i > 0)
percpu_counter_destroy(&mp->m_free[i].count);
percpu_counter_destroy(&mp->m_delalloc_rtextents);
free_delalloc:
percpu_counter_destroy(&mp->m_delalloc_blks);
free_fdblocks:
percpu_counter_destroy(&mp->m_fdblocks);
free_ifree:
percpu_counter_destroy(&mp->m_ifree);
free_icount:
@@ -1112,24 +1156,28 @@ xfs_reinit_percpu_counters(
{
percpu_counter_set(&mp->m_icount, mp->m_sb.sb_icount);
percpu_counter_set(&mp->m_ifree, mp->m_sb.sb_ifree);
percpu_counter_set(&mp->m_fdblocks, mp->m_sb.sb_fdblocks);
percpu_counter_set(&mp->m_frextents, mp->m_sb.sb_frextents);
xfs_set_freecounter(mp, XC_FREE_BLOCKS, mp->m_sb.sb_fdblocks);
if (!xfs_has_zoned(mp))
xfs_set_freecounter(mp, XC_FREE_RTEXTENTS,
mp->m_sb.sb_frextents);
}

static void
xfs_destroy_percpu_counters(
struct xfs_mount *mp)
{
enum xfs_free_counter i;

for (i = 0; i < XC_FREE_NR; i++)
percpu_counter_destroy(&mp->m_free[i].count);
percpu_counter_destroy(&mp->m_icount);
percpu_counter_destroy(&mp->m_ifree);
percpu_counter_destroy(&mp->m_fdblocks);
ASSERT(xfs_is_shutdown(mp) ||
percpu_counter_sum(&mp->m_delalloc_rtextents) == 0);
percpu_counter_destroy(&mp->m_delalloc_rtextents);
ASSERT(xfs_is_shutdown(mp) ||
percpu_counter_sum(&mp->m_delalloc_blks) == 0);
percpu_counter_destroy(&mp->m_delalloc_blks);
percpu_counter_destroy(&mp->m_frextents);
}

static int
@@ -1210,6 +1258,18 @@ xfs_fs_shutdown(
xfs_force_shutdown(XFS_M(sb), SHUTDOWN_DEVICE_REMOVED);
}

static int
xfs_fs_show_stats(
struct seq_file *m,
struct dentry *root)
{
struct xfs_mount *mp = XFS_M(root->d_sb);

if (xfs_has_zoned(mp) && IS_ENABLED(CONFIG_XFS_RT))
xfs_zoned_show_stats(m, mp);
return 0;
}

static const struct super_operations xfs_super_operations = {
.alloc_inode = xfs_fs_alloc_inode,
.destroy_inode = xfs_fs_destroy_inode,
@@ -1224,6 +1284,7 @@ static const struct super_operations xfs_super_operations = {
.nr_cached_objects = xfs_fs_nr_cached_objects,
.free_cached_objects = xfs_fs_free_cached_objects,
.shutdown = xfs_fs_shutdown,
.show_stats = xfs_fs_show_stats,
};

static int
@@ -1436,6 +1497,15 @@ xfs_fs_parse_param(
xfs_fs_warn_deprecated(fc, param, XFS_FEAT_NOATTR2, true);
parsing_mp->m_features |= XFS_FEAT_NOATTR2;
return 0;
case Opt_max_open_zones:
parsing_mp->m_max_open_zones = result.uint_32;
return 0;
case Opt_lifetime:
parsing_mp->m_features &= ~XFS_FEAT_NOLIFETIME;
return 0;
case Opt_nolifetime:
parsing_mp->m_features |= XFS_FEAT_NOLIFETIME;
return 0;
default:
xfs_warn(parsing_mp, "unknown mount option [%s].", param->key);
return -EINVAL;
@@ -1780,8 +1850,17 @@ xfs_fs_fill_super(
mp->m_features &= ~XFS_FEAT_DISCARD;
}

if (xfs_has_metadir(mp))
if (xfs_has_zoned(mp)) {
if (!xfs_has_metadir(mp)) {
xfs_alert(mp,
"metadir feature required for zoned realtime devices.");
error = -EINVAL;
goto out_filestream_unmount;
}
xfs_warn_experimental(mp, XFS_EXPERIMENTAL_ZONED);
} else if (xfs_has_metadir(mp)) {
xfs_warn_experimental(mp, XFS_EXPERIMENTAL_METADIR);
}

if (xfs_has_reflink(mp)) {
if (xfs_has_realtime(mp) &&
@@ -1793,6 +1872,13 @@ xfs_fs_fill_super(
goto out_filestream_unmount;
}

if (xfs_has_zoned(mp)) {
xfs_alert(mp,
"reflink not compatible with zoned RT device!");
error = -EINVAL;
goto out_filestream_unmount;
}

if (xfs_globals.always_cow) {
xfs_info(mp, "using DEBUG-only always_cow mode.");
mp->m_always_cow = true;
@@ -1917,6 +2003,9 @@ xfs_remount_rw(
/* Re-enable the background inode inactivation worker. */
xfs_inodegc_start(mp);

/* Restart zone reclaim */
xfs_zone_gc_start(mp);

return 0;
}

@@ -1961,6 +2050,9 @@ xfs_remount_ro(
*/
xfs_inodegc_stop(mp);

/* Stop zone reclaim */
xfs_zone_gc_stop(mp);

/* Free the per-AG metadata reservation pool. */
xfs_fs_unreserve_ag_blocks(mp);

@@ -2082,6 +2174,7 @@ xfs_init_fs_context(
for (i = 0; i < XG_TYPE_MAX; i++)
xa_init(&mp->m_groups[i].xa);
mutex_init(&mp->m_growlock);
mutex_init(&mp->m_metafile_resv_lock);
INIT_WORK(&mp->m_flush_inodes_work, xfs_flush_inodes_worker);
INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
mp->m_kobj.kobject.kset = xfs_kset;

@@ -13,6 +13,7 @@
#include "xfs_log.h"
#include "xfs_log_priv.h"
#include "xfs_mount.h"
#include "xfs_zones.h"

struct xfs_sysfs_attr {
struct attribute attr;
@@ -69,7 +70,7 @@ static struct attribute *xfs_mp_attrs[] = {
};
ATTRIBUTE_GROUPS(xfs_mp);

const struct kobj_type xfs_mp_ktype = {
static const struct kobj_type xfs_mp_ktype = {
.release = xfs_sysfs_release,
.sysfs_ops = &xfs_sysfs_ops,
.default_groups = xfs_mp_groups,
@@ -701,45 +702,103 @@ xfs_error_sysfs_init_class(
return error;
}

static inline struct xfs_mount *zoned_to_mp(struct kobject *kobj)
{
return container_of(to_kobj(kobj), struct xfs_mount, m_zoned_kobj);
}

static ssize_t
max_open_zones_show(
struct kobject *kobj,
char *buf)
{
/* only report the open zones available for user data */
return sysfs_emit(buf, "%u\n",
zoned_to_mp(kobj)->m_max_open_zones - XFS_OPEN_GC_ZONES);
}
XFS_SYSFS_ATTR_RO(max_open_zones);

static struct attribute *xfs_zoned_attrs[] = {
ATTR_LIST(max_open_zones),
NULL,
};
ATTRIBUTE_GROUPS(xfs_zoned);

static const struct kobj_type xfs_zoned_ktype = {
.release = xfs_sysfs_release,
.sysfs_ops = &xfs_sysfs_ops,
.default_groups = xfs_zoned_groups,
};

int
xfs_error_sysfs_init(
xfs_mount_sysfs_init(
struct xfs_mount *mp)
{
int error;

super_set_sysfs_name_id(mp->m_super);

/* .../xfs/<dev>/ */
error = xfs_sysfs_init(&mp->m_kobj, &xfs_mp_ktype,
NULL, mp->m_super->s_id);
if (error)
return error;

/* .../xfs/<dev>/stats/ */
error = xfs_sysfs_init(&mp->m_stats.xs_kobj, &xfs_stats_ktype,
&mp->m_kobj, "stats");
if (error)
goto out_remove_fsdir;

/* .../xfs/<dev>/error/ */
error = xfs_sysfs_init(&mp->m_error_kobj, &xfs_error_ktype,
&mp->m_kobj, "error");
if (error)
return error;
goto out_remove_stats_dir;

/* .../xfs/<dev>/error/fail_at_unmount */
error = sysfs_create_file(&mp->m_error_kobj.kobject,
ATTR_LIST(fail_at_unmount));

if (error)
goto out_error;
goto out_remove_error_dir;

/* .../xfs/<dev>/error/metadata/ */
error = xfs_error_sysfs_init_class(mp, XFS_ERR_METADATA,
"metadata", &mp->m_error_meta_kobj,
xfs_error_meta_init);
if (error)
goto out_error;
goto out_remove_error_dir;

if (IS_ENABLED(CONFIG_XFS_RT) && xfs_has_zoned(mp)) {
/* .../xfs/<dev>/zoned/ */
error = xfs_sysfs_init(&mp->m_zoned_kobj, &xfs_zoned_ktype,
&mp->m_kobj, "zoned");
if (error)
goto out_remove_error_dir;
}

return 0;

out_error:
out_remove_error_dir:
xfs_sysfs_del(&mp->m_error_kobj);
out_remove_stats_dir:
xfs_sysfs_del(&mp->m_stats.xs_kobj);
out_remove_fsdir:
xfs_sysfs_del(&mp->m_kobj);
return error;
}

void
xfs_error_sysfs_del(
xfs_mount_sysfs_del(
struct xfs_mount *mp)
{
struct xfs_error_cfg *cfg;
int i, j;

if (IS_ENABLED(CONFIG_XFS_RT) && xfs_has_zoned(mp))
xfs_sysfs_del(&mp->m_zoned_kobj);

for (i = 0; i < XFS_ERR_CLASS_MAX; i++) {
for (j = 0; j < XFS_ERR_ERRNO_MAX; j++) {
cfg = &mp->m_error_cfg[i][j];
@@ -749,6 +808,8 @@ xfs_error_sysfs_del(
}
xfs_sysfs_del(&mp->m_error_meta_kobj);
xfs_sysfs_del(&mp->m_error_kobj);
xfs_sysfs_del(&mp->m_stats.xs_kobj);
xfs_sysfs_del(&mp->m_kobj);
}

struct xfs_error_cfg *

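/*
 * Editor's sketch, not part of the patch: xfs_mount_sysfs_init() above
 * uses the standard goto-unwind pattern, where each label tears down
 * exactly the objects created before the failure point, in reverse
 * creation order (schematic, names illustrative):
 *
 *	error = create(stats);
 *	if (error)
 *		goto out_remove_fsdir;		// only the fs dir exists
 *	error = create(error_dir);
 *	if (error)
 *		goto out_remove_stats_dir;	// fs dir + stats exist
 *	...
 *	out_remove_stats_dir: destroy(stats);
 *	out_remove_fsdir:     destroy(fsdir);
 *	return error;
 */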
@@ -7,7 +7,6 @@
#ifndef __XFS_SYSFS_H__
#define __XFS_SYSFS_H__

extern const struct kobj_type xfs_mp_ktype; /* xfs_mount */
extern const struct kobj_type xfs_dbg_ktype; /* debug */
extern const struct kobj_type xfs_log_ktype; /* xlog */
extern const struct kobj_type xfs_stats_ktype; /* stats */
@@ -53,7 +52,7 @@ xfs_sysfs_del(
wait_for_completion(&kobj->complete);
}

int xfs_error_sysfs_init(struct xfs_mount *mp);
void xfs_error_sysfs_del(struct xfs_mount *mp);
int xfs_mount_sysfs_init(struct xfs_mount *mp);
void xfs_mount_sysfs_del(struct xfs_mount *mp);

#endif /* __XFS_SYSFS_H__ */

@@ -49,6 +49,8 @@
|
||||
#include "xfs_metafile.h"
|
||||
#include "xfs_metadir.h"
|
||||
#include "xfs_rtgroup.h"
|
||||
#include "xfs_zone_alloc.h"
|
||||
#include "xfs_zone_priv.h"
|
||||
|
||||
/*
|
||||
* We include this last to have the helpers above available for the trace
|
||||
|
||||
@@ -102,6 +102,7 @@ struct xfs_rmap_intent;
|
||||
struct xfs_refcount_intent;
|
||||
struct xfs_metadir_update;
|
||||
struct xfs_rtgroup;
|
||||
struct xfs_open_zone;
|
||||
|
||||
#define XFS_ATTR_FILTER_FLAGS \
|
||||
{ XFS_ATTR_ROOT, "ROOT" }, \
|
||||
@@ -265,6 +266,152 @@ DEFINE_GROUP_REF_EVENT(xfs_group_grab);
|
||||
DEFINE_GROUP_REF_EVENT(xfs_group_grab_next_tag);
|
||||
DEFINE_GROUP_REF_EVENT(xfs_group_rele);
|
||||
|
||||
#ifdef CONFIG_XFS_RT
|
||||
DECLARE_EVENT_CLASS(xfs_zone_class,
|
||||
TP_PROTO(struct xfs_rtgroup *rtg),
|
||||
TP_ARGS(rtg),
|
||||
TP_STRUCT__entry(
|
||||
__field(dev_t, dev)
|
||||
__field(xfs_rgnumber_t, rgno)
|
||||
__field(xfs_rgblock_t, used)
|
||||
__field(unsigned int, nr_open)
|
||||
),
|
||||
TP_fast_assign(
|
||||
struct xfs_mount *mp = rtg_mount(rtg);
|
||||
|
||||
__entry->dev = mp->m_super->s_dev;
|
||||
__entry->rgno = rtg_rgno(rtg);
|
||||
__entry->used = rtg_rmap(rtg)->i_used_blocks;
|
||||
__entry->nr_open = mp->m_zone_info->zi_nr_open_zones;
|
||||
),
|
||||
TP_printk("dev %d:%d rgno 0x%x used 0x%x nr_open %u",
|
||||
MAJOR(__entry->dev), MINOR(__entry->dev),
|
||||
__entry->rgno,
|
||||
__entry->used,
|
||||
__entry->nr_open)
|
||||
);
|
||||
|
||||
#define DEFINE_ZONE_EVENT(name) \
|
||||
DEFINE_EVENT(xfs_zone_class, name, \
|
||||
TP_PROTO(struct xfs_rtgroup *rtg), \
|
||||
TP_ARGS(rtg))
|
||||
DEFINE_ZONE_EVENT(xfs_zone_emptied);
|
||||
DEFINE_ZONE_EVENT(xfs_zone_full);
|
||||
DEFINE_ZONE_EVENT(xfs_zone_opened);
|
||||
DEFINE_ZONE_EVENT(xfs_zone_reset);
|
||||
DEFINE_ZONE_EVENT(xfs_zone_gc_target_opened);
|
||||
|
||||
TRACE_EVENT(xfs_zone_free_blocks,
|
||||
TP_PROTO(struct xfs_rtgroup *rtg, xfs_rgblock_t rgbno,
|
||||
xfs_extlen_t len),
|
||||
TP_ARGS(rtg, rgbno, len),
|
||||
TP_STRUCT__entry(
|
||||
__field(dev_t, dev)
|
||||
__field(xfs_rgnumber_t, rgno)
|
||||
__field(xfs_rgblock_t, used)
|
||||
__field(xfs_rgblock_t, rgbno)
|
||||
__field(xfs_extlen_t, len)
|
||||
),
|
||||
TP_fast_assign(
|
||||
__entry->dev = rtg_mount(rtg)->m_super->s_dev;
|
||||
__entry->rgno = rtg_rgno(rtg);
|
||||
__entry->used = rtg_rmap(rtg)->i_used_blocks;
|
||||
__entry->rgbno = rgbno;
|
||||
__entry->len = len;
|
||||
),
|
||||
TP_printk("dev %d:%d rgno 0x%x used 0x%x rgbno 0x%x len 0x%x",
|
||||
MAJOR(__entry->dev), MINOR(__entry->dev),
|
||||
__entry->rgno,
|
||||
__entry->used,
|
||||
__entry->rgbno,
|
||||
__entry->len)
|
||||
);
|
||||
|
||||
DECLARE_EVENT_CLASS(xfs_zone_alloc_class,
|
||||
TP_PROTO(struct xfs_open_zone *oz, xfs_rgblock_t rgbno,
|
||||
xfs_extlen_t len),
|
||||
TP_ARGS(oz, rgbno, len),
|
||||
TP_STRUCT__entry(
|
||||
__field(dev_t, dev)
|
||||
__field(xfs_rgnumber_t, rgno)
|
||||
__field(xfs_rgblock_t, used)
|
||||
__field(xfs_rgblock_t, written)
|
||||
__field(xfs_rgblock_t, write_pointer)
|
||||
__field(xfs_rgblock_t, rgbno)
|
||||
__field(xfs_extlen_t, len)
|
||||
),
|
||||
TP_fast_assign(
|
||||
__entry->dev = rtg_mount(oz->oz_rtg)->m_super->s_dev;
|
||||
__entry->rgno = rtg_rgno(oz->oz_rtg);
|
||||
__entry->used = rtg_rmap(oz->oz_rtg)->i_used_blocks;
|
||||
__entry->written = oz->oz_written;
|
||||
__entry->write_pointer = oz->oz_write_pointer;
|
||||
__entry->rgbno = rgbno;
|
||||
__entry->len = len;
|
||||
),
|
||||
TP_printk("dev %d:%d rgno 0x%x used 0x%x written 0x%x wp 0x%x rgbno 0x%x len 0x%x",
|
||||
MAJOR(__entry->dev), MINOR(__entry->dev),
|
||||
__entry->rgno,
|
||||
__entry->used,
|
||||
__entry->written,
|
||||
__entry->write_pointer,
|
||||
__entry->rgbno,
|
||||
__entry->len)
|
||||
);
|
||||
|
||||
#define DEFINE_ZONE_ALLOC_EVENT(name) \
|
||||
DEFINE_EVENT(xfs_zone_alloc_class, name, \
|
||||
TP_PROTO(struct xfs_open_zone *oz, xfs_rgblock_t rgbno, \
|
||||
xfs_extlen_t len), \
|
||||
TP_ARGS(oz, rgbno, len))
|
||||
DEFINE_ZONE_ALLOC_EVENT(xfs_zone_record_blocks);
|
||||
DEFINE_ZONE_ALLOC_EVENT(xfs_zone_alloc_blocks);
|
||||
|
||||
TRACE_EVENT(xfs_zone_gc_select_victim,
|
||||
TP_PROTO(struct xfs_rtgroup *rtg, unsigned int bucket),
|
||||
TP_ARGS(rtg, bucket),
|
||||
TP_STRUCT__entry(
|
||||
__field(dev_t, dev)
|
||||
__field(xfs_rgnumber_t, rgno)
|
||||
__field(xfs_rgblock_t, used)
|
||||
__field(unsigned int, bucket)
|
||||
),
|
||||
TP_fast_assign(
|
||||
__entry->dev = rtg_mount(rtg)->m_super->s_dev;
|
||||
__entry->rgno = rtg_rgno(rtg);
|
||||
__entry->used = rtg_rmap(rtg)->i_used_blocks;
|
||||
__entry->bucket = bucket;
|
||||
),
|
||||
TP_printk("dev %d:%d rgno 0x%x used 0x%x bucket %u",
|
||||
MAJOR(__entry->dev), MINOR(__entry->dev),
|
||||
__entry->rgno,
|
||||
__entry->used,
|
||||
__entry->bucket)
|
||||
);
|
||||
|
||||
TRACE_EVENT(xfs_zones_mount,
|
||||
TP_PROTO(struct xfs_mount *mp),
|
||||
TP_ARGS(mp),
|
||||
TP_STRUCT__entry(
|
||||
__field(dev_t, dev)
|
||||
__field(xfs_rgnumber_t, rgcount)
|
||||
__field(uint32_t, blocks)
|
||||
__field(unsigned int, max_open_zones)
|
||||
),
|
||||
TP_fast_assign(
|
||||
__entry->dev = mp->m_super->s_dev;
|
||||
__entry->rgcount = mp->m_sb.sb_rgcount;
|
||||
__entry->blocks = mp->m_groups[XG_TYPE_RTG].blocks;
|
||||
__entry->max_open_zones = mp->m_max_open_zones;
|
||||
),
|
||||
TP_printk("dev %d:%d zoned %u blocks_per_zone %u, max_open %u",
|
||||
MAJOR(__entry->dev), MINOR(__entry->dev),
|
||||
__entry->rgcount,
|
||||
__entry->blocks,
|
||||
__entry->max_open_zones)
|
||||
);
|
||||
#endif /* CONFIG_XFS_RT */
|
||||
|
||||
TRACE_EVENT(xfs_inodegc_worker,
|
||||
TP_PROTO(struct xfs_mount *mp, unsigned int shrinker_hits),
|
||||
TP_ARGS(mp, shrinker_hits),
|
||||
@@ -1596,6 +1743,7 @@ DEFINE_SIMPLE_IO_EVENT(xfs_end_io_direct_write);
DEFINE_SIMPLE_IO_EVENT(xfs_end_io_direct_write_unwritten);
DEFINE_SIMPLE_IO_EVENT(xfs_end_io_direct_write_append);
DEFINE_SIMPLE_IO_EVENT(xfs_file_splice_read);
DEFINE_SIMPLE_IO_EVENT(xfs_zoned_map_blocks);

DECLARE_EVENT_CLASS(xfs_itrunc_class,
	TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size),
@@ -3983,6 +4131,7 @@ DEFINE_SIMPLE_IO_EVENT(xfs_reflink_cancel_cow_range);
DEFINE_SIMPLE_IO_EVENT(xfs_reflink_end_cow);
DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap_from);
DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap_to);
DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap_skip);

DEFINE_INODE_ERROR_EVENT(xfs_reflink_cancel_cow_range_error);
DEFINE_INODE_ERROR_EVENT(xfs_reflink_end_cow_error);
@@ -5606,11 +5755,10 @@ DEFINE_METADIR_EVENT(xfs_metadir_lookup);
/* metadata inode space reservations */

DECLARE_EVENT_CLASS(xfs_metafile_resv_class,
	TP_PROTO(struct xfs_inode *ip, xfs_filblks_t len),
	TP_ARGS(ip, len),
	TP_PROTO(struct xfs_mount *mp, xfs_filblks_t len),
	TP_ARGS(mp, len),
	TP_STRUCT__entry(
		__field(dev_t, dev)
		__field(xfs_ino_t, ino)
		__field(unsigned long long, freeblks)
		__field(unsigned long long, reserved)
		__field(unsigned long long, asked)
@@ -5618,19 +5766,15 @@ DECLARE_EVENT_CLASS(xfs_metafile_resv_class,
		__field(unsigned long long, len)
	),
	TP_fast_assign(
		struct xfs_mount *mp = ip->i_mount;

		__entry->dev = mp->m_super->s_dev;
		__entry->ino = ip->i_ino;
		__entry->freeblks = percpu_counter_sum(&mp->m_fdblocks);
		__entry->reserved = ip->i_delayed_blks;
		__entry->asked = ip->i_meta_resv_asked;
		__entry->used = ip->i_nblocks;
		__entry->freeblks = xfs_sum_freecounter_raw(mp, XC_FREE_BLOCKS);
		__entry->reserved = mp->m_metafile_resv_avail;
		__entry->asked = mp->m_metafile_resv_target;
		__entry->used = mp->m_metafile_resv_used;
		__entry->len = len;
	),
	TP_printk("dev %d:%d ino 0x%llx freeblks %llu resv %llu ask %llu used %llu len %llu",
	TP_printk("dev %d:%d freeblks %llu resv %llu ask %llu used %llu len %llu",
		  MAJOR(__entry->dev), MINOR(__entry->dev),
		  __entry->ino,
		  __entry->freeblks,
		  __entry->reserved,
		  __entry->asked,
@@ -5639,14 +5783,14 @@ DECLARE_EVENT_CLASS(xfs_metafile_resv_class,
)
#define DEFINE_METAFILE_RESV_EVENT(name) \
DEFINE_EVENT(xfs_metafile_resv_class, name, \
	TP_PROTO(struct xfs_inode *ip, xfs_filblks_t len), \
	TP_ARGS(ip, len))
	TP_PROTO(struct xfs_mount *mp, xfs_filblks_t len), \
	TP_ARGS(mp, len))
DEFINE_METAFILE_RESV_EVENT(xfs_metafile_resv_init);
DEFINE_METAFILE_RESV_EVENT(xfs_metafile_resv_free);
DEFINE_METAFILE_RESV_EVENT(xfs_metafile_resv_alloc_space);
DEFINE_METAFILE_RESV_EVENT(xfs_metafile_resv_free_space);
DEFINE_METAFILE_RESV_EVENT(xfs_metafile_resv_critical);
DEFINE_INODE_ERROR_EVENT(xfs_metafile_resv_init_error);
DEFINE_METAFILE_RESV_EVENT(xfs_metafile_resv_init_error);

#ifdef CONFIG_XFS_RT
TRACE_EVENT(xfs_growfs_check_rtgeom,
@@ -5669,6 +5813,46 @@ TRACE_EVENT(xfs_growfs_check_rtgeom,
);
#endif /* CONFIG_XFS_RT */

TRACE_DEFINE_ENUM(XC_FREE_BLOCKS);
TRACE_DEFINE_ENUM(XC_FREE_RTEXTENTS);
TRACE_DEFINE_ENUM(XC_FREE_RTAVAILABLE);

DECLARE_EVENT_CLASS(xfs_freeblocks_resv_class,
	TP_PROTO(struct xfs_mount *mp, enum xfs_free_counter ctr,
		 uint64_t delta, unsigned long caller_ip),
	TP_ARGS(mp, ctr, delta, caller_ip),
	TP_STRUCT__entry(
		__field(dev_t, dev)
		__field(enum xfs_free_counter, ctr)
		__field(uint64_t, delta)
		__field(uint64_t, avail)
		__field(uint64_t, total)
		__field(unsigned long, caller_ip)
	),
	TP_fast_assign(
		__entry->dev = mp->m_super->s_dev;
		__entry->ctr = ctr;
		__entry->delta = delta;
		__entry->avail = mp->m_free[ctr].res_avail;
		__entry->total = mp->m_free[ctr].res_total;
		__entry->caller_ip = caller_ip;
	),
	TP_printk("dev %d:%d ctr %s delta %llu avail %llu total %llu caller %pS",
		  MAJOR(__entry->dev), MINOR(__entry->dev),
		  __print_symbolic(__entry->ctr, XFS_FREECOUNTER_STR),
		  __entry->delta,
		  __entry->avail,
		  __entry->total,
		  (char *)__entry->caller_ip)
)
#define DEFINE_FREEBLOCKS_RESV_EVENT(name) \
DEFINE_EVENT(xfs_freeblocks_resv_class, name, \
	TP_PROTO(struct xfs_mount *mp, enum xfs_free_counter ctr, \
		 uint64_t delta, unsigned long caller_ip), \
	TP_ARGS(mp, ctr, delta, caller_ip))
DEFINE_FREEBLOCKS_RESV_EVENT(xfs_freecounter_reserved);
DEFINE_FREEBLOCKS_RESV_EVENT(xfs_freecounter_enospc);

#endif /* _TRACE_XFS_H */

#undef TRACE_INCLUDE_PATH
1211	fs/xfs/xfs_zone_alloc.c	Normal file (diff suppressed because it is too large)
70	fs/xfs/xfs_zone_alloc.h	Normal file
@@ -0,0 +1,70 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _XFS_ZONE_ALLOC_H
#define _XFS_ZONE_ALLOC_H

struct iomap_ioend;
struct xfs_open_zone;

struct xfs_zone_alloc_ctx {
	struct xfs_open_zone	*open_zone;
	xfs_filblks_t		reserved_blocks;
};

/*
 * Grab any available space, even if it is less than what the caller asked for.
 */
#define XFS_ZR_GREEDY		(1U << 0)
/*
 * Only grab instantly available space, don't wait or GC.
 */
#define XFS_ZR_NOWAIT		(1U << 1)
/*
 * Dip into the reserved pool.
 */
#define XFS_ZR_RESERVED		(1U << 2)

int xfs_zoned_space_reserve(struct xfs_inode *ip, xfs_filblks_t count_fsb,
		unsigned int flags, struct xfs_zone_alloc_ctx *ac);
void xfs_zoned_space_unreserve(struct xfs_inode *ip,
		struct xfs_zone_alloc_ctx *ac);
void xfs_zoned_add_available(struct xfs_mount *mp, xfs_filblks_t count_fsb);
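Taken together, the reservation entry points above suggest the calling pattern below. This is a minimal sketch assuming a write path that knows its block count up front; the fallback from a NOWAIT attempt to a blocking one is an illustration, not something this header mandates:

	struct xfs_zone_alloc_ctx	ac = { };
	int				error;

	/* try without blocking first, then fall back to waiting on GC */
	error = xfs_zoned_space_reserve(ip, count_fsb, XFS_ZR_NOWAIT, &ac);
	if (error == -EAGAIN)
		error = xfs_zoned_space_reserve(ip, count_fsb, 0, &ac);
	if (error)
		return error;

	/* ... perform the write that consumes ac.reserved_blocks ... */

	xfs_zoned_space_unreserve(ip, &ac);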
void xfs_zone_alloc_and_submit(struct iomap_ioend *ioend,
		struct xfs_open_zone **oz);
int xfs_zone_free_blocks(struct xfs_trans *tp, struct xfs_rtgroup *rtg,
		xfs_fsblock_t fsbno, xfs_filblks_t len);
int xfs_zoned_end_io(struct xfs_inode *ip, xfs_off_t offset, xfs_off_t count,
		xfs_daddr_t daddr, struct xfs_open_zone *oz,
		xfs_fsblock_t old_startblock);
void xfs_open_zone_put(struct xfs_open_zone *oz);

void xfs_zoned_wake_all(struct xfs_mount *mp);
bool xfs_zone_rgbno_is_valid(struct xfs_rtgroup *rtg, xfs_rgnumber_t rgbno);
void xfs_mark_rtg_boundary(struct iomap_ioend *ioend);

uint64_t xfs_zoned_default_resblks(struct xfs_mount *mp,
		enum xfs_free_counter ctr);
void xfs_zoned_show_stats(struct seq_file *m, struct xfs_mount *mp);

#ifdef CONFIG_XFS_RT
int xfs_mount_zones(struct xfs_mount *mp);
void xfs_unmount_zones(struct xfs_mount *mp);
void xfs_zone_gc_start(struct xfs_mount *mp);
void xfs_zone_gc_stop(struct xfs_mount *mp);
#else
static inline int xfs_mount_zones(struct xfs_mount *mp)
{
	return -EIO;
}
static inline void xfs_unmount_zones(struct xfs_mount *mp)
{
}
static inline void xfs_zone_gc_start(struct xfs_mount *mp)
{
}
static inline void xfs_zone_gc_stop(struct xfs_mount *mp)
{
}
#endif /* CONFIG_XFS_RT */

#endif /* _XFS_ZONE_ALLOC_H */
1165	fs/xfs/xfs_zone_gc.c	Normal file (diff suppressed because it is too large)
105	fs/xfs/xfs_zone_info.c	Normal file
@@ -0,0 +1,105 @@
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2023-2025 Christoph Hellwig.
 * Copyright (c) 2024-2025, Western Digital Corporation or its affiliates.
 */
#include "xfs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_rtgroup.h"
#include "xfs_zone_alloc.h"
#include "xfs_zone_priv.h"

static const char xfs_write_hint_shorthand[6][16] = {
	"NOT_SET", "NONE", "SHORT", "MEDIUM", "LONG", "EXTREME"};

static inline const char *
xfs_write_hint_to_str(
	uint8_t			write_hint)
{
	if (write_hint > WRITE_LIFE_EXTREME)
		return "UNKNOWN";
	return xfs_write_hint_shorthand[write_hint];
}

static void
xfs_show_open_zone(
	struct seq_file		*m,
	struct xfs_open_zone	*oz)
{
	seq_printf(m, "\t zone %d, wp %u, written %u, used %u, hint %s\n",
		   rtg_rgno(oz->oz_rtg),
		   oz->oz_write_pointer, oz->oz_written,
		   rtg_rmap(oz->oz_rtg)->i_used_blocks,
		   xfs_write_hint_to_str(oz->oz_write_hint));
}

static void
xfs_show_full_zone_used_distribution(
	struct seq_file		*m,
	struct xfs_mount	*mp)
{
	struct xfs_zone_info	*zi = mp->m_zone_info;
	unsigned int		reclaimable = 0, full, i;

	spin_lock(&zi->zi_used_buckets_lock);
	for (i = 0; i < XFS_ZONE_USED_BUCKETS; i++) {
		unsigned int	entries = zi->zi_used_bucket_entries[i];

		seq_printf(m, "\t %2u..%2u%%: %u\n",
			   i * (100 / XFS_ZONE_USED_BUCKETS),
			   (i + 1) * (100 / XFS_ZONE_USED_BUCKETS) - 1,
			   entries);
		reclaimable += entries;
	}
	spin_unlock(&zi->zi_used_buckets_lock);

	full = mp->m_sb.sb_rgcount;
	if (zi->zi_open_gc_zone)
		full--;
	full -= zi->zi_nr_open_zones;
	full -= atomic_read(&zi->zi_nr_free_zones);
	full -= reclaimable;

	seq_printf(m, "\t 100%%: %u\n", full);
}

void
xfs_zoned_show_stats(
	struct seq_file		*m,
	struct xfs_mount	*mp)
{
	struct xfs_zone_info	*zi = mp->m_zone_info;
	struct xfs_open_zone	*oz;

	seq_puts(m, "\n");

	seq_printf(m, "\tuser free RT blocks: %lld\n",
		   xfs_sum_freecounter(mp, XC_FREE_RTEXTENTS));
	seq_printf(m, "\treserved free RT blocks: %lld\n",
		   mp->m_free[XC_FREE_RTEXTENTS].res_avail);
	seq_printf(m, "\tuser available RT blocks: %lld\n",
		   xfs_sum_freecounter(mp, XC_FREE_RTAVAILABLE));
	seq_printf(m, "\treserved available RT blocks: %lld\n",
		   mp->m_free[XC_FREE_RTAVAILABLE].res_avail);
	seq_printf(m, "\tRT reservations required: %d\n",
		   !list_empty_careful(&zi->zi_reclaim_reservations));
	seq_printf(m, "\tRT GC required: %d\n",
		   xfs_zoned_need_gc(mp));

	seq_printf(m, "\tfree zones: %d\n", atomic_read(&zi->zi_nr_free_zones));
	seq_puts(m, "\topen zones:\n");
	spin_lock(&zi->zi_open_zones_lock);
	list_for_each_entry(oz, &zi->zi_open_zones, oz_entry)
		xfs_show_open_zone(m, oz);
	if (zi->zi_open_gc_zone) {
		seq_puts(m, "\topen gc zone:\n");
		xfs_show_open_zone(m, zi->zi_open_gc_zone);
	}
	spin_unlock(&zi->zi_open_zones_lock);
	seq_puts(m, "\tused blocks distribution (fully written zones):\n");
	xfs_show_full_zone_used_distribution(m, mp);
}
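For a sense of what this produces, the sketch below shows the shape of the output; every number is invented for illustration, only the labels and ordering follow from the seq_printf calls above:

	user free RT blocks: 1048576
	reserved free RT blocks: 131072
	user available RT blocks: 917504
	reserved available RT blocks: 65536
	RT reservations required: 0
	RT GC required: 0
	free zones: 12
	open zones:
	 zone 3, wp 2048, written 2048, used 1536, hint NONE
	used blocks distribution (fully written zones):
	  0.. 9%: 2
	 10..19%: 1
	 100%: 5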
119	fs/xfs/xfs_zone_priv.h	Normal file
@@ -0,0 +1,119 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _XFS_ZONE_PRIV_H
#define _XFS_ZONE_PRIV_H

struct xfs_open_zone {
	/*
	 * Entry in the open zone list and refcount.  Protected by
	 * zi_open_zones_lock in struct xfs_zone_info.
	 */
	struct list_head	oz_entry;
	atomic_t		oz_ref;

	/*
	 * oz_write_pointer is the write pointer at which space is handed out
	 * for conventional zones, or simply the count of blocks handed out
	 * so far for sequential write required zones, and is protected by
	 * oz_alloc_lock.
	 */
	spinlock_t		oz_alloc_lock;
	xfs_rgblock_t		oz_write_pointer;

	/*
	 * oz_written is the number of blocks for which we've received a
	 * write completion.  oz_written must always be <= oz_write_pointer
	 * and is protected by the ILOCK of the rmap inode.
	 */
	xfs_rgblock_t		oz_written;

	/*
	 * Write hint (data temperature) assigned to this zone, or
	 * WRITE_LIFE_NOT_SET if none was set.
	 */
	enum rw_hint		oz_write_hint;

	/*
	 * Is this open zone used for garbage collection?  There can only be a
	 * single open GC zone, which is pointed to by zi_open_gc_zone in
	 * struct xfs_zone_info.  Constant over the life time of an open zone.
	 */
	bool			oz_is_gc;

	/*
	 * Pointer to the RT groups structure for this open zone.  Constant
	 * over the life time of an open zone.
	 */
	struct xfs_rtgroup	*oz_rtg;
};

/*
 * Number of bitmap buckets to track reclaimable zones.  There are 10 buckets
 * so that each 10% of the usable capacity gets its own bucket and GC only
 * has to walk the bitmaps of the lesser used zones if there are any.
 */
#define XFS_ZONE_USED_BUCKETS		10u
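The mapping from a zone's used-block count to one of these buckets lives in fs/xfs/xfs_zone_alloc.c, whose diff is suppressed above. As a rough sketch of the idea only — the helper name and exact rounding are assumptions, not taken from this series:

	static unsigned int
	xfs_zone_used_bucket(
		struct xfs_mount	*mp,
		xfs_rgblock_t		used)
	{
		/* bucket 0 holds zones with 0..9% of their capacity still used */
		return min_t(unsigned int, XFS_ZONE_USED_BUCKETS - 1,
			     used * XFS_ZONE_USED_BUCKETS /
			     mp->m_groups[XG_TYPE_RTG].blocks);
	}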
struct xfs_zone_info {
	/*
	 * List of pending space reservations:
	 */
	spinlock_t		zi_reservation_lock;
	struct list_head	zi_reclaim_reservations;

	/*
	 * List and number of open zones:
	 */
	spinlock_t		zi_open_zones_lock;
	struct list_head	zi_open_zones;
	unsigned int		zi_nr_open_zones;

	/*
	 * Free zone search cursor and number of free zones:
	 */
	unsigned long		zi_free_zone_cursor;
	atomic_t		zi_nr_free_zones;

	/*
	 * Wait queue to wait for free zones or open zone resources to become
	 * available:
	 */
	wait_queue_head_t	zi_zone_wait;

	/*
	 * Pointer to the GC thread, and the current open zone used by GC
	 * (if any).
	 *
	 * zi_open_gc_zone is mostly private to the GC thread, but can be read
	 * for debugging from other threads, in which case zi_open_zones_lock
	 * must be taken to access it.
	 */
	struct task_struct	*zi_gc_thread;
	struct xfs_open_zone	*zi_open_gc_zone;

	/*
	 * List of zones that need a reset:
	 */
	spinlock_t		zi_reset_list_lock;
	struct xfs_group	*zi_reset_list;

	/*
	 * A set of bitmaps to bucket-sort reclaimable zones by used blocks
	 * to help garbage collection quickly find the best candidate for
	 * reclaim.
	 */
	spinlock_t		zi_used_buckets_lock;
	unsigned int		zi_used_bucket_entries[XFS_ZONE_USED_BUCKETS];
	unsigned long		*zi_used_bucket_bitmap[XFS_ZONE_USED_BUCKETS];
};

struct xfs_open_zone *xfs_open_zone(struct xfs_mount *mp,
		enum rw_hint write_hint, bool is_gc);

int xfs_zone_gc_reset_sync(struct xfs_rtgroup *rtg);
bool xfs_zoned_need_gc(struct xfs_mount *mp);
int xfs_zone_gc_mount(struct xfs_mount *mp);
void xfs_zone_gc_unmount(struct xfs_mount *mp);

void xfs_zoned_resv_wake_all(struct xfs_mount *mp);

#endif /* _XFS_ZONE_PRIV_H */
253	fs/xfs/xfs_zone_space_resv.c	Normal file
@@ -0,0 +1,253 @@
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2023-2025 Christoph Hellwig.
 * Copyright (c) 2024-2025, Western Digital Corporation or its affiliates.
 */
#include "xfs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_rtbitmap.h"
#include "xfs_zone_alloc.h"
#include "xfs_zone_priv.h"
#include "xfs_zones.h"

/*
 * Note: the zoned allocator does not support a rtextsize > 1, so this code
 * and the allocator itself use file system blocks interchangeably with
 * realtime extents without doing the otherwise required conversions.
 */

/*
 * Per-task space reservation.
 *
 * Tasks that need to wait for GC to free up space allocate one of these
 * on-stack and add it to the per-mount zi_reclaim_reservations list.
 * The GC thread will then wake the tasks in order when space becomes
 * available.
 */
struct xfs_zone_reservation {
	struct list_head	entry;
	struct task_struct	*task;
	xfs_filblks_t		count_fsb;
};

/*
 * Calculate the number of reserved blocks.
 *
 * XC_FREE_RTEXTENTS counts the user available capacity, to which the file
 * system can be filled, while XC_FREE_RTAVAILABLE counts the blocks instantly
 * available for writes without waiting for GC.
 *
 * For XC_FREE_RTAVAILABLE only the smaller reservation required for GC and
 * block zeroing is excluded from the user capacity, while XC_FREE_RTEXTENTS
 * is further restricted by at least one zone as well as the optional
 * persistently reserved blocks.  This allows the allocator to run more
 * smoothly by not always triggering GC.
 */
uint64_t
xfs_zoned_default_resblks(
	struct xfs_mount	*mp,
	enum xfs_free_counter	ctr)
{
	switch (ctr) {
	case XC_FREE_RTEXTENTS:
		return (uint64_t)XFS_RESERVED_ZONES *
			mp->m_groups[XG_TYPE_RTG].blocks +
			mp->m_sb.sb_rtreserved;
	case XC_FREE_RTAVAILABLE:
		return (uint64_t)XFS_GC_ZONES *
			mp->m_groups[XG_TYPE_RTG].blocks;
	default:
		ASSERT(0);
		return 0;
	}
}
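To make the arithmetic concrete, here is a worked example with invented values; the real XFS_RESERVED_ZONES and XFS_GC_ZONES constants live in xfs_zones.h, which is not part of this hunk:

	/*
	 * Illustrative only: zones of 65536 blocks, XFS_RESERVED_ZONES == 2,
	 * XFS_GC_ZONES == 1, and sb_rtreserved == 0 would yield:
	 *
	 *   XC_FREE_RTEXTENTS:   2 * 65536 + 0 = 131072 blocks held back
	 *   XC_FREE_RTAVAILABLE: 1 * 65536     =  65536 blocks held back
	 */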
void
xfs_zoned_resv_wake_all(
	struct xfs_mount	*mp)
{
	struct xfs_zone_info	*zi = mp->m_zone_info;
	struct xfs_zone_reservation *reservation;

	spin_lock(&zi->zi_reservation_lock);
	list_for_each_entry(reservation, &zi->zi_reclaim_reservations, entry)
		wake_up_process(reservation->task);
	spin_unlock(&zi->zi_reservation_lock);
}

void
xfs_zoned_add_available(
	struct xfs_mount	*mp,
	xfs_filblks_t		count_fsb)
{
	struct xfs_zone_info	*zi = mp->m_zone_info;
	struct xfs_zone_reservation *reservation;

	if (list_empty_careful(&zi->zi_reclaim_reservations)) {
		xfs_add_freecounter(mp, XC_FREE_RTAVAILABLE, count_fsb);
		return;
	}

	spin_lock(&zi->zi_reservation_lock);
	xfs_add_freecounter(mp, XC_FREE_RTAVAILABLE, count_fsb);
	count_fsb = xfs_sum_freecounter(mp, XC_FREE_RTAVAILABLE);
	list_for_each_entry(reservation, &zi->zi_reclaim_reservations, entry) {
		if (reservation->count_fsb > count_fsb)
			break;
		wake_up_process(reservation->task);
		count_fsb -= reservation->count_fsb;
	}
	spin_unlock(&zi->zi_reservation_lock);
}

static int
xfs_zoned_space_wait_error(
	struct xfs_mount	*mp)
{
	if (xfs_is_shutdown(mp))
		return -EIO;
	if (fatal_signal_pending(current))
		return -EINTR;
	return 0;
}

static int
xfs_zoned_reserve_available(
	struct xfs_inode	*ip,
	xfs_filblks_t		count_fsb,
	unsigned int		flags)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_zone_info	*zi = mp->m_zone_info;
	struct xfs_zone_reservation reservation = {
		.task		= current,
		.count_fsb	= count_fsb,
	};
	int			error;

	/*
	 * If there are no waiters, try to directly grab the available blocks
	 * from the percpu counter.
	 *
	 * If the caller wants to dip into the reserved pool also bypass the
	 * wait list.  This relies on the fact that we have a very generously
	 * sized reserved pool that always has enough space.  If the reserved
	 * allocations fail we're in trouble.
	 */
	if (likely(list_empty_careful(&zi->zi_reclaim_reservations) ||
	    (flags & XFS_ZR_RESERVED))) {
		error = xfs_dec_freecounter(mp, XC_FREE_RTAVAILABLE, count_fsb,
				flags & XFS_ZR_RESERVED);
		if (error != -ENOSPC)
			return error;
	}

	if (flags & XFS_ZR_NOWAIT)
		return -EAGAIN;

	spin_lock(&zi->zi_reservation_lock);
	list_add_tail(&reservation.entry, &zi->zi_reclaim_reservations);
	while ((error = xfs_zoned_space_wait_error(mp)) == 0) {
		set_current_state(TASK_KILLABLE);

		error = xfs_dec_freecounter(mp, XC_FREE_RTAVAILABLE, count_fsb,
				flags & XFS_ZR_RESERVED);
		if (error != -ENOSPC)
			break;

		/*
		 * If there is no reclaimable group left and we aren't still
		 * processing a pending GC request give up as we're fully out
		 * of space.
		 */
		if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_RECLAIMABLE) &&
		    !xfs_is_zonegc_running(mp))
			break;

		spin_unlock(&zi->zi_reservation_lock);
		schedule();
		spin_lock(&zi->zi_reservation_lock);
	}
	list_del(&reservation.entry);
	spin_unlock(&zi->zi_reservation_lock);

	__set_current_state(TASK_RUNNING);
	return error;
}
/*
 * Implement greedy space allocation for short writes by trying to grab all
 * that is left after locking out other threads from trying to do the same.
 *
 * This isn't exactly optimal and can hopefully be replaced by a proper
 * percpu_counter primitive one day.
 */
static int
xfs_zoned_reserve_extents_greedy(
	struct xfs_inode	*ip,
	xfs_filblks_t		*count_fsb,
	unsigned int		flags)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_zone_info	*zi = mp->m_zone_info;
	s64			len = *count_fsb;
	int			error = -ENOSPC;

	spin_lock(&zi->zi_reservation_lock);
	len = min(len, xfs_sum_freecounter(mp, XC_FREE_RTEXTENTS));
	if (len > 0) {
		*count_fsb = len;
		error = xfs_dec_freecounter(mp, XC_FREE_RTEXTENTS, *count_fsb,
				flags & XFS_ZR_RESERVED);
	}
	spin_unlock(&zi->zi_reservation_lock);
	return error;
}

int
xfs_zoned_space_reserve(
	struct xfs_inode	*ip,
	xfs_filblks_t		count_fsb,
	unsigned int		flags,
	struct xfs_zone_alloc_ctx *ac)
{
	struct xfs_mount	*mp = ip->i_mount;
	int			error;

	ASSERT(ac->reserved_blocks == 0);
	ASSERT(ac->open_zone == NULL);

	error = xfs_dec_freecounter(mp, XC_FREE_RTEXTENTS, count_fsb,
			flags & XFS_ZR_RESERVED);
	if (error == -ENOSPC && (flags & XFS_ZR_GREEDY) && count_fsb > 1)
		error = xfs_zoned_reserve_extents_greedy(ip, &count_fsb, flags);
	if (error)
		return error;

	error = xfs_zoned_reserve_available(ip, count_fsb, flags);
	if (error) {
		xfs_add_freecounter(mp, XC_FREE_RTEXTENTS, count_fsb);
		return error;
	}
	ac->reserved_blocks = count_fsb;
	return 0;
}

void
xfs_zoned_space_unreserve(
	struct xfs_inode	*ip,
	struct xfs_zone_alloc_ctx *ac)
{
	if (ac->reserved_blocks > 0) {
		struct xfs_mount	*mp = ip->i_mount;

		xfs_zoned_add_available(mp, ac->reserved_blocks);
		xfs_add_freecounter(mp, XC_FREE_RTEXTENTS, ac->reserved_blocks);
	}
	if (ac->open_zone)
		xfs_open_zone_put(ac->open_zone);
}
@@ -299,7 +299,7 @@ static vm_fault_t zonefs_filemap_page_mkwrite(struct vm_fault *vmf)

	/* Serialize against truncates */
	filemap_invalidate_lock_shared(inode->i_mapping);
	ret = iomap_page_mkwrite(vmf, &zonefs_write_iomap_ops);
	ret = iomap_page_mkwrite(vmf, &zonefs_write_iomap_ops, NULL);
	filemap_invalidate_unlock_shared(inode->i_mapping);

	sb_end_pagefault(inode->i_sb);
@@ -56,6 +56,10 @@ struct vm_fault;
 *
 * IOMAP_F_BOUNDARY indicates that I/O and I/O completions for this iomap must
 * never be merged with the mapping before it.
 *
 * IOMAP_F_ANON_WRITE indicates that (write) I/O does not have a target block
 * assigned to it yet and the file system will do that in the bio submission
 * handler, splitting the I/O as needed.
 */
#define IOMAP_F_NEW		(1U << 0)
#define IOMAP_F_DIRTY		(1U << 1)
@@ -68,6 +72,7 @@ struct vm_fault;
#endif /* CONFIG_BUFFER_HEAD */
#define IOMAP_F_XATTR		(1U << 5)
#define IOMAP_F_BOUNDARY	(1U << 6)
#define IOMAP_F_ANON_WRITE	(1U << 7)

/*
 * Flags set by the core iomap code during operations:
@@ -111,6 +116,8 @@ struct iomap {

static inline sector_t iomap_sector(const struct iomap *iomap, loff_t pos)
{
	if (iomap->flags & IOMAP_F_ANON_WRITE)
		return U64_MAX;			/* invalid */
	return (iomap->addr + pos - iomap->offset) >> SECTOR_SHIFT;
}
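For a mapping that is not IOMAP_F_ANON_WRITE the translation is plain byte arithmetic. With illustrative values (not taken from this patch): addr = 1 MiB, offset = 0 and pos = 4096 gives (1048576 + 4096 - 0) >> 9 = sector 2056, i.e. the byte position within the mapping expressed in 512-byte sectors.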
@@ -182,7 +189,9 @@ struct iomap_folio_ops {
#else
#define IOMAP_DAX		0
#endif /* CONFIG_FS_DAX */
#define IOMAP_ATOMIC		(1 << 9)
#define IOMAP_ATOMIC_HW		(1 << 9) /* HW-based torn-write protection */
#define IOMAP_DONTCACHE		(1 << 10)
#define IOMAP_ATOMIC_SW		(1 << 11) /* SW-based torn-write protection */

struct iomap_ops {
	/*
@@ -211,8 +220,10 @@ struct iomap_ops {
 * calls to iomap_iter().  Treat as read-only in the body.
 * @len: The remaining length of the file segment we're operating on.
 *	It is updated at the same time as @pos.
 * @processed: The number of bytes processed by the body in the most recent
 *	iteration, or a negative errno. 0 causes the iteration to stop.
 * @iter_start_pos: The original start pos for the current iomap.  Used for
 *	incremental iter advance.
 * @status: Status of the most recent iteration.  Zero on success or a negative
 *	errno on error.
 * @flags: Zero or more of the iomap_begin flags above.
 * @iomap: Map describing the I/O iteration
 * @srcmap: Source map for COW operations
@@ -221,7 +232,8 @@ struct iomap_iter {
	struct inode *inode;
	loff_t pos;
	u64 len;
	s64 processed;
	loff_t iter_start_pos;
	int status;
	unsigned flags;
	struct iomap iomap;
	struct iomap srcmap;
@@ -229,6 +241,26 @@ struct iomap_iter {
};

int iomap_iter(struct iomap_iter *iter, const struct iomap_ops *ops);
int iomap_iter_advance(struct iomap_iter *iter, u64 *count);

/**
 * iomap_length_trim - trimmed length of the current iomap iteration
 * @iter: iteration structure
 * @pos: File position to trim from.
 * @len: Length of the mapping to trim to.
 *
 * Returns a trimmed length that the operation applies to for the current
 * iteration.
 */
static inline u64 iomap_length_trim(const struct iomap_iter *iter, loff_t pos,
		u64 len)
{
	u64 end = iter->iomap.offset + iter->iomap.length;

	if (iter->srcmap.type != IOMAP_HOLE)
		end = min(end, iter->srcmap.offset + iter->srcmap.length);
	return min(len, end - pos);
}

/**
 * iomap_length - length of the current iomap iteration
@@ -238,11 +270,17 @@ int iomap_iter(struct iomap_iter *iter, const struct iomap_ops *ops);
 */
static inline u64 iomap_length(const struct iomap_iter *iter)
{
	u64 end = iter->iomap.offset + iter->iomap.length;
	return iomap_length_trim(iter, iter->pos, iter->len);
}

	if (iter->srcmap.type != IOMAP_HOLE)
		end = min(end, iter->srcmap.offset + iter->srcmap.length);
	return min(iter->len, end - iter->pos);
/**
 * iomap_iter_advance_full - advance by the full length of current map
 */
static inline int iomap_iter_advance_full(struct iomap_iter *iter)
{
	u64 length = iomap_length(iter);

	return iomap_iter_advance(iter, &length);
}
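Putting the new pieces together — @status replacing @processed, plus explicit advancing via iomap_iter_advance() — a caller-side loop would now be expected to look roughly like the sketch below. The body helper is hypothetical and the surrounding variables are caller context; only iomap_iter(), iomap_length() and iomap_iter_advance_full() come from this header:

	struct iomap_iter iter = {
		.inode	= inode,
		.pos	= pos,
		.len	= len,
		.flags	= IOMAP_WRITE,
	};
	int ret;

	while ((ret = iomap_iter(&iter, ops)) > 0) {
		/* operate on [iter.pos, iter.pos + iomap_length(&iter)) */
		iter.status = do_one_mapping(&iter);	/* hypothetical body */
		if (!iter.status)
			iter.status = iomap_iter_advance_full(&iter);
	}
	return ret;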
/**
@@ -306,12 +344,11 @@ bool iomap_dirty_folio(struct address_space *mapping, struct folio *folio);
int iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len,
		const struct iomap_ops *ops);
int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len,
		bool *did_zero, const struct iomap_ops *ops);
		bool *did_zero, const struct iomap_ops *ops, void *private);
int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
		const struct iomap_ops *ops);
vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf,
		const struct iomap_ops *ops);

		const struct iomap_ops *ops, void *private);
vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops,
		void *private);
typedef void (*iomap_punch_t)(struct inode *inode, loff_t offset, loff_t length,
		struct iomap *iomap);
void iomap_write_delalloc_release(struct inode *inode, loff_t start_byte,
@@ -327,17 +364,43 @@ loff_t iomap_seek_data(struct inode *inode, loff_t offset,
sector_t iomap_bmap(struct address_space *mapping, sector_t bno,
		const struct iomap_ops *ops);

/*
 * Flags for iomap_ioend->io_flags.
 */
/* shared COW extent */
#define IOMAP_IOEND_SHARED		(1U << 0)
/* unwritten extent */
#define IOMAP_IOEND_UNWRITTEN		(1U << 1)
/* don't merge into previous ioend */
#define IOMAP_IOEND_BOUNDARY		(1U << 2)
/* is direct I/O */
#define IOMAP_IOEND_DIRECT		(1U << 3)

/*
 * Flags that if set on either ioend prevent the merge of two ioends.
 * (IOMAP_IOEND_BOUNDARY also prevents merges, but only one-way)
 */
#define IOMAP_IOEND_NOMERGE_FLAGS \
	(IOMAP_IOEND_SHARED | IOMAP_IOEND_UNWRITTEN | IOMAP_IOEND_DIRECT)
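A minimal sketch of how these flags gate merging; the predicate name and the contiguity check are assumptions for illustration — the real merge logic lives in iomap's ioend code, not in this header:

	static bool ioend_mergeable(const struct iomap_ioend *prev,
				    const struct iomap_ioend *next)
	{
		/* any no-merge flag on either side rules a merge out entirely */
		if ((prev->io_flags | next->io_flags) & IOMAP_IOEND_NOMERGE_FLAGS)
			return false;
		/* BOUNDARY is one-way: @next must not merge into what precedes it */
		if (next->io_flags & IOMAP_IOEND_BOUNDARY)
			return false;
		/* only logically contiguous ioends are merge candidates */
		return prev->io_offset + prev->io_size == next->io_offset;
	}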
/*
 * Structure for writeback I/O completions.
 *
 * File systems implementing ->submit_ioend (for buffered I/O) or ->submit_io
 * (for direct I/O) can split a bio generated by iomap.  In that case the
 * parent ioend it was split from is recorded in ioend->io_parent.
 */
struct iomap_ioend {
	struct list_head	io_list;	/* next ioend in chain */
	u16			io_type;
	u16			io_flags;	/* IOMAP_F_* */
	u16			io_flags;	/* IOMAP_IOEND_* */
	struct inode		*io_inode;	/* file being written to */
	size_t			io_size;	/* size of data within eof */
	size_t			io_size;	/* size of the extent */
	atomic_t		io_remaining;	/* completion defer count */
	int			io_error;	/* stashed away status */
	struct iomap_ioend	*io_parent;	/* parent for completions */
	loff_t			io_offset;	/* offset in the file */
	sector_t		io_sector;	/* start sector of ioend */
	void			*io_private;	/* file system private data */
	struct bio		io_bio;		/* MUST BE LAST! */
};
@@ -362,12 +425,14 @@ struct iomap_writeback_ops {
		loff_t offset, unsigned len);

	/*
	 * Optional, allows the file systems to perform actions just before
	 * submitting the bio and/or override the bio end_io handler for complex
	 * operations like copy on write extent manipulation or unwritten extent
	 * conversions.
	 * Optional, allows the file systems to hook into bio submission,
	 * including overriding the bi_end_io handler.
	 *
	 * Returns 0 if the bio was successfully submitted, or a negative
	 * error code if status was non-zero or another error happened and
	 * the bio could not be submitted.
	 */
	int (*prepare_ioend)(struct iomap_ioend *ioend, int status);
	int (*submit_ioend)(struct iomap_writepage_ctx *wpc, int status);

	/*
	 * Optional, allows the file system to discard state on a page where
@@ -383,6 +448,10 @@ struct iomap_writepage_ctx {
	u32			nr_folios;	/* folios added to the ioend */
};

struct iomap_ioend *iomap_init_ioend(struct inode *inode, struct bio *bio,
		loff_t file_offset, u16 ioend_flags);
struct iomap_ioend *iomap_split_ioend(struct iomap_ioend *ioend,
		unsigned int max_len, bool is_append);
void iomap_finish_ioends(struct iomap_ioend *ioend, int error);
void iomap_ioend_try_merge(struct iomap_ioend *ioend,
		struct list_head *more_ioends);
@@ -434,6 +503,11 @@ struct iomap_dio_ops {
 */
#define IOMAP_DIO_PARTIAL		(1 << 2)

/*
 * Use software-based torn-write protection.
 */
#define IOMAP_DIO_ATOMIC_SW		(1 << 3)

ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
		const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
		unsigned int dio_flags, void *private, size_t done_before);
@@ -454,4 +528,6 @@ int iomap_swapfile_activate(struct swap_info_struct *sis,
# define iomap_swapfile_activate(sis, swapfile, pagespan, ops) (-EIO)
#endif /* CONFIG_SWAP */

extern struct bio_set iomap_ioend_bioset;

#endif /* LINUX_IOMAP_H */