Merge branch 'xfs-6.15-folios_vmalloc' into XFS-for-linus-6.15-merge

Merge buffer cache conversion to folios and vmalloc

Signed-off-by: Carlos Maiolino <cem@kernel.org>
Carlos Maiolino
2025-03-18 14:10:30 +01:00
16 changed files with 206 additions and 437 deletions


@@ -514,8 +514,8 @@ IOMAP_WRITE`` with any combination of the following enhancements:
if the mapping is unwritten and the filesystem cannot handle zeroing
the unaligned regions without exposing stale contents.
* ``IOMAP_ATOMIC``: This write is being issued with torn-write
protection.
* ``IOMAP_ATOMIC_HW``: This write is being issued with torn-write
protection based on HW-offload support.
Only a single bio can be created for the write, and the write must
not be split into multiple I/O requests, i.e. flag REQ_ATOMIC must be
set.
@@ -526,8 +526,20 @@ IOMAP_WRITE`` with any combination of the following enhancements:
conversion or copy on write), all updates for the entire file range
must be committed atomically as well.
Only one space mapping is allowed per untorn write.
Untorn writes must be aligned to, and must not be longer than, a
single file block.
Untorn writes may be longer than a single file block. In all cases,
the mapping start disk block must have at least the same alignment as
the write offset.
* ``IOMAP_ATOMIC_SW``: This write is being issued with torn-write
protection via a software mechanism provided by the filesystem.
None of the disk block alignment and single bio restrictions which
apply to IOMAP_ATOMIC_HW apply here.
SW-based untorn writes are typically used as a fallback when HW-based
untorn writes cannot be issued, e.g. because the range of the write
covers multiple extents, making it impossible to issue a single bio.
All filesystem metadata updates for the entire file range must be
committed atomically as well.
Callers commonly hold ``i_rwsem`` in shared or exclusive mode before
calling this function.
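
The distinction above is easiest to see from the caller's side. Below is a
minimal, hypothetical sketch of a filesystem direct I/O write path that falls
back to the software mechanism when the hardware path cannot be used;
fs_can_use_hw_atomic(), fs_dio_write_iomap_ops and fs_dio_write_ops are
invented names, while IOCB_ATOMIC, IOMAP_DIO_ATOMIC_SW and iomap_dio_rw()
are the real interfaces touched by this series.

/*
 * Hypothetical caller sketch, not part of this series: request SW-based
 * torn-write protection when the HW-based path cannot be used, e.g.
 * because the write range maps to more than one extent.
 */
static ssize_t
fs_atomic_dio_write(struct kiocb *iocb, struct iov_iter *from)
{
	unsigned int	dio_flags = 0;

	if ((iocb->ki_flags & IOCB_ATOMIC) &&
	    !fs_can_use_hw_atomic(iocb, from))	/* invented helper */
		dio_flags |= IOMAP_DIO_ATOMIC_SW; /* iomap sets IOMAP_ATOMIC_SW */

	return iomap_dio_rw(iocb, from, &fs_dio_write_iomap_ops,
			&fs_dio_write_ops, dio_flags, NULL, 0);
}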


@@ -3467,7 +3467,7 @@ static inline bool ext4_want_directio_fallback(unsigned flags, ssize_t written)
return false;
/* atomic writes are all-or-nothing */
if (flags & IOMAP_ATOMIC)
if (flags & IOMAP_ATOMIC_HW)
return false;
/* can only try again if we wrote nothing */


@@ -317,7 +317,7 @@ static int iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio,
* clearing the WRITE_THROUGH flag in the dio request.
*/
static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio,
const struct iomap *iomap, bool use_fua, bool atomic)
const struct iomap *iomap, bool use_fua, bool atomic_hw)
{
blk_opf_t opflags = REQ_SYNC | REQ_IDLE;
@@ -329,7 +329,7 @@ static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio,
opflags |= REQ_FUA;
else
dio->flags &= ~IOMAP_DIO_WRITE_THROUGH;
if (atomic)
if (atomic_hw)
opflags |= REQ_ATOMIC;
return opflags;
@@ -340,8 +340,8 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
const struct iomap *iomap = &iter->iomap;
struct inode *inode = iter->inode;
unsigned int fs_block_size = i_blocksize(inode), pad;
bool atomic_hw = iter->flags & IOMAP_ATOMIC_HW;
const loff_t length = iomap_length(iter);
bool atomic = iter->flags & IOMAP_ATOMIC;
loff_t pos = iter->pos;
blk_opf_t bio_opf;
struct bio *bio;
@@ -351,7 +351,7 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
u64 copied = 0;
size_t orig_count;
if (atomic && length != fs_block_size)
if (atomic_hw && length != iter->len)
return -EINVAL;
if ((pos | length) & (bdev_logical_block_size(iomap->bdev) - 1) ||
@@ -428,7 +428,7 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
goto out;
}
bio_opf = iomap_dio_bio_opflags(dio, iomap, use_fua, atomic);
bio_opf = iomap_dio_bio_opflags(dio, iomap, use_fua, atomic_hw);
nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter, BIO_MAX_VECS);
do {
@@ -461,7 +461,7 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
}
n = bio->bi_iter.bi_size;
if (WARN_ON_ONCE(atomic && n != length)) {
if (WARN_ON_ONCE(atomic_hw && n != length)) {
/*
* This bio should have covered the complete length,
* which it doesn't, so error. We may need to zero out
@@ -650,9 +650,6 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
if (iocb->ki_flags & IOCB_NOWAIT)
iomi.flags |= IOMAP_NOWAIT;
if (iocb->ki_flags & IOCB_ATOMIC)
iomi.flags |= IOMAP_ATOMIC;
if (iov_iter_rw(iter) == READ) {
/* reads can always complete inline */
dio->flags |= IOMAP_DIO_INLINE_COMP;
@@ -687,6 +684,11 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
iomi.flags |= IOMAP_OVERWRITE_ONLY;
}
if (dio_flags & IOMAP_DIO_ATOMIC_SW)
iomi.flags |= IOMAP_ATOMIC_SW;
else if (iocb->ki_flags & IOCB_ATOMIC)
iomi.flags |= IOMAP_ATOMIC_HW;
/* for data sync or sync, we need sync completion processing */
if (iocb_is_dsync(iocb)) {
dio->flags |= IOMAP_DIO_NEED_SYNC;


@@ -99,7 +99,7 @@ DEFINE_RANGE_EVENT(iomap_dio_rw_queued);
{ IOMAP_FAULT, "FAULT" }, \
{ IOMAP_DIRECT, "DIRECT" }, \
{ IOMAP_NOWAIT, "NOWAIT" }, \
{ IOMAP_ATOMIC, "ATOMIC" }
{ IOMAP_ATOMIC_HW, "ATOMIC_HW" }
#define IOMAP_F_FLAGS_STRINGS \
{ IOMAP_F_NEW, "NEW" }, \


@@ -364,7 +364,7 @@ xfs_ialloc_inode_init(
(j * M_IGEO(mp)->blocks_per_cluster));
error = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
mp->m_bsize * M_IGEO(mp)->blocks_per_cluster,
XBF_UNMAPPED, &fbuf);
0, &fbuf);
if (error)
return error;


@@ -137,7 +137,7 @@ xfs_imap_to_bp(
int error;
error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno,
imap->im_len, XBF_UNMAPPED, bpp, &xfs_inode_buf_ops);
imap->im_len, 0, bpp, &xfs_inode_buf_ops);
if (xfs_metadata_is_sick(error))
xfs_agno_mark_sick(mp, xfs_daddr_to_agno(mp, imap->im_blkno),
XFS_SICK_AG_INODES);


@@ -1560,8 +1560,7 @@ xrep_dinode_core(
/* Read the inode cluster buffer. */
error = xfs_trans_read_buf(sc->mp, sc->tp, sc->mp->m_ddev_targp,
ri->imap.im_blkno, ri->imap.im_len, XBF_UNMAPPED, &bp,
NULL);
ri->imap.im_blkno, ri->imap.im_len, 0, &bp, NULL);
if (error)
return error;


@@ -55,27 +55,6 @@ static inline bool xfs_buf_is_uncached(struct xfs_buf *bp)
return bp->b_rhash_key == XFS_BUF_DADDR_NULL;
}
static inline int
xfs_buf_is_vmapped(
struct xfs_buf *bp)
{
/*
* Return true if the buffer is vmapped.
*
* b_addr is null if the buffer is not mapped, but the code is clever
* enough to know it doesn't have to map a single page, so the check has
* to be both for b_addr and bp->b_page_count > 1.
*/
return bp->b_addr && bp->b_page_count > 1;
}
static inline int
xfs_buf_vmap_len(
struct xfs_buf *bp)
{
return (bp->b_page_count * PAGE_SIZE);
}
/*
* When we mark a buffer stale, we remove the buffer from the LRU and clear the
* b_lru_ref count so that the buffer is freed immediately when the buffer
@@ -159,7 +138,7 @@ _xfs_buf_alloc(
* We don't want certain flags to appear in b_flags unless they are
* specifically set by later operations on the buffer.
*/
flags &= ~(XBF_UNMAPPED | XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD);
flags &= ~(XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD);
/*
* A new buffer is held and locked by the owner. This ensures that the
@@ -204,29 +183,6 @@ _xfs_buf_alloc(
return 0;
}
static void
xfs_buf_free_pages(
struct xfs_buf *bp)
{
uint i;
ASSERT(bp->b_flags & _XBF_PAGES);
if (xfs_buf_is_vmapped(bp))
vm_unmap_ram(bp->b_addr, bp->b_page_count);
for (i = 0; i < bp->b_page_count; i++) {
if (bp->b_pages[i])
__free_page(bp->b_pages[i]);
}
mm_account_reclaimed_pages(bp->b_page_count);
if (bp->b_pages != bp->b_page_array)
kfree(bp->b_pages);
bp->b_pages = NULL;
bp->b_flags &= ~_XBF_PAGES;
}
static void
xfs_buf_free_callback(
struct callback_head *cb)
@@ -241,154 +197,148 @@ static void
xfs_buf_free(
struct xfs_buf *bp)
{
unsigned int size = BBTOB(bp->b_length);
trace_xfs_buf_free(bp, _RET_IP_);
ASSERT(list_empty(&bp->b_lru));
if (xfs_buftarg_is_mem(bp->b_target))
xmbuf_unmap_page(bp);
else if (bp->b_flags & _XBF_PAGES)
xfs_buf_free_pages(bp);
if (!xfs_buftarg_is_mem(bp->b_target) && size >= PAGE_SIZE)
mm_account_reclaimed_pages(howmany(size, PAGE_SHIFT));
if (is_vmalloc_addr(bp->b_addr))
vfree(bp->b_addr);
else if (bp->b_flags & _XBF_KMEM)
kfree(bp->b_addr);
else
folio_put(virt_to_folio(bp->b_addr));
call_rcu(&bp->b_rcu, xfs_buf_free_callback);
}
static int
xfs_buf_alloc_kmem(
struct xfs_buf *bp,
xfs_buf_flags_t flags)
struct xfs_buf *bp,
size_t size,
gfp_t gfp_mask)
{
gfp_t gfp_mask = GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL;
size_t size = BBTOB(bp->b_length);
ASSERT(is_power_of_2(size));
ASSERT(size < PAGE_SIZE);
/* Assure zeroed buffer for non-read cases. */
if (!(flags & XBF_READ))
gfp_mask |= __GFP_ZERO;
bp->b_addr = kmalloc(size, gfp_mask);
bp->b_addr = kmalloc(size, gfp_mask | __GFP_NOFAIL);
if (!bp->b_addr)
return -ENOMEM;
if (((unsigned long)(bp->b_addr + size - 1) & PAGE_MASK) !=
((unsigned long)bp->b_addr & PAGE_MASK)) {
/* b_addr spans two pages - use alloc_page instead */
/*
* Slab guarantees that we get back naturally aligned allocations for
* power of two sizes. Keep this check as the canary in the coal mine
* if anything changes in slab.
*/
if (WARN_ON_ONCE(!IS_ALIGNED((unsigned long)bp->b_addr, size))) {
kfree(bp->b_addr);
bp->b_addr = NULL;
return -ENOMEM;
}
bp->b_offset = offset_in_page(bp->b_addr);
bp->b_pages = bp->b_page_array;
bp->b_pages[0] = kmem_to_page(bp->b_addr);
bp->b_page_count = 1;
bp->b_flags |= _XBF_KMEM;
trace_xfs_buf_backing_kmem(bp, _RET_IP_);
return 0;
}
/*
* Allocate backing memory for a buffer.
*
* For tmpfs-backed buffers used by in-memory btrees this directly maps the
* tmpfs page cache folios.
*
* For real file system buffers there are three different kinds backing memory:
*
* The first type backs the buffer by a kmalloc allocation. This is done for
* less than PAGE_SIZE allocations to avoid wasting memory.
*
* The second type is a single folio buffer - this may be a high order folio or
* just a single page sized folio, but either way they get treated the same way
* by the rest of the code - the buffer memory spans a single contiguous memory
* region that we don't have to map and unmap to access the data directly.
*
* The third type of buffer is the vmalloc()d buffer. This provides the buffer
* with the required contiguous memory region but backed by discontiguous
* physical pages.
*/
static int
xfs_buf_alloc_pages(
xfs_buf_alloc_backing_mem(
struct xfs_buf *bp,
xfs_buf_flags_t flags)
{
size_t size = BBTOB(bp->b_length);
gfp_t gfp_mask = GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOWARN;
long filled = 0;
struct folio *folio;
if (flags & XBF_READ_AHEAD)
gfp_mask |= __GFP_NORETRY;
/* Make sure that we have a page list */
bp->b_page_count = DIV_ROUND_UP(BBTOB(bp->b_length), PAGE_SIZE);
if (bp->b_page_count <= XB_PAGES) {
bp->b_pages = bp->b_page_array;
} else {
bp->b_pages = kzalloc(sizeof(struct page *) * bp->b_page_count,
gfp_mask);
if (!bp->b_pages)
return -ENOMEM;
}
bp->b_flags |= _XBF_PAGES;
if (xfs_buftarg_is_mem(bp->b_target))
return xmbuf_map_backing_mem(bp);
/* Assure zeroed buffer for non-read cases. */
if (!(flags & XBF_READ))
gfp_mask |= __GFP_ZERO;
if (flags & XBF_READ_AHEAD)
gfp_mask |= __GFP_NORETRY;
/*
* Bulk filling of pages can take multiple calls. Not filling the entire
* array is not an allocation failure, so don't back off if we get at
* least one extra page.
* For buffers smaller than PAGE_SIZE use a kmalloc allocation if that
* is properly aligned. The slab allocator now guarantees an aligned
* allocation for all power of two sizes, which matches most of the
* smaller than PAGE_SIZE buffers used by XFS.
*/
for (;;) {
long last = filled;
if (size < PAGE_SIZE && is_power_of_2(size))
return xfs_buf_alloc_kmem(bp, size, gfp_mask);
filled = alloc_pages_bulk(gfp_mask, bp->b_page_count,
bp->b_pages);
if (filled == bp->b_page_count) {
XFS_STATS_INC(bp->b_mount, xb_page_found);
break;
}
/*
* Don't bother with the retry loop for single PAGE allocations: vmalloc
* won't do any better.
*/
if (size <= PAGE_SIZE)
gfp_mask |= __GFP_NOFAIL;
if (filled != last)
continue;
if (flags & XBF_READ_AHEAD) {
xfs_buf_free_pages(bp);
/*
* Optimistically attempt a single high order folio allocation for
* larger than PAGE_SIZE buffers.
*
* Allocating a high order folio makes the assumption that buffers are a
* power-of-2 size, matching the power-of-2 folios sizes available.
*
* The exception here are user xattr data buffers, which can be arbitrarily
* sized up to 64kB plus structure metadata, skip straight to the vmalloc
* path for them instead of wasting memory here.
*/
if (size > PAGE_SIZE) {
if (!is_power_of_2(size))
goto fallback;
gfp_mask &= ~__GFP_DIRECT_RECLAIM;
gfp_mask |= __GFP_NORETRY;
}
folio = folio_alloc(gfp_mask, get_order(size));
if (!folio) {
if (size <= PAGE_SIZE)
return -ENOMEM;
}
trace_xfs_buf_backing_fallback(bp, _RET_IP_);
goto fallback;
}
bp->b_addr = folio_address(folio);
trace_xfs_buf_backing_folio(bp, _RET_IP_);
return 0;
fallback:
for (;;) {
bp->b_addr = __vmalloc(size, gfp_mask);
if (bp->b_addr)
break;
if (flags & XBF_READ_AHEAD)
return -ENOMEM;
XFS_STATS_INC(bp->b_mount, xb_page_retries);
memalloc_retry_wait(gfp_mask);
}
return 0;
}
/*
* Map buffer into kernel address-space if necessary.
*/
STATIC int
_xfs_buf_map_pages(
struct xfs_buf *bp,
xfs_buf_flags_t flags)
{
ASSERT(bp->b_flags & _XBF_PAGES);
if (bp->b_page_count == 1) {
/* A single page buffer is always mappable */
bp->b_addr = page_address(bp->b_pages[0]);
} else if (flags & XBF_UNMAPPED) {
bp->b_addr = NULL;
} else {
int retried = 0;
unsigned nofs_flag;
/*
* vm_map_ram() will allocate auxiliary structures (e.g.
* pagetables) with GFP_KERNEL, yet we often under a scoped nofs
* context here. Mixing GFP_KERNEL with GFP_NOFS allocations
* from the same call site that can be run from both above and
* below memory reclaim causes lockdep false positives. Hence we
* always need to force this allocation to nofs context because
* we can't pass __GFP_NOLOCKDEP down to auxillary structures to
* prevent false positive lockdep reports.
*
* XXX(dgc): I think dquot reclaim is the only place we can get
* to this function from memory reclaim context now. If we fix
* that like we've fixed inode reclaim to avoid writeback from
* reclaim, this nofs wrapping can go away.
*/
nofs_flag = memalloc_nofs_save();
do {
bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
-1);
if (bp->b_addr)
break;
vm_unmap_aliases();
} while (retried++ <= 1);
memalloc_nofs_restore(nofs_flag);
if (!bp->b_addr)
return -ENOMEM;
}
trace_xfs_buf_backing_vmalloc(bp, _RET_IP_);
return 0;
}
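
The comment block above lists three kinds of backing memory; the sketch below
(not from the patch, and ignoring in-memory buftargs) condenses the selection
order of xfs_buf_alloc_backing_mem() into a standalone helper. The enum and
function name are invented for illustration.

/* Hedged illustration only -- the enum and helper are invented. */
enum xfs_buf_backing {
	XB_BACKING_KMALLOC,	/* slab object, naturally aligned */
	XB_BACKING_FOLIO,	/* a single, possibly high-order, folio */
	XB_BACKING_VMALLOC,	/* virtually contiguous fallback */
};

static enum xfs_buf_backing
xfs_buf_backing_choice(
	size_t			size)
{
	if (size < PAGE_SIZE && is_power_of_2(size))
		return XB_BACKING_KMALLOC;
	if (size > PAGE_SIZE && !is_power_of_2(size))
		return XB_BACKING_VMALLOC;	/* e.g. oddly sized xattr buffers */
	return XB_BACKING_FOLIO;		/* falls back to vmalloc if the
						 * folio allocation fails */
}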
@@ -507,7 +457,7 @@ xfs_buf_find_lock(
return -ENOENT;
}
ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0);
bp->b_flags &= _XBF_KMEM | _XBF_PAGES;
bp->b_flags &= _XBF_KMEM;
bp->b_ops = NULL;
}
return 0;
@@ -579,18 +529,7 @@ xfs_buf_find_insert(
if (error)
goto out_drop_pag;
if (xfs_buftarg_is_mem(new_bp->b_target)) {
error = xmbuf_map_page(new_bp);
} else if (BBTOB(new_bp->b_length) >= PAGE_SIZE ||
xfs_buf_alloc_kmem(new_bp, flags) < 0) {
/*
* For buffers that fit entirely within a single page, first
* attempt to allocate the memory from the heap to minimise
* memory usage. If we can't get heap memory for these small
* buffers, we fall back to using the page allocator.
*/
error = xfs_buf_alloc_pages(new_bp, flags);
}
error = xfs_buf_alloc_backing_mem(new_bp, flags);
if (error)
goto out_free_buf;
@@ -704,18 +643,6 @@ xfs_buf_get_map(
xfs_perag_put(pag);
}
/* We do not hold a perag reference anymore. */
if (!bp->b_addr) {
error = _xfs_buf_map_pages(bp, flags);
if (unlikely(error)) {
xfs_warn_ratelimited(btp->bt_mount,
"%s: failed to map %u pages", __func__,
bp->b_page_count);
xfs_buf_relse(bp);
return error;
}
}
/*
* Clear b_error if this is a lookup from a caller that doesn't expect
* valid data to be found in the buffer.
@@ -954,20 +881,10 @@ xfs_buf_get_uncached(
if (error)
return error;
if (xfs_buftarg_is_mem(bp->b_target))
error = xmbuf_map_page(bp);
else
error = xfs_buf_alloc_pages(bp, flags);
error = xfs_buf_alloc_backing_mem(bp, flags);
if (error)
goto fail_free_buf;
error = _xfs_buf_map_pages(bp, 0);
if (unlikely(error)) {
xfs_warn(target->bt_mount,
"%s: failed to map pages", __func__);
goto fail_free_buf;
}
trace_xfs_buf_get_uncached(bp, _RET_IP_);
*bpp = bp;
return 0;
@@ -1299,9 +1216,9 @@ __xfs_buf_ioend(
trace_xfs_buf_iodone(bp, _RET_IP_);
if (bp->b_flags & XBF_READ) {
if (!bp->b_error && xfs_buf_is_vmapped(bp))
if (!bp->b_error && is_vmalloc_addr(bp->b_addr))
invalidate_kernel_vmap_range(bp->b_addr,
xfs_buf_vmap_len(bp));
roundup(BBTOB(bp->b_length), PAGE_SIZE));
if (!bp->b_error && bp->b_ops)
bp->b_ops->verify_read(bp);
if (!bp->b_error)
@@ -1462,29 +1379,48 @@ static void
xfs_buf_submit_bio(
struct xfs_buf *bp)
{
unsigned int size = BBTOB(bp->b_length);
unsigned int map = 0, p;
unsigned int map = 0;
struct blk_plug plug;
struct bio *bio;
bio = bio_alloc(bp->b_target->bt_bdev, bp->b_page_count,
xfs_buf_bio_op(bp), GFP_NOIO);
if (is_vmalloc_addr(bp->b_addr)) {
unsigned int size = BBTOB(bp->b_length);
unsigned int alloc_size = roundup(size, PAGE_SIZE);
void *data = bp->b_addr;
bio = bio_alloc(bp->b_target->bt_bdev, alloc_size >> PAGE_SHIFT,
xfs_buf_bio_op(bp), GFP_NOIO);
do {
unsigned int len = min(size, PAGE_SIZE);
ASSERT(offset_in_page(data) == 0);
__bio_add_page(bio, vmalloc_to_page(data), len, 0);
data += len;
size -= len;
} while (size);
flush_kernel_vmap_range(bp->b_addr, alloc_size);
} else {
/*
* Single folio or slab allocation. Must be contiguous and thus
* only a single bvec is needed.
*
* This uses the page based bio add helper for now as that is
* the lowest common denominator between folios and slab
* allocations. To be replaced with a better block layer
* helper soon (hopefully).
*/
bio = bio_alloc(bp->b_target->bt_bdev, 1, xfs_buf_bio_op(bp),
GFP_NOIO);
__bio_add_page(bio, virt_to_page(bp->b_addr),
BBTOB(bp->b_length),
offset_in_page(bp->b_addr));
}
bio->bi_private = bp;
bio->bi_end_io = xfs_buf_bio_end_io;
if (bp->b_flags & _XBF_KMEM) {
__bio_add_page(bio, virt_to_page(bp->b_addr), size,
bp->b_offset);
} else {
for (p = 0; p < bp->b_page_count; p++)
__bio_add_page(bio, bp->b_pages[p], PAGE_SIZE, 0);
bio->bi_iter.bi_size = size; /* limit to the actual size used */
if (xfs_buf_is_vmapped(bp))
flush_kernel_vmap_range(bp->b_addr,
xfs_buf_vmap_len(bp));
}
/*
* If there is more than one map segment, split out a new bio for each
* map except of the last one. The last map is handled by the
@@ -1611,47 +1547,6 @@ xfs_buf_submit(
xfs_buf_submit_bio(bp);
}
void *
xfs_buf_offset(
struct xfs_buf *bp,
size_t offset)
{
struct page *page;
if (bp->b_addr)
return bp->b_addr + offset;
page = bp->b_pages[offset >> PAGE_SHIFT];
return page_address(page) + (offset & (PAGE_SIZE-1));
}
void
xfs_buf_zero(
struct xfs_buf *bp,
size_t boff,
size_t bsize)
{
size_t bend;
bend = boff + bsize;
while (boff < bend) {
struct page *page;
int page_index, page_offset, csize;
page_index = (boff + bp->b_offset) >> PAGE_SHIFT;
page_offset = (boff + bp->b_offset) & ~PAGE_MASK;
page = bp->b_pages[page_index];
csize = min_t(size_t, PAGE_SIZE - page_offset,
BBTOB(bp->b_length) - boff);
ASSERT((csize + page_offset) <= PAGE_SIZE);
memset(page_address(page) + page_offset, 0, csize);
boff += csize;
}
}
/*
* Log a message about and stale a buffer that a caller has decided is corrupt.
*
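
As shown in the submission and completion hunks above, cache maintenance for
discontiguous buffers is now keyed off is_vmalloc_addr() rather than a
page-count test. A hedged restatement of that pairing, with invented helper
names, is:

/* Hedged sketch; these helpers do not exist in the patch. */
static inline void
xfs_buf_flush_vmap(struct xfs_buf *bp)
{
	/* Flush the vmap alias before submitting a write bio. */
	if (is_vmalloc_addr(bp->b_addr))
		flush_kernel_vmap_range(bp->b_addr,
				roundup(BBTOB(bp->b_length), PAGE_SIZE));
}

static inline void
xfs_buf_invalidate_vmap(struct xfs_buf *bp)
{
	/* Invalidate the vmap alias after a read bio completes. */
	if (is_vmalloc_addr(bp->b_addr))
		invalidate_kernel_vmap_range(bp->b_addr,
				roundup(BBTOB(bp->b_length), PAGE_SIZE));
}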


@@ -36,7 +36,6 @@ struct xfs_buf;
#define _XBF_LOGRECOVERY (1u << 18)/* log recovery buffer */
/* flags used only internally */
#define _XBF_PAGES (1u << 20)/* backed by refcounted pages */
#define _XBF_KMEM (1u << 21)/* backed by heap memory */
#define _XBF_DELWRI_Q (1u << 22)/* buffer on a delwri queue */
@@ -48,7 +47,6 @@ struct xfs_buf;
#define XBF_LIVESCAN (1u << 28)
#define XBF_INCORE (1u << 29)/* lookup only, return if found in cache */
#define XBF_TRYLOCK (1u << 30)/* lock requested, but do not wait */
#define XBF_UNMAPPED (1u << 31)/* do not map the buffer */
typedef unsigned int xfs_buf_flags_t;
@@ -62,14 +60,12 @@ typedef unsigned int xfs_buf_flags_t;
{ XBF_STALE, "STALE" }, \
{ XBF_WRITE_FAIL, "WRITE_FAIL" }, \
{ _XBF_LOGRECOVERY, "LOG_RECOVERY" }, \
{ _XBF_PAGES, "PAGES" }, \
{ _XBF_KMEM, "KMEM" }, \
{ _XBF_DELWRI_Q, "DELWRI_Q" }, \
/* The following interface flags should never be set */ \
{ XBF_LIVESCAN, "LIVESCAN" }, \
{ XBF_INCORE, "INCORE" }, \
{ XBF_TRYLOCK, "TRYLOCK" }, \
{ XBF_UNMAPPED, "UNMAPPED" }
{ XBF_TRYLOCK, "TRYLOCK" }
/*
* Internal state flags.
@@ -124,8 +120,6 @@ struct xfs_buftarg {
struct xfs_buf_cache bt_cache[];
};
#define XB_PAGES 2
struct xfs_buf_map {
xfs_daddr_t bm_bn; /* block number for I/O */
int bm_len; /* size of I/O */
@@ -187,15 +181,10 @@ struct xfs_buf {
struct xfs_buf_log_item *b_log_item;
struct list_head b_li_list; /* Log items list head */
struct xfs_trans *b_transp;
struct page **b_pages; /* array of page pointers */
struct page *b_page_array[XB_PAGES]; /* inline pages */
struct xfs_buf_map *b_maps; /* compound buffer map */
struct xfs_buf_map __b_map; /* inline compound buffer map */
int b_map_count;
atomic_t b_pin_count; /* pin count */
unsigned int b_page_count; /* size of page array */
unsigned int b_offset; /* page offset of b_addr,
only for _XBF_KMEM buffers */
int b_error; /* error code on I/O */
void (*b_iodone)(struct xfs_buf *bp);
@@ -315,12 +304,20 @@ extern void __xfs_buf_ioerror(struct xfs_buf *bp, int error,
#define xfs_buf_ioerror(bp, err) __xfs_buf_ioerror((bp), (err), __this_address)
extern void xfs_buf_ioerror_alert(struct xfs_buf *bp, xfs_failaddr_t fa);
void xfs_buf_ioend_fail(struct xfs_buf *);
void xfs_buf_zero(struct xfs_buf *bp, size_t boff, size_t bsize);
void __xfs_buf_mark_corrupt(struct xfs_buf *bp, xfs_failaddr_t fa);
#define xfs_buf_mark_corrupt(bp) __xfs_buf_mark_corrupt((bp), __this_address)
/* Buffer Utility Routines */
extern void *xfs_buf_offset(struct xfs_buf *, size_t);
static inline void *xfs_buf_offset(struct xfs_buf *bp, size_t offset)
{
return bp->b_addr + offset;
}
static inline void xfs_buf_zero(struct xfs_buf *bp, size_t boff, size_t bsize)
{
memset(bp->b_addr + boff, 0, bsize);
}
extern void xfs_buf_stale(struct xfs_buf *bp);
/* Delayed Write Buffer Routines */

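With XBF_UNMAPPED gone every buffer has a valid b_addr, so the new inline
helpers reduce to pointer arithmetic and memset(). A hedged usage sketch,
assuming bp and imap come from an xfs_imap_to_bp() call like the one earlier
in this commit; both functions are invented for illustration.

static struct xfs_dinode *
xfs_example_dinode_ptr(
	struct xfs_buf		*bp,
	struct xfs_imap		*imap)
{
	/* No page indexing needed: this is bp->b_addr + offset. */
	return xfs_buf_offset(bp, imap->im_boffset);
}

static void
xfs_example_wipe_dinode(
	struct xfs_buf		*bp,
	struct xfs_imap		*imap,
	unsigned int		size)
{
	/* Zeroing a sub-range is likewise a single memset through the mapping. */
	xfs_buf_zero(bp, imap->im_boffset, size);
}
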

@@ -57,24 +57,6 @@ xfs_buf_log_format_size(
(blfp->blf_map_size * sizeof(blfp->blf_data_map[0]));
}
static inline bool
xfs_buf_item_straddle(
struct xfs_buf *bp,
uint offset,
int first_bit,
int nbits)
{
void *first, *last;
first = xfs_buf_offset(bp, offset + (first_bit << XFS_BLF_SHIFT));
last = xfs_buf_offset(bp,
offset + ((first_bit + nbits) << XFS_BLF_SHIFT));
if (last - first != nbits * XFS_BLF_CHUNK)
return true;
return false;
}
/*
* Return the number of log iovecs and space needed to log the given buf log
* item segment.
@@ -91,11 +73,8 @@ xfs_buf_item_size_segment(
int *nvecs,
int *nbytes)
{
struct xfs_buf *bp = bip->bli_buf;
int first_bit;
int nbits;
int next_bit;
int last_bit;
first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0);
if (first_bit == -1)
@@ -108,15 +87,6 @@ xfs_buf_item_size_segment(
nbits = xfs_contig_bits(blfp->blf_data_map,
blfp->blf_map_size, first_bit);
ASSERT(nbits > 0);
/*
* Straddling a page is rare because we don't log contiguous
* chunks of unmapped buffers anywhere.
*/
if (nbits > 1 &&
xfs_buf_item_straddle(bp, offset, first_bit, nbits))
goto slow_scan;
(*nvecs)++;
*nbytes += nbits * XFS_BLF_CHUNK;
@@ -131,40 +101,6 @@ xfs_buf_item_size_segment(
} while (first_bit != -1);
return;
slow_scan:
/* Count the first bit we jumped out of the above loop from */
(*nvecs)++;
*nbytes += XFS_BLF_CHUNK;
last_bit = first_bit;
while (last_bit != -1) {
/*
* This takes the bit number to start looking from and
* returns the next set bit from there. It returns -1
* if there are no more bits set or the start bit is
* beyond the end of the bitmap.
*/
next_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size,
last_bit + 1);
/*
* If we run out of bits, leave the loop,
* else if we find a new set of bits bump the number of vecs,
* else keep scanning the current set of bits.
*/
if (next_bit == -1) {
break;
} else if (next_bit != last_bit + 1 ||
xfs_buf_item_straddle(bp, offset, first_bit, nbits)) {
last_bit = next_bit;
first_bit = next_bit;
(*nvecs)++;
nbits = 1;
} else {
last_bit++;
nbits++;
}
*nbytes += XFS_BLF_CHUNK;
}
}
/*
@@ -277,8 +213,6 @@ xfs_buf_item_format_segment(
struct xfs_buf *bp = bip->bli_buf;
uint base_size;
int first_bit;
int last_bit;
int next_bit;
uint nbits;
/* copy the flags across from the base format item */
@@ -323,15 +257,6 @@ xfs_buf_item_format_segment(
nbits = xfs_contig_bits(blfp->blf_data_map,
blfp->blf_map_size, first_bit);
ASSERT(nbits > 0);
/*
* Straddling a page is rare because we don't log contiguous
* chunks of unmapped buffers anywhere.
*/
if (nbits > 1 &&
xfs_buf_item_straddle(bp, offset, first_bit, nbits))
goto slow_scan;
xfs_buf_item_copy_iovec(lv, vecp, bp, offset,
first_bit, nbits);
blfp->blf_size++;
@@ -347,45 +272,6 @@ xfs_buf_item_format_segment(
} while (first_bit != -1);
return;
slow_scan:
ASSERT(bp->b_addr == NULL);
last_bit = first_bit;
nbits = 1;
for (;;) {
/*
* This takes the bit number to start looking from and
* returns the next set bit from there. It returns -1
* if there are no more bits set or the start bit is
* beyond the end of the bitmap.
*/
next_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size,
(uint)last_bit + 1);
/*
* If we run out of bits fill in the last iovec and get out of
* the loop. Else if we start a new set of bits then fill in
* the iovec for the series we were looking at and start
* counting the bits in the new one. Else we're still in the
* same set of bits so just keep counting and scanning.
*/
if (next_bit == -1) {
xfs_buf_item_copy_iovec(lv, vecp, bp, offset,
first_bit, nbits);
blfp->blf_size++;
break;
} else if (next_bit != last_bit + 1 ||
xfs_buf_item_straddle(bp, offset, first_bit, nbits)) {
xfs_buf_item_copy_iovec(lv, vecp, bp, offset,
first_bit, nbits);
blfp->blf_size++;
first_bit = next_bit;
last_bit = next_bit;
nbits = 1;
} else {
last_bit++;
nbits++;
}
}
}
/*


@@ -1006,7 +1006,6 @@ xlog_recover_buf_commit_pass2(
struct xfs_mount *mp = log->l_mp;
struct xfs_buf *bp;
int error;
uint buf_flags;
xfs_lsn_t lsn;
/*
@@ -1025,13 +1024,8 @@ xlog_recover_buf_commit_pass2(
}
trace_xfs_log_recover_buf_recover(log, buf_f);
buf_flags = 0;
if (buf_f->blf_flags & XFS_BLF_INODE_BUF)
buf_flags |= XBF_UNMAPPED;
error = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len,
buf_flags, &bp, NULL);
0, &bp, NULL);
if (error)
return error;


@@ -74,7 +74,7 @@ xmbuf_alloc(
/*
* We don't want to bother with kmapping data during repair, so don't
* allow highmem pages to back this mapping.
* allow highmem folios to back this mapping.
*/
mapping_set_gfp_mask(inode->i_mapping, GFP_KERNEL);
@@ -127,14 +127,13 @@ xmbuf_free(
kfree(btp);
}
/* Directly map a shmem page into the buffer cache. */
/* Directly map a shmem folio into the buffer cache. */
int
xmbuf_map_page(
xmbuf_map_backing_mem(
struct xfs_buf *bp)
{
struct inode *inode = file_inode(bp->b_target->bt_file);
struct folio *folio = NULL;
struct page *page;
loff_t pos = BBTOB(xfs_buf_daddr(bp));
int error;
@@ -159,39 +158,17 @@ xmbuf_map_page(
return -EIO;
}
page = folio_file_page(folio, pos >> PAGE_SHIFT);
/*
* Mark the page dirty so that it won't be reclaimed once we drop the
* (potentially last) reference in xmbuf_unmap_page.
* Mark the folio dirty so that it won't be reclaimed once we drop the
* (potentially last) reference in xfs_buf_free.
*/
set_page_dirty(page);
unlock_page(page);
folio_set_dirty(folio);
folio_unlock(folio);
bp->b_addr = page_address(page);
bp->b_pages = bp->b_page_array;
bp->b_pages[0] = page;
bp->b_page_count = 1;
bp->b_addr = folio_address(folio);
return 0;
}
/* Unmap a shmem page that was mapped into the buffer cache. */
void
xmbuf_unmap_page(
struct xfs_buf *bp)
{
struct page *page = bp->b_pages[0];
ASSERT(xfs_buftarg_is_mem(bp->b_target));
put_page(page);
bp->b_addr = NULL;
bp->b_pages[0] = NULL;
bp->b_pages = NULL;
bp->b_page_count = 0;
}
/* Is this a valid daddr within the buftarg? */
bool
xmbuf_verify_daddr(
@@ -205,7 +182,7 @@ xmbuf_verify_daddr(
return daddr < (inode->i_sb->s_maxbytes >> BBSHIFT);
}
/* Discard the page backing this buffer. */
/* Discard the folio backing this buffer. */
static void
xmbuf_stale(
struct xfs_buf *bp)
@@ -220,7 +197,7 @@ xmbuf_stale(
}
/*
* Finalize a buffer -- discard the backing page if it's stale, or run the
* Finalize a buffer -- discard the backing folio if it's stale, or run the
* write verifier to detect problems.
*/
int


@@ -19,16 +19,14 @@ int xmbuf_alloc(struct xfs_mount *mp, const char *descr,
struct xfs_buftarg **btpp);
void xmbuf_free(struct xfs_buftarg *btp);
int xmbuf_map_page(struct xfs_buf *bp);
void xmbuf_unmap_page(struct xfs_buf *bp);
bool xmbuf_verify_daddr(struct xfs_buftarg *btp, xfs_daddr_t daddr);
void xmbuf_trans_bdetach(struct xfs_trans *tp, struct xfs_buf *bp);
int xmbuf_finalize(struct xfs_buf *bp);
#else
# define xfs_buftarg_is_mem(...) (false)
# define xmbuf_map_page(...) (-ENOMEM)
# define xmbuf_unmap_page(...) ((void)0)
# define xmbuf_verify_daddr(...) (false)
#endif /* CONFIG_XFS_MEMORY_BUFS */
int xmbuf_map_backing_mem(struct xfs_buf *bp);
#endif /* __XFS_BUF_MEM_H__ */


@@ -1721,8 +1721,7 @@ xfs_ifree_cluster(
* to mark all the active inodes on the buffer stale.
*/
error = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno,
mp->m_bsize * igeo->blocks_per_cluster,
XBF_UNMAPPED, &bp);
mp->m_bsize * igeo->blocks_per_cluster, 0, &bp);
if (error)
return error;


@@ -692,6 +692,10 @@ DEFINE_BUF_EVENT(xfs_buf_iodone_async);
DEFINE_BUF_EVENT(xfs_buf_error_relse);
DEFINE_BUF_EVENT(xfs_buf_drain_buftarg);
DEFINE_BUF_EVENT(xfs_trans_read_buf_shut);
DEFINE_BUF_EVENT(xfs_buf_backing_folio);
DEFINE_BUF_EVENT(xfs_buf_backing_kmem);
DEFINE_BUF_EVENT(xfs_buf_backing_vmalloc);
DEFINE_BUF_EVENT(xfs_buf_backing_fallback);
/* not really buffer traces, but the buf provides useful information */
DEFINE_BUF_EVENT(xfs_btree_corrupt);


@@ -189,8 +189,9 @@ struct iomap_folio_ops {
#else
#define IOMAP_DAX 0
#endif /* CONFIG_FS_DAX */
#define IOMAP_ATOMIC (1 << 9)
#define IOMAP_ATOMIC_HW (1 << 9) /* HW-based torn-write protection */
#define IOMAP_DONTCACHE (1 << 10)
#define IOMAP_ATOMIC_SW (1 << 11)/* SW-based torn-write protection */
struct iomap_ops {
/*
@@ -502,6 +503,11 @@ struct iomap_dio_ops {
*/
#define IOMAP_DIO_PARTIAL (1 << 2)
/*
* Use software-based torn-write protection.
*/
#define IOMAP_DIO_ATOMIC_SW (1 << 3)
ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
unsigned int dio_flags, void *private, size_t done_before);