Merge tag 'atomic-writes-6.16_2025-05-07' of https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux into atomic_writes

large atomic writes for xfs [v12.1]

Currently atomic write support for xfs is limited to writing a single
block as we have no way to guarantee alignment and that the write covers
a single extent.

This series introduces a software-based method for issuing atomic
writes.

The software-based method is used as a fallback for when attempting to
issue an atomic write over misaligned or multiple extents.

For xfs, this support is based on reflink CoW support.

The basic idea of this CoW method is to alloc a range in the CoW fork,
write the data, and atomically update the mapping.

Initial mysql performance testing has shown this method to perform ok.
However, there we are only using 16K atomic writes (and 4K block size),
so typically - and thankfully - this software fallback method won't be
used often.

For other FSes which want large atomic writes and don't support CoW, I
think that they can follow the example in [0].

Catherine is currently working on further xfstests for this feature,
which we hope to share soon.

About 17/17, maybe it can be omitted as there is no strong demand to have
it included.

Based on bfecc4091e (xfs/next-rc, xfs/for-next) xfs: allow ro mounts
if rtdev or logdev are read-only

[0] https://lore.kernel.org/linux-xfs/20250102140411.14617-1-john.g.garry@oracle.com/

Differences to v12:
- add more review tags

Differences to v11:
- split "xfs: ignore ..." patch
- inline sync_blockdev() in xfs_alloc_buftarg() (Christoph)
- fix xfs_calc_rtgroup_awu_max() for 0 block count (Darrick)
- Add RB tag from Christoph (thanks!)

Differences to v10:
- add "xfs: only call xfs_setsize_buftarg once ..." by Darrick
- symbol renames in "xfs: ignore HW which cannot..." by Darrick

Differences to v9:
- rework "ignore HW which cannot .." patch by Darrick
- Ensure power-of-2 max always for unit min/max when no HW support

With a bit of luck, this should all go splendidly.

Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
This commit is contained in:
Carlos Maiolino
2025-05-09 09:15:39 +02:00
38 changed files with 1351 additions and 127 deletions

View File

@@ -151,6 +151,17 @@ When mounting an XFS filesystem, the following options are accepted.
optional, and the log section can be separate from the data
section or contained within it.
max_atomic_write=value
Set the maximum size of an atomic write. The size may be
specified in bytes, in kilobytes with a "k" suffix, in megabytes
with a "m" suffix, or in gigabytes with a "g" suffix. The size
cannot be larger than the maximum write size, larger than the
size of any allocation group, or larger than the size of a
remapping operation that the log can complete atomically.
The default value is to set the maximum I/O completion size
to allow each CPU to handle one at a time.
max_open_zones=value
Specify the max number of zones to keep open for writing on a
zoned rt device. Many open zones aids file data separation

View File

@@ -1336,7 +1336,8 @@ void bdev_statx(struct path *path, struct kstat *stat,
generic_fill_statx_atomic_writes(stat,
queue_atomic_write_unit_min_bytes(bd_queue),
queue_atomic_write_unit_max_bytes(bd_queue));
queue_atomic_write_unit_max_bytes(bd_queue),
0);
}
stat->blksize = bdev_io_min(bdev);

View File

@@ -5692,7 +5692,7 @@ int ext4_getattr(struct mnt_idmap *idmap, const struct path *path,
awu_max = sbi->s_awu_max;
}
generic_fill_statx_atomic_writes(stat, awu_min, awu_max);
generic_fill_statx_atomic_writes(stat, awu_min, awu_max, 0);
}
flags = ei->i_flags & EXT4_FL_USER_VISIBLE;

View File

@@ -136,13 +136,15 @@ EXPORT_SYMBOL(generic_fill_statx_attr);
* @stat: Where to fill in the attribute flags
* @unit_min: Minimum supported atomic write length in bytes
* @unit_max: Maximum supported atomic write length in bytes
* @unit_max_opt: Optimised maximum supported atomic write length in bytes
*
* Fill in the STATX{_ATTR}_WRITE_ATOMIC flags in the kstat structure from
* atomic write unit_min and unit_max values.
*/
void generic_fill_statx_atomic_writes(struct kstat *stat,
unsigned int unit_min,
unsigned int unit_max)
unsigned int unit_max,
unsigned int unit_max_opt)
{
/* Confirm that the request type is known */
stat->result_mask |= STATX_WRITE_ATOMIC;
@@ -153,6 +155,7 @@ void generic_fill_statx_atomic_writes(struct kstat *stat,
if (unit_min) {
stat->atomic_write_unit_min = unit_min;
stat->atomic_write_unit_max = unit_max;
stat->atomic_write_unit_max_opt = unit_max_opt;
/* Initially only allow 1x segment */
stat->atomic_write_segments_max = 1;
@@ -732,6 +735,7 @@ cp_statx(const struct kstat *stat, struct statx __user *buffer)
tmp.stx_atomic_write_unit_min = stat->atomic_write_unit_min;
tmp.stx_atomic_write_unit_max = stat->atomic_write_unit_max;
tmp.stx_atomic_write_segments_max = stat->atomic_write_segments_max;
tmp.stx_atomic_write_unit_max_opt = stat->atomic_write_unit_max_opt;
return copy_to_user(buffer, &tmp, sizeof(tmp)) ? -EFAULT : 0;
}

View File

@@ -3312,6 +3312,11 @@ xfs_bmap_compute_alignments(
align = xfs_get_cowextsz_hint(ap->ip);
else if (ap->datatype & XFS_ALLOC_USERDATA)
align = xfs_get_extsz_hint(ap->ip);
/* Try to align start block to any minimum allocation alignment */
if (align > 1 && (ap->flags & XFS_BMAPI_EXTSZALIGN))
args->alignment = align;
if (align) {
if (xfs_bmap_extsize_align(mp, &ap->got, &ap->prev, align, 0,
ap->eof, 0, ap->conv, &ap->offset,

View File

@@ -87,6 +87,9 @@ struct xfs_bmalloca {
/* Do not update the rmap btree. Used for reconstructing bmbt from rmapbt. */
#define XFS_BMAPI_NORMAP (1u << 10)
/* Try to align allocations to the extent size hint */
#define XFS_BMAPI_EXTSZALIGN (1u << 11)
#define XFS_BMAPI_FLAGS \
{ XFS_BMAPI_ENTIRE, "ENTIRE" }, \
{ XFS_BMAPI_METADATA, "METADATA" }, \
@@ -98,7 +101,8 @@ struct xfs_bmalloca {
{ XFS_BMAPI_REMAP, "REMAP" }, \
{ XFS_BMAPI_COWFORK, "COWFORK" }, \
{ XFS_BMAPI_NODISCARD, "NODISCARD" }, \
{ XFS_BMAPI_NORMAP, "NORMAP" }
{ XFS_BMAPI_NORMAP, "NORMAP" },\
{ XFS_BMAPI_EXTSZALIGN, "EXTSZALIGN" }
static inline int xfs_bmapi_aflag(int w)

View File

@@ -91,6 +91,7 @@ xfs_log_calc_trans_resv_for_minlogblocks(
*/
if (xfs_want_minlogsize_fixes(&mp->m_sb)) {
xfs_trans_resv_calc(mp, resv);
resv->tr_atomic_ioend = M_RES(mp)->tr_atomic_ioend;
return;
}
@@ -107,6 +108,9 @@ xfs_log_calc_trans_resv_for_minlogblocks(
xfs_trans_resv_calc(mp, resv);
/* Copy the dynamic transaction reservation types from the running fs */
resv->tr_atomic_ioend = M_RES(mp)->tr_atomic_ioend;
if (xfs_has_reflink(mp)) {
/*
* In the early days of reflink, typical log operation counts

View File

@@ -22,6 +22,12 @@
#include "xfs_rtbitmap.h"
#include "xfs_attr_item.h"
#include "xfs_log.h"
#include "xfs_defer.h"
#include "xfs_bmap_item.h"
#include "xfs_extfree_item.h"
#include "xfs_rmap_item.h"
#include "xfs_refcount_item.h"
#include "xfs_trace.h"
#define _ALLOC true
#define _FREE false
@@ -263,6 +269,42 @@ xfs_rtalloc_block_count(
* register overflow from temporaries in the calculations.
*/
/*
 * Log reservation for finishing data device refcount updates (t1):
 *    the agfs of the ags containing the blocks: nr_ops * sector size
 *    the refcount btrees: nr_ops * 1 trees * (2 * max depth - 1) * block size
 *
 * Returns 0 when the filesystem does not have reflink enabled.
 */
inline unsigned int
xfs_calc_finish_cui_reservation(
	struct xfs_mount	*mp,
	unsigned int		nr_ops)
{
	unsigned int		agf_res;
	unsigned int		refcbt_res;

	if (!xfs_has_reflink(mp))
		return 0;

	/* One sector-sized buffer per operation. */
	agf_res = xfs_calc_buf_res(nr_ops, mp->m_sb.sb_sectsize);

	/* Block-sized buffers for the refcount btree blocks we may touch. */
	refcbt_res = xfs_calc_buf_res(xfs_refcountbt_block_count(mp, nr_ops),
				      mp->m_sb.sb_blocksize);

	return agf_res + refcbt_res;
}
/*
 * Log reservation for finishing realtime refcount updates (t2):
 *    the rt refcount inode
 *    the rtrefcount btrees: nr_ops * 1 trees * (2 * max depth - 1) * block size
 *
 * Returns 0 when the filesystem does not have realtime reflink enabled.
 */
inline unsigned int
xfs_calc_finish_rt_cui_reservation(
	struct xfs_mount	*mp,
	unsigned int		nr_ops)
{
	unsigned int		inode_res;
	unsigned int		rtrefcbt_res;

	if (!xfs_has_rtreflink(mp))
		return 0;

	/* A single inode is logged. */
	inode_res = xfs_calc_inode_res(mp, 1);

	/* Block-sized buffers for the rt refcount btree blocks. */
	rtrefcbt_res = xfs_calc_buf_res(xfs_rtrefcountbt_block_count(mp, nr_ops),
					mp->m_sb.sb_blocksize);

	return inode_res + rtrefcbt_res;
}
/*
* Compute the log reservation required to handle the refcount update
* transaction. Refcount updates are always done via deferred log items.
@@ -280,19 +322,10 @@ xfs_calc_refcountbt_reservation(
struct xfs_mount *mp,
unsigned int nr_ops)
{
unsigned int blksz = XFS_FSB_TO_B(mp, 1);
unsigned int t1, t2 = 0;
unsigned int t1, t2;
if (!xfs_has_reflink(mp))
return 0;
t1 = xfs_calc_buf_res(nr_ops, mp->m_sb.sb_sectsize) +
xfs_calc_buf_res(xfs_refcountbt_block_count(mp, nr_ops), blksz);
if (xfs_has_realtime(mp))
t2 = xfs_calc_inode_res(mp, 1) +
xfs_calc_buf_res(xfs_rtrefcountbt_block_count(mp, nr_ops),
blksz);
t1 = xfs_calc_finish_cui_reservation(mp, nr_ops);
t2 = xfs_calc_finish_rt_cui_reservation(mp, nr_ops);
return max(t1, t2);
}
@@ -379,6 +412,96 @@ xfs_calc_write_reservation_minlogsize(
return xfs_calc_write_reservation(mp, true);
}
/*
 * Log reservation for finishing an EFI, which can free the blocks and bmap
 * blocks (t2):
 *    the agf for each of the ags: nr * sector size
 *    the agfl for each of the ags: nr * sector size
 *    the super block to reflect the freed blocks: sector size
 *    worst case split in allocation btrees per extent assuming nr extents:
 *		nr exts * 2 trees * (2 * max depth - 1) * block size
 */
inline unsigned int
xfs_calc_finish_efi_reservation(
	struct xfs_mount	*mp,
	unsigned int		nr)
{
	unsigned int		sector_res;
	unsigned int		allocbt_res;

	/* Two sector-sized buffers per extent, plus one more. */
	sector_res = xfs_calc_buf_res((2 * nr) + 1, mp->m_sb.sb_sectsize);

	/* Block-sized buffers for the allocation btree blocks. */
	allocbt_res = xfs_calc_buf_res(xfs_allocfree_block_count(mp, nr),
				       mp->m_sb.sb_blocksize);

	return sector_res + allocbt_res;
}
/*
 * Log reservation for finishing an EFI on a realtime file (t3):
 *    the agf for each of the ags: nr * sector size
 *    the agfl for each of the ags: nr * sector size
 *    the super block to reflect the freed blocks: sector size
 *    the realtime bitmap and summary blocks for nr extents, as counted by
 *    xfs_rtalloc_block_count()
 *    worst case split in allocation btrees per extent assuming nr extents,
 *    as counted by xfs_allocfree_block_count()
 *
 * Returns 0 when xfs_has_realtime() is false for this mount.
 */
inline unsigned int
xfs_calc_finish_rt_efi_reservation(
	struct xfs_mount	*mp,
	unsigned int		nr)
{
	unsigned int		sector_res;
	unsigned int		rtalloc_res;
	unsigned int		allocbt_res;

	if (!xfs_has_realtime(mp))
		return 0;

	/* Two sector-sized buffers per extent, plus one more. */
	sector_res = xfs_calc_buf_res((2 * nr) + 1, mp->m_sb.sb_sectsize);

	/* Block-sized buffers for the realtime allocation metadata. */
	rtalloc_res = xfs_calc_buf_res(xfs_rtalloc_block_count(mp, nr),
				       mp->m_sb.sb_blocksize);

	/* Block-sized buffers for the allocation btree blocks. */
	allocbt_res = xfs_calc_buf_res(xfs_allocfree_block_count(mp, nr),
				       mp->m_sb.sb_blocksize);

	return sector_res + rtalloc_res + allocbt_res;
}
/*
 * Log reservation for finishing an RUI.  This is the same as finishing an
 * EFI: we can split the rmap btree twice on each end of the record, and that
 * can cause the AGFL to be refilled or emptied out.
 *
 * Returns 0 when the filesystem has no rmap btree.
 */
inline unsigned int
xfs_calc_finish_rui_reservation(
	struct xfs_mount	*mp,
	unsigned int		nr)
{
	if (xfs_has_rmapbt(mp))
		return xfs_calc_finish_efi_reservation(mp, nr);

	return 0;
}
/*
 * Log reservation for finishing a realtime RUI.  This is sized the same as
 * finishing a realtime EFI.
 *
 * Returns 0 when the filesystem has no realtime rmap btree.
 */
inline unsigned int
xfs_calc_finish_rt_rui_reservation(
	struct xfs_mount	*mp,
	unsigned int		nr)
{
	if (xfs_has_rtrmapbt(mp))
		return xfs_calc_finish_rt_efi_reservation(mp, nr);

	return 0;
}
/*
 * Log reservation for finishing a BUI.  We can modify:
 *    one inode: inode size
 *    dquots: XFS_DQUOT_LOGRES
 *    the inode's data fork bmap btree: (max levels + 1) * block size
 *
 * Note that @nr does not currently influence the result.
 */
inline unsigned int
xfs_calc_finish_bui_reservation(
	struct xfs_mount	*mp,
	unsigned int		nr)
{
	unsigned int		inode_res;
	unsigned int		bmbt_res;

	inode_res = xfs_calc_inode_res(mp, 1);
	bmbt_res = xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1,
				    mp->m_sb.sb_blocksize);

	return inode_res + XFS_DQUOT_LOGRES + bmbt_res;
}
/*
* In truncating a file we free up to two extents at once. We can modify (t1):
* the inode being truncated: inode size
@@ -411,16 +534,8 @@ xfs_calc_itruncate_reservation(
t1 = xfs_calc_inode_res(mp, 1) +
xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1, blksz);
t2 = xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) +
xfs_calc_buf_res(xfs_allocfree_block_count(mp, 4), blksz);
if (xfs_has_realtime(mp)) {
t3 = xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) +
xfs_calc_buf_res(xfs_rtalloc_block_count(mp, 2), blksz) +
xfs_calc_buf_res(xfs_allocfree_block_count(mp, 2), blksz);
} else {
t3 = 0;
}
t2 = xfs_calc_finish_efi_reservation(mp, 4);
t3 = xfs_calc_finish_rt_efi_reservation(mp, 2);
/*
* In the early days of reflink, we included enough reservation to log
@@ -501,9 +616,7 @@ xfs_calc_rename_reservation(
xfs_calc_buf_res(2 * XFS_DIROP_LOG_COUNT(mp),
XFS_FSB_TO_B(mp, 1));
t2 = xfs_calc_buf_res(7, mp->m_sb.sb_sectsize) +
xfs_calc_buf_res(xfs_allocfree_block_count(mp, 3),
XFS_FSB_TO_B(mp, 1));
t2 = xfs_calc_finish_efi_reservation(mp, 3);
if (xfs_has_parent(mp)) {
unsigned int rename_overhead, exchange_overhead;
@@ -611,9 +724,7 @@ xfs_calc_link_reservation(
overhead += xfs_calc_iunlink_remove_reservation(mp);
t1 = xfs_calc_inode_res(mp, 2) +
xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1));
t2 = xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1),
XFS_FSB_TO_B(mp, 1));
t2 = xfs_calc_finish_efi_reservation(mp, 1);
if (xfs_has_parent(mp)) {
t3 = resp->tr_attrsetm.tr_logres;
@@ -676,9 +787,7 @@ xfs_calc_remove_reservation(
t1 = xfs_calc_inode_res(mp, 2) +
xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1));
t2 = xfs_calc_buf_res(4, mp->m_sb.sb_sectsize) +
xfs_calc_buf_res(xfs_allocfree_block_count(mp, 2),
XFS_FSB_TO_B(mp, 1));
t2 = xfs_calc_finish_efi_reservation(mp, 2);
if (xfs_has_parent(mp)) {
t3 = resp->tr_attrrm.tr_logres;
@@ -1181,6 +1290,15 @@ xfs_calc_namespace_reservations(
resp->tr_mkdir.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
}
STATIC void
xfs_calc_default_atomic_ioend_reservation(
struct xfs_mount *mp,
struct xfs_trans_resv *resp)
{
/* Pick a default that will scale reasonably for the log size. */
resp->tr_atomic_ioend = resp->tr_itruncate;
}
void
xfs_trans_resv_calc(
struct xfs_mount *mp,
@@ -1275,4 +1393,167 @@ xfs_trans_resv_calc(
resp->tr_itruncate.tr_logcount += logcount_adj;
resp->tr_write.tr_logcount += logcount_adj;
resp->tr_qm_dqalloc.tr_logcount += logcount_adj;
/*
* Now that we've finished computing the static reservations, we can
* compute the dynamic reservation for atomic writes.
*/
xfs_calc_default_atomic_ioend_reservation(mp, resp);
}
/*
 * Compute the transaction reservation geometry for completing an atomic
 * write.  The return value is the per-extent (per-intent) log space, and
 * *step_size is set to the fixed overhead of finishing one intent item.
 */
STATIC unsigned int
xfs_calc_atomic_write_ioend_geometry(
	struct xfs_mount	*mp,
	unsigned int		*step_size)
{
	/* Log space for each intent and done item involved in the chain. */
	const unsigned int	efi_sz = xfs_efi_log_space(1);
	const unsigned int	efd_sz = xfs_efd_log_space(1);
	const unsigned int	rui_sz = xfs_rui_log_space(1);
	const unsigned int	rud_sz = xfs_rud_log_space();
	const unsigned int	cui_sz = xfs_cui_log_space(1);
	const unsigned int	cud_sz = xfs_cud_log_space();
	const unsigned int	bui_sz = xfs_bui_log_space(1);
	const unsigned int	bud_sz = xfs_bud_log_space();

	/*
	 * Maximum overhead to complete an atomic write ioend in software:
	 * remove data fork extent + remove cow fork extent + map extent into
	 * data fork.
	 *
	 * roll0: Creates a BUI and a CUI and that's all it needs.
	 *
	 * roll1: Roll to finish the BUI.  Need space for the BUD, an RUI, and
	 * enough space to relog the CUI (== CUI + CUD).
	 *
	 * roll2: Roll again to finish the RUI.  Need space for the RUD and
	 * space to relog the CUI.
	 *
	 * roll3: Roll again, need space for the CUD and possibly a new EFI.
	 *
	 * roll4: Roll again, need space for an EFD.
	 *
	 * If the extent referenced by the pair of BUI/CUI items is not the one
	 * being currently processed, then we need to reserve space to relog
	 * both items.
	 */
	const unsigned int	roll0 = bui_sz + cui_sz;
	const unsigned int	roll1 = bud_sz + rui_sz + cui_sz + cud_sz;
	const unsigned int	roll2 = rud_sz + cui_sz + cud_sz;
	const unsigned int	roll3 = cud_sz + efi_sz;
	const unsigned int	roll4 = efd_sz;
	const unsigned int	relog_sz = bui_sz + bud_sz + cui_sz + cud_sz;

	/* The per-intent cost is the largest of all of the above. */
	const unsigned int	early_max = max3(roll0, roll1, roll2);
	const unsigned int	late_max = max3(roll3, roll4, relog_sz);

	/* Overhead to finish one step of each intent item type */
	const unsigned int	fin_efi = xfs_calc_finish_efi_reservation(mp, 1);
	const unsigned int	fin_rui = xfs_calc_finish_rui_reservation(mp, 1);
	const unsigned int	fin_cui = xfs_calc_finish_cui_reservation(mp, 1);
	const unsigned int	fin_bui = xfs_calc_finish_bui_reservation(mp, 1);
	const unsigned int	fin_max = max3(fin_efi, fin_rui, fin_cui);

	/* We only finish one item per transaction in a chain */
	*step_size = max(fin_bui, fin_max);

	return max(early_max, late_max);
}
/*
* Compute the maximum size (in fsblocks) of atomic writes that we can complete
* given the existing log reservations.
*/
xfs_extlen_t
xfs_calc_max_atomic_write_fsblocks(
struct xfs_mount *mp)
{
const struct xfs_trans_res *resv = &M_RES(mp)->tr_atomic_ioend;
unsigned int per_intent = 0;
unsigned int step_size = 0;
unsigned int ret = 0;
if (resv->tr_logres > 0) {
per_intent = xfs_calc_atomic_write_ioend_geometry(mp,
&step_size);
if (resv->tr_logres >= step_size)
ret = (resv->tr_logres - step_size) / per_intent;
}
trace_xfs_calc_max_atomic_write_fsblocks(mp, per_intent, step_size,
resv->tr_logres, ret);
return ret;
}
/*
 * Compute the minimum log size (in blocks) and the transaction reservation
 * (returned through @new_logres) needed to complete an atomic write of
 * @blockcount blocks.  Worst case, each block requires separate handling.
 *
 * A return value of 0 means something went wrong; in particular, it is
 * returned without touching *new_logres if the reservation arithmetic
 * overflows.
 */
xfs_extlen_t
xfs_calc_atomic_write_log_geometry(
	struct xfs_mount	*mp,
	xfs_extlen_t		blockcount,
	unsigned int		*new_logres)
{
	struct xfs_trans_res	*ioend_res = &M_RES(mp)->tr_atomic_ioend;
	uint			saved_logres = ioend_res->tr_logres;
	unsigned int		per_intent, step_size;
	unsigned int		total;
	xfs_extlen_t		min_logblocks;

	ASSERT(blockcount > 0);

	/* Start from the default reservation before sizing the new one. */
	xfs_calc_default_atomic_ioend_reservation(mp, M_RES(mp));

	per_intent = xfs_calc_atomic_write_ioend_geometry(mp, &step_size);

	/* total = blockcount * per_intent + step_size, checked for overflow */
	if (check_mul_overflow(blockcount, per_intent, &total))
		return 0;
	if (check_add_overflow(total, step_size, &total))
		return 0;

	/*
	 * Temporarily install the candidate reservation so that the minimum
	 * log size calculation sees it, then put the previous value back.
	 */
	ioend_res->tr_logres = total;
	min_logblocks = xfs_log_calc_minimum_size(mp);
	ioend_res->tr_logres = saved_logres;

	trace_xfs_calc_max_atomic_write_log_geometry(mp, per_intent, step_size,
			blockcount, min_logblocks, total);

	*new_logres = total;
	return min_logblocks;
}
/*
 * Set up the transaction reservation needed to complete an out of place
 * atomic write of @blockcount blocks.
 *
 * A @blockcount of zero selects the default reservation.  Returns 0 on
 * success, or -EINVAL if the reservation could not be computed or would need
 * a log larger than the one this filesystem has.
 */
int
xfs_calc_atomic_write_reservation(
	struct xfs_mount	*mp,
	xfs_extlen_t		blockcount)
{
	unsigned int		logres;
	xfs_extlen_t		logblocks;

	if (blockcount != 0) {
		logblocks = xfs_calc_atomic_write_log_geometry(mp, blockcount,
				&logres);
		if (logblocks == 0 || logblocks > mp->m_sb.sb_logblocks)
			return -EINVAL;

		M_RES(mp)->tr_atomic_ioend.tr_logres = logres;
		return 0;
	}

	/*
	 * The caller didn't ask for a specific atomic write size, so use the
	 * defaults.
	 */
	xfs_calc_default_atomic_ioend_reservation(mp, M_RES(mp));
	return 0;
}

View File

@@ -48,6 +48,7 @@ struct xfs_trans_resv {
struct xfs_trans_res tr_qm_dqalloc; /* allocate quota on disk */
struct xfs_trans_res tr_sb; /* modify superblock */
struct xfs_trans_res tr_fsyncts; /* update timestamps on fsync */
struct xfs_trans_res tr_atomic_ioend; /* untorn write completion */
};
/* shorthand way of accessing reservation structure */
@@ -98,8 +99,32 @@ struct xfs_trans_resv {
void xfs_trans_resv_calc(struct xfs_mount *mp, struct xfs_trans_resv *resp);
uint xfs_allocfree_block_count(struct xfs_mount *mp, uint num_ops);
unsigned int xfs_calc_finish_bui_reservation(struct xfs_mount *mp,
unsigned int nr_ops);
unsigned int xfs_calc_finish_efi_reservation(struct xfs_mount *mp,
unsigned int nr_ops);
unsigned int xfs_calc_finish_rt_efi_reservation(struct xfs_mount *mp,
unsigned int nr_ops);
unsigned int xfs_calc_finish_rui_reservation(struct xfs_mount *mp,
unsigned int nr_ops);
unsigned int xfs_calc_finish_rt_rui_reservation(struct xfs_mount *mp,
unsigned int nr_ops);
unsigned int xfs_calc_finish_cui_reservation(struct xfs_mount *mp,
unsigned int nr_ops);
unsigned int xfs_calc_finish_rt_cui_reservation(struct xfs_mount *mp,
unsigned int nr_ops);
unsigned int xfs_calc_itruncate_reservation_minlogsize(struct xfs_mount *mp);
unsigned int xfs_calc_write_reservation_minlogsize(struct xfs_mount *mp);
unsigned int xfs_calc_qm_dqalloc_reservation_minlogsize(struct xfs_mount *mp);
xfs_extlen_t xfs_calc_max_atomic_write_fsblocks(struct xfs_mount *mp);
xfs_extlen_t xfs_calc_atomic_write_log_geometry(struct xfs_mount *mp,
xfs_extlen_t blockcount, unsigned int *new_logres);
int xfs_calc_atomic_write_reservation(struct xfs_mount *mp,
xfs_extlen_t blockcount);
#endif /* __XFS_TRANS_RESV_H__ */

View File

@@ -77,6 +77,11 @@ xfs_bui_item_size(
*nbytes += xfs_bui_log_format_sizeof(buip->bui_format.bui_nextents);
}
/*
* Return the log space needed for a BUI log item whose log format is sized
* for @nr extents, as computed by xlog_item_space().
*/
unsigned int xfs_bui_log_space(unsigned int nr)
{
return xlog_item_space(1, xfs_bui_log_format_sizeof(nr));
}
/*
* This is called to fill in the vector of log iovecs for the
* given bui log item. We use only 1 iovec, and we point that
@@ -168,6 +173,11 @@ xfs_bud_item_size(
*nbytes += sizeof(struct xfs_bud_log_format);
}
/*
* Return the log space needed for a BUD log item, as computed by
* xlog_item_space() for one struct xfs_bud_log_format.
*/
unsigned int xfs_bud_log_space(void)
{
return xlog_item_space(1, sizeof(struct xfs_bud_log_format));
}
/*
* This is called to fill in the vector of log iovecs for the
* given bud log item. We use only 1 iovec, and we point that

View File

@@ -72,4 +72,7 @@ struct xfs_bmap_intent;
void xfs_bmap_defer_add(struct xfs_trans *tp, struct xfs_bmap_intent *bi);
unsigned int xfs_bui_log_space(unsigned int nr);
unsigned int xfs_bud_log_space(void);
#endif /* __XFS_BMAP_ITEM_H__ */

View File

@@ -1714,13 +1714,45 @@ xfs_free_buftarg(
kfree(btp);
}
/*
* Configure this buffer target for hardware-assisted atomic writes if the
* underlying block device supports is congruent with the filesystem geometry.
*/
static inline void
xfs_configure_buftarg_atomic_writes(
struct xfs_buftarg *btp)
{
struct xfs_mount *mp = btp->bt_mount;
unsigned int min_bytes, max_bytes;
min_bytes = bdev_atomic_write_unit_min_bytes(btp->bt_bdev);
max_bytes = bdev_atomic_write_unit_max_bytes(btp->bt_bdev);
/*
* Ignore atomic write geometry that is nonsense or doesn't even cover
* a single fsblock.
*/
if (min_bytes > max_bytes ||
min_bytes > mp->m_sb.sb_blocksize ||
max_bytes < mp->m_sb.sb_blocksize) {
min_bytes = 0;
max_bytes = 0;
}
btp->bt_bdev_awu_min = min_bytes;
btp->bt_bdev_awu_max = max_bytes;
}
/* Configure a buffer target that abstracts a block device. */
int
xfs_setsize_buftarg(
xfs_configure_buftarg(
struct xfs_buftarg *btp,
unsigned int sectorsize)
{
int error;
ASSERT(btp->bt_bdev != NULL);
/* Set up metadata sector size info */
btp->bt_meta_sectorsize = sectorsize;
btp->bt_meta_sectormask = sectorsize - 1;
@@ -1733,11 +1765,10 @@ xfs_setsize_buftarg(
return -EINVAL;
}
/*
* Flush the block device pagecache so our bios see anything dirtied
* before mount.
*/
return sync_blockdev(btp->bt_bdev);
if (bdev_can_atomic_write(btp->bt_bdev))
xfs_configure_buftarg_atomic_writes(btp);
return 0;
}
int
@@ -1786,6 +1817,8 @@ xfs_alloc_buftarg(
{
struct xfs_buftarg *btp;
const struct dax_holder_operations *ops = NULL;
int error;
#if defined(CONFIG_FS_DAX) && defined(CONFIG_MEMORY_FAILURE)
ops = &xfs_dax_holder_operations;
@@ -1799,28 +1832,31 @@ xfs_alloc_buftarg(
btp->bt_daxdev = fs_dax_get_by_bdev(btp->bt_bdev, &btp->bt_dax_part_off,
mp, ops);
if (bdev_can_atomic_write(btp->bt_bdev)) {
btp->bt_bdev_awu_min = bdev_atomic_write_unit_min_bytes(
btp->bt_bdev);
btp->bt_bdev_awu_max = bdev_atomic_write_unit_max_bytes(
btp->bt_bdev);
}
/*
* Flush and invalidate all devices' pagecaches before reading any
* metadata because XFS doesn't use the bdev pagecache.
*/
error = sync_blockdev(btp->bt_bdev);
if (error)
goto error_free;
/*
* When allocating the buftargs we have not yet read the super block and
* thus don't know the file system sector size yet.
*/
if (xfs_setsize_buftarg(btp, bdev_logical_block_size(btp->bt_bdev)))
goto error_free;
if (xfs_init_buftarg(btp, bdev_logical_block_size(btp->bt_bdev),
mp->m_super->s_id))
btp->bt_meta_sectorsize = bdev_logical_block_size(btp->bt_bdev);
btp->bt_meta_sectormask = btp->bt_meta_sectorsize - 1;
error = xfs_init_buftarg(btp, btp->bt_meta_sectorsize,
mp->m_super->s_id);
if (error)
goto error_free;
return btp;
error_free:
kfree(btp);
return NULL;
return ERR_PTR(error);
}
static inline void

View File

@@ -112,7 +112,7 @@ struct xfs_buftarg {
struct percpu_counter bt_readahead_count;
struct ratelimit_state bt_ioerror_rl;
/* Atomic write unit values */
/* Atomic write unit values, bytes */
unsigned int bt_bdev_awu_min;
unsigned int bt_bdev_awu_max;
@@ -374,7 +374,7 @@ struct xfs_buftarg *xfs_alloc_buftarg(struct xfs_mount *mp,
extern void xfs_free_buftarg(struct xfs_buftarg *);
extern void xfs_buftarg_wait(struct xfs_buftarg *);
extern void xfs_buftarg_drain(struct xfs_buftarg *);
extern int xfs_setsize_buftarg(struct xfs_buftarg *, unsigned int);
int xfs_configure_buftarg(struct xfs_buftarg *btp, unsigned int sectorsize);
#define xfs_getsize_buftarg(buftarg) block_size((buftarg)->bt_bdev)
#define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev)

View File

@@ -103,6 +103,25 @@ xfs_buf_item_size_segment(
return;
}
/*
 * Compute the log item overhead for an invalidated buffer with the given map
 * count and block size.  Each map is charged the xfs_buf_log_format header up
 * to blf_data_map, plus enough bitmap words to cover @blocksize in units of
 * XFS_BLF_CHUNK.
 */
unsigned int
xfs_buf_inval_log_space(
	unsigned int		map_count,
	unsigned int		blocksize)
{
	const unsigned int	nr_chunks = DIV_ROUND_UP(blocksize, XFS_BLF_CHUNK);
	const unsigned int	nr_words = DIV_ROUND_UP(nr_chunks, NBWORD);
	unsigned int		per_map;

	/* Fixed part of the log format, then the dirty bitmap words. */
	per_map = offsetof(struct xfs_buf_log_format, blf_data_map);
	per_map += nr_words * sizeof_field(struct xfs_buf_log_format,
					   blf_data_map[0]);

	return per_map * map_count;
}
/*
* Return the number of log iovecs and space needed to log the given buf log
* item.

View File

@@ -64,6 +64,9 @@ static inline void xfs_buf_dquot_iodone(struct xfs_buf *bp)
void xfs_buf_iodone(struct xfs_buf *);
bool xfs_buf_log_check_iovec(struct xfs_log_iovec *iovec);
unsigned int xfs_buf_inval_log_space(unsigned int map_count,
unsigned int blocksize);
extern struct kmem_cache *xfs_buf_item_cache;
#endif /* __XFS_BUF_ITEM_H__ */

View File

@@ -83,6 +83,11 @@ xfs_efi_item_size(
*nbytes += xfs_efi_log_format_sizeof(efip->efi_format.efi_nextents);
}
/*
* Return the log space needed for an EFI log item whose log format is sized
* for @nr extents, as computed by xlog_item_space().
*/
unsigned int xfs_efi_log_space(unsigned int nr)
{
return xlog_item_space(1, xfs_efi_log_format_sizeof(nr));
}
/*
* This is called to fill in the vector of log iovecs for the
* given efi log item. We use only 1 iovec, and we point that
@@ -254,6 +259,11 @@ xfs_efd_item_size(
*nbytes += xfs_efd_log_format_sizeof(efdp->efd_format.efd_nextents);
}
/*
* Return the log space needed for an EFD log item whose log format is sized
* for @nr extents, as computed by xlog_item_space().
*/
unsigned int xfs_efd_log_space(unsigned int nr)
{
return xlog_item_space(1, xfs_efd_log_format_sizeof(nr));
}
/*
* This is called to fill in the vector of log iovecs for the
* given efd log item. We use only 1 iovec, and we point that

View File

@@ -94,4 +94,7 @@ void xfs_extent_free_defer_add(struct xfs_trans *tp,
struct xfs_extent_free_item *xefi,
struct xfs_defer_pending **dfpp);
unsigned int xfs_efi_log_space(unsigned int nr);
unsigned int xfs_efd_log_space(unsigned int nr);
#endif /* __XFS_EXTFREE_ITEM_H__ */

View File

@@ -576,7 +576,10 @@ xfs_dio_write_end_io(
nofs_flag = memalloc_nofs_save();
if (flags & IOMAP_DIO_COW) {
error = xfs_reflink_end_cow(ip, offset, size);
if (iocb->ki_flags & IOCB_ATOMIC)
error = xfs_reflink_end_atomic_cow(ip, offset, size);
else
error = xfs_reflink_end_cow(ip, offset, size);
if (error)
goto out;
}
@@ -725,6 +728,72 @@ xfs_file_dio_write_zoned(
return ret;
}
/*
* Handle block atomic writes
*
* Two methods of atomic writes are supported:
* - REQ_ATOMIC-based, which would typically use some form of HW offload in the
* disk
* - COW-based, which uses a COW fork as a staging extent for data updates
* before atomically updating extent mappings for the range being written
*
* The REQ_ATOMIC-based method (xfs_direct_write_iomap_ops) is tried first
* unless the write is already known to be longer than the buffer target's
* bt_bdev_awu_max. If that attempt returns -ENOPROTOOPT, the write is retried
* once with the COW-based method (xfs_atomic_write_cow_iomap_ops).
*
* Returns the result of iomap_dio_rw(), or the error from taking the iolock
* or from xfs_file_write_checks().
*/
static noinline ssize_t
xfs_file_dio_write_atomic(
struct xfs_inode *ip,
struct kiocb *iocb,
struct iov_iter *from)
{
unsigned int iolock = XFS_IOLOCK_SHARED;
ssize_t ret, ocount = iov_iter_count(from);
const struct iomap_ops *dops;
/*
* HW offload should be faster, so try that first if it is already
* known that the write length is not too large.
*/
if (ocount > xfs_inode_buftarg(ip)->bt_bdev_awu_max)
dops = &xfs_atomic_write_cow_iomap_ops;
else
dops = &xfs_direct_write_iomap_ops;
retry:
/* Nothing is held yet if taking the iolock fails, so return directly. */
ret = xfs_ilock_iocb_for_write(iocb, &iolock);
if (ret)
return ret;
/* iolock is passed by reference; it is rechecked below before use. */
ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
if (ret)
goto out_unlock;
/* Demote similar to xfs_file_dio_write_aligned() */
if (iolock == XFS_IOLOCK_EXCL) {
xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
iolock = XFS_IOLOCK_SHARED;
}
trace_xfs_file_direct_write(iocb, from);
ret = iomap_dio_rw(iocb, from, dops, &xfs_dio_write_ops,
0, NULL, 0);
/*
* The retry mechanism is based on the ->iomap_begin method returning
* -ENOPROTOOPT, which would be when the REQ_ATOMIC-based write is not
* possible. The REQ_ATOMIC-based method will typically not be possible
* if the write spans multiple extents or the disk blocks are misaligned.
*
* Only the REQ_ATOMIC-based attempt is retried, so this loops at most
* once. The iolock is dropped here and retaken at the retry label.
*/
if (ret == -ENOPROTOOPT && dops == &xfs_direct_write_iomap_ops) {
xfs_iunlock(ip, iolock);
dops = &xfs_atomic_write_cow_iomap_ops;
goto retry;
}
out_unlock:
/* Only unlock if a lock mode is still recorded in iolock. */
if (iolock)
xfs_iunlock(ip, iolock);
return ret;
}
/*
* Handle block unaligned direct I/O writes
*
@@ -840,6 +909,8 @@ xfs_file_dio_write(
return xfs_file_dio_write_unaligned(ip, iocb, from);
if (xfs_is_zoned_inode(ip))
return xfs_file_dio_write_zoned(ip, iocb, from);
if (iocb->ki_flags & IOCB_ATOMIC)
return xfs_file_dio_write_atomic(ip, iocb, from);
return xfs_file_dio_write_aligned(ip, iocb, from,
&xfs_direct_write_iomap_ops, &xfs_dio_write_ops, NULL);
}
@@ -1032,14 +1103,12 @@ xfs_file_write_iter(
return xfs_file_dax_write(iocb, from);
if (iocb->ki_flags & IOCB_ATOMIC) {
/*
* Currently only atomic writing of a single FS block is
* supported. It would be possible to atomic write smaller than
* a FS block, but there is no requirement to support this.
* Note that iomap also does not support this yet.
*/
if (ocount != ip->i_mount->m_sb.sb_blocksize)
if (ocount < xfs_get_atomic_write_min(ip))
return -EINVAL;
if (ocount > xfs_get_atomic_write_max(ip))
return -EINVAL;
ret = generic_atomic_write_valid(iocb, from);
if (ret)
return ret;
@@ -1488,7 +1557,7 @@ xfs_file_open(
if (xfs_is_shutdown(XFS_M(inode->i_sb)))
return -EIO;
file->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT;
if (xfs_inode_can_atomicwrite(XFS_I(inode)))
if (xfs_get_atomic_write_min(XFS_I(inode)) > 0)
file->f_mode |= FMODE_CAN_ATOMIC_WRITE;
return generic_file_open(inode, file);
}

View File

@@ -356,19 +356,9 @@ static inline bool xfs_inode_has_bigrtalloc(const struct xfs_inode *ip)
(XFS_IS_REALTIME_INODE(ip) ? \
(ip)->i_mount->m_rtdev_targp : (ip)->i_mount->m_ddev_targp)
static inline bool
xfs_inode_can_atomicwrite(
struct xfs_inode *ip)
static inline bool xfs_inode_can_hw_atomic_write(const struct xfs_inode *ip)
{
struct xfs_mount *mp = ip->i_mount;
struct xfs_buftarg *target = xfs_inode_buftarg(ip);
if (mp->m_sb.sb_blocksize < target->bt_bdev_awu_min)
return false;
if (mp->m_sb.sb_blocksize > target->bt_bdev_awu_max)
return false;
return true;
return xfs_inode_buftarg(ip)->bt_bdev_awu_max > 0;
}
/*

View File

@@ -798,6 +798,38 @@ imap_spans_range(
return true;
}
/*
 * Decide whether the write range [offset_fsb, end_fsb) can be issued as a
 * single REQ_ATOMIC-based write against the mapping in @imap.  Returns true
 * only if the mapping is suitably aligned, covers the whole range, and the
 * range is no longer than the buffer target's bt_bdev_awu_max.
 */
static bool
xfs_bmap_hw_atomic_write_possible(
	struct xfs_inode	*ip,
	struct xfs_bmbt_irec	*imap,
	xfs_fileoff_t		offset_fsb,
	xfs_fileoff_t		end_fsb)
{
	struct xfs_mount	*mp = ip->i_mount;
	xfs_fsize_t		write_bytes;

	/*
	 * atomic writes are required to be naturally aligned for disk blocks,
	 * which ensures that we adhere to block layer rules that we won't
	 * straddle any boundary or violate write alignment requirement.
	 *
	 * Spanning multiple extents would mean that multiple BIOs would be
	 * issued, and so would lose atomicity required for REQ_ATOMIC-based
	 * atomics.
	 */
	if (!IS_ALIGNED(imap->br_startblock, imap->br_blockcount) ||
	    !imap_spans_range(imap, offset_fsb, end_fsb))
		return false;

	/*
	 * The ->iomap_begin caller should ensure this, but check anyway.
	 */
	write_bytes = XFS_FSB_TO_B(mp, end_fsb - offset_fsb);
	return write_bytes <= xfs_inode_buftarg(ip)->bt_bdev_awu_max;
}
static int
xfs_direct_write_iomap_begin(
struct inode *inode,
@@ -812,9 +844,11 @@ xfs_direct_write_iomap_begin(
struct xfs_bmbt_irec imap, cmap;
xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
xfs_fileoff_t end_fsb = xfs_iomap_end_fsb(mp, offset, length);
xfs_fileoff_t orig_end_fsb = end_fsb;
int nimaps = 1, error = 0;
bool shared = false;
u16 iomap_flags = 0;
bool needs_alloc;
unsigned int lockmode;
u64 seq;
@@ -875,13 +909,37 @@ xfs_direct_write_iomap_begin(
(flags & IOMAP_DIRECT) || IS_DAX(inode));
if (error)
goto out_unlock;
if (shared)
if (shared) {
if ((flags & IOMAP_ATOMIC) &&
!xfs_bmap_hw_atomic_write_possible(ip, &cmap,
offset_fsb, end_fsb)) {
error = -ENOPROTOOPT;
goto out_unlock;
}
goto out_found_cow;
}
end_fsb = imap.br_startoff + imap.br_blockcount;
length = XFS_FSB_TO_B(mp, end_fsb) - offset;
}
if (imap_needs_alloc(inode, flags, &imap, nimaps))
needs_alloc = imap_needs_alloc(inode, flags, &imap, nimaps);
if (flags & IOMAP_ATOMIC) {
error = -ENOPROTOOPT;
/*
* If we allocate less than what is required for the write
* then we may end up with multiple extents, which means that
* REQ_ATOMIC-based cannot be used, so avoid this possibility.
*/
if (needs_alloc && orig_end_fsb - offset_fsb > 1)
goto out_unlock;
if (!xfs_bmap_hw_atomic_write_possible(ip, &imap, offset_fsb,
orig_end_fsb))
goto out_unlock;
}
if (needs_alloc)
goto allocate_blocks;
/*
@@ -1022,6 +1080,134 @@ const struct iomap_ops xfs_zoned_direct_write_iomap_ops = {
};
#endif /* CONFIG_XFS_RT */
/*
 * ->iomap_begin handler for the CoW-based (software) atomic write path.
 *
 * Finds, or allocates, a CoW fork extent that covers @offset, makes sure it
 * is in the written (XFS_EXT_NORM) state, and reports it through @iomap as
 * an IOMAP_F_SHARED mapping.  @srcmap is not used by this function.
 *
 * Returns 0 on success, -EIO if the filesystem is shut down, -EINVAL if the
 * mount cannot do software atomic writes, -EAGAIN for IOMAP_NOWAIT callers
 * (this path always allocates), or the error from the transaction, bmapi or
 * CoW conversion helpers.
 */
static int
xfs_atomic_write_cow_iomap_begin(
struct inode *inode,
loff_t offset,
loff_t length,
unsigned flags,
struct iomap *iomap,
struct iomap *srcmap)
{
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
const xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
xfs_fileoff_t end_fsb = xfs_iomap_end_fsb(mp, offset, length);
xfs_filblks_t count_fsb = end_fsb - offset_fsb;
int nmaps = 1;
xfs_filblks_t resaligned;
struct xfs_bmbt_irec cmap;
struct xfs_iext_cursor icur;
struct xfs_trans *tp;
unsigned int dblocks = 0, rblocks = 0;
int error;
u64 seq;
ASSERT(flags & IOMAP_WRITE);
ASSERT(flags & IOMAP_DIRECT);
if (xfs_is_shutdown(mp))
return -EIO;
/* Callers should never get here without software atomic write support. */
if (!xfs_can_sw_atomic_write(mp)) {
ASSERT(xfs_can_sw_atomic_write(mp));
return -EINVAL;
}
/* blocks are always allocated in this path */
if (flags & IOMAP_NOWAIT)
return -EAGAIN;
trace_xfs_iomap_atomic_write_cow(ip, offset, length);
xfs_ilock(ip, XFS_ILOCK_EXCL);
/* Set up an empty CoW fork if this inode does not have one yet. */
if (!ip->i_cowfp) {
ASSERT(!xfs_is_reflink_inode(ip));
xfs_ifork_init_cow(ip);
}
/*
 * If the lookup finds nothing, treat the next CoW extent as starting at
 * end_fsb so that the whole range is handled as a hole below.
 */
if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &cmap))
cmap.br_startoff = end_fsb;
/* An existing CoW extent already covers offset_fsb; reuse it. */
if (cmap.br_startoff <= offset_fsb) {
xfs_trim_extent(&cmap, offset_fsb, count_fsb);
goto found;
}
/* Only allocate up to the start of the next CoW extent. */
end_fsb = cmap.br_startoff;
count_fsb = end_fsb - offset_fsb;
resaligned = xfs_aligned_fsb_count(offset_fsb, count_fsb,
xfs_get_cowextsz_hint(ip));
/*
 * Drop the ILOCK before allocating the transaction.
 * NOTE(review): the paths below unlock ILOCK_EXCL after a successful
 * xfs_trans_alloc_inode(), so it presumably returns with the inode
 * locked and joined -- confirm against that helper.
 */
xfs_iunlock(ip, XFS_ILOCK_EXCL);
/* Reserve the aligned block count against the device backing this inode. */
if (XFS_IS_REALTIME_INODE(ip)) {
dblocks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
rblocks = resaligned;
} else {
dblocks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned);
rblocks = 0;
}
error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, dblocks,
rblocks, false, &tp);
if (error)
return error;
/* extent layout could have changed since the unlock, so check again */
if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &cmap))
cmap.br_startoff = end_fsb;
if (cmap.br_startoff <= offset_fsb) {
xfs_trim_extent(&cmap, offset_fsb, count_fsb);
/* Someone else allocated the extent; the reservation is not needed. */
xfs_trans_cancel(tp);
goto found;
}
/*
 * Allocate the entire reservation as unwritten blocks.
 *
 * Use XFS_BMAPI_EXTSZALIGN to hint at aligning new extents according to
 * extszhint, such that there will be a greater chance that future
 * atomic writes to that same range will be aligned (and don't require
 * this COW-based method).
 */
error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb,
XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC |
XFS_BMAPI_EXTSZALIGN, 0, &cmap, &nmaps);
if (error) {
xfs_trans_cancel(tp);
goto out_unlock;
}
xfs_inode_set_cowblocks_tag(ip);
error = xfs_trans_commit(tp);
if (error)
goto out_unlock;
found:
/* Convert a not-yet-written CoW extent before reporting the mapping. */
if (cmap.br_state != XFS_EXT_NORM) {
error = xfs_reflink_convert_cow_locked(ip, offset_fsb,
count_fsb);
if (error)
goto out_unlock;
cmap.br_state = XFS_EXT_NORM;
}
/* From here on, length holds the byte offset of the end of the mapping. */
length = XFS_FSB_TO_B(mp, cmap.br_startoff + cmap.br_blockcount);
trace_xfs_iomap_found(ip, offset, length - offset, XFS_COW_FORK, &cmap);
/* Sample the sequence number while the ILOCK is still held. */
seq = xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, IOMAP_F_SHARED, seq);
out_unlock:
xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
}
/*
 * iomap operations for the CoW-based atomic write path; only ->iomap_begin
 * is provided.
 */
const struct iomap_ops xfs_atomic_write_cow_iomap_ops = {
.iomap_begin = xfs_atomic_write_cow_iomap_begin,
};
static int
xfs_dax_write_iomap_end(
struct inode *inode,

View File

@@ -56,5 +56,6 @@ extern const struct iomap_ops xfs_read_iomap_ops;
extern const struct iomap_ops xfs_seek_iomap_ops;
extern const struct iomap_ops xfs_xattr_iomap_ops;
extern const struct iomap_ops xfs_dax_write_iomap_ops;
extern const struct iomap_ops xfs_atomic_write_cow_iomap_ops;
#endif /* __XFS_IOMAP_H__*/

View File

@@ -601,16 +601,82 @@ xfs_report_dioalign(
stat->dio_offset_align = stat->dio_read_offset_align;
}
/*
 * Smallest atomic write, in bytes, that can be done to @ip.
 *
 * Returns one fsblock when either the backing device can do atomic writes
 * for us or out of place (software) atomic writes are available, and zero
 * when neither mechanism exists.
 */
unsigned int
xfs_get_atomic_write_min(
	struct xfs_inode	*ip)
{
	struct xfs_mount	*mp = ip->i_mount;

	/* No hardware guarantee and no out of place writes: no atomic writes. */
	if (!xfs_inode_can_hw_atomic_write(ip) && !xfs_can_sw_atomic_write(mp))
		return 0;

	/*
	 * With out of place writes we advertise a minimum of one fsblock;
	 * without them, one fsblock is exactly what the bdev can guarantee.
	 */
	return mp->m_sb.sb_blocksize;
}
unsigned int
xfs_get_atomic_write_max(
struct xfs_inode *ip)
{
struct xfs_mount *mp = ip->i_mount;
/*
* If out of place writes are not available, we can guarantee an atomic
* write of exactly one single fsblock if the bdev will make that
* guarantee for us.
*/
if (!xfs_can_sw_atomic_write(mp)) {
if (xfs_inode_can_hw_atomic_write(ip))
return mp->m_sb.sb_blocksize;
return 0;
}
/*
* If we can complete an atomic write via atomic out of place writes,
* then advertise a maximum size of whatever we can complete through
* that means. Hardware support is reported via max_opt, not here.
*/
if (XFS_IS_REALTIME_INODE(ip))
return XFS_FSB_TO_B(mp, mp->m_groups[XG_TYPE_RTG].awu_max);
return XFS_FSB_TO_B(mp, mp->m_groups[XG_TYPE_AG].awu_max);
}
/*
 * Largest atomic write, in bytes, that the block device can perform for us
 * on @ip, capped at xfs_get_atomic_write_max().  Returns zero when the
 * maximum is no larger than one fsblock.
 */
unsigned int
xfs_get_atomic_write_max_opt(
	struct xfs_inode	*ip)
{
	unsigned int		unit_max = xfs_get_atomic_write_max(ip);

	/*
	 * In general the bdev limit is smaller than the out of place write
	 * limit, but never advertise more than the overall maximum.
	 */
	if (unit_max > ip->i_mount->m_sb.sb_blocksize)
		return min(unit_max, xfs_inode_buftarg(ip)->bt_bdev_awu_max);

	/* If the max is 1x block, keep the behaviour that opt is 0. */
	return 0;
}
static void
xfs_report_atomic_write(
struct xfs_inode *ip,
struct kstat *stat)
{
unsigned int unit_min = 0, unit_max = 0;
if (xfs_inode_can_atomicwrite(ip))
unit_min = unit_max = ip->i_mount->m_sb.sb_blocksize;
generic_fill_statx_atomic_writes(stat, unit_min, unit_max);
generic_fill_statx_atomic_writes(stat,
xfs_get_atomic_write_min(ip),
xfs_get_atomic_write_max(ip),
xfs_get_atomic_write_max_opt(ip));
}
STATIC int

View File

@@ -19,5 +19,8 @@ int xfs_inode_init_security(struct inode *inode, struct inode *dir,
extern void xfs_setup_inode(struct xfs_inode *ip);
extern void xfs_setup_iops(struct xfs_inode *ip);
extern void xfs_diflags_to_iflags(struct xfs_inode *ip, bool init);
unsigned int xfs_get_atomic_write_min(struct xfs_inode *ip);
unsigned int xfs_get_atomic_write_max(struct xfs_inode *ip);
unsigned int xfs_get_atomic_write_max_opt(struct xfs_inode *ip);
#endif /* __XFS_IOPS_H__ */

View File

@@ -309,9 +309,7 @@ xlog_cil_alloc_shadow_bufs(
* Then round nbytes up to 64-bit alignment so that the initial
* buffer alignment is easy to calculate and verify.
*/
nbytes += niovecs *
(sizeof(uint64_t) + sizeof(struct xlog_op_header));
nbytes = round_up(nbytes, sizeof(uint64_t));
nbytes = xlog_item_space(niovecs, nbytes);
/*
* The data buffer needs to start 64-bit aligned, so round up

View File

@@ -698,4 +698,17 @@ xlog_kvmalloc(
return p;
}
/*
 * Compute the log space needed for a log item made of @niovecs iovecs that
 * carry @nbytes of payload: each iovec costs an extra uint64_t plus a
 * struct xlog_op_header, and the total is rounded up to a multiple of
 * sizeof(uint64_t).
 */
static inline unsigned int
xlog_item_space(
	unsigned int	niovecs,
	unsigned int	nbytes)
{
	const size_t	per_iovec = sizeof(uint64_t) +
				    sizeof(struct xlog_op_header);
	unsigned int	total = nbytes + niovecs * per_iovec;

	return round_up(total, sizeof(uint64_t));
}
#endif /* __XFS_LOG_PRIV_H__ */

View File

@@ -666,6 +666,158 @@ xfs_agbtree_compute_maxlevels(
mp->m_agbtree_maxlevels = max(levels, mp->m_refc_maxlevels);
}
/*
 * Maximum atomic write IO size that the kernel allows, in fsblocks:
 * MAX_RW_COUNT bytes converted to fsblocks and rounded down to a power of
 * two.
 */
static inline xfs_extlen_t xfs_calc_atomic_write_max(struct xfs_mount *mp)
{
return rounddown_pow_of_two(XFS_B_TO_FSB(mp, MAX_RW_COUNT));
}
/*
 * Return the largest power of two that evenly divides @nr, i.e. the value
 * of its lowest set bit.  Returns 0 when @nr is 0.
 *
 * In unsigned arithmetic, nr & -nr isolates the lowest set bit.  This avoids
 * the undefined shifts that "1 << (ffs(nr) - 1)" performs for nr == 0
 * (shift by -1) and for nr == 1U << 31 (signed overflow of 1 << 31).
 */
static inline unsigned int max_pow_of_two_factor(const unsigned int nr)
{
	return nr & -nr;
}
/*
 * Largest atomic write unit, in fsblocks, that a single AG can support.
 *
 * When the data device advertises atomic write support, every atomic write
 * unit must line up with the start of every AG so that per-AG allocations
 * meet the storage's alignment requirements; the limit is therefore the
 * greatest power-of-two factor of the AG size.
 *
 * When the data device does not advertise atomic writes there are no
 * alignment restrictions, and the largest out-of-place write we can do
 * ourselves is bounded by the number of blocks that user files can allocate
 * from any AG.
 */
static inline xfs_extlen_t xfs_calc_perag_awu_max(struct xfs_mount *mp)
{
	const bool	hw_atomic = mp->m_ddev_targp->bt_bdev_awu_min > 0;

	if (!hw_atomic)
		return rounddown_pow_of_two(mp->m_ag_max_usable);

	return max_pow_of_two_factor(mp->m_sb.sb_agblocks);
}
/*
 * Largest atomic write unit, in fsblocks, that a single rtgroup can support.
 *
 * Reflink on the realtime device requires rtgroups, and atomic writes
 * require reflink, so a filesystem with no rtgroup blocks gets a limit of
 * zero.
 *
 * When the realtime device advertises atomic write support, every atomic
 * write unit must line up with the start of every rtgroup so that
 * per-rtgroup allocations meet the storage's alignment requirements; the
 * limit for realtime device atomic writes is therefore the greatest
 * power-of-two factor of the rtgroup size.
 *
 * When the rt device does not advertise atomic writes there are no alignment
 * restrictions, and the largest out-of-place write we can do ourselves is
 * bounded by the number of blocks in an rtgroup.
 */
static inline xfs_extlen_t xfs_calc_rtgroup_awu_max(struct xfs_mount *mp)
{
	struct xfs_groups	*rgs = &mp->m_groups[XG_TYPE_RTG];

	if (!rgs->blocks)
		return 0;

	/* No rt buftarg, or one without atomic write support. */
	if (!mp->m_rtdev_targp || !(mp->m_rtdev_targp->bt_bdev_awu_min > 0))
		return rounddown_pow_of_two(rgs->blocks);

	return max_pow_of_two_factor(rgs->blocks);
}
/*
 * Compute the maximum atomic write unit size for each section (AGs and
 * rtgroups).  Each limit is the smallest of the kernel's write size cap, the
 * largest CoW ioend we can complete atomically, and the per-group limit.
 */
static inline void
xfs_calc_atomic_write_unit_max(
	struct xfs_mount	*mp)
{
	const xfs_extlen_t	write_cap = xfs_calc_atomic_write_max(mp);
	const xfs_extlen_t	ioend_cap = xfs_reflink_max_atomic_cow(mp);
	const xfs_extlen_t	ag_cap = xfs_calc_perag_awu_max(mp);
	const xfs_extlen_t	rg_cap = xfs_calc_rtgroup_awu_max(mp);

	mp->m_groups[XG_TYPE_AG].awu_max = min3(write_cap, ioend_cap, ag_cap);
	mp->m_groups[XG_TYPE_RTG].awu_max = min3(write_cap, ioend_cap, rg_cap);

	trace_xfs_calc_atomic_write_unit_max(mp, write_cap, ioend_cap,
			ag_cap, rg_cap);
}
/*
 * Try to set the atomic write maximum to a new value that we got from
 * userspace via mount option.
 *
 * @new_max_bytes is the requested limit in bytes.  A value of zero skips
 * all validation and goes straight to recomputing the limits.  Any other
 * value must be a power of two, a multiple of the fsblock size, and no
 * larger than the kernel write cap, the largest group, and the largest
 * per-group atomic write unit.
 *
 * Returns 0 on success, -EINVAL if validation fails (a warning is logged
 * for the first failing check), or the error returned by
 * xfs_calc_atomic_write_reservation().  mp->m_awu_max_bytes is only updated
 * on success.
 */
int
xfs_set_max_atomic_write_opt(
struct xfs_mount *mp,
unsigned long long new_max_bytes)
{
const xfs_filblks_t new_max_fsbs = XFS_B_TO_FSBT(mp, new_max_bytes);
const xfs_extlen_t max_write = xfs_calc_atomic_write_max(mp);
/* Size of the larger of the AG and rtgroup, in fsblocks. */
const xfs_extlen_t max_group =
max(mp->m_groups[XG_TYPE_AG].blocks,
mp->m_groups[XG_TYPE_RTG].blocks);
/* Larger of the per-AG and per-rtgroup atomic write unit limits. */
const xfs_extlen_t max_group_write =
max(xfs_calc_perag_awu_max(mp), xfs_calc_rtgroup_awu_max(mp));
int error;
/* Zero bypasses the validation checks below. */
if (new_max_bytes == 0)
goto set_limit;
ASSERT(max_write <= U32_MAX);
/* generic_atomic_write_valid enforces power of two length */
if (!is_power_of_2(new_max_bytes)) {
xfs_warn(mp,
"max atomic write size of %llu bytes is not a power of 2",
new_max_bytes);
return -EINVAL;
}
/* Must be a whole number of fsblocks. */
if (new_max_bytes & mp->m_blockmask) {
xfs_warn(mp,
"max atomic write size of %llu bytes not aligned with fsblock",
new_max_bytes);
return -EINVAL;
}
if (new_max_fsbs > max_write) {
xfs_warn(mp,
"max atomic write size of %lluk cannot be larger than max write size %lluk",
new_max_bytes >> 10,
XFS_FSB_TO_B(mp, max_write) >> 10);
return -EINVAL;
}
if (new_max_fsbs > max_group) {
xfs_warn(mp,
"max atomic write size of %lluk cannot be larger than allocation group size %lluk",
new_max_bytes >> 10,
XFS_FSB_TO_B(mp, max_group) >> 10);
return -EINVAL;
}
if (new_max_fsbs > max_group_write) {
xfs_warn(mp,
"max atomic write size of %lluk cannot be larger than max allocation group write size %lluk",
new_max_bytes >> 10,
XFS_FSB_TO_B(mp, max_group_write) >> 10);
return -EINVAL;
}
set_limit:
/* Recompute the reservation before the per-group limits derived from it. */
error = xfs_calc_atomic_write_reservation(mp, new_max_fsbs);
if (error) {
xfs_warn(mp,
"cannot support completing atomic writes of %lluk",
new_max_bytes >> 10);
return error;
}
xfs_calc_atomic_write_unit_max(mp);
mp->m_awu_max_bytes = new_max_bytes;
return 0;
}
/* Compute maximum possible height for realtime btree types for this fs. */
static inline void
xfs_rtbtree_compute_maxlevels(
@@ -1082,6 +1234,15 @@ xfs_mountfs(
xfs_zone_gc_start(mp);
}
/*
* Pre-calculate atomic write unit max. This involves computations
* derived from transaction reservations, so we must do this after the
* log is fully initialized.
*/
error = xfs_set_max_atomic_write_opt(mp, mp->m_awu_max_bytes);
if (error)
goto out_agresv;
return 0;
out_agresv:

View File

@@ -119,6 +119,12 @@ struct xfs_groups {
* SMR hard drives.
*/
xfs_fsblock_t start_fsb;
/*
* Maximum length of an atomic write for files stored in this
* collection of allocation groups, in fsblocks.
*/
xfs_extlen_t awu_max;
};
struct xfs_freecounter {
@@ -231,6 +237,9 @@ typedef struct xfs_mount {
unsigned int m_max_open_zones;
unsigned int m_zonegc_low_space;
/* max_atomic_write mount option value */
unsigned long long m_awu_max_bytes;
/*
* Bitsets of per-fs metadata that have been checked and/or are sick.
* Callers must hold m_sb_lock to access these two fields.
@@ -464,6 +473,11 @@ static inline bool xfs_has_nonzoned(const struct xfs_mount *mp)
return !xfs_has_zoned(mp);
}
/*
 * Software (out of place) atomic writes are available exactly when the
 * filesystem has reflink enabled.
 */
static inline bool xfs_can_sw_atomic_write(struct xfs_mount *mp)
{
return xfs_has_reflink(mp);
}
/*
* Some features are always on for v5 file systems, allow the compiler to
* eliminate dead code when building without v4 support.
@@ -793,4 +807,7 @@ static inline void xfs_mod_sb_delalloc(struct xfs_mount *mp, int64_t delta)
percpu_counter_add(&mp->m_delalloc_blks, delta);
}
int xfs_set_max_atomic_write_opt(struct xfs_mount *mp,
unsigned long long new_max_bytes);
#endif /* __XFS_MOUNT_H__ */

View File

@@ -78,6 +78,11 @@ xfs_cui_item_size(
*nbytes += xfs_cui_log_format_sizeof(cuip->cui_format.cui_nextents);
}
/* Log space needed for a CUI log item that carries @nr extents. */
unsigned int xfs_cui_log_space(unsigned int nr)
{
	/* A CUI is logged as a single iovec. */
	const unsigned int	niovecs = 1;

	return xlog_item_space(niovecs, xfs_cui_log_format_sizeof(nr));
}
/*
* This is called to fill in the vector of log iovecs for the
* given cui log item. We use only 1 iovec, and we point that
@@ -179,6 +184,11 @@ xfs_cud_item_size(
*nbytes += sizeof(struct xfs_cud_log_format);
}
/* Log space needed for a CUD log item. */
unsigned int xfs_cud_log_space(void)
{
	/* A CUD is logged as a single iovec. */
	const unsigned int	niovecs = 1;

	return xlog_item_space(niovecs, sizeof(struct xfs_cud_log_format));
}
/*
* This is called to fill in the vector of log iovecs for the
* given cud log item. We use only 1 iovec, and we point that

View File

@@ -76,4 +76,7 @@ struct xfs_refcount_intent;
void xfs_refcount_defer_add(struct xfs_trans *tp,
struct xfs_refcount_intent *ri);
unsigned int xfs_cui_log_space(unsigned int nr);
unsigned int xfs_cud_log_space(void);
#endif /* __XFS_REFCOUNT_ITEM_H__ */

View File

@@ -293,7 +293,7 @@ xfs_bmap_trim_cow(
return xfs_reflink_trim_around_shared(ip, imap, shared);
}
static int
int
xfs_reflink_convert_cow_locked(
struct xfs_inode *ip,
xfs_fileoff_t offset_fsb,
@@ -786,35 +786,19 @@ xfs_reflink_update_quota(
* requirements as low as possible.
*/
STATIC int
xfs_reflink_end_cow_extent(
xfs_reflink_end_cow_extent_locked(
struct xfs_trans *tp,
struct xfs_inode *ip,
xfs_fileoff_t *offset_fsb,
xfs_fileoff_t end_fsb)
{
struct xfs_iext_cursor icur;
struct xfs_bmbt_irec got, del, data;
struct xfs_mount *mp = ip->i_mount;
struct xfs_trans *tp;
struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_COW_FORK);
unsigned int resblks;
int nmaps;
bool isrt = XFS_IS_REALTIME_INODE(ip);
int error;
resblks = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0,
XFS_TRANS_RESERVE, &tp);
if (error)
return error;
/*
* Lock the inode. We have to ijoin without automatic unlock because
* the lead transaction is the refcountbt record deletion; the data
* fork update follows as a deferred log item.
*/
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, 0);
/*
* In case of racing, overlapping AIO writes no COW extents might be
* left by the time I/O completes for the loser of the race. In that
@@ -823,7 +807,7 @@ xfs_reflink_end_cow_extent(
if (!xfs_iext_lookup_extent(ip, ifp, *offset_fsb, &icur, &got) ||
got.br_startoff >= end_fsb) {
*offset_fsb = end_fsb;
goto out_cancel;
return 0;
}
/*
@@ -837,7 +821,7 @@ xfs_reflink_end_cow_extent(
if (!xfs_iext_next_extent(ifp, &icur, &got) ||
got.br_startoff >= end_fsb) {
*offset_fsb = end_fsb;
goto out_cancel;
return 0;
}
}
del = got;
@@ -846,14 +830,14 @@ xfs_reflink_end_cow_extent(
error = xfs_iext_count_extend(tp, ip, XFS_DATA_FORK,
XFS_IEXT_REFLINK_END_COW_CNT);
if (error)
goto out_cancel;
return error;
/* Grab the corresponding mapping in the data fork. */
nmaps = 1;
error = xfs_bmapi_read(ip, del.br_startoff, del.br_blockcount, &data,
&nmaps, 0);
if (error)
goto out_cancel;
return error;
/* We can only remap the smaller of the two extent sizes. */
data.br_blockcount = min(data.br_blockcount, del.br_blockcount);
@@ -882,7 +866,7 @@ xfs_reflink_end_cow_extent(
error = xfs_bunmapi(NULL, ip, data.br_startoff,
data.br_blockcount, 0, 1, &done);
if (error)
goto out_cancel;
return error;
ASSERT(done);
}
@@ -899,17 +883,45 @@ xfs_reflink_end_cow_extent(
/* Remove the mapping from the CoW fork. */
xfs_bmap_del_extent_cow(ip, &icur, &got, &del);
error = xfs_trans_commit(tp);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
if (error)
return error;
/* Update the caller about how much progress we made. */
*offset_fsb = del.br_startoff + del.br_blockcount;
return 0;
}
out_cancel:
xfs_trans_cancel(tp);
/*
 * Remap part of the CoW fork into the data fork.
 *
 * Allocates a tr_write transaction, takes the ILOCK and joins @ip to the
 * transaction, then asks xfs_reflink_end_cow_extent_locked() to remap what
 * it can within the range that starts at *@offset_fsb and ends at @end_fsb.
 * The helper advances *@offset_fsb to tell the caller how much progress was
 * made.  Each remap gets its own transaction because we can end up merging
 * and splitting bmbt blocks for every remap operation and we'd like to keep
 * the block reservation requirements as low as possible.
 *
 * The transaction is committed on success and cancelled on failure; the
 * ILOCK is dropped either way.
 */
STATIC int
xfs_reflink_end_cow_extent(
	struct xfs_inode	*ip,
	xfs_fileoff_t		*offset_fsb,
	xfs_fileoff_t		end_fsb)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_trans	*tp;
	int			error;

	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write,
			XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK), 0,
			XFS_TRANS_RESERVE, &tp);
	if (error)
		return error;

	xfs_ilock(ip, XFS_ILOCK_EXCL);
	xfs_trans_ijoin(tp, ip, 0);

	error = xfs_reflink_end_cow_extent_locked(tp, ip, offset_fsb, end_fsb);
	if (!error)
		error = xfs_trans_commit(tp);
	else
		xfs_trans_cancel(tp);

	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	return error;
}
@@ -972,6 +984,78 @@ xfs_reflink_end_cow(
return error;
}
/*
 * Remap every CoW extent in the byte range [@offset, @offset + @count) into
 * the data fork within a single transaction, which is the critical part in
 * achieving atomic behaviour.
 *
 * The regular CoW end path does not use this function, so that it can keep
 * the block reservation per transaction as low as possible.
 *
 * Returns 0 on success or a negative errno; on failure the transaction is
 * cancelled.
 */
int
xfs_reflink_end_atomic_cow(
	struct xfs_inode	*ip,
	xfs_off_t		offset,
	xfs_off_t		count)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_trans	*tp;
	xfs_fileoff_t		cur_fsb;
	xfs_fileoff_t		last_fsb;
	unsigned int		nr_resblks;
	int			error;

	trace_xfs_reflink_end_cow(ip, offset, count);

	cur_fsb = XFS_B_TO_FSBT(mp, offset);
	last_fsb = XFS_B_TO_FSB(mp, offset + count);

	/*
	 * Each remapping operation could cause a btree split, so in the worst
	 * case that's one for each block.
	 */
	nr_resblks = (last_fsb - cur_fsb) *
			XFS_NEXTENTADD_SPACE_RES(mp, 1, XFS_DATA_FORK);

	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_atomic_ioend, nr_resblks, 0,
			XFS_TRANS_RESERVE, &tp);
	if (error)
		return error;

	xfs_ilock(ip, XFS_ILOCK_EXCL);
	xfs_trans_ijoin(tp, ip, 0);

	/* Remap extent by extent; the helper advances cur_fsb as it goes. */
	while (cur_fsb < last_fsb) {
		error = xfs_reflink_end_cow_extent_locked(tp, ip, &cur_fsb,
				last_fsb);
		if (error)
			break;
	}

	if (error) {
		trace_xfs_reflink_end_cow_error(ip, error, _RET_IP_);
		xfs_trans_cancel(tp);
	} else {
		error = xfs_trans_commit(tp);
	}

	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	return error;
}
/*
 * Compute the largest atomic write, in fsblocks, that we can complete
 * through software.  Returns zero when out of place writes are unavailable.
 */
xfs_extlen_t
xfs_reflink_max_atomic_cow(
	struct xfs_mount	*mp)
{
	/*
	 * Atomic write limits must always be a power-of-2, according to
	 * generic_atomic_write_valid.
	 */
	if (xfs_can_sw_atomic_write(mp))
		return rounddown_pow_of_two(xfs_calc_max_atomic_write_fsblocks(mp));

	/* We cannot do any atomic writes without out of place writes. */
	return 0;
}
/*
* Free all CoW staging blocks that are still referenced by the ondisk refcount
* metadata. The ondisk metadata does not track which inode created the

View File

@@ -35,6 +35,8 @@ int xfs_reflink_allocate_cow(struct xfs_inode *ip, struct xfs_bmbt_irec *imap,
bool convert_now);
extern int xfs_reflink_convert_cow(struct xfs_inode *ip, xfs_off_t offset,
xfs_off_t count);
int xfs_reflink_convert_cow_locked(struct xfs_inode *ip,
xfs_fileoff_t offset_fsb, xfs_filblks_t count_fsb);
extern int xfs_reflink_cancel_cow_blocks(struct xfs_inode *ip,
struct xfs_trans **tpp, xfs_fileoff_t offset_fsb,
@@ -43,6 +45,8 @@ extern int xfs_reflink_cancel_cow_range(struct xfs_inode *ip, xfs_off_t offset,
xfs_off_t count, bool cancel_real);
extern int xfs_reflink_end_cow(struct xfs_inode *ip, xfs_off_t offset,
xfs_off_t count);
int xfs_reflink_end_atomic_cow(struct xfs_inode *ip, xfs_off_t offset,
xfs_off_t count);
extern int xfs_reflink_recover_cow(struct xfs_mount *mp);
extern loff_t xfs_reflink_remap_range(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out, loff_t len,
@@ -64,4 +68,6 @@ extern int xfs_reflink_update_dest(struct xfs_inode *dest, xfs_off_t newlen,
bool xfs_reflink_supports_rextsize(struct xfs_mount *mp, unsigned int rextsize);
xfs_extlen_t xfs_reflink_max_atomic_cow(struct xfs_mount *mp);
#endif /* __XFS_REFLINK_H */

View File

@@ -77,6 +77,11 @@ xfs_rui_item_size(
*nbytes += xfs_rui_log_format_sizeof(ruip->rui_format.rui_nextents);
}
/* Log space needed for an RUI log item that carries @nr extents. */
unsigned int xfs_rui_log_space(unsigned int nr)
{
	/* An RUI is logged as a single iovec. */
	const unsigned int	niovecs = 1;

	return xlog_item_space(niovecs, xfs_rui_log_format_sizeof(nr));
}
/*
* This is called to fill in the vector of log iovecs for the
* given rui log item. We use only 1 iovec, and we point that
@@ -180,6 +185,11 @@ xfs_rud_item_size(
*nbytes += sizeof(struct xfs_rud_log_format);
}
/* Log space needed for an RUD log item. */
unsigned int xfs_rud_log_space(void)
{
	/* An RUD is logged as a single iovec. */
	const unsigned int	niovecs = 1;

	return xlog_item_space(niovecs, sizeof(struct xfs_rud_log_format));
}
/*
* This is called to fill in the vector of log iovecs for the
* given rud log item. We use only 1 iovec, and we point that

View File

@@ -75,4 +75,7 @@ struct xfs_rmap_intent;
void xfs_rmap_defer_add(struct xfs_trans *tp, struct xfs_rmap_intent *ri);
unsigned int xfs_rui_log_space(unsigned int nr);
unsigned int xfs_rud_log_space(void);
#endif /* __XFS_RMAP_ITEM_H__ */

View File

@@ -111,7 +111,7 @@ enum {
Opt_prjquota, Opt_uquota, Opt_gquota, Opt_pquota,
Opt_uqnoenforce, Opt_gqnoenforce, Opt_pqnoenforce, Opt_qnoenforce,
Opt_discard, Opt_nodiscard, Opt_dax, Opt_dax_enum, Opt_max_open_zones,
Opt_lifetime, Opt_nolifetime,
Opt_lifetime, Opt_nolifetime, Opt_max_atomic_write,
};
static const struct fs_parameter_spec xfs_fs_parameters[] = {
@@ -159,6 +159,7 @@ static const struct fs_parameter_spec xfs_fs_parameters[] = {
fsparam_u32("max_open_zones", Opt_max_open_zones),
fsparam_flag("lifetime", Opt_lifetime),
fsparam_flag("nolifetime", Opt_nolifetime),
fsparam_string("max_atomic_write", Opt_max_atomic_write),
{}
};
@@ -241,6 +242,9 @@ xfs_fs_show_options(
if (mp->m_max_open_zones)
seq_printf(m, ",max_open_zones=%u", mp->m_max_open_zones);
if (mp->m_awu_max_bytes)
seq_printf(m, ",max_atomic_write=%lluk",
mp->m_awu_max_bytes >> 10);
return 0;
}
@@ -482,21 +486,29 @@ xfs_open_devices(
/*
* Setup xfs_mount buffer target pointers
*/
error = -ENOMEM;
mp->m_ddev_targp = xfs_alloc_buftarg(mp, sb->s_bdev_file);
if (!mp->m_ddev_targp)
if (IS_ERR(mp->m_ddev_targp)) {
error = PTR_ERR(mp->m_ddev_targp);
mp->m_ddev_targp = NULL;
goto out_close_rtdev;
}
if (rtdev_file) {
mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev_file);
if (!mp->m_rtdev_targp)
if (IS_ERR(mp->m_rtdev_targp)) {
error = PTR_ERR(mp->m_rtdev_targp);
mp->m_rtdev_targp = NULL;
goto out_free_ddev_targ;
}
}
if (logdev_file && file_bdev(logdev_file) != ddev) {
mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev_file);
if (!mp->m_logdev_targp)
if (IS_ERR(mp->m_logdev_targp)) {
error = PTR_ERR(mp->m_logdev_targp);
mp->m_logdev_targp = NULL;
goto out_free_rtdev_targ;
}
} else {
mp->m_logdev_targp = mp->m_ddev_targp;
/* Handle won't be used, drop it */
@@ -529,7 +541,7 @@ xfs_setup_devices(
{
int error;
error = xfs_setsize_buftarg(mp->m_ddev_targp, mp->m_sb.sb_sectsize);
error = xfs_configure_buftarg(mp->m_ddev_targp, mp->m_sb.sb_sectsize);
if (error)
return error;
@@ -538,7 +550,7 @@ xfs_setup_devices(
if (xfs_has_sector(mp))
log_sector_size = mp->m_sb.sb_logsectsize;
error = xfs_setsize_buftarg(mp->m_logdev_targp,
error = xfs_configure_buftarg(mp->m_logdev_targp,
log_sector_size);
if (error)
return error;
@@ -552,7 +564,7 @@ xfs_setup_devices(
}
mp->m_rtdev_targp = mp->m_ddev_targp;
} else if (mp->m_rtname) {
error = xfs_setsize_buftarg(mp->m_rtdev_targp,
error = xfs_configure_buftarg(mp->m_rtdev_targp,
mp->m_sb.sb_sectsize);
if (error)
return error;
@@ -1335,6 +1347,42 @@ suffix_kstrtoint(
return ret;
}
/*
 * Parse an unsigned integer that may carry a single binary size suffix:
 * K/k (<< 10), M/m (<< 20) or G/g (<< 30).
 *
 * @s:    NUL-terminated string to parse; it is not modified.
 * @base: numeric base handed to kstrtoull().
 * @res:  receives the scaled value on success and is left untouched on
 *        failure.
 *
 * Returns 0 on success, -ENOMEM if the working copy cannot be allocated, or
 * -EINVAL if the digits do not parse or the scaled value does not fit in an
 * unsigned long long.
 */
static int
suffix_kstrtoull(
	const char		*s,
	unsigned int		base,
	unsigned long long	*res)
{
	unsigned long long	val;
	unsigned int		shift = 0;
	size_t			len;
	char			*value;
	int			ret = 0;

	value = kstrdup(s, GFP_KERNEL);
	if (!value)
		return -ENOMEM;

	/* Only look at the last character if there is one. */
	len = strlen(value);
	if (len > 0) {
		switch (value[len - 1]) {
		case 'K':
		case 'k':
			shift = 10;
			break;
		case 'M':
		case 'm':
			shift = 20;
			break;
		case 'G':
		case 'g':
			shift = 30;
			break;
		default:
			break;
		}
		if (shift)
			value[len - 1] = '\0';
	}

	if (kstrtoull(value, base, &val))
		ret = -EINVAL;
	kfree(value);

	/* val holds nothing meaningful on a parse failure; don't touch it. */
	if (ret)
		return ret;

	/* Reject values whose high bits the suffix would shift out. */
	if (val > (ULLONG_MAX >> shift))
		return -EINVAL;

	*res = val << shift;
	return 0;
}
static inline void
xfs_fs_warn_deprecated(
struct fs_context *fc,
@@ -1519,6 +1567,14 @@ xfs_fs_parse_param(
case Opt_nolifetime:
parsing_mp->m_features |= XFS_FEAT_NOLIFETIME;
return 0;
case Opt_max_atomic_write:
if (suffix_kstrtoull(param->string, 10,
&parsing_mp->m_awu_max_bytes)) {
xfs_warn(parsing_mp,
"max atomic write size must be positive integer");
return -EINVAL;
}
return 0;
default:
xfs_warn(parsing_mp, "unknown mount option [%s].", param->key);
return -EINVAL;
@@ -2129,6 +2185,14 @@ xfs_fs_reconfigure(
if (error)
return error;
/* Validate new max_atomic_write option before making other changes */
if (mp->m_awu_max_bytes != new_mp->m_awu_max_bytes) {
error = xfs_set_max_atomic_write_opt(mp,
new_mp->m_awu_max_bytes);
if (error)
return error;
}
/* inode32 -> inode64 */
if (xfs_has_small_inums(mp) && !xfs_has_small_inums(new_mp)) {
mp->m_features &= ~XFS_FEAT_SMALL_INUMS;

View File

@@ -170,6 +170,99 @@ DEFINE_ATTR_LIST_EVENT(xfs_attr_list_notfound);
DEFINE_ATTR_LIST_EVENT(xfs_attr_leaf_list);
DEFINE_ATTR_LIST_EVENT(xfs_attr_node_list);
/*
 * Records the four limits passed in by the caller (max_write, max_ioend,
 * max_agsize, max_rgsize) together with the AG and rtgroup awu_max values
 * read from @mp when the event fires.
 */
TRACE_EVENT(xfs_calc_atomic_write_unit_max,
TP_PROTO(struct xfs_mount *mp, unsigned int max_write,
unsigned int max_ioend, unsigned int max_agsize,
unsigned int max_rgsize),
TP_ARGS(mp, max_write, max_ioend, max_agsize, max_rgsize),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(unsigned int, max_write)
__field(unsigned int, max_ioend)
__field(unsigned int, max_agsize)
__field(unsigned int, max_rgsize)
__field(unsigned int, data_awu_max)
__field(unsigned int, rt_awu_max)
),
TP_fast_assign(
__entry->dev = mp->m_super->s_dev;
__entry->max_write = max_write;
__entry->max_ioend = max_ioend;
__entry->max_agsize = max_agsize;
__entry->max_rgsize = max_rgsize;
__entry->data_awu_max = mp->m_groups[XG_TYPE_AG].awu_max;
__entry->rt_awu_max = mp->m_groups[XG_TYPE_RTG].awu_max;
),
TP_printk("dev %d:%d max_write %u max_ioend %u max_agsize %u max_rgsize %u data_awu_max %u rt_awu_max %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->max_write,
__entry->max_ioend,
__entry->max_agsize,
__entry->max_rgsize,
__entry->data_awu_max,
__entry->rt_awu_max)
);
/*
 * Records the per_intent, step_size, logres and blockcount values supplied
 * by the caller, tagged with the device of @mp.
 */
TRACE_EVENT(xfs_calc_max_atomic_write_fsblocks,
TP_PROTO(struct xfs_mount *mp, unsigned int per_intent,
unsigned int step_size, unsigned int logres,
unsigned int blockcount),
TP_ARGS(mp, per_intent, step_size, logres, blockcount),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(unsigned int, per_intent)
__field(unsigned int, step_size)
__field(unsigned int, logres)
__field(unsigned int, blockcount)
),
TP_fast_assign(
__entry->dev = mp->m_super->s_dev;
__entry->per_intent = per_intent;
__entry->step_size = step_size;
__entry->logres = logres;
__entry->blockcount = blockcount;
),
TP_printk("dev %d:%d per_intent %u step_size %u logres %u blockcount %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->per_intent,
__entry->step_size,
__entry->logres,
__entry->blockcount)
);
/*
 * Records the per_intent, step_size, blockcount, min_logblocks and logres
 * values supplied by the caller, plus the current log size taken from
 * mp->m_sb.sb_logblocks (printed as "logblocks").
 */
TRACE_EVENT(xfs_calc_max_atomic_write_log_geometry,
TP_PROTO(struct xfs_mount *mp, unsigned int per_intent,
unsigned int step_size, unsigned int blockcount,
unsigned int min_logblocks, unsigned int logres),
TP_ARGS(mp, per_intent, step_size, blockcount, min_logblocks, logres),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(unsigned int, per_intent)
__field(unsigned int, step_size)
__field(unsigned int, blockcount)
__field(unsigned int, min_logblocks)
__field(unsigned int, cur_logblocks)
__field(unsigned int, logres)
),
TP_fast_assign(
__entry->dev = mp->m_super->s_dev;
__entry->per_intent = per_intent;
__entry->step_size = step_size;
__entry->blockcount = blockcount;
__entry->min_logblocks = min_logblocks;
__entry->cur_logblocks = mp->m_sb.sb_logblocks;
__entry->logres = logres;
),
TP_printk("dev %d:%d per_intent %u step_size %u blockcount %u min_logblocks %u logblocks %u logres %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->per_intent,
__entry->step_size,
__entry->blockcount,
__entry->min_logblocks,
__entry->cur_logblocks,
__entry->logres)
);
TRACE_EVENT(xlog_intent_recovery_failed,
TP_PROTO(struct xfs_mount *mp, const struct xfs_defer_op_type *ops,
int error),
@@ -1657,6 +1750,28 @@ DEFINE_RW_EVENT(xfs_file_direct_write);
DEFINE_RW_EVENT(xfs_file_dax_write);
DEFINE_RW_EVENT(xfs_reflink_bounce_dio_write);
/*
 * Records the device, inode number, byte offset and byte count of a request
 * entering the CoW-based atomic write iomap path.
 */
TRACE_EVENT(xfs_iomap_atomic_write_cow,
TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count),
TP_ARGS(ip, offset, count),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_ino_t, ino)
__field(xfs_off_t, offset)
__field(ssize_t, count)
),
TP_fast_assign(
__entry->dev = VFS_I(ip)->i_sb->s_dev;
__entry->ino = ip->i_ino;
__entry->offset = offset;
__entry->count = count;
),
TP_printk("dev %d:%d ino 0x%llx pos 0x%llx bytecount 0x%zx",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__entry->offset,
__entry->count)
)
DECLARE_EVENT_CLASS(xfs_imap_class,
TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count,
int whichfork, struct xfs_bmbt_irec *irec),

View File

@@ -3475,7 +3475,8 @@ void generic_fillattr(struct mnt_idmap *, u32, struct inode *, struct kstat *);
void generic_fill_statx_attr(struct inode *inode, struct kstat *stat);
void generic_fill_statx_atomic_writes(struct kstat *stat,
unsigned int unit_min,
unsigned int unit_max);
unsigned int unit_max,
unsigned int unit_max_opt);
extern int vfs_getattr_nosec(const struct path *, struct kstat *, u32, unsigned int);
extern int vfs_getattr(const struct path *, struct kstat *, u32, unsigned int);
void __inode_add_bytes(struct inode *inode, loff_t bytes);

View File

@@ -57,6 +57,7 @@ struct kstat {
u32 dio_read_offset_align;
u32 atomic_write_unit_min;
u32 atomic_write_unit_max;
u32 atomic_write_unit_max_opt;
u32 atomic_write_segments_max;
};

View File

@@ -182,8 +182,12 @@ struct statx {
/* File offset alignment for direct I/O reads */
__u32 stx_dio_read_offset_align;
/* 0xb8 */
__u64 __spare3[9]; /* Spare space for future expansion */
/* Optimised max atomic write unit in bytes */
__u32 stx_atomic_write_unit_max_opt;
__u32 __spare2[1];
/* 0xc0 */
__u64 __spare3[8]; /* Spare space for future expansion */
/* 0x100 */
};