NFSv4/flexfiles: Read path updates for striped layouts

Update the read path to calculate and use dss_id, directing I/O to the
appropriate stripe DS.

Signed-off-by: Jonathan Curley <jcurley@purestorage.com>
Signed-off-by: Anna Schumaker <anna.schumaker@oracle.com>
This commit is contained in:
Jonathan Curley
2025-09-24 16:20:46 +00:00
committed by Anna Schumaker
parent a1491919c8
commit 4934ccbeae

View File

@@ -770,6 +770,7 @@ ff_layout_mark_ds_reachable(struct pnfs_layout_segment *lseg, u32 idx)
static struct nfs4_pnfs_ds *
ff_layout_choose_ds_for_read(struct pnfs_layout_segment *lseg,
u32 start_idx, u32 *best_idx,
u32 offset, u32 *dss_id,
bool check_device)
{
struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg);
@@ -780,12 +781,16 @@ ff_layout_choose_ds_for_read(struct pnfs_layout_segment *lseg,
/* mirrors are initially sorted by efficiency */
for (idx = start_idx; idx < fls->mirror_array_cnt; idx++) {
mirror = FF_LAYOUT_COMP(lseg, idx);
ds = nfs4_ff_layout_prepare_ds(lseg, mirror, 0, false);
*dss_id = nfs4_ff_layout_calc_dss_id(
fls->stripe_unit,
fls->mirror_array[idx]->dss_count,
offset);
ds = nfs4_ff_layout_prepare_ds(lseg, mirror, *dss_id, false);
if (IS_ERR(ds))
continue;
if (check_device &&
nfs4_test_deviceid_unavailable(&mirror->dss[0].mirror_ds->id_node)) {
nfs4_test_deviceid_unavailable(&mirror->dss[*dss_id].mirror_ds->id_node)) {
// reinitialize the error state in case if this is the last iteration
ds = ERR_PTR(-EINVAL);
continue;
@@ -800,42 +805,52 @@ ff_layout_choose_ds_for_read(struct pnfs_layout_segment *lseg,
static struct nfs4_pnfs_ds *
ff_layout_choose_any_ds_for_read(struct pnfs_layout_segment *lseg,
u32 start_idx, u32 *best_idx)
u32 start_idx, u32 *best_idx,
u32 offset, u32 *dss_id)
{
return ff_layout_choose_ds_for_read(lseg, start_idx, best_idx, false);
return ff_layout_choose_ds_for_read(lseg, start_idx, best_idx,
offset, dss_id, false);
}
static struct nfs4_pnfs_ds *
ff_layout_choose_valid_ds_for_read(struct pnfs_layout_segment *lseg,
u32 start_idx, u32 *best_idx)
u32 start_idx, u32 *best_idx,
u32 offset, u32 *dss_id)
{
return ff_layout_choose_ds_for_read(lseg, start_idx, best_idx, true);
return ff_layout_choose_ds_for_read(lseg, start_idx, best_idx,
offset, dss_id, true);
}
static struct nfs4_pnfs_ds *
ff_layout_choose_best_ds_for_read(struct pnfs_layout_segment *lseg,
u32 start_idx, u32 *best_idx)
u32 start_idx, u32 *best_idx,
u32 offset, u32 *dss_id)
{
struct nfs4_pnfs_ds *ds;
ds = ff_layout_choose_valid_ds_for_read(lseg, start_idx, best_idx);
ds = ff_layout_choose_valid_ds_for_read(lseg, start_idx, best_idx,
offset, dss_id);
if (!IS_ERR(ds))
return ds;
return ff_layout_choose_any_ds_for_read(lseg, start_idx, best_idx);
return ff_layout_choose_any_ds_for_read(lseg, start_idx, best_idx,
offset, dss_id);
}
static struct nfs4_pnfs_ds *
ff_layout_get_ds_for_read(struct nfs_pageio_descriptor *pgio,
u32 *best_idx)
u32 *best_idx,
u32 offset,
u32 *dss_id)
{
struct pnfs_layout_segment *lseg = pgio->pg_lseg;
struct nfs4_pnfs_ds *ds;
ds = ff_layout_choose_best_ds_for_read(lseg, pgio->pg_mirror_idx,
best_idx);
best_idx, offset, dss_id);
if (!IS_ERR(ds) || !pgio->pg_mirror_idx)
return ds;
return ff_layout_choose_best_ds_for_read(lseg, 0, best_idx);
return ff_layout_choose_best_ds_for_read(lseg, 0, best_idx,
offset, dss_id);
}
static void
@@ -854,6 +869,56 @@ ff_layout_pg_get_read(struct nfs_pageio_descriptor *pgio,
}
}
static bool
ff_layout_lseg_is_striped(const struct nfs4_ff_layout_segment *fls)
{
return fls->mirror_array[0]->dss_count > 1;
}
/*
* ff_layout_pg_test(). Called by nfs_can_coalesce_requests()
*
* Return 0 if @req cannot be coalesced into @pgio, otherwise return the number
* of bytes (maximum @req->wb_bytes) that can be coalesced.
*/
static size_t
ff_layout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
struct nfs_page *req)
{
unsigned int size;
u64 p_stripe, r_stripe;
u32 stripe_offset;
u64 segment_offset = pgio->pg_lseg->pls_range.offset;
u32 stripe_unit = FF_LAYOUT_LSEG(pgio->pg_lseg)->stripe_unit;
/* calls nfs_generic_pg_test */
size = pnfs_generic_pg_test(pgio, prev, req);
if (!size)
return 0;
else if (!ff_layout_lseg_is_striped(FF_LAYOUT_LSEG(pgio->pg_lseg)))
return size;
/* see if req and prev are in the same stripe */
if (prev) {
p_stripe = (u64)req_offset(prev) - segment_offset;
r_stripe = (u64)req_offset(req) - segment_offset;
do_div(p_stripe, stripe_unit);
do_div(r_stripe, stripe_unit);
if (p_stripe != r_stripe)
return 0;
}
/* calculate remaining bytes in the current stripe */
div_u64_rem((u64)req_offset(req) - segment_offset,
stripe_unit,
&stripe_offset);
WARN_ON_ONCE(stripe_offset > stripe_unit);
if (stripe_offset >= stripe_unit)
return 0;
return min(stripe_unit - (unsigned int)stripe_offset, size);
}
static void
ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio,
struct nfs_page *req)
@@ -861,7 +926,7 @@ ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio,
struct nfs_pgio_mirror *pgm;
struct nfs4_ff_layout_mirror *mirror;
struct nfs4_pnfs_ds *ds;
u32 ds_idx;
u32 ds_idx, dss_id;
if (NFS_SERVER(pgio->pg_inode)->flags &
(NFS_MOUNT_SOFT|NFS_MOUNT_SOFTERR))
@@ -882,7 +947,8 @@ ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio,
/* Reset wb_nio, since getting layout segment was successful */
req->wb_nio = 0;
ds = ff_layout_get_ds_for_read(pgio, &ds_idx);
ds = ff_layout_get_ds_for_read(pgio, &ds_idx,
req_offset(req), &dss_id);
if (IS_ERR(ds)) {
if (!ff_layout_no_fallback_to_mds(pgio->pg_lseg))
goto out_mds;
@@ -894,7 +960,7 @@ ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio,
mirror = FF_LAYOUT_COMP(pgio->pg_lseg, ds_idx);
pgm = &pgio->pg_mirrors[0];
pgm->pg_bsize = mirror->dss[0].mirror_ds->ds_versions[0].rsize;
pgm->pg_bsize = mirror->dss[dss_id].mirror_ds->ds_versions[0].rsize;
pgio->pg_mirror_idx = ds_idx;
return;
@@ -1032,7 +1098,7 @@ ff_layout_pg_get_mirror_write(struct nfs_pageio_descriptor *desc, u32 idx)
static const struct nfs_pageio_ops ff_layout_pg_read_ops = {
.pg_init = ff_layout_pg_init_read,
.pg_test = pnfs_generic_pg_test,
.pg_test = ff_layout_pg_test,
.pg_doio = pnfs_generic_pg_readpages,
.pg_cleanup = pnfs_generic_pg_cleanup,
};
@@ -1087,9 +1153,11 @@ static void ff_layout_resend_pnfs_read(struct nfs_pgio_header *hdr)
{
u32 idx = hdr->pgio_mirror_idx + 1;
u32 new_idx = 0;
u32 dss_id = 0;
struct nfs4_pnfs_ds *ds;
ds = ff_layout_choose_any_ds_for_read(hdr->lseg, idx, &new_idx);
ds = ff_layout_choose_any_ds_for_read(hdr->lseg, idx, &new_idx,
hdr->args.offset, &dss_id);
if (IS_ERR(ds))
pnfs_error_mark_layout_for_return(hdr->inode, hdr->lseg);
else
@@ -1884,6 +1952,7 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr)
u32 idx = hdr->pgio_mirror_idx;
int vers;
struct nfs_fh *fh;
u32 dss_id;
bool ds_fatal_error = false;
dprintk("--> %s ino %lu pgbase %u req %zu@%llu\n",
@@ -1891,22 +1960,26 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr)
hdr->args.pgbase, (size_t)hdr->args.count, offset);
mirror = FF_LAYOUT_COMP(lseg, idx);
ds = nfs4_ff_layout_prepare_ds(lseg, mirror, 0, false);
dss_id = nfs4_ff_layout_calc_dss_id(
FF_LAYOUT_LSEG(lseg)->stripe_unit,
mirror->dss_count,
offset);
ds = nfs4_ff_layout_prepare_ds(lseg, mirror, dss_id, false);
if (IS_ERR(ds)) {
ds_fatal_error = nfs_error_is_fatal(PTR_ERR(ds));
goto out_failed;
}
ds_clnt = nfs4_ff_find_or_create_ds_client(mirror, ds->ds_clp,
hdr->inode, 0);
hdr->inode, dss_id);
if (IS_ERR(ds_clnt))
goto out_failed;
ds_cred = ff_layout_get_ds_cred(mirror, &lseg->pls_range, hdr->cred, 0);
ds_cred = ff_layout_get_ds_cred(mirror, &lseg->pls_range, hdr->cred, dss_id);
if (!ds_cred)
goto out_failed;
vers = nfs4_ff_layout_ds_version(mirror, 0);
vers = nfs4_ff_layout_ds_version(mirror, dss_id);
dprintk("%s USE DS: %s cl_count %d vers %d\n", __func__,
ds->ds_remotestr, refcount_read(&ds->ds_clp->cl_count), vers);
@@ -1914,11 +1987,11 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr)
hdr->pgio_done_cb = ff_layout_read_done_cb;
refcount_inc(&ds->ds_clp->cl_count);
hdr->ds_clp = ds->ds_clp;
fh = nfs4_ff_layout_select_ds_fh(mirror, 0);
fh = nfs4_ff_layout_select_ds_fh(mirror, dss_id);
if (fh)
hdr->args.fh = fh;
nfs4_ff_layout_select_ds_stateid(mirror, 0, &hdr->args.stateid);
nfs4_ff_layout_select_ds_stateid(mirror, dss_id, &hdr->args.stateid);
/*
* Note that if we ever decide to split across DSes,
@@ -1928,7 +2001,8 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr)
hdr->mds_offset = offset;
/* Start IO accounting for local read */
localio = ff_local_open_fh(lseg, idx, 0, ds->ds_clp, ds_cred, fh, FMODE_READ);
localio = ff_local_open_fh(lseg, idx, dss_id, ds->ds_clp, ds_cred, fh,
FMODE_READ);
if (localio) {
hdr->task.tk_start = ktime_get();
ff_layout_read_record_layoutstats_start(&hdr->task, hdr);