mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2026-05-16 06:41:39 -04:00
drm/xe/vf: Introduce RESFIX start marker support
In scenarios involving double migration, the VF KMD may encounter situations where it is instructed to re-migrate before having the opportunity to send RESFIX_DONE for the initial migration. This can occur when the fix-up for the prior migration is still underway, but the VF KMD is migrated again. Consequently, this may lead to the possibility of sending two migration notifications (i.e., pending fix-up for the first migration and a second notification for the new migration). Upon receiving the first RES_FIX notification, the GuC will resume VF submission on the GPU, potentially resulting in undefined behavior, such as system hangs or crashes. To avoid this, post migration, a marker is sent to the GUC prior to the start of resource fixups to indicate start of resource fixups. The same marker is sent along with RESFIX_DONE notification so that GUC can avoid submitting jobs to HW in case of double migration. Signed-off-by: Satyanarayana K V P <satyanarayana.k.v.p@intel.com> Cc: Michal Wajdeczko <michal.wajdeczko@intel.com> Cc: Matthew Brost <matthew.brost@intel.com> Cc: Tomasz Lis <tomasz.lis@intel.com> Reviewed-by: Michal Wajdeczko <michal.wajdeczko@intel.com> Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com> Link: https://patch.msgid.link/20251201095011.21453-8-satyanarayana.k.v.p@intel.com
This commit is contained in:
committed by
Michal Wajdeczko
parent
2e2dab20dd
commit
b5fbb94341
@@ -502,13 +502,17 @@
|
||||
#define VF2GUC_VF_RESET_RESPONSE_MSG_0_MBZ GUC_HXG_RESPONSE_MSG_0_DATA0
|
||||
|
||||
/**
|
||||
* DOC: VF2GUC_NOTIFY_RESFIX_DONE
|
||||
* DOC: VF2GUC_RESFIX_DONE
|
||||
*
|
||||
* This action is used by VF to notify the GuC that the VF KMD has completed
|
||||
* post-migration recovery steps.
|
||||
* This action is used by VF to inform the GuC that the VF KMD has completed
|
||||
* post-migration recovery steps. From GuC VF compatibility 1.27.0 onwards, it
|
||||
* shall only be sent after posting RESFIX_START and that both @MARKER fields
|
||||
* must match.
|
||||
*
|
||||
* This message must be sent as `MMIO HXG Message`_.
|
||||
*
|
||||
* Updated since GuC VF compatibility 1.27.0.
|
||||
*
|
||||
* +---+-------+--------------------------------------------------------------+
|
||||
* | | Bits | Description |
|
||||
* +===+=======+==============================================================+
|
||||
@@ -516,9 +520,11 @@
|
||||
* | +-------+--------------------------------------------------------------+
|
||||
* | | 30:28 | TYPE = GUC_HXG_TYPE_REQUEST_ |
|
||||
* | +-------+--------------------------------------------------------------+
|
||||
* | | 27:16 | DATA0 = MBZ |
|
||||
* | | 27:16 | DATA0 = MARKER = MBZ (only prior 1.27.0) |
|
||||
* | +-------+--------------------------------------------------------------+
|
||||
* | | 15:0 | ACTION = _`GUC_ACTION_VF2GUC_NOTIFY_RESFIX_DONE` = 0x5508 |
|
||||
* | | 27:16 | DATA0 = MARKER - can't be zero (1.27.0+) |
|
||||
* | +-------+--------------------------------------------------------------+
|
||||
* | | 15:0 | ACTION = _`GUC_ACTION_VF2GUC_RESFIX_DONE` = 0x5508 |
|
||||
* +---+-------+--------------------------------------------------------------+
|
||||
*
|
||||
* +---+-------+--------------------------------------------------------------+
|
||||
@@ -531,13 +537,13 @@
|
||||
* | | 27:0 | DATA0 = MBZ |
|
||||
* +---+-------+--------------------------------------------------------------+
|
||||
*/
|
||||
#define GUC_ACTION_VF2GUC_NOTIFY_RESFIX_DONE 0x5508u
|
||||
#define GUC_ACTION_VF2GUC_RESFIX_DONE 0x5508u
|
||||
|
||||
#define VF2GUC_NOTIFY_RESFIX_DONE_REQUEST_MSG_LEN GUC_HXG_REQUEST_MSG_MIN_LEN
|
||||
#define VF2GUC_NOTIFY_RESFIX_DONE_REQUEST_MSG_0_MBZ GUC_HXG_REQUEST_MSG_0_DATA0
|
||||
#define VF2GUC_RESFIX_DONE_REQUEST_MSG_LEN GUC_HXG_REQUEST_MSG_MIN_LEN
|
||||
#define VF2GUC_RESFIX_DONE_REQUEST_MSG_0_MARKER GUC_HXG_REQUEST_MSG_0_DATA0
|
||||
|
||||
#define VF2GUC_NOTIFY_RESFIX_DONE_RESPONSE_MSG_LEN GUC_HXG_RESPONSE_MSG_MIN_LEN
|
||||
#define VF2GUC_NOTIFY_RESFIX_DONE_RESPONSE_MSG_0_MBZ GUC_HXG_RESPONSE_MSG_0_DATA0
|
||||
#define VF2GUC_RESFIX_DONE_RESPONSE_MSG_LEN GUC_HXG_RESPONSE_MSG_MIN_LEN
|
||||
#define VF2GUC_RESFIX_DONE_RESPONSE_MSG_0_MBZ GUC_HXG_RESPONSE_MSG_0_DATA0
|
||||
|
||||
/**
|
||||
* DOC: VF2GUC_QUERY_SINGLE_KLV
|
||||
@@ -656,4 +662,45 @@
|
||||
#define PF2GUC_SAVE_RESTORE_VF_RESPONSE_MSG_LEN GUC_HXG_RESPONSE_MSG_MIN_LEN
|
||||
#define PF2GUC_SAVE_RESTORE_VF_RESPONSE_MSG_0_USED GUC_HXG_RESPONSE_MSG_0_DATA0
|
||||
|
||||
/**
|
||||
* DOC: VF2GUC_RESFIX_START
|
||||
*
|
||||
* This action is used by VF to inform the GuC that the VF KMD will be starting
|
||||
* post-migration recovery fixups. The @MARKER sent with this action must match
|
||||
* with the MARKER posted in the VF2GUC_RESFIX_DONE message.
|
||||
*
|
||||
* This message must be sent as `MMIO HXG Message`_.
|
||||
*
|
||||
* Available since GuC VF compatibility 1.27.0.
|
||||
*
|
||||
* +---+-------+--------------------------------------------------------------+
|
||||
* | | Bits | Description |
|
||||
* +===+=======+==============================================================+
|
||||
* | 0 | 31 | ORIGIN = GUC_HXG_ORIGIN_HOST_ |
|
||||
* | +-------+--------------------------------------------------------------+
|
||||
* | | 30:28 | TYPE = GUC_HXG_TYPE_REQUEST_ |
|
||||
* | +-------+--------------------------------------------------------------+
|
||||
* | | 27:16 | DATA0 = MARKER - can't be zero |
|
||||
* | +-------+--------------------------------------------------------------+
|
||||
* | | 15:0 | ACTION = _`GUC_ACTION_VF2GUC_RESFIX_START` = 0x550F |
|
||||
* +---+-------+--------------------------------------------------------------+
|
||||
*
|
||||
* +---+-------+--------------------------------------------------------------+
|
||||
* | | Bits | Description |
|
||||
* +===+=======+==============================================================+
|
||||
* | 0 | 31 | ORIGIN = GUC_HXG_ORIGIN_GUC_ |
|
||||
* | +-------+--------------------------------------------------------------+
|
||||
* | | 30:28 | TYPE = GUC_HXG_TYPE_RESPONSE_SUCCESS_ |
|
||||
* | +-------+--------------------------------------------------------------+
|
||||
* | | 27:0 | DATA0 = MBZ |
|
||||
* +---+-------+--------------------------------------------------------------+
|
||||
*/
|
||||
#define GUC_ACTION_VF2GUC_RESFIX_START 0x550Fu
|
||||
|
||||
#define VF2GUC_RESFIX_START_REQUEST_MSG_LEN GUC_HXG_REQUEST_MSG_MIN_LEN
|
||||
#define VF2GUC_RESFIX_START_REQUEST_MSG_0_MARKER GUC_HXG_REQUEST_MSG_0_DATA0
|
||||
|
||||
#define VF2GUC_RESFIX_START_RESPONSE_MSG_LEN GUC_HXG_RESPONSE_MSG_MIN_LEN
|
||||
#define VF2GUC_RESFIX_START_RESPONSE_MSG_0_MBZ GUC_HXG_RESPONSE_MSG_0_DATA0
|
||||
|
||||
#endif
|
||||
|
||||
@@ -299,12 +299,13 @@ void xe_gt_sriov_vf_guc_versions(struct xe_gt *gt,
|
||||
*found = gt->sriov.vf.guc_version;
|
||||
}
|
||||
|
||||
static int guc_action_vf_notify_resfix_done(struct xe_guc *guc)
|
||||
static int guc_action_vf_resfix_start(struct xe_guc *guc, u16 marker)
|
||||
{
|
||||
u32 request[GUC_HXG_REQUEST_MSG_MIN_LEN] = {
|
||||
FIELD_PREP(GUC_HXG_MSG_0_ORIGIN, GUC_HXG_ORIGIN_HOST) |
|
||||
FIELD_PREP(GUC_HXG_MSG_0_TYPE, GUC_HXG_TYPE_REQUEST) |
|
||||
FIELD_PREP(GUC_HXG_REQUEST_MSG_0_ACTION, GUC_ACTION_VF2GUC_NOTIFY_RESFIX_DONE),
|
||||
FIELD_PREP(GUC_HXG_REQUEST_MSG_0_ACTION, GUC_ACTION_VF2GUC_RESFIX_START) |
|
||||
FIELD_PREP(VF2GUC_RESFIX_START_REQUEST_MSG_0_MARKER, marker),
|
||||
};
|
||||
int ret;
|
||||
|
||||
@@ -313,28 +314,41 @@ static int guc_action_vf_notify_resfix_done(struct xe_guc *guc)
|
||||
return ret > 0 ? -EPROTO : ret;
|
||||
}
|
||||
|
||||
/**
|
||||
* vf_notify_resfix_done - Notify GuC about resource fixups apply completed.
|
||||
* @gt: the &xe_gt struct instance linked to target GuC
|
||||
*
|
||||
* Returns: 0 if the operation completed successfully, or a negative error
|
||||
* code otherwise.
|
||||
*/
|
||||
static int vf_notify_resfix_done(struct xe_gt *gt)
|
||||
static int vf_resfix_start(struct xe_gt *gt, u16 marker)
|
||||
{
|
||||
struct xe_guc *guc = >->uc.guc;
|
||||
int err;
|
||||
|
||||
xe_gt_assert(gt, IS_SRIOV_VF(gt_to_xe(gt)));
|
||||
|
||||
err = guc_action_vf_notify_resfix_done(guc);
|
||||
if (unlikely(err))
|
||||
xe_gt_sriov_err(gt, "Failed to notify GuC about resource fixup done (%pe)\n",
|
||||
ERR_PTR(err));
|
||||
else
|
||||
xe_gt_sriov_dbg_verbose(gt, "sent GuC resource fixup done\n");
|
||||
xe_gt_sriov_dbg_verbose(gt, "Sending resfix start marker %u\n", marker);
|
||||
|
||||
return err;
|
||||
return guc_action_vf_resfix_start(guc, marker);
|
||||
}
|
||||
|
||||
static int guc_action_vf_resfix_done(struct xe_guc *guc, u16 marker)
|
||||
{
|
||||
u32 request[GUC_HXG_REQUEST_MSG_MIN_LEN] = {
|
||||
FIELD_PREP(GUC_HXG_MSG_0_ORIGIN, GUC_HXG_ORIGIN_HOST) |
|
||||
FIELD_PREP(GUC_HXG_MSG_0_TYPE, GUC_HXG_TYPE_REQUEST) |
|
||||
FIELD_PREP(GUC_HXG_REQUEST_MSG_0_ACTION, GUC_ACTION_VF2GUC_RESFIX_DONE) |
|
||||
FIELD_PREP(VF2GUC_RESFIX_DONE_REQUEST_MSG_0_MARKER, marker),
|
||||
};
|
||||
int ret;
|
||||
|
||||
ret = xe_guc_mmio_send(guc, request, ARRAY_SIZE(request));
|
||||
|
||||
return ret > 0 ? -EPROTO : ret;
|
||||
}
|
||||
|
||||
static int vf_resfix_done(struct xe_gt *gt, u16 marker)
|
||||
{
|
||||
struct xe_guc *guc = >->uc.guc;
|
||||
|
||||
xe_gt_assert(gt, IS_SRIOV_VF(gt_to_xe(gt)));
|
||||
|
||||
xe_gt_sriov_dbg_verbose(gt, "Sending resfix done marker %u\n", marker);
|
||||
|
||||
return guc_action_vf_resfix_done(guc, marker);
|
||||
}
|
||||
|
||||
static int guc_action_query_single_klv(struct xe_guc *guc, u32 key,
|
||||
@@ -1162,6 +1176,13 @@ static int vf_post_migration_fixups(struct xe_gt *gt)
|
||||
|
||||
static void vf_post_migration_rearm(struct xe_gt *gt)
|
||||
{
|
||||
/*
|
||||
* Make sure interrupts on the new HW are properly set. The GuC IRQ
|
||||
* must be working at this point, since the recovery did started,
|
||||
* but the rest was not enabled using the procedure from spec.
|
||||
*/
|
||||
xe_irq_resume(gt_to_xe(gt));
|
||||
|
||||
xe_guc_ct_restart(>->uc.guc.ct);
|
||||
xe_guc_submit_unpause_prepare_vf(>->uc.guc);
|
||||
}
|
||||
@@ -1183,37 +1204,40 @@ static void vf_post_migration_abort(struct xe_gt *gt)
|
||||
xe_guc_submit_pause_abort(>->uc.guc);
|
||||
}
|
||||
|
||||
static int vf_post_migration_notify_resfix_done(struct xe_gt *gt)
|
||||
static int vf_post_migration_resfix_done(struct xe_gt *gt, u16 marker)
|
||||
{
|
||||
bool skip_resfix = false;
|
||||
|
||||
spin_lock_irq(>->sriov.vf.migration.lock);
|
||||
if (gt->sriov.vf.migration.recovery_queued) {
|
||||
skip_resfix = true;
|
||||
xe_gt_sriov_dbg(gt, "another recovery imminent, resfix skipped\n");
|
||||
} else {
|
||||
if (gt->sriov.vf.migration.recovery_queued)
|
||||
xe_gt_sriov_dbg(gt, "another recovery imminent\n");
|
||||
else
|
||||
WRITE_ONCE(gt->sriov.vf.migration.recovery_inprogress, false);
|
||||
}
|
||||
spin_unlock_irq(>->sriov.vf.migration.lock);
|
||||
|
||||
if (skip_resfix)
|
||||
return -EAGAIN;
|
||||
return vf_resfix_done(gt, marker);
|
||||
}
|
||||
|
||||
/*
|
||||
* Make sure interrupts on the new HW are properly set. The GuC IRQ
|
||||
* must be working at this point, since the recovery did started,
|
||||
* but the rest was not enabled using the procedure from spec.
|
||||
*/
|
||||
xe_irq_resume(gt_to_xe(gt));
|
||||
static int vf_post_migration_resfix_start(struct xe_gt *gt, u16 marker)
|
||||
{
|
||||
return vf_resfix_start(gt, marker);
|
||||
}
|
||||
|
||||
return vf_notify_resfix_done(gt);
|
||||
static u16 vf_post_migration_next_resfix_marker(struct xe_gt *gt)
|
||||
{
|
||||
xe_gt_assert(gt, IS_SRIOV_VF(gt_to_xe(gt)));
|
||||
|
||||
BUILD_BUG_ON(1 + ((typeof(gt->sriov.vf.migration.resfix_marker))~0) >
|
||||
FIELD_MAX(VF2GUC_RESFIX_START_REQUEST_MSG_0_MARKER));
|
||||
|
||||
/* add 1 to avoid zero-marker */
|
||||
return 1 + gt->sriov.vf.migration.resfix_marker++;
|
||||
}
|
||||
|
||||
static void vf_post_migration_recovery(struct xe_gt *gt)
|
||||
{
|
||||
struct xe_device *xe = gt_to_xe(gt);
|
||||
int err;
|
||||
u16 marker;
|
||||
bool retry;
|
||||
int err;
|
||||
|
||||
xe_gt_sriov_dbg(gt, "migration recovery in progress\n");
|
||||
|
||||
@@ -1227,15 +1251,27 @@ static void vf_post_migration_recovery(struct xe_gt *gt)
|
||||
goto fail;
|
||||
}
|
||||
|
||||
marker = vf_post_migration_next_resfix_marker(gt);
|
||||
|
||||
err = vf_post_migration_resfix_start(gt, marker);
|
||||
if (unlikely(err)) {
|
||||
xe_gt_sriov_err(gt, "Recovery failed at GuC RESFIX_START step (%pe)\n",
|
||||
ERR_PTR(err));
|
||||
goto fail;
|
||||
}
|
||||
|
||||
err = vf_post_migration_fixups(gt);
|
||||
if (err)
|
||||
goto fail;
|
||||
|
||||
vf_post_migration_rearm(gt);
|
||||
|
||||
err = vf_post_migration_notify_resfix_done(gt);
|
||||
if (err && err != -EAGAIN)
|
||||
err = vf_post_migration_resfix_done(gt, marker);
|
||||
if (err) {
|
||||
xe_gt_sriov_err(gt, "Recovery failed at GuC RESFIX_DONE step (%pe)\n",
|
||||
ERR_PTR(err));
|
||||
goto fail;
|
||||
}
|
||||
|
||||
vf_post_migration_kickstart(gt);
|
||||
|
||||
|
||||
@@ -52,6 +52,11 @@ struct xe_gt_sriov_vf_migration {
|
||||
wait_queue_head_t wq;
|
||||
/** @scratch: Scratch memory for VF recovery */
|
||||
void *scratch;
|
||||
/**
|
||||
* @resfix_marker: Marker sent on start and on end of post-migration
|
||||
* steps.
|
||||
*/
|
||||
u8 resfix_marker;
|
||||
/** @recovery_teardown: VF post migration recovery is being torn down */
|
||||
bool recovery_teardown;
|
||||
/** @recovery_queued: VF post migration recovery in queued */
|
||||
|
||||
@@ -49,11 +49,13 @@
|
||||
*
|
||||
* As soon as Virtual GPU of the VM starts, the VF driver within receives
|
||||
* the MIGRATED interrupt and schedules post-migration recovery worker.
|
||||
* That worker queries GuC for new provisioning (using MMIO communication),
|
||||
* That worker sends `VF2GUC_RESFIX_START` action along with non-zero
|
||||
* marker, queries GuC for new provisioning (using MMIO communication),
|
||||
* and applies fixups to any non-virtualized resources used by the VF.
|
||||
*
|
||||
* When the VF driver is ready to continue operation on the newly connected
|
||||
* hardware, it sends `VF2GUC_NOTIFY_RESFIX_DONE` which causes it to
|
||||
* hardware, it sends `VF2GUC_RESFIX_DONE` action along with the same
|
||||
* marker which was sent with `VF2GUC_RESFIX_START` which causes it to
|
||||
* enter the long awaited `VF_RUNNING` state, and therefore start handling
|
||||
* CTB messages and scheduling workloads from the VF::
|
||||
*
|
||||
@@ -102,12 +104,17 @@
|
||||
* | [ ] new VF provisioning [ ]
|
||||
* | [ ]---------------------------> [ ]
|
||||
* | | [ ]
|
||||
* | | VF2GUC_RESFIX_START [ ]
|
||||
* | [ ] <---------------------------[ ]
|
||||
* | [ ] [ ]
|
||||
* | [ ] success [ ]
|
||||
* | [ ]---------------------------> [ ]
|
||||
* | | VF driver applies post [ ]
|
||||
* | | migration fixups -------[ ]
|
||||
* | | | [ ]
|
||||
* | | -----> [ ]
|
||||
* | | [ ]
|
||||
* | | VF2GUC_NOTIFY_RESFIX_DONE [ ]
|
||||
* | | VF2GUC_RESFIX_DONE [ ]
|
||||
* | [ ] <---------------------------[ ]
|
||||
* | [ ] [ ]
|
||||
* | [ ] GuC sets new VF state to [ ]
|
||||
@@ -118,6 +125,55 @@
|
||||
* | [ ]---------------------------> [ ]
|
||||
* | | |
|
||||
* | | |
|
||||
*
|
||||
* Handling of VF double migration flow is shown below::
|
||||
*
|
||||
* GuC1 VF
|
||||
* | |
|
||||
* | [ ]<--- start fixups
|
||||
* | VF2GUC_RESFIX_START(marker) [ ]
|
||||
* [ ] <-------------------------------------------[ ]
|
||||
* [ ] [ ]
|
||||
* [ ]---\ [ ]
|
||||
* [ ] store marker [ ]
|
||||
* [ ]<--/ [ ]
|
||||
* [ ] [ ]
|
||||
* [ ] success [ ]
|
||||
* [ ] ------------------------------------------> [ ]
|
||||
* | [ ]
|
||||
* | [ ]---\
|
||||
* | [ ] do fixups
|
||||
* | [ ]<--/
|
||||
* | [ ]
|
||||
* -------------- VF paused / saved ----------------
|
||||
* :
|
||||
*
|
||||
* GuC2
|
||||
* |
|
||||
* ----------------- VF restored ------------------
|
||||
* |
|
||||
* [ ]
|
||||
* [ ]---\
|
||||
* [ ] reset marker
|
||||
* [ ]<--/
|
||||
* [ ]
|
||||
* ----------------- VF resumed ------------------
|
||||
* | [ ]
|
||||
* | [ ]
|
||||
* | VF2GUC_RESFIX_DONE(marker) [ ]
|
||||
* [ ] <-------------------------------------------[ ]
|
||||
* [ ] [ ]
|
||||
* [ ]---\ [ ]
|
||||
* [ ] check marker [ ]
|
||||
* [ ] (mismatch) [ ]
|
||||
* [ ]<--/ [ ]
|
||||
* [ ] [ ]
|
||||
* [ ] RESPONSE_VF_MIGRATED [ ]
|
||||
* [ ] ------------------------------------------> [ ]
|
||||
* | [ ]---\
|
||||
* | [ ] reschedule fixups
|
||||
* | [ ]<--/
|
||||
* | |
|
||||
*/
|
||||
|
||||
/**
|
||||
|
||||
Reference in New Issue
Block a user