From 0a5bfb824a6ea35e54b7e5ac6f881beea5e309d2 Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Tue, 16 Aug 2022 19:25:17 -0300 Subject: [PATCH 1/5] KVM: PPC: Book3S HV: Fix decrementer migration We used to have a workaround[1] for a hang during migration that was made ineffective when we converted the decrementer expiry to be relative to guest timebase. The point of the workaround was that in the absence of an explicit decrementer expiry value provided by userspace during migration, KVM needs to initialize dec_expires to a value that will result in an expired decrementer after subtracting the current guest timebase. That stops the vcpu from hanging after migration due to a decrementer that's too large. If the dec_expires is now relative to guest timebase, its initialization needs to be guest timebase-relative as well, otherwise we end up with a decrementer expiry that is still larger than the guest timebase. 1- https://git.kernel.org/torvalds/c/5855564c8ab2 Fixes: 3c1a4322bba7 ("KVM: PPC: Book3S HV: Change dec_expires to be relative to guest timebase") Signed-off-by: Fabiano Rosas Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220816222517.1916391-1-farosas@linux.ibm.com --- arch/powerpc/kvm/book3s_hv.c | 18 ++++++++++++++++-- arch/powerpc/kvm/powerpc.c | 1 - 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 57d0835e56fd..917abda9e5ce 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -2517,10 +2517,24 @@ static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id, r = set_vpa(vcpu, &vcpu->arch.dtl, addr, len); break; case KVM_REG_PPC_TB_OFFSET: + { /* round up to multiple of 2^24 */ - vcpu->arch.vcore->tb_offset = - ALIGN(set_reg_val(id, *val), 1UL << 24); + u64 tb_offset = ALIGN(set_reg_val(id, *val), 1UL << 24); + + /* + * Now that we know the timebase offset, update the + * decrementer expiry with a guest timebase value. If + * the userspace does not set DEC_EXPIRY, this ensures + * a migrated vcpu at least starts with an expired + * decrementer, which is better than a large one that + * causes a hang. + */ + if (!vcpu->arch.dec_expires && tb_offset) + vcpu->arch.dec_expires = get_tb() + tb_offset; + + vcpu->arch.vcore->tb_offset = tb_offset; break; + } case KVM_REG_PPC_LPCR: kvmppc_set_lpcr(vcpu, set_reg_val(id, *val), true); break; diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index fb1490761c87..757491dd6b7b 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -786,7 +786,6 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) hrtimer_init(&vcpu->arch.dec_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS); vcpu->arch.dec_timer.function = kvmppc_decrementer_wakeup; - vcpu->arch.dec_expires = get_tb(); #ifdef CONFIG_KVM_EXIT_TIMING mutex_init(&vcpu->arch.exit_timing_lock); From bc91c04bfff7cdf676011b97bb21b2861d7b21c9 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 8 Sep 2022 23:25:41 +1000 Subject: [PATCH 2/5] KVM: PPC: Book3S HV P9: Clear vcpu cpu fields before enabling host irqs On guest entry, vcpu->cpu and vcpu->arch.thread_cpu are set after disabling host irqs. On guest exit there is a window whre tick time accounting briefly enables irqs before these fields are cleared. Move them up to ensure they are cleared before host irqs are run. This is possibly not a problem, but is more symmetric and makes the fields less surprising. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220908132545.4085849-1-npiggin@gmail.com --- arch/powerpc/kvm/book3s_hv.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 917abda9e5ce..0868c015c6b0 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -4629,6 +4629,9 @@ int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 time_limit, set_irq_happened(trap); + vcpu->cpu = -1; + vcpu->arch.thread_cpu = -1; + context_tracking_guest_exit(); if (!vtime_accounting_enabled_this_cpu()) { local_irq_enable(); @@ -4644,9 +4647,6 @@ int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 time_limit, } vtime_account_guest_exit(); - vcpu->cpu = -1; - vcpu->arch.thread_cpu = -1; - powerpc_local_irq_pmu_restore(flags); preempt_enable(); From c953f7500b65f2b157d1eb468ca8b86328834cce Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 8 Sep 2022 23:25:42 +1000 Subject: [PATCH 3/5] KVM: PPC: Book3S HV P9: Fix irq disabling in tick accounting kvmhv_run_single_vcpu() disables PMIs as well as Linux irqs, however the tick time accounting code enables and disables irqs and not PMIs within this region. By chance this might not actually cause a bug, but it is clearly an incorrect use of the APIs. Fixes: 2251fbe76395e ("KVM: PPC: Book3S HV P9: Improve mtmsrd scheduling by delaying MSR[EE] disable") Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220908132545.4085849-2-npiggin@gmail.com --- arch/powerpc/kvm/book3s_hv.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 0868c015c6b0..0f8dee657336 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -4634,7 +4634,7 @@ int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 time_limit, context_tracking_guest_exit(); if (!vtime_accounting_enabled_this_cpu()) { - local_irq_enable(); + powerpc_local_irq_pmu_restore(flags); /* * Service IRQs here before vtime_account_guest_exit() so any * ticks that occurred while running the guest are accounted to @@ -4643,7 +4643,7 @@ int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 time_limit, * interrupts here, which has the problem that it accounts * interrupt processing overhead to the host. */ - local_irq_disable(); + powerpc_local_irq_pmu_save(flags); } vtime_account_guest_exit(); From b31bc24a49037aad7aa00d2b0354e9704d8134dc Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 8 Sep 2022 23:25:43 +1000 Subject: [PATCH 4/5] KVM: PPC: Book3S HV: Update guest state entry/exit accounting to new API Update the guest state and timing entry/exit accounting to use the new API, which was introduced following issues found[1]. KVM HV does possibly call instrumented code inside the guest context, and it does call srcu inside the guest context which is fragile at best. Switch to the new API, moving the guest context inside the srcu_read_lock/unlock region. [1] https://lore.kernel.org/lkml/20220201132926.3301912-1-mark.rutland@arm.com/ Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220908132545.4085849-3-npiggin@gmail.com --- arch/powerpc/kvm/book3s_hv.c | 31 ++++++++++--------------------- 1 file changed, 10 insertions(+), 21 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 0f8dee657336..23bcda3585fe 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -3854,23 +3854,17 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) for (sub = 0; sub < core_info.n_subcores; ++sub) spin_unlock(&core_info.vc[sub]->lock); - guest_enter_irqoff(); + guest_timing_enter_irqoff(); srcu_idx = srcu_read_lock(&vc->kvm->srcu); + guest_state_enter_irqoff(); this_cpu_disable_ftrace(); - /* - * Interrupts will be enabled once we get into the guest, - * so tell lockdep that we're about to enable interrupts. - */ - trace_hardirqs_on(); - trap = __kvmppc_vcore_entry(); - trace_hardirqs_off(); - this_cpu_enable_ftrace(); + guest_state_exit_irqoff(); srcu_read_unlock(&vc->kvm->srcu, srcu_idx); @@ -3905,11 +3899,10 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) kvmppc_set_host_core(pcpu); - context_tracking_guest_exit(); if (!vtime_accounting_enabled_this_cpu()) { local_irq_enable(); /* - * Service IRQs here before vtime_account_guest_exit() so any + * Service IRQs here before guest_timing_exit_irqoff() so any * ticks that occurred while running the guest are accounted to * the guest. If vtime accounting is enabled, accounting uses * TB rather than ticks, so it can be done without enabling @@ -3918,7 +3911,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) */ local_irq_disable(); } - vtime_account_guest_exit(); + guest_timing_exit_irqoff(); local_irq_enable(); @@ -4609,21 +4602,18 @@ int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 time_limit, trace_kvm_guest_enter(vcpu); - guest_enter_irqoff(); + guest_timing_enter_irqoff(); srcu_idx = srcu_read_lock(&kvm->srcu); + guest_state_enter_irqoff(); this_cpu_disable_ftrace(); - /* Tell lockdep that we're about to enable interrupts */ - trace_hardirqs_on(); - trap = kvmhv_p9_guest_entry(vcpu, time_limit, lpcr, &tb); vcpu->arch.trap = trap; - trace_hardirqs_off(); - this_cpu_enable_ftrace(); + guest_state_exit_irqoff(); srcu_read_unlock(&kvm->srcu, srcu_idx); @@ -4632,11 +4622,10 @@ int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 time_limit, vcpu->cpu = -1; vcpu->arch.thread_cpu = -1; - context_tracking_guest_exit(); if (!vtime_accounting_enabled_this_cpu()) { powerpc_local_irq_pmu_restore(flags); /* - * Service IRQs here before vtime_account_guest_exit() so any + * Service IRQs here before guest_timing_exit_irqoff() so any * ticks that occurred while running the guest are accounted to * the guest. If vtime accounting is enabled, accounting uses * TB rather than ticks, so it can be done without enabling @@ -4645,7 +4634,7 @@ int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 time_limit, */ powerpc_local_irq_pmu_save(flags); } - vtime_account_guest_exit(); + guest_timing_exit_irqoff(); powerpc_local_irq_pmu_restore(flags); From 1a5486b3c3517aa1f608a10003ade4da122cb175 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 8 Sep 2022 23:25:44 +1000 Subject: [PATCH 5/5] KVM: PPC: Book3S HV P9: Restore stolen time logging in dtl Stolen time logging in dtl was removed from the P9 path, so guests had no stolen time accounting. Add it back in a simpler way that still avoids locks and per-core accounting code. Fixes: ecb6a7207f92 ("KVM: PPC: Book3S HV P9: Remove most of the vcore logic") Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220908132545.4085849-4-npiggin@gmail.com --- arch/powerpc/kvm/book3s_hv.c | 49 +++++++++++++++++++++++++++++++++--- 1 file changed, 45 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 23bcda3585fe..ecce10a4486d 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -249,6 +249,7 @@ static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu) /* * We use the vcpu_load/put functions to measure stolen time. + * * Stolen time is counted as time when either the vcpu is able to * run as part of a virtual core, but the task running the vcore * is preempted or sleeping, or when the vcpu needs something done @@ -278,6 +279,12 @@ static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu) * lock. The stolen times are measured in units of timebase ticks. * (Note that the != TB_NIL checks below are purely defensive; * they should never fail.) + * + * The POWER9 path is simpler, one vcpu per virtual core so the + * former case does not exist. If a vcpu is preempted when it is + * BUSY_IN_HOST and not ceded or otherwise blocked, then accumulate + * the stolen cycles in busy_stolen. RUNNING is not a preemptible + * state in the P9 path. */ static void kvmppc_core_start_stolen(struct kvmppc_vcore *vc, u64 tb) @@ -311,8 +318,14 @@ static void kvmppc_core_vcpu_load_hv(struct kvm_vcpu *vcpu, int cpu) unsigned long flags; u64 now; - if (cpu_has_feature(CPU_FTR_ARCH_300)) + if (cpu_has_feature(CPU_FTR_ARCH_300)) { + if (vcpu->arch.busy_preempt != TB_NIL) { + WARN_ON_ONCE(vcpu->arch.state != KVMPPC_VCPU_BUSY_IN_HOST); + vc->stolen_tb += mftb() - vcpu->arch.busy_preempt; + vcpu->arch.busy_preempt = TB_NIL; + } return; + } now = mftb(); @@ -340,8 +353,21 @@ static void kvmppc_core_vcpu_put_hv(struct kvm_vcpu *vcpu) unsigned long flags; u64 now; - if (cpu_has_feature(CPU_FTR_ARCH_300)) + if (cpu_has_feature(CPU_FTR_ARCH_300)) { + /* + * In the P9 path, RUNNABLE is not preemptible + * (nor takes host interrupts) + */ + WARN_ON_ONCE(vcpu->arch.state == KVMPPC_VCPU_RUNNABLE); + /* + * Account stolen time when preempted while the vcpu task is + * running in the kernel (but not in qemu, which is INACTIVE). + */ + if (task_is_running(current) && + vcpu->arch.state == KVMPPC_VCPU_BUSY_IN_HOST) + vcpu->arch.busy_preempt = mftb(); return; + } now = mftb(); @@ -740,6 +766,18 @@ static void __kvmppc_create_dtl_entry(struct kvm_vcpu *vcpu, vcpu->arch.dtl.dirty = true; } +static void kvmppc_create_dtl_entry_p9(struct kvm_vcpu *vcpu, + struct kvmppc_vcore *vc, + u64 now) +{ + unsigned long stolen; + + stolen = vc->stolen_tb - vcpu->arch.stolen_logged; + vcpu->arch.stolen_logged = vc->stolen_tb; + + __kvmppc_create_dtl_entry(vcpu, vc->pcpu, now, stolen); +} + static void kvmppc_create_dtl_entry(struct kvm_vcpu *vcpu, struct kvmppc_vcore *vc) { @@ -4527,7 +4565,6 @@ int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 time_limit, vc = vcpu->arch.vcore; vcpu->arch.ceded = 0; vcpu->arch.run_task = current; - vcpu->arch.state = KVMPPC_VCPU_RUNNABLE; vcpu->arch.last_inst = KVM_INST_FETCH_FAILED; /* See if the MMU is ready to go */ @@ -4554,6 +4591,8 @@ int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 time_limit, /* flags save not required, but irq_pmu has no disable/enable API */ powerpc_local_irq_pmu_save(flags); + vcpu->arch.state = KVMPPC_VCPU_RUNNABLE; + if (signal_pending(current)) goto sigpend; if (need_resched() || !kvm->arch.mmu_ready) @@ -4598,7 +4637,7 @@ int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 time_limit, tb = mftb(); - __kvmppc_create_dtl_entry(vcpu, pcpu, tb + vc->tb_offset, 0); + kvmppc_create_dtl_entry_p9(vcpu, vc, tb + vc->tb_offset); trace_kvm_guest_enter(vcpu); @@ -4621,6 +4660,7 @@ int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 time_limit, vcpu->cpu = -1; vcpu->arch.thread_cpu = -1; + vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST; if (!vtime_accounting_enabled_this_cpu()) { powerpc_local_irq_pmu_restore(flags); @@ -4697,6 +4737,7 @@ int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 time_limit, out: vcpu->cpu = -1; vcpu->arch.thread_cpu = -1; + vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST; powerpc_local_irq_pmu_restore(flags); preempt_enable(); goto done;