Commit d76fb406 authored by Sean Christopherson's avatar Sean Christopherson Committed by Paolo Bonzini
Browse files

KVM: VMX: Handle PI descriptor updates during vcpu_put/load



Move the posted interrupt pre/post_block logic into vcpu_put/load
respectively, using the kvm_vcpu_is_blocking() to determining whether or
not the wakeup handler needs to be set (and unset).  This avoids updating
the PI descriptor if halt-polling is successful, reduces the number of
touchpoints for updating the descriptor, and eliminates the confusing
behavior of intentionally leaving a "stale" PI.NDST when a blocking vCPU
is scheduled back in after preemption.

The downside is that KVM will do the PID update twice if the vCPU is
preempted after prepare_to_rcuwait() but before schedule(), but that's a
rare case (and non-existent on !PREEMPT kernels).

The notable wart is the need to send a self-IPI on the wakeup vector if
an outstanding notification is pending after configuring the wakeup
vector.  Ideally, KVM would just do a kvm_vcpu_wake_up() in this case,
but the scheduler doesn't support waking a task from its preemption
notifier callback, i.e. while the task is right in the middle of
being scheduled out.

Note, setting the wakeup vector before halt-polling is not necessary:
once the pending IRQ will be recorded in the PIR, kvm_vcpu_has_events()
will detect this (via kvm_cpu_get_interrupt(), kvm_apic_get_interrupt(),
apic_has_interrupt_for_ppr() and finally vmx_sync_pir_to_irr()) and
terminate the polling.

Signed-off-by: default avatarSean Christopherson <seanjc@google.com>
Reviewed-by: default avatarMaxim Levitsky <mlevitsk@redhat.com>
Message-Id: <20211208015236.1616697-5-seanjc@google.com>
Signed-off-by: default avatarPaolo Bonzini <pbonzini@redhat.com>
parent 4f5a884f
Loading
Loading
Loading
Loading
+64 −86
Original line number Diff line number Diff line
@@ -52,6 +52,7 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
{
	struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
	struct pi_desc old, new;
	unsigned long flags;
	unsigned int dest;

	/*
@@ -62,23 +63,34 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
	if (!enable_apicv || !lapic_in_kernel(vcpu))
		return;

	/* Nothing to do if PI.SN and PI.NDST both have the desired value. */
	if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu)
	/*
	 * If the vCPU wasn't on the wakeup list and wasn't migrated, then the
	 * full update can be skipped as neither the vector nor the destination
	 * needs to be changed.
	 */
	if (pi_desc->nv != POSTED_INTR_WAKEUP_VECTOR && vcpu->cpu == cpu) {
		/*
		 * Clear SN if it was set due to being preempted.  Again, do
		 * this even if there is no assigned device for simplicity.
		 */
		if (pi_test_and_clear_sn(pi_desc))
			goto after_clear_sn;
		return;
	}

	local_irq_save(flags);

	/*
	 * If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change
	 * PI.NDST: pi_post_block is the one expected to change PID.NDST and the
	 * wakeup handler expects the vCPU to be on the blocked_vcpu_list that
	 * matches PI.NDST. Otherwise, a vcpu may not be able to be woken up
	 * correctly.
	 * If the vCPU was waiting for wakeup, remove the vCPU from the wakeup
	 * list of the _previous_ pCPU, which will not be the same as the
	 * current pCPU if the task was migrated.
	 */
	if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR || vcpu->cpu == cpu) {
		pi_clear_sn(pi_desc);
		goto after_clear_sn;
	if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR) {
		raw_spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->cpu));
		list_del(&vcpu->blocked_vcpu_list);
		raw_spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->cpu));
	}

	/* The full case.  Set the new destination and clear SN. */
	dest = cpu_physical_id(cpu);
	if (!x2apic_mode)
		dest = (dest << 8) & 0xFF00;
@@ -86,10 +98,22 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
	do {
		old.control = new.control = READ_ONCE(pi_desc->control);

		/*
		 * Clear SN (as above) and refresh the destination APIC ID to
		 * handle task migration (@cpu != vcpu->cpu).
		 */
		new.ndst = dest;
		new.sn = 0;

		/*
		 * Restore the notification vector; in the blocking case, the
		 * descriptor was modified on "put" to use the wakeup vector.
		 */
		new.nv = POSTED_INTR_VECTOR;
	} while (pi_try_set_control(pi_desc, old.control, new.control));

	local_irq_restore(flags);

after_clear_sn:

	/*
@@ -111,83 +135,24 @@ static bool vmx_can_use_vtd_pi(struct kvm *kvm)
		irq_remapping_cap(IRQ_POSTING_CAP);
}

void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
{
	struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);

	if (!vmx_can_use_vtd_pi(vcpu->kvm))
		return;

	/* Set SN when the vCPU is preempted */
	if (vcpu->preempted)
		pi_set_sn(pi_desc);
}

static void __pi_post_block(struct kvm_vcpu *vcpu)
{
	struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
	struct pi_desc old, new;
	unsigned int dest;

	/*
	 * Remove the vCPU from the wakeup list of the _previous_ pCPU, which
	 * will not be the same as the current pCPU if the task was migrated.
	 */
	raw_spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
	list_del(&vcpu->blocked_vcpu_list);
	raw_spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));

	dest = cpu_physical_id(vcpu->cpu);
	if (!x2apic_mode)
		dest = (dest << 8) & 0xFF00;

	WARN(pi_desc->nv != POSTED_INTR_WAKEUP_VECTOR,
	     "Wakeup handler not enabled while the vCPU was blocking");

	do {
		old.control = new.control = READ_ONCE(pi_desc->control);

		new.ndst = dest;

		/* set 'NV' to 'notification vector' */
		new.nv = POSTED_INTR_VECTOR;
	} while (pi_try_set_control(pi_desc, old.control, new.control));

	vcpu->pre_pcpu = -1;
}

/*
 * This routine does the following things for vCPU which is going
 * to be blocked if VT-d PI is enabled.
 * - Store the vCPU to the wakeup list, so when interrupts happen
 *   we can find the right vCPU to wake up.
 * - Change the Posted-interrupt descriptor as below:
 *      'NV' <-- POSTED_INTR_WAKEUP_VECTOR
 * - If 'ON' is set during this process, which means at least one
 *   interrupt is posted for this vCPU, we cannot block it, in
 *   this case, return 1, otherwise, return 0.
 *
 * Put the vCPU on this pCPU's list of vCPUs that needs to be awakened and set
 * WAKEUP as the notification vector in the PI descriptor.
 */
int pi_pre_block(struct kvm_vcpu *vcpu)
static void pi_enable_wakeup_handler(struct kvm_vcpu *vcpu)
{
	struct pi_desc old, new;
	struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
	struct pi_desc old, new;
	unsigned long flags;

	if (!vmx_can_use_vtd_pi(vcpu->kvm) ||
	    vmx_interrupt_blocked(vcpu))
		return 0;

	local_irq_save(flags);

	vcpu->pre_pcpu = vcpu->cpu;
	raw_spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->cpu));
	list_add_tail(&vcpu->blocked_vcpu_list,
		      &per_cpu(blocked_vcpu_on_cpu, vcpu->cpu));
	raw_spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->cpu));

	WARN(pi_desc->sn == 1,
	     "Posted Interrupt Suppress Notification set before blocking");
	WARN(pi_desc->sn, "PI descriptor SN field set before blocking");

	do {
		old.control = new.control = READ_ONCE(pi_desc->control);
@@ -196,24 +161,37 @@ int pi_pre_block(struct kvm_vcpu *vcpu)
		new.nv = POSTED_INTR_WAKEUP_VECTOR;
	} while (pi_try_set_control(pi_desc, old.control, new.control));

	/* We should not block the vCPU if an interrupt is posted for it.  */
	if (pi_test_on(pi_desc))
		__pi_post_block(vcpu);
	/*
	 * Send a wakeup IPI to this CPU if an interrupt may have been posted
	 * before the notification vector was updated, in which case the IRQ
	 * will arrive on the non-wakeup vector.  An IPI is needed as calling
	 * try_to_wake_up() from ->sched_out() isn't allowed (IRQs are not
	 * enabled until it is safe to call try_to_wake_up() on the task being
	 * scheduled out).
	 */
	if (pi_test_on(&new))
		apic->send_IPI_self(POSTED_INTR_WAKEUP_VECTOR);

	local_irq_restore(flags);
	return (vcpu->pre_pcpu == -1);
}

void pi_post_block(struct kvm_vcpu *vcpu)
void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
{
	unsigned long flags;
	struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);

	if (vcpu->pre_pcpu == -1)
	if (!vmx_can_use_vtd_pi(vcpu->kvm))
		return;

	local_irq_save(flags);
	__pi_post_block(vcpu);
	local_irq_restore(flags);
	if (kvm_vcpu_is_blocking(vcpu) && !vmx_interrupt_blocked(vcpu))
		pi_enable_wakeup_handler(vcpu);

	/*
	 * Set SN when the vCPU is preempted.  Note, the vCPU can both be seen
	 * as blocking and preempted, e.g. if it's preempted between setting
	 * its wait state and manually scheduling out.
	 */
	if (vcpu->preempted)
		pi_set_sn(pi_desc);
}

/*
@@ -254,7 +232,7 @@ bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu)
 * Bail out of the block loop if the VM has an assigned
 * device, but the blocking vCPU didn't reconfigure the
 * PI.NV to the wakeup vector, i.e. the assigned device
 * came along after the initial check in pi_pre_block().
 * came along after the initial check in vmx_vcpu_pi_put().
 */
void vmx_pi_start_assignment(struct kvm *kvm)
{
+6 −2
Original line number Diff line number Diff line
@@ -40,6 +40,12 @@ static inline bool pi_test_and_clear_on(struct pi_desc *pi_desc)
			(unsigned long *)&pi_desc->control);
}

static inline bool pi_test_and_clear_sn(struct pi_desc *pi_desc)
{
	return test_and_clear_bit(POSTED_INTR_SN,
			(unsigned long *)&pi_desc->control);
}

static inline bool pi_test_and_set_pir(int vector, struct pi_desc *pi_desc)
{
	return test_and_set_bit(vector, (unsigned long *)pi_desc->pir);
@@ -88,8 +94,6 @@ static inline bool pi_test_sn(struct pi_desc *pi_desc)

void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu);
void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu);
int pi_pre_block(struct kvm_vcpu *vcpu);
void pi_post_block(struct kvm_vcpu *vcpu);
void pi_wakeup_handler(void);
void __init pi_init_cpu(int cpu);
bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu);
+0 −5
Original line number Diff line number Diff line
@@ -7566,9 +7566,6 @@ void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu)

static int vmx_pre_block(struct kvm_vcpu *vcpu)
{
	if (pi_pre_block(vcpu))
		return 1;

	if (kvm_lapic_hv_timer_in_use(vcpu))
		kvm_lapic_switch_to_sw_timer(vcpu);

@@ -7579,8 +7576,6 @@ static void vmx_post_block(struct kvm_vcpu *vcpu)
{
	if (kvm_x86_ops.set_hv_timer)
		kvm_lapic_switch_to_hv_timer(vcpu);

	pi_post_block(vcpu);
}

static void vmx_setup_mce(struct kvm_vcpu *vcpu)