KVM: MMU: split kvm_mmu_pte_write function (889e5cbc) · Commits · jan.koester / Linux

arch/x86/kvm/mmu.c

+119 −75

Original line number	Diff line number	Diff line
		@@ -3530,48 +3530,28 @@ static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu)
		return !!(spte && (*spte & shadow_accessed_mask));
		}

		void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
		const u8 *new, int bytes)
		static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa,
		const u8 *new, int *bytes)
		{
		gfn_t gfn = gpa >> PAGE_SHIFT;
		union kvm_mmu_page_role mask = { .word = 0 };
		struct kvm_mmu_page *sp;
		struct hlist_node *node;
		LIST_HEAD(invalid_list);
		u64 entry, gentry, *spte;
		unsigned pte_size, page_offset, misaligned, quadrant, offset;
		int level, npte, r, flooded = 0;
		bool remote_flush, local_flush, zap_page;

		/*
		* If we don't have indirect shadow pages, it means no page is
		* write-protected, so we can exit simply.
		*/
		if (!ACCESS_ONCE(vcpu->kvm->arch.indirect_shadow_pages))
		return;

		zap_page = remote_flush = local_flush = false;
		offset = offset_in_page(gpa);

		pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
		u64 gentry;
		int r;

		/*
		* Assume that the pte write on a page table of the same type
		* as the current vcpu paging mode since we update the sptes only
		* when they have the same mode.
		*/
		if (is_pae(vcpu) && bytes == 4) {
		if (is_pae(vcpu) && *bytes == 4) {
		/* Handle a 32-bit guest writing two halves of a 64-bit gpte */
		gpa &= ~(gpa_t)7;
		bytes = 8;

		r = kvm_read_guest(vcpu->kvm, gpa, &gentry, min(bytes, 8));
		*gpa &= ~(gpa_t)7;
		*bytes = 8;
		r = kvm_read_guest(vcpu->kvm, *gpa, &gentry, min(*bytes, 8));
		if (r)
		gentry = 0;
		new = (const u8 *)&gentry;
		}

		switch (bytes) {
		switch (*bytes) {
		case 4:
		gentry = *(const u32 *)new;
		break;
		@@ -3583,52 +3563,60 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
		break;
		}

		return gentry;
		}

		/*
		* No need to care whether memory allocation is successful
		* or not since pte prefetch is skipped if it does not have
		* enough objects in the cache.
		* If we're seeing too many writes to a page, it may no longer be a page table,
		* or we may be forking, in which case it is better to unmap the page.
		*/
		mmu_topup_memory_caches(vcpu);
		spin_lock(&vcpu->kvm->mmu_lock);
		++vcpu->kvm->stat.mmu_pte_write;
		trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE);
		static bool detect_write_flooding(struct kvm_vcpu *vcpu, gfn_t gfn)
		{
		bool flooded = false;

		if (gfn == vcpu->arch.last_pt_write_gfn
		&& !last_updated_pte_accessed(vcpu)) {
		++vcpu->arch.last_pt_write_count;
		if (vcpu->arch.last_pt_write_count >= 3)
		flooded = 1;
		flooded = true;
		} else {
		vcpu->arch.last_pt_write_gfn = gfn;
		vcpu->arch.last_pt_write_count = 1;
		vcpu->arch.last_pte_updated = NULL;
		}

		mask.cr0_wp = mask.cr4_pae = mask.nxe = 1;
		for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn, node) {
		pte_size = sp->role.cr4_pae ? 8 : 4;
		misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
		misaligned |= bytes < 4;
		if (misaligned || flooded) {
		return flooded;
		}

		/*
		* Misaligned accesses are too much trouble to fix
		* up; also, they usually indicate a page is not used
		* as a page table.
		*
		* If we're seeing too many writes to a page,
		* it may no longer be a page table, or we may be
		* forking, in which case it is better to unmap the
		* page.
		* Misaligned accesses are too much trouble to fix up; also, they usually
		* indicate a page is not used as a page table.
		*/
		static bool detect_write_misaligned(struct kvm_mmu_page *sp, gpa_t gpa,
		int bytes)
		{
		unsigned offset, pte_size, misaligned;

		pgprintk("misaligned: gpa %llx bytes %d role %x\n",
		gpa, bytes, sp->role.word);
		zap_page |= !!kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
		&invalid_list);
		++vcpu->kvm->stat.mmu_flooded;
		continue;

		offset = offset_in_page(gpa);
		pte_size = sp->role.cr4_pae ? 8 : 4;
		misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
		misaligned |= bytes < 4;

		return misaligned;
		}
		page_offset = offset;

		static u64 *get_written_sptes(struct kvm_mmu_page *sp, gpa_t gpa, int *nspte)
		{
		unsigned page_offset, quadrant;
		u64 *spte;
		int level;

		page_offset = offset_in_page(gpa);
		level = sp->role.level;
		npte = 1;
		*nspte = 1;
		if (!sp->role.cr4_pae) {
		page_offset <<= 1; /* 32->64 */
		/*
		@@ -3639,15 +3627,71 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
		if (level == PT32_ROOT_LEVEL) {
		page_offset &= ~7; /* kill rounding error */
		page_offset <<= 1;
		npte = 2;
		*nspte = 2;
		}
		quadrant = page_offset >> PAGE_SHIFT;
		page_offset &= ~PAGE_MASK;
		if (quadrant != sp->role.quadrant)
		return NULL;
		}

		spte = &sp->spt[page_offset / sizeof(*spte)];
		return spte;
		}

		void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
		const u8 *new, int bytes)
		{
		gfn_t gfn = gpa >> PAGE_SHIFT;
		union kvm_mmu_page_role mask = { .word = 0 };
		struct kvm_mmu_page *sp;
		struct hlist_node *node;
		LIST_HEAD(invalid_list);
		u64 entry, gentry, *spte;
		int npte;
		bool remote_flush, local_flush, zap_page, flooded, misaligned;

		/*
		* If we don't have indirect shadow pages, it means no page is
		* write-protected, so we can exit simply.
		*/
		if (!ACCESS_ONCE(vcpu->kvm->arch.indirect_shadow_pages))
		return;

		zap_page = remote_flush = local_flush = false;

		pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);

		gentry = mmu_pte_write_fetch_gpte(vcpu, &gpa, new, &bytes);

		/*
		* No need to care whether memory allocation is successful
		* or not since pte prefetch is skipped if it does not have
		* enough objects in the cache.
		*/
		mmu_topup_memory_caches(vcpu);

		spin_lock(&vcpu->kvm->mmu_lock);
		++vcpu->kvm->stat.mmu_pte_write;
		trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE);

		flooded = detect_write_flooding(vcpu, gfn);
		mask.cr0_wp = mask.cr4_pae = mask.nxe = 1;
		for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn, node) {
		misaligned = detect_write_misaligned(sp, gpa, bytes);

		if (misaligned || flooded) {
		zap_page |= !!kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
		&invalid_list);
		++vcpu->kvm->stat.mmu_flooded;
		continue;
		}

		spte = get_written_sptes(sp, gpa, &npte);
		if (!spte)
		continue;

		local_flush = true;
		spte = &sp->spt[page_offset / sizeof(*spte)];
		while (npte--) {
		entry = *spte;
		mmu_page_zap_pte(vcpu->kvm, sp, spte);