Update Linux to v5.4.2
Change-Id: Idf6911045d9d382da2cfe01b1edff026404ac8fd
diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c
index 2f405b0..38b4c91 100644
--- a/virt/kvm/arm/mmu.c
+++ b/virt/kvm/arm/mmu.c
@@ -1,19 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2012 - Virtual Open Systems and Columbia University
* Author: Christoffer Dall <c.dall@virtualopensystems.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License, version 2, as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*/
#include <linux/mman.h>
@@ -27,10 +15,10 @@
#include <asm/kvm_arm.h>
#include <asm/kvm_mmu.h>
#include <asm/kvm_mmio.h>
+#include <asm/kvm_ras.h>
#include <asm/kvm_asm.h>
#include <asm/kvm_emulate.h>
#include <asm/virt.h>
-#include <asm/system_misc.h>
#include "trace.h"
@@ -45,7 +33,6 @@
static unsigned long io_map_base;
-#define S2_PGD_SIZE (PTRS_PER_S2_PGD * sizeof(pgd_t))
#define hyp_pgd_order get_order(PTRS_PER_PGD * sizeof(pgd_t))
#define KVM_S2PTE_FLAG_IS_IOMAP (1UL << 0)
@@ -103,8 +90,7 @@
* @addr: IPA
* @pmd: pmd pointer for IPA
*
- * Function clears a PMD entry, flushes addr 1st and 2nd stage TLBs. Marks all
- * pages in the range dirty.
+ * Function clears a PMD entry, flushes addr 1st and 2nd stage TLBs.
*/
static void stage2_dissolve_pmd(struct kvm *kvm, phys_addr_t addr, pmd_t *pmd)
{
@@ -116,6 +102,24 @@
put_page(virt_to_page(pmd));
}
+/**
+ * stage2_dissolve_pud() - clear and flush huge PUD entry
+ * @kvm: pointer to kvm structure.
+ * @addr: IPA
+ * @pudp: pud pointer for IPA
+ *
+ * Function clears a PUD entry, flushes addr 1st and 2nd stage TLBs.
+ */
+static void stage2_dissolve_pud(struct kvm *kvm, phys_addr_t addr, pud_t *pudp)
+{
+ if (!stage2_pud_huge(kvm, *pudp))
+ return;
+
+ stage2_pud_clear(kvm, pudp);
+ kvm_tlb_flush_vmid_ipa(kvm, addr);
+ put_page(virt_to_page(pudp));
+}
+
static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
int min, int max)
{
@@ -125,7 +129,7 @@
if (cache->nobjs >= min)
return 0;
while (cache->nobjs < max) {
- page = (void *)__get_free_page(PGALLOC_GFP);
+ page = (void *)__get_free_page(GFP_PGTABLE_USER);
if (!page)
return -ENOMEM;
cache->objects[cache->nobjs++] = page;
@@ -150,20 +154,20 @@
static void clear_stage2_pgd_entry(struct kvm *kvm, pgd_t *pgd, phys_addr_t addr)
{
- pud_t *pud_table __maybe_unused = stage2_pud_offset(pgd, 0UL);
- stage2_pgd_clear(pgd);
+ pud_t *pud_table __maybe_unused = stage2_pud_offset(kvm, pgd, 0UL);
+ stage2_pgd_clear(kvm, pgd);
kvm_tlb_flush_vmid_ipa(kvm, addr);
- stage2_pud_free(pud_table);
+ stage2_pud_free(kvm, pud_table);
put_page(virt_to_page(pgd));
}
static void clear_stage2_pud_entry(struct kvm *kvm, pud_t *pud, phys_addr_t addr)
{
- pmd_t *pmd_table __maybe_unused = stage2_pmd_offset(pud, 0);
- VM_BUG_ON(stage2_pud_huge(*pud));
- stage2_pud_clear(pud);
+ pmd_t *pmd_table __maybe_unused = stage2_pmd_offset(kvm, pud, 0);
+ VM_BUG_ON(stage2_pud_huge(kvm, *pud));
+ stage2_pud_clear(kvm, pud);
kvm_tlb_flush_vmid_ipa(kvm, addr);
- stage2_pmd_free(pmd_table);
+ stage2_pmd_free(kvm, pmd_table);
put_page(virt_to_page(pud));
}
@@ -173,7 +177,7 @@
VM_BUG_ON(pmd_thp_or_huge(*pmd));
pmd_clear(pmd);
kvm_tlb_flush_vmid_ipa(kvm, addr);
- pte_free_kernel(NULL, pte_table);
+ free_page((unsigned long)pte_table);
put_page(virt_to_page(pmd));
}
@@ -252,7 +256,7 @@
}
} while (pte++, addr += PAGE_SIZE, addr != end);
- if (stage2_pte_table_empty(start_pte))
+ if (stage2_pte_table_empty(kvm, start_pte))
clear_stage2_pmd_entry(kvm, pmd, start_addr);
}
@@ -262,9 +266,9 @@
phys_addr_t next, start_addr = addr;
pmd_t *pmd, *start_pmd;
- start_pmd = pmd = stage2_pmd_offset(pud, addr);
+ start_pmd = pmd = stage2_pmd_offset(kvm, pud, addr);
do {
- next = stage2_pmd_addr_end(addr, end);
+ next = stage2_pmd_addr_end(kvm, addr, end);
if (!pmd_none(*pmd)) {
if (pmd_thp_or_huge(*pmd)) {
pmd_t old_pmd = *pmd;
@@ -281,7 +285,7 @@
}
} while (pmd++, addr = next, addr != end);
- if (stage2_pmd_table_empty(start_pmd))
+ if (stage2_pmd_table_empty(kvm, start_pmd))
clear_stage2_pud_entry(kvm, pud, start_addr);
}
@@ -291,14 +295,14 @@
phys_addr_t next, start_addr = addr;
pud_t *pud, *start_pud;
- start_pud = pud = stage2_pud_offset(pgd, addr);
+ start_pud = pud = stage2_pud_offset(kvm, pgd, addr);
do {
- next = stage2_pud_addr_end(addr, end);
- if (!stage2_pud_none(*pud)) {
- if (stage2_pud_huge(*pud)) {
+ next = stage2_pud_addr_end(kvm, addr, end);
+ if (!stage2_pud_none(kvm, *pud)) {
+ if (stage2_pud_huge(kvm, *pud)) {
pud_t old_pud = *pud;
- stage2_pud_clear(pud);
+ stage2_pud_clear(kvm, pud);
kvm_tlb_flush_vmid_ipa(kvm, addr);
kvm_flush_dcache_pud(old_pud);
put_page(virt_to_page(pud));
@@ -308,7 +312,7 @@
}
} while (pud++, addr = next, addr != end);
- if (stage2_pud_table_empty(start_pud))
+ if (stage2_pud_table_empty(kvm, start_pud))
clear_stage2_pgd_entry(kvm, pgd, start_addr);
}
@@ -332,7 +336,7 @@
assert_spin_locked(&kvm->mmu_lock);
WARN_ON(size & ~PAGE_MASK);
- pgd = kvm->arch.pgd + stage2_pgd_index(addr);
+ pgd = kvm->arch.pgd + stage2_pgd_index(kvm, addr);
do {
/*
* Make sure the page table is still active, as another thread
@@ -341,8 +345,8 @@
*/
if (!READ_ONCE(kvm->arch.pgd))
break;
- next = stage2_pgd_addr_end(addr, end);
- if (!stage2_pgd_none(*pgd))
+ next = stage2_pgd_addr_end(kvm, addr, end);
+ if (!stage2_pgd_none(kvm, *pgd))
unmap_stage2_puds(kvm, pgd, addr, next);
/*
* If the range is too large, release the kvm->mmu_lock
@@ -371,9 +375,9 @@
pmd_t *pmd;
phys_addr_t next;
- pmd = stage2_pmd_offset(pud, addr);
+ pmd = stage2_pmd_offset(kvm, pud, addr);
do {
- next = stage2_pmd_addr_end(addr, end);
+ next = stage2_pmd_addr_end(kvm, addr, end);
if (!pmd_none(*pmd)) {
if (pmd_thp_or_huge(*pmd))
kvm_flush_dcache_pmd(*pmd);
@@ -389,11 +393,11 @@
pud_t *pud;
phys_addr_t next;
- pud = stage2_pud_offset(pgd, addr);
+ pud = stage2_pud_offset(kvm, pgd, addr);
do {
- next = stage2_pud_addr_end(addr, end);
- if (!stage2_pud_none(*pud)) {
- if (stage2_pud_huge(*pud))
+ next = stage2_pud_addr_end(kvm, addr, end);
+ if (!stage2_pud_none(kvm, *pud)) {
+ if (stage2_pud_huge(kvm, *pud))
kvm_flush_dcache_pud(*pud);
else
stage2_flush_pmds(kvm, pud, addr, next);
@@ -409,10 +413,11 @@
phys_addr_t next;
pgd_t *pgd;
- pgd = kvm->arch.pgd + stage2_pgd_index(addr);
+ pgd = kvm->arch.pgd + stage2_pgd_index(kvm, addr);
do {
- next = stage2_pgd_addr_end(addr, end);
- stage2_flush_puds(kvm, pgd, addr, next);
+ next = stage2_pgd_addr_end(kvm, addr, end);
+ if (!stage2_pgd_none(kvm, *pgd))
+ stage2_flush_puds(kvm, pgd, addr, next);
} while (pgd++, addr = next, addr != end);
}
@@ -607,7 +612,7 @@
addr = start;
do {
pte = pte_offset_kernel(pmd, addr);
- kvm_set_pte(pte, pfn_pte(pfn, prot));
+ kvm_set_pte(pte, kvm_pfn_pte(pfn, prot));
get_page(virt_to_page(pte));
pfn++;
} while (addr += PAGE_SIZE, addr != end);
@@ -628,7 +633,7 @@
BUG_ON(pmd_sect(*pmd));
if (pmd_none(*pmd)) {
- pte = pte_alloc_one_kernel(NULL, addr);
+ pte = pte_alloc_one_kernel(NULL);
if (!pte) {
kvm_err("Cannot allocate Hyp pte\n");
return -ENOMEM;
@@ -880,15 +885,15 @@
* kvm_alloc_stage2_pgd - allocate level-1 table for stage-2 translation.
* @kvm: The KVM struct pointer for the VM.
*
- * Allocates only the stage-2 HW PGD level table(s) (can support either full
- * 40-bit input addresses or limited to 32-bit input addresses). Clears the
- * allocated pages.
+ * Allocates only the stage-2 HW PGD level table(s) of size defined by
+ * stage2_pgd_size(kvm).
*
* Note we don't need locking here as this is only called when the VM is
* created, which can only be done once.
*/
int kvm_alloc_stage2_pgd(struct kvm *kvm)
{
+ phys_addr_t pgd_phys;
pgd_t *pgd;
if (kvm->arch.pgd != NULL) {
@@ -897,11 +902,16 @@
}
/* Allocate the HW PGD, making sure that each page gets its own refcount */
- pgd = alloc_pages_exact(S2_PGD_SIZE, GFP_KERNEL | __GFP_ZERO);
+ pgd = alloc_pages_exact(stage2_pgd_size(kvm), GFP_KERNEL | __GFP_ZERO);
if (!pgd)
return -ENOMEM;
+ pgd_phys = virt_to_phys(pgd);
+ if (WARN_ON(pgd_phys & ~kvm_vttbr_baddr_mask(kvm)))
+ return -EINVAL;
+
kvm->arch.pgd = pgd;
+ kvm->arch.pgd_phys = pgd_phys;
return 0;
}
@@ -986,15 +996,16 @@
spin_lock(&kvm->mmu_lock);
if (kvm->arch.pgd) {
- unmap_stage2_range(kvm, 0, KVM_PHYS_SIZE);
+ unmap_stage2_range(kvm, 0, kvm_phys_size(kvm));
pgd = READ_ONCE(kvm->arch.pgd);
kvm->arch.pgd = NULL;
+ kvm->arch.pgd_phys = 0;
}
spin_unlock(&kvm->mmu_lock);
/* Free the HW pgd, one page at a time */
if (pgd)
- free_pages_exact(pgd, S2_PGD_SIZE);
+ free_pages_exact(pgd, stage2_pgd_size(kvm));
}
static pud_t *stage2_get_pud(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
@@ -1003,16 +1014,16 @@
pgd_t *pgd;
pud_t *pud;
- pgd = kvm->arch.pgd + stage2_pgd_index(addr);
- if (WARN_ON(stage2_pgd_none(*pgd))) {
+ pgd = kvm->arch.pgd + stage2_pgd_index(kvm, addr);
+ if (stage2_pgd_none(kvm, *pgd)) {
if (!cache)
return NULL;
pud = mmu_memory_cache_alloc(cache);
- stage2_pgd_populate(pgd, pud);
+ stage2_pgd_populate(kvm, pgd, pud);
get_page(virt_to_page(pgd));
}
- return stage2_pud_offset(pgd, addr);
+ return stage2_pud_offset(kvm, pgd, addr);
}
static pmd_t *stage2_get_pmd(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
@@ -1022,18 +1033,18 @@
pmd_t *pmd;
pud = stage2_get_pud(kvm, cache, addr);
- if (!pud)
+ if (!pud || stage2_pud_huge(kvm, *pud))
return NULL;
- if (stage2_pud_none(*pud)) {
+ if (stage2_pud_none(kvm, *pud)) {
if (!cache)
return NULL;
pmd = mmu_memory_cache_alloc(cache);
- stage2_pud_populate(pud, pmd);
+ stage2_pud_populate(kvm, pud, pmd);
get_page(virt_to_page(pud));
}
- return stage2_pmd_offset(pud, addr);
+ return stage2_pmd_offset(kvm, pud, addr);
}
static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache
@@ -1041,25 +1052,43 @@
{
pmd_t *pmd, old_pmd;
+retry:
pmd = stage2_get_pmd(kvm, cache, addr);
VM_BUG_ON(!pmd);
old_pmd = *pmd;
+ /*
+ * Multiple vcpus faulting on the same PMD entry, can
+ * lead to them sequentially updating the PMD with the
+ * same value. Following the break-before-make
+ * (pmd_clear() followed by tlb_flush()) process can
+ * hinder forward progress due to refaults generated
+ * on missing translations.
+ *
+ * Skip updating the page table if the entry is
+ * unchanged.
+ */
+ if (pmd_val(old_pmd) == pmd_val(*new_pmd))
+ return 0;
+
if (pmd_present(old_pmd)) {
/*
- * Multiple vcpus faulting on the same PMD entry, can
- * lead to them sequentially updating the PMD with the
- * same value. Following the break-before-make
- * (pmd_clear() followed by tlb_flush()) process can
- * hinder forward progress due to refaults generated
- * on missing translations.
+ * If we already have PTE level mapping for this block,
+ * we must unmap it to avoid inconsistent TLB state and
+ * leaking the table page. We could end up in this situation
+ * if the memory slot was marked for dirty logging and was
+ * reverted, leaving PTE level mappings for the pages accessed
+ * during the period. So, unmap the PTE level mapping for this
+ * block and retry, as we could have released the upper level
+ * table in the process.
*
- * Skip updating the page table if the entry is
- * unchanged.
+ * Normal THP split/merge follows mmu_notifier callbacks and is
+ * handled accordingly.
*/
- if (pmd_val(old_pmd) == pmd_val(*new_pmd))
- return 0;
-
+ if (!pmd_thp_or_huge(old_pmd)) {
+ unmap_stage2_range(kvm, addr & S2_PMD_MASK, S2_PMD_SIZE);
+ goto retry;
+ }
/*
* Mapping in huge pages should only happen through a
* fault. If a page is merged into a transparent huge
@@ -1071,8 +1100,7 @@
* should become splitting first, unmapped, merged,
* and mapped back in on-demand.
*/
- VM_BUG_ON(pmd_pfn(old_pmd) != pmd_pfn(*new_pmd));
-
+ WARN_ON_ONCE(pmd_pfn(old_pmd) != pmd_pfn(*new_pmd));
pmd_clear(pmd);
kvm_tlb_flush_vmid_ipa(kvm, addr);
} else {
@@ -1083,29 +1111,113 @@
return 0;
}
-static bool stage2_is_exec(struct kvm *kvm, phys_addr_t addr)
+static int stage2_set_pud_huge(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
+ phys_addr_t addr, const pud_t *new_pudp)
{
+ pud_t *pudp, old_pud;
+
+retry:
+ pudp = stage2_get_pud(kvm, cache, addr);
+ VM_BUG_ON(!pudp);
+
+ old_pud = *pudp;
+
+ /*
+ * A large number of vcpus faulting on the same stage 2 entry,
+ * can lead to a refault due to the stage2_pud_clear()/tlb_flush().
+ * Skip updating the page tables if there is no change.
+ */
+ if (pud_val(old_pud) == pud_val(*new_pudp))
+ return 0;
+
+ if (stage2_pud_present(kvm, old_pud)) {
+ /*
+ * If we already have table level mapping for this block, unmap
+ * the range for this block and retry.
+ */
+ if (!stage2_pud_huge(kvm, old_pud)) {
+ unmap_stage2_range(kvm, addr & S2_PUD_MASK, S2_PUD_SIZE);
+ goto retry;
+ }
+
+ WARN_ON_ONCE(kvm_pud_pfn(old_pud) != kvm_pud_pfn(*new_pudp));
+ stage2_pud_clear(kvm, pudp);
+ kvm_tlb_flush_vmid_ipa(kvm, addr);
+ } else {
+ get_page(virt_to_page(pudp));
+ }
+
+ kvm_set_pud(pudp, *new_pudp);
+ return 0;
+}
+
+/*
+ * stage2_get_leaf_entry - walk the stage2 VM page tables and return
+ * true if a valid and present leaf-entry is found. A pointer to the
+ * leaf-entry is returned in the appropriate level variable - pudpp,
+ * pmdpp, ptepp.
+ */
+static bool stage2_get_leaf_entry(struct kvm *kvm, phys_addr_t addr,
+ pud_t **pudpp, pmd_t **pmdpp, pte_t **ptepp)
+{
+ pud_t *pudp;
pmd_t *pmdp;
pte_t *ptep;
- pmdp = stage2_get_pmd(kvm, NULL, addr);
+ *pudpp = NULL;
+ *pmdpp = NULL;
+ *ptepp = NULL;
+
+ pudp = stage2_get_pud(kvm, NULL, addr);
+ if (!pudp || stage2_pud_none(kvm, *pudp) || !stage2_pud_present(kvm, *pudp))
+ return false;
+
+ if (stage2_pud_huge(kvm, *pudp)) {
+ *pudpp = pudp;
+ return true;
+ }
+
+ pmdp = stage2_pmd_offset(kvm, pudp, addr);
if (!pmdp || pmd_none(*pmdp) || !pmd_present(*pmdp))
return false;
- if (pmd_thp_or_huge(*pmdp))
- return kvm_s2pmd_exec(pmdp);
+ if (pmd_thp_or_huge(*pmdp)) {
+ *pmdpp = pmdp;
+ return true;
+ }
ptep = pte_offset_kernel(pmdp, addr);
if (!ptep || pte_none(*ptep) || !pte_present(*ptep))
return false;
- return kvm_s2pte_exec(ptep);
+ *ptepp = ptep;
+ return true;
+}
+
+static bool stage2_is_exec(struct kvm *kvm, phys_addr_t addr)
+{
+ pud_t *pudp;
+ pmd_t *pmdp;
+ pte_t *ptep;
+ bool found;
+
+ found = stage2_get_leaf_entry(kvm, addr, &pudp, &pmdp, &ptep);
+ if (!found)
+ return false;
+
+ if (pudp)
+ return kvm_s2pud_exec(pudp);
+ else if (pmdp)
+ return kvm_s2pmd_exec(pmdp);
+ else
+ return kvm_s2pte_exec(ptep);
}
static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
phys_addr_t addr, const pte_t *new_pte,
unsigned long flags)
{
+ pud_t *pud;
pmd_t *pmd;
pte_t *pte, old_pte;
bool iomap = flags & KVM_S2PTE_FLAG_IS_IOMAP;
@@ -1114,7 +1226,31 @@
VM_BUG_ON(logging_active && !cache);
/* Create stage-2 page table mapping - Levels 0 and 1 */
- pmd = stage2_get_pmd(kvm, cache, addr);
+ pud = stage2_get_pud(kvm, cache, addr);
+ if (!pud) {
+ /*
+ * Ignore calls from kvm_set_spte_hva for unallocated
+ * address ranges.
+ */
+ return 0;
+ }
+
+ /*
+ * While dirty page logging - dissolve huge PUD, then continue
+ * on to allocate page.
+ */
+ if (logging_active)
+ stage2_dissolve_pud(kvm, addr, pud);
+
+ if (stage2_pud_none(kvm, *pud)) {
+ if (!cache)
+ return 0; /* ignore calls from kvm_set_spte_hva */
+ pmd = mmu_memory_cache_alloc(cache);
+ stage2_pud_populate(kvm, pud, pmd);
+ get_page(virt_to_page(pud));
+ }
+
+ pmd = stage2_pmd_offset(kvm, pud, addr);
if (!pmd) {
/*
* Ignore calls from kvm_set_spte_hva for unallocated
@@ -1182,6 +1318,11 @@
return stage2_ptep_test_and_clear_young((pte_t *)pmd);
}
+static int stage2_pudp_test_and_clear_young(pud_t *pud)
+{
+ return stage2_ptep_test_and_clear_young((pte_t *)pud);
+}
+
/**
* kvm_phys_addr_ioremap - map a device range to guest IPA
*
@@ -1202,13 +1343,14 @@
pfn = __phys_to_pfn(pa);
for (addr = guest_ipa; addr < end; addr += PAGE_SIZE) {
- pte_t pte = pfn_pte(pfn, PAGE_S2_DEVICE);
+ pte_t pte = kvm_pfn_pte(pfn, PAGE_S2_DEVICE);
if (writable)
pte = kvm_s2pte_mkwrite(pte);
- ret = mmu_topup_memory_cache(&cache, KVM_MMU_CACHE_MIN_PAGES,
- KVM_NR_MEM_OBJS);
+ ret = mmu_topup_memory_cache(&cache,
+ kvm_mmu_cache_min_pages(kvm),
+ KVM_NR_MEM_OBJS);
if (ret)
goto out;
spin_lock(&kvm->mmu_lock);
@@ -1233,7 +1375,7 @@
struct page *page = pfn_to_page(pfn);
/*
- * PageTransCompoungMap() returns true for THP and
+ * PageTransCompoundMap() returns true for THP and
* hugetlbfs. Make sure the adjustment is done only for THP
* pages.
*/
@@ -1273,14 +1415,6 @@
return false;
}
-static bool kvm_is_write_fault(struct kvm_vcpu *vcpu)
-{
- if (kvm_vcpu_trap_is_iabt(vcpu))
- return false;
-
- return kvm_vcpu_dabt_iswrite(vcpu);
-}
-
/**
* stage2_wp_ptes - write protect PMD range
* @pmd: pointer to pmd entry
@@ -1302,19 +1436,21 @@
/**
* stage2_wp_pmds - write protect PUD range
+ * @kvm: kvm instance for the VM
* @pud: pointer to pud entry
* @addr: range start address
* @end: range end address
*/
-static void stage2_wp_pmds(pud_t *pud, phys_addr_t addr, phys_addr_t end)
+static void stage2_wp_pmds(struct kvm *kvm, pud_t *pud,
+ phys_addr_t addr, phys_addr_t end)
{
pmd_t *pmd;
phys_addr_t next;
- pmd = stage2_pmd_offset(pud, addr);
+ pmd = stage2_pmd_offset(kvm, pud, addr);
do {
- next = stage2_pmd_addr_end(addr, end);
+ next = stage2_pmd_addr_end(kvm, addr, end);
if (!pmd_none(*pmd)) {
if (pmd_thp_or_huge(*pmd)) {
if (!kvm_s2pmd_readonly(pmd))
@@ -1327,25 +1463,27 @@
}
/**
- * stage2_wp_puds - write protect PGD range
- * @pgd: pointer to pgd entry
- * @addr: range start address
- * @end: range end address
- *
- * Process PUD entries, for a huge PUD we cause a panic.
- */
-static void stage2_wp_puds(pgd_t *pgd, phys_addr_t addr, phys_addr_t end)
+ * stage2_wp_puds - write protect PGD range
+ * @pgd: pointer to pgd entry
+ * @addr: range start address
+ * @end: range end address
+ */
+static void stage2_wp_puds(struct kvm *kvm, pgd_t *pgd,
+ phys_addr_t addr, phys_addr_t end)
{
pud_t *pud;
phys_addr_t next;
- pud = stage2_pud_offset(pgd, addr);
+ pud = stage2_pud_offset(kvm, pgd, addr);
do {
- next = stage2_pud_addr_end(addr, end);
- if (!stage2_pud_none(*pud)) {
- /* TODO:PUD not supported, revisit later if supported */
- BUG_ON(stage2_pud_huge(*pud));
- stage2_wp_pmds(pud, addr, next);
+ next = stage2_pud_addr_end(kvm, addr, end);
+ if (!stage2_pud_none(kvm, *pud)) {
+ if (stage2_pud_huge(kvm, *pud)) {
+ if (!kvm_s2pud_readonly(pud))
+ kvm_set_s2pud_readonly(pud);
+ } else {
+ stage2_wp_pmds(kvm, pud, addr, next);
+ }
}
} while (pud++, addr = next, addr != end);
}
@@ -1361,7 +1499,7 @@
pgd_t *pgd;
phys_addr_t next;
- pgd = kvm->arch.pgd + stage2_pgd_index(addr);
+ pgd = kvm->arch.pgd + stage2_pgd_index(kvm, addr);
do {
/*
* Release kvm_mmu_lock periodically if the memory region is
@@ -1375,9 +1513,9 @@
cond_resched_lock(&kvm->mmu_lock);
if (!READ_ONCE(kvm->arch.pgd))
break;
- next = stage2_pgd_addr_end(addr, end);
- if (stage2_pgd_present(*pgd))
- stage2_wp_puds(pgd, addr, next);
+ next = stage2_pgd_addr_end(kvm, addr, end);
+ if (stage2_pgd_present(kvm, *pgd))
+ stage2_wp_puds(kvm, pgd, addr, next);
} while (pgd++, addr = next, addr != end);
}
@@ -1388,7 +1526,7 @@
*
* Called to start logging dirty pages after memory region
* KVM_MEM_LOG_DIRTY_PAGES operation is called. After this function returns
- * all present PMD and PTEs are write protected in the memory region.
+ * all present PUD, PMD and PTEs are write protected in the memory region.
* Afterwards read of dirty page log can be called.
*
* Acquires kvm_mmu_lock. Called with kvm->slots_lock mutex acquired,
@@ -1456,20 +1594,71 @@
static void kvm_send_hwpoison_signal(unsigned long address,
struct vm_area_struct *vma)
{
- siginfo_t info;
-
- clear_siginfo(&info);
- info.si_signo = SIGBUS;
- info.si_errno = 0;
- info.si_code = BUS_MCEERR_AR;
- info.si_addr = (void __user *)address;
+ short lsb;
if (is_vm_hugetlb_page(vma))
- info.si_addr_lsb = huge_page_shift(hstate_vma(vma));
+ lsb = huge_page_shift(hstate_vma(vma));
else
- info.si_addr_lsb = PAGE_SHIFT;
+ lsb = PAGE_SHIFT;
- send_sig_info(SIGBUS, &info, current);
+ send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, current);
+}
+
+static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot,
+ unsigned long hva,
+ unsigned long map_size)
+{
+ gpa_t gpa_start;
+ hva_t uaddr_start, uaddr_end;
+ size_t size;
+
+ size = memslot->npages * PAGE_SIZE;
+
+ gpa_start = memslot->base_gfn << PAGE_SHIFT;
+
+ uaddr_start = memslot->userspace_addr;
+ uaddr_end = uaddr_start + size;
+
+ /*
+ * Pages belonging to memslots that don't have the same alignment
+ * within a PMD/PUD for userspace and IPA cannot be mapped with stage-2
+ * PMD/PUD entries, because we'll end up mapping the wrong pages.
+ *
+ * Consider a layout like the following:
+ *
+ * memslot->userspace_addr:
+ * +-----+--------------------+--------------------+---+
+ * |abcde|fgh Stage-1 block | Stage-1 block tv|xyz|
+ * +-----+--------------------+--------------------+---+
+ *
+ * memslot->base_gfn << PAGE_SHIFT:
+ * +---+--------------------+--------------------+-----+
+ * |abc|def Stage-2 block | Stage-2 block |tvxyz|
+ * +---+--------------------+--------------------+-----+
+ *
+ * If we create those stage-2 blocks, we'll end up with this incorrect
+ * mapping:
+ * d -> f
+ * e -> g
+ * f -> h
+ */
+ if ((gpa_start & (map_size - 1)) != (uaddr_start & (map_size - 1)))
+ return false;
+
+ /*
+ * Next, let's make sure we're not trying to map anything not covered
+ * by the memslot. This means we have to prohibit block size mappings
+ * for the beginning and end of a non-block aligned and non-block sized
+ * memory slot (illustrated by the head and tail parts of the
+ * userspace view above containing pages 'abcde' and 'xyz',
+ * respectively).
+ *
+ * Note that it doesn't matter if we do the check using the
+ * userspace_addr or the base_gfn, as both are equally aligned (per
+ * the check above) and equally sized.
+ */
+ return (hva & ~(map_size - 1)) >= uaddr_start &&
+ (hva & ~(map_size - 1)) + map_size <= uaddr_end;
}
static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
@@ -1477,7 +1666,8 @@
unsigned long fault_status)
{
int ret;
- bool write_fault, exec_fault, writable, hugetlb = false, force_pte = false;
+ bool write_fault, writable, force_pte = false;
+ bool exec_fault, needs_exec;
unsigned long mmu_seq;
gfn_t gfn = fault_ipa >> PAGE_SHIFT;
struct kvm *kvm = vcpu->kvm;
@@ -1486,7 +1676,7 @@
kvm_pfn_t pfn;
pgprot_t mem_type = PAGE_S2;
bool logging_active = memslot_is_logging(memslot);
- unsigned long flags = 0;
+ unsigned long vma_pagesize, flags = 0;
write_fault = kvm_is_write_fault(vcpu);
exec_fault = kvm_vcpu_trap_is_iabt(vcpu);
@@ -1506,27 +1696,27 @@
return -EFAULT;
}
- if (vma_kernel_pagesize(vma) == PMD_SIZE && !logging_active) {
- hugetlb = true;
- gfn = (fault_ipa & PMD_MASK) >> PAGE_SHIFT;
- } else {
- /*
- * Pages belonging to memslots that don't have the same
- * alignment for userspace and IPA cannot be mapped using
- * block descriptors even if the pages belong to a THP for
- * the process, because the stage-2 block descriptor will
- * cover more than a single THP and we loose atomicity for
- * unmapping, updates, and splits of the THP or other pages
- * in the stage-2 block range.
- */
- if ((memslot->userspace_addr & ~PMD_MASK) !=
- ((memslot->base_gfn << PAGE_SHIFT) & ~PMD_MASK))
- force_pte = true;
+ vma_pagesize = vma_kernel_pagesize(vma);
+ if (logging_active ||
+ !fault_supports_stage2_huge_mapping(memslot, hva, vma_pagesize)) {
+ force_pte = true;
+ vma_pagesize = PAGE_SIZE;
}
+
+ /*
+ * The stage2 has a minimum of 2 level table (For arm64 see
+ * kvm_arm_setup_stage2()). Hence, we are guaranteed that we can
+ * use PMD_SIZE huge mappings (even when the PMD is folded into PGD).
+ * As for PUD huge maps, we must make sure that we have at least
+ * 3 levels, i.e, PMD is not folded.
+ */
+ if (vma_pagesize == PMD_SIZE ||
+ (vma_pagesize == PUD_SIZE && kvm_stage2_has_pmd(kvm)))
+ gfn = (fault_ipa & huge_page_mask(hstate_vma(vma))) >> PAGE_SHIFT;
up_read(&current->mm->mmap_sem);
/* We need minimum second+third level pages */
- ret = mmu_topup_memory_cache(memcache, KVM_MMU_CACHE_MIN_PAGES,
+ ret = mmu_topup_memory_cache(memcache, kvm_mmu_cache_min_pages(kvm),
KVM_NR_MEM_OBJS);
if (ret)
return ret;
@@ -1560,7 +1750,6 @@
* should not be mapped with huge pages (it introduces churn
* and performance degradation), so force a pte mapping.
*/
- force_pte = true;
flags |= KVM_S2_FLAG_LOGGING_ACTIVE;
/*
@@ -1575,50 +1764,73 @@
if (mmu_notifier_retry(kvm, mmu_seq))
goto out_unlock;
- if (!hugetlb && !force_pte)
- hugetlb = transparent_hugepage_adjust(&pfn, &fault_ipa);
+ if (vma_pagesize == PAGE_SIZE && !force_pte) {
+ /*
+ * Only PMD_SIZE transparent hugepages(THP) are
+ * currently supported. This code will need to be
+ * updated to support other THP sizes.
+ *
+ * Make sure the host VA and the guest IPA are sufficiently
+ * aligned and that the block is contained within the memslot.
+ */
+ if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE) &&
+ transparent_hugepage_adjust(&pfn, &fault_ipa))
+ vma_pagesize = PMD_SIZE;
+ }
- if (hugetlb) {
- pmd_t new_pmd = pfn_pmd(pfn, mem_type);
- new_pmd = pmd_mkhuge(new_pmd);
- if (writable) {
+ if (writable)
+ kvm_set_pfn_dirty(pfn);
+
+ if (fault_status != FSC_PERM)
+ clean_dcache_guest_page(pfn, vma_pagesize);
+
+ if (exec_fault)
+ invalidate_icache_guest_page(pfn, vma_pagesize);
+
+ /*
+ * If we took an execution fault we have made the
+ * icache/dcache coherent above and should now let the s2
+ * mapping be executable.
+ *
+ * Write faults (!exec_fault && FSC_PERM) are orthogonal to
+ * execute permissions, and we preserve whatever we have.
+ */
+ needs_exec = exec_fault ||
+ (fault_status == FSC_PERM && stage2_is_exec(kvm, fault_ipa));
+
+ if (vma_pagesize == PUD_SIZE) {
+ pud_t new_pud = kvm_pfn_pud(pfn, mem_type);
+
+ new_pud = kvm_pud_mkhuge(new_pud);
+ if (writable)
+ new_pud = kvm_s2pud_mkwrite(new_pud);
+
+ if (needs_exec)
+ new_pud = kvm_s2pud_mkexec(new_pud);
+
+ ret = stage2_set_pud_huge(kvm, memcache, fault_ipa, &new_pud);
+ } else if (vma_pagesize == PMD_SIZE) {
+ pmd_t new_pmd = kvm_pfn_pmd(pfn, mem_type);
+
+ new_pmd = kvm_pmd_mkhuge(new_pmd);
+
+ if (writable)
new_pmd = kvm_s2pmd_mkwrite(new_pmd);
- kvm_set_pfn_dirty(pfn);
- }
- if (fault_status != FSC_PERM)
- clean_dcache_guest_page(pfn, PMD_SIZE);
-
- if (exec_fault) {
+ if (needs_exec)
new_pmd = kvm_s2pmd_mkexec(new_pmd);
- invalidate_icache_guest_page(pfn, PMD_SIZE);
- } else if (fault_status == FSC_PERM) {
- /* Preserve execute if XN was already cleared */
- if (stage2_is_exec(kvm, fault_ipa))
- new_pmd = kvm_s2pmd_mkexec(new_pmd);
- }
ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd);
} else {
- pte_t new_pte = pfn_pte(pfn, mem_type);
+ pte_t new_pte = kvm_pfn_pte(pfn, mem_type);
if (writable) {
new_pte = kvm_s2pte_mkwrite(new_pte);
- kvm_set_pfn_dirty(pfn);
mark_page_dirty(kvm, gfn);
}
- if (fault_status != FSC_PERM)
- clean_dcache_guest_page(pfn, PAGE_SIZE);
-
- if (exec_fault) {
+ if (needs_exec)
new_pte = kvm_s2pte_mkexec(new_pte);
- invalidate_icache_guest_page(pfn, PAGE_SIZE);
- } else if (fault_status == FSC_PERM) {
- /* Preserve execute if XN was already cleared */
- if (stage2_is_exec(kvm, fault_ipa))
- new_pte = kvm_s2pte_mkexec(new_pte);
- }
ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, flags);
}
@@ -1639,6 +1851,7 @@
*/
static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
{
+ pud_t *pud;
pmd_t *pmd;
pte_t *pte;
kvm_pfn_t pfn;
@@ -1648,24 +1861,23 @@
spin_lock(&vcpu->kvm->mmu_lock);
- pmd = stage2_get_pmd(vcpu->kvm, NULL, fault_ipa);
- if (!pmd || pmd_none(*pmd)) /* Nothing there */
+ if (!stage2_get_leaf_entry(vcpu->kvm, fault_ipa, &pud, &pmd, &pte))
goto out;
- if (pmd_thp_or_huge(*pmd)) { /* THP, HugeTLB */
+ if (pud) { /* HugeTLB */
+ *pud = kvm_s2pud_mkyoung(*pud);
+ pfn = kvm_pud_pfn(*pud);
+ pfn_valid = true;
+ } else if (pmd) { /* THP, HugeTLB */
*pmd = pmd_mkyoung(*pmd);
pfn = pmd_pfn(*pmd);
pfn_valid = true;
- goto out;
+ } else {
+ *pte = pte_mkyoung(*pte); /* Just a page... */
+ pfn = pte_pfn(*pte);
+ pfn_valid = true;
}
- pte = pte_offset_kernel(pmd, fault_ipa);
- if (pte_none(*pte)) /* Nothing there either */
- goto out;
-
- *pte = pte_mkyoung(*pte); /* Just a page... */
- pfn = pte_pfn(*pte);
- pfn_valid = true;
out:
spin_unlock(&vcpu->kvm->mmu_lock);
if (pfn_valid)
@@ -1705,7 +1917,7 @@
* For RAS the host kernel may handle this abort.
* There is no need to pass the error into the guest.
*/
- if (!handle_guest_sea(fault_ipa, kvm_vcpu_get_hsr(vcpu)))
+ if (!kvm_handle_guest_sea(fault_ipa, kvm_vcpu_get_hsr(vcpu)))
return 1;
if (unlikely(!is_iabt)) {
@@ -1769,7 +1981,7 @@
}
/* Userspace should not be able to register out-of-bounds IPAs */
- VM_BUG_ON(fault_ipa >= KVM_PHYS_SIZE);
+ VM_BUG_ON(fault_ipa >= kvm_phys_size(vcpu->kvm));
if (fault_status == FSC_ACCESS) {
handle_access_fault(vcpu, fault_ipa);
@@ -1851,14 +2063,14 @@
}
-void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
+int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
{
unsigned long end = hva + PAGE_SIZE;
kvm_pfn_t pfn = pte_pfn(pte);
pte_t stage2_pte;
if (!kvm->arch.pgd)
- return;
+ return 0;
trace_kvm_set_spte_hva(hva);
@@ -1867,48 +2079,46 @@
* just like a translation fault and clean the cache to the PoC.
*/
clean_dcache_guest_page(pfn, PAGE_SIZE);
- stage2_pte = pfn_pte(pfn, PAGE_S2);
+ stage2_pte = kvm_pfn_pte(pfn, PAGE_S2);
handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &stage2_pte);
+
+ return 0;
}
static int kvm_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
{
+ pud_t *pud;
pmd_t *pmd;
pte_t *pte;
- WARN_ON(size != PAGE_SIZE && size != PMD_SIZE);
- pmd = stage2_get_pmd(kvm, NULL, gpa);
- if (!pmd || pmd_none(*pmd)) /* Nothing there */
+ WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
+ if (!stage2_get_leaf_entry(kvm, gpa, &pud, &pmd, &pte))
return 0;
- if (pmd_thp_or_huge(*pmd)) /* THP, HugeTLB */
+ if (pud)
+ return stage2_pudp_test_and_clear_young(pud);
+ else if (pmd)
return stage2_pmdp_test_and_clear_young(pmd);
-
- pte = pte_offset_kernel(pmd, gpa);
- if (pte_none(*pte))
- return 0;
-
- return stage2_ptep_test_and_clear_young(pte);
+ else
+ return stage2_ptep_test_and_clear_young(pte);
}
static int kvm_test_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
{
+ pud_t *pud;
pmd_t *pmd;
pte_t *pte;
- WARN_ON(size != PAGE_SIZE && size != PMD_SIZE);
- pmd = stage2_get_pmd(kvm, NULL, gpa);
- if (!pmd || pmd_none(*pmd)) /* Nothing there */
+ WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
+ if (!stage2_get_leaf_entry(kvm, gpa, &pud, &pmd, &pte))
return 0;
- if (pmd_thp_or_huge(*pmd)) /* THP, HugeTLB */
+ if (pud)
+ return kvm_s2pud_young(*pud);
+ else if (pmd)
return pmd_young(*pmd);
-
- pte = pte_offset_kernel(pmd, gpa);
- if (!pte_none(*pte)) /* Just a page... */
+ else
return pte_young(*pte);
-
- return 0;
}
int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
@@ -2068,7 +2278,7 @@
* space addressable by the KVM guest IPA space.
*/
if (memslot->base_gfn + memslot->npages >=
- (KVM_PHYS_SIZE >> PAGE_SHIFT))
+ (kvm_phys_size(kvm) >> PAGE_SHIFT))
return -EFAULT;
down_read(&current->mm->mmap_sem);
@@ -2154,7 +2364,7 @@
return 0;
}
-void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslots *slots)
+void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
{
}