From 83e48c398825ba9ee71a0b0426f95c519cb5db0a Mon Sep 17 00:00:00 2001 From: Fangyu Yu Date: Thu, 2 Apr 2026 21:23:00 +0800 Subject: [PATCH 1/4] RISC-V: KVM: Support runtime configuration for per-VM's HGATP mode Introduces one per-VM architecture-specific fields to support runtime configuration of the G-stage page table format: - kvm->arch.pgd_levels: the corresponding number of page table levels for the selected mode. These fields replace the previous global variables kvm_riscv_gstage_mode and kvm_riscv_gstage_pgd_levels, enabling different virtual machines to independently select their G-stage page table format instead of being forced to share the maximum mode detected by the kernel at boot time. Signed-off-by: Fangyu Yu Reviewed-by: Andrew Jones Reviewed-by: Anup Patel Reviewed-by: Guo Ren Signed-off-by: Linux RISC-V bot --- arch/riscv/include/asm/kvm_gstage.h | 37 ++++++++++++---- arch/riscv/include/asm/kvm_host.h | 1 + arch/riscv/kvm/gstage.c | 65 ++++++++++++++--------------- arch/riscv/kvm/main.c | 12 +++--- arch/riscv/kvm/mmu.c | 20 +++++---- arch/riscv/kvm/vm.c | 2 +- arch/riscv/kvm/vmid.c | 3 +- 7 files changed, 83 insertions(+), 57 deletions(-) diff --git a/arch/riscv/include/asm/kvm_gstage.h b/arch/riscv/include/asm/kvm_gstage.h index 595e2183173ebd..5aa58d1f692aca 100644 --- a/arch/riscv/include/asm/kvm_gstage.h +++ b/arch/riscv/include/asm/kvm_gstage.h @@ -29,16 +29,22 @@ struct kvm_gstage_mapping { #define kvm_riscv_gstage_index_bits 10 #endif -extern unsigned long kvm_riscv_gstage_mode; -extern unsigned long kvm_riscv_gstage_pgd_levels; +extern unsigned long kvm_riscv_gstage_max_pgd_levels; #define kvm_riscv_gstage_pgd_xbits 2 #define kvm_riscv_gstage_pgd_size (1UL << (HGATP_PAGE_SHIFT + kvm_riscv_gstage_pgd_xbits)) -#define kvm_riscv_gstage_gpa_bits (HGATP_PAGE_SHIFT + \ - (kvm_riscv_gstage_pgd_levels * \ - kvm_riscv_gstage_index_bits) + \ - kvm_riscv_gstage_pgd_xbits) -#define kvm_riscv_gstage_gpa_size ((gpa_t)(1ULL << kvm_riscv_gstage_gpa_bits)) + +static inline unsigned long kvm_riscv_gstage_gpa_bits(unsigned long pgd_levels) +{ + return (HGATP_PAGE_SHIFT + + pgd_levels * kvm_riscv_gstage_index_bits + + kvm_riscv_gstage_pgd_xbits); +} + +static inline gpa_t kvm_riscv_gstage_gpa_size(unsigned long pgd_levels) +{ + return BIT_ULL(kvm_riscv_gstage_gpa_bits(pgd_levels)); +} bool kvm_riscv_gstage_get_leaf(struct kvm_gstage *gstage, gpa_t addr, pte_t **ptepp, u32 *ptep_level); @@ -69,4 +75,21 @@ void kvm_riscv_gstage_wp_range(struct kvm_gstage *gstage, gpa_t start, gpa_t end void kvm_riscv_gstage_mode_detect(void); +static inline unsigned long kvm_riscv_gstage_mode(unsigned long pgd_levels) +{ + switch (pgd_levels) { + case 2: + return HGATP_MODE_SV32X4; + case 3: + return HGATP_MODE_SV39X4; + case 4: + return HGATP_MODE_SV48X4; + case 5: + return HGATP_MODE_SV57X4; + default: + WARN_ON_ONCE(1); + return HGATP_MODE_OFF; + } +} + #endif diff --git a/arch/riscv/include/asm/kvm_host.h b/arch/riscv/include/asm/kvm_host.h index 24585304c02b14..478f699e9decbd 100644 --- a/arch/riscv/include/asm/kvm_host.h +++ b/arch/riscv/include/asm/kvm_host.h @@ -94,6 +94,7 @@ struct kvm_arch { /* G-stage page table */ pgd_t *pgd; phys_addr_t pgd_phys; + unsigned long pgd_levels; /* Guest Timer */ struct kvm_guest_timer timer; diff --git a/arch/riscv/kvm/gstage.c b/arch/riscv/kvm/gstage.c index b67d60d722c2fb..4beb9322fe7682 100644 --- a/arch/riscv/kvm/gstage.c +++ b/arch/riscv/kvm/gstage.c @@ -12,22 +12,21 @@ #include #ifdef CONFIG_64BIT -unsigned long kvm_riscv_gstage_mode __ro_after_init = HGATP_MODE_SV39X4; -unsigned long kvm_riscv_gstage_pgd_levels __ro_after_init = 3; +unsigned long kvm_riscv_gstage_max_pgd_levels __ro_after_init = 3; #else -unsigned long kvm_riscv_gstage_mode __ro_after_init = HGATP_MODE_SV32X4; -unsigned long kvm_riscv_gstage_pgd_levels __ro_after_init = 2; +unsigned long kvm_riscv_gstage_max_pgd_levels __ro_after_init = 2; #endif #define gstage_pte_leaf(__ptep) \ (pte_val(*(__ptep)) & (_PAGE_READ | _PAGE_WRITE | _PAGE_EXEC)) -static inline unsigned long gstage_pte_index(gpa_t addr, u32 level) +static inline unsigned long gstage_pte_index(struct kvm_gstage *gstage, + gpa_t addr, u32 level) { unsigned long mask; unsigned long shift = HGATP_PAGE_SHIFT + (kvm_riscv_gstage_index_bits * level); - if (level == (kvm_riscv_gstage_pgd_levels - 1)) + if (level == gstage->kvm->arch.pgd_levels - 1) mask = (PTRS_PER_PTE * (1UL << kvm_riscv_gstage_pgd_xbits)) - 1; else mask = PTRS_PER_PTE - 1; @@ -40,12 +39,13 @@ static inline unsigned long gstage_pte_page_vaddr(pte_t pte) return (unsigned long)pfn_to_virt(__page_val_to_pfn(pte_val(pte))); } -static int gstage_page_size_to_level(unsigned long page_size, u32 *out_level) +static int gstage_page_size_to_level(struct kvm_gstage *gstage, unsigned long page_size, + u32 *out_level) { u32 i; unsigned long psz = 1UL << 12; - for (i = 0; i < kvm_riscv_gstage_pgd_levels; i++) { + for (i = 0; i < gstage->kvm->arch.pgd_levels; i++) { if (page_size == (psz << (i * kvm_riscv_gstage_index_bits))) { *out_level = i; return 0; @@ -55,21 +55,23 @@ static int gstage_page_size_to_level(unsigned long page_size, u32 *out_level) return -EINVAL; } -static int gstage_level_to_page_order(u32 level, unsigned long *out_pgorder) +static int gstage_level_to_page_order(struct kvm_gstage *gstage, u32 level, + unsigned long *out_pgorder) { - if (kvm_riscv_gstage_pgd_levels < level) + if (gstage->kvm->arch.pgd_levels < level) return -EINVAL; *out_pgorder = 12 + (level * kvm_riscv_gstage_index_bits); return 0; } -static int gstage_level_to_page_size(u32 level, unsigned long *out_pgsize) +static int gstage_level_to_page_size(struct kvm_gstage *gstage, u32 level, + unsigned long *out_pgsize) { int rc; unsigned long page_order = PAGE_SHIFT; - rc = gstage_level_to_page_order(level, &page_order); + rc = gstage_level_to_page_order(gstage, level, &page_order); if (rc) return rc; @@ -81,11 +83,11 @@ bool kvm_riscv_gstage_get_leaf(struct kvm_gstage *gstage, gpa_t addr, pte_t **ptepp, u32 *ptep_level) { pte_t *ptep; - u32 current_level = kvm_riscv_gstage_pgd_levels - 1; + u32 current_level = gstage->kvm->arch.pgd_levels - 1; *ptep_level = current_level; ptep = (pte_t *)gstage->pgd; - ptep = &ptep[gstage_pte_index(addr, current_level)]; + ptep = &ptep[gstage_pte_index(gstage, addr, current_level)]; while (ptep && pte_val(ptep_get(ptep))) { if (gstage_pte_leaf(ptep)) { *ptep_level = current_level; @@ -97,7 +99,7 @@ bool kvm_riscv_gstage_get_leaf(struct kvm_gstage *gstage, gpa_t addr, current_level--; *ptep_level = current_level; ptep = (pte_t *)gstage_pte_page_vaddr(ptep_get(ptep)); - ptep = &ptep[gstage_pte_index(addr, current_level)]; + ptep = &ptep[gstage_pte_index(gstage, addr, current_level)]; } else { ptep = NULL; } @@ -110,7 +112,7 @@ static void gstage_tlb_flush(struct kvm_gstage *gstage, u32 level, gpa_t addr) { unsigned long order = PAGE_SHIFT; - if (gstage_level_to_page_order(level, &order)) + if (gstage_level_to_page_order(gstage, level, &order)) return; addr &= ~(BIT(order) - 1); @@ -125,9 +127,9 @@ int kvm_riscv_gstage_set_pte(struct kvm_gstage *gstage, struct kvm_mmu_memory_cache *pcache, const struct kvm_gstage_mapping *map) { - u32 current_level = kvm_riscv_gstage_pgd_levels - 1; + u32 current_level = gstage->kvm->arch.pgd_levels - 1; pte_t *next_ptep = (pte_t *)gstage->pgd; - pte_t *ptep = &next_ptep[gstage_pte_index(map->addr, current_level)]; + pte_t *ptep = &next_ptep[gstage_pte_index(gstage, map->addr, current_level)]; if (current_level < map->level) return -EINVAL; @@ -151,7 +153,7 @@ int kvm_riscv_gstage_set_pte(struct kvm_gstage *gstage, } current_level--; - ptep = &next_ptep[gstage_pte_index(map->addr, current_level)]; + ptep = &next_ptep[gstage_pte_index(gstage, map->addr, current_level)]; } if (pte_val(*ptep) != pte_val(map->pte)) { @@ -175,7 +177,7 @@ int kvm_riscv_gstage_map_page(struct kvm_gstage *gstage, out_map->addr = gpa; out_map->level = 0; - ret = gstage_page_size_to_level(page_size, &out_map->level); + ret = gstage_page_size_to_level(gstage, page_size, &out_map->level); if (ret) return ret; @@ -217,7 +219,7 @@ void kvm_riscv_gstage_op_pte(struct kvm_gstage *gstage, gpa_t addr, u32 next_ptep_level; unsigned long next_page_size, page_size; - ret = gstage_level_to_page_size(ptep_level, &page_size); + ret = gstage_level_to_page_size(gstage, ptep_level, &page_size); if (ret) return; @@ -229,7 +231,7 @@ void kvm_riscv_gstage_op_pte(struct kvm_gstage *gstage, gpa_t addr, if (ptep_level && !gstage_pte_leaf(ptep)) { next_ptep = (pte_t *)gstage_pte_page_vaddr(ptep_get(ptep)); next_ptep_level = ptep_level - 1; - ret = gstage_level_to_page_size(next_ptep_level, &next_page_size); + ret = gstage_level_to_page_size(gstage, next_ptep_level, &next_page_size); if (ret) return; @@ -263,7 +265,7 @@ void kvm_riscv_gstage_unmap_range(struct kvm_gstage *gstage, while (addr < end) { found_leaf = kvm_riscv_gstage_get_leaf(gstage, addr, &ptep, &ptep_level); - ret = gstage_level_to_page_size(ptep_level, &page_size); + ret = gstage_level_to_page_size(gstage, ptep_level, &page_size); if (ret) break; @@ -297,7 +299,7 @@ void kvm_riscv_gstage_wp_range(struct kvm_gstage *gstage, gpa_t start, gpa_t end while (addr < end) { found_leaf = kvm_riscv_gstage_get_leaf(gstage, addr, &ptep, &ptep_level); - ret = gstage_level_to_page_size(ptep_level, &page_size); + ret = gstage_level_to_page_size(gstage, ptep_level, &page_size); if (ret) break; @@ -319,39 +321,34 @@ void __init kvm_riscv_gstage_mode_detect(void) /* Try Sv57x4 G-stage mode */ csr_write(CSR_HGATP, HGATP_MODE_SV57X4 << HGATP_MODE_SHIFT); if ((csr_read(CSR_HGATP) >> HGATP_MODE_SHIFT) == HGATP_MODE_SV57X4) { - kvm_riscv_gstage_mode = HGATP_MODE_SV57X4; - kvm_riscv_gstage_pgd_levels = 5; + kvm_riscv_gstage_max_pgd_levels = 5; goto done; } /* Try Sv48x4 G-stage mode */ csr_write(CSR_HGATP, HGATP_MODE_SV48X4 << HGATP_MODE_SHIFT); if ((csr_read(CSR_HGATP) >> HGATP_MODE_SHIFT) == HGATP_MODE_SV48X4) { - kvm_riscv_gstage_mode = HGATP_MODE_SV48X4; - kvm_riscv_gstage_pgd_levels = 4; + kvm_riscv_gstage_max_pgd_levels = 4; goto done; } /* Try Sv39x4 G-stage mode */ csr_write(CSR_HGATP, HGATP_MODE_SV39X4 << HGATP_MODE_SHIFT); if ((csr_read(CSR_HGATP) >> HGATP_MODE_SHIFT) == HGATP_MODE_SV39X4) { - kvm_riscv_gstage_mode = HGATP_MODE_SV39X4; - kvm_riscv_gstage_pgd_levels = 3; + kvm_riscv_gstage_max_pgd_levels = 3; goto done; } #else /* CONFIG_32BIT */ /* Try Sv32x4 G-stage mode */ csr_write(CSR_HGATP, HGATP_MODE_SV32X4 << HGATP_MODE_SHIFT); if ((csr_read(CSR_HGATP) >> HGATP_MODE_SHIFT) == HGATP_MODE_SV32X4) { - kvm_riscv_gstage_mode = HGATP_MODE_SV32X4; - kvm_riscv_gstage_pgd_levels = 2; + kvm_riscv_gstage_max_pgd_levels = 2; goto done; } #endif /* KVM depends on !HGATP_MODE_OFF */ - kvm_riscv_gstage_mode = HGATP_MODE_OFF; - kvm_riscv_gstage_pgd_levels = 0; + kvm_riscv_gstage_max_pgd_levels = 0; done: csr_write(CSR_HGATP, 0); diff --git a/arch/riscv/kvm/main.c b/arch/riscv/kvm/main.c index 0f3fe3986fc02e..90ee0a032b9a3c 100644 --- a/arch/riscv/kvm/main.c +++ b/arch/riscv/kvm/main.c @@ -105,17 +105,17 @@ static int __init riscv_kvm_init(void) return rc; kvm_riscv_gstage_mode_detect(); - switch (kvm_riscv_gstage_mode) { - case HGATP_MODE_SV32X4: + switch (kvm_riscv_gstage_max_pgd_levels) { + case 2: str = "Sv32x4"; break; - case HGATP_MODE_SV39X4: + case 3: str = "Sv39x4"; break; - case HGATP_MODE_SV48X4: + case 4: str = "Sv48x4"; break; - case HGATP_MODE_SV57X4: + case 5: str = "Sv57x4"; break; default: @@ -164,7 +164,7 @@ static int __init riscv_kvm_init(void) (rc) ? slist : "no features"); } - kvm_info("using %s G-stage page table format\n", str); + kvm_info("highest G-stage page table mode is %s\n", str); kvm_info("VMID %ld bits available\n", kvm_riscv_gstage_vmid_bits()); diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c index 088d33ba90edaf..fbcdd75cb9af04 100644 --- a/arch/riscv/kvm/mmu.c +++ b/arch/riscv/kvm/mmu.c @@ -67,7 +67,7 @@ int kvm_riscv_mmu_ioremap(struct kvm *kvm, gpa_t gpa, phys_addr_t hpa, if (!writable) map.pte = pte_wrprotect(map.pte); - ret = kvm_mmu_topup_memory_cache(&pcache, kvm_riscv_gstage_pgd_levels); + ret = kvm_mmu_topup_memory_cache(&pcache, kvm->arch.pgd_levels); if (ret) goto out; @@ -186,7 +186,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, * space addressable by the KVM guest GPA space. */ if ((new->base_gfn + new->npages) >= - (kvm_riscv_gstage_gpa_size >> PAGE_SHIFT)) + kvm_riscv_gstage_gpa_size(kvm->arch.pgd_levels) >> PAGE_SHIFT) return -EFAULT; hva = new->userspace_addr; @@ -472,7 +472,7 @@ int kvm_riscv_mmu_map(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot, memset(out_map, 0, sizeof(*out_map)); /* We need minimum second+third level pages */ - ret = kvm_mmu_topup_memory_cache(pcache, kvm_riscv_gstage_pgd_levels); + ret = kvm_mmu_topup_memory_cache(pcache, kvm->arch.pgd_levels); if (ret) { kvm_err("Failed to topup G-stage cache\n"); return ret; @@ -575,6 +575,7 @@ int kvm_riscv_mmu_alloc_pgd(struct kvm *kvm) return -ENOMEM; kvm->arch.pgd = page_to_virt(pgd_page); kvm->arch.pgd_phys = page_to_phys(pgd_page); + kvm->arch.pgd_levels = kvm_riscv_gstage_max_pgd_levels; return 0; } @@ -590,10 +591,12 @@ void kvm_riscv_mmu_free_pgd(struct kvm *kvm) gstage.flags = 0; gstage.vmid = READ_ONCE(kvm->arch.vmid.vmid); gstage.pgd = kvm->arch.pgd; - kvm_riscv_gstage_unmap_range(&gstage, 0UL, kvm_riscv_gstage_gpa_size, false); + kvm_riscv_gstage_unmap_range(&gstage, 0UL, + kvm_riscv_gstage_gpa_size(kvm->arch.pgd_levels), false); pgd = READ_ONCE(kvm->arch.pgd); kvm->arch.pgd = NULL; kvm->arch.pgd_phys = 0; + kvm->arch.pgd_levels = 0; } spin_unlock(&kvm->mmu_lock); @@ -603,11 +606,12 @@ void kvm_riscv_mmu_free_pgd(struct kvm *kvm) void kvm_riscv_mmu_update_hgatp(struct kvm_vcpu *vcpu) { - unsigned long hgatp = kvm_riscv_gstage_mode << HGATP_MODE_SHIFT; - struct kvm_arch *k = &vcpu->kvm->arch; + struct kvm_arch *ka = &vcpu->kvm->arch; + unsigned long hgatp = kvm_riscv_gstage_mode(ka->pgd_levels) + << HGATP_MODE_SHIFT; - hgatp |= (READ_ONCE(k->vmid.vmid) << HGATP_VMID_SHIFT) & HGATP_VMID; - hgatp |= (k->pgd_phys >> PAGE_SHIFT) & HGATP_PPN; + hgatp |= (READ_ONCE(ka->vmid.vmid) << HGATP_VMID_SHIFT) & HGATP_VMID; + hgatp |= (ka->pgd_phys >> PAGE_SHIFT) & HGATP_PPN; ncsr_write(CSR_HGATP, hgatp); diff --git a/arch/riscv/kvm/vm.c b/arch/riscv/kvm/vm.c index 13c63ae1a78b25..4d82a886102c64 100644 --- a/arch/riscv/kvm/vm.c +++ b/arch/riscv/kvm/vm.c @@ -199,7 +199,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = KVM_USER_MEM_SLOTS; break; case KVM_CAP_VM_GPA_BITS: - r = kvm_riscv_gstage_gpa_bits; + r = kvm_riscv_gstage_gpa_bits(kvm->arch.pgd_levels); break; default: r = 0; diff --git a/arch/riscv/kvm/vmid.c b/arch/riscv/kvm/vmid.c index cf34d448289d79..c15bdb1dd8bef0 100644 --- a/arch/riscv/kvm/vmid.c +++ b/arch/riscv/kvm/vmid.c @@ -26,7 +26,8 @@ static DEFINE_SPINLOCK(vmid_lock); void __init kvm_riscv_gstage_vmid_detect(void) { /* Figure-out number of VMID bits in HW */ - csr_write(CSR_HGATP, (kvm_riscv_gstage_mode << HGATP_MODE_SHIFT) | HGATP_VMID); + csr_write(CSR_HGATP, (kvm_riscv_gstage_mode(kvm_riscv_gstage_max_pgd_levels) << + HGATP_MODE_SHIFT) | HGATP_VMID); vmid_bits = csr_read(CSR_HGATP); vmid_bits = (vmid_bits & HGATP_VMID) >> HGATP_VMID_SHIFT; vmid_bits = fls_long(vmid_bits); From b1d91342b07c5835d314351441d85e083b264e79 Mon Sep 17 00:00:00 2001 From: Fangyu Yu Date: Thu, 2 Apr 2026 21:23:01 +0800 Subject: [PATCH 2/4] RISC-V: KVM: Cache gstage pgd_levels in struct kvm_gstage Gstage page-table helpers frequently chase gstage->kvm->arch to fetch pgd_levels. This adds noise and repeats the same dereference chain in hot paths. Add pgd_levels to struct kvm_gstage and initialize it from kvm->arch when setting up a gstage instance. Introduce kvm_riscv_gstage_init() to centralize initialization and switch gstage code to use gstage->pgd_levels. Suggested-by: Anup Patel Signed-off-by: Fangyu Yu Reviewed-by: Anup Patel Signed-off-by: Linux RISC-V bot --- arch/riscv/include/asm/kvm_gstage.h | 10 ++++++ arch/riscv/kvm/gstage.c | 10 +++--- arch/riscv/kvm/mmu.c | 50 ++++++----------------------- 3 files changed, 25 insertions(+), 45 deletions(-) diff --git a/arch/riscv/include/asm/kvm_gstage.h b/arch/riscv/include/asm/kvm_gstage.h index 5aa58d1f692aca..70d9d483365ee4 100644 --- a/arch/riscv/include/asm/kvm_gstage.h +++ b/arch/riscv/include/asm/kvm_gstage.h @@ -15,6 +15,7 @@ struct kvm_gstage { #define KVM_GSTAGE_FLAGS_LOCAL BIT(0) unsigned long vmid; pgd_t *pgd; + unsigned long pgd_levels; }; struct kvm_gstage_mapping { @@ -92,4 +93,13 @@ static inline unsigned long kvm_riscv_gstage_mode(unsigned long pgd_levels) } } +static inline void kvm_riscv_gstage_init(struct kvm_gstage *gstage, struct kvm *kvm) +{ + gstage->kvm = kvm; + gstage->flags = 0; + gstage->vmid = READ_ONCE(kvm->arch.vmid.vmid); + gstage->pgd = kvm->arch.pgd; + gstage->pgd_levels = kvm->arch.pgd_levels; +} + #endif diff --git a/arch/riscv/kvm/gstage.c b/arch/riscv/kvm/gstage.c index 4beb9322fe7682..7c4c34bc191b8a 100644 --- a/arch/riscv/kvm/gstage.c +++ b/arch/riscv/kvm/gstage.c @@ -26,7 +26,7 @@ static inline unsigned long gstage_pte_index(struct kvm_gstage *gstage, unsigned long mask; unsigned long shift = HGATP_PAGE_SHIFT + (kvm_riscv_gstage_index_bits * level); - if (level == gstage->kvm->arch.pgd_levels - 1) + if (level == gstage->pgd_levels - 1) mask = (PTRS_PER_PTE * (1UL << kvm_riscv_gstage_pgd_xbits)) - 1; else mask = PTRS_PER_PTE - 1; @@ -45,7 +45,7 @@ static int gstage_page_size_to_level(struct kvm_gstage *gstage, unsigned long pa u32 i; unsigned long psz = 1UL << 12; - for (i = 0; i < gstage->kvm->arch.pgd_levels; i++) { + for (i = 0; i < gstage->pgd_levels; i++) { if (page_size == (psz << (i * kvm_riscv_gstage_index_bits))) { *out_level = i; return 0; @@ -58,7 +58,7 @@ static int gstage_page_size_to_level(struct kvm_gstage *gstage, unsigned long pa static int gstage_level_to_page_order(struct kvm_gstage *gstage, u32 level, unsigned long *out_pgorder) { - if (gstage->kvm->arch.pgd_levels < level) + if (gstage->pgd_levels < level) return -EINVAL; *out_pgorder = 12 + (level * kvm_riscv_gstage_index_bits); @@ -83,7 +83,7 @@ bool kvm_riscv_gstage_get_leaf(struct kvm_gstage *gstage, gpa_t addr, pte_t **ptepp, u32 *ptep_level) { pte_t *ptep; - u32 current_level = gstage->kvm->arch.pgd_levels - 1; + u32 current_level = gstage->pgd_levels - 1; *ptep_level = current_level; ptep = (pte_t *)gstage->pgd; @@ -127,7 +127,7 @@ int kvm_riscv_gstage_set_pte(struct kvm_gstage *gstage, struct kvm_mmu_memory_cache *pcache, const struct kvm_gstage_mapping *map) { - u32 current_level = gstage->kvm->arch.pgd_levels - 1; + u32 current_level = gstage->pgd_levels - 1; pte_t *next_ptep = (pte_t *)gstage->pgd; pte_t *ptep = &next_ptep[gstage_pte_index(gstage, map->addr, current_level)]; diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c index fbcdd75cb9af04..2d3def024270c0 100644 --- a/arch/riscv/kvm/mmu.c +++ b/arch/riscv/kvm/mmu.c @@ -24,10 +24,7 @@ static void mmu_wp_memory_region(struct kvm *kvm, int slot) phys_addr_t end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT; struct kvm_gstage gstage; - gstage.kvm = kvm; - gstage.flags = 0; - gstage.vmid = READ_ONCE(kvm->arch.vmid.vmid); - gstage.pgd = kvm->arch.pgd; + kvm_riscv_gstage_init(&gstage, kvm); spin_lock(&kvm->mmu_lock); kvm_riscv_gstage_wp_range(&gstage, start, end); @@ -49,10 +46,7 @@ int kvm_riscv_mmu_ioremap(struct kvm *kvm, gpa_t gpa, phys_addr_t hpa, struct kvm_gstage_mapping map; struct kvm_gstage gstage; - gstage.kvm = kvm; - gstage.flags = 0; - gstage.vmid = READ_ONCE(kvm->arch.vmid.vmid); - gstage.pgd = kvm->arch.pgd; + kvm_riscv_gstage_init(&gstage, kvm); end = (gpa + size + PAGE_SIZE - 1) & PAGE_MASK; pfn = __phys_to_pfn(hpa); @@ -89,10 +83,7 @@ void kvm_riscv_mmu_iounmap(struct kvm *kvm, gpa_t gpa, unsigned long size) { struct kvm_gstage gstage; - gstage.kvm = kvm; - gstage.flags = 0; - gstage.vmid = READ_ONCE(kvm->arch.vmid.vmid); - gstage.pgd = kvm->arch.pgd; + kvm_riscv_gstage_init(&gstage, kvm); spin_lock(&kvm->mmu_lock); kvm_riscv_gstage_unmap_range(&gstage, gpa, size, false); @@ -109,10 +100,7 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT; struct kvm_gstage gstage; - gstage.kvm = kvm; - gstage.flags = 0; - gstage.vmid = READ_ONCE(kvm->arch.vmid.vmid); - gstage.pgd = kvm->arch.pgd; + kvm_riscv_gstage_init(&gstage, kvm); kvm_riscv_gstage_wp_range(&gstage, start, end); } @@ -141,10 +129,7 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm, phys_addr_t size = slot->npages << PAGE_SHIFT; struct kvm_gstage gstage; - gstage.kvm = kvm; - gstage.flags = 0; - gstage.vmid = READ_ONCE(kvm->arch.vmid.vmid); - gstage.pgd = kvm->arch.pgd; + kvm_riscv_gstage_init(&gstage, kvm); spin_lock(&kvm->mmu_lock); kvm_riscv_gstage_unmap_range(&gstage, gpa, size, false); @@ -250,10 +235,7 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) if (!kvm->arch.pgd) return false; - gstage.kvm = kvm; - gstage.flags = 0; - gstage.vmid = READ_ONCE(kvm->arch.vmid.vmid); - gstage.pgd = kvm->arch.pgd; + kvm_riscv_gstage_init(&gstage, kvm); mmu_locked = spin_trylock(&kvm->mmu_lock); kvm_riscv_gstage_unmap_range(&gstage, range->start << PAGE_SHIFT, (range->end - range->start) << PAGE_SHIFT, @@ -275,10 +257,7 @@ bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE); - gstage.kvm = kvm; - gstage.flags = 0; - gstage.vmid = READ_ONCE(kvm->arch.vmid.vmid); - gstage.pgd = kvm->arch.pgd; + kvm_riscv_gstage_init(&gstage, kvm); if (!kvm_riscv_gstage_get_leaf(&gstage, range->start << PAGE_SHIFT, &ptep, &ptep_level)) return false; @@ -298,10 +277,7 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE); - gstage.kvm = kvm; - gstage.flags = 0; - gstage.vmid = READ_ONCE(kvm->arch.vmid.vmid); - gstage.pgd = kvm->arch.pgd; + kvm_riscv_gstage_init(&gstage, kvm); if (!kvm_riscv_gstage_get_leaf(&gstage, range->start << PAGE_SHIFT, &ptep, &ptep_level)) return false; @@ -463,10 +439,7 @@ int kvm_riscv_mmu_map(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot, struct kvm_gstage gstage; struct page *page; - gstage.kvm = kvm; - gstage.flags = 0; - gstage.vmid = READ_ONCE(kvm->arch.vmid.vmid); - gstage.pgd = kvm->arch.pgd; + kvm_riscv_gstage_init(&gstage, kvm); /* Setup initial state of output mapping */ memset(out_map, 0, sizeof(*out_map)); @@ -587,10 +560,7 @@ void kvm_riscv_mmu_free_pgd(struct kvm *kvm) spin_lock(&kvm->mmu_lock); if (kvm->arch.pgd) { - gstage.kvm = kvm; - gstage.flags = 0; - gstage.vmid = READ_ONCE(kvm->arch.vmid.vmid); - gstage.pgd = kvm->arch.pgd; + kvm_riscv_gstage_init(&gstage, kvm); kvm_riscv_gstage_unmap_range(&gstage, 0UL, kvm_riscv_gstage_gpa_size(kvm->arch.pgd_levels), false); pgd = READ_ONCE(kvm->arch.pgd); From 67a21baca98cac994d00c097ae701d7208634fa4 Mon Sep 17 00:00:00 2001 From: Fangyu Yu Date: Thu, 2 Apr 2026 21:23:02 +0800 Subject: [PATCH 3/4] RISC-V: KVM: Detect and expose supported HGATP G-stage modes Extend kvm_riscv_gstage_mode_detect() to record HGATP.MODE values in a bitmask. Keep tracking the maximum supported G-stage page table level for existing internal users. Also provide lightweight helpers to retrieve the supported-mode bitmask and validate a requested HGATP.MODE against it. Signed-off-by: Fangyu Yu Reviewed-by: Andrew Jones Reviewed-by: Guo Ren Signed-off-by: Linux RISC-V bot --- arch/riscv/include/asm/kvm_gstage.h | 11 +++++++++++ arch/riscv/kvm/gstage.c | 15 ++++++++++++--- 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/arch/riscv/include/asm/kvm_gstage.h b/arch/riscv/include/asm/kvm_gstage.h index 70d9d483365ee4..bbf8f45c6563be 100644 --- a/arch/riscv/include/asm/kvm_gstage.h +++ b/arch/riscv/include/asm/kvm_gstage.h @@ -31,6 +31,7 @@ struct kvm_gstage_mapping { #endif extern unsigned long kvm_riscv_gstage_max_pgd_levels; +extern u32 kvm_riscv_gstage_supported_mode_mask; #define kvm_riscv_gstage_pgd_xbits 2 #define kvm_riscv_gstage_pgd_size (1UL << (HGATP_PAGE_SHIFT + kvm_riscv_gstage_pgd_xbits)) @@ -102,4 +103,14 @@ static inline void kvm_riscv_gstage_init(struct kvm_gstage *gstage, struct kvm * gstage->pgd_levels = kvm->arch.pgd_levels; } +static inline u32 kvm_riscv_get_hgatp_mode_mask(void) +{ + return kvm_riscv_gstage_supported_mode_mask; +} + +static inline bool kvm_riscv_hgatp_mode_is_valid(unsigned long mode) +{ + return kvm_riscv_gstage_supported_mode_mask & BIT(mode); +} + #endif diff --git a/arch/riscv/kvm/gstage.c b/arch/riscv/kvm/gstage.c index 7c4c34bc191b8a..9204e6427d2d3e 100644 --- a/arch/riscv/kvm/gstage.c +++ b/arch/riscv/kvm/gstage.c @@ -16,6 +16,8 @@ unsigned long kvm_riscv_gstage_max_pgd_levels __ro_after_init = 3; #else unsigned long kvm_riscv_gstage_max_pgd_levels __ro_after_init = 2; #endif +/* Bitmask of supported HGATP.MODE encodings (BIT(HGATP_MODE_*)). */ +u32 kvm_riscv_gstage_supported_mode_mask __ro_after_init; #define gstage_pte_leaf(__ptep) \ (pte_val(*(__ptep)) & (_PAGE_READ | _PAGE_WRITE | _PAGE_EXEC)) @@ -317,11 +319,17 @@ void kvm_riscv_gstage_wp_range(struct kvm_gstage *gstage, gpa_t start, gpa_t end void __init kvm_riscv_gstage_mode_detect(void) { + kvm_riscv_gstage_supported_mode_mask = 0; + kvm_riscv_gstage_max_pgd_levels = 0; + #ifdef CONFIG_64BIT /* Try Sv57x4 G-stage mode */ csr_write(CSR_HGATP, HGATP_MODE_SV57X4 << HGATP_MODE_SHIFT); if ((csr_read(CSR_HGATP) >> HGATP_MODE_SHIFT) == HGATP_MODE_SV57X4) { kvm_riscv_gstage_max_pgd_levels = 5; + kvm_riscv_gstage_supported_mode_mask |= BIT(HGATP_MODE_SV57X4) | + BIT(HGATP_MODE_SV48X4) | + BIT(HGATP_MODE_SV39X4); goto done; } @@ -329,6 +337,8 @@ void __init kvm_riscv_gstage_mode_detect(void) csr_write(CSR_HGATP, HGATP_MODE_SV48X4 << HGATP_MODE_SHIFT); if ((csr_read(CSR_HGATP) >> HGATP_MODE_SHIFT) == HGATP_MODE_SV48X4) { kvm_riscv_gstage_max_pgd_levels = 4; + kvm_riscv_gstage_supported_mode_mask |= BIT(HGATP_MODE_SV48X4) | + BIT(HGATP_MODE_SV39X4); goto done; } @@ -336,6 +346,7 @@ void __init kvm_riscv_gstage_mode_detect(void) csr_write(CSR_HGATP, HGATP_MODE_SV39X4 << HGATP_MODE_SHIFT); if ((csr_read(CSR_HGATP) >> HGATP_MODE_SHIFT) == HGATP_MODE_SV39X4) { kvm_riscv_gstage_max_pgd_levels = 3; + kvm_riscv_gstage_supported_mode_mask |= BIT(HGATP_MODE_SV39X4); goto done; } #else /* CONFIG_32BIT */ @@ -343,13 +354,11 @@ void __init kvm_riscv_gstage_mode_detect(void) csr_write(CSR_HGATP, HGATP_MODE_SV32X4 << HGATP_MODE_SHIFT); if ((csr_read(CSR_HGATP) >> HGATP_MODE_SHIFT) == HGATP_MODE_SV32X4) { kvm_riscv_gstage_max_pgd_levels = 2; + kvm_riscv_gstage_supported_mode_mask |= BIT(HGATP_MODE_SV32X4); goto done; } #endif - /* KVM depends on !HGATP_MODE_OFF */ - kvm_riscv_gstage_max_pgd_levels = 0; - done: csr_write(CSR_HGATP, 0); kvm_riscv_local_hfence_gvma_all(); From 375476bb1c6e940d7c982819c44def53b16aa46b Mon Sep 17 00:00:00 2001 From: Fangyu Yu Date: Thu, 2 Apr 2026 21:23:03 +0800 Subject: [PATCH 4/4] RISC-V: KVM: add KVM_CAP_RISCV_SET_HGATP_MODE Add a VM capability that allows userspace to select the G-stage page table format by setting HGATP.MODE on a per-VM basis. Userspace enables the capability via KVM_ENABLE_CAP, passing the requested HGATP.MODE in args[0]. The request is rejected with -EINVAL if the mode is not supported by the host, and with -EBUSY if the VM has already been committed (e.g. vCPUs have been created or any memslot is populated). KVM_CHECK_EXTENSION(KVM_CAP_RISCV_SET_HGATP_MODE) returns a bitmask of the HGATP.MODE formats supported by the host. Signed-off-by: Fangyu Yu Reviewed-by: Andrew Jones Reviewed-by: Guo Ren Signed-off-by: Linux RISC-V bot --- Documentation/virt/kvm/api.rst | 27 +++++++++++++++++++++++++++ arch/riscv/kvm/vm.c | 18 ++++++++++++++++-- include/uapi/linux/kvm.h | 1 + 3 files changed, 44 insertions(+), 2 deletions(-) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 032516783e9622..9d7f6958fa816e 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -8902,6 +8902,33 @@ helpful if user space wants to emulate instructions which are not This capability can be enabled dynamically even if VCPUs were already created and are running. +7.47 KVM_CAP_RISCV_SET_HGATP_MODE +--------------------------------- + +:Architectures: riscv +:Type: VM +:Parameters: args[0] contains the requested HGATP mode +:Returns: + - 0 on success. + - -EINVAL if args[0] is outside the range of HGATP modes supported by the + hardware. + - -EBUSY if vCPUs have already been created for the VM, if the VM has any + non-empty memslots. + +This capability allows userspace to explicitly select the HGATP mode for +the VM. The selected mode must be supported by both KVM and hardware. This +capability must be enabled before creating any vCPUs or memslots. + +If this capability is not enabled, KVM will select the default HGATP mode +automatically. The default is the highest HGATP.MODE value supported by +hardware. + +``KVM_CHECK_EXTENSION(KVM_CAP_RISCV_SET_HGATP_MODE)`` returns a bitmask of +HGATP.MODE values supported by the host. A return value of 0 indicates that +the capability is not supported. Supported-mode bitmask use HGATP.MODE +encodings as defined by the RISC-V privileged specification, such as Sv39x4 +corresponds to HGATP.MODE=8, so userspace should test bitmask & BIT(8). + 8. Other capabilities. ====================== diff --git a/arch/riscv/kvm/vm.c b/arch/riscv/kvm/vm.c index 4d82a886102c64..5e82a3ad3ad015 100644 --- a/arch/riscv/kvm/vm.c +++ b/arch/riscv/kvm/vm.c @@ -201,6 +201,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_VM_GPA_BITS: r = kvm_riscv_gstage_gpa_bits(kvm->arch.pgd_levels); break; + case KVM_CAP_RISCV_SET_HGATP_MODE: + r = kvm_riscv_get_hgatp_mode_mask(); + break; default: r = 0; break; @@ -211,12 +214,23 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) { + if (cap->flags) + return -EINVAL; + switch (cap->cap) { case KVM_CAP_RISCV_MP_STATE_RESET: - if (cap->flags) - return -EINVAL; kvm->arch.mp_state_reset = true; return 0; + case KVM_CAP_RISCV_SET_HGATP_MODE: + if (!kvm_riscv_hgatp_mode_is_valid(cap->args[0])) + return -EINVAL; + + if (kvm->created_vcpus || !kvm_are_all_memslots_empty(kvm)) + return -EBUSY; +#ifdef CONFIG_64BIT + kvm->arch.pgd_levels = 3 + cap->args[0] - HGATP_MODE_SV39X4; +#endif + return 0; default: return -EINVAL; } diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 80364d4dbebb0c..a74a80fd40469c 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -989,6 +989,7 @@ struct kvm_enable_cap { #define KVM_CAP_ARM_SEA_TO_USER 245 #define KVM_CAP_S390_USER_OPEREXEC 246 #define KVM_CAP_S390_KEYOP 247 +#define KVM_CAP_RISCV_SET_HGATP_MODE 248 struct kvm_irq_routing_irqchip { __u32 irqchip;