diff --git a/arch/riscv/include/asm/kexec.h b/arch/riscv/include/asm/kexec.h
index b9ee8346cc8c9a..b75cab959e538c 100644
--- a/arch/riscv/include/asm/kexec.h
+++ b/arch/riscv/include/asm/kexec.h
@@ -53,6 +53,7 @@ typedef void (*riscv_kexec_method)(unsigned long first_ind_entry,
 				   unsigned long va_pa_off);
 
 extern riscv_kexec_method riscv_kexec_norelocate;
+extern riscv_kexec_method riscv_kexec_relocate_entry;
 
 #ifdef CONFIG_KEXEC_FILE
 extern const struct kexec_file_ops elf_kexec_ops;
@@ -75,4 +76,8 @@ int load_extra_segments(struct kimage *image, unsigned long kernel_start,
 			unsigned long cmdline_len);
 #endif
 
+#ifndef __ASSEMBLY__
+extern char __kexec_tramp_text_start[];
+#endif
+
 #endif
diff --git a/arch/riscv/kernel/image-vars.h b/arch/riscv/kernel/image-vars.h
index 3bd9d06a8b8ff9..35ee3f059afa07 100644
--- a/arch/riscv/kernel/image-vars.h
+++ b/arch/riscv/kernel/image-vars.h
@@ -34,4 +34,18 @@ __efistub_sysfb_primary_display = sysfb_primary_display;
 
 #endif
 
+#ifdef CONFIG_KEXEC_CORE
+#define KEXEC_TRAMP_TEXT						\
+	. = ALIGN(PAGE_SIZE);						\
+	__kexec_tramp_text_start = .;					\
+	KEEP(*(.kexec.tramp.text))					\
+	KEEP(*(.kexec.tramp.text.*))					\
+	__kexec_tramp_text_end = .;					\
+	ASSERT((__kexec_tramp_text_end - __kexec_tramp_text_start) <= PAGE_SIZE, \
+	       ".kexec.tramp.text exceeds 4K");				\
+	. = ALIGN(PAGE_SIZE);
+#else
+#define KEXEC_TRAMP_TEXT /* expands to nothing without CONFIG_KEXEC_CORE */
+#endif
+
 #endif /* __RISCV_KERNEL_IMAGE_VARS_H */
diff --git a/arch/riscv/kernel/kexec_relocate.S b/arch/riscv/kernel/kexec_relocate.S
index de0a4b35d01efc..29392f457f421d 100644
--- a/arch/riscv/kernel/kexec_relocate.S
+++ b/arch/riscv/kernel/kexec_relocate.S
@@ -34,27 +34,13 @@ SYM_CODE_START(riscv_kexec_relocate)
 	csrw	CSR_SIP, zero
 
 	/*
-	 * When we switch SATP.MODE to "Bare" we'll only
-	 * play with physical addresses. However the first time
-	 * we try to jump somewhere, the offset on the jump
-	 * will be relative to pc which will still be on VA. To
-	 * deal with this we set stvec to the physical address at
-	 * the start of the loop below so that we jump there in
-	 * any case.
+	 * The trampoline wrapper (riscv_kexec_relocate_entry) has already
+	 * cleared SATP and jumped to the PA of this copy of the relocate
+	 * code. From here on the entire loop runs with SATP=0 and
+	 * every address (s0, s5, source/dest pointers) is a physical one.
 	 */
-	la	s6, 1f
-	sub	s6, s6, s4
-	csrw	CSR_STVEC, s6
-
-	/*
-	 * With C-extension, here we get 42 Bytes and the next
-	 * .align directive would pad zeros here up to 44 Bytes.
-	 * So manually put a nop here to avoid zeros padding.
-	 */
-	nop
 
 	/* Process entries in a loop */
-.align 2
 1:
 	REG_L	t0, 0(s0)		/* t0 = *image->entry */
 	addi	s0, s0, RISCV_SZPTR	/* image->entry++ */
@@ -70,8 +56,8 @@ SYM_CODE_START(riscv_kexec_relocate)
 	andi	t1, t0, 0x2
 	beqz	t1, 2f
 	andi	s0, t0, ~0x2
-	csrw	CSR_SATP, zero
-	jr	s6
+	/* MMU is already off (see riscv_kexec_relocate_entry); keep looping. */
+	j	1b
 
 2:
 	/* IND_DONE entry ? -> jump to done label */
@@ -147,13 +133,35 @@ riscv_kexec_relocate_end:
 
 
 /* Used for jumping to crashkernel */
-.section ".text"
+.extern kexec_tramp_satp
+.extern riscv_kexec_norelocate_pa
+.section ".kexec.tramp.text", "ax"
 SYM_CODE_START(riscv_kexec_norelocate)
+	/*
+	 * Two-pass entry:
+	 * - 1st entry (kernel VA): t3 == 0 (set by machine_kexec()), so we
+	 *   install kexec_tramp_satp and jump to our own physical address.
+	 * - 2nd entry (PA): t3 holds the physical address of
+	 *   riscv_kexec_norelocate, so auipc matches t3 and we branch to
+	 *   label 1 and continue under the identity mapping (VA == PA).
+	 */
+	auipc	t0, 0
+	beq	t0, t3, 1f
+
+	la	t0, riscv_kexec_norelocate_pa
+	REG_L	t3, 0(t0)
+	la	t0, kexec_tramp_satp
+	REG_L	t1, 0(t0)
+	csrw	CSR_SATP, t1
+	sfence.vma x0, x0
+
+	jr	t3
 	/*
 	 * s0: (const) Phys address to jump to
 	 * s1: (const) Phys address of the FDT image
 	 * s2: (const) The hartid of the current hart
 	 */
+1:
 	mv	s0, a1
 	mv	s1, a2
 	mv	s2, a3
@@ -199,16 +207,53 @@ SYM_CODE_START(riscv_kexec_norelocate)
 	csrw	CSR_SSCRATCH, zero
 
 	/*
-	 * Switch to physical addressing
-	 * This will also trigger a jump to CSR_STVEC
-	 * which in this case is the address of the new
-	 * kernel.
+	 * We are already executing at a PA through the trampoline identity
+	 * mapping, so clearing SATP keeps the PC valid and there is no need
+	 * to program stvec and rely on the implicit trap on SATP switch.
+	 * Jump directly to the target entry held in a2 instead.
 	 */
-	csrw	CSR_STVEC, a2
 	csrw	CSR_SATP, zero
+	jr	a2
 
 SYM_CODE_END(riscv_kexec_norelocate)
 
+.extern riscv_kexec_relocate_entry_pa
+.extern riscv_kexec_cc_buffer_pa
+.section ".kexec.tramp.text", "ax"
+SYM_CODE_START(riscv_kexec_relocate_entry)
+	/*
+	 * Two-pass entry, identical in shape to riscv_kexec_norelocate:
+	 * - 1st entry: t3 == 0 (initialized by machine_kexec()).
+	 * - 2nd entry: t3 == PA of riscv_kexec_relocate_entry, so auipc
+	 *   matches t3 and we branch to label 1.
+	 * Registers a0..a4 are not written here; the relocate body sees them.
+	 */
+	auipc	t0, 0
+	beq	t0, t3, 1f
+
+	la	t0, riscv_kexec_relocate_entry_pa
+	REG_L	t3, 0(t0)
+	la	t0, kexec_tramp_satp
+	REG_L	t1, 0(t0)
+	csrw	CSR_SATP, t1
+	sfence.vma x0, x0
+
+	jr	t3
+1:
+	/*
+	 * Now executing at the PA of this wrapper with the trampoline pgd
+	 * installed (identity-mapped). Drop the MMU; PC stays valid because
+	 * it is already a PA.
+	 */
+	csrw	CSR_SATP, zero
+	sfence.vma x0, x0
+
+	/* Load the PA of control_code_buffer (pc-relative la) and jump to it. */
+	la	t0, riscv_kexec_cc_buffer_pa
+	REG_L	t0, 0(t0)
+	jr	t0
+SYM_CODE_END(riscv_kexec_relocate_entry)
+
 .section ".rodata"
 SYM_DATA(riscv_kexec_relocate_size,
 	.long riscv_kexec_relocate_end - riscv_kexec_relocate)
diff --git a/arch/riscv/kernel/machine_kexec.c b/arch/riscv/kernel/machine_kexec.c
index 2306ce3e5f229f..99cc251f971c6c 100644
--- a/arch/riscv/kernel/machine_kexec.c
+++ b/arch/riscv/kernel/machine_kexec.c
@@ -18,6 +18,69 @@
 #include
 #include
 
+unsigned long kexec_tramp_satp;
+unsigned long riscv_kexec_norelocate_pa;
+unsigned long riscv_kexec_relocate_entry_pa;
+unsigned long riscv_kexec_cc_buffer_pa;
+static pgd_t kexec_tramp_pgd[PTRS_PER_PGD] __aligned(PAGE_SIZE);
+static p4d_t kexec_tramp_p4d[PTRS_PER_P4D] __aligned(PAGE_SIZE);
+static pud_t kexec_tramp_pud[PTRS_PER_PUD] __aligned(PAGE_SIZE);
+static pmd_t kexec_tramp_pmd[PTRS_PER_PMD] __aligned(PAGE_SIZE);
+static pte_t kexec_tramp_pte[PTRS_PER_PTE] __aligned(PAGE_SIZE);
+static p4d_t kexec_tramp_p4d2[PTRS_PER_P4D] __aligned(PAGE_SIZE);
+static pud_t kexec_tramp_pud2[PTRS_PER_PUD] __aligned(PAGE_SIZE);
+static pmd_t kexec_tramp_pmd2[PTRS_PER_PMD] __aligned(PAGE_SIZE);
+static pte_t kexec_tramp_pte2[PTRS_PER_PTE] __aligned(PAGE_SIZE);
+
+static void map_tramp_page(p4d_t *p4ds, pud_t *puds, pmd_t *pmds, pte_t *ptes,
+			   unsigned long va, unsigned long pa)
+{
+	pgd_t *pgd = (pgd_t *)kexec_tramp_pgd + pgd_index(va);
+	pmd_t *pmd;
+
+	if (pgtable_l5_enabled) {
+		p4d_t *p4d;
+
+		set_pgd(pgd, pfn_pgd(PFN_DOWN(__pa_symbol(p4ds)), PAGE_TABLE));
+		p4d = (p4d_t *)p4ds + p4d_index(va);
+		if (pgtable_l4_enabled)
+			set_p4d(p4d, pfn_p4d(PFN_DOWN(__pa_symbol(puds)),
+					     PAGE_TABLE));
+		else
+			set_p4d(p4d, pfn_p4d(PFN_DOWN(__pa_symbol(pmds)),
+					     PAGE_TABLE));
+	} else {
+		set_pgd(pgd, pfn_pgd(PFN_DOWN(__pa_symbol(puds)), PAGE_TABLE));
+	}
+
+	if (pgtable_l4_enabled) {
+		pud_t *pud = (pud_t *)puds + pud_index(va);
+
+		set_pud(pud, pfn_pud(PFN_DOWN(__pa_symbol(pmds)), PAGE_TABLE));
+		pmd = (pmd_t *)pmds + pmd_index(va);
+	} else {	/* no PUD level: puds page serves as the PMD table */
+		pmd = (pmd_t *)puds + pmd_index(va);
+	}
+	set_pmd(pmd, pfn_pmd(PFN_DOWN(__pa_symbol(ptes)), PAGE_TABLE));
+
+	set_pte((pte_t *)ptes + pte_index(va),
+		pfn_pte(PFN_DOWN(pa), PAGE_KERNEL_EXEC));
+}
+
+static void riscv_kexec_build_tramp(unsigned long va, unsigned long pa)
+{
+	/* VA -> PA: map the trampoline page via its kernel VA. */
+	map_tramp_page(kexec_tramp_p4d, kexec_tramp_pud,
+		       kexec_tramp_pmd, kexec_tramp_pte, va, pa);
+
+	/*
+	 * PA -> PA: identity-map the same page (second set of tables) so the
+	 * second-pass code can keep executing once it has jumped to its PA.
+	 */
+	map_tramp_page(kexec_tramp_p4d2, kexec_tramp_pud2,
+		       kexec_tramp_pmd2, kexec_tramp_pte2, pa, pa);
+}
+
 /*
  * machine_kexec_prepare - Initialize kexec
  *
@@ -58,6 +121,16 @@ machine_kexec_prepare(struct kimage *image)
 		return -EINVAL;
 	}
 
+	/*
+	 * Build the trampoline page table and capture its SATP value.
+	 * Both riscv_kexec_norelocate (crash) and riscv_kexec_relocate_entry
+	 * (regular kexec) load kexec_tramp_satp into SATP on first entry.
+	 */
+	riscv_kexec_build_tramp((unsigned long)__kexec_tramp_text_start,
+				__pa_symbol(__kexec_tramp_text_start));
+	WRITE_ONCE(kexec_tramp_satp,
+		   PFN_DOWN(__pa_symbol(kexec_tramp_pgd)) | satp_mode);
+
 	/* Copy the assembler code for relocation to the control page */
 	if (image->type != KEXEC_TYPE_CRASH) {
 		control_code_buffer = page_address(image->control_code_page);
@@ -73,6 +146,14 @@ machine_kexec_prepare(struct kimage *image)
 
 		/* Mark the control page executable */
 		set_memory_x((unsigned long) control_code_buffer, 1);
+
+		WRITE_ONCE(riscv_kexec_relocate_entry_pa,
+			   __pa_symbol(&riscv_kexec_relocate_entry));
+		WRITE_ONCE(riscv_kexec_cc_buffer_pa,
+			   __pa(control_code_buffer));
+	} else {
+		WRITE_ONCE(riscv_kexec_norelocate_pa,
+			   __pa_symbol(&riscv_kexec_norelocate));
 	}
 
 	return 0;
@@ -150,11 +231,15 @@ machine_kexec(struct kimage *image)
 {
 	struct kimage_arch *internal = &image->arch;
 	unsigned long jump_addr = (unsigned long) image->start;
-	unsigned long first_ind_entry = (unsigned long) &image->head;
+	/*
+	 * The relocate body runs entirely with the MMU off (the wrapper
+	 * drops SATP before jumping into control_code_buffer), so the very
+	 * first entry must be a physical address.
+	 */
+	unsigned long first_ind_entry = __pa(&image->head);
 	unsigned long this_cpu_id = __smp_processor_id();
 	unsigned long this_hart_id = cpuid_to_hartid_map(this_cpu_id);
 	unsigned long fdt_addr = internal->fdt_addr;
-	void *control_code_buffer = page_address(image->control_code_page);
 	riscv_kexec_method kexec_method = NULL;
 
 #ifdef CONFIG_SMP
@@ -163,7 +248,7 @@ machine_kexec(struct kimage *image)
 #endif
 
 	if (image->type != KEXEC_TYPE_CRASH)
-		kexec_method = control_code_buffer;
+		kexec_method = (riscv_kexec_method) &riscv_kexec_relocate_entry;
 	else
 		kexec_method = (riscv_kexec_method) &riscv_kexec_norelocate;
 
@@ -176,7 +261,35 @@ machine_kexec(struct kimage *image)
 
 	/* Jump to the relocation code */
 	pr_notice("Bye...\n");
-	kexec_method(first_ind_entry, jump_addr, fdt_addr,
-		     this_hart_id, kernel_map.va_pa_offset);
+	/*
+	 * Hand off to the trampoline. Both riscv_kexec_norelocate and
+	 * riscv_kexec_relocate_entry use t3 as the 1st/2nd-pass
+	 * discriminator (must be 0 on first entry). A bare
+	 *     asm volatile ("li t3, 0" ::: "t3")
+	 * before the C call only declares t3 *modified*; the compiler is
+	 * free to use t3 as scratch when materialising args. Pin t3 = 0
+	 * (and the args) via local register variables and perform the
+	 * indirect jump inside the same inline asm so t3 == 0 is
+	 * guaranteed at the moment control leaves machine_kexec().
+	 */
+	{
+		register unsigned long a0_val asm("a0") = first_ind_entry;
+		register unsigned long a1_val asm("a1") = jump_addr;
+		register unsigned long a2_val asm("a2") = fdt_addr;
+		register unsigned long a3_val asm("a3") = this_hart_id;
+		register unsigned long a4_val asm("a4") = kernel_map.va_pa_offset;
+		register unsigned long t3_zero asm("t3") = 0;
+		register riscv_kexec_method m asm("t6") = kexec_method;
+
+		asm volatile (
+			"jr %[m]"
+			:
+			: "r" (a0_val), "r" (a1_val), "r" (a2_val),
+			  "r" (a3_val), "r" (a4_val),
+			  "r" (t3_zero),
+			  [m] "r" (m)
+			: "memory"
+		);
+	}
 	unreachable();
 }
diff --git a/arch/riscv/kernel/vmlinux.lds.S b/arch/riscv/kernel/vmlinux.lds.S
index 1f4f8496941aef..fc7758e5b8190a 100644
--- a/arch/riscv/kernel/vmlinux.lds.S
+++ b/arch/riscv/kernel/vmlinux.lds.S
@@ -41,6 +41,7 @@ SECTIONS
 		ENTRY_TEXT
 		IRQENTRY_TEXT
 		SOFTIRQENTRY_TEXT
+		KEXEC_TRAMP_TEXT
 		_etext = .;
 	}
 