diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 674044754378af..2921680d213229 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -185,6 +185,7 @@ config RISCV select HAVE_KRETPROBES # https://github.com/ClangBuiltLinux/linux/issues/1881 select HAVE_LD_DEAD_CODE_DATA_ELIMINATION if !LD_IS_LLD + select HAVE_LIVEPATCH if FRAME_POINTER && 64BIT select HAVE_MOVE_PMD select HAVE_MOVE_PUD select HAVE_PAGE_SIZE_4KB @@ -195,6 +196,7 @@ config RISCV select HAVE_POSIX_CPU_TIMERS_TASK_WORK select HAVE_PREEMPT_DYNAMIC_KEY select HAVE_REGS_AND_STACK_ACCESS_API + select HAVE_RELIABLE_STACKTRACE if FRAME_POINTER && 64BIT select HAVE_RETHOOK select HAVE_RSEQ select HAVE_RUST if RUSTC_SUPPORTS_RISCV && CC_IS_CLANG @@ -1394,3 +1396,5 @@ endmenu # "CPU Power Management" source "arch/riscv/kvm/Kconfig" source "drivers/acpi/Kconfig" + +source "kernel/livepatch/Kconfig" diff --git a/arch/riscv/include/asm/ptrace.h b/arch/riscv/include/asm/ptrace.h index addc8188152f7f..4b9b0f2792143b 100644 --- a/arch/riscv/include/asm/ptrace.h +++ b/arch/riscv/include/asm/ptrace.h @@ -8,6 +8,7 @@ #include #include +#include #include #ifndef __ASSEMBLER__ @@ -53,6 +54,14 @@ struct pt_regs { unsigned long cause; /* a0 value before the syscall */ unsigned long orig_a0; + + /* + * This frame record is entirely zeroed on exception entry, allowing the + * unwinder to identify exception boundaries. The type field encodes + * whether the exception was taken from user (FINAL) or kernel (PT_REGS) + * mode. + */ + struct frame_record_meta stackframe; }; #define PTRACE_SYSEMU 0x1f diff --git a/arch/riscv/include/asm/stacktrace.h b/arch/riscv/include/asm/stacktrace.h index b1495a7e06ce69..bc87c49403798b 100644 --- a/arch/riscv/include/asm/stacktrace.h +++ b/arch/riscv/include/asm/stacktrace.h @@ -3,8 +3,13 @@ #ifndef _ASM_RISCV_STACKTRACE_H #define _ASM_RISCV_STACKTRACE_H +#include #include +#include + +#include #include +#include struct stackframe { unsigned long fp; @@ -16,14 +21,70 @@ extern void notrace walk_stackframe(struct task_struct *task, struct pt_regs *re extern void dump_backtrace(struct pt_regs *regs, struct task_struct *task, const char *loglvl); -static inline bool on_thread_stack(void) +/* + * IRQ stack accessors + */ +static inline struct stack_info stackinfo_get_irq(void) +{ + unsigned long low = (unsigned long)raw_cpu_read(irq_stack_ptr); + unsigned long high = low + IRQ_STACK_SIZE; + + return (struct stack_info) { + .low = low, + .high = high, + }; +} + +static inline bool on_irq_stack(unsigned long sp, unsigned long size) +{ + struct stack_info info = stackinfo_get_irq(); + + return stackinfo_on_stack(&info, sp, size); +} + +/* + * Task stack accessors + */ +static inline struct stack_info stackinfo_get_task(const struct task_struct *tsk) { - return !(((unsigned long)(current->stack) ^ current_stack_pointer) & ~(THREAD_SIZE - 1)); + unsigned long low = (unsigned long)task_stack_page(tsk); + unsigned long high = low + THREAD_SIZE; + + return (struct stack_info) { + .low = low, + .high = high, + }; +} + +static inline bool on_task_stack(const struct task_struct *tsk, + unsigned long sp, unsigned long size) +{ + struct stack_info info = stackinfo_get_task(tsk); + + return stackinfo_on_stack(&info, sp, size); } +/* + * Cast is necessary since current->stack is an opaque ptr. + */ +#define on_thread_stack() (on_task_stack(current, current_stack_pointer, 1)) +/* + * Overflow stack accessors + */ #ifdef CONFIG_VMAP_STACK DECLARE_PER_CPU(unsigned long [OVERFLOW_STACK_SIZE/sizeof(long)], overflow_stack); + +static inline struct stack_info stackinfo_get_overflow(void) +{ + unsigned long low = (unsigned long)raw_cpu_ptr(overflow_stack); + unsigned long high = low + OVERFLOW_STACK_SIZE; + + return (struct stack_info) { + .low = low, + .high = high, + }; +} #endif /* CONFIG_VMAP_STACK */ #endif /* _ASM_RISCV_STACKTRACE_H */ diff --git a/arch/riscv/include/asm/stacktrace/common.h b/arch/riscv/include/asm/stacktrace/common.h new file mode 100644 index 00000000000000..87d6d40672f357 --- /dev/null +++ b/arch/riscv/include/asm/stacktrace/common.h @@ -0,0 +1,159 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * RISC-V common stack unwinder types and helpers. + * + * See: arch/arm64/include/asm/stacktrace/common.h for the reference + * implementation. + * + * Copyright (C) 2024 + */ +#ifndef __ASM_RISCV_STACKTRACE_COMMON_H +#define __ASM_RISCV_STACKTRACE_COMMON_H + +#include +#include +#include + +#include + +/** + * struct stack_info - describes the bounds of a stack. + * + * @low: The lowest valid address on the stack. + * @high: The highest valid address on the stack. + */ +struct stack_info { + unsigned long low; + unsigned long high; +}; + +/** + * struct unwind_state - state used for robust unwinding. + * + * @fp: The fp value in the frame record (or the real fp). + * @pc: The ra value in the frame record (or the real ra). + * + * @stack: The stack currently being unwound. + * @stacks: An array of stacks which can be unwound. + * @nr_stacks: The number of stacks in @stacks. + */ +struct unwind_state { + unsigned long fp; + unsigned long pc; + + struct stack_info stack; + struct stack_info *stacks; + int nr_stacks; +}; + +/** + * stackinfo_get_unknown() - Get an unknown stack_info. + * + * Return: a stack_info with low and high set to 0. + */ +static inline struct stack_info stackinfo_get_unknown(void) +{ + return (struct stack_info) { + .low = 0, + .high = 0, + }; +} + +/** + * stackinfo_on_stack() - Check whether an object is fully within a stack. + * + * @info: The stack to check against. + * @sp: The base address of the object. + * @size: The size of the object. + * + * Return: true if the object is fully contained within the stack. + */ +static inline bool stackinfo_on_stack(const struct stack_info *info, + unsigned long sp, unsigned long size) +{ + if (!info->low) + return false; + + if (sp < info->low || sp + size < sp || sp + size > info->high) + return false; + + return true; +} + +/** + * unwind_init_common() - Initialize the common parts of the unwind state. + * + * @state: the unwind state to initialize. + */ +static inline void unwind_init_common(struct unwind_state *state) +{ + state->stack = stackinfo_get_unknown(); +} + +/** + * unwind_find_stack() - Find the accessible stack which entirely contains an + * object. + * + * @state: the current unwind state. + * @sp: the base address of the object. + * @size: the size of the object. + * + * Return: a pointer to the relevant stack_info if found; NULL otherwise. + */ +static inline struct stack_info *unwind_find_stack(struct unwind_state *state, + unsigned long sp, + unsigned long size) +{ + struct stack_info *info = &state->stack; + + if (stackinfo_on_stack(info, sp, size)) + return info; + + for (int i = 0; i < state->nr_stacks; i++) { + info = &state->stacks[i]; + if (stackinfo_on_stack(info, sp, size)) + return info; + } + + return NULL; +} + +/** + * unwind_consume_stack() - Update stack boundaries so that future unwind steps + * cannot consume this object again. + * + * @state: the current unwind state. + * @info: the stack_info of the stack containing the object. + * @sp: the base address of the object. + * @size: the size of the object. + * + * Stack transitions are strictly one-way, and once we've + * transitioned from one stack to another, it's never valid to + * unwind back to the old stack. + * + * Note that stacks can nest in several valid orders, e.g. + * + * TASK -> IRQ -> OVERFLOW + * + * ... so we do not check the specific order of stack + * transitions. + */ +static inline void unwind_consume_stack(struct unwind_state *state, + struct stack_info *info, + unsigned long sp, + unsigned long size) +{ + struct stack_info tmp; + + tmp = *info; + *info = stackinfo_get_unknown(); + state->stack = tmp; + + /* + * Future unwind steps can only consume stack above this frame record. + * Update the current stack to start immediately above it. + */ + state->stack.low = sp + size; +} + +#endif /* __ASM_RISCV_STACKTRACE_COMMON_H */ diff --git a/arch/riscv/include/asm/stacktrace/frame.h b/arch/riscv/include/asm/stacktrace/frame.h new file mode 100644 index 00000000000000..5720a6c65fe882 --- /dev/null +++ b/arch/riscv/include/asm/stacktrace/frame.h @@ -0,0 +1,53 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef __ASM_RISCV_STACKTRACE_FRAME_H +#define __ASM_RISCV_STACKTRACE_FRAME_H + +/* + * See: arch/arm64/include/asm/stacktrace/frame.h for the reference + * implementation. + */ + +/* + * - FRAME_META_TYPE_NONE + * + * This value is reserved. + * + * - FRAME_META_TYPE_FINAL + * + * The record is the last entry on the stack. + * Unwinding should terminate successfully. + * + * - FRAME_META_TYPE_PT_REGS + * + * The record is embedded within a struct pt_regs, recording the registers at + * an arbitrary point in time. + * Unwinding should consume pt_regs::epc, followed by pt_regs::ra. + * + * Note: all other values are reserved and should result in unwinding + * terminating with an error. + */ +#define FRAME_META_TYPE_NONE 0 +#define FRAME_META_TYPE_FINAL 1 +#define FRAME_META_TYPE_PT_REGS 2 + +#ifndef __ASSEMBLER__ +/* + * A standard RISC-V frame record. + */ +struct frame_record { + unsigned long fp; + unsigned long ra; +}; + +/* + * A metadata frame record indicating a special unwind. + * The record::{fp,ra} fields must be zero to indicate the presence of + * metadata. + */ +struct frame_record_meta { + struct frame_record record; + unsigned long type; +}; +#endif /* __ASSEMBLER__ */ + +#endif /* __ASM_RISCV_STACKTRACE_FRAME_H */ diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile index cabb99cadfb6d1..1cb6c9ab298173 100644 --- a/arch/riscv/kernel/Makefile +++ b/arch/riscv/kernel/Makefile @@ -44,6 +44,11 @@ CFLAGS_REMOVE_return_address.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_sbi_ecall.o = $(CC_FLAGS_FTRACE) endif +# When KASAN is enabled, a stack trace is recorded for every alloc/free, which +# can significantly impact performance. Avoid instrumenting the stack trace +# collection code to minimize this impact. +KASAN_SANITIZE_stacktrace.o := n + always-$(KBUILD_BUILTIN) += vmlinux.lds obj-y += head.o diff --git a/arch/riscv/kernel/asm-offsets.c b/arch/riscv/kernel/asm-offsets.c index af827448a609e6..8dfcb5a44bb86f 100644 --- a/arch/riscv/kernel/asm-offsets.c +++ b/arch/riscv/kernel/asm-offsets.c @@ -131,6 +131,9 @@ void asm_offsets(void) OFFSET(PT_BADADDR, pt_regs, badaddr); OFFSET(PT_CAUSE, pt_regs, cause); + DEFINE(S_STACKFRAME, offsetof(struct pt_regs, stackframe)); + DEFINE(S_STACKFRAME_TYPE, offsetof(struct pt_regs, stackframe.type)); + OFFSET(SUSPEND_CONTEXT_REGS, suspend_context, regs); OFFSET(HIBERN_PBE_ADDR, pbe, address); @@ -501,6 +504,7 @@ void asm_offsets(void) OFFSET(SBI_HART_BOOT_STACK_PTR_OFFSET, sbi_hart_boot_data, stack_ptr); DEFINE(STACKFRAME_SIZE_ON_STACK, ALIGN(sizeof(struct stackframe), STACK_ALIGN)); + DEFINE(STACKFRAME_RECORD_SIZE, sizeof(struct stackframe)); OFFSET(STACKFRAME_FP, stackframe, fp); OFFSET(STACKFRAME_RA, stackframe, ra); #ifdef CONFIG_FUNCTION_TRACER diff --git a/arch/riscv/kernel/entry.S b/arch/riscv/kernel/entry.S index d011fb51c59a04..9cae0e1eba1c69 100644 --- a/arch/riscv/kernel/entry.S +++ b/arch/riscv/kernel/entry.S @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -193,6 +194,27 @@ SYM_CODE_START(handle_exception) REG_S s4, PT_CAUSE(sp) REG_S s5, PT_TP(sp) + /* + * Create a metadata frame record. The unwinder will use this to + * identify and unwind exception boundaries. + */ + REG_S zero, (S_STACKFRAME + STACKFRAME_FP)(sp) /* stackframe.record.fp = 0 */ + REG_S zero, (S_STACKFRAME + STACKFRAME_RA)(sp) /* stackframe.record.ra = 0 */ +#ifdef CONFIG_RISCV_M_MODE + li t0, SR_MPP + and t0, s1, t0 +#else + andi t0, s1, SR_SPP +#endif + bnez t0, 1f + li t0, FRAME_META_TYPE_FINAL + j 2f +1: + li t0, FRAME_META_TYPE_PT_REGS +2: + REG_S t0, S_STACKFRAME_TYPE(sp) + addi s0, sp, S_STACKFRAME + STACKFRAME_RECORD_SIZE + /* * Set the scratch register to 0, so that if a recursive exception * occurs, the exception vector knows it came from the kernel @@ -357,8 +379,8 @@ ASM_NOKPROBE(handle_kernel_stack_overflow) SYM_CODE_START(ret_from_fork_kernel_asm) call schedule_tail - move a0, s1 /* fn_arg */ - move a1, s0 /* fn */ + move a0, s3 /* fn_arg */ + move a1, s2 /* fn */ move a2, sp /* pt_regs */ call ret_from_fork_kernel j ret_from_exception @@ -383,7 +405,7 @@ SYM_FUNC_START(call_on_irq_stack) addi sp, sp, -STACKFRAME_SIZE_ON_STACK REG_S ra, STACKFRAME_RA(sp) REG_S s0, STACKFRAME_FP(sp) - addi s0, sp, STACKFRAME_SIZE_ON_STACK + addi s0, sp, STACKFRAME_RECORD_SIZE /* Switch to the per-CPU shadow call stack */ scs_save_current @@ -399,7 +421,7 @@ SYM_FUNC_START(call_on_irq_stack) scs_load_current /* Switch back to the thread stack and restore ra and s0 */ - addi sp, s0, -STACKFRAME_SIZE_ON_STACK + addi sp, s0, -STACKFRAME_RECORD_SIZE REG_L ra, STACKFRAME_RA(sp) REG_L s0, STACKFRAME_FP(sp) addi sp, sp, STACKFRAME_SIZE_ON_STACK diff --git a/arch/riscv/kernel/ftrace.c b/arch/riscv/kernel/ftrace.c index b430edfb83f4c8..5d55199a9230e1 100644 --- a/arch/riscv/kernel/ftrace.c +++ b/arch/riscv/kernel/ftrace.c @@ -242,7 +242,8 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr, */ old = *parent; - if (!function_graph_enter(old, self_addr, frame_pointer, parent)) + if (!function_graph_enter(old, self_addr, frame_pointer, + (void *)frame_pointer)) *parent = return_hooker; } @@ -264,7 +265,8 @@ void ftrace_graph_func(unsigned long ip, unsigned long parent_ip, */ old = *parent; - if (!function_graph_enter_regs(old, ip, frame_pointer, parent, fregs)) + if (!function_graph_enter_regs(old, ip, frame_pointer, + (void *)frame_pointer, fregs)) *parent = return_hooker; } #endif /* CONFIG_DYNAMIC_FTRACE */ diff --git a/arch/riscv/kernel/head.S b/arch/riscv/kernel/head.S index f6a8ca49e6277c..00e16a24f1495b 100644 --- a/arch/riscv/kernel/head.S +++ b/arch/riscv/kernel/head.S @@ -14,6 +14,7 @@ #include #include #include +#include #include #include "efi-header.S" @@ -177,6 +178,14 @@ secondary_start_sbi: REG_S a0, (a1) 1: #endif + + /* + * Set up the frame pointer for the secondary idle task so reliable + * stack unwinding terminates at the metadata frame in task_pt_regs(). + * Without this, the first frame records can inherit an undefined caller + * fp and unwind past smp_callin() into .Lsecondary_park. + */ + addi s0, sp, S_STACKFRAME + STACKFRAME_RECORD_SIZE scs_load_current call smp_callin #endif /* CONFIG_SMP */ @@ -305,6 +314,20 @@ SYM_CODE_START(_start_kernel) la tp, init_task la sp, init_thread_union + THREAD_SIZE addi sp, sp, -PT_SIZE_ON_STACK + + /* + * Set up a metadata frame record for the init task so that + * the unwinder can identify the outermost frame by its + * {fp, ra} = {0, 0} sentinel at the bottom of pt_regs. + * fp/s0 points above the metadata record (RISC-V + * convention). + */ + REG_S zero, (S_STACKFRAME + STACKFRAME_FP)(sp) + REG_S zero, (S_STACKFRAME + STACKFRAME_RA)(sp) + li t0, FRAME_META_TYPE_FINAL + REG_S t0, S_STACKFRAME_TYPE(sp) + addi s0, sp, S_STACKFRAME + STACKFRAME_RECORD_SIZE + #if defined(CONFIG_RISCV_SBI) && defined(CONFIG_RISCV_USER_CFI) li a7, SBI_EXT_FWFT li a6, SBI_EXT_FWFT_SET diff --git a/arch/riscv/kernel/mcount-dyn.S b/arch/riscv/kernel/mcount-dyn.S index 082fe0b0e3c083..26c55fba8fec7a 100644 --- a/arch/riscv/kernel/mcount-dyn.S +++ b/arch/riscv/kernel/mcount-dyn.S @@ -85,9 +85,7 @@ addi sp, sp, -FREGS_SIZE_ON_STACK REG_S t0, FREGS_EPC(sp) REG_S x1, FREGS_RA(sp) -#ifdef HAVE_FUNCTION_GRAPH_FP_TEST REG_S x8, FREGS_S0(sp) -#endif REG_S x6, FREGS_T1(sp) #ifdef CONFIG_CC_IS_CLANG REG_S x7, FREGS_T2(sp) @@ -113,9 +111,7 @@ .macro RESTORE_ABI_REGS REG_L t0, FREGS_EPC(sp) REG_L x1, FREGS_RA(sp) -#ifdef HAVE_FUNCTION_GRAPH_FP_TEST REG_L x8, FREGS_S0(sp) -#endif REG_L x6, FREGS_T1(sp) #ifdef CONFIG_CC_IS_CLANG REG_L x7, FREGS_T2(sp) diff --git a/arch/riscv/kernel/perf_callchain.c b/arch/riscv/kernel/perf_callchain.c index b465bc9eb870ec..436af96ea59cab 100644 --- a/arch/riscv/kernel/perf_callchain.c +++ b/arch/riscv/kernel/perf_callchain.c @@ -44,5 +44,5 @@ void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, return; } - walk_stackframe(NULL, regs, fill_callchain, entry); + arch_stack_walk(fill_callchain, entry, NULL, regs); } diff --git a/arch/riscv/kernel/process.c b/arch/riscv/kernel/process.c index b2df7f72241a5f..5212926b926ba1 100644 --- a/arch/riscv/kernel/process.c +++ b/arch/riscv/kernel/process.c @@ -258,8 +258,23 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) /* Supervisor/Machine, irqs on: */ childregs->status = SR_PP | SR_PIE; - p->thread.s[0] = (unsigned long)args->fn; - p->thread.s[1] = (unsigned long)args->fn_arg; + /* + * Set up a metadata frame record at the bottom of the + * stack for the unwinder. Use FRAME_META_TYPE_FINAL + * since this is the outermost kernel entry for the new + * task. The frame_record::{fp,ra} are already zero from + * memset(). + * + * fp/s0 points above the metadata record (RISC-V + * convention). fn and fn_arg are passed via s2/s3, + * keeping s0 available for the frame pointer chain. + */ + childregs->stackframe.type = FRAME_META_TYPE_FINAL; + + p->thread.s[0] = (unsigned long)(&childregs->stackframe) + + sizeof(struct frame_record); + p->thread.s[2] = (unsigned long)args->fn; + p->thread.s[3] = (unsigned long)args->fn_arg; p->thread.ra = (unsigned long)ret_from_fork_kernel_asm; } else { /* allocate new shadow stack if needed. In case of CLONE_VM we have to */ @@ -278,6 +293,18 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) if (clone_flags & CLONE_SETTLS) childregs->tp = tls; childregs->a0 = 0; /* Return value of fork() */ + + /* + * Set up the unwind boundary: ensure the metadata + * frame record has its {fp,ra} sentinel zeroed and + * point fp/s0 above the metadata record. The type + * field is inherited from the parent's pt_regs. + */ + childregs->stackframe.record.fp = 0; + childregs->stackframe.record.ra = 0; + p->thread.s[0] = (unsigned long)(&childregs->stackframe) + + sizeof(struct frame_record); + p->thread.ra = (unsigned long)ret_from_fork_user_asm; } p->thread.riscv_v_flags = 0; diff --git a/arch/riscv/kernel/stacktrace.c b/arch/riscv/kernel/stacktrace.c index 2692d3a06afa2b..0d76320b3a2973 100644 --- a/arch/riscv/kernel/stacktrace.c +++ b/arch/riscv/kernel/stacktrace.c @@ -11,98 +11,16 @@ #include #include #include +#include +#include #include -#ifdef CONFIG_FRAME_POINTER - /* - * This disables KASAN checking when reading a value from another task's stack, - * since the other task could be running on another CPU and could have poisoned - * the stack in the meantime. + * Non-frame-pointer fallback unwinder. + * Only compiled when CONFIG_FRAME_POINTER is not enabled. */ -#define READ_ONCE_TASK_STACK(task, x) \ -({ \ - unsigned long val; \ - unsigned long addr = x; \ - if ((task) == current) \ - val = READ_ONCE(addr); \ - else \ - val = READ_ONCE_NOCHECK(addr); \ - val; \ -}) - -extern asmlinkage void handle_exception(void); -extern unsigned long ret_from_exception_end; - -static inline int fp_is_valid(unsigned long fp, unsigned long sp) -{ - unsigned long low, high; - - low = sp + sizeof(struct stackframe); - high = ALIGN(sp, THREAD_SIZE); - - return !(fp < low || fp > high || fp & 0x07); -} - -void notrace walk_stackframe(struct task_struct *task, struct pt_regs *regs, - bool (*fn)(void *, unsigned long), void *arg) -{ - unsigned long fp, sp, pc; - int graph_idx = 0; - int level = 0; - - if (regs) { - fp = frame_pointer(regs); - sp = user_stack_pointer(regs); - pc = instruction_pointer(regs); - } else if (task == NULL || task == current) { - fp = (unsigned long)__builtin_frame_address(0); - sp = current_stack_pointer; - pc = (unsigned long)walk_stackframe; - level = -1; - } else { - /* task blocked in __switch_to */ - fp = task->thread.s[0]; - sp = task->thread.sp; - pc = task->thread.ra; - } - - for (;;) { - struct stackframe *frame; - - if (unlikely(!__kernel_text_address(pc) || (level++ >= 0 && !fn(arg, pc)))) - break; - - if (unlikely(!fp_is_valid(fp, sp))) - break; - - /* Unwind stack frame */ - frame = (struct stackframe *)fp - 1; - sp = fp; - if (regs && (regs->epc == pc) && fp_is_valid(frame->ra, sp)) { - /* We hit function where ra is not saved on the stack */ - fp = frame->ra; - pc = regs->ra; - } else { - fp = READ_ONCE_TASK_STACK(task, frame->fp); - pc = READ_ONCE_TASK_STACK(task, frame->ra); - pc = ftrace_graph_ret_addr(task, &graph_idx, pc, - &frame->ra); - if (pc >= (unsigned long)handle_exception && - pc < (unsigned long)&ret_from_exception_end) { - if (unlikely(!fn(arg, pc))) - break; - - pc = ((struct pt_regs *)sp)->epc; - fp = ((struct pt_regs *)sp)->s0; - } - } - - } -} - -#else /* !CONFIG_FRAME_POINTER */ +#ifndef CONFIG_FRAME_POINTER void notrace walk_stackframe(struct task_struct *task, struct pt_regs *regs, bool (*fn)(void *, unsigned long), void *arg) @@ -133,7 +51,12 @@ void notrace walk_stackframe(struct task_struct *task, } } -#endif /* CONFIG_FRAME_POINTER */ +#endif /* !CONFIG_FRAME_POINTER */ + +/* + * Common trace helpers. + * These are used by both the FP (kunwind) and non-FP (walk_stackframe) paths. + */ static bool print_trace_address(void *arg, unsigned long pc) { @@ -146,12 +69,12 @@ static bool print_trace_address(void *arg, unsigned long pc) noinline void dump_backtrace(struct pt_regs *regs, struct task_struct *task, const char *loglvl) { - walk_stackframe(task, regs, print_trace_address, (void *)loglvl); + printk("%sCall Trace:\n", loglvl); + arch_stack_walk(print_trace_address, (void *)loglvl, task, regs); } void show_stack(struct task_struct *task, unsigned long *sp, const char *loglvl) { - pr_cont("%sCall Trace:\n", loglvl); dump_backtrace(NULL, task, loglvl); } @@ -171,17 +94,468 @@ unsigned long __get_wchan(struct task_struct *task) if (!try_get_task_stack(task)) return 0; - walk_stackframe(task, NULL, save_wchan, &pc); + arch_stack_walk(save_wchan, &pc, task, NULL); put_task_stack(task); return pc; } -noinline noinstr void arch_stack_walk(stack_trace_consume_fn consume_entry, void *cookie, - struct task_struct *task, struct pt_regs *regs) +/* + * Frame-pointer-based kernel unwind infrastructure. + * Only compiled when CONFIG_FRAME_POINTER is enabled. + * + * See: arch/arm64/kernel/stacktrace.c for the reference implementation. + */ +#ifdef CONFIG_FRAME_POINTER + +/* + * Per-cpu stacks are only accessible when unwinding the current task in a + * non-preemptible context. + */ +#define STACKINFO_CPU(task, name) \ + ({ \ + (((task) == current) && !preemptible()) \ + ? stackinfo_get_##name() \ + : stackinfo_get_unknown(); \ + }) + +enum kunwind_source { + KUNWIND_SOURCE_UNKNOWN, + KUNWIND_SOURCE_FRAME, + KUNWIND_SOURCE_CALLER, + KUNWIND_SOURCE_TASK, + KUNWIND_SOURCE_REGS_PC, +}; + +union unwind_flags { + unsigned long all; + struct { + unsigned long fgraph : 1, + kretprobe : 1; + }; +}; + +/* + * Kernel unwind state + * + * @common: Common unwind state. + * @task: The task being unwound. + * @graph_idx: Used by ftrace_graph_ret_addr() for optimized stack unwinding. + * @kr_cur: When KRETPROBES is selected, holds the kretprobe instance + * associated with the most recently encountered replacement ra + * value. + */ +struct kunwind_state { + struct unwind_state common; + struct task_struct *task; + int graph_idx; +#ifdef CONFIG_KRETPROBES + struct llist_node *kr_cur; +#endif + enum kunwind_source source; + union unwind_flags flags; + struct pt_regs *regs; +}; + +static __always_inline void +kunwind_init(struct kunwind_state *state, + struct task_struct *task) +{ + unwind_init_common(&state->common); + state->task = task; + state->source = KUNWIND_SOURCE_UNKNOWN; + state->flags.all = 0; + state->regs = NULL; +} + +/* + * Start an unwind from a pt_regs. + * + * The unwind will begin at the PC within the regs. + * + * The regs must be on a stack currently owned by the calling task. + */ +static __always_inline void +kunwind_init_from_regs(struct kunwind_state *state, + struct pt_regs *regs) +{ + kunwind_init(state, current); + + state->regs = regs; + state->common.fp = frame_pointer(regs); + state->common.pc = instruction_pointer(regs); + state->source = KUNWIND_SOURCE_REGS_PC; +} + +/* + * Start an unwind from a caller. + * + * The unwind will begin at the caller of whichever function this is inlined + * into. + * + * The function which invokes this must be noinline. + */ +static __always_inline void +kunwind_init_from_caller(struct kunwind_state *state) +{ + unsigned long fp = (unsigned long)__builtin_frame_address(0); + struct frame_record *record = (struct frame_record *)fp - 1; + + kunwind_init(state, current); + + state->common.fp = READ_ONCE(record->fp); + state->common.pc = READ_ONCE(record->ra); + state->source = KUNWIND_SOURCE_CALLER; +} + +/* + * Start an unwind from a blocked task. + * + * The unwind will begin at the blocked task's saved PC (i.e. the caller of + * __switch_to). + * + * The caller should ensure the task is blocked in __switch_to for the + * duration of the unwind, or the unwind will be bogus. It is never valid to + * call this for the current task. + */ +static __always_inline void +kunwind_init_from_task(struct kunwind_state *state, + struct task_struct *task) +{ + kunwind_init(state, task); + + state->common.fp = task->thread.s[0]; + state->common.pc = task->thread.ra; + state->source = KUNWIND_SOURCE_TASK; +} + +static __always_inline int +kunwind_recover_return_address(struct kunwind_state *state) +{ +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + if (state->task->ret_stack && + state->common.pc == (unsigned long)return_to_handler) { + unsigned long orig_pc; + + orig_pc = ftrace_graph_ret_addr(state->task, &state->graph_idx, + state->common.pc, + (void *)state->common.fp); + if (state->common.pc == orig_pc) { + WARN_ON_ONCE(state->task == current); + return -EINVAL; + } + state->common.pc = orig_pc; + state->flags.fgraph = 1; + } +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + +#ifdef CONFIG_KRETPROBES + if (is_kretprobe_trampoline(state->common.pc)) { + unsigned long orig_pc; + + orig_pc = kretprobe_find_ret_addr(state->task, + (void *)state->common.fp, + &state->kr_cur); + if (!orig_pc) + return -EINVAL; + state->common.pc = orig_pc; + state->flags.kretprobe = 1; + } +#endif /* CONFIG_KRETPROBES */ + + return 0; +} + +/* + * When we reach an exception boundary marked by a metadata frame record, + * extract pt_regs from the stack and continue unwinding from the saved + * context (epc and s0/fp). + * + * On RISC-V, fp points above the metadata record, so the record's + * frame_record portion is at fp - sizeof(struct frame_record). + */ +static __always_inline int +kunwind_next_regs_pc(struct kunwind_state *state) +{ + struct stack_info *info; + unsigned long fp = state->common.fp; + struct pt_regs *regs; + + regs = container_of((unsigned long *)(fp - sizeof(struct frame_record)), + struct pt_regs, stackframe.record.fp); + + info = unwind_find_stack(&state->common, (unsigned long)regs, + sizeof(*regs)); + if (!info) + return -EINVAL; + + unwind_consume_stack(&state->common, info, (unsigned long)regs, + sizeof(*regs)); + + state->regs = regs; + state->common.pc = regs->epc; + state->common.fp = frame_pointer(regs); + state->regs = NULL; + state->source = KUNWIND_SOURCE_REGS_PC; + return 0; +} + +/* + * Handle a metadata frame record embedded in pt_regs. + * + * On RISC-V, fp points above the record (fp = metadata + 16), so the + * frame_record_meta starts at fp - sizeof(struct frame_record). + * + * FRAME_META_TYPE_FINAL: This is the outermost exception entry + * (user -> kernel). Unwinding terminates successfully. + * FRAME_META_TYPE_PT_REGS: This is a nested exception entry + * (kernel -> kernel). Continue unwinding from the saved context. + */ +static __always_inline int +kunwind_next_frame_record_meta(struct kunwind_state *state) +{ + struct task_struct *tsk = state->task; + unsigned long fp = state->common.fp; + unsigned long meta_base = fp - sizeof(struct frame_record); + struct frame_record_meta *meta; + struct stack_info *info; + + info = unwind_find_stack(&state->common, meta_base, sizeof(*meta)); + if (!info) + return -EINVAL; + + meta = (struct frame_record_meta *)meta_base; + switch (READ_ONCE(meta->type)) { + case FRAME_META_TYPE_FINAL: + if (meta == &task_pt_regs(tsk)->stackframe) + return -ENOENT; + WARN_ON_ONCE(tsk == current); + return -EINVAL; + case FRAME_META_TYPE_PT_REGS: + return kunwind_next_regs_pc(state); + default: + WARN_ON_ONCE(tsk == current); + return -EINVAL; + } +} + +/* + * Unwind from one frame record to the next. + * + * On RISC-V, the frame record sits at fp - sizeof(struct frame_record), + * immediately below the address pointed to by fp/s0. This applies to both + * normal frame records and metadata frame records (embedded in pt_regs). + * + * A metadata record is identified by both fp and ra being zero in the + * frame_record portion, with a type value following at fp + 16. + */ +static __always_inline int +kunwind_next_frame_record(struct kunwind_state *state) +{ + unsigned long fp = state->common.fp; + struct frame_record *record; + struct stack_info *info; + unsigned long new_fp, new_pc; + unsigned long record_base; + + if (fp & 0x7) + return -EINVAL; + + record_base = fp - sizeof(*record); + + info = unwind_find_stack(&state->common, record_base, sizeof(*record)); + if (!info) + return -EINVAL; + + record = (struct frame_record *)record_base; + new_fp = READ_ONCE(record->fp); + new_pc = READ_ONCE(record->ra); + + if (!new_fp && !new_pc) + return kunwind_next_frame_record_meta(state); + + unwind_consume_stack(&state->common, info, record_base, + sizeof(*record)); + + state->common.fp = new_fp; + state->common.pc = new_pc; + state->source = KUNWIND_SOURCE_FRAME; + + return 0; +} + +/* + * Unwind from one frame record (A) to the next frame record (B). + * + * We terminate early if the location of B indicates a malformed chain of frame + * records (e.g. a cycle), determined based on the location and fp value of A + * and the location (but not the fp value) of B. + */ +static __always_inline int +kunwind_next(struct kunwind_state *state) +{ + int err; + + state->flags.all = 0; + + switch (state->source) { + case KUNWIND_SOURCE_FRAME: + case KUNWIND_SOURCE_CALLER: + case KUNWIND_SOURCE_TASK: + case KUNWIND_SOURCE_REGS_PC: + err = kunwind_next_frame_record(state); + break; + default: + err = -EINVAL; + } + + if (err) + return err; + + return kunwind_recover_return_address(state); +} + +typedef bool (*kunwind_consume_fn)(const struct kunwind_state *state, void *cookie); + +static __always_inline int +do_kunwind(struct kunwind_state *state, kunwind_consume_fn consume_state, + void *cookie) +{ + int ret; + + ret = kunwind_recover_return_address(state); + if (ret) + return ret; + + while (1) { + if (!consume_state(state, cookie)) + return -EINVAL; + ret = kunwind_next(state); + if (ret == -ENOENT) + return 0; + if (ret < 0) + return ret; + } +} + +static __always_inline int +kunwind_stack_walk(kunwind_consume_fn consume_state, + void *cookie, struct task_struct *task, + struct pt_regs *regs) +{ + struct task_struct *tsk = task ?: current; + struct stack_info stacks[] = { + stackinfo_get_task(tsk), + STACKINFO_CPU(tsk, irq), +#ifdef CONFIG_VMAP_STACK + STACKINFO_CPU(tsk, overflow), +#endif + }; + struct kunwind_state state = { + .common = { + .stacks = stacks, + .nr_stacks = ARRAY_SIZE(stacks), + }, + }; + + if (regs) { + if (tsk != current) + return -EINVAL; + kunwind_init_from_regs(&state, regs); + } else if (tsk == current) { + kunwind_init_from_caller(&state); + } else { + kunwind_init_from_task(&state, tsk); + } + + return do_kunwind(&state, consume_state, cookie); +} + +struct kunwind_consume_entry_data { + stack_trace_consume_fn consume_entry; + void *cookie; +}; + +static __always_inline bool +arch_kunwind_consume_entry(const struct kunwind_state *state, void *cookie) +{ + struct kunwind_consume_entry_data *data = cookie; + + return data->consume_entry(data->cookie, state->common.pc); +} + +static __always_inline bool +arch_reliable_kunwind_consume_entry(const struct kunwind_state *state, void *cookie) +{ + /* + * At an exception boundary we can reliably consume the saved PC. We do + * not know whether the LR was live when the exception was taken, and + * so we cannot perform the next unwind step reliably. + * + * All that matters is whether the *entire* unwind is reliable, so give + * up as soon as we hit an exception boundary. + */ + if (state->source == KUNWIND_SOURCE_REGS_PC) + return false; + + return arch_kunwind_consume_entry(state, cookie); +} + +#endif /* CONFIG_FRAME_POINTER */ + +/* + * arch_stack_walk - dual implementation. + * + * When CONFIG_FRAME_POINTER is enabled, uses the kunwind infrastructure for + * robust frame-pointer-based unwinding, consistent with arch_stack_walk_reliable. + * + * When CONFIG_FRAME_POINTER is disabled, falls back to the simple stack scan + * in walk_stackframe(). + */ +#ifdef CONFIG_FRAME_POINTER + +noinline noinstr void arch_stack_walk(stack_trace_consume_fn consume_entry, + void *cookie, struct task_struct *task, + struct pt_regs *regs) +{ + struct kunwind_consume_entry_data data = { + .consume_entry = consume_entry, + .cookie = cookie, + }; + + kunwind_stack_walk(arch_kunwind_consume_entry, &data, task, regs); +} + +#else + +noinline noinstr void arch_stack_walk(stack_trace_consume_fn consume_entry, + void *cookie, struct task_struct *task, + struct pt_regs *regs) { walk_stackframe(task, regs, consume_entry, cookie); } +#endif /* CONFIG_FRAME_POINTER */ + +/* + * Reliable stack walk for livepatch (CONFIG_FRAME_POINTER only). + */ +#ifdef CONFIG_FRAME_POINTER + +noinline noinstr int arch_stack_walk_reliable(stack_trace_consume_fn consume_entry, + void *cookie, + struct task_struct *task) +{ + struct kunwind_consume_entry_data data = { + .consume_entry = consume_entry, + .cookie = cookie, + }; + + return kunwind_stack_walk(arch_reliable_kunwind_consume_entry, &data, + task, NULL); +} + +#endif /* CONFIG_FRAME_POINTER */ + /* * Get the return address for a single stackframe and return a pointer to the * next frame tail. diff --git a/scripts/sorttable.c b/scripts/sorttable.c index e8ed11c680c6d5..b4061c2c03e1fa 100644 --- a/scripts/sorttable.c +++ b/scripts/sorttable.c @@ -901,11 +901,17 @@ static int do_file(char const *const fname, void *addr) /* fallthrough */ case EM_386: case EM_LOONGARCH: - case EM_RISCV: case EM_S390: case EM_X86_64: custom_sort = sort_relative_table_with_data; break; + case EM_RISCV: +#ifdef MCOUNT_SORT_ENABLED + /* RISC-V uses patchable function entries before function entry. */ + before_func = 8; +#endif + custom_sort = sort_relative_table_with_data; + break; case EM_PARISC: case EM_PPC: case EM_PPC64: diff --git a/tools/testing/selftests/livepatch/test_modules/test_klp_syscall.c b/tools/testing/selftests/livepatch/test_modules/test_klp_syscall.c index dd802783ea849f..275e4b10cf5950 100644 --- a/tools/testing/selftests/livepatch/test_modules/test_klp_syscall.c +++ b/tools/testing/selftests/livepatch/test_modules/test_klp_syscall.c @@ -18,6 +18,8 @@ #define FN_PREFIX __s390x_ #elif defined(__aarch64__) #define FN_PREFIX __arm64_ +#elif defined(__riscv) +#define FN_PREFIX __riscv_ #else /* powerpc does not select ARCH_HAS_SYSCALL_WRAPPER */ #define FN_PREFIX