From 29166d02d0473d0ef06366088c7fb465283f6b47 Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Thu, 28 May 2026 23:12:30 +0200 Subject: [PATCH 1/2] riscv: misaligned: Fix fast_unaligned_access_speed_key init When booting with unaligned_scalar_speed=fast, fast_unaligned_access_speed_key is initialized incorrectly. The key is currently derived from the fast_misaligned_access cpumask, but that mask is only populated when the unaligned access speed probe runs. Specifying unaligned_scalar_speed=fast skips the probe entirely, leaving the mask uninitialized. The information tracked by fast_misaligned_access is already available in the misaligned_access_speed per-CPU variable. Use that to initialize fast_unaligned_access_speed_key instead and remove the redundant cpumask. Signed-off-by: Nam Cao Signed-off-by: Linux RISC-V bot --- arch/riscv/kernel/unaligned_access_speed.c | 69 +++++++--------------- 1 file changed, 22 insertions(+), 47 deletions(-) diff --git a/arch/riscv/kernel/unaligned_access_speed.c b/arch/riscv/kernel/unaligned_access_speed.c index 11c781a4de733a..bb57eb5d19dfe3 100644 --- a/arch/riscv/kernel/unaligned_access_speed.c +++ b/arch/riscv/kernel/unaligned_access_speed.c @@ -27,8 +27,6 @@ DEFINE_PER_CPU(long, vector_misaligned_access) = RISCV_HWPROBE_MISALIGNED_VECTOR static long unaligned_scalar_speed_param = RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN; static long unaligned_vector_speed_param = RISCV_HWPROBE_MISALIGNED_VECTOR_UNKNOWN; -static cpumask_t fast_misaligned_access; - static u64 __maybe_unused measure_cycles(void (*func)(void *dst, const void *src, size_t len), void *dst, void *src, size_t len) @@ -131,13 +129,10 @@ static int check_unaligned_access(struct page *page) * Set the value of fast_misaligned_access of a CPU. These operations * are atomic to avoid race conditions. 
*/ - if (ret) { + if (ret) per_cpu(misaligned_access_speed, cpu) = RISCV_HWPROBE_MISALIGNED_SCALAR_FAST; - cpumask_set_cpu(cpu, &fast_misaligned_access); - } else { + else per_cpu(misaligned_access_speed, cpu) = RISCV_HWPROBE_MISALIGNED_SCALAR_SLOW; - cpumask_clear_cpu(cpu, &fast_misaligned_access); - } return 0; } @@ -192,49 +187,24 @@ static void __init check_unaligned_access_speed_all_cpus(void) DEFINE_STATIC_KEY_FALSE(fast_unaligned_access_speed_key); -static void modify_unaligned_access_branches(cpumask_t *mask, int weight) +static void modify_unaligned_access_branches(const cpumask_t *mask) { - if (cpumask_weight(mask) == weight) + bool fast = true; + int cpu; + + for_each_cpu(cpu, mask) { + if (per_cpu(misaligned_access_speed, cpu) != RISCV_HWPROBE_MISALIGNED_SCALAR_FAST) { + fast = false; + break; + } + } + + if (fast) static_branch_enable_cpuslocked(&fast_unaligned_access_speed_key); else static_branch_disable_cpuslocked(&fast_unaligned_access_speed_key); } -static void set_unaligned_access_static_branches_except_cpu(int cpu) -{ - /* - * Same as set_unaligned_access_static_branches, except excludes the - * given CPU from the result. When a CPU is hotplugged into an offline - * state, this function is called before the CPU is set to offline in - * the cpumask, and thus the CPU needs to be explicitly excluded. - */ - - cpumask_t fast_except_me; - - cpumask_and(&fast_except_me, &fast_misaligned_access, cpu_online_mask); - cpumask_clear_cpu(cpu, &fast_except_me); - - modify_unaligned_access_branches(&fast_except_me, num_online_cpus() - 1); -} - -static void set_unaligned_access_static_branches(void) -{ - /* - * This will be called after check_unaligned_access_all_cpus so the - * result of unaligned access speed for all CPUs will be available. - * - * To avoid the number of online cpus changing between reading - * cpu_online_mask and calling num_online_cpus, cpus_read_lock must be - * held before calling this function. 
- */ - - cpumask_t fast_and_online; - - cpumask_and(&fast_and_online, &fast_misaligned_access, cpu_online_mask); - - modify_unaligned_access_branches(&fast_and_online, num_online_cpus()); -} - static int riscv_online_cpu(unsigned int cpu) { int ret = cpu_online_unaligned_access_init(cpu); @@ -266,14 +236,19 @@ static int riscv_online_cpu(unsigned int cpu) #endif exit: - set_unaligned_access_static_branches(); + modify_unaligned_access_branches(cpu_online_mask); return 0; } static int riscv_offline_cpu(unsigned int cpu) { - set_unaligned_access_static_branches_except_cpu(cpu); + cpumask_t mask; + + cpumask_copy(&mask, cpu_online_mask); + cpumask_clear_cpu(cpu, &mask); + + modify_unaligned_access_branches(&mask); return 0; } @@ -430,7 +405,7 @@ static int __init check_unaligned_access_all_cpus(void) riscv_online_cpu_vec, NULL); cpus_read_lock(); - set_unaligned_access_static_branches(); + modify_unaligned_access_branches(cpu_online_mask); cpus_read_unlock(); return 0; From 3f331455f5bb17206ff0a64723a4fffb519a5efe Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Thu, 28 May 2026 23:12:31 +0200 Subject: [PATCH 2/2] riscv: traps_misaligned: Avoid redundant unaligned access speed probe When a CPU is taken offline and then is brought back online, the unaligned access speed probe always runs even though the unaligned access speed is already known, wasting CPU cycles. This is because when a CPU becomes online, the following happens: 1. check_unaligned_access_emulated() is called, which clears misaligned_access_speed if there is no emulation. 2. check_unaligned_access() is called because misaligned_access_speed is cleared, wasting CPU cycles determining something that is already previously known. Avoid the redundant access speed probe by no longer clearing misaligned_access_speed in (1). If the access speed is already known, just reuse it. On my VisionFive 2, this reduces CPU bring-up time from 26ms to 0.8ms. 
Signed-off-by: Nam Cao Signed-off-by: Linux RISC-V bot --- arch/riscv/kernel/traps_misaligned.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/riscv/kernel/traps_misaligned.c b/arch/riscv/kernel/traps_misaligned.c index 81b7682e6c6dbc..6e8ae6c66322c3 100644 --- a/arch/riscv/kernel/traps_misaligned.c +++ b/arch/riscv/kernel/traps_misaligned.c @@ -522,10 +522,10 @@ static bool unaligned_ctl __read_mostly; static void check_unaligned_access_emulated(void *arg __always_unused) { int cpu = smp_processor_id(); - long *mas_ptr = per_cpu_ptr(&misaligned_access_speed, cpu); unsigned long tmp_var, tmp_val; - *mas_ptr = RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN; + if (per_cpu(misaligned_access_speed, cpu) != RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN) + return; __asm__ __volatile__ ( " "REG_L" %[tmp], 1(%[ptr])\n"