diff --git a/Documentation/devicetree/bindings/riscv/extensions.yaml b/Documentation/devicetree/bindings/riscv/extensions.yaml index 2b0a8a93bb2144..1c6f091518d49a 100644 --- a/Documentation/devicetree/bindings/riscv/extensions.yaml +++ b/Documentation/devicetree/bindings/riscv/extensions.yaml @@ -232,6 +232,12 @@ properties: ratified at commit d70011dde6c2 ("Update to ratified state") of riscv-j-extension. + - const: ssqosid + description: | + The standard Ssqosid extension for Quality of Service ID is + ratified as v1.0 in commit d9c616497fde ("Merge pull + request #7 from ved-rivos/Ratified") of riscv-ssqosid. + - const: ssstateen description: | The standard Ssstateen extension for supervisor-mode view of the diff --git a/MAINTAINERS b/MAINTAINERS index c2c6d79275c6eb..eab31c7b5e9174 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -23017,6 +23017,21 @@ F: drivers/perf/riscv_pmu.c F: drivers/perf/riscv_pmu_legacy.c F: drivers/perf/riscv_pmu_sbi.c +RISC-V QOS RESCTRL SUPPORT +M: Drew Fustini +R: yunhui cui +L: linux-riscv@lists.infradead.org +S: Supported +F: arch/riscv/include/asm/qos.h +F: arch/riscv/include/asm/resctrl.h +F: arch/riscv/kernel/qos.c +F: drivers/acpi/riscv/rqsc.c +F: drivers/acpi/riscv/rqsc.h +F: drivers/resctrl/cbqri_devices.c +F: drivers/resctrl/cbqri_internal.h +F: drivers/resctrl/cbqri_resctrl.c +F: include/linux/riscv_cbqri.h + RISC-V RPMI AND MPXY DRIVERS M: Rahul Pathak M: Anup Patel diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index e2a0522571760b..5d2fdf74ee716e 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -591,6 +591,26 @@ config RISCV_ISA_SVNAPOT If you don't know what to do here, say Y. +config RISCV_ISA_SSQOSID + bool "Ssqosid extension support for supervisor mode Quality of Service ID" + depends on 64BIT + default n + select ARCH_HAS_CPU_RESCTRL + select RISCV_CBQRI_DRIVER + help + Adds support for the Ssqosid ISA extension (Supervisor-mode + Quality of Service ID). 
+ + Ssqosid defines the srmcfg CSR which allows the system to tag the + running process with an RCID (Resource Control ID) and MCID + (Monitoring Counter ID). The RCID is used to determine resource + allocation. The MCID is used to track resource usage in event + counters. + + For example, a cache controller may use the RCID to apply a + cache partitioning scheme and use the MCID to track how much + cache a process, or a group of processes, is using. + config RISCV_ISA_SVPBMT bool "Svpbmt extension support for supervisor mode page-based memory types" depends on 64BIT && MMU diff --git a/arch/riscv/include/asm/acpi.h b/arch/riscv/include/asm/acpi.h index 26ab37c171bcf6..3cfd0102085ee9 100644 --- a/arch/riscv/include/asm/acpi.h +++ b/arch/riscv/include/asm/acpi.h @@ -67,6 +67,16 @@ int acpi_get_riscv_isa(struct acpi_table_header *table, void acpi_get_cbo_block_size(struct acpi_table_header *table, u32 *cbom_size, u32 *cboz_size, u32 *cbop_size); + +#ifdef CONFIG_RISCV_CBQRI_DRIVER +int __init acpi_parse_rqsc(struct acpi_table_header *table); +#else +static inline int acpi_parse_rqsc(struct acpi_table_header *table) +{ + return -EINVAL; +} +#endif /* CONFIG_RISCV_CBQRI_DRIVER */ + #else static inline void acpi_init_rintc_map(void) { } static inline struct acpi_madt_rintc *acpi_cpu_get_madt_rintc(int cpu) diff --git a/arch/riscv/include/asm/csr.h b/arch/riscv/include/asm/csr.h index 31b8988f4488da..7bce928e5daa09 100644 --- a/arch/riscv/include/asm/csr.h +++ b/arch/riscv/include/asm/csr.h @@ -84,6 +84,10 @@ #define SATP_ASID_MASK _AC(0xFFFF, UL) #endif +/* SRMCFG fields */ +#define SRMCFG_RCID_MASK GENMASK(11, 0) +#define SRMCFG_MCID_MASK GENMASK(27, 16) + /* Exception cause high bit - is an interrupt if set */ #define CAUSE_IRQ_FLAG (_AC(1, UL) << (__riscv_xlen - 1)) @@ -328,6 +332,7 @@ #define CSR_STVAL 0x143 #define CSR_SIP 0x144 #define CSR_SATP 0x180 +#define CSR_SRMCFG 0x181 #define CSR_STIMECMP 0x14D #define CSR_STIMECMPH 0x15D diff --git 
a/arch/riscv/include/asm/hwcap.h b/arch/riscv/include/asm/hwcap.h index 7ef8e5f55c8dcf..b83dae5cebb992 100644 --- a/arch/riscv/include/asm/hwcap.h +++ b/arch/riscv/include/asm/hwcap.h @@ -112,6 +112,7 @@ #define RISCV_ISA_EXT_ZCLSD 103 #define RISCV_ISA_EXT_ZICFILP 104 #define RISCV_ISA_EXT_ZICFISS 105 +#define RISCV_ISA_EXT_SSQOSID 106 #define RISCV_ISA_EXT_XLINUXENVCFG 127 diff --git a/arch/riscv/include/asm/processor.h b/arch/riscv/include/asm/processor.h index 812517b2cec135..49a386d74cd3f0 100644 --- a/arch/riscv/include/asm/processor.h +++ b/arch/riscv/include/asm/processor.h @@ -123,6 +123,9 @@ struct thread_struct { /* A forced icache flush is not needed if migrating to the previous cpu. */ unsigned int prev_cpu; #endif +#ifdef CONFIG_RISCV_ISA_SSQOSID + u32 srmcfg; +#endif }; /* Whitelist the fstate from the task_struct for hardened usercopy */ diff --git a/arch/riscv/include/asm/qos.h b/arch/riscv/include/asm/qos.h new file mode 100644 index 00000000000000..727d438454f35c --- /dev/null +++ b/arch/riscv/include/asm/qos.h @@ -0,0 +1,87 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_RISCV_QOS_H +#define _ASM_RISCV_QOS_H + +#include + +#ifdef CONFIG_RISCV_ISA_SSQOSID + +#include +#include +#include + +#include +#include +#include + +/* cached value of srmcfg csr for each cpu */ +DECLARE_PER_CPU(u32, cpu_srmcfg); + +/* default srmcfg value for each cpu, set via resctrl cpu assignment */ +DECLARE_PER_CPU(u32, cpu_srmcfg_default); + +static inline void __switch_to_srmcfg(struct task_struct *next) +{ + u32 thread_srmcfg, default_srmcfg; + + thread_srmcfg = READ_ONCE(next->thread.srmcfg); + default_srmcfg = __this_cpu_read(cpu_srmcfg_default); + + /* + * RCID and MCID inherit from cpu_srmcfg_default independently. + * RESCTRL_RESERVED_CLOSID and RESCTRL_RESERVED_RMID are both 0, + * so a per-field zero means "no task assignment for this + * dimension" and the CPU default supplies that field. Matches + * x86 RDT's __resctrl_sched_in() per-field logic. 
The fully + * unassigned (thread.srmcfg == 0) and fully assigned (both + * fields non-zero) cases short-circuit the field math. + */ + if (thread_srmcfg == 0) { + thread_srmcfg = default_srmcfg; + } else { + u32 rcid = FIELD_GET(SRMCFG_RCID_MASK, thread_srmcfg); + u32 mcid = FIELD_GET(SRMCFG_MCID_MASK, thread_srmcfg); + + if (rcid == 0 || mcid == 0) { + if (rcid == 0) + rcid = FIELD_GET(SRMCFG_RCID_MASK, default_srmcfg); + if (mcid == 0) + mcid = FIELD_GET(SRMCFG_MCID_MASK, default_srmcfg); + thread_srmcfg = FIELD_PREP(SRMCFG_RCID_MASK, rcid) | + FIELD_PREP(SRMCFG_MCID_MASK, mcid); + } + } + + if (thread_srmcfg != __this_cpu_read(cpu_srmcfg)) { + /* + * Drain stores from the outgoing task before the CSR write + * so they retain the previous RCID/MCID tag at the cache + * interconnect. + */ + RISCV_FENCE(rw, o); + + __this_cpu_write(cpu_srmcfg, thread_srmcfg); + csr_write(CSR_SRMCFG, thread_srmcfg); + /* + * Order the csrw before the new task's loads/stores so they + * pick up the new tag. Zicsr 6.1.1 makes CSR writes weakly + * ordered (device-output) vs memory ops. Ssqosid v1.0 is + * silent so honor the general CSR rule. + */ + RISCV_FENCE(o, rw); + } +} + +static __always_inline bool has_srmcfg(void) +{ + return riscv_has_extension_unlikely(RISCV_ISA_EXT_SSQOSID); +} + +#else /* ! 
CONFIG_RISCV_ISA_SSQOSID */ + +struct task_struct; +static __always_inline bool has_srmcfg(void) { return false; } +static inline void __switch_to_srmcfg(struct task_struct *next) { } + +#endif /* CONFIG_RISCV_ISA_SSQOSID */ +#endif /* _ASM_RISCV_QOS_H */ diff --git a/arch/riscv/include/asm/resctrl.h b/arch/riscv/include/asm/resctrl.h new file mode 100644 index 00000000000000..282b5b59e3ee87 --- /dev/null +++ b/arch/riscv/include/asm/resctrl.h @@ -0,0 +1,152 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef _ASM_RISCV_RESCTRL_H +#define _ASM_RISCV_RESCTRL_H + +#include +#include +#include + +#include + +struct rdt_resource; + +/* + * Sentinel "no CLOSID assigned" used by resctrl_arch_rmid_idx_decode(). + * fs/resctrl treats this opaquely. CBQRI uses MCID directly as the linear + * rmid index, so closid is unused on decode. + */ +#define RISCV_RESCTRL_EMPTY_CLOSID ((u32)~0) + +/* + * Terminology mapping between x86 (Intel RDT/AMD QoS) and RISC-V: + * + * CLOSID on x86 is RCID on RISC-V + * RMID on x86 is MCID on RISC-V + * CDP on x86 is AT (access type) on RISC-V + * + * Each fast-path arch entry point below is the RISC-V realization of the + * generic contract documented in <linux/resctrl.h>. Comments here describe + * only the RISC-V-specific behavior (srmcfg encoding, CBQRI controller + * lookup, MCID-as-index policy). + */ + +/** + * resctrl_arch_alloc_capable() - any CBQRI controller exposes resctrl alloc + * + * Returns true once at least one CBQRI controller has successfully probed for + * a resctrl-exposed allocation feature (cache capacity or memory bandwidth). + * Only meaningful after cbqri_resctrl_setup() runs at late_initcall. + */ +bool resctrl_arch_alloc_capable(void); + +/** + * resctrl_arch_mon_capable() - any CBQRI controller exposes resctrl monitoring + * + * Returns true once at least one CBQRI controller has successfully probed a + * monitoring event wired through resctrl (L3 occupancy or L3 mbm_total_bytes).
+ */ +bool resctrl_arch_mon_capable(void); + +/** + * resctrl_arch_rmid_idx_encode() - encode (RCID, MCID) into a linear index + * @closid: RCID (resource control id) + * @rmid: MCID (monitoring counter id) + * + * RISC-V uses MCID directly as the linear index into per-RMID arrays + * managed by fs/resctrl, since CBQRI controllers admit any MCID for any + * RCID. closid is unused here. CDP is encoded via the AT field on each + * CBQRI op rather than via the index. + */ +u32 resctrl_arch_rmid_idx_encode(u32 closid, u32 rmid); + +/** + * resctrl_arch_rmid_idx_decode() - inverse of resctrl_arch_rmid_idx_encode() + * @idx: linear index + * @closid: out: always RISCV_RESCTRL_EMPTY_CLOSID + * @rmid: out: the MCID that @idx encodes + */ +void resctrl_arch_rmid_idx_decode(u32 idx, u32 *closid, u32 *rmid); + +/** + * resctrl_arch_set_cpu_default_closid_rmid() - install per-CPU srmcfg default + * @cpu: CPU number + * @closid: RCID to use when no task is matched + * @rmid: MCID to use when no task is matched + * + * Sets the per-CPU cpu_srmcfg_default so __switch_to_srmcfg() can fall back + * to the CPU's default RCID/MCID for default-group tasks (those whose + * thread.srmcfg encodes to 0, i.e. closid == RESCTRL_RESERVED_CLOSID and + * rmid == RESCTRL_RESERVED_RMID). Implements resctrl allocation rule 2 + * ("CPU default") on RISC-V. + */ +void resctrl_arch_set_cpu_default_closid_rmid(int cpu, u32 closid, u32 rmid); + +/** + * resctrl_arch_sched_in() - context-switch hook to install task RCID/MCID + * @tsk: the task being scheduled in + * + * Called from finish_task_switch() to write tsk->thread.srmcfg into the + * srmcfg CSR. Tasks whose RCID or MCID field is 0 inherit that field from the + * per-CPU default set via resctrl_arch_set_cpu_default_closid_rmid().
+ */ +void resctrl_arch_sched_in(struct task_struct *tsk); + +/** + * resctrl_arch_set_closid_rmid() - tag a task with an RCID/MCID + * @tsk: task to tag + * @closid: RCID to install + * @rmid: MCID to install + * + * Updates tsk->thread.srmcfg with the encoded (RCID, MCID) pair. The new + * value takes effect on the next resctrl_arch_sched_in() for this task. + */ +void resctrl_arch_set_closid_rmid(struct task_struct *tsk, u32 closid, u32 rmid); + +/** + * resctrl_arch_match_closid() - test whether a task carries a given RCID + * @tsk: task + * @closid: RCID + */ +bool resctrl_arch_match_closid(struct task_struct *tsk, u32 closid); + +/** + * resctrl_arch_match_rmid() - test whether a task carries a given (RCID, MCID) + * @tsk: task + * @closid: RCID + * @rmid: MCID + */ +bool resctrl_arch_match_rmid(struct task_struct *tsk, u32 closid, u32 rmid); + +/** + * resctrl_arch_mon_ctx_alloc() - allocate per-monitor-event arch context + * @r: resctrl resource being monitored + * @evtid: which monitor event needs context + * + * Returns an opaque pointer that resctrl_arch_rmid_read() can use to find the + * CBQRI controller backing this event. CBQRI's BC bandwidth context is + * keyed off the resource's L3 monitoring domain rather than per-event state, + * so this implementation returns NULL. 
+ */ +void *resctrl_arch_mon_ctx_alloc(struct rdt_resource *r, enum resctrl_event_id evtid); + +/** + * resctrl_arch_mon_ctx_free() - release context returned by mon_ctx_alloc() + * @r: resctrl resource + * @evtid: monitor event id + * @arch_mon_ctx: pointer returned by resctrl_arch_mon_ctx_alloc() + */ +void resctrl_arch_mon_ctx_free(struct rdt_resource *r, enum resctrl_event_id evtid, + void *arch_mon_ctx); + +static inline unsigned int resctrl_arch_round_mon_val(unsigned int val) +{ + return val; +} + +/* Not needed for RISC-V */ +static inline void resctrl_arch_enable_mon(void) { } +static inline void resctrl_arch_disable_mon(void) { } +static inline void resctrl_arch_enable_alloc(void) { } +static inline void resctrl_arch_disable_alloc(void) { } + +#endif /* _ASM_RISCV_RESCTRL_H */ diff --git a/arch/riscv/include/asm/switch_to.h b/arch/riscv/include/asm/switch_to.h index 0e71eb82f920ca..1c7ea53ec012ad 100644 --- a/arch/riscv/include/asm/switch_to.h +++ b/arch/riscv/include/asm/switch_to.h @@ -14,6 +14,7 @@ #include #include #include +#include #ifdef CONFIG_FPU extern void __fstate_save(struct task_struct *save_to); @@ -119,6 +120,8 @@ do { \ __switch_to_fpu(__prev, __next); \ if (has_vector() || has_xtheadvector()) \ __switch_to_vector(__prev, __next); \ + if (has_srmcfg()) \ + __switch_to_srmcfg(__next); \ if (switch_to_should_flush_icache(__next)) \ local_flush_icache_all(); \ __switch_to_envcfg(__next); \ diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile index cabb99cadfb6d1..ebe1c3588177b4 100644 --- a/arch/riscv/kernel/Makefile +++ b/arch/riscv/kernel/Makefile @@ -128,3 +128,5 @@ obj-$(CONFIG_ACPI_NUMA) += acpi_numa.o obj-$(CONFIG_GENERIC_CPU_VULNERABILITIES) += bugs.o obj-$(CONFIG_RISCV_USER_CFI) += usercfi.o + +obj-$(CONFIG_RISCV_ISA_SSQOSID) += qos.o diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c index f46aa5602d74d3..668a7e71ff1c64 100644 --- a/arch/riscv/kernel/cpufeature.c +++ 
b/arch/riscv/kernel/cpufeature.c @@ -582,6 +582,7 @@ const struct riscv_isa_ext_data riscv_isa_ext[] = { __RISCV_ISA_EXT_DATA(ssaia, RISCV_ISA_EXT_SSAIA), __RISCV_ISA_EXT_DATA(sscofpmf, RISCV_ISA_EXT_SSCOFPMF), __RISCV_ISA_EXT_SUPERSET(ssnpm, RISCV_ISA_EXT_SSNPM, riscv_xlinuxenvcfg_exts), + __RISCV_ISA_EXT_DATA(ssqosid, RISCV_ISA_EXT_SSQOSID), __RISCV_ISA_EXT_DATA(sstc, RISCV_ISA_EXT_SSTC), __RISCV_ISA_EXT_DATA(svade, RISCV_ISA_EXT_SVADE), __RISCV_ISA_EXT_DATA_VALIDATE(svadu, RISCV_ISA_EXT_SVADU, riscv_ext_svadu_validate), diff --git a/arch/riscv/kernel/qos.c b/arch/riscv/kernel/qos.c new file mode 100644 index 00000000000000..d18b99b195e79e --- /dev/null +++ b/arch/riscv/kernel/qos.c @@ -0,0 +1,98 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +/* + * Cached value of srmcfg csr for each cpu. Seeded to U32_MAX so the next + * __switch_to_srmcfg() unconditionally writes the CSR; the encoding + * MCID << 16 | RCID with both fields well under 16 bits can never + * produce this sentinel. This covers early-boot context switches that + * happen before riscv_srmcfg_init() runs as an arch_initcall. + */ +DEFINE_PER_CPU(u32, cpu_srmcfg) = U32_MAX; + +/* default srmcfg value for each cpu, set via resctrl cpu assignment */ +DEFINE_PER_CPU(u32, cpu_srmcfg_default); + +/* + * Seed the per-CPU srmcfg cache to a sentinel that no real srmcfg encoding + * can produce (MCID << 16 | RCID, both fields well under 16 bits) so the + * next __switch_to_srmcfg() unconditionally writes the CSR. Ssqosid v1.0 + * leaves CSR state across hart stop/start implementation-defined, so the + * cached value cannot be trusted after online. + */ +static int riscv_srmcfg_online(unsigned int cpu) +{ + per_cpu(cpu_srmcfg, cpu) = U32_MAX; + return 0; +} + +/* + * Invalidate the cache on offline too. 
The sentinel persists across the + * offline period, so a CPU brought back online finds the cache already + * invalidated before it is schedulable. This closes the window where a + * task scheduled before riscv_srmcfg_online() runs could match a stale + * cache and skip the CSR write while the hardware CSR was reset across + * hart stop/start. + */ +static int riscv_srmcfg_offline(unsigned int cpu) +{ + per_cpu(cpu_srmcfg, cpu) = U32_MAX; + return 0; +} + +/* + * CPU PM notifier: invalidate the cached srmcfg on resume from a deep + * idle / suspend. Ssqosid v1.0 leaves CSR_SRMCFG state across low-power + * transitions implementation-defined, and the boot CPU never goes + * through the cpuhp online callback during system suspend, so without + * this hook __switch_to_srmcfg() would skip the CSR write when the + * incoming task happens to share its srmcfg with the pre-suspend cache. + */ +static int riscv_srmcfg_pm_notify(struct notifier_block *nb, + unsigned long action, void *unused) +{ + switch (action) { + case CPU_PM_EXIT: + case CPU_PM_ENTER_FAILED: + __this_cpu_write(cpu_srmcfg, U32_MAX); + break; + } + return NOTIFY_OK; +} + +static struct notifier_block riscv_srmcfg_pm_nb = { + .notifier_call = riscv_srmcfg_pm_notify, +}; + +static int __init riscv_srmcfg_init(void) +{ + int err; + + if (!riscv_has_extension_unlikely(RISCV_ISA_EXT_SSQOSID)) + return 0; + + /* + * cpuhp_setup_state() invokes the startup callback locally on every + * already-online CPU, so no separate seed loop is needed here.
+ */ + err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "riscv/srmcfg:online", + riscv_srmcfg_online, riscv_srmcfg_offline); + if (err < 0) { + pr_warn("srmcfg cpuhp registration failed (%d), cpus brought online after boot will not invalidate the CSR_SRMCFG cache\n", + err); + return err; + } + + cpu_pm_register_notifier(&riscv_srmcfg_pm_nb); + return 0; +} +arch_initcall(riscv_srmcfg_init); diff --git a/drivers/acpi/riscv/Makefile b/drivers/acpi/riscv/Makefile index 1284a076fa8887..77f8f0101b7e8d 100644 --- a/drivers/acpi/riscv/Makefile +++ b/drivers/acpi/riscv/Makefile @@ -1,5 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only obj-y += rhct.o init.o irq.o +obj-$(CONFIG_RISCV_CBQRI_DRIVER) += rqsc.o obj-$(CONFIG_ACPI_PROCESSOR_IDLE) += cpuidle.o obj-$(CONFIG_ACPI_CPPC_LIB) += cppc.o obj-$(CONFIG_ACPI_RIMT) += rimt.o diff --git a/drivers/acpi/riscv/init.c b/drivers/acpi/riscv/init.c index 7c00f7995e866d..129ebfae28be3d 100644 --- a/drivers/acpi/riscv/init.c +++ b/drivers/acpi/riscv/init.c @@ -5,11 +5,32 @@ */ #include +#include #include "init.h" +#include "rqsc.h" void __init acpi_arch_init(void) { riscv_acpi_init_gsi_mapping(); + if (IS_ENABLED(CONFIG_ACPI_RIMT)) riscv_acpi_rimt_init(); + + if (IS_ENABLED(CONFIG_RISCV_CBQRI_DRIVER)) { + struct acpi_table_header *rqsc __free(acpi_put_table) = NULL; + acpi_status status = acpi_get_table(ACPI_SIG_RQSC, 0, &rqsc); + + if (status == AE_NOT_FOUND) { + /* RQSC is optional. Silence on systems without it. 
*/ + } else if (ACPI_FAILURE(status)) { + pr_err("RQSC: failed to get table: %s\n", + acpi_format_exception(status)); + } else { + int rc = acpi_parse_rqsc(rqsc); + + if (rc < 0) + pr_err("RQSC: failed to parse table: %d\n", + rc); + } + } } diff --git a/drivers/acpi/riscv/rqsc.c b/drivers/acpi/riscv/rqsc.c new file mode 100644 index 00000000000000..1b1ae2e353a510 --- /dev/null +++ b/drivers/acpi/riscv/rqsc.c @@ -0,0 +1,202 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#define pr_fmt(fmt) "ACPI: RQSC: " fmt + +#include +#include +#include + +#include "rqsc.h" + +#define CBQRI_CTRL_SIZE 0x1000 + +int __init acpi_parse_rqsc(struct acpi_table_header *table) +{ + struct acpi_table_rqsc *rqsc = (struct acpi_table_rqsc *)table; + struct acpi_rqsc_node *end, *node; + int num_controllers = 0; + + /* + * Reject any revision other than the one this parser supports. A + * future revision could extend the fixed RQSC header before the + * first node, which would shift the resource subtables and cause the + * sizeof(*node)-based offset below to point into the wrong place. + */ + if (rqsc->header.revision != ACPI_RQSC_REVISION) { + pr_err("RQSC table revision %u, expected %u, aborting\n", + rqsc->header.revision, ACPI_RQSC_REVISION); + return -EINVAL; + } + + /* Reject tables shorter than the fixed RQSC header.
*/ + if (rqsc->header.length < sizeof(struct acpi_table_rqsc)) { + pr_err("RQSC table truncated: length %u < %zu, aborting\n", + rqsc->header.length, sizeof(struct acpi_table_rqsc)); + return -EINVAL; + } + + end = ACPI_ADD_PTR(struct acpi_rqsc_node, rqsc, rqsc->header.length); + + for (node = ACPI_ADD_PTR(struct acpi_rqsc_node, rqsc, + sizeof(struct acpi_table_rqsc)); + node < end; + node = ACPI_ADD_PTR(struct acpi_rqsc_node, node, node->length) + ) { + const struct acpi_rqsc_resource *res0; + struct cbqri_controller_info info = {}; + int ret; + + if ((void *)node + sizeof(*node) > (void *)end) { + pr_err("truncated entry at end of table, aborting\n"); + riscv_cbqri_unregister_last(num_controllers); + return -EINVAL; + } + + if (node->length < sizeof(*node)) { + pr_err("malformed RQSC entry: length %u < %zu, aborting\n", + node->length, sizeof(*node)); + riscv_cbqri_unregister_last(num_controllers); + return -EINVAL; + } + + /* + * Without this check, a node whose length claims to extend + * past the end of the table would advance the loop cursor + * past 'end' and silently terminate. Flag the corruption + * explicitly so a malformed firmware table cannot truncate + * the controller list without noise. + */ + if ((void *)node + node->length > (void *)end) { + pr_err("RQSC entry length %u overruns table end, aborting\n", + node->length); + riscv_cbqri_unregister_last(num_controllers); + return -EINVAL; + } + + /* GAS must describe system memory. ioremap() consumes it later. 
*/ + if (node->reg.space_id != ACPI_ADR_SPACE_SYSTEM_MEMORY) { + pr_warn("controller has unsupported address space_id=%u, skipping\n", + node->reg.space_id); + continue; + } + + if (!node->reg.address) { + pr_warn("controller has zero address, skipping\n"); + continue; + } + + info.type = node->type; + /* RQSC section 2 Table 2: 12-byte GAS-format register interface address */ + info.addr = node->reg.address; + info.size = CBQRI_CTRL_SIZE; + info.rcid_count = node->rcid; + info.mcid_count = node->mcid; + + /* See CBQRI_MAX_RCID/MCID in <linux/riscv_cbqri.h> for the rationale. */ + if (info.rcid_count > CBQRI_MAX_RCID) { + pr_warn("controller at %pa: rcid_count %u exceeds CBQRI_MAX_RCID %u, skipping\n", + &info.addr, info.rcid_count, CBQRI_MAX_RCID); + continue; + } + + if (info.mcid_count > CBQRI_MAX_MCID) { + pr_warn("controller at %pa: mcid_count %u exceeds CBQRI_MAX_MCID %u, skipping\n", + &info.addr, info.mcid_count, CBQRI_MAX_MCID); + continue; + } + + /* + * RQSC Table 2: at least one of RCID Count or MCID Count must be non-zero. + */ + if (!info.rcid_count && !info.mcid_count) { + pr_warn("controller at %pa: both rcid_count and mcid_count are zero, skipping\n", + &info.addr); + continue; + } + + if (node->nres == 0) { + pr_warn("controller at %pa has no resource descriptors, skipping\n", + &info.addr); + continue; + } + + /* + * Resources follow the node header in-line. Only res[0] is + * consumed. Bound it against end before reading its prefix so + * a table that ends partway through a resource subtable is + * rejected rather than read past the mapping.
+ */ + res0 = (const struct acpi_rqsc_resource *) + ((const u8 *)node + sizeof(*node)); + if ((void *)res0 + sizeof(*res0) > (void *)end || + node->length < sizeof(*node) + sizeof(*res0) || + res0->length < sizeof(*res0)) { + pr_warn("controller at %pa: node too short for resource descriptor, skipping\n", + &info.addr); + continue; + } + + if (node->nres > 1) + pr_warn("controller at %pa has %u resource descriptors, using first\n", + &info.addr, node->nres); + + /* + * id1 is u64 but it is used for cache_id and prox_dom + * which are only u32. Reject rather than truncate, so a + * too large id is not silently mapped to the wrong PPTT + * entry or NUMA node. + */ + if (res0->id1 > U32_MAX) { + pr_warn("controller at %pa: id1 0x%llx exceeds u32, skipping\n", + &info.addr, res0->id1); + continue; + } + + /* + * Pair the QoS controller type with the resource descriptor + * fields that index id1. RQSC Table 4 defines the mapping: + * Capacity controller indexes a Processor Cache via PPTT + * cache_id, a Bandwidth controller indexes a Memory Range + * via SRAT proximity domain. Mismatched pairings (e.g. a + * CC whose first resource is Memory) would otherwise route + * id1 into the wrong downstream lookup. 
+ */ + switch (info.type) { + case CBQRI_CONTROLLER_TYPE_CAPACITY: + if (res0->type != ACPI_RQSC_RESOURCE_TYPE_CACHE || + res0->id_type != ACPI_RQSC_RESOURCE_ID_TYPE_PROCESSOR_CACHE) { + pr_warn("CC at %pa: resource type=%u id_type=%u not (cache, processor cache), skipping\n", + &info.addr, res0->type, res0->id_type); + continue; + } + info.cache_id = (u32)res0->id1; + break; + case CBQRI_CONTROLLER_TYPE_BANDWIDTH: + if (res0->type != ACPI_RQSC_RESOURCE_TYPE_MEMORY || + res0->id_type != ACPI_RQSC_RESOURCE_ID_TYPE_MEMORY_RANGE) { + pr_warn("BC at %pa: resource type=%u id_type=%u not (memory, memory range), skipping\n", + &info.addr, res0->type, res0->id_type); + continue; + } + info.prox_dom = (u32)res0->id1; + break; + default: + pr_warn("controller at %pa: unknown type %u, skipping\n", + &info.addr, info.type); + continue; + } + + pr_debug("registering controller type=%u addr=%pa rcid=%u mcid=%u\n", + info.type, &info.addr, info.rcid_count, info.mcid_count); + + ret = riscv_cbqri_register_controller(&info); + if (ret == 0) + num_controllers++; + else + pr_warn("controller at %pa: registration failed (%d), skipping\n", + &info.addr, ret); + } + + pr_info("found %d CBQRI controllers\n", num_controllers); + return 0; +} diff --git a/drivers/acpi/riscv/rqsc.h b/drivers/acpi/riscv/rqsc.h new file mode 100644 index 00000000000000..fa0d96e267e158 --- /dev/null +++ b/drivers/acpi/riscv/rqsc.h @@ -0,0 +1,66 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Local definitions for the RISC-V Quality of Service Controller (RQSC) + * ACPI table. Will move to ACPICA's include/acpi/actbl2.h once the spec + * is ratified. + */ +#ifndef _DRIVERS_ACPI_RISCV_RQSC_H +#define _DRIVERS_ACPI_RISCV_RQSC_H + +#include +#include + +#define ACPI_SIG_RQSC "RQSC" /* RISC-V Quality of Service Controller */ + +/* RQSC Table 1: current revision number. */ +#define ACPI_RQSC_REVISION 1 + +/* RQSC Table 4: Resource Type values for acpi_rqsc_resource.type. 
*/ +#define ACPI_RQSC_RESOURCE_TYPE_CACHE 0 +#define ACPI_RQSC_RESOURCE_TYPE_MEMORY 1 + +/* RQSC Table 4: Resource ID Type values for .id_type. */ +#define ACPI_RQSC_RESOURCE_ID_TYPE_PROCESSOR_CACHE 0 +#define ACPI_RQSC_RESOURCE_ID_TYPE_MEMORY_RANGE 1 +#define ACPI_RQSC_RESOURCE_ID_TYPE_MEMORY_SIDE_CACHE 2 +#define ACPI_RQSC_RESOURCE_ID_TYPE_ACPI_DEVICE 3 +#define ACPI_RQSC_RESOURCE_ID_TYPE_PCI_DEVICE 4 + +/* + * Byte-packed: u64 id1 would otherwise pad to 8-byte alignment and inflate + * sizeof(*res) from the spec's 20 bytes to 24, mis-sizing resource subtables. + */ +struct acpi_rqsc_resource { + u8 type; + u8 resv; + u16 length; + u16 flags; + u8 resv2; + u8 id_type; + u64 id1; + u32 id2; +} __packed; + +struct acpi_rqsc_node { + u8 type; + u8 resv; + u16 length; + /* RQSC section 2 Table 2: 12-byte GAS-format register interface address */ + struct acpi_generic_address reg; + u16 rcid; + u16 mcid; + u16 flags; + u16 nres; + /* + * Followed by nres acpi_rqsc_resource subtables. Walk them via + * each resource's own length field so a future RQSC revision that + * extends the resource layout cannot misalign older parsers. + */ +} __packed; + +struct acpi_table_rqsc { + struct acpi_table_header header; /* Common ACPI table header */ + u32 num; +} __packed; + +#endif /* _DRIVERS_ACPI_RISCV_RQSC_H */ diff --git a/drivers/resctrl/Kconfig b/drivers/resctrl/Kconfig index 672abea3b03ccb..732aae26f8f8e2 100644 --- a/drivers/resctrl/Kconfig +++ b/drivers/resctrl/Kconfig @@ -29,3 +29,35 @@ config ARM64_MPAM_RESCTRL_FS default y if ARM64_MPAM_DRIVER && RESCTRL_FS select RESCTRL_RMID_DEPENDS_ON_CLOSID select RESCTRL_ASSIGN_FIXED + +menuconfig RISCV_CBQRI_DRIVER + bool "RISC-V CBQRI driver" + depends on RISCV && RISCV_ISA_SSQOSID + help + Capacity and Bandwidth QoS Register Interface (CBQRI) driver + for RISC-V cache and memory-controller QoS resources. 
CBQRI + exposes capacity allocation, bandwidth reservation, weighted + bandwidth share, and per-MCID monitoring counters through the + resctrl filesystem at /sys/fs/resctrl when RESCTRL_FS is also + enabled. + + RISCV_ISA_SSQOSID provides the srmcfg CSR that tags each hart's + memory traffic with the RCID and MCID consumed by CBQRI + controllers. + +if RISCV_CBQRI_DRIVER + +config RISCV_CBQRI_DRIVER_DEBUG + bool "Enable debug messages from the CBQRI driver" + help + Say yes here to enable debug messages from the CBQRI driver. + + This adds pr_debug() output covering controller probe and + per-controller registration steps. Useful when bringing up a + new platform; otherwise leave disabled to avoid log noise. + +endif + +config RISCV_CBQRI_RESCTRL_FS + bool + default y if RISCV_CBQRI_DRIVER && RESCTRL_FS diff --git a/drivers/resctrl/Makefile b/drivers/resctrl/Makefile index 4f6d0e81f9b8f3..ed737b4461b975 100644 --- a/drivers/resctrl/Makefile +++ b/drivers/resctrl/Makefile @@ -3,3 +3,9 @@ mpam-y += mpam_devices.o mpam-$(CONFIG_ARM64_MPAM_RESCTRL_FS) += mpam_resctrl.o ccflags-$(CONFIG_ARM64_MPAM_DRIVER_DEBUG) += -DDEBUG + +obj-$(CONFIG_RISCV_CBQRI_DRIVER) += cbqri.o +cbqri-y += cbqri_devices.o +cbqri-$(CONFIG_RISCV_CBQRI_RESCTRL_FS) += cbqri_resctrl.o + +ccflags-$(CONFIG_RISCV_CBQRI_DRIVER_DEBUG) += -DDEBUG diff --git a/drivers/resctrl/cbqri_devices.c b/drivers/resctrl/cbqri_devices.c new file mode 100644 index 00000000000000..b1a6ce35042ace --- /dev/null +++ b/drivers/resctrl/cbqri_devices.c @@ -0,0 +1,1154 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#define pr_fmt(fmt) "%s:%s: " fmt, KBUILD_MODNAME, __func__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "cbqri_internal.h" + +LIST_HEAD(cbqri_controllers); + +/* Set capacity block mask (cc_block_mask) */ +static void cbqri_set_cbm(struct cbqri_controller *ctrl, u64 cbm) +{ + 
iowrite64(cbm, ctrl->base + CBQRI_CC_BLOCK_MASK_OFF); +} + +/* Set the Rbwb (reserved bandwidth blocks) field in bc_bw_alloc */ +static void cbqri_set_rbwb(struct cbqri_controller *ctrl, u64 rbwb) +{ + u64 reg; + + reg = ioread64(ctrl->base + CBQRI_BC_BW_ALLOC_OFF); + FIELD_MODIFY(CBQRI_CONTROL_REGISTERS_RBWB_MASK, &reg, rbwb); + iowrite64(reg, ctrl->base + CBQRI_BC_BW_ALLOC_OFF); +} + +/* Get the Rbwb (reserved bandwidth blocks) field in bc_bw_alloc */ +static u64 cbqri_get_rbwb(struct cbqri_controller *ctrl) +{ + u64 reg; + + reg = ioread64(ctrl->base + CBQRI_BC_BW_ALLOC_OFF); + return FIELD_GET(CBQRI_CONTROL_REGISTERS_RBWB_MASK, reg); +} + +/* Set the Mweight (opportunistic weight) field in bc_bw_alloc */ +static void cbqri_set_mweight(struct cbqri_controller *ctrl, u64 mweight) +{ + u64 reg; + + reg = ioread64(ctrl->base + CBQRI_BC_BW_ALLOC_OFF); + FIELD_MODIFY(CBQRI_CONTROL_REGISTERS_MWEIGHT_MASK, &reg, mweight); + iowrite64(reg, ctrl->base + CBQRI_BC_BW_ALLOC_OFF); +} + +/* Get the Mweight (opportunistic weight) field in bc_bw_alloc */ +static u64 cbqri_get_mweight(struct cbqri_controller *ctrl) +{ + u64 reg; + + reg = ioread64(ctrl->base + CBQRI_BC_BW_ALLOC_OFF); + return FIELD_GET(CBQRI_CONTROL_REGISTERS_MWEIGHT_MASK, reg); +} + +/* + * Stage both fields of bc_bw_alloc in one read-modify-write so the staging + * register is consistent after a single MMIO write.
+ */ +static void cbqri_set_bc_bw_alloc(struct cbqri_controller *ctrl, + u64 rbwb, u64 mweight) +{ + u64 reg = ioread64(ctrl->base + CBQRI_BC_BW_ALLOC_OFF); + + FIELD_MODIFY(CBQRI_CONTROL_REGISTERS_RBWB_MASK, &reg, rbwb); + FIELD_MODIFY(CBQRI_CONTROL_REGISTERS_MWEIGHT_MASK, &reg, mweight); + iowrite64(reg, ctrl->base + CBQRI_BC_BW_ALLOC_OFF); +} + +enum cbqri_bc_field { + CBQRI_BC_FIELD_RBWB, + CBQRI_BC_FIELD_MWEIGHT, +}; + +static int cbqri_wait_busy_flag(struct cbqri_controller *ctrl, int reg_offset, + u64 *regp) +{ + u64 reg; + int ret; + + /* + * Sleeping poll: caller holds ctrl->lock as a sleeping mutex, so + * 10us/1ms is safe under PREEMPT_RT. + */ + ret = readq_poll_timeout(ctrl->base + reg_offset, reg, + !FIELD_GET(CBQRI_CONTROL_REGISTERS_BUSY_MASK, reg), + 10, 1000); + if (ret) { + ctrl->faulted = true; + return ret; + } + ctrl->faulted = false; + if (regp) + *regp = reg; + return 0; +} + +/* + * Perform capacity allocation control operation on capacity controller. + * Caller must hold ctrl->lock. + */ +static int cbqri_cc_alloc_op(struct cbqri_controller *ctrl, int operation, + int rcid, enum cbqri_at at) +{ + int reg_offset = CBQRI_CC_ALLOC_CTL_OFF; + int status; + u64 reg; + + lockdep_assert_held(&ctrl->lock); + + if (cbqri_wait_busy_flag(ctrl, reg_offset, &reg) < 0) { + pr_err_ratelimited("BUSY timeout before starting operation\n"); + return -EIO; + } + FIELD_MODIFY(CBQRI_CONTROL_REGISTERS_OP_MASK, &reg, operation); + FIELD_MODIFY(CBQRI_CONTROL_REGISTERS_RCID_MASK, &reg, rcid); + + /* + * CBQRI Table 1: AT 0=Data, 1=Code. Program AT on controllers + * that report supports_alloc_at_code. On controllers that don't, + * AT is reserved-zero and the op acts on both halves.
+ */ + reg &= ~CBQRI_CONTROL_REGISTERS_AT_MASK; + if (ctrl->cc.supports_alloc_at_code) + reg |= FIELD_PREP(CBQRI_CONTROL_REGISTERS_AT_MASK, at); + + iowrite64(reg, ctrl->base + reg_offset); + + if (cbqri_wait_busy_flag(ctrl, reg_offset, ®) < 0) { + pr_err_ratelimited("BUSY timeout during operation\n"); + return -EIO; + } + + status = FIELD_GET(CBQRI_CONTROL_REGISTERS_STATUS_MASK, reg); + if (status != CBQRI_CC_ALLOC_CTL_STATUS_SUCCESS) { + pr_err_ratelimited("operation %d failed: status=%d\n", operation, status); + return -EIO; + } + + return 0; +} + +/* + * Issue a monitoring op on a CC or BC controller's mon_ctl register at + * reg_offset (CBQRI_CC_MON_CTL_OFF or CBQRI_BC_MON_CTL_OFF). The CC and + * BC mon_ctl registers share an identical OP/MCID/EVT_ID/STATUS layout, so + * one helper covers both. Caller must hold ctrl->lock. + */ +int cbqri_mon_op(struct cbqri_controller *ctrl, int reg_offset, + int operation, int mcid, int evt_id, u64 *out_reg) +{ + u64 reg; + + lockdep_assert_held(&ctrl->lock); + + if (cbqri_wait_busy_flag(ctrl, reg_offset, ®) < 0) { + pr_err_ratelimited("BUSY timeout before starting operation\n"); + return -EIO; + } + FIELD_MODIFY(CBQRI_MON_CTL_OP_MASK, ®, operation); + FIELD_MODIFY(CBQRI_MON_CTL_MCID_MASK, ®, mcid); + FIELD_MODIFY(CBQRI_MON_CTL_EVT_ID_MASK, ®, evt_id); + iowrite64(reg, ctrl->base + reg_offset); + + if (cbqri_wait_busy_flag(ctrl, reg_offset, ®) < 0) { + pr_err_ratelimited("BUSY timeout\n"); + return -EIO; + } + + if (FIELD_GET(CBQRI_MON_CTL_STATUS_MASK, reg) != + CBQRI_MON_CTL_STATUS_SUCCESS) + return -EIO; + + if (out_reg) + *out_reg = reg; + + return 0; +} + +/* + * Perform bandwidth allocation control operation on bandwidth controller. + * Caller must hold ctrl->lock. 
+ */ +static int cbqri_bc_alloc_op(struct cbqri_controller *ctrl, int operation, int rcid) +{ + int reg_offset = CBQRI_BC_ALLOC_CTL_OFF; + int status; + u64 reg; + + lockdep_assert_held(&ctrl->lock); + + if (cbqri_wait_busy_flag(ctrl, reg_offset, ®) < 0) { + pr_err_ratelimited("BUSY timeout before starting operation\n"); + return -EIO; + } + FIELD_MODIFY(CBQRI_CONTROL_REGISTERS_OP_MASK, ®, operation); + FIELD_MODIFY(CBQRI_CONTROL_REGISTERS_RCID_MASK, ®, rcid); + reg &= ~CBQRI_CONTROL_REGISTERS_AT_MASK; + iowrite64(reg, ctrl->base + reg_offset); + + if (cbqri_wait_busy_flag(ctrl, reg_offset, ®) < 0) { + pr_err_ratelimited("BUSY timeout during operation\n"); + return -EIO; + } + + status = FIELD_GET(CBQRI_CONTROL_REGISTERS_STATUS_MASK, reg); + if (status != CBQRI_BC_ALLOC_CTL_STATUS_SUCCESS) { + pr_err_ratelimited("BC alloc op %d failed: status=%d\n", + operation, status); + return -EIO; + } + + return 0; +} + +/* + * Apply a capacity block mask and verify via CONFIG_LIMIT + READ_LIMIT. + * + * AT-capable controllers with CDP off need a second CONFIG_LIMIT on the + * other AT half (the spec encodes AT only as 0=Data / 1=Code, there is + * no "both halves" value). CDP-on issues separate per-type writes from + * resctrl, so a single CONFIG_LIMIT per call is correct. + */ +int cbqri_apply_cache_config(struct cbqri_controller *ctrl, u32 closid, + const struct cbqri_cc_config *cfg) +{ + bool need_at_mirror; + u64 saved_cbm = 0; + int err = 0; + u64 reg; + + mutex_lock(&ctrl->lock); + + need_at_mirror = ctrl->cc.supports_alloc_at_code && !cfg->cdp_enabled; + + /* + * Capture the cfg->at half CBM before any write so a partial + * AT-mirror failure can revert and keep the two halves consistent. + * Pre-clear cc_block_mask so a silent firmware no-op (status + * SUCCESS but staging not updated) shows as a zero readback + * rather than carrying stale data from a prior op. Mirrors the + * defensive pattern in cbqri_read_cache_config(). 
+ */ + if (need_at_mirror) { + cbqri_set_cbm(ctrl, 0); + err = cbqri_cc_alloc_op(ctrl, CBQRI_CC_ALLOC_CTL_OP_READ_LIMIT, + closid, cfg->at); + if (err < 0) + goto out; + saved_cbm = ioread64(ctrl->base + CBQRI_CC_BLOCK_MASK_OFF); + } + + /* Set capacity block mask (cc_block_mask) */ + cbqri_set_cbm(ctrl, cfg->cbm); + + /* Capacity config limit operation for the AT half implied by cfg->at */ + err = cbqri_cc_alloc_op(ctrl, CBQRI_CC_ALLOC_CTL_OP_CONFIG_LIMIT, + closid, cfg->at); + if (err < 0) + goto out; + + /* + * CDP-off mirror: on AT-capable controllers, also program the + * other AT half with the same mask so the two halves stay in sync. + */ + if (need_at_mirror) { + enum cbqri_at other = (cfg->at == CBQRI_AT_CODE) ? + CBQRI_AT_DATA : CBQRI_AT_CODE; + + cbqri_set_cbm(ctrl, cfg->cbm); + err = cbqri_cc_alloc_op(ctrl, + CBQRI_CC_ALLOC_CTL_OP_CONFIG_LIMIT, + closid, other); + if (err < 0) { + int rerr; + + /* + * Best-effort revert of the cfg->at half so the two + * halves stay in sync. A schemata read sees only one + * half, so silent divergence would otherwise report + * the new value as if the write had succeeded. + */ + cbqri_set_cbm(ctrl, saved_cbm); + rerr = cbqri_cc_alloc_op(ctrl, + CBQRI_CC_ALLOC_CTL_OP_CONFIG_LIMIT, + closid, cfg->at); + if (rerr < 0) + pr_err_ratelimited("AT-mirror revert failed (err=%d), AT halves diverged\n", + rerr); + goto out; + } + } + + /* Clear cc_block_mask before read limit to verify op works */ + cbqri_set_cbm(ctrl, 0); + + /* Perform a capacity read limit operation to verify blockmask */ + err = cbqri_cc_alloc_op(ctrl, CBQRI_CC_ALLOC_CTL_OP_READ_LIMIT, + closid, cfg->at); + if (err < 0) + goto out; + + /* + * Read capacity blockmask and narrow to u32 to match resctrl's CBM + * width. cbqri_probe_cc() rejects ncblks > 32 so the upper bits are + * reserved zero. 
+ */ + reg = ioread64(ctrl->base + CBQRI_CC_BLOCK_MASK_OFF); + if (lower_32_bits(reg) != cfg->cbm) { + pr_err_ratelimited("CBM verify mismatch (reg=%llx != cbm=%llx)\n", + reg, cfg->cbm); + err = -EIO; + } + +out: + mutex_unlock(&ctrl->lock); + return err; +} + +/* + * Read the configured CBM for closid on the at half via READ_LIMIT. + * Pre-clears cc_block_mask before the op so a silent firmware no-op + * (status SUCCESS but staging not updated) is detectable in cbm_out. + */ +int cbqri_read_cache_config(struct cbqri_controller *ctrl, u32 closid, + enum cbqri_at at, u32 *cbm_out) +{ + int err; + + mutex_lock(&ctrl->lock); + cbqri_set_cbm(ctrl, 0); + err = cbqri_cc_alloc_op(ctrl, CBQRI_CC_ALLOC_CTL_OP_READ_LIMIT, closid, at); + if (err == 0) { + /* + * cc_block_mask is a 64-bit MMIO register. resctrl exposes the + * CBM as a u32. cbqri_probe_cc() rejects ncblks > 32 so the + * upper 32 bits are reserved zero by the spec. Narrow + * explicitly via lower_32_bits() so the assumption is visible + * at the read site. + */ + *cbm_out = lower_32_bits(ioread64(ctrl->base + CBQRI_CC_BLOCK_MASK_OFF)); + } + mutex_unlock(&ctrl->lock); + return err; +} + +/* + * Apply a per-RCID update to one field (Rbwb or Mweight) of bc_bw_alloc. + * bc_bw_alloc packs both fields, so both halves are seeded from the + * authoritative software caches before CONFIG_LIMIT. This avoids the + * silent READ_LIMIT no-op window where stale data from a prior op's + * RCID could leak into the unmodified field. The verify step uses an + * inverted-value sentinel to confirm hardware accepted the target field. + * + * Caller must hold ctrl->lock. 
+ */ +static int cbqri_apply_bc_field(struct cbqri_controller *ctrl, u32 closid, + enum cbqri_bc_field field, u64 val) +{ + u64 rbwb = ctrl->rbwb_cache[closid]; + u64 mweight = ctrl->mweight_cache[closid]; + u64 readback; + int ret; + + lockdep_assert_held(&ctrl->lock); + + if (field == CBQRI_BC_FIELD_RBWB) + rbwb = val; + else + mweight = val; + + /* + * Wait for BUSY=0 before staging. A read-modify-write to the + * bc_bw_alloc staging register while an op is in flight can corrupt + * the unmodified field. + */ + if (cbqri_wait_busy_flag(ctrl, CBQRI_BC_ALLOC_CTL_OFF, NULL) < 0) { + pr_err_ratelimited("BUSY timeout before staging bc_bw_alloc\n"); + return -EIO; + } + + cbqri_set_bc_bw_alloc(ctrl, rbwb, mweight); + + ret = cbqri_bc_alloc_op(ctrl, CBQRI_BC_ALLOC_CTL_OP_CONFIG_LIMIT, closid); + if (ret < 0) + return ret; + + /* + * Pre-write a sentinel that cannot equal val to the target field + * so a silent READ_LIMIT (status SUCCESS but no staging update) + * is detectable in the readback. ~val truncated to the field + * width cannot equal val. + */ + if (field == CBQRI_BC_FIELD_RBWB) + cbqri_set_rbwb(ctrl, ~val); + else + cbqri_set_mweight(ctrl, ~val); + + ret = cbqri_bc_alloc_op(ctrl, CBQRI_BC_ALLOC_CTL_OP_READ_LIMIT, closid); + if (ret < 0) + return ret; + + readback = (field == CBQRI_BC_FIELD_RBWB) ? + cbqri_get_rbwb(ctrl) : cbqri_get_mweight(ctrl); + if (readback != val) { + pr_err_ratelimited("BC field verify mismatch (reg=0x%llx != val=%llu)\n", + readback, val); + return -EIO; + } + + /* Hardware confirmed to hold val. Update the authoritative cache. */ + if (field == CBQRI_BC_FIELD_RBWB) + ctrl->rbwb_cache[closid] = rbwb; + else + ctrl->mweight_cache[closid] = mweight; + + return 0; +} + +/* + * Apply an Rbwb update for closid, optionally enforcing CBQRI section 4.5 + * sum(Rbwb) <= MRBWB. check_sum=false is used by coordinated init/reset + * walks where intermediate sums may transiently exceed MRBWB. 
+ */ +int cbqri_apply_rbwb(struct cbqri_controller *ctrl, u32 closid, + u64 rbwb, bool check_sum) +{ + u32 i; + int ret; + + if (rbwb > U16_MAX) + return -EINVAL; + + mutex_lock(&ctrl->lock); + + if (check_sum && rbwb > 0) { + u64 sum = rbwb; + + for (i = 0; i < ctrl->rcid_count; i++) { + if (i == closid) + continue; + sum += ctrl->rbwb_cache[i]; + } + if (sum > ctrl->bc.mrbwb) { + pr_err_ratelimited("RBWB sum %llu exceeds MRBWB %u\n", + sum, ctrl->bc.mrbwb); + ret = -EINVAL; + goto out; + } + } + + ret = cbqri_apply_bc_field(ctrl, closid, CBQRI_BC_FIELD_RBWB, rbwb); +out: + mutex_unlock(&ctrl->lock); + return ret; +} + +int cbqri_apply_mweight_config(struct cbqri_controller *ctrl, u32 closid, + u64 mweight) +{ + int ret; + + if (mweight > FIELD_MAX(CBQRI_CONTROL_REGISTERS_MWEIGHT_MASK)) + return -EINVAL; + + mutex_lock(&ctrl->lock); + ret = cbqri_apply_bc_field(ctrl, closid, CBQRI_BC_FIELD_MWEIGHT, mweight); + mutex_unlock(&ctrl->lock); + return ret; +} + +/* + * Read the Rbwb (reserved bandwidth blocks) for closid via READ_LIMIT. + */ +int cbqri_read_rbwb(struct cbqri_controller *ctrl, u32 closid, u64 *rbwb_out) +{ + u8 mweight_sentinel = ~ctrl->mweight_cache[closid]; + int err; + + mutex_lock(&ctrl->lock); + + /* + * Stage a sentinel into the unread Mweight field. A silent + * READ_LIMIT no-op (status SUCCESS but staging not refreshed) leaves + * the sentinel in place, while a real read overwrites Mweight with + * the hardware value, which differs from the inverted cache sentinel. + */ + cbqri_set_bc_bw_alloc(ctrl, ctrl->rbwb_cache[closid], mweight_sentinel); + err = cbqri_bc_alloc_op(ctrl, CBQRI_BC_ALLOC_CTL_OP_READ_LIMIT, closid); + if (err == 0) { + if (cbqri_get_mweight(ctrl) == mweight_sentinel) { + pr_err_ratelimited("Rbwb READ_LIMIT did not update staging\n"); + err = -EIO; + } else { + *rbwb_out = cbqri_get_rbwb(ctrl); + } + } + mutex_unlock(&ctrl->lock); + return err; +} + +/* + * Read the Mweight (opportunistic weight) for closid via READ_LIMIT. 
+ */ +int cbqri_read_mweight(struct cbqri_controller *ctrl, u32 closid, u64 *mweight_out) +{ + u16 rbwb_sentinel = ~ctrl->rbwb_cache[closid]; + int err; + + mutex_lock(&ctrl->lock); + + /* + * Stage a sentinel into the unread Rbwb field so a silent READ_LIMIT + * no-op is detectable, mirroring cbqri_read_rbwb(). + */ + cbqri_set_bc_bw_alloc(ctrl, rbwb_sentinel, ctrl->mweight_cache[closid]); + err = cbqri_bc_alloc_op(ctrl, CBQRI_BC_ALLOC_CTL_OP_READ_LIMIT, closid); + if (err == 0) { + if (cbqri_get_rbwb(ctrl) == rbwb_sentinel) { + pr_err_ratelimited("Mweight READ_LIMIT did not update staging\n"); + err = -EIO; + } else { + *mweight_out = cbqri_get_mweight(ctrl); + } + } + mutex_unlock(&ctrl->lock); + return err; +} + +static int cbqri_probe_feature(struct cbqri_controller *ctrl, int reg_offset, + int operation, int evt_id, int *status, + bool *access_type_supported) +{ + const u64 active_mask = CBQRI_CONTROL_REGISTERS_OP_MASK | + CBQRI_CONTROL_REGISTERS_AT_MASK | + CBQRI_CONTROL_REGISTERS_RCID_MASK | + CBQRI_MON_CTL_EVT_ID_MASK; + u64 reg, saved_reg; + int at; + + /* + * Default the output to false so the status==0 (feature not + * implemented) path returns a deterministic value to the caller + * rather than leaving an uninitialized bool. mon_ctl probes pass + * NULL: the register has no AT field, so the AT probe is skipped. + */ + if (access_type_supported) + *access_type_supported = false; + + /* Keep the initial register value to preserve the WPRI fields */ + reg = ioread64(ctrl->base + reg_offset); + saved_reg = reg; + + /* Drain any in-flight firmware op before issuing our own write. */ + if (cbqri_wait_busy_flag(ctrl, reg_offset, &saved_reg) < 0) { + pr_err("BUSY timeout before probe operation\n"); + return -EIO; + } + + /* + * Execute the requested operation with the active fields + * (OP/AT/RCID/EVT_ID) cleared, then set OP and, for mon_ctl, the + * probe-safe evt_id. WPRI bits outside active_mask carry over from + * saved_reg. 
alloc_ctl callers pass evt_id 0. + */ + reg = (saved_reg & ~active_mask) | + FIELD_PREP(CBQRI_CONTROL_REGISTERS_OP_MASK, operation) | + FIELD_PREP(CBQRI_MON_CTL_EVT_ID_MASK, evt_id); + iowrite64(reg, ctrl->base + reg_offset); + if (cbqri_wait_busy_flag(ctrl, reg_offset, ®) < 0) { + pr_err_ratelimited("BUSY timeout during operation\n"); + return -EIO; + } + + /* Get the operation status */ + *status = FIELD_GET(CBQRI_CONTROL_REGISTERS_STATUS_MASK, reg); + + /* + * Probe AT support only on alloc_ctl registers (mon_ctl has no AT + * field, so access_type_supported is NULL there). Skipped when the + * register is unimplemented (status stays 0). + */ + if (access_type_supported && *status != 0) { + /* + * Re-issue operation with AT=CODE so the controller + * latches AT=CODE on supported hardware (or resets it to 0 + * on hardware that doesn't). OP must be a defined CBQRI op + * here. OP=0 is a no-op and would silently disable CDP. + */ + reg = (saved_reg & ~active_mask) | + FIELD_PREP(CBQRI_CONTROL_REGISTERS_OP_MASK, operation) | + FIELD_PREP(CBQRI_CONTROL_REGISTERS_AT_MASK, + CBQRI_CONTROL_REGISTERS_AT_CODE); + iowrite64(reg, ctrl->base + reg_offset); + if (cbqri_wait_busy_flag(ctrl, reg_offset, ®) < 0) { + pr_err("BUSY timeout setting AT field\n"); + return -EIO; + } + + /* + * If the AT field value has been reset to zero, + * then the AT support is not present + */ + at = FIELD_GET(CBQRI_CONTROL_REGISTERS_AT_MASK, reg); + if (at == CBQRI_CONTROL_REGISTERS_AT_CODE) + *access_type_supported = true; + } + + /* + * Restore the original register value. + * Clear OP to avoid re-triggering the probe op. 
+ */ + saved_reg &= ~CBQRI_CONTROL_REGISTERS_OP_MASK; + iowrite64(saved_reg, ctrl->base + reg_offset); + if (cbqri_wait_busy_flag(ctrl, reg_offset, NULL) < 0) { + pr_err("BUSY timeout restoring register value\n"); + return -EIO; + } + + return 0; +} + +static int cbqri_probe_cc(struct cbqri_controller *ctrl) +{ + int err, status; + u64 reg; + + reg = ioread64(ctrl->base + CBQRI_CC_CAPABILITIES_OFF); + if (reg == 0) + return -ENODEV; + + ctrl->ver_minor = FIELD_GET(CBQRI_CC_CAPABILITIES_VER_MINOR_MASK, reg); + ctrl->ver_major = FIELD_GET(CBQRI_CC_CAPABILITIES_VER_MAJOR_MASK, reg); + ctrl->cc.ncblks = FIELD_GET(CBQRI_CC_CAPABILITIES_NCBLKS_MASK, reg); + + pr_debug("version=%d.%d ncblks=%d cache_level=%d\n", + ctrl->ver_major, ctrl->ver_minor, + ctrl->cc.ncblks, ctrl->cache.cache_level); + + /* + * NCBLKS == 0 would divide-by-zero in the schemata math while + * ctrl->lock is held. + */ + if (!ctrl->cc.ncblks) { + pr_warn("CC at %pa has 0 capacity blocks, skipping\n", + &ctrl->addr); + return -ENODEV; + } + + if (ctrl->cc.ncblks > 32) { + pr_warn("CC at %pa has ncblks=%u > 32 (resctrl CBM is u32), skipping\n", + &ctrl->addr, ctrl->cc.ncblks); + return -ENODEV; + } + + /* + * Resolve cache_size via cacheinfo. cpus_read_lock satisfies + * lockdep_assert_cpus_held() inside get_cpu_cacheinfo_level(). If + * every cpu_mask member is offline, cache_size stays 0 and the + * controller cannot back occupancy monitoring. 
+ */ + cpus_read_lock(); + if (!ctrl->cache.cache_size) { + int cpu = cpumask_first_and(&ctrl->cache.cpu_mask, cpu_online_mask); + + if (cpu < nr_cpu_ids) { + struct cacheinfo *ci; + + ci = get_cpu_cacheinfo_level(cpu, ctrl->cache.cache_level); + if (ci) + ctrl->cache.cache_size = ci->size; + } + } + cpus_read_unlock(); + + /* Probe monitoring features */ + err = cbqri_probe_feature(ctrl, CBQRI_CC_MON_CTL_OFF, + CBQRI_CC_MON_CTL_OP_CONFIG_EVENT, + CBQRI_CC_EVT_ID_NONE, &status, NULL); + if (err) + return err; + + if (status == CBQRI_MON_CTL_STATUS_SUCCESS) { + /* + * Occupancy is reported to userspace in bytes, computed as + * cache_size * counter / ncblks by the resctrl glue. If + * cacheinfo has no cache_size, leave mon_capable false so + * the file is not exposed at all rather than silently + * returning 0. + */ + if (!ctrl->cache.cache_size) + pr_debug("CC @%pa: cache_size unknown, occupancy monitoring disabled\n", + &ctrl->addr); + else + ctrl->mon_capable = true; + } + + /* Probe allocation features */ + err = cbqri_probe_feature(ctrl, CBQRI_CC_ALLOC_CTL_OFF, + CBQRI_CC_ALLOC_CTL_OP_READ_LIMIT, 0, + &status, &ctrl->cc.supports_alloc_at_code); + if (err) + return err; + + if (status == CBQRI_CC_ALLOC_CTL_STATUS_SUCCESS) + ctrl->alloc_capable = true; + + return 0; +} + +static int cbqri_probe_bc(struct cbqri_controller *ctrl) +{ + int err, status; + u32 i; + u64 reg; + + reg = ioread64(ctrl->base + CBQRI_BC_CAPABILITIES_OFF); + if (reg == 0) + return -ENODEV; + + ctrl->ver_minor = FIELD_GET(CBQRI_BC_CAPABILITIES_VER_MINOR_MASK, reg); + ctrl->ver_major = FIELD_GET(CBQRI_BC_CAPABILITIES_VER_MAJOR_MASK, reg); + ctrl->bc.nbwblks = FIELD_GET(CBQRI_BC_CAPABILITIES_NBWBLKS_MASK, reg); + ctrl->bc.mrbwb = FIELD_GET(CBQRI_BC_CAPABILITIES_MRBWB_MASK, reg); + + if (!ctrl->bc.nbwblks) { + pr_err("bandwidth controller has nbwblks=0\n"); + return -EINVAL; + } + + if (!ctrl->rcid_count) { + pr_err("bandwidth controller has rcid_count=0\n"); + return -EINVAL; + } + + /* + * 
Reset seeds RCID 0 with mrbwb - (rcid_count - 1). Reject a + * controller that would underflow that arithmetic. + */ + if (ctrl->bc.mrbwb < ctrl->rcid_count) { + pr_err("bandwidth controller has mrbwb=%u < rcid_count=%u, rejecting\n", + ctrl->bc.mrbwb, ctrl->rcid_count); + return -EINVAL; + } + + pr_debug("version=%d.%d nbwblks=%d mrbwb=%d\n", + ctrl->ver_major, ctrl->ver_minor, + ctrl->bc.nbwblks, ctrl->bc.mrbwb); + + /* Probe monitoring features */ + err = cbqri_probe_feature(ctrl, CBQRI_BC_MON_CTL_OFF, + CBQRI_BC_MON_CTL_OP_READ_COUNTER, 0, + &status, NULL); + if (err) + return err; + + if (status == CBQRI_MON_CTL_STATUS_SUCCESS) + ctrl->mon_capable = true; + + /* Probe allocation features */ + err = cbqri_probe_feature(ctrl, CBQRI_BC_ALLOC_CTL_OFF, + CBQRI_BC_ALLOC_CTL_OP_READ_LIMIT, 0, + &status, &ctrl->bc.supports_alloc_at_code); + if (err) + return err; + + if (status == CBQRI_BC_ALLOC_CTL_STATUS_SUCCESS) { + ctrl->alloc_capable = true; + + /* + * Per-RCID Rbwb and Mweight caches. The caches feed both + * fields of bc_bw_alloc on every apply so the staging + * register reflects authoritative software state, sidestepping + * silent READ_LIMIT no-op corruption of the unmodified field. + * rbwb_cache also lets cbqri_apply_rbwb() validate + * sum(Rbwb) <= MRBWB without re-reading every RCID. + */ + ctrl->rbwb_cache = kcalloc(ctrl->rcid_count, + sizeof(*ctrl->rbwb_cache), + GFP_KERNEL); + if (!ctrl->rbwb_cache) + return -ENOMEM; + + ctrl->mweight_cache = kcalloc(ctrl->rcid_count, + sizeof(*ctrl->mweight_cache), + GFP_KERNEL); + if (!ctrl->mweight_cache) { + kfree(ctrl->rbwb_cache); + ctrl->rbwb_cache = NULL; + return -ENOMEM; + } + + /* + * Seed mweight to the maximum, matching the resctrl-side + * MB_WGHT default. cbqri_apply_bc_field() reads both halves + * of bc_bw_alloc from the caches on every CONFIG_LIMIT, so + * the first MB_MIN domain init (which writes Rbwb) would + * otherwise commit Mweight=0 to every RCID. 
Per CBQRI 4.5 + * a weight of 0 implies the configured limit is a hard + * limit and the use of unused or non-reserved bandwidth + * is not allowed, which starves every RCID of opportunistic + * bandwidth until the subsequent MB_WGHT domain init + * catches up. + */ + for (i = 0; i < ctrl->rcid_count; i++) + ctrl->mweight_cache[i] = + FIELD_MAX(CBQRI_CONTROL_REGISTERS_MWEIGHT_MASK); + } + + return 0; +} + +static int cbqri_probe_controller(struct cbqri_controller *ctrl) +{ + int err; + + pr_debug("controller info: type=%d addr=%pa size=%pa max-rcid=%u max-mcid=%u\n", + ctrl->type, &ctrl->addr, &ctrl->size, + ctrl->rcid_count, ctrl->mcid_count); + + if (!ctrl->addr) { + pr_warn("controller has invalid addr=0x0, skipping\n"); + return -EINVAL; + } + + if (ctrl->size < CBQRI_CTRL_MIN_REG_SPAN) { + pr_warn("controller at %pa: size %pa < minimum 0x%x, skipping\n", + &ctrl->addr, &ctrl->size, CBQRI_CTRL_MIN_REG_SPAN); + return -EINVAL; + } + + if (!request_mem_region(ctrl->addr, ctrl->size, "cbqri_controller")) { + pr_err("request_mem_region failed for %pa\n", &ctrl->addr); + return -EBUSY; + } + + ctrl->base = ioremap(ctrl->addr, ctrl->size); + if (!ctrl->base) { + pr_err("ioremap failed for %pa\n", &ctrl->addr); + err = -ENOMEM; + goto err_release; + } + + switch (ctrl->type) { + case CBQRI_CONTROLLER_TYPE_CAPACITY: + err = cbqri_probe_cc(ctrl); + break; + case CBQRI_CONTROLLER_TYPE_BANDWIDTH: + err = cbqri_probe_bc(ctrl); + break; + default: + pr_err("unknown controller type %d\n", ctrl->type); + err = -ENODEV; + break; + } + + if (err) + goto err_iounmap; + + return 0; + +err_iounmap: + iounmap(ctrl->base); + ctrl->base = NULL; +err_release: + release_mem_region(ctrl->addr, ctrl->size); + return err; +} + +/* + * Pre-arm every MCID with the Occupancy event so a subsequent READ_COUNTER + * just snapshots the live counter rather than re-configuring the slot. + * Called once per CC during resctrl-side cpuhp online for the L3 monitoring + * domain. 
+ */ +int cbqri_init_mon_counters(struct cbqri_controller *ctrl) +{ + int i, err; + + for (i = 0; i < ctrl->mcid_count; i++) { + mutex_lock(&ctrl->lock); + err = cbqri_mon_op(ctrl, CBQRI_CC_MON_CTL_OFF, + CBQRI_CC_MON_CTL_OP_CONFIG_EVENT, + i, CBQRI_CC_EVT_ID_OCCUPANCY, NULL); + mutex_unlock(&ctrl->lock); + if (err) + return err; + } + return 0; +} + +/* + * 62-bit BC counter delta. Inputs must be pre-masked to + * CBQRI_BC_MON_CTR_VAL_CTR_MASK. The shift promotes the modular + * subtraction into 64-bit so a single wrap (cur < prev) yields the + * correct delta. Multi-wrap is handled by the caller via the + * hardware OVF bit (CBQRI 4.3). This function only needs to recover + * from at most one wrap. + */ +u64 cbqri_bc_mon_overflow(u64 prev_ctr, u64 cur_ctr) +{ + const unsigned int shift = 64 - 62; + u64 chunks = (cur_ctr << shift) - (prev_ctr << shift); + + return chunks >> shift; +} + +/* + * Allocate the per-MCID software accumulator and pre-arm every MCID + * with TOTAL_READ_WRITE so subsequent reads just snapshot the live + * counter. + * + * Caller responsibility: serialize concurrent invocations on the same + * single mon-capable BC (cbqri_resctrl uses cbqri_domain_list_lock for + * this). + */ +int cbqri_init_bc_mon_counters(struct cbqri_controller *bc) +{ + int i, err; + + if (bc->mbm_total_states) + return 0; + + bc->mbm_total_states = kcalloc(bc->mcid_count, + sizeof(*bc->mbm_total_states), + GFP_KERNEL); + if (!bc->mbm_total_states) + return -ENOMEM; + + for (i = 0; i < bc->mcid_count; i++) { + mutex_lock(&bc->lock); + err = cbqri_mon_op(bc, CBQRI_BC_MON_CTL_OFF, + CBQRI_BC_MON_CTL_OP_CONFIG_EVENT, + i, CBQRI_BC_EVT_ID_TOTAL_READ_WRITE, NULL); + mutex_unlock(&bc->lock); + if (err) { + kfree(bc->mbm_total_states); + bc->mbm_total_states = NULL; + return err; + } + } + return 0; +} + +/* + * Return the single mon-capable BC, NULL if zero or more than one. 
BC + * counters can only accurately surface as L3 mbm_total_bytes if every memory + * request flows through the same BC. + */ +struct cbqri_controller *cbqri_find_only_mon_bc(void) +{ + struct cbqri_controller *ctrl, *only_bc = NULL; + + list_for_each_entry(ctrl, &cbqri_controllers, list) { + if (ctrl->type != CBQRI_CONTROLLER_TYPE_BANDWIDTH) + continue; + if (!ctrl->mon_capable) + continue; + if (only_bc) + return NULL; + only_bc = ctrl; + } + return only_bc; +} + +void cbqri_controller_destroy(struct cbqri_controller *ctrl) +{ + /* + * cbqri_probe_controller() clears ctrl->base on its error paths and + * releases the mem region itself, so reach into both only when + * destroy is rolling back a successful probe. + */ + if (ctrl->base) { + iounmap(ctrl->base); + release_mem_region(ctrl->addr, ctrl->size); + } + kfree(ctrl->mbm_total_states); + kfree(ctrl->mweight_cache); + kfree(ctrl->rbwb_cache); + kfree(ctrl); +} + +/* + * Roll back the most recent n successful riscv_cbqri_register_controller() + * calls. Discovery layers use this to undo partial registrations when a + * subsequent table entry turns out to be malformed and the whole parse must + * abort. + * + * Caller serialization: this is intended for boot-time discovery (ACPI + * acpi_arch_init, future DT) which run single-threaded before late_initcall. + * No lock is taken. + */ +void riscv_cbqri_unregister_last(unsigned int n) +{ + while (n--) { + struct cbqri_controller *ctrl; + + if (list_empty(&cbqri_controllers)) + return; + ctrl = list_last_entry(&cbqri_controllers, + struct cbqri_controller, list); + list_del(&ctrl->list); + cbqri_controller_destroy(ctrl); + } +} + +/* + * Allocate, populate, and add to cbqri_controllers a fresh controller + * descriptor based on info supplied by a discovery layer (ACPI RQSC, + * future DT). Resolves the cpumask via PPTT (capacity) so callers do + * not need to know about cacheinfo topology. 
+ */ +int riscv_cbqri_register_controller(const struct cbqri_controller_info *info) +{ + struct cbqri_controller *ctrl; + int err; + + if (!info->addr) { + pr_warn("skipping controller with invalid addr=0x0\n"); + return -EINVAL; + } + + ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL); + if (!ctrl) + return -ENOMEM; + + mutex_init(&ctrl->lock); + + ctrl->addr = info->addr; + ctrl->size = info->size; + ctrl->type = info->type; + ctrl->rcid_count = info->rcid_count; + ctrl->mcid_count = info->mcid_count; + + /* + * SRMCFG encodes RCID in 12 bits. ACPI's acpi_parse_rqsc() already + * caps info->rcid_count at CBQRI_MAX_RCID (1024) so this is unreachable + * today, but a future DT discovery path or a malformed firmware table + * routed through a different validator could bypass that ceiling. + * Catch the violation here rather than silently truncating in every + * FIELD_PREP(SRMCFG_RCID_MASK, closid) on the schedule-in fast path. + */ + if (WARN_ON_ONCE(ctrl->rcid_count > FIELD_MAX(SRMCFG_RCID_MASK) + 1)) { + cbqri_controller_destroy(ctrl); + return -EINVAL; + } + + /* + * mon_ctl encodes MCID in 12 bits. acpi_parse_rqsc() caps + * info->mcid_count at CBQRI_MAX_MCID (1024), but a future discovery + * path could bypass that. Reject an out-of-range count so + * cbqri_init_mon_counters() iterates a trusted bound and no MCID + * aliases another slot through FIELD_MODIFY(MON_CTL_MCID_MASK). + */ + if (WARN_ON_ONCE(ctrl->mcid_count > FIELD_MAX(CBQRI_MON_CTL_MCID_MASK) + 1)) { + cbqri_controller_destroy(ctrl); + return -EINVAL; + } + + switch (info->type) { + case CBQRI_CONTROLLER_TYPE_CAPACITY: { + int level; + + ctrl->cache.cache_id = info->cache_id; + + level = find_acpi_cache_level_from_id(info->cache_id); + if (level < 0) { + pr_warn("Failed to resolve cache level for cache id 0x%x (%d), skipping\n", + info->cache_id, level); + cbqri_controller_destroy(ctrl); + return level; + } + ctrl->cache.cache_level = level; + + /* + * cache_size stays at 0 here. 
cacheinfo is not populated + * yet at acpi_arch_init time. Filled lazily during probe + * via get_cpu_cacheinfo_level(). + */ + + err = acpi_pptt_get_cpumask_from_cache_id(info->cache_id, + &ctrl->cache.cpu_mask); + if (err) { + pr_warn("Failed to get cpumask for cache id 0x%x (%d), skipping\n", + info->cache_id, err); + cbqri_controller_destroy(ctrl); + return err; + } + break; + } + case CBQRI_CONTROLLER_TYPE_BANDWIDTH: { + struct cbqri_controller *other; + int node_id; + + ctrl->mem.prox_dom = info->prox_dom; + node_id = pxm_to_node(info->prox_dom); + if (node_id == NUMA_NO_NODE) { + pr_warn("controller at %pa: proximity domain %u has no NUMA node, skipping\n", + &ctrl->addr, info->prox_dom); + cbqri_controller_destroy(ctrl); + return -ENODEV; + } + /* + * cbqri_resctrl_dom tracks a single hw_ctrl per domain, so a + * second BC sharing the same proximity domain would be + * silently dropped when the resctrl glue resolves the cpu to + * an existing domain. Reject the duplicate at register time + * to keep the failure mode visible. 
+ */ + list_for_each_entry(other, &cbqri_controllers, list) { + if (other->type != CBQRI_CONTROLLER_TYPE_BANDWIDTH) + continue; + if (other->mem.prox_dom != info->prox_dom) + continue; + pr_warn("controller at %pa: proximity domain %u already claimed by %pa, skipping\n", + &ctrl->addr, info->prox_dom, &other->addr); + cbqri_controller_destroy(ctrl); + return -EEXIST; + } + cpumask_copy(&ctrl->mem.cpu_mask, cpumask_of_node(node_id)); + break; + } + default: + pr_warn("controller at %pa: unknown type %u, skipping\n", + &ctrl->addr, info->type); + cbqri_controller_destroy(ctrl); + return -EINVAL; + } + + err = cbqri_probe_controller(ctrl); + if (err) { + cbqri_controller_destroy(ctrl); + return err; + } + + list_add_tail(&ctrl->list, &cbqri_controllers); + return 0; +} diff --git a/drivers/resctrl/cbqri_internal.h b/drivers/resctrl/cbqri_internal.h new file mode 100644 index 00000000000000..68a40f846f4039 --- /dev/null +++ b/drivers/resctrl/cbqri_internal.h @@ -0,0 +1,247 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef _DRIVERS_RESCTRL_CBQRI_INTERNAL_H +#define _DRIVERS_RESCTRL_CBQRI_INTERNAL_H + +#include +#include +#include +#include +#include +#include + +/* + * Capacity Controller (CC) and Bandwidth Controller (BC) MMIO register offsets. + */ +#define CBQRI_CC_CAPABILITIES_OFF 0 +#define CBQRI_CC_MON_CTL_OFF 8 +#define CBQRI_CC_MON_CTL_VAL_OFF 16 +#define CBQRI_CC_ALLOC_CTL_OFF 24 +#define CBQRI_CC_BLOCK_MASK_OFF 32 + +#define CBQRI_BC_CAPABILITIES_OFF 0 +#define CBQRI_BC_MON_CTL_OFF 8 +#define CBQRI_BC_MON_CTR_VAL_OFF 16 +#define CBQRI_BC_ALLOC_CTL_OFF 24 +#define CBQRI_BC_BW_ALLOC_OFF 32 + +/* + * Smallest MMIO span the driver actually accesses: highest defined + * register offset (0x20) plus the 8-byte register width. Used by + * cbqri_probe_controller() to reject undersized firmware-supplied + * mappings before request_mem_region/ioremap, so a u64 access at + * BLOCK_MASK does not walk past the end of the mapping. 
+ */ +#define CBQRI_CTRL_MIN_REG_SPAN 0x28u + +#define CBQRI_CC_CAPABILITIES_VER_MINOR_MASK GENMASK_ULL(3, 0) +#define CBQRI_CC_CAPABILITIES_VER_MAJOR_MASK GENMASK_ULL(7, 4) +#define CBQRI_CC_CAPABILITIES_NCBLKS_MASK GENMASK_ULL(23, 8) + +#define CBQRI_BC_CAPABILITIES_VER_MINOR_MASK GENMASK_ULL(3, 0) +#define CBQRI_BC_CAPABILITIES_VER_MAJOR_MASK GENMASK_ULL(7, 4) +#define CBQRI_BC_CAPABILITIES_NBWBLKS_MASK GENMASK_ULL(23, 8) +#define CBQRI_BC_CAPABILITIES_MRBWB_MASK GENMASK_ULL(47, 32) + +/* + * CC and BC control and mon registers are 64-bit. Keep every field mask + * GENMASK_ULL so FIELD_MODIFY() or ~mask on a u64 register never + * zero-extends a 32-bit mask and clobbers STATUS/BUSY/WPRI in bits 63:32 + * if RV32 support is added in the future. + */ +#define CBQRI_CONTROL_REGISTERS_OP_MASK GENMASK_ULL(4, 0) +#define CBQRI_CONTROL_REGISTERS_AT_MASK GENMASK_ULL(7, 5) +#define CBQRI_CONTROL_REGISTERS_AT_DATA 0 +#define CBQRI_CONTROL_REGISTERS_AT_CODE 1 +#define CBQRI_CONTROL_REGISTERS_RCID_MASK GENMASK_ULL(19, 8) +#define CBQRI_CONTROL_REGISTERS_STATUS_MASK GENMASK_ULL(38, 32) +#define CBQRI_CONTROL_REGISTERS_BUSY_MASK GENMASK_ULL(39, 39) +#define CBQRI_CONTROL_REGISTERS_RBWB_MASK GENMASK_ULL(15, 0) +#define CBQRI_CONTROL_REGISTERS_MWEIGHT_MASK GENMASK_ULL(27, 20) + +#define CBQRI_CC_ALLOC_CTL_OP_CONFIG_LIMIT 1 +#define CBQRI_CC_ALLOC_CTL_OP_READ_LIMIT 2 +#define CBQRI_CC_ALLOC_CTL_STATUS_SUCCESS 1 + +#define CBQRI_BC_ALLOC_CTL_OP_CONFIG_LIMIT 1 +#define CBQRI_BC_ALLOC_CTL_OP_READ_LIMIT 2 +#define CBQRI_BC_ALLOC_CTL_STATUS_SUCCESS 1 + +#define CBQRI_CC_MON_CTL_OP_CONFIG_EVENT 1 +#define CBQRI_CC_MON_CTL_OP_READ_COUNTER 2 + +#define CBQRI_BC_MON_CTL_OP_CONFIG_EVENT 1 +#define CBQRI_BC_MON_CTL_OP_READ_COUNTER 2 + +/* Bandwidth usage monitoring event IDs (CBQRI spec Table 10) */ +#define CBQRI_BC_EVT_ID_TOTAL_READ_WRITE 1 + +/* bc_mon_ctr_val layout (CBQRI spec section 4.3, Figure 7) */ +#define CBQRI_BC_MON_CTR_VAL_CTR_MASK GENMASK_ULL(61, 0) +#define 
CBQRI_BC_MON_CTR_VAL_INVALID BIT_ULL(62) +#define CBQRI_BC_MON_CTR_VAL_OVF BIT_ULL(63) + +/* mon_ctl field masks (CC and BC share an identical OP/MCID/EVT_ID/STATUS layout) */ +#define CBQRI_MON_CTL_OP_MASK GENMASK_ULL(4, 0) +#define CBQRI_MON_CTL_MCID_MASK GENMASK_ULL(19, 8) +#define CBQRI_MON_CTL_EVT_ID_MASK GENMASK_ULL(27, 20) +#define CBQRI_MON_CTL_STATUS_MASK GENMASK_ULL(38, 32) +#define CBQRI_MON_CTL_STATUS_SUCCESS 1 + +/* Capacity usage monitoring event IDs (CBQRI spec Table 4) */ +#define CBQRI_CC_EVT_ID_NONE 0 +#define CBQRI_CC_EVT_ID_OCCUPANCY 1 + +/* Capacity Controller hardware capabilities */ +struct riscv_cbqri_capacity_caps { + u16 ncblks; + bool supports_alloc_at_code; +}; + +/* Bandwidth Controller hardware capabilities */ +struct riscv_cbqri_bandwidth_caps { + u16 nbwblks; /* number of bandwidth blocks */ + u16 mrbwb; /* max reserved bw blocks */ + + bool supports_alloc_at_code; +}; + +/** + * struct cbqri_bc_mon_state - per-MCID software accumulator for BC bandwidth + * @prev_ctr: previous 62-bit hardware snapshot (already masked to CTR field) + * @chunks: accumulated 64-bit byte total across hardware wraparounds + * + * Updated in resctrl_arch_rmid_read() under cbqri_controller::lock and + * zeroed by resctrl_arch_reset_rmid(). + */ +struct cbqri_bc_mon_state { + u64 prev_ctr; + u64 chunks; +}; + +/** + * enum cbqri_at - capacity controller access type for CDP + * @CBQRI_AT_DATA: data access (CBQRI Table 1, AT=0) + * @CBQRI_AT_CODE: code access (CBQRI Table 1, AT=1) + * + * Selects between data and code halves on controllers that advertise + * supports_alloc_at_code. The resctrl glue maps from CDP_DATA / CDP_CODE + * to this enum at the boundary so cbqri_devices.c stays free of fs/resctrl + * types. 
+ */ +enum cbqri_at { + CBQRI_AT_DATA = CBQRI_CONTROL_REGISTERS_AT_DATA, + CBQRI_AT_CODE = CBQRI_CONTROL_REGISTERS_AT_CODE, +}; + +/** + * struct cbqri_cc_config - desired capacity allocation state for one rcid + * @cbm: capacity block mask + * @at: AT half (data or code) the @cbm applies to + * @cdp_enabled: when false and the controller supports AT, mirror @cbm + * into the other AT half so both stay in sync + */ +struct cbqri_cc_config { + u64 cbm; + enum cbqri_at at; + bool cdp_enabled; +}; + +struct cbqri_controller { + void __iomem *base; + /* + * Serializes the write-then-poll-busy MMIO sequences on this + * controller. Each CBQRI op may busy-wait up to 1 ms on slow + * firmware, so use a sleeping mutex (paired with the sleeping + * readq_poll_timeout() in cbqri_wait_busy_flag()) to keep + * preemption enabled, which is required for PREEMPT_RT. + * All resctrl-arch entry points run in process context. + */ + struct mutex lock; + /* + * Set by cbqri_wait_busy_flag() on BUSY timeout, cleared on the + * next successful wait. Informational only, used for diagnostics. + */ + bool faulted; + + int ver_major; + int ver_minor; + + struct riscv_cbqri_bandwidth_caps bc; + struct riscv_cbqri_capacity_caps cc; + + bool alloc_capable; + bool mon_capable; + + phys_addr_t addr; + phys_addr_t size; + enum cbqri_controller_type type; + u32 rcid_count; + u32 mcid_count; + + /* + * Per-RCID cache of the most recent Rbwb / Mweight values applied + * via CONFIG_LIMIT. bc_bw_alloc packs both fields into one register, + * so cbqri_apply_bc_field() seeds both halves from the authoritative + * cache before CONFIG_LIMIT. + */ + u16 *rbwb_cache; + u8 *mweight_cache; + + /* + * Per-MCID 64-bit software accumulator for the BC's mbm_total_bytes + * event. Allocated by cbqri_init_bc_mon_counters() when this BC is + * paired with an L3 monitoring domain, sized by ->mcid_count. NULL + * on capacity controllers and on BCs that are not mon-paired. 
+ * Protected by ->lock along with the surrounding MMIO sequence. + */ + struct cbqri_bc_mon_state *mbm_total_states; + + struct list_head list; + + struct cache_controller { + u32 cache_level; + u32 cache_size; /* in bytes */ + struct cpumask cpu_mask; + /* Unique Cache ID from the PPTT table's Cache Type Structure */ + u32 cache_id; + } cache; + + struct mem_controller { + /* Proximity Domain from the SRAT table's Memory Affinity Structure */ + u32 prox_dom; + struct cpumask cpu_mask; + } mem; +}; + +extern struct list_head cbqri_controllers; + +void cbqri_controller_destroy(struct cbqri_controller *ctrl); + +int cbqri_apply_cache_config(struct cbqri_controller *ctrl, u32 closid, + const struct cbqri_cc_config *cfg); + +int cbqri_read_cache_config(struct cbqri_controller *ctrl, u32 closid, + enum cbqri_at at, u32 *cbm_out); + +int cbqri_mon_op(struct cbqri_controller *ctrl, int reg_offset, + int operation, int mcid, int evt_id, u64 *out_reg); + +int cbqri_init_mon_counters(struct cbqri_controller *ctrl); + +int cbqri_apply_rbwb(struct cbqri_controller *ctrl, u32 closid, + u64 rbwb, bool check_sum); + +int cbqri_apply_mweight_config(struct cbqri_controller *ctrl, u32 closid, + u64 mweight); + +int cbqri_read_rbwb(struct cbqri_controller *ctrl, u32 closid, u64 *rbwb_out); + +int cbqri_read_mweight(struct cbqri_controller *ctrl, u32 closid, u64 *mweight_out); + +u64 cbqri_bc_mon_overflow(u64 prev_ctr, u64 cur_ctr); + +int cbqri_init_bc_mon_counters(struct cbqri_controller *bc); + +struct cbqri_controller *cbqri_find_only_mon_bc(void); + +#endif /* _DRIVERS_RESCTRL_CBQRI_INTERNAL_H */ diff --git a/drivers/resctrl/cbqri_resctrl.c b/drivers/resctrl/cbqri_resctrl.c new file mode 100644 index 00000000000000..efd75d241122f9 --- /dev/null +++ b/drivers/resctrl/cbqri_resctrl.c @@ -0,0 +1,1520 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#define pr_fmt(fmt) "%s:%s: " fmt, KBUILD_MODNAME, __func__ + +#include +#include +#include +#include +#include +#include +#include +#include
+#include +#include +#include +#include + +#include +#include + +#include "cbqri_internal.h" + +struct cbqri_resctrl_res { + struct cbqri_controller *ctrl; + struct rdt_resource resctrl_res; + bool cdp_enabled; +}; + +struct cbqri_resctrl_dom { + struct rdt_ctrl_domain resctrl_ctrl_dom; + struct cbqri_controller *hw_ctrl; + /* + * For an L3 capacity controller paired with a bandwidth controller + * of matching topology, paired_bc caches that BC so mbm_total_bytes + * reads / resets don't have to walk cbqri_controllers on every hit. + * NULL for non-L3 domains and L3s without a paired BC. + */ + struct cbqri_controller *paired_bc; +}; + +static struct cbqri_resctrl_res cbqri_resctrl_resources[RDT_NUM_RESOURCES]; + +static bool exposed_alloc_capable; +static bool exposed_mon_capable; + +/* Used by resctrl_arch_system_num_rmid_idx(). Narrowed by accumulate_caps. */ +static u32 max_rmid = U32_MAX; + +/* Protects ctrl_domain list mutations across CPU hotplug. */ +static DEFINE_MUTEX(cbqri_domain_list_lock); + +static struct rdt_ctrl_domain * +cbqri_find_ctrl_domain(struct list_head *h, int id) +{ + struct rdt_domain_hdr *hdr = resctrl_find_domain(h, id, NULL); + + return hdr ? container_of(hdr, struct rdt_ctrl_domain, hdr) : NULL; +} + +static struct rdt_l3_mon_domain * +cbqri_find_l3_mon_domain(struct list_head *h, int id) +{ + struct rdt_domain_hdr *hdr = resctrl_find_domain(h, id, NULL); + + return hdr ? container_of(hdr, struct rdt_l3_mon_domain, hdr) : NULL; +} + +static int cbqri_apply_cache_config_dom(struct cbqri_resctrl_dom *hw_dom, + struct rdt_resource *r, + u32 closid, enum resctrl_conf_type t, + u64 cbm) +{ + struct cbqri_resctrl_res *hw_res = + container_of(r, struct cbqri_resctrl_res, resctrl_res); + struct cbqri_cc_config cfg = { + .cbm = cbm, + .at = (t == CDP_CODE) ? 
CBQRI_AT_CODE : CBQRI_AT_DATA, + .cdp_enabled = hw_res->cdp_enabled, + }; + + return cbqri_apply_cache_config(hw_dom->hw_ctrl, closid, &cfg); +} + +bool resctrl_arch_alloc_capable(void) +{ + return exposed_alloc_capable; +} + +bool resctrl_arch_mon_capable(void) +{ + return exposed_mon_capable; +} + +bool resctrl_arch_get_cdp_enabled(enum resctrl_res_level rid) +{ + if (rid != RDT_RESOURCE_L2 && rid != RDT_RESOURCE_L3) + return false; + return cbqri_resctrl_resources[rid].cdp_enabled; +} + +int resctrl_arch_set_cdp_enabled(enum resctrl_res_level rid, bool enable) +{ + struct cbqri_resctrl_res *cbqri_res; + + if (rid != RDT_RESOURCE_L2 && rid != RDT_RESOURCE_L3) + return -ENODEV; + + cbqri_res = &cbqri_resctrl_resources[rid]; + if (!cbqri_res->resctrl_res.cdp_capable) + return -ENODEV; + + cbqri_res->cdp_enabled = enable; + return 0; +} + +struct rdt_resource *resctrl_arch_get_resource(enum resctrl_res_level l) +{ + if (l >= RDT_NUM_RESOURCES) + return NULL; + + return &cbqri_resctrl_resources[l].resctrl_res; +} + +/* + * fs/resctrl unconditionally references the symbols below before checking + * mon_capable. They are stubs for features CBQRI does not yet support. 
+ */ +bool resctrl_arch_is_evt_configurable(enum resctrl_event_id evt) +{ + return false; +} + +void *resctrl_arch_mon_ctx_alloc(struct rdt_resource *r, + enum resctrl_event_id evtid) +{ + return NULL; +} + +void resctrl_arch_mon_ctx_free(struct rdt_resource *r, + enum resctrl_event_id evtid, void *arch_mon_ctx) +{ +} + +void resctrl_arch_config_cntr(struct rdt_resource *r, struct rdt_l3_mon_domain *d, + enum resctrl_event_id evtid, u32 rmid, u32 closid, + u32 cntr_id, bool assign) +{ +} + +int resctrl_arch_cntr_read(struct rdt_resource *r, struct rdt_l3_mon_domain *d, + u32 unused, u32 rmid, int cntr_id, + enum resctrl_event_id eventid, u64 *val) +{ + return -EOPNOTSUPP; +} + +bool resctrl_arch_mbm_cntr_assign_enabled(struct rdt_resource *r) +{ + return false; +} + +int resctrl_arch_mbm_cntr_assign_set(struct rdt_resource *r, bool enable) +{ + return -EOPNOTSUPP; +} + +void resctrl_arch_reset_cntr(struct rdt_resource *r, struct rdt_l3_mon_domain *d, + u32 unused, u32 rmid, int cntr_id, + enum resctrl_event_id eventid) +{ +} + +bool resctrl_arch_get_io_alloc_enabled(struct rdt_resource *r) +{ + return false; +} + +int resctrl_arch_io_alloc_enable(struct rdt_resource *r, bool enable) +{ + return -EOPNOTSUPP; +} + +void resctrl_arch_mon_event_config_read(void *info) +{ +} + +void resctrl_arch_mon_event_config_write(void *info) +{ +} + +void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_l3_mon_domain *d, + u32 unused, u32 rmid, enum resctrl_event_id eventid) +{ + struct cbqri_resctrl_dom *hw_dom; + struct rdt_ctrl_domain *cd; + + if (irqs_disabled()) + return; + + mutex_lock(&cbqri_domain_list_lock); + + /* + * Occupancy MCIDs are armed once by cbqri_init_mon_counters() and + * free run thereafter, so only mbm_total_bytes needs a per-rmid reset. 
+ */ + switch (eventid) { + case QOS_L3_MBM_TOTAL_EVENT_ID: { + struct cbqri_controller *bc; + + cd = cbqri_find_ctrl_domain(&r->ctrl_domains, d->hdr.id); + if (!cd) + break; + hw_dom = container_of(cd, struct cbqri_resctrl_dom, resctrl_ctrl_dom); + bc = hw_dom->paired_bc; + if (!bc) + break; + if (WARN_ON_ONCE(!bc->mbm_total_states)) + break; + if (rmid >= bc->mcid_count) + break; + + mutex_lock(&bc->lock); + /* + * CONFIG_EVENT both resets and re-arms. Skip the accumulator + * memset on failure. A stale hardware counter X with + * prev_ctr=0 would inject overflow(0, X) on the next read. + */ + if (!cbqri_mon_op(bc, CBQRI_BC_MON_CTL_OFF, + CBQRI_BC_MON_CTL_OP_CONFIG_EVENT, rmid, + CBQRI_BC_EVT_ID_TOTAL_READ_WRITE, NULL)) + memset(&bc->mbm_total_states[rmid], 0, + sizeof(*bc->mbm_total_states)); + mutex_unlock(&bc->lock); + break; + } + + default: + break; + } + + mutex_unlock(&cbqri_domain_list_lock); +} + +void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_l3_mon_domain *d) +{ + int i; + + /* + * Occupancy counters free run and need no reset; only the + * mbm_total_bytes counters are re-armed and their accumulators + * cleared. Bound by max_rmid, which cbqri_resctrl_accumulate_caps() + * narrows to the smallest mcid_count among the exposed counter + * sources (not every controller in the system). + */ + for (i = 0; i < max_rmid; i++) + resctrl_arch_reset_rmid(r, d, 0, i, QOS_L3_MBM_TOTAL_EVENT_ID); +} + +int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain_hdr *hdr, + u32 closid, u32 rmid, enum resctrl_event_id eventid, + void *arch_priv, u64 *val, void *arch_mon_ctx) +{ + struct cbqri_resctrl_dom *hw_dom; + struct cbqri_controller *ctrl; + struct rdt_ctrl_domain *d; + u64 ctr_val; + int err = 0; + + resctrl_arch_rmid_read_context_check(); + + /* + * This path takes sleeping mutexes (cbqri_domain_list_lock and + * the controller's ->lock) and cbqri_mon_op() polls BUSY for up + * to 1 ms, neither of which is safe under irqs_disabled(). + */ + if (irqs_disabled()) + return -EIO; + + /* + * cbqri_domain_list_lock serialises the list walk against + * cbqri_detach_cpu_from_ctrl_domains().
+ */ + mutex_lock(&cbqri_domain_list_lock); + + switch (eventid) { + case QOS_L3_OCCUP_EVENT_ID: + d = cbqri_find_ctrl_domain(&r->ctrl_domains, hdr->id); + if (!d) { + err = -ENOENT; + break; + } + + hw_dom = container_of(d, struct cbqri_resctrl_dom, resctrl_ctrl_dom); + ctrl = hw_dom->hw_ctrl; + + mutex_lock(&ctrl->lock); + + /* + * MCIDs are armed with Occupancy once at init and free run. + * Pass EVT_ID explicitly as the CBQRI spec does not guarantee + * sticky-last-configured-event for READ_COUNTER. + */ + err = cbqri_mon_op(ctrl, CBQRI_CC_MON_CTL_OFF, + CBQRI_CC_MON_CTL_OP_READ_COUNTER, + rmid, CBQRI_CC_EVT_ID_OCCUPANCY, NULL); + if (!err) { + ctr_val = ioread64(ctrl->base + CBQRI_CC_MON_CTL_VAL_OFF); + + /* + * Capacity blocks to bytes. Multiply before divide + * so a non-power-of-2 ncblks doesn't truncate. + */ + *val = (u64)ctrl->cache.cache_size * ctr_val / + ctrl->cc.ncblks; + } + mutex_unlock(&ctrl->lock); + break; + + case QOS_L3_MBM_TOTAL_EVENT_ID: { + struct cbqri_controller *bc; + + /* + * The L3 monitoring domain's id is the L3 cache id. The + * matching ctrl domain's hw_dom->paired_bc was cached at + * add time to avoid walking cbqri_controllers on every read. + */ + d = cbqri_find_ctrl_domain(&r->ctrl_domains, hdr->id); + if (!d) { + err = -ENOENT; + break; + } + hw_dom = container_of(d, struct cbqri_resctrl_dom, resctrl_ctrl_dom); + bc = hw_dom->paired_bc; + if (!bc) { + err = -ENOENT; + break; + } + if (WARN_ON_ONCE(!bc->mbm_total_states)) { + err = -EIO; + break; + } + if (rmid >= bc->mcid_count) { + err = -ERANGE; + break; + } + + mutex_lock(&bc->lock); + /* Pass EVT_ID explicitly. Same reason as the CC path above. 
*/ + err = cbqri_mon_op(bc, CBQRI_BC_MON_CTL_OFF, + CBQRI_BC_MON_CTL_OP_READ_COUNTER, rmid, + CBQRI_BC_EVT_ID_TOTAL_READ_WRITE, NULL); + if (err) + goto out_bc; + + ctr_val = ioread64(bc->base + CBQRI_BC_MON_CTR_VAL_OFF); + + if (ctr_val & CBQRI_BC_MON_CTR_VAL_INVALID) { + /* + * Return the last good total and leave prev_ctr so + * the next valid sample resumes from there. + */ + *val = bc->mbm_total_states[rmid].chunks; + } else if (ctr_val & CBQRI_BC_MON_CTR_VAL_OVF) { + /* + * OVF is sticky until next CONFIG_EVENT. + * cbqri_bc_mon_overflow() can recover at most + * one wrap. With OVF set, the count is unknown, + * so re-arm and re-anchor prev_ctr=0. + */ + struct cbqri_bc_mon_state *s = &bc->mbm_total_states[rmid]; + + pr_warn_ratelimited("BC@%pa MCID %u: bandwidth counter overflow\n", + &bc->addr, rmid); + err = cbqri_mon_op(bc, CBQRI_BC_MON_CTL_OFF, + CBQRI_BC_MON_CTL_OP_CONFIG_EVENT, rmid, + CBQRI_BC_EVT_ID_TOTAL_READ_WRITE, NULL); + if (err) + goto out_bc; + + s->prev_ctr = 0; + *val = s->chunks; + } else { + struct cbqri_bc_mon_state *s = &bc->mbm_total_states[rmid]; + u64 cur = ctr_val & CBQRI_BC_MON_CTR_VAL_CTR_MASK; + + s->chunks += cbqri_bc_mon_overflow(s->prev_ctr, cur); + s->prev_ctr = cur; + *val = s->chunks; + } +out_bc: + mutex_unlock(&bc->lock); + break; + } + + default: + err = -EINVAL; + break; + } + + mutex_unlock(&cbqri_domain_list_lock); + return err; +} + +/* + * Note about terminology between x86 (Intel RDT/AMD QoS) and RISC-V: + * CLOSID on x86 is RCID on RISC-V + * RMID on x86 is MCID on RISC-V + */ +u32 resctrl_arch_get_num_closid(struct rdt_resource *res) +{ + struct cbqri_resctrl_res *hw_res; + + hw_res = container_of(res, struct cbqri_resctrl_res, resctrl_res); + + if (!hw_res->ctrl) + return 0; + + return hw_res->ctrl->rcid_count; +} + +u32 resctrl_arch_system_num_rmid_idx(void) +{ + return max_rmid; +} + +u32 resctrl_arch_rmid_idx_encode(u32 closid, u32 rmid) +{ + return rmid; +} + +void resctrl_arch_rmid_idx_decode(u32 idx, u32 
*closid, u32 *rmid) +{ + *closid = RISCV_RESCTRL_EMPTY_CLOSID; + *rmid = idx; +} + +void resctrl_arch_set_cpu_default_closid_rmid(int cpu, u32 closid, u32 rmid) +{ + u32 srmcfg = FIELD_PREP(SRMCFG_RCID_MASK, closid) | + FIELD_PREP(SRMCFG_MCID_MASK, rmid); + + WRITE_ONCE(per_cpu(cpu_srmcfg_default, cpu), srmcfg); +} + +void resctrl_arch_sched_in(struct task_struct *tsk) +{ + __switch_to_srmcfg(tsk); +} + +void resctrl_arch_set_closid_rmid(struct task_struct *tsk, u32 closid, u32 rmid) +{ + u32 srmcfg = FIELD_PREP(SRMCFG_RCID_MASK, closid) | + FIELD_PREP(SRMCFG_MCID_MASK, rmid); + + WRITE_ONCE(tsk->thread.srmcfg, srmcfg); +} + +void resctrl_arch_sync_cpu_closid_rmid(void *info) +{ + struct resctrl_cpu_defaults *r = info; + + lockdep_assert_preemption_disabled(); + + if (r) { + resctrl_arch_set_cpu_default_closid_rmid(smp_processor_id(), + r->closid, r->rmid); + } + + resctrl_arch_sched_in(current); +} + +bool resctrl_arch_match_closid(struct task_struct *tsk, u32 closid) +{ + return FIELD_GET(SRMCFG_RCID_MASK, READ_ONCE(tsk->thread.srmcfg)) == closid; +} + +bool resctrl_arch_match_rmid(struct task_struct *tsk, u32 closid, u32 rmid) +{ + return FIELD_GET(SRMCFG_MCID_MASK, READ_ONCE(tsk->thread.srmcfg)) == rmid; +} + +void resctrl_arch_pre_mount(void) +{ + /* All controllers discovered at boot via late_initcall. Nothing to do. */ +} + +int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_ctrl_domain *d, + u32 closid, enum resctrl_conf_type t, u32 cfg_val) +{ + struct cbqri_resctrl_dom *dom; + + dom = container_of(d, struct cbqri_resctrl_dom, resctrl_ctrl_dom); + + if (!r->alloc_capable) + return -EINVAL; + + switch (r->rid) { + case RDT_RESOURCE_L2: + case RDT_RESOURCE_L3: + return cbqri_apply_cache_config_dom(dom, r, closid, t, cfg_val); + case RDT_RESOURCE_MB_MIN: + /* sum(Rbwb) <= MRBWB validation runs inside cbqri_apply_rbwb(). 
*/ + return cbqri_apply_rbwb(dom->hw_ctrl, closid, cfg_val, true); + case RDT_RESOURCE_MB_WGHT: + return cbqri_apply_mweight_config(dom->hw_ctrl, closid, cfg_val); + default: + return -EINVAL; + } +} + +int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid) +{ + struct resctrl_staged_config *cfg; + enum resctrl_conf_type t; + struct rdt_ctrl_domain *d; + int err = 0; + + /* Walking r->ctrl_domains, ensure it can't race with cpuhp */ + lockdep_assert_cpus_held(); + + list_for_each_entry(d, &r->ctrl_domains, hdr.list) { + for (t = 0; t < CDP_NUM_TYPES; t++) { + cfg = &d->staged_config[t]; + if (!cfg->have_new_ctrl) + continue; + err = resctrl_arch_update_one(r, d, closid, t, cfg->new_ctrl); + if (err) + return err; + } + } + return err; +} + +u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_ctrl_domain *d, + u32 closid, enum resctrl_conf_type type) +{ + struct cbqri_resctrl_dom *hw_dom; + struct cbqri_controller *ctrl; + enum cbqri_at at; + u32 val; + int err; + + hw_dom = container_of(d, struct cbqri_resctrl_dom, resctrl_ctrl_dom); + ctrl = hw_dom->hw_ctrl; + val = resctrl_get_default_ctrl(r); + + if (!r->alloc_capable) + return val; + + switch (r->rid) { + case RDT_RESOURCE_L2: + case RDT_RESOURCE_L3: + at = (type == CDP_CODE) ? CBQRI_AT_CODE : CBQRI_AT_DATA; + err = cbqri_read_cache_config(ctrl, closid, at, &val); + if (err < 0) + val = resctrl_get_default_ctrl(r); + break; + case RDT_RESOURCE_MB_MIN: { + u64 rbwb; + + err = cbqri_read_rbwb(ctrl, closid, &rbwb); + if (err == 0) + val = (u32)rbwb; + break; + } + case RDT_RESOURCE_MB_WGHT: { + u64 mweight; + + err = cbqri_read_mweight(ctrl, closid, &mweight); + if (err == 0) + val = (u32)mweight; + break; + } + default: + break; + } + + return val; +} + +/* + * RCID 0 carries the remaining MRBWB after every other RCID is seeded with + * the minimum Rbwb of 1. cbqri_probe_bc() rejects a bandwidth controller + * with mrbwb < rcid_count, so this subtraction cannot underflow. 
+ */ +static u64 cbqri_rcid0_rbwb(struct cbqri_controller *ctrl) +{ + if (WARN_ON_ONCE(ctrl->bc.mrbwb < ctrl->rcid_count)) + return 1; + return ctrl->bc.mrbwb - (ctrl->rcid_count - 1); +} + +void resctrl_arch_reset_all_ctrls(struct rdt_resource *r) +{ + struct cbqri_resctrl_res *hw_res; + struct cbqri_resctrl_dom *dom; + struct rdt_ctrl_domain *d; + enum resctrl_conf_type t; + u32 default_ctrl; + int i; + + lockdep_assert_cpus_held(); + + hw_res = container_of(r, struct cbqri_resctrl_res, resctrl_res); + default_ctrl = resctrl_get_default_ctrl(r); + + if (!hw_res->ctrl) + return; + + list_for_each_entry(d, &r->ctrl_domains, hdr.list) { + dom = container_of(d, struct cbqri_resctrl_dom, + resctrl_ctrl_dom); + + switch (r->rid) { + case RDT_RESOURCE_MB_MIN: + /* + * CBQRI section 4.5: Rbwb >= 1, sum(Rbwb) <= MRBWB. + * Walk RCIDs 1..N-1 first (each set to the minimum of 1) + * so RCID 0 lands last with the remaining budget. + */ + for (i = 0; i < hw_res->ctrl->rcid_count; i++) { + u32 rcid = (i + 1) % hw_res->ctrl->rcid_count; + u64 rbwb = (rcid == 0) ? + cbqri_rcid0_rbwb(dom->hw_ctrl) : 1; + int rerr; + + rerr = cbqri_apply_rbwb(dom->hw_ctrl, rcid, rbwb, false); + if (rerr) + pr_err_ratelimited("RBWB reset RCID %u failed (%d)\n", + rcid, rerr); + } + break; + case RDT_RESOURCE_MB_WGHT: + /* All RCIDs start at max weight (the new-group default).
*/ + for (i = 0; i < hw_res->ctrl->rcid_count; i++) { + int rerr; + + rerr = cbqri_apply_mweight_config(dom->hw_ctrl, i, + default_ctrl); + if (rerr) + pr_err_ratelimited("Mweight reset RCID %u failed (%d)\n", + i, rerr); + } + break; + default: + for (i = 0; i < hw_res->ctrl->rcid_count; i++) { + for (t = 0; t < CDP_NUM_TYPES; t++) { + int rerr; + + rerr = resctrl_arch_update_one(r, d, i, t, + default_ctrl); + if (rerr) + pr_err_ratelimited("rid=%d reset RCID %u type %u failed (%d)\n", + r->rid, i, t, rerr); + } + } + break; + } + } +} + +static struct rdt_ctrl_domain *cbqri_new_domain(struct cbqri_controller *ctrl) +{ + struct cbqri_resctrl_dom *hw_dom; + struct rdt_ctrl_domain *domain; + + hw_dom = kzalloc_obj(*hw_dom, GFP_KERNEL); + if (!hw_dom) + return NULL; + + hw_dom->hw_ctrl = ctrl; + domain = &hw_dom->resctrl_ctrl_dom; + + INIT_LIST_HEAD(&domain->hdr.list); + + return domain; +} + +static int cbqri_init_domain_ctrlval(struct rdt_resource *r, struct rdt_ctrl_domain *d) +{ + struct cbqri_resctrl_res *hw_res; + struct cbqri_resctrl_dom *dom; + enum resctrl_conf_type t; + int err = 0; + u64 rbwb; + int i; + + hw_res = container_of(r, struct cbqri_resctrl_res, resctrl_res); + dom = container_of(d, struct cbqri_resctrl_dom, resctrl_ctrl_dom); + + for (i = 0; i < hw_res->ctrl->rcid_count; i++) { + /* + * For MB_MIN walk, RCIDs 1..N-1 then RCID 0 last so the sum + * doesn't exceed MRBWB during the walk. + */ + u32 rcid = (r->rid == RDT_RESOURCE_MB_MIN) ? + ((i + 1) % hw_res->ctrl->rcid_count) : i; + + switch (r->rid) { + case RDT_RESOURCE_MB_MIN: + /* + * CBQRI section 4.5: Rbwb >= 1, sum(Rbwb) <= MRBWB. + * RCID 0 takes the remaining budget. + */ + rbwb = (rcid == 0) ? cbqri_rcid0_rbwb(dom->hw_ctrl) : 1; + + err = cbqri_apply_rbwb(dom->hw_ctrl, rcid, rbwb, false); + break; + case RDT_RESOURCE_MB_WGHT: + /* Match the new-group default: equal weights across RCIDs. 
*/ + err = cbqri_apply_mweight_config(dom->hw_ctrl, i, + resctrl_get_default_ctrl(r)); + break; + default: + /* + * Seed both DATA and CODE staged slots so a later + * mount with -o cdp does not see stale CODE values. + * On non-AT controllers cbqri_cc_alloc_op() masks + * AT to 0, so all three iterations land on the same + * hardware state. The redundant writes are harmless. + */ + for (t = 0; t < CDP_NUM_TYPES; t++) { + err = resctrl_arch_update_one(r, d, i, t, + resctrl_get_default_ctrl(r)); + if (err) + break; + } + break; + } + if (err) + return err; + } + return 0; +} + +/* + * Walk cbqri_controllers and pick one capacity controller (CC) per cache + * level (L2/L3) to back the corresponding RDT_RESOURCE_L*. When more than + * one CC sits at the same level (e.g. one per socket), they must agree on + * rcid_count / ncblks / alloc_capable. A mismatch is fatal because resctrl + * exposes a single set of caps per rid. The first matching controller wins. + */ +static int cbqri_resctrl_pick_caches(void) +{ + struct cbqri_controller *ctrl; + + list_for_each_entry(ctrl, &cbqri_controllers, list) { + struct cbqri_resctrl_res *cbqri_res; + enum resctrl_res_level rid; + + if (ctrl->type != CBQRI_CONTROLLER_TYPE_CAPACITY) + continue; + if (!ctrl->alloc_capable) { + if (ctrl->mon_capable) + pr_warn_once("CC @%pa: monitor-only controllers aren't supported\n", + &ctrl->addr); + continue; + } + + if (ctrl->cache.cache_level == 2) { + rid = RDT_RESOURCE_L2; + } else if (ctrl->cache.cache_level == 3) { + rid = RDT_RESOURCE_L3; + } else { + pr_err("unknown cache level %d\n", + ctrl->cache.cache_level); + return -ENODEV; + } + + cbqri_res = &cbqri_resctrl_resources[rid]; + if (cbqri_res->ctrl) { + /* + * CCs at the same cache level must agree on every cap + * resctrl exposes globally. Reject mismatches at pick + * time so the inconsistency is visible at boot. 
+ */ + if (cbqri_res->ctrl->rcid_count != ctrl->rcid_count || + cbqri_res->ctrl->cc.ncblks != ctrl->cc.ncblks || + cbqri_res->ctrl->cc.supports_alloc_at_code != + ctrl->cc.supports_alloc_at_code || + cbqri_res->ctrl->alloc_capable != ctrl->alloc_capable) { + pr_err("L%d controllers have mismatched capabilities\n", + ctrl->cache.cache_level); + return -EINVAL; + } + continue; + } + + cbqri_res->ctrl = ctrl; + } + + return 0; +} + +/* + * Fill the rdt_resource fields for one picked rid. An rid with no picked + * controller is left untouched by this function: resctrl_arch_get_resource() + * still returns its entry, but name, schema and capability flags are not + * filled in here. Always returns 0. + */ +static int cbqri_resctrl_control_init(struct cbqri_resctrl_res *cbqri_res) +{ + struct cbqri_controller *ctrl = cbqri_res->ctrl; + struct rdt_resource *res = &cbqri_res->resctrl_res; + + if (!ctrl) + return 0; + + switch (res->rid) { + case RDT_RESOURCE_L2: + case RDT_RESOURCE_L3: + res->name = (res->rid == RDT_RESOURCE_L2) ? "L2" : "L3"; + res->schema_fmt = RESCTRL_SCHEMA_BITMAP; + res->ctrl_scope = (res->rid == RDT_RESOURCE_L2) ? + RESCTRL_L2_CACHE : RESCTRL_L3_CACHE; + res->cache.cbm_len = ctrl->cc.ncblks; + res->cache.shareable_bits = 0; + res->cache.min_cbm_bits = 1; + res->cache.arch_has_sparse_bitmasks = false; + res->cdp_capable = ctrl->cc.supports_alloc_at_code; + res->alloc_capable = ctrl->alloc_capable; + INIT_LIST_HEAD(&res->ctrl_domains); + INIT_LIST_HEAD(&res->mon_domains); + + if (ctrl->mon_capable && res->rid == RDT_RESOURCE_L3) { + res->mon_scope = RESCTRL_L3_CACHE; + resctrl_enable_mon_event(QOS_L3_OCCUP_EVENT_ID, + false, 0, NULL); + res->mon_capable = true; + } + break; + + case RDT_RESOURCE_MB_MIN: + res->name = "MB_MIN"; + res->schema_fmt = RESCTRL_SCHEMA_RANGE; + /* + * resctrl requires a cache scope for MBA-style domains. + * Use L3 as a proxy until resctrl supports non-cache + * scopes for bandwidth resources. + */ + res->ctrl_scope = RESCTRL_L3_CACHE; + /* Rbwb is an integer block count, not a percentage. No MBA delay_linear.
*/ + res->membw.throttle_mode = THREAD_THROTTLE_UNDEFINED; + res->membw.min_bw = 1; + res->membw.max_bw = ctrl->bc.mrbwb; + res->membw.bw_gran = 1; + /* + * CBQRI section 4.5 caps sum(Rbwb) <= MRBWB. Default new + * groups to min_bw so mkdir cannot overflow that sum. + */ + res->membw.default_to_min = true; + res->alloc_capable = ctrl->alloc_capable; + INIT_LIST_HEAD(&res->ctrl_domains); + INIT_LIST_HEAD(&res->mon_domains); + break; + + case RDT_RESOURCE_MB_WGHT: + res->name = "MB_WGHT"; + res->schema_fmt = RESCTRL_SCHEMA_RANGE; + res->ctrl_scope = RESCTRL_L3_CACHE; + /* Mweight is a dimensionless ratio. No delay/linear concept. */ + res->membw.throttle_mode = THREAD_THROTTLE_UNDEFINED; + /* + * CBQRI section 4.5: Mweight is 0-255 (0 disables + * work-conserving). No sum constraint, so leave + * default_to_min false. Groups default to max_bw. + */ + res->membw.min_bw = 0; + res->membw.max_bw = 255; + res->membw.bw_gran = 1; + res->alloc_capable = ctrl->alloc_capable; + INIT_LIST_HEAD(&res->ctrl_domains); + INIT_LIST_HEAD(&res->mon_domains); + break; + + default: + break; + } + + return 0; +} + +/* + * Pick one BC to back both MB_MIN and MB_WGHT. 
+ */ +static int cbqri_resctrl_pick_bw_alloc(void) +{ + struct cbqri_resctrl_res *mb_min = &cbqri_resctrl_resources[RDT_RESOURCE_MB_MIN]; + struct cbqri_resctrl_res *mb_wght = &cbqri_resctrl_resources[RDT_RESOURCE_MB_WGHT]; + struct cbqri_controller *ctrl; + + list_for_each_entry(ctrl, &cbqri_controllers, list) { + if (ctrl->type != CBQRI_CONTROLLER_TYPE_BANDWIDTH) + continue; + if (!ctrl->alloc_capable) + continue; + + if (mb_min->ctrl) { + if (mb_min->ctrl->rcid_count != ctrl->rcid_count || + mb_min->ctrl->bc.mrbwb != ctrl->bc.mrbwb) { + pr_err("BW controllers have mismatched capabilities\n"); + return -EINVAL; + } + continue; + } + + mb_min->ctrl = ctrl; + mb_wght->ctrl = ctrl; + } + + return 0; +} + +/* + * Enable mbm_total_bytes when the system exposes exactly one mon-capable + * bandwidth controller and exactly one L3 cache. Pairing a single BC with + * multiple L3 domains would let userspace overcount system bandwidth by a + * factor equal to the L3 domain count. resctrl_is_mon_event_enabled() then + * gates the BC pairing and rmid-space accounting. L3 occupancy is enabled + * by cbqri_resctrl_control_init(). + */ +static void cbqri_resctrl_pick_counters(void) +{ + struct cbqri_resctrl_res *l3 = &cbqri_resctrl_resources[RDT_RESOURCE_L3]; + struct cbqri_controller *ctrl, *prev; + unsigned int l3_count = 0; + + /* Count distinct L3 cache_ids */ + list_for_each_entry(ctrl, &cbqri_controllers, list) { + bool seen = false; + + if (ctrl->type != CBQRI_CONTROLLER_TYPE_CAPACITY) + continue; + if (ctrl->cache.cache_level != 3) + continue; + + list_for_each_entry(prev, &cbqri_controllers, list) { + if (prev == ctrl) + break; + if (prev->type != CBQRI_CONTROLLER_TYPE_CAPACITY) + continue; + if (prev->cache.cache_level != 3) + continue; + if (prev->cache.cache_id == ctrl->cache.cache_id) { + seen = true; + break; + } + } + if (!seen) + l3_count++; + } + + if (l3_count > 1) { + pr_warn_once("multiple L3 domains (%u) detected. 
mbm_total_bytes disabled\n", + l3_count); + return; + } + + /* + * mbm_total_bytes is surfaced on the L3 monitoring domain, so it + * needs a mon-capable L3 cache controller as well as a single + * mon-capable bandwidth controller. + */ + if (l3->ctrl && l3->ctrl->mon_capable && cbqri_find_only_mon_bc()) + resctrl_enable_mon_event(QOS_L3_MBM_TOTAL_EVENT_ID, false, 0, NULL); +} + +static void cbqri_resctrl_accumulate_caps(void) +{ + struct cbqri_controller *l3_ctrl; + int rid; + + for (rid = 0; rid < RDT_NUM_RESOURCES; rid++) { + struct cbqri_resctrl_res *hw_res = &cbqri_resctrl_resources[rid]; + + if (!hw_res->ctrl) + continue; + if (hw_res->ctrl->alloc_capable) + exposed_alloc_capable = true; + if (hw_res->ctrl->mon_capable) + exposed_mon_capable = true; + } + + /* + * Narrow max_rmid against the picked occupancy source (the L3 CC) + * only. A mon-capable controller that is not exposed as a counter + * source must not clamp the rmid space. + */ + l3_ctrl = cbqri_resctrl_resources[RDT_RESOURCE_L3].ctrl; + if (l3_ctrl && l3_ctrl->mon_capable) + max_rmid = min(max_rmid, l3_ctrl->mcid_count); + + /* + * When mbm_total_bytes is enabled, the paired BC is a second counter + * source, so clamp against its mcid_count too. A BC left unpicked + * because mbm_total_bytes is disabled must not clamp it. + */ + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID)) { + struct cbqri_controller *bc = cbqri_find_only_mon_bc(); + + if (bc) + max_rmid = min(max_rmid, bc->mcid_count); + } + + if (!exposed_mon_capable) { + max_rmid = 1; + return; + } + + /* + * num_rmid is the user-visible bound for the L3 monitoring rmid + * space. Track max_rmid (the picked-source minimum) so userspace is + * not told more RMIDs than can be allocated. + */ + cbqri_resctrl_resources[RDT_RESOURCE_L3].resctrl_res.mon.num_rmid = max_rmid; +} + +/* + * Create, list-insert, and online a fresh ctrl_domain backing ctrl on + * resource res, seeded with cpu and identified by dom_id. 
Caller must
+ * hold cbqri_domain_list_lock and must have already verified that no
+ * existing ctrl_domain on res carries this id.
+ *
+ * Returns the new domain, or an ERR_PTR() when allocation, control-value
+ * initialisation or resctrl_online_ctrl_domain() fails. On failure the
+ * domain is neither left on res->ctrl_domains nor leaked.
+ */
+static struct rdt_ctrl_domain *cbqri_create_ctrl_domain(struct cbqri_controller *ctrl,
+							 struct rdt_resource *res,
+							 unsigned int cpu, int dom_id)
+{
+	struct rdt_ctrl_domain *domain;
+	struct list_head *pos = NULL;
+	int err;
+
+	lockdep_assert_held(&cbqri_domain_list_lock);
+
+	domain = cbqri_new_domain(ctrl);
+	if (!domain)
+		return ERR_PTR(-ENOMEM);
+
+	cpumask_set_cpu(cpu, &domain->hdr.cpu_mask);
+	domain->hdr.id = dom_id;
+	domain->hdr.type = RESCTRL_CTRL_DOMAIN;
+
+	err = cbqri_init_domain_ctrlval(res, domain);
+	if (err)
+		goto err_free;
+
+	/* Insert sorted by id so user-visible ordering is deterministic. */
+	resctrl_find_domain(&res->ctrl_domains, dom_id, &pos);
+	list_add_tail(&domain->hdr.list, pos);
+
+	/*
+	 * A domain that failed to come online must not stay visible on the
+	 * list, otherwise later lookups would treat it as usable.
+	 */
+	err = resctrl_online_ctrl_domain(res, domain);
+	if (err) {
+		list_del(&domain->hdr.list);
+		goto err_free;
+	}
+
+	return domain;
+
+err_free:
+	kfree(container_of(domain, struct cbqri_resctrl_dom,
+			   resctrl_ctrl_dom));
+	return ERR_PTR(err);
+}
+
+/*
+ * Add cpu to the L3 monitoring domain matching ctrl's cache_id, creating
+ * and onlining that domain first if it does not exist yet. The ctrl_domain
+ * with the same id must already be on res->ctrl_domains. Caller holds
+ * cbqri_domain_list_lock.
+ *
+ * Returns 0 on success or a negative errno. On failure no new mon_domain
+ * is left on res->mon_domains.
+ */
+static int cbqri_attach_cpu_to_l3_mon(struct cbqri_controller *ctrl,
+				      struct rdt_resource *res, unsigned int cpu)
+{
+	struct rdt_l3_mon_domain *mon_dom;
+	struct rdt_ctrl_domain *ctrl_dom;
+	struct cbqri_resctrl_dom *hw_dom;
+	struct list_head *mon_pos = NULL;
+	int dom_id = ctrl->cache.cache_id;
+	int err;
+
+	lockdep_assert_held(&cbqri_domain_list_lock);
+
+	/* Existing domain: only the cpumask needs updating. */
+	mon_dom = cbqri_find_l3_mon_domain(&res->mon_domains, dom_id);
+	if (mon_dom) {
+		cpumask_set_cpu(cpu, &mon_dom->hdr.cpu_mask);
+		return 0;
+	}
+
+	ctrl_dom = cbqri_find_ctrl_domain(&res->ctrl_domains, dom_id);
+	if (!ctrl_dom) {
+		pr_err("L3 mon attach for cpu %u: no ctrl_domain id %d\n",
+		       cpu, dom_id);
+		return -EINVAL;
+	}
+
+	mon_dom = kzalloc_obj(*mon_dom, GFP_KERNEL);
+	if (!mon_dom)
+		return -ENOMEM;
+
+	mon_dom->hdr.id = dom_id;
+	mon_dom->hdr.type = RESCTRL_MON_DOMAIN;
+	mon_dom->hdr.rid = RDT_RESOURCE_L3;
+	cpumask_set_cpu(cpu, &mon_dom->hdr.cpu_mask);
+	INIT_LIST_HEAD(&mon_dom->hdr.list);
+
+	if (resctrl_find_domain(&res->mon_domains, dom_id, &mon_pos)) {
+		pr_err("duplicate L3 mon_domain id %d\n", dom_id);
+		err = -EEXIST;
+		goto err_free;
+	}
+	if (mon_pos)
+		list_add_tail(&mon_dom->hdr.list, mon_pos);
+	else
+		list_add_tail(&mon_dom->hdr.list, &res->mon_domains);
+
+	/*
+	 * Pair this L3 domain with the system's mon-capable BC and
+	 * initialise the BC's per-MCID software accumulators before
+	 * resctrl_online_mon_domain() exposes the domain to userspace.
+	 * A concurrent sysfs read of mbm_total_bytes between online and
+	 * BC init would otherwise pass the !bc->mbm_total_states check
+	 * with a half-initialised pointer.
+	 */
+	hw_dom = container_of(ctrl_dom, struct cbqri_resctrl_dom, resctrl_ctrl_dom);
+
+	if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID))
+		hw_dom->paired_bc = cbqri_find_only_mon_bc();
+	if (hw_dom->paired_bc) {
+		err = cbqri_init_bc_mon_counters(hw_dom->paired_bc);
+		if (err) {
+			pr_err("BC @%pa: mon init failed (%d)\n", &hw_dom->paired_bc->addr, err);
+			hw_dom->paired_bc = NULL;
+			goto err_listdel;
+		}
+	}
+
+	err = resctrl_online_mon_domain(res, &mon_dom->hdr);
+	if (err)
+		goto err_listdel;
+
+	/*
+	 * NOTE(review): the capacity controller's counters are initialised
+	 * after the domain is online, unlike the BC counters above. Confirm
+	 * an occupancy read cannot race with this initialisation.
+	 */
+	err = cbqri_init_mon_counters(ctrl);
+	if (err)
+		goto err_offline;
+
+	return 0;
+
+err_offline:
+	/*
+	 * cancel_delayed_work avoids deadlocking against the cqm_limbo
+	 * worker which takes cpus_read_lock while this hotplug callback
+	 * already holds cpus_write_lock. mbm_over is only
+	 * INIT_DELAYED_WORK'd when MBM_TOTAL was enabled, so gate the
+	 * cancel on the same condition to avoid touching a zeroed work
+	 * struct.
+	 */
+	cancel_delayed_work(&mon_dom->cqm_limbo);
+	if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID))
+		cancel_delayed_work(&mon_dom->mbm_over);
+	resctrl_offline_mon_domain(res, &mon_dom->hdr);
+err_listdel:
+	list_del(&mon_dom->hdr.list);
+err_free:
+	kfree(mon_dom);
+	return err;
+}
+
+/*
+ * Attach cpu to the L2/L3 control domain backed by capacity controller
+ * ctrl, creating the domain on first use, and to the L3 monitoring domain
+ * when ctrl is a mon-capable L3 controller. Other cache levels, and
+ * resources with no picked controller, are ignored and report success.
+ * Caller holds cbqri_domain_list_lock.
+ *
+ * Returns 0 on success or a negative errno, in which case this CPU's
+ * attachment to the control domain has been undone.
+ */
+static int cbqri_attach_cpu_to_cap_ctrl(struct cbqri_controller *ctrl,
+					unsigned int cpu)
+{
+	struct cbqri_resctrl_res *hw_res;
+	struct rdt_ctrl_domain *domain;
+	struct rdt_resource *res;
+	bool new_domain = false;
+	int dom_id;
+	int err;
+
+	lockdep_assert_held(&cbqri_domain_list_lock);
+
+	if (ctrl->cache.cache_level == 2)
+		hw_res = &cbqri_resctrl_resources[RDT_RESOURCE_L2];
+	else if (ctrl->cache.cache_level == 3)
+		hw_res = &cbqri_resctrl_resources[RDT_RESOURCE_L3];
+	else
+		return 0;
+
+	if (!hw_res->ctrl)
+		return 0;
+
+	res = &hw_res->resctrl_res;
+	dom_id = ctrl->cache.cache_id;
+
+	domain = cbqri_find_ctrl_domain(&res->ctrl_domains, dom_id);
+	if (domain) {
+		cpumask_set_cpu(cpu, &domain->hdr.cpu_mask);
+	} else {
+		domain = cbqri_create_ctrl_domain(ctrl, res, cpu, dom_id);
+		if (IS_ERR(domain))
+			return PTR_ERR(domain);
+		new_domain = true;
+	}
+
+	if (ctrl->mon_capable && ctrl->cache.cache_level == 3) {
+		err = cbqri_attach_cpu_to_l3_mon(ctrl, res, cpu);
+		if (err)
+			goto err_undo_ctrl_dom;
+	}
+
+	return 0;
+
+err_undo_ctrl_dom:
+	/*
+	 * The cpuhp core only rolls back states that successfully ran their
+	 * startup. The L3 mon attach failure happens inside this state's
+	 * startup, so its own offline callback is not invoked. Undo the
+	 * cpumask_set and, if this attach created the ctrl_domain, tear it
+	 * down so a retry sees a clean slate.
+	 */
+	cpumask_clear_cpu(cpu, &domain->hdr.cpu_mask);
+	if (new_domain) {
+		resctrl_offline_ctrl_domain(res, domain);
+		list_del(&domain->hdr.list);
+		kfree(container_of(domain, struct cbqri_resctrl_dom,
+				   resctrl_ctrl_dom));
+	}
+	return err;
+}
+
+/*
+ * Attach cpu to the control domain of bandwidth resource rid identified by
+ * ctrl's proximity domain, creating the domain on first use. A resource
+ * with no picked controller is skipped. Caller holds
+ * cbqri_domain_list_lock.
+ *
+ * Returns 0 on success or a negative errno from domain creation.
+ */
+static int cbqri_attach_cpu_to_one_bw_res(struct cbqri_controller *ctrl,
+					  enum resctrl_res_level rid,
+					  unsigned int cpu)
+{
+	struct cbqri_resctrl_res *hw_res = &cbqri_resctrl_resources[rid];
+	struct rdt_resource *res = &hw_res->resctrl_res;
+	struct rdt_ctrl_domain *domain;
+	int dom_id = ctrl->mem.prox_dom;
+
+	lockdep_assert_held(&cbqri_domain_list_lock);
+
+	if (!hw_res->ctrl)
+		return 0;
+
+	domain = cbqri_find_ctrl_domain(&res->ctrl_domains, dom_id);
+	if (domain) {
+		cpumask_set_cpu(cpu, &domain->hdr.cpu_mask);
+		return 0;
+	}
+
+	domain = cbqri_create_ctrl_domain(ctrl, res, cpu, dom_id);
+
+	return PTR_ERR_OR_ZERO(domain);
+}
+
+/*
+ * Undo cbqri_attach_cpu_to_one_bw_res(): drop cpu from the matching
+ * control domain of resource rid and free the domain once its cpumask is
+ * empty. Does nothing when the CPU is not in that domain. Caller holds
+ * cbqri_domain_list_lock.
+ */
+static void cbqri_detach_cpu_from_one_bw_res(struct cbqri_controller *ctrl,
+					     enum resctrl_res_level rid,
+					     unsigned int cpu)
+{
+	struct cbqri_resctrl_res *hw_res = &cbqri_resctrl_resources[rid];
+	struct rdt_resource *res = &hw_res->resctrl_res;
+	struct rdt_ctrl_domain *domain;
+	int dom_id = ctrl->mem.prox_dom;
+
+	lockdep_assert_held(&cbqri_domain_list_lock);
+
+	if (!hw_res->ctrl)
+		return;
+
+	domain = cbqri_find_ctrl_domain(&res->ctrl_domains, dom_id);
+	if (!domain || !cpumask_test_cpu(cpu, &domain->hdr.cpu_mask))
+		return;
+
+	cpumask_clear_cpu(cpu, &domain->hdr.cpu_mask);
+	if (cpumask_empty(&domain->hdr.cpu_mask)) {
+		resctrl_offline_ctrl_domain(res, domain);
+		list_del(&domain->hdr.list);
+		kfree(container_of(domain, struct cbqri_resctrl_dom,
+				   resctrl_ctrl_dom));
+	}
+}
+
+/*
+ * Attach cpu to both bandwidth resources (MB_MIN and MB_WGHT) backed by
+ * ctrl. If the MB_WGHT attach fails, the MB_MIN attach is undone so the
+ * CPU ends up in neither. Returns 0 on success or a negative errno.
+ */
+static int cbqri_attach_cpu_to_bw_ctrl(struct cbqri_controller *ctrl,
+				       unsigned int cpu)
+{
+	int err;
+
+	err = cbqri_attach_cpu_to_one_bw_res(ctrl, RDT_RESOURCE_MB_MIN, cpu);
+	if (err)
+		return err;
+
+	err = cbqri_attach_cpu_to_one_bw_res(ctrl, RDT_RESOURCE_MB_WGHT, cpu);
+	if (err)
+		cbqri_detach_cpu_from_one_bw_res(ctrl, RDT_RESOURCE_MB_MIN, cpu);
+	return err;
+}
+
+/*
+ * Drop cpu from every L3 monitoring domain of res that contains it and
+ * offline and free each domain whose cpumask becomes empty. Caller holds
+ * cbqri_domain_list_lock.
+ */
+static void cbqri_detach_cpu_from_l3_mon(struct rdt_resource *res,
+					 unsigned int cpu)
+{
+	struct rdt_l3_mon_domain *mon_dom, *tmp;
+
+	lockdep_assert_held(&cbqri_domain_list_lock);
+
+	list_for_each_entry_safe(mon_dom, tmp, &res->mon_domains, hdr.list) {
+		if (!cpumask_test_cpu(cpu, &mon_dom->hdr.cpu_mask))
+			continue;
+		cpumask_clear_cpu(cpu, &mon_dom->hdr.cpu_mask);
+		if (cpumask_empty(&mon_dom->hdr.cpu_mask)) {
+			/*
+			 * This runs as a cpuhp offline callback under
+			 * cpus_write_lock. The cqm_limbo and mbm_over workers
+			 * take cpus_read_lock before touching a domain, so
+			 * neither can run or re-queue here. A non-sync cancel
+			 * thus reliably dequeues any pending work before kfree,
+			 * and cancel_delayed_work_sync() would instead deadlock
+			 * against that cpus_read_lock.
+			 */
+			cancel_delayed_work(&mon_dom->cqm_limbo);
+			if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID))
+				cancel_delayed_work(&mon_dom->mbm_over);
+			resctrl_offline_mon_domain(res, &mon_dom->hdr);
+			list_del(&mon_dom->hdr.list);
+			kfree(mon_dom);
+		}
+	}
+}
+
+/*
+ * Drop cpu from every control domain of res that contains it and offline
+ * and free each domain whose cpumask becomes empty. Caller holds
+ * cbqri_domain_list_lock.
+ */
+static void cbqri_detach_cpu_from_ctrl_domains(struct rdt_resource *res,
+					       unsigned int cpu)
+{
+	struct rdt_ctrl_domain *domain, *tmp;
+
+	lockdep_assert_held(&cbqri_domain_list_lock);
+
+	list_for_each_entry_safe(domain, tmp, &res->ctrl_domains, hdr.list) {
+		if (!cpumask_test_cpu(cpu, &domain->hdr.cpu_mask))
+			continue;
+		cpumask_clear_cpu(cpu, &domain->hdr.cpu_mask);
+		if (cpumask_empty(&domain->hdr.cpu_mask)) {
+			resctrl_offline_ctrl_domain(res, domain);
+			list_del(&domain->hdr.list);
+			kfree(container_of(domain, struct cbqri_resctrl_dom,
+					   resctrl_ctrl_dom));
+		}
+	}
+}
+
+/*
+ * Remove a CPU from every domain it was attached to. The per-resource
+ * detach helpers act only when the CPU is set in a domain's mask, so this
+ * is idempotent and undoes a partial online attach as well as a full
+ * offline. Caller holds cbqri_domain_list_lock.
+ */
+static void cbqri_detach_cpu_from_all_ctrls(unsigned int cpu)
+{
+	int rid;
+
+	lockdep_assert_held(&cbqri_domain_list_lock);
+
+	for (rid = 0; rid < RDT_NUM_RESOURCES; rid++) {
+		struct cbqri_resctrl_res *hw_res = &cbqri_resctrl_resources[rid];
+
+		if (!hw_res->ctrl)
+			continue;
+		cbqri_detach_cpu_from_ctrl_domains(&hw_res->resctrl_res, cpu);
+		if (rid == RDT_RESOURCE_L3 && hw_res->ctrl->mon_capable)
+			cbqri_detach_cpu_from_l3_mon(&hw_res->resctrl_res, cpu);
+	}
+}
+
+/*
+ * Attach a CPU to every controller that claims it. On failure, detach the
+ * CPU from everything attached so far: the cpuhp core does not run this
+ * state's offline teardown when its startup fails, so a partial attach
+ * would otherwise leak into the domain cpu_masks. Caller holds
+ * cbqri_domain_list_lock.
+ *
+ * NOTE(review): capacity controllers that are mon-capable but not
+ * alloc-capable are skipped below, so no L3 monitoring domain is created
+ * for them. Confirm this is intended.
+ */
+static int cbqri_attach_cpu_to_all_ctrls(unsigned int cpu)
+{
+	struct cbqri_controller *ctrl;
+	int err = 0;
+
+	lockdep_assert_held(&cbqri_domain_list_lock);
+
+	list_for_each_entry(ctrl, &cbqri_controllers, list) {
+		switch (ctrl->type) {
+		case CBQRI_CONTROLLER_TYPE_CAPACITY:
+			if (!cpumask_test_cpu(cpu, &ctrl->cache.cpu_mask))
+				continue;
+			if (!ctrl->alloc_capable)
+				continue;
+			err = cbqri_attach_cpu_to_cap_ctrl(ctrl, cpu);
+			break;
+		case CBQRI_CONTROLLER_TYPE_BANDWIDTH:
+			if (!cpumask_test_cpu(cpu, &ctrl->mem.cpu_mask))
+				continue;
+			if (!ctrl->alloc_capable)
+				continue;
+			err = cbqri_attach_cpu_to_bw_ctrl(ctrl, cpu);
+			break;
+		default:
+			continue;
+		}
+		if (err) {
+			cbqri_detach_cpu_from_all_ctrls(cpu);
+			break;
+		}
+	}
+
+	return err;
+}
+
+/* True once resctrl_init() has succeeded. Gates cbqri_resctrl_teardown(). */
+static bool cbqri_resctrl_inited;
+
+/*
+ * Forget every controller pick and capability decision made by
+ * cbqri_resctrl_setup(): clear the per-resource controller and CDP state,
+ * drop the exposed_*_capable flags and restore max_rmid to U32_MAX. Shared
+ * by teardown and by every setup failure path so that none of them leaves
+ * stale picks behind. Mon events already enabled by
+ * cbqri_resctrl_pick_counters() are not touched here.
+ */
+static void cbqri_resctrl_reset_state(void)
+{
+	int rid;
+
+	for (rid = 0; rid < RDT_NUM_RESOURCES; rid++) {
+		struct cbqri_resctrl_res *hw_res = &cbqri_resctrl_resources[rid];
+
+		hw_res->ctrl = NULL;
+		hw_res->cdp_enabled = false;
+	}
+	exposed_alloc_capable = false;
+	exposed_mon_capable = false;
+	max_rmid = U32_MAX;
+}
+
+/*
+ * Undo a successful cbqri_resctrl_setup(): shut down the resctrl core and
+ * reset the driver's picks. A no-op when setup never completed.
+ */
+static void cbqri_resctrl_teardown(void)
+{
+	if (!cbqri_resctrl_inited)
+		return;
+
+	resctrl_exit();
+
+	cbqri_resctrl_reset_state();
+	cbqri_resctrl_inited = false;
+}
+
+/*
+ * Pick the controllers that back each resctrl resource, initialise the
+ * per-resource control state, work out what is exposed and register with
+ * the resctrl core.
+ *
+ * Returns 0 on success, -ENODEV when no picked controller is alloc- or
+ * mon-capable, or the error from a failing step. On any failure the picks
+ * and capability flags are rolled back, so resctrl_arch_alloc_capable() /
+ * _mon_capable() do not lie to callers after this returns.
+ */
+static int cbqri_resctrl_setup(void)
+{
+	int rid;
+	int err;
+
+	for (rid = 0; rid < RDT_NUM_RESOURCES; rid++)
+		cbqri_resctrl_resources[rid].resctrl_res.rid = rid;
+
+	err = cbqri_resctrl_pick_caches();
+	if (err)
+		goto err_reset;
+
+	err = cbqri_resctrl_pick_bw_alloc();
+	if (err)
+		goto err_reset;
+
+	cbqri_resctrl_pick_counters();
+
+	for (rid = 0; rid < RDT_NUM_RESOURCES; rid++) {
+		err = cbqri_resctrl_control_init(&cbqri_resctrl_resources[rid]);
+		if (err)
+			goto err_reset;
+	}
+
+	cbqri_resctrl_accumulate_caps();
+
+	if (!exposed_alloc_capable && !exposed_mon_capable) {
+		pr_debug("no resctrl-capable CBQRI controllers found\n");
+		err = -ENODEV;
+		goto err_reset;
+	}
+
+	err = resctrl_init();
+	if (err)
+		goto err_reset;
+
+	cbqri_resctrl_inited = true;
+	return 0;
+
+err_reset:
+	/*
+	 * cbqri_resctrl_inited is still false here, so
+	 * cbqri_resctrl_teardown() would no-op. Roll back directly.
+	 */
+	cbqri_resctrl_reset_state();
+	return err;
+}
+
+/*
+ * cpuhp online callback: attach cpu to its domains under
+ * cbqri_domain_list_lock, then hand the CPU to the resctrl core.
+ */
+static int cbqri_resctrl_online_cpu(unsigned int cpu)
+{
+	int err;
+
+	mutex_lock(&cbqri_domain_list_lock);
+	err = cbqri_attach_cpu_to_all_ctrls(cpu);
+	mutex_unlock(&cbqri_domain_list_lock);
+	if (err)
+		return err;
+
+	/*
+	 * Seed the per-CPU default RCID/MCID to the reserved (0, 0) pair and
+	 * notify the resctrl core so it tracks this CPU in the default group.
+	 * Mirrors x86 resctrl_arch_online_cpu().
+	 */
+	resctrl_arch_set_cpu_default_closid_rmid(cpu, 0, 0);
+	resctrl_online_cpu(cpu);
+	return 0;
+}
+
+/*
+ * cpuhp offline callback: tell the resctrl core first, then detach cpu
+ * from every domain under cbqri_domain_list_lock.
+ */
+static int cbqri_resctrl_offline_cpu(unsigned int cpu)
+{
+	resctrl_offline_cpu(cpu);
+
+	mutex_lock(&cbqri_domain_list_lock);
+	cbqri_detach_cpu_from_all_ctrls(cpu);
+	mutex_unlock(&cbqri_domain_list_lock);
+	return 0;
+}
+
+/* Saved cpuhp slot from cpuhp_setup_state() for symmetric removal. */
+static enum cpuhp_state cbqri_cpuhp_state;
+
+/*
+ * Late initcall: bring up resctrl support when the Ssqosid extension is
+ * available, then register the CPU hotplug callbacks. Setup is torn down
+ * again if the hotplug state cannot be registered.
+ */
+static int __init cbqri_arch_late_init(void)
+{
+	int err;
+
+	if (!riscv_isa_extension_available(NULL, SSQOSID))
+		return -ENODEV;
+
+	err = cbqri_resctrl_setup();
+	if (err)
+		return err;
+
+	/* A non-negative return is the cpuhp state that was set up. */
+	err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "cbqri:online",
+				cbqri_resctrl_online_cpu,
+				cbqri_resctrl_offline_cpu);
+	if (err < 0) {
+		cbqri_resctrl_teardown();
+		return err;
+	}
+	cbqri_cpuhp_state = err;
+
+	return 0;
+}
+late_initcall(cbqri_arch_late_init);
diff --git a/fs/resctrl/ctrlmondata.c b/fs/resctrl/ctrlmondata.c
index 9a7dfc48cb2e29..d9f05270094155 100644
--- a/fs/resctrl/ctrlmondata.c
+++ b/fs/resctrl/ctrlmondata.c
@@ -245,8 +245,7 @@ static int parse_line(char *line, struct resctrl_schema *s,
 	if (WARN_ON_ONCE(!parse_ctrlval))
 		return -EINVAL;
 
-	if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP &&
-	    (r->rid == RDT_RESOURCE_MBA || r->rid == RDT_RESOURCE_SMBA)) {
+	if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP && resctrl_is_membw(r)) {
 		rdt_last_cmd_puts("Cannot pseudo-lock MBA resource\n");
 		return -EINVAL;
 	}
diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h
index 1a9b29119f88f8..76187987b2ee46 100644
--- a/fs/resctrl/internal.h
+++ b/fs/resctrl/internal.h
@@ -397,6 +397,8 @@ void mbm_handle_overflow(struct work_struct *work);
 
 bool is_mba_sc(struct rdt_resource *r);
 
+bool resctrl_is_membw(struct rdt_resource *r);
+
 void cqm_setup_limbo_handler(struct rdt_l3_mon_domain *dom, unsigned long delay_ms,
 			     int exclude_cpu);
 
diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c
index 5dfdaa6f9d8ff6..02733b11e115ea 100644
--- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -1412,7 +1412,7 @@ static bool rdtgroup_mode_test_exclusive(struct rdtgroup *rdtgrp) list_for_each_entry(s, &resctrl_schema_all, list) { r = s->res; - if (r->rid == RDT_RESOURCE_MBA || r->rid == RDT_RESOURCE_SMBA) + if (resctrl_is_membw(r)) continue; has_cache = true; list_for_each_entry(d, &r->ctrl_domains, hdr.list) { @@ -1555,6 +1555,12 @@ bool is_mba_sc(struct rdt_resource *r) return r->membw.mba_sc; } +/* RANGE schema is bandwidth (MBA/SMBA/MB_MIN/MB_WGHT). BITMAP is cache. */ +bool resctrl_is_membw(struct rdt_resource *r) +{ + return r->schema_fmt == RESCTRL_SCHEMA_RANGE; +} + /* * rdtgroup_size_show - Display size in bytes of allocated regions * @@ -1616,8 +1622,7 @@ static int rdtgroup_size_show(struct kernfs_open_file *of, ctrl = resctrl_arch_get_config(r, d, closid, type); - if (r->rid == RDT_RESOURCE_MBA || - r->rid == RDT_RESOURCE_SMBA) + if (resctrl_is_membw(r)) size = ctrl; else size = rdtgroup_cbm_to_size(r, d, ctrl); @@ -2397,6 +2402,8 @@ static unsigned long fflags_from_resource(struct rdt_resource *r) return RFTYPE_RES_CACHE; case RDT_RESOURCE_MBA: case RDT_RESOURCE_SMBA: + case RDT_RESOURCE_MB_MIN: + case RDT_RESOURCE_MB_WGHT: return RFTYPE_RES_MB; case RDT_RESOURCE_PERF_PKG: return RFTYPE_RES_PERF_PKG; @@ -3648,8 +3655,7 @@ static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp) list_for_each_entry(s, &resctrl_schema_all, list) { r = s->res; - if (r->rid == RDT_RESOURCE_MBA || - r->rid == RDT_RESOURCE_SMBA) { + if (resctrl_is_membw(r)) { rdtgroup_init_mba(r, rdtgrp->closid); if (is_mba_sc(r)) continue; diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 006e57fd7ca589..bcbc166412ef13 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -53,6 +53,8 @@ enum resctrl_res_level { RDT_RESOURCE_L2, RDT_RESOURCE_MBA, RDT_RESOURCE_SMBA, + RDT_RESOURCE_MB_MIN, + RDT_RESOURCE_MB_WGHT, RDT_RESOURCE_PERF_PKG, /* Must be the last */ @@ -245,7 +247,13 @@ enum 
membw_throttle_mode { /** * struct resctrl_membw - Memory bandwidth allocation related data * @min_bw: Minimum memory bandwidth percentage user can request - * @max_bw: Maximum memory bandwidth value, used as the reset value + * @max_bw: Maximum memory bandwidth value a group can be + * configured with + * @default_to_min: When true, the default control value for new + * groups and reset is @min_bw instead of @max_bw. + * Drivers whose hardware enforces a sum constraint + * across groups (e.g. CBQRI MB_MIN) set this so + * mkdir does not overflow the sum. * @bw_gran: Granularity at which the memory bandwidth is allocated * @delay_linear: True if memory B/W delay is in linear scale * @arch_needs_linear: True if we can't configure non-linear resources @@ -257,6 +265,7 @@ enum membw_throttle_mode { struct resctrl_membw { u32 min_bw; u32 max_bw; + bool default_to_min; u32 bw_gran; u32 delay_linear; bool arch_needs_linear; @@ -403,7 +412,7 @@ static inline u32 resctrl_get_default_ctrl(struct rdt_resource *r) case RESCTRL_SCHEMA_BITMAP: return BIT_MASK(r->cache.cbm_len) - 1; case RESCTRL_SCHEMA_RANGE: - return r->membw.max_bw; + return r->membw.default_to_min ? r->membw.min_bw : r->membw.max_bw; } return WARN_ON_ONCE(1); diff --git a/include/linux/riscv_cbqri.h b/include/linux/riscv_cbqri.h new file mode 100644 index 00000000000000..5863f0a65f6cc5 --- /dev/null +++ b/include/linux/riscv_cbqri.h @@ -0,0 +1,60 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Public registration API for the RISC-V Capacity and Bandwidth QoS + * Register Interface (CBQRI) driver. Discovery layers (ACPI RQSC, future + * device tree) call riscv_cbqri_register_controller() to hand a controller + * descriptor to the driver, which owns all subsequent state. 
+ */
+#ifndef _LINUX_RISCV_CBQRI_H
+#define _LINUX_RISCV_CBQRI_H
+
+#include <linux/types.h>
+
+enum cbqri_controller_type {
+	CBQRI_CONTROLLER_TYPE_CAPACITY,
+	CBQRI_CONTROLLER_TYPE_BANDWIDTH,
+};
+
+/* Sanity caps on per-controller RCID/MCID counts from firmware */
+#define CBQRI_MAX_RCID 1024
+#define CBQRI_MAX_MCID 1024
+
+/**
+ * struct cbqri_controller_info - registration descriptor
+ * @addr: MMIO base address of the controller's register interface
+ * @size: size of the MMIO region
+ * @type: capacity or bandwidth controller
+ * @rcid_count: number of supported RCIDs (per RQSC table)
+ * @mcid_count: number of supported MCIDs (per RQSC table)
+ * @cache_id: PPTT cache id. Only meaningful for CAPACITY controllers
+ * @prox_dom: SRAT proximity domain. Only meaningful for BANDWIDTH
+ *	      controllers
+ *
+ * Discovery layers populate one of @cache_id / @prox_dom according to
+ * @type. The CBQRI driver resolves the matching cpumask internally so
+ * callers do not need to know about cacheinfo/NUMA topology.
+ */
+struct cbqri_controller_info {
+	phys_addr_t addr;
+	phys_addr_t size;
+	enum cbqri_controller_type type;
+	u32 rcid_count;
+	u32 mcid_count;
+	u32 cache_id;
+	u32 prox_dom;
+};
+
+#if IS_ENABLED(CONFIG_RISCV_CBQRI_DRIVER)
+int riscv_cbqri_register_controller(const struct cbqri_controller_info *info);
+void riscv_cbqri_unregister_last(unsigned int n);
+#else
+static inline int
+riscv_cbqri_register_controller(const struct cbqri_controller_info *info)
+{
+	return -ENODEV;
+}
+
+static inline void riscv_cbqri_unregister_last(unsigned int n) { }
+#endif
+
+#endif /* _LINUX_RISCV_CBQRI_H */