diff --git a/Documentation/devicetree/bindings/riscv/extensions.yaml b/Documentation/devicetree/bindings/riscv/extensions.yaml
index 2b0a8a93bb2144..1c6f091518d49a 100644
--- a/Documentation/devicetree/bindings/riscv/extensions.yaml
+++ b/Documentation/devicetree/bindings/riscv/extensions.yaml
@@ -232,6 +232,12 @@ properties:
             ratified at commit d70011dde6c2 ("Update to ratified state")
             of riscv-j-extension.
 
+        - const: ssqosid
+          description: |
+            The standard Ssqosid extension for Quality of Service ID is
+            ratified as v1.0 in commit d9c616497fde ("Merge pull
+            request #7 from ved-rivos/Ratified") of riscv-ssqosid.
+
         - const: ssstateen
           description: |
             The standard Ssstateen extension for supervisor-mode view of the
diff --git a/MAINTAINERS b/MAINTAINERS
index c2c6d79275c6eb..eab31c7b5e9174 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -23017,6 +23017,21 @@ F:	drivers/perf/riscv_pmu.c
 F:	drivers/perf/riscv_pmu_legacy.c
 F:	drivers/perf/riscv_pmu_sbi.c
 
+RISC-V QOS RESCTRL SUPPORT
+M:	Drew Fustini <fustini@kernel.org>
+R:	yunhui cui <cuiyunhui@bytedance.com>
+L:	linux-riscv@lists.infradead.org
+S:	Supported
+F:	arch/riscv/include/asm/qos.h
+F:	arch/riscv/include/asm/resctrl.h
+F:	arch/riscv/kernel/qos.c
+F:	drivers/acpi/riscv/rqsc.c
+F:	drivers/acpi/riscv/rqsc.h
+F:	drivers/resctrl/cbqri_devices.c
+F:	drivers/resctrl/cbqri_internal.h
+F:	drivers/resctrl/cbqri_resctrl.c
+F:	include/linux/riscv_cbqri.h
+
 RISC-V RPMI AND MPXY DRIVERS
 M:	Rahul Pathak <rahul@summations.net>
 M:	Anup Patel <anup@brainfault.org>
diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index e2a0522571760b..5d2fdf74ee716e 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -591,6 +591,26 @@ config RISCV_ISA_SVNAPOT
 
 	  If you don't know what to do here, say Y.
 
+config RISCV_ISA_SSQOSID
+	bool "Ssqosid extension support for supervisor mode Quality of Service ID"
+	depends on 64BIT
+	default n
+	select ARCH_HAS_CPU_RESCTRL
+	select RISCV_CBQRI_DRIVER
+	help
+	  Adds support for the Ssqosid ISA extension (Supervisor-mode
+	  Quality of Service ID).
+
+	  Ssqosid defines the srmcfg CSR which allows the system to tag the
+	  running process with an RCID (Resource Control ID) and MCID
+	  (Monitoring Counter ID). The RCID is used to determine resource
+	  allocation. The MCID is used to track resource usage in event
+	  counters.
+
+	  For example, a cache controller may use the RCID to apply a
+	  cache partitioning scheme and use the MCID to track how much
+	  cache a process, or a group of processes, is using.
+
 config RISCV_ISA_SVPBMT
 	bool "Svpbmt extension support for supervisor mode page-based memory types"
 	depends on 64BIT && MMU
diff --git a/arch/riscv/include/asm/acpi.h b/arch/riscv/include/asm/acpi.h
index 26ab37c171bcf6..3cfd0102085ee9 100644
--- a/arch/riscv/include/asm/acpi.h
+++ b/arch/riscv/include/asm/acpi.h
@@ -67,6 +67,16 @@ int acpi_get_riscv_isa(struct acpi_table_header *table,
 
 void acpi_get_cbo_block_size(struct acpi_table_header *table, u32 *cbom_size,
 			     u32 *cboz_size, u32 *cbop_size);
+
+#ifdef CONFIG_RISCV_CBQRI_DRIVER
+int __init acpi_parse_rqsc(struct acpi_table_header *table);
+#else
+static inline int acpi_parse_rqsc(struct acpi_table_header *table)
+{
+	return -EINVAL;
+}
+#endif /* CONFIG_RISCV_CBQRI_DRIVER */
+
 #else
 static inline void acpi_init_rintc_map(void) { }
 static inline struct acpi_madt_rintc *acpi_cpu_get_madt_rintc(int cpu)
diff --git a/arch/riscv/include/asm/csr.h b/arch/riscv/include/asm/csr.h
index 31b8988f4488da..7bce928e5daa09 100644
--- a/arch/riscv/include/asm/csr.h
+++ b/arch/riscv/include/asm/csr.h
@@ -84,6 +84,10 @@
 #define SATP_ASID_MASK	_AC(0xFFFF, UL)
 #endif
 
+/* SRMCFG fields */
+#define SRMCFG_RCID_MASK	GENMASK(11, 0)
+#define SRMCFG_MCID_MASK	GENMASK(27, 16)
+
 /* Exception cause high bit - is an interrupt if set */
 #define CAUSE_IRQ_FLAG		(_AC(1, UL) << (__riscv_xlen - 1))
 
@@ -328,6 +332,7 @@
 #define CSR_STVAL		0x143
 #define CSR_SIP			0x144
 #define CSR_SATP		0x180
+#define CSR_SRMCFG		0x181
 
 #define CSR_STIMECMP		0x14D
 #define CSR_STIMECMPH		0x15D
diff --git a/arch/riscv/include/asm/hwcap.h b/arch/riscv/include/asm/hwcap.h
index 7ef8e5f55c8dcf..b83dae5cebb992 100644
--- a/arch/riscv/include/asm/hwcap.h
+++ b/arch/riscv/include/asm/hwcap.h
@@ -112,6 +112,7 @@
 #define RISCV_ISA_EXT_ZCLSD		103
 #define RISCV_ISA_EXT_ZICFILP		104
 #define RISCV_ISA_EXT_ZICFISS		105
+#define RISCV_ISA_EXT_SSQOSID		106
 
 #define RISCV_ISA_EXT_XLINUXENVCFG	127
 
diff --git a/arch/riscv/include/asm/processor.h b/arch/riscv/include/asm/processor.h
index 812517b2cec135..49a386d74cd3f0 100644
--- a/arch/riscv/include/asm/processor.h
+++ b/arch/riscv/include/asm/processor.h
@@ -123,6 +123,9 @@ struct thread_struct {
 	/* A forced icache flush is not needed if migrating to the previous cpu. */
 	unsigned int prev_cpu;
 #endif
+#ifdef CONFIG_RISCV_ISA_SSQOSID
+	u32 srmcfg;
+#endif
 };
 
 /* Whitelist the fstate from the task_struct for hardened usercopy */
diff --git a/arch/riscv/include/asm/qos.h b/arch/riscv/include/asm/qos.h
new file mode 100644
index 00000000000000..727d438454f35c
--- /dev/null
+++ b/arch/riscv/include/asm/qos.h
@@ -0,0 +1,87 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_RISCV_QOS_H
+#define _ASM_RISCV_QOS_H
+
+#include <linux/percpu-defs.h>
+
+#ifdef CONFIG_RISCV_ISA_SSQOSID
+
+#include <linux/bitfield.h>
+#include <linux/cpufeature.h>
+#include <linux/sched.h>
+
+#include <asm/csr.h>
+#include <asm/fence.h>
+#include <asm/hwcap.h>
+
+/* cached value of srmcfg csr for each cpu */
+DECLARE_PER_CPU(u32, cpu_srmcfg);
+
+/* default srmcfg value for each cpu, set via resctrl cpu assignment */
+DECLARE_PER_CPU(u32, cpu_srmcfg_default);
+
+static inline void __switch_to_srmcfg(struct task_struct *next)
+{
+	u32 thread_srmcfg, default_srmcfg;
+
+	thread_srmcfg = READ_ONCE(next->thread.srmcfg);
+	default_srmcfg = __this_cpu_read(cpu_srmcfg_default);
+
+	/*
+	 * RCID and MCID inherit from cpu_srmcfg_default independently.
+	 * RESCTRL_RESERVED_CLOSID and RESCTRL_RESERVED_RMID are both 0,
+	 * so a per-field zero means "no task assignment for this
+	 * dimension" and the CPU default supplies that field. Matches
+	 * x86 RDT's __resctrl_sched_in() per-field logic. The fully
+	 * unassigned (thread.srmcfg == 0) and fully assigned (both
+	 * fields non-zero) cases short-circuit the field math.
+	 */
+	if (thread_srmcfg == 0) {
+		thread_srmcfg = default_srmcfg;
+	} else {
+		u32 rcid = FIELD_GET(SRMCFG_RCID_MASK, thread_srmcfg);
+		u32 mcid = FIELD_GET(SRMCFG_MCID_MASK, thread_srmcfg);
+
+		if (rcid == 0 || mcid == 0) {
+			if (rcid == 0)
+				rcid = FIELD_GET(SRMCFG_RCID_MASK, default_srmcfg);
+			if (mcid == 0)
+				mcid = FIELD_GET(SRMCFG_MCID_MASK, default_srmcfg);
+			thread_srmcfg = FIELD_PREP(SRMCFG_RCID_MASK, rcid) |
+					FIELD_PREP(SRMCFG_MCID_MASK, mcid);
+		}
+	}
+
+	if (thread_srmcfg != __this_cpu_read(cpu_srmcfg)) {
+		/*
+		 * Order the outgoing task's loads and stores ahead of the
+		 * CSR write (fence rw,o) so they keep the previous
+		 * RCID/MCID tag at the cache interconnect.
+		 */
+		RISCV_FENCE(rw, o);
+
+		__this_cpu_write(cpu_srmcfg, thread_srmcfg);
+		csr_write(CSR_SRMCFG, thread_srmcfg);
+		/*
+		 * Order the csrw before the new task's loads/stores so they
+		 * pick up the new tag. Zicsr 6.1.1 makes CSR writes weakly
+		 * ordered (device-output) vs memory ops. Ssqosid v1.0 is
+		 * silent so honor the general CSR rule.
+		 */
+		RISCV_FENCE(o, rw);
+	}
+}
+
+static __always_inline bool has_srmcfg(void)
+{
+	return riscv_has_extension_unlikely(RISCV_ISA_EXT_SSQOSID);
+}
+
+#else /* ! CONFIG_RISCV_ISA_SSQOSID  */
+
+struct task_struct;
+static __always_inline bool has_srmcfg(void) { return false; }
+static inline void __switch_to_srmcfg(struct task_struct *next) { }
+
+#endif /* CONFIG_RISCV_ISA_SSQOSID */
+#endif /* _ASM_RISCV_QOS_H */
diff --git a/arch/riscv/include/asm/resctrl.h b/arch/riscv/include/asm/resctrl.h
new file mode 100644
index 00000000000000..282b5b59e3ee87
--- /dev/null
+++ b/arch/riscv/include/asm/resctrl.h
@@ -0,0 +1,152 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef _ASM_RISCV_RESCTRL_H
+#define _ASM_RISCV_RESCTRL_H
+
+#include <linux/resctrl_types.h>
+#include <linux/sched.h>
+#include <linux/types.h>
+
+#include <asm/qos.h>
+
+struct rdt_resource;
+
+/*
+ * Sentinel "no CLOSID assigned" used by resctrl_arch_rmid_idx_decode().
+ * fs/resctrl treats this opaquely. CBQRI uses MCID directly as the linear
+ * rmid index, so closid is unused on decode.
+ */
+#define RISCV_RESCTRL_EMPTY_CLOSID	((u32)~0)
+
+/*
+ * Terminology mapping between x86 (Intel RDT/AMD QoS) and RISC-V:
+ *
+ *  CLOSID on x86 is RCID on RISC-V
+ *    RMID on x86 is MCID on RISC-V
+ *     CDP on x86 is AT (access type) on RISC-V
+ *
+ * Each fast-path arch entry point below is the RISC-V realization of the
+ * generic contract documented in <linux/resctrl.h>. Comments here describe
+ * only the RISC-V-specific behavior (srmcfg encoding, CBQRI controller
+ * lookup, MCID-as-index policy).
+ */
+
+/**
+ * resctrl_arch_alloc_capable() - any CBQRI controller exposes resctrl alloc
+ *
+ * Returns true once at least one CBQRI controller has successfully probed for
+ * a resctrl-exposed allocation feature (cache capacity or memory bandwidth).
+ * Only meaningful after cbqri_resctrl_setup() runs at late_initcall.
+ */
+bool resctrl_arch_alloc_capable(void);
+
+/**
+ * resctrl_arch_mon_capable() - any CBQRI controller exposes resctrl monitoring
+ *
+ * Returns true once at least one CBQRI controller has successfully probed a
+ * monitoring event wired through resctrl (L3 occupancy or L3 mbm_total_bytes).
+ */
+bool resctrl_arch_mon_capable(void);
+
+/**
+ * resctrl_arch_rmid_idx_encode() - encode (RCID, MCID) into a linear index
+ * @closid: RCID (resource control id)
+ * @rmid:   MCID (monitoring counter id)
+ *
+ * RISC-V uses MCID directly as the linear index into per-RMID arrays
+ * managed by fs/resctrl, since CBQRI controllers admit any MCID for any
+ * RCID. closid is unused here. CDP is encoded via the AT field on each
+ * CBQRI op rather than via the index.
+ */
+u32  resctrl_arch_rmid_idx_encode(u32 closid, u32 rmid);
+
+/**
+ * resctrl_arch_rmid_idx_decode() - inverse of resctrl_arch_rmid_idx_encode()
+ * @idx:    linear index
+ * @closid: out: always RISCV_RESCTRL_EMPTY_CLOSID
+ * @rmid:   out: the MCID that @idx encodes
+ */
+void resctrl_arch_rmid_idx_decode(u32 idx, u32 *closid, u32 *rmid);
+
+/**
+ * resctrl_arch_set_cpu_default_closid_rmid() - install per-CPU srmcfg default
+ * @cpu:    CPU number
+ * @closid: RCID to use when no task is matched
+ * @rmid:   MCID to use when no task is matched
+ *
+ * Sets the per-CPU cpu_srmcfg_default so __switch_to_srmcfg() can fall back
+ * to the CPU's default RCID/MCID for default-group tasks (those whose
+ * thread.srmcfg encodes to 0, i.e. closid == RESCTRL_RESERVED_CLOSID and
+ * rmid == RESCTRL_RESERVED_RMID). Implements resctrl allocation rule 2
+ * ("CPU default") on RISC-V.
+ */
+void resctrl_arch_set_cpu_default_closid_rmid(int cpu, u32 closid, u32 rmid);
+
+/**
+ * resctrl_arch_sched_in() - context-switch hook to install task RCID/MCID
+ * @tsk: the task being scheduled in
+ *
+ * Called from finish_task_switch() to write tsk->thread.srmcfg into the
+ * srmcfg CSR. An RCID or MCID field of 0 in thread.srmcfg inherits the
+ * per-CPU default set via resctrl_arch_set_cpu_default_closid_rmid().
+ */
+void resctrl_arch_sched_in(struct task_struct *tsk);
+
+/**
+ * resctrl_arch_set_closid_rmid() - tag a task with an RCID/MCID
+ * @tsk:    task to tag
+ * @closid: RCID to install
+ * @rmid:   MCID to install
+ *
+ * Updates tsk->thread.srmcfg with the encoded (RCID, MCID) pair. The new
+ * value takes effect on the next resctrl_arch_sched_in() for this task.
+ */
+void resctrl_arch_set_closid_rmid(struct task_struct *tsk, u32 closid, u32 rmid);
+
+/**
+ * resctrl_arch_match_closid() - test whether a task carries a given RCID
+ * @tsk:    task
+ * @closid: RCID
+ */
+bool resctrl_arch_match_closid(struct task_struct *tsk, u32 closid);
+
+/**
+ * resctrl_arch_match_rmid() - test whether a task carries a given (RCID, MCID)
+ * @tsk:    task
+ * @closid: RCID
+ * @rmid:   MCID
+ */
+bool resctrl_arch_match_rmid(struct task_struct *tsk, u32 closid, u32 rmid);
+
+/**
+ * resctrl_arch_mon_ctx_alloc() - allocate per-monitor-event arch context
+ * @r:     resctrl resource being monitored
+ * @evtid: which monitor event needs context
+ *
+ * Returns an opaque pointer that resctrl_arch_rmid_read() can use to find the
+ * CBQRI controller backing this event. CBQRI's BC bandwidth context is
+ * keyed off the resource's L3 monitoring domain rather than per-event state,
+ * so this implementation returns NULL.
+ */
+void *resctrl_arch_mon_ctx_alloc(struct rdt_resource *r, enum resctrl_event_id evtid);
+
+/**
+ * resctrl_arch_mon_ctx_free() - release context returned by mon_ctx_alloc()
+ * @r:            resctrl resource
+ * @evtid:        monitor event id
+ * @arch_mon_ctx: pointer returned by resctrl_arch_mon_ctx_alloc()
+ */
+void resctrl_arch_mon_ctx_free(struct rdt_resource *r, enum resctrl_event_id evtid,
+			       void *arch_mon_ctx);
+
+static inline unsigned int resctrl_arch_round_mon_val(unsigned int val)
+{
+	return val;
+}
+
+/* Not needed for RISC-V */
+static inline void resctrl_arch_enable_mon(void) { }
+static inline void resctrl_arch_disable_mon(void) { }
+static inline void resctrl_arch_enable_alloc(void) { }
+static inline void resctrl_arch_disable_alloc(void) { }
+
+#endif /* _ASM_RISCV_RESCTRL_H */
diff --git a/arch/riscv/include/asm/switch_to.h b/arch/riscv/include/asm/switch_to.h
index 0e71eb82f920ca..1c7ea53ec012ad 100644
--- a/arch/riscv/include/asm/switch_to.h
+++ b/arch/riscv/include/asm/switch_to.h
@@ -14,6 +14,7 @@
 #include <asm/processor.h>
 #include <asm/ptrace.h>
 #include <asm/csr.h>
+#include <asm/qos.h>
 
 #ifdef CONFIG_FPU
 extern void __fstate_save(struct task_struct *save_to);
@@ -119,6 +120,8 @@ do {							\
 		__switch_to_fpu(__prev, __next);	\
 	if (has_vector() || has_xtheadvector())		\
 		__switch_to_vector(__prev, __next);	\
+	if (has_srmcfg())				\
+		__switch_to_srmcfg(__next);		\
 	if (switch_to_should_flush_icache(__next))	\
 		local_flush_icache_all();		\
 	__switch_to_envcfg(__next);			\
diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile
index cabb99cadfb6d1..ebe1c3588177b4 100644
--- a/arch/riscv/kernel/Makefile
+++ b/arch/riscv/kernel/Makefile
@@ -128,3 +128,5 @@ obj-$(CONFIG_ACPI_NUMA)	+= acpi_numa.o
 
 obj-$(CONFIG_GENERIC_CPU_VULNERABILITIES) += bugs.o
 obj-$(CONFIG_RISCV_USER_CFI) += usercfi.o
+
+obj-$(CONFIG_RISCV_ISA_SSQOSID) += qos.o
diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c
index f46aa5602d74d3..668a7e71ff1c64 100644
--- a/arch/riscv/kernel/cpufeature.c
+++ b/arch/riscv/kernel/cpufeature.c
@@ -582,6 +582,7 @@ const struct riscv_isa_ext_data riscv_isa_ext[] = {
 	__RISCV_ISA_EXT_DATA(ssaia, RISCV_ISA_EXT_SSAIA),
 	__RISCV_ISA_EXT_DATA(sscofpmf, RISCV_ISA_EXT_SSCOFPMF),
 	__RISCV_ISA_EXT_SUPERSET(ssnpm, RISCV_ISA_EXT_SSNPM, riscv_xlinuxenvcfg_exts),
+	__RISCV_ISA_EXT_DATA(ssqosid, RISCV_ISA_EXT_SSQOSID),
 	__RISCV_ISA_EXT_DATA(sstc, RISCV_ISA_EXT_SSTC),
 	__RISCV_ISA_EXT_DATA(svade, RISCV_ISA_EXT_SVADE),
 	__RISCV_ISA_EXT_DATA_VALIDATE(svadu, RISCV_ISA_EXT_SVADU, riscv_ext_svadu_validate),
diff --git a/arch/riscv/kernel/qos.c b/arch/riscv/kernel/qos.c
new file mode 100644
index 00000000000000..d18b99b195e79e
--- /dev/null
+++ b/arch/riscv/kernel/qos.c
@@ -0,0 +1,98 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/cpu.h>
+#include <linux/cpu_pm.h>
+#include <linux/cpuhotplug.h>
+#include <linux/notifier.h>
+#include <linux/percpu-defs.h>
+#include <linux/types.h>
+
+#include <asm/cpufeature-macros.h>
+#include <asm/hwcap.h>
+#include <asm/qos.h>
+
+/*
+ * Cached value of srmcfg csr for each cpu. Seeded to U32_MAX so the next
+ * __switch_to_srmcfg() unconditionally writes the CSR; the encoding
+ * MCID << 16 | RCID with both fields well under 16 bits can never
+ * produce this sentinel. This covers early-boot context switches that
+ * happen before riscv_srmcfg_init() runs as an arch_initcall.
+ */
+DEFINE_PER_CPU(u32, cpu_srmcfg) = U32_MAX;
+
+/* default srmcfg value for each cpu, set via resctrl cpu assignment */
+DEFINE_PER_CPU(u32, cpu_srmcfg_default);
+
+/*
+ * Seed the per-CPU srmcfg cache to a sentinel that no real srmcfg encoding
+ * can produce (MCID << 16 | RCID, both fields well under 16 bits) so the
+ * next __switch_to_srmcfg() unconditionally writes the CSR. Ssqosid v1.0
+ * leaves CSR state across hart stop/start implementation-defined, so the
+ * cached value cannot be trusted after online.
+ */
+static int riscv_srmcfg_online(unsigned int cpu)
+{
+	per_cpu(cpu_srmcfg, cpu) = U32_MAX;
+	return 0;
+}
+
+/*
+ * Invalidate the cache on offline too, aiming to have the sentinel in
+ * place before a re-onlined CPU schedules ahead of riscv_srmcfg_online().
+ * NOTE(review): the dying CPU presumably still context-switches after
+ * this callback (hotplug thread -> stopper -> idle), and every
+ * __switch_to_srmcfg() whose target differs from the cache rewrites it,
+ * so the sentinel may not survive the offline period - TODO confirm.
+ */
+static int riscv_srmcfg_offline(unsigned int cpu)
+{
+	per_cpu(cpu_srmcfg, cpu) = U32_MAX;
+	return 0;
+}
+
+/*
+ * CPU PM notifier. Ssqosid v1.0 leaves the CSR_SRMCFG contents across
+ * low-power transitions implementation-defined, and during system
+ * suspend the boot CPU never runs the cpuhp online callback. Drop the
+ * cached srmcfg on CPU_PM_EXIT / CPU_PM_ENTER_FAILED so that
+ * __switch_to_srmcfg() cannot skip the CSR write just because the
+ * incoming task's srmcfg equals the value cached before suspend.
+ */
+static int riscv_srmcfg_pm_notify(struct notifier_block *nb,
+				  unsigned long action, void *unused)
+{
+	bool resumed = action == CPU_PM_EXIT ||
+		       action == CPU_PM_ENTER_FAILED;
+
+	/* Sentinel forces the next __switch_to_srmcfg() to write the CSR. */
+	if (resumed)
+		__this_cpu_write(cpu_srmcfg, U32_MAX);
+	return NOTIFY_OK;
+}
+
+static struct notifier_block riscv_srmcfg_pm_nb = {
+	.notifier_call = riscv_srmcfg_pm_notify,
+};
+
+static int __init riscv_srmcfg_init(void)
+{
+	int err;
+
+	if (!riscv_has_extension_unlikely(RISCV_ISA_EXT_SSQOSID))
+		return 0;
+
+	/* The startup callback also runs on every already-online CPU. */
+	err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "riscv/srmcfg:online",
+				riscv_srmcfg_online, riscv_srmcfg_offline);
+	if (err < 0) {
+		pr_warn("srmcfg cpuhp registration failed (%d), cpus brought online after boot will not invalidate the CSR_SRMCFG cache\n",
+			err);
+		return err;
+	}
+
+	/* Without the PM notifier a stale cache can survive suspend/resume. */
+	err = cpu_pm_register_notifier(&riscv_srmcfg_pm_nb);
+	if (err)
+		pr_warn("srmcfg CPU PM notifier registration failed (%d)\n", err);
+	return err;
+}
+arch_initcall(riscv_srmcfg_init);
diff --git a/drivers/acpi/riscv/Makefile b/drivers/acpi/riscv/Makefile
index 1284a076fa8887..77f8f0101b7e8d 100644
--- a/drivers/acpi/riscv/Makefile
+++ b/drivers/acpi/riscv/Makefile
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0-only
 obj-y					+= rhct.o init.o irq.o
+obj-$(CONFIG_RISCV_CBQRI_DRIVER)	+= rqsc.o
 obj-$(CONFIG_ACPI_PROCESSOR_IDLE)	+= cpuidle.o
 obj-$(CONFIG_ACPI_CPPC_LIB)		+= cppc.o
 obj-$(CONFIG_ACPI_RIMT)			+= rimt.o
diff --git a/drivers/acpi/riscv/init.c b/drivers/acpi/riscv/init.c
index 7c00f7995e866d..129ebfae28be3d 100644
--- a/drivers/acpi/riscv/init.c
+++ b/drivers/acpi/riscv/init.c
@@ -5,11 +5,32 @@
  */
 
 #include <linux/acpi.h>
+#include <linux/cleanup.h>
 #include "init.h"
+#include "rqsc.h"
 
 void __init acpi_arch_init(void)
 {
 	riscv_acpi_init_gsi_mapping();
+
 	if (IS_ENABLED(CONFIG_ACPI_RIMT))
 		riscv_acpi_rimt_init();
+
+	if (IS_ENABLED(CONFIG_RISCV_CBQRI_DRIVER)) {
+		struct acpi_table_header *rqsc __free(acpi_put_table) = NULL;
+		acpi_status status = acpi_get_table(ACPI_SIG_RQSC, 0, &rqsc);
+
+		if (status == AE_NOT_FOUND) {
+			/* RQSC is optional. Silence on systems without it. */
+		} else if (ACPI_FAILURE(status)) {
+			pr_err("RQSC: failed to get table: %s\n",
+			       acpi_format_exception(status));
+		} else {
+			int rc = acpi_parse_rqsc(rqsc);
+
+			if (rc < 0)
+				pr_err("RQSC: failed to parse table: %d\n",
+				       rc);
+		}
+	}
 }
diff --git a/drivers/acpi/riscv/rqsc.c b/drivers/acpi/riscv/rqsc.c
new file mode 100644
index 00000000000000..1b1ae2e353a510
--- /dev/null
+++ b/drivers/acpi/riscv/rqsc.c
@@ -0,0 +1,202 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#define pr_fmt(fmt) "ACPI: RQSC: " fmt
+
+#include <linux/acpi.h>
+#include <linux/bits.h>
+#include <linux/riscv_cbqri.h>
+
+#include "rqsc.h"
+
+#define CBQRI_CTRL_SIZE 0x1000
+
+int __init acpi_parse_rqsc(struct acpi_table_header *table)
+{
+	struct acpi_table_rqsc *rqsc = (struct acpi_table_rqsc *)table;
+	struct acpi_rqsc_node *end, *node;
+	int num_controllers = 0;
+
+	/*
+	 * Reject revisions newer than this parser was written against. A
+	 * future revision could extend the fixed RQSC header before the
+	 * first node, which would shift the resource subtables and cause the
+	 * sizeof(*node)-based offset below to point into the wrong place.
+	 */
+	if (rqsc->header.revision != ACPI_RQSC_REVISION) {
+		pr_err("RQSC table revision %u, expected %u, aborting\n",
+		       rqsc->header.revision, ACPI_RQSC_REVISION);
+		return -EINVAL;
+	}
+
+	/* Reject tables shorter than the fixed RQSC header. */
+	if (rqsc->header.length < sizeof(struct acpi_table_rqsc)) {
+		pr_err("RQSC table truncated: length %u < %zu, aborting\n",
+		       rqsc->header.length, sizeof(struct acpi_table_rqsc));
+		return -EINVAL;
+	}
+
+	end = ACPI_ADD_PTR(struct acpi_rqsc_node, rqsc, rqsc->header.length);
+
+	for (node = ACPI_ADD_PTR(struct acpi_rqsc_node, rqsc,
+				 sizeof(struct acpi_table_rqsc));
+	     node < end;
+	     node = ACPI_ADD_PTR(struct acpi_rqsc_node, node, node->length)
+	) {
+		const struct acpi_rqsc_resource *res0;
+		struct cbqri_controller_info info = {};
+		int ret;
+
+		if ((void *)node + sizeof(*node) > (void *)end) {
+			pr_err("truncated entry at end of table, aborting\n");
+			riscv_cbqri_unregister_last(num_controllers);
+			return -EINVAL;
+		}
+
+		if (node->length < sizeof(*node)) {
+			pr_err("malformed RQSC entry: length %u < %zu, aborting\n",
+			       node->length, sizeof(*node));
+			riscv_cbqri_unregister_last(num_controllers);
+			return -EINVAL;
+		}
+
+		/*
+		 * Without this check, a node whose length claims to extend
+		 * past the end of the table would advance the loop cursor
+		 * past 'end' and silently terminate. Flag the corruption
+		 * explicitly so a malformed firmware table cannot truncate
+		 * the controller list without noise.
+		 */
+		if ((void *)node + node->length > (void *)end) {
+			pr_err("RQSC entry length %u overruns table end, aborting\n",
+			       node->length);
+			riscv_cbqri_unregister_last(num_controllers);
+			return -EINVAL;
+		}
+
+		/* GAS must describe system memory. ioremap() consumes it later. */
+		if (node->reg.space_id != ACPI_ADR_SPACE_SYSTEM_MEMORY) {
+			pr_warn("controller has unsupported address space_id=%u, skipping\n",
+				node->reg.space_id);
+			continue;
+		}
+
+		if (!node->reg.address) {
+			pr_warn("controller has zero address, skipping\n");
+			continue;
+		}
+
+		info.type = node->type;
+		/* GAS address only; apart from space_id no other GAS field is examined. */
+		info.addr = node->reg.address;
+		info.size = CBQRI_CTRL_SIZE;
+		info.rcid_count = node->rcid;
+		info.mcid_count = node->mcid;
+
+		/* See CBQRI_MAX_RCID/MCID in <linux/riscv_cbqri.h> for the rationale. */
+		if (info.rcid_count > CBQRI_MAX_RCID) {
+			pr_warn("controller at %pa: rcid_count %u exceeds CBQRI_MAX_RCID %u, skipping\n",
+				&info.addr, info.rcid_count, CBQRI_MAX_RCID);
+			continue;
+		}
+
+		if (info.mcid_count > CBQRI_MAX_MCID) {
+			pr_warn("controller at %pa: mcid_count %u exceeds CBQRI_MAX_MCID %u, skipping\n",
+				&info.addr, info.mcid_count, CBQRI_MAX_MCID);
+			continue;
+		}
+
+		/*
+		 * RQSC Table 2: at least one of RCID Count or MCID Count must be non-zero.
+		 */
+		if (!info.rcid_count && !info.mcid_count) {
+			pr_warn("controller at %pa: both rcid_count and mcid_count are zero, skipping\n",
+				&info.addr);
+			continue;
+		}
+
+		if (node->nres == 0) {
+			pr_warn("controller at %pa has no resource descriptors, skipping\n",
+				&info.addr);
+			continue;
+		}
+
+		/*
+		 * Resources follow the node header in-line. Only res[0] is
+		 * consumed. Bound it against end before reading its prefix so
+		 * a table that ends partway through a resource subtable is
+		 * rejected rather than read past the mapping.
+		 */
+		res0 = (const struct acpi_rqsc_resource *)
+		       ((const u8 *)node + sizeof(*node));
+		if ((void *)res0 + sizeof(*res0) > (void *)end ||
+		    node->length < sizeof(*node) + sizeof(*res0) ||
+		    res0->length < sizeof(*res0)) {
+			pr_warn("controller at %pa: node too short for resource descriptor, skipping\n",
+				&info.addr);
+			continue;
+		}
+
+		if (node->nres > 1)
+			pr_warn("controller at %pa has %u resource descriptors, using first\n",
+				&info.addr, node->nres);
+
+		/*
+		 * id1 is u64 but it is used for cache_id and prox_dom
+		 * which are only u32. Reject rather than truncate, so a
+		 * too large id is not silently mapped to the wrong PPTT
+		 * entry or NUMA node.
+		 */
+		if (res0->id1 > U32_MAX) {
+			pr_warn("controller at %pa: id1 0x%llx exceeds u32, skipping\n",
+				&info.addr, res0->id1);
+			continue;
+		}
+
+		/*
+		 * Pair the QoS controller type with the resource descriptor
+		 * fields that index id1. RQSC Table 4 defines the mapping:
+		 * Capacity controller indexes a Processor Cache via PPTT
+		 * cache_id, a Bandwidth controller indexes a Memory Range
+		 * via SRAT proximity domain. Mismatched pairings (e.g. a
+		 * CC whose first resource is Memory) would otherwise route
+		 * id1 into the wrong downstream lookup.
+		 */
+		switch (info.type) {
+		case CBQRI_CONTROLLER_TYPE_CAPACITY:
+			if (res0->type != ACPI_RQSC_RESOURCE_TYPE_CACHE ||
+			    res0->id_type != ACPI_RQSC_RESOURCE_ID_TYPE_PROCESSOR_CACHE) {
+				pr_warn("CC at %pa: resource type=%u id_type=%u not (cache, processor cache), skipping\n",
+					&info.addr, res0->type, res0->id_type);
+				continue;
+			}
+			info.cache_id = (u32)res0->id1;
+			break;
+		case CBQRI_CONTROLLER_TYPE_BANDWIDTH:
+			if (res0->type != ACPI_RQSC_RESOURCE_TYPE_MEMORY ||
+			    res0->id_type != ACPI_RQSC_RESOURCE_ID_TYPE_MEMORY_RANGE) {
+				pr_warn("BC at %pa: resource type=%u id_type=%u not (memory, memory range), skipping\n",
+					&info.addr, res0->type, res0->id_type);
+				continue;
+			}
+			info.prox_dom = (u32)res0->id1;
+			break;
+		default:
+			pr_warn("controller at %pa: unknown type %u, skipping\n",
+				&info.addr, info.type);
+			continue;
+		}
+
+		pr_debug("registering controller type=%u addr=%pa rcid=%u mcid=%u\n",
+			 info.type, &info.addr, info.rcid_count, info.mcid_count);
+
+		ret = riscv_cbqri_register_controller(&info);
+		if (ret == 0)
+			num_controllers++;
+		else
+			pr_warn("controller at %pa: registration failed (%d), skipping\n",
+				&info.addr, ret);
+	}
+
+	pr_info("found %d CBQRI controllers\n", num_controllers);
+	return 0;
+}
diff --git a/drivers/acpi/riscv/rqsc.h b/drivers/acpi/riscv/rqsc.h
new file mode 100644
index 00000000000000..fa0d96e267e158
--- /dev/null
+++ b/drivers/acpi/riscv/rqsc.h
@@ -0,0 +1,66 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Local definitions for the RISC-V Quality of Service Controller (RQSC)
+ * ACPI table. Will move to ACPICA's include/acpi/actbl2.h once the spec
+ * is ratified.
+ */
+#ifndef _DRIVERS_ACPI_RISCV_RQSC_H
+#define _DRIVERS_ACPI_RISCV_RQSC_H
+
+#include <linux/types.h>
+#include <acpi/actbl.h>
+
+#define ACPI_SIG_RQSC	"RQSC"	/* RISC-V Quality of Service Controller */
+
+/* RQSC Table 1: current revision number. */
+#define ACPI_RQSC_REVISION	1
+
+/* RQSC Table 4: Resource Type values for acpi_rqsc_resource.type. */
+#define ACPI_RQSC_RESOURCE_TYPE_CACHE	0
+#define ACPI_RQSC_RESOURCE_TYPE_MEMORY	1
+
+/* RQSC Table 4: Resource ID Type values for .id_type. */
+#define ACPI_RQSC_RESOURCE_ID_TYPE_PROCESSOR_CACHE	0
+#define ACPI_RQSC_RESOURCE_ID_TYPE_MEMORY_RANGE		1
+#define ACPI_RQSC_RESOURCE_ID_TYPE_MEMORY_SIDE_CACHE	2
+#define ACPI_RQSC_RESOURCE_ID_TYPE_ACPI_DEVICE		3
+#define ACPI_RQSC_RESOURCE_ID_TYPE_PCI_DEVICE		4
+
+/*
+ * Byte-packed: u64 id1 would otherwise pad to 8-byte alignment and inflate
+ * sizeof(*res) from the spec's 20 bytes to 24, mis-sizing resource subtables.
+ */
+struct acpi_rqsc_resource {
+	u8 type;
+	u8 resv;
+	u16 length;
+	u16 flags;
+	u8 resv2;
+	u8 id_type;
+	u64 id1;
+	u32 id2;
+} __packed;
+
+struct acpi_rqsc_node {
+	u8 type;
+	u8 resv;
+	u16 length;
+	/* RQSC section 2 Table 2: 12-byte GAS-format register interface address */
+	struct acpi_generic_address reg;
+	u16 rcid;
+	u16 mcid;
+	u16 flags;
+	u16 nres;
+	/*
+	 * Followed by nres acpi_rqsc_resource subtables. Walk them via
+	 * each resource's own length field so a future RQSC revision that
+	 * extends the resource layout cannot misalign older parsers.
+	 */
+} __packed;
+
+struct acpi_table_rqsc {
+	struct acpi_table_header header;	/* Common ACPI table header */
+	u32 num;
+} __packed;
+
+#endif /* _DRIVERS_ACPI_RISCV_RQSC_H */
diff --git a/drivers/resctrl/Kconfig b/drivers/resctrl/Kconfig
index 672abea3b03ccb..732aae26f8f8e2 100644
--- a/drivers/resctrl/Kconfig
+++ b/drivers/resctrl/Kconfig
@@ -29,3 +29,35 @@ config ARM64_MPAM_RESCTRL_FS
 	default y if ARM64_MPAM_DRIVER && RESCTRL_FS
 	select RESCTRL_RMID_DEPENDS_ON_CLOSID
 	select RESCTRL_ASSIGN_FIXED
+
+menuconfig RISCV_CBQRI_DRIVER
+	bool "RISC-V CBQRI driver"
+	depends on RISCV && RISCV_ISA_SSQOSID
+	help
+	  Capacity and Bandwidth QoS Register Interface (CBQRI) driver
+	  for RISC-V cache and memory-controller QoS resources. CBQRI
+	  exposes capacity allocation, bandwidth reservation, weighted
+	  bandwidth share, and per-MCID monitoring counters through the
+	  resctrl filesystem at /sys/fs/resctrl when RESCTRL_FS is also
+	  enabled.
+
+	  RISCV_ISA_SSQOSID provides the srmcfg CSR that tags each hart's
+	  memory traffic with the RCID and MCID consumed by CBQRI
+	  controllers.
+
+if RISCV_CBQRI_DRIVER
+
+config RISCV_CBQRI_DRIVER_DEBUG
+	bool "Enable debug messages from the CBQRI driver"
+	help
+	  Say yes here to enable debug messages from the CBQRI driver.
+
+	  This adds pr_debug() output covering controller probe and
+	  per-controller registration steps.  Useful when bringing up a
+	  new platform; otherwise leave disabled to avoid log noise.
+
+endif
+
+config RISCV_CBQRI_RESCTRL_FS
+	bool
+	default y if RISCV_CBQRI_DRIVER && RESCTRL_FS
diff --git a/drivers/resctrl/Makefile b/drivers/resctrl/Makefile
index 4f6d0e81f9b8f3..ed737b4461b975 100644
--- a/drivers/resctrl/Makefile
+++ b/drivers/resctrl/Makefile
@@ -3,3 +3,9 @@ mpam-y						+= mpam_devices.o
 mpam-$(CONFIG_ARM64_MPAM_RESCTRL_FS)		+= mpam_resctrl.o
 
 ccflags-$(CONFIG_ARM64_MPAM_DRIVER_DEBUG)	+= -DDEBUG
+
+obj-$(CONFIG_RISCV_CBQRI_DRIVER)		+= cbqri.o
+cbqri-y						+= cbqri_devices.o
+cbqri-$(CONFIG_RISCV_CBQRI_RESCTRL_FS)		+= cbqri_resctrl.o
+
+ccflags-$(CONFIG_RISCV_CBQRI_DRIVER_DEBUG)	+= -DDEBUG
diff --git a/drivers/resctrl/cbqri_devices.c b/drivers/resctrl/cbqri_devices.c
new file mode 100644
index 00000000000000..b1a6ce35042ace
--- /dev/null
+++ b/drivers/resctrl/cbqri_devices.c
@@ -0,0 +1,1154 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#define pr_fmt(fmt) "%s:%s: " fmt, KBUILD_MODNAME, __func__
+
+#include <linux/acpi.h>
+#include <linux/bitfield.h>
+#include <linux/cacheinfo.h>
+#include <linux/riscv_cbqri.h>
+#include <linux/cpu.h>
+#include <linux/cpumask.h>
+#include <linux/err.h>
+#include <linux/io.h>
+#include <linux/iopoll.h>
+#include <linux/ioport.h>
+#include <linux/list.h>
+#include <linux/mutex.h>
+#include <linux/numa.h>
+#include <linux/printk.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+
+#include <asm/csr.h>
+
+#include "cbqri_internal.h"
+
+LIST_HEAD(cbqri_controllers);
+
+/* Write cbm to the capacity block mask register (cc_block_mask) of ctrl */
+static void cbqri_set_cbm(struct cbqri_controller *ctrl, u64 cbm)
+{
+	iowrite64(cbm, ctrl->base + CBQRI_CC_BLOCK_MASK_OFF);
+}
+
+/* Update only the Rbwb (reserved bandwidth blocks) field of bc_bw_alloc */
+static void cbqri_set_rbwb(struct cbqri_controller *ctrl, u64 rbwb)
+{
+	void __iomem *bw_alloc = ctrl->base + CBQRI_BC_BW_ALLOC_OFF;
+	u64 staged = ioread64(bw_alloc);
+
+	FIELD_MODIFY(CBQRI_CONTROL_REGISTERS_RBWB_MASK, &staged, rbwb);
+	iowrite64(staged, bw_alloc);
+}
+
+/* Extract the Rbwb (reserved bandwidth blocks) field from bc_bw_alloc */
+static u64 cbqri_get_rbwb(struct cbqri_controller *ctrl)
+{
+	void __iomem *bw_alloc = ctrl->base + CBQRI_BC_BW_ALLOC_OFF;
+	u64 staged = ioread64(bw_alloc);
+
+	return FIELD_GET(CBQRI_CONTROL_REGISTERS_RBWB_MASK, staged);
+}
+
+/* Update only the Mweight (opportunistic weight) field of bc_bw_alloc */
+static void cbqri_set_mweight(struct cbqri_controller *ctrl, u64 mweight)
+{
+	void __iomem *bw_alloc = ctrl->base + CBQRI_BC_BW_ALLOC_OFF;
+	u64 staged = ioread64(bw_alloc);
+
+	FIELD_MODIFY(CBQRI_CONTROL_REGISTERS_MWEIGHT_MASK, &staged, mweight);
+	iowrite64(staged, bw_alloc);
+}
+
+/* Extract the Mweight (opportunistic weight) field from bc_bw_alloc */
+static u64 cbqri_get_mweight(struct cbqri_controller *ctrl)
+{
+	void __iomem *bw_alloc = ctrl->base + CBQRI_BC_BW_ALLOC_OFF;
+	u64 staged = ioread64(bw_alloc);
+
+	return FIELD_GET(CBQRI_CONTROL_REGISTERS_MWEIGHT_MASK, staged);
+}
+
+/*
+ * Rewrite Rbwb and Mweight together: one read of bc_bw_alloc, both fields
+ * replaced in the local copy, then a single MMIO write of the result.
+ */
+static void cbqri_set_bc_bw_alloc(struct cbqri_controller *ctrl,
+				  u64 rbwb, u64 mweight)
+{
+	u64 staged = ioread64(ctrl->base + CBQRI_BC_BW_ALLOC_OFF);
+
+	FIELD_MODIFY(CBQRI_CONTROL_REGISTERS_MWEIGHT_MASK, &staged, mweight);
+	FIELD_MODIFY(CBQRI_CONTROL_REGISTERS_RBWB_MASK, &staged, rbwb);
+	iowrite64(staged, ctrl->base + CBQRI_BC_BW_ALLOC_OFF);
+}
+
+enum cbqri_bc_field {
+	CBQRI_BC_FIELD_RBWB,
+	CBQRI_BC_FIELD_MWEIGHT,
+};
+
+static int cbqri_wait_busy_flag(struct cbqri_controller *ctrl, int reg_offset,
+				u64 *regp)
+{
+	u64 reg;
+	int ret;
+
+	/*
+	 * Sleeping poll (10us step, 1ms timeout): callers run in sleepable
+	 * context (ctrl->lock is a mutex; the probe path takes no lock).
+	 */
+	ret = readq_poll_timeout(ctrl->base + reg_offset, reg,
+				 !FIELD_GET(CBQRI_CONTROL_REGISTERS_BUSY_MASK, reg),
+				 10, 1000);
+	if (ret) {
+		ctrl->faulted = true;
+		return ret;
+	}
+	ctrl->faulted = false;
+	if (regp)
+		*regp = reg;
+	return 0;
+}
+
+/*
+ * Perform capacity allocation control operation on capacity controller.
+ * Caller must hold ctrl->lock.
+ */
+static int cbqri_cc_alloc_op(struct cbqri_controller *ctrl, int operation,
+			     int rcid, enum cbqri_at at)
+{
+	int reg_offset = CBQRI_CC_ALLOC_CTL_OFF;
+	int status;
+	u64 reg;
+
+	lockdep_assert_held(&ctrl->lock);
+
+	if (cbqri_wait_busy_flag(ctrl, reg_offset, &reg) < 0) {
+		pr_err_ratelimited("BUSY timeout before starting operation\n");
+		return -EIO;
+	}
+	FIELD_MODIFY(CBQRI_CONTROL_REGISTERS_OP_MASK, &reg, operation);
+	FIELD_MODIFY(CBQRI_CONTROL_REGISTERS_RCID_MASK, &reg, rcid);
+
+	/*
+	 * CBQRI Table 1: AT 0=Data, 1=Code. Program AT on controllers
+	 * that report supports_alloc_at_code. On controllers that don't,
+	 * AT is reserved-zero and the op acts on both halves.
+	 */
+	reg &= ~CBQRI_CONTROL_REGISTERS_AT_MASK;
+	if (ctrl->cc.supports_alloc_at_code)
+		reg |= FIELD_PREP(CBQRI_CONTROL_REGISTERS_AT_MASK, at);
+
+	iowrite64(reg, ctrl->base + reg_offset);
+
+	if (cbqri_wait_busy_flag(ctrl, reg_offset, &reg) < 0) {
+		pr_err_ratelimited("BUSY timeout during operation\n");
+		return -EIO;
+	}
+
+	status = FIELD_GET(CBQRI_CONTROL_REGISTERS_STATUS_MASK, reg);
+	if (status != CBQRI_CC_ALLOC_CTL_STATUS_SUCCESS) {
+		pr_err_ratelimited("operation %d failed: status=%d\n", operation, status);
+		return -EIO;
+	}
+
+	return 0;
+}
+
+/*
+ * Issue a monitoring op on a CC or BC controller's mon_ctl register at
+ * reg_offset (CBQRI_CC_MON_CTL_OFF or CBQRI_BC_MON_CTL_OFF). The CC and
+ * BC mon_ctl registers share an identical OP/MCID/EVT_ID/STATUS layout, so
+ * one helper covers both. Caller must hold ctrl->lock.
+ */
+int cbqri_mon_op(struct cbqri_controller *ctrl, int reg_offset,
+		 int operation, int mcid, int evt_id, u64 *out_reg)
+{
+	u64 reg;
+
+	lockdep_assert_held(&ctrl->lock);
+
+	if (cbqri_wait_busy_flag(ctrl, reg_offset, &reg) < 0) {
+		pr_err_ratelimited("BUSY timeout before starting operation\n");
+		return -EIO;
+	}
+	FIELD_MODIFY(CBQRI_MON_CTL_OP_MASK, &reg, operation);
+	FIELD_MODIFY(CBQRI_MON_CTL_MCID_MASK, &reg, mcid);
+	FIELD_MODIFY(CBQRI_MON_CTL_EVT_ID_MASK, &reg, evt_id);
+	iowrite64(reg, ctrl->base + reg_offset);
+
+	if (cbqri_wait_busy_flag(ctrl, reg_offset, &reg) < 0) {
+		pr_err_ratelimited("BUSY timeout\n");
+		return -EIO;
+	}
+
+	if (FIELD_GET(CBQRI_MON_CTL_STATUS_MASK, reg) !=
+	    CBQRI_MON_CTL_STATUS_SUCCESS)
+		return -EIO;
+
+	if (out_reg)
+		*out_reg = reg;
+
+	return 0;
+}
+
+/*
+ * Perform bandwidth allocation control operation on bandwidth controller.
+ * Caller must hold ctrl->lock.
+ */
+static int cbqri_bc_alloc_op(struct cbqri_controller *ctrl, int operation, int rcid)
+{
+	int reg_offset = CBQRI_BC_ALLOC_CTL_OFF;
+	int status;
+	u64 reg;
+
+	lockdep_assert_held(&ctrl->lock);
+
+	if (cbqri_wait_busy_flag(ctrl, reg_offset, &reg) < 0) {
+		pr_err_ratelimited("BUSY timeout before starting operation\n");
+		return -EIO;
+	}
+	FIELD_MODIFY(CBQRI_CONTROL_REGISTERS_OP_MASK, &reg, operation);
+	FIELD_MODIFY(CBQRI_CONTROL_REGISTERS_RCID_MASK, &reg, rcid);
+	reg &= ~CBQRI_CONTROL_REGISTERS_AT_MASK;
+	iowrite64(reg, ctrl->base + reg_offset);
+
+	if (cbqri_wait_busy_flag(ctrl, reg_offset, &reg) < 0) {
+		pr_err_ratelimited("BUSY timeout during operation\n");
+		return -EIO;
+	}
+
+	status = FIELD_GET(CBQRI_CONTROL_REGISTERS_STATUS_MASK, reg);
+	if (status != CBQRI_BC_ALLOC_CTL_STATUS_SUCCESS) {
+		pr_err_ratelimited("BC alloc op %d failed: status=%d\n",
+				   operation, status);
+		return -EIO;
+	}
+
+	return 0;
+}
+
+/*
+ * Apply a capacity block mask and verify via CONFIG_LIMIT + READ_LIMIT.
+ *
+ * AT-capable controllers with CDP off need a second CONFIG_LIMIT on the
+ * other AT half (the spec encodes AT only as 0=Data / 1=Code, there is
+ * no "both halves" value). CDP-on issues separate per-type writes from
+ * resctrl, so a single CONFIG_LIMIT per call is correct.
+ */
+int cbqri_apply_cache_config(struct cbqri_controller *ctrl, u32 closid,
+			     const struct cbqri_cc_config *cfg)
+{
+	bool need_at_mirror;
+	u64 saved_cbm = 0;
+	int err = 0;
+	u64 reg;
+
+	mutex_lock(&ctrl->lock);
+
+	need_at_mirror = ctrl->cc.supports_alloc_at_code && !cfg->cdp_enabled;
+
+	/*
+	 * Capture the cfg->at half CBM before any write so a partial
+	 * AT-mirror failure can revert and keep the two halves consistent.
+	 * Pre-clear cc_block_mask so a silent firmware no-op (status
+	 * SUCCESS but staging not updated) shows as a zero readback
+	 * rather than carrying stale data from a prior op. Mirrors the
+	 * defensive pattern in cbqri_read_cache_config().
+	 */
+	if (need_at_mirror) {
+		cbqri_set_cbm(ctrl, 0);
+		err = cbqri_cc_alloc_op(ctrl, CBQRI_CC_ALLOC_CTL_OP_READ_LIMIT,
+					closid, cfg->at);
+		if (err < 0)
+			goto out;
+		saved_cbm = ioread64(ctrl->base + CBQRI_CC_BLOCK_MASK_OFF);
+	}
+
+	/* Set capacity block mask (cc_block_mask) */
+	cbqri_set_cbm(ctrl, cfg->cbm);
+
+	/* Capacity config limit operation for the AT half implied by cfg->at */
+	err = cbqri_cc_alloc_op(ctrl, CBQRI_CC_ALLOC_CTL_OP_CONFIG_LIMIT,
+				closid, cfg->at);
+	if (err < 0)
+		goto out;
+
+	/*
+	 * CDP-off mirror: on AT-capable controllers, also program the
+	 * other AT half with the same mask so the two halves stay in sync.
+	 */
+	if (need_at_mirror) {
+		enum cbqri_at other = (cfg->at == CBQRI_AT_CODE) ?
+				      CBQRI_AT_DATA : CBQRI_AT_CODE;
+
+		cbqri_set_cbm(ctrl, cfg->cbm);
+		err = cbqri_cc_alloc_op(ctrl,
+					CBQRI_CC_ALLOC_CTL_OP_CONFIG_LIMIT,
+					closid, other);
+		if (err < 0) {
+			int rerr;
+
+			/*
+			 * Best-effort revert of the cfg->at half so the two
+			 * halves stay in sync. A schemata read sees only one
+			 * half, so silent divergence would otherwise report
+			 * the new value as if the write had succeeded.
+			 */
+			cbqri_set_cbm(ctrl, saved_cbm);
+			rerr = cbqri_cc_alloc_op(ctrl,
+						 CBQRI_CC_ALLOC_CTL_OP_CONFIG_LIMIT,
+						 closid, cfg->at);
+			if (rerr < 0)
+				pr_err_ratelimited("AT-mirror revert failed (err=%d), AT halves diverged\n",
+						   rerr);
+			goto out;
+		}
+	}
+
+	/* Clear cc_block_mask before read limit to verify op works */
+	cbqri_set_cbm(ctrl, 0);
+
+	/* Perform a capacity read limit operation to verify blockmask */
+	err = cbqri_cc_alloc_op(ctrl, CBQRI_CC_ALLOC_CTL_OP_READ_LIMIT,
+				closid, cfg->at);
+	if (err < 0)
+		goto out;
+
+	/*
+	 * Read capacity blockmask and narrow to u32 to match resctrl's CBM
+	 * width. cbqri_probe_cc() rejects ncblks > 32 so the upper bits are
+	 * reserved zero.
+	 */
+	reg = ioread64(ctrl->base + CBQRI_CC_BLOCK_MASK_OFF);
+	if (lower_32_bits(reg) != cfg->cbm) {
+		pr_err_ratelimited("CBM verify mismatch (reg=%llx != cbm=%llx)\n",
+				   reg, cfg->cbm);
+		err = -EIO;
+	}
+
+out:
+	mutex_unlock(&ctrl->lock);
+	return err;
+}
+
+/*
+ * Fetch the CBM configured for closid on access-type half at by issuing
+ * READ_LIMIT. cc_block_mask is zeroed first so that an op which reports
+ * SUCCESS without refreshing staging leaves 0, not stale data, in cbm_out.
+ */
+int cbqri_read_cache_config(struct cbqri_controller *ctrl, u32 closid,
+			    enum cbqri_at at, u32 *cbm_out)
+{
+	u64 mask;
+	int ret;
+
+	mutex_lock(&ctrl->lock);
+	cbqri_set_cbm(ctrl, 0);
+	ret = cbqri_cc_alloc_op(ctrl, CBQRI_CC_ALLOC_CTL_OP_READ_LIMIT, closid, at);
+	if (!ret) {
+		/*
+		 * cc_block_mask is 64 bits wide but resctrl's CBM is a u32.
+		 * cbqri_probe_cc() rejects ncblks > 32, so only the low half
+		 * can be populated and the narrowing is made explicit here.
+		 */
+		mask = ioread64(ctrl->base + CBQRI_CC_BLOCK_MASK_OFF);
+		*cbm_out = lower_32_bits(mask);
+	}
+	mutex_unlock(&ctrl->lock);
+	return ret;
+}
+
+/*
+ * Apply a per-RCID update to one field (Rbwb or Mweight) of bc_bw_alloc.
+ * bc_bw_alloc packs both fields, so both are seeded from the software
+ * caches before CONFIG_LIMIT and the untouched field never inherits
+ * stale staging data from a prior op. The result is verified with
+ * READ_LIMIT and the cache is updated only once hardware holds val.
+ *
+ * Caller must hold ctrl->lock. Returns 0, -EINVAL when closid is not
+ * below ctrl->rcid_count, or -EIO on timeout, op failure or mismatch.
+ */
+static int cbqri_apply_bc_field(struct cbqri_controller *ctrl, u32 closid,
+				enum cbqri_bc_field field, u64 val)
+{
+	u64 rbwb, mweight, readback;
+	int ret;
+
+	lockdep_assert_held(&ctrl->lock);
+
+	if (closid >= ctrl->rcid_count)
+		return -EINVAL;
+
+	rbwb = ctrl->rbwb_cache[closid];
+	mweight = ctrl->mweight_cache[closid];
+	if (field == CBQRI_BC_FIELD_RBWB)
+		rbwb = val;
+	else
+		mweight = val;
+
+	/*
+	 * Wait for BUSY=0 before staging: a read-modify-write of bc_bw_alloc
+	 * while an op is in flight can corrupt the unmodified field.
+	 */
+	if (cbqri_wait_busy_flag(ctrl, CBQRI_BC_ALLOC_CTL_OFF, NULL) < 0) {
+		pr_err_ratelimited("BUSY timeout before staging bc_bw_alloc\n");
+		return -EIO;
+	}
+
+	cbqri_set_bc_bw_alloc(ctrl, rbwb, mweight);
+
+	ret = cbqri_bc_alloc_op(ctrl, CBQRI_BC_ALLOC_CTL_OP_CONFIG_LIMIT, closid);
+	if (ret < 0)
+		return ret;
+
+	/*
+	 * Stage ~val, which never equals val within the field width, so a
+	 * READ_LIMIT that succeeds without refreshing staging is caught.
+	 */
+	if (field == CBQRI_BC_FIELD_RBWB)
+		cbqri_set_rbwb(ctrl, ~val);
+	else
+		cbqri_set_mweight(ctrl, ~val);
+
+	ret = cbqri_bc_alloc_op(ctrl, CBQRI_BC_ALLOC_CTL_OP_READ_LIMIT, closid);
+	if (ret < 0)
+		return ret;
+
+	readback = (field == CBQRI_BC_FIELD_RBWB) ?
+		   cbqri_get_rbwb(ctrl) : cbqri_get_mweight(ctrl);
+	if (readback != val) {
+		pr_err_ratelimited("BC field verify mismatch (reg=0x%llx != val=%llu)\n",
+				   readback, val);
+		return -EIO;
+	}
+
+	/* Hardware confirmed to hold val. Update the authoritative cache. */
+	if (field == CBQRI_BC_FIELD_RBWB)
+		ctrl->rbwb_cache[closid] = rbwb;
+	else
+		ctrl->mweight_cache[closid] = mweight;
+
+	return 0;
+}
+
+/*
+ * Program a new Rbwb for closid. With check_sum set, the update is refused
+ * when it would push sum(Rbwb) over MRBWB (CBQRI section 4.5). Coordinated
+ * init/reset walks pass check_sum=false because their intermediate sums
+ * may transiently exceed MRBWB.
+ */
+int cbqri_apply_rbwb(struct cbqri_controller *ctrl, u32 closid,
+		     u64 rbwb, bool check_sum)
+{
+	u64 total = rbwb;
+	int err;
+	u32 rcid;
+
+	if (rbwb > U16_MAX)
+		return -EINVAL;
+
+	mutex_lock(&ctrl->lock);
+
+	if (!check_sum || !rbwb)
+		goto apply;
+
+	for (rcid = 0; rcid < ctrl->rcid_count; rcid++) {
+		if (rcid != closid)
+			total += ctrl->rbwb_cache[rcid];
+	}
+	if (total > ctrl->bc.mrbwb) {
+		pr_err_ratelimited("RBWB sum %llu exceeds MRBWB %u\n",
+				   total, ctrl->bc.mrbwb);
+		err = -EINVAL;
+		goto unlock;
+	}
+
+apply:
+	err = cbqri_apply_bc_field(ctrl, closid, CBQRI_BC_FIELD_RBWB, rbwb);
+unlock:
+	mutex_unlock(&ctrl->lock);
+	return err;
+}
+
+int cbqri_apply_mweight_config(struct cbqri_controller *ctrl, u32 closid,
+			       u64 mweight)
+{
+	const u64 mweight_max = FIELD_MAX(CBQRI_CONTROL_REGISTERS_MWEIGHT_MASK);
+	int err = -EINVAL;
+
+	if (mweight <= mweight_max) {
+		mutex_lock(&ctrl->lock);
+		err = cbqri_apply_bc_field(ctrl, closid, CBQRI_BC_FIELD_MWEIGHT, mweight);
+		mutex_unlock(&ctrl->lock);
+	}
+	return err;
+}
+
+/*
+ * Read the Rbwb (reserved bandwidth blocks) for closid via READ_LIMIT.
+ */
+int cbqri_read_rbwb(struct cbqri_controller *ctrl, u32 closid, u64 *rbwb_out)
+{
+	u8 mweight_sentinel;
+	int err;
+
+	mutex_lock(&ctrl->lock);
+
+	/*
+	 * Derive the Mweight sentinel from the cache under ctrl->lock so a
+	 * concurrent apply cannot make it stale. A silent READ_LIMIT no-op
+	 * leaves the sentinel staged, while a real read overwrites it.
+	 */
+	mweight_sentinel = ~ctrl->mweight_cache[closid];
+	cbqri_set_bc_bw_alloc(ctrl, ctrl->rbwb_cache[closid], mweight_sentinel);
+	err = cbqri_bc_alloc_op(ctrl, CBQRI_BC_ALLOC_CTL_OP_READ_LIMIT, closid);
+	if (err == 0) {
+		if (cbqri_get_mweight(ctrl) == mweight_sentinel) {
+			pr_err_ratelimited("Rbwb READ_LIMIT did not update staging\n");
+			err = -EIO;
+		} else {
+			*rbwb_out = cbqri_get_rbwb(ctrl);
+		}
+	}
+	mutex_unlock(&ctrl->lock);
+	return err;
+}
+
+/*
+ * Read the Mweight (opportunistic weight) for closid via READ_LIMIT.
+ */
+int cbqri_read_mweight(struct cbqri_controller *ctrl, u32 closid, u64 *mweight_out)
+{
+	u16 rbwb_sentinel;
+	int err;
+
+	mutex_lock(&ctrl->lock);
+
+	/* Derive the Rbwb sentinel under ctrl->lock so it cannot go stale. */
+	rbwb_sentinel = ~ctrl->rbwb_cache[closid];
+
+	/* A silent READ_LIMIT no-op leaves the sentinel staged in Rbwb. */
+	cbqri_set_bc_bw_alloc(ctrl, rbwb_sentinel, ctrl->mweight_cache[closid]);
+	err = cbqri_bc_alloc_op(ctrl, CBQRI_BC_ALLOC_CTL_OP_READ_LIMIT, closid);
+	if (err == 0) {
+		if (cbqri_get_rbwb(ctrl) == rbwb_sentinel) {
+			pr_err_ratelimited("Mweight READ_LIMIT did not update staging\n");
+			err = -EIO;
+		} else {
+			*mweight_out = cbqri_get_mweight(ctrl);
+		}
+	}
+	mutex_unlock(&ctrl->lock);
+	return err;
+}
+
+static int cbqri_probe_feature(struct cbqri_controller *ctrl, int reg_offset,
+			       int operation, int evt_id, int *status,
+			       bool *access_type_supported)
+{
+	const u64 active_mask = CBQRI_CONTROL_REGISTERS_OP_MASK |
+				CBQRI_CONTROL_REGISTERS_AT_MASK |
+				CBQRI_CONTROL_REGISTERS_RCID_MASK |
+				CBQRI_MON_CTL_EVT_ID_MASK;
+	u64 reg, saved_reg;
+	int at;
+
+	/*
+	 * Default the output to false so the status==0 (feature not
+	 * implemented) path returns a deterministic value to the caller
+	 * rather than leaving an uninitialized bool. mon_ctl probes pass
+	 * NULL: the register has no AT field, so the AT probe is skipped.
+	 */
+	if (access_type_supported)
+		*access_type_supported = false;
+
+	/* Keep the initial register value to preserve the WPRI fields */
+	reg = ioread64(ctrl->base + reg_offset);
+	saved_reg = reg;
+
+	/* Drain any in-flight firmware op before issuing our own write. */
+	if (cbqri_wait_busy_flag(ctrl, reg_offset, &saved_reg) < 0) {
+		pr_err("BUSY timeout before probe operation\n");
+		return -EIO;
+	}
+
+	/*
+	 * Execute the requested operation with the active fields
+	 * (OP/AT/RCID/EVT_ID) cleared, then set OP and, for mon_ctl, the
+	 * probe-safe evt_id. WPRI bits outside active_mask carry over from
+	 * saved_reg. alloc_ctl callers pass evt_id 0.
+	 */
+	reg = (saved_reg & ~active_mask) |
+	      FIELD_PREP(CBQRI_CONTROL_REGISTERS_OP_MASK, operation) |
+	      FIELD_PREP(CBQRI_MON_CTL_EVT_ID_MASK, evt_id);
+	iowrite64(reg, ctrl->base + reg_offset);
+	if (cbqri_wait_busy_flag(ctrl, reg_offset, &reg) < 0) {
+		pr_err_ratelimited("BUSY timeout during operation\n");
+		return -EIO;
+	}
+
+	/* Get the operation status */
+	*status = FIELD_GET(CBQRI_CONTROL_REGISTERS_STATUS_MASK, reg);
+
+	/*
+	 * Probe AT support only on alloc_ctl registers (mon_ctl has no AT
+	 * field, so access_type_supported is NULL there). Skipped when the
+	 * register is unimplemented (status stays 0).
+	 */
+	if (access_type_supported && *status != 0) {
+		/*
+		 * Re-issue operation with AT=CODE so the controller
+		 * latches AT=CODE on supported hardware (or resets it to 0
+		 * on hardware that doesn't). OP must be a defined CBQRI op
+		 * here. OP=0 is a no-op and would silently disable CDP.
+		 */
+		reg = (saved_reg & ~active_mask) |
+		      FIELD_PREP(CBQRI_CONTROL_REGISTERS_OP_MASK, operation) |
+		      FIELD_PREP(CBQRI_CONTROL_REGISTERS_AT_MASK,
+				 CBQRI_CONTROL_REGISTERS_AT_CODE);
+		iowrite64(reg, ctrl->base + reg_offset);
+		if (cbqri_wait_busy_flag(ctrl, reg_offset, &reg) < 0) {
+			pr_err("BUSY timeout setting AT field\n");
+			return -EIO;
+		}
+
+		/*
+		 * If the AT field value has been reset to zero,
+		 * then the AT support is not present
+		 */
+		at = FIELD_GET(CBQRI_CONTROL_REGISTERS_AT_MASK, reg);
+		if (at == CBQRI_CONTROL_REGISTERS_AT_CODE)
+			*access_type_supported = true;
+	}
+
+	/*
+	 * Restore the original register value.
+	 * Clear OP to avoid re-triggering the probe op.
+	 */
+	saved_reg &= ~CBQRI_CONTROL_REGISTERS_OP_MASK;
+	iowrite64(saved_reg, ctrl->base + reg_offset);
+	if (cbqri_wait_busy_flag(ctrl, reg_offset, NULL) < 0) {
+		pr_err("BUSY timeout restoring register value\n");
+		return -EIO;
+	}
+
+	return 0;
+}
+
+static int cbqri_probe_cc(struct cbqri_controller *ctrl)
+{
+	int err, status;
+	u64 reg;
+
+	reg = ioread64(ctrl->base + CBQRI_CC_CAPABILITIES_OFF);
+	if (reg == 0)
+		return -ENODEV;
+
+	ctrl->ver_minor = FIELD_GET(CBQRI_CC_CAPABILITIES_VER_MINOR_MASK, reg);
+	ctrl->ver_major = FIELD_GET(CBQRI_CC_CAPABILITIES_VER_MAJOR_MASK, reg);
+	ctrl->cc.ncblks = FIELD_GET(CBQRI_CC_CAPABILITIES_NCBLKS_MASK, reg);
+
+	pr_debug("version=%d.%d ncblks=%d cache_level=%d\n",
+		 ctrl->ver_major, ctrl->ver_minor,
+		 ctrl->cc.ncblks, ctrl->cache.cache_level);
+
+	/*
+	 * NCBLKS == 0 would divide-by-zero in the schemata math while
+	 * ctrl->lock is held.
+	 */
+	if (!ctrl->cc.ncblks) {
+		pr_warn("CC at %pa has 0 capacity blocks, skipping\n",
+			&ctrl->addr);
+		return -ENODEV;
+	}
+
+	if (ctrl->cc.ncblks > 32) {
+		pr_warn("CC at %pa has ncblks=%u > 32 (resctrl CBM is u32), skipping\n",
+			&ctrl->addr, ctrl->cc.ncblks);
+		return -ENODEV;
+	}
+
+	/*
+	 * Resolve cache_size via cacheinfo. cpus_read_lock satisfies
+	 * lockdep_assert_cpus_held() inside get_cpu_cacheinfo_level(). If
+	 * every cpu_mask member is offline, cache_size stays 0 and the
+	 * controller cannot back occupancy monitoring.
+	 */
+	cpus_read_lock();
+	if (!ctrl->cache.cache_size) {
+		int cpu = cpumask_first_and(&ctrl->cache.cpu_mask, cpu_online_mask);
+
+		if (cpu < nr_cpu_ids) {
+			struct cacheinfo *ci;
+
+			ci = get_cpu_cacheinfo_level(cpu, ctrl->cache.cache_level);
+			if (ci)
+				ctrl->cache.cache_size = ci->size;
+		}
+	}
+	cpus_read_unlock();
+
+	/* Probe monitoring features */
+	err = cbqri_probe_feature(ctrl, CBQRI_CC_MON_CTL_OFF,
+				  CBQRI_CC_MON_CTL_OP_CONFIG_EVENT,
+				  CBQRI_CC_EVT_ID_NONE, &status, NULL);
+	if (err)
+		return err;
+
+	if (status == CBQRI_MON_CTL_STATUS_SUCCESS) {
+		/*
+		 * Occupancy is reported to userspace in bytes, computed as
+		 * cache_size * counter / ncblks by the resctrl glue. If
+		 * cacheinfo has no cache_size, leave mon_capable false so
+		 * the file is not exposed at all rather than silently
+		 * returning 0.
+		 */
+		if (!ctrl->cache.cache_size)
+			pr_debug("CC @%pa: cache_size unknown, occupancy monitoring disabled\n",
+				 &ctrl->addr);
+		else
+			ctrl->mon_capable = true;
+	}
+
+	/* Probe allocation features */
+	err = cbqri_probe_feature(ctrl, CBQRI_CC_ALLOC_CTL_OFF,
+				  CBQRI_CC_ALLOC_CTL_OP_READ_LIMIT, 0,
+				  &status, &ctrl->cc.supports_alloc_at_code);
+	if (err)
+		return err;
+
+	if (status == CBQRI_CC_ALLOC_CTL_STATUS_SUCCESS)
+		ctrl->alloc_capable = true;
+
+	return 0;
+}
+
+static int cbqri_probe_bc(struct cbqri_controller *ctrl)
+{
+	int err, status;
+	u32 i;
+	u64 reg;
+
+	reg = ioread64(ctrl->base + CBQRI_BC_CAPABILITIES_OFF);
+	if (reg == 0)
+		return -ENODEV;
+
+	ctrl->ver_minor = FIELD_GET(CBQRI_BC_CAPABILITIES_VER_MINOR_MASK, reg);
+	ctrl->ver_major = FIELD_GET(CBQRI_BC_CAPABILITIES_VER_MAJOR_MASK, reg);
+	ctrl->bc.nbwblks = FIELD_GET(CBQRI_BC_CAPABILITIES_NBWBLKS_MASK, reg);
+	ctrl->bc.mrbwb = FIELD_GET(CBQRI_BC_CAPABILITIES_MRBWB_MASK, reg);
+
+	if (!ctrl->bc.nbwblks) {
+		pr_err("bandwidth controller has nbwblks=0\n");
+		return -EINVAL;
+	}
+
+	if (!ctrl->rcid_count) {
+		pr_err("bandwidth controller has rcid_count=0\n");
+		return -EINVAL;
+	}
+
+	/*
+	 * Reset seeds RCID 0 with mrbwb - (rcid_count - 1). Reject a
+	 * controller that would underflow that arithmetic.
+	 */
+	if (ctrl->bc.mrbwb < ctrl->rcid_count) {
+		pr_err("bandwidth controller has mrbwb=%u < rcid_count=%u, rejecting\n",
+		       ctrl->bc.mrbwb, ctrl->rcid_count);
+		return -EINVAL;
+	}
+
+	pr_debug("version=%d.%d nbwblks=%d mrbwb=%d\n",
+		 ctrl->ver_major, ctrl->ver_minor,
+		 ctrl->bc.nbwblks, ctrl->bc.mrbwb);
+
+	/* Probe monitoring features */
+	err = cbqri_probe_feature(ctrl, CBQRI_BC_MON_CTL_OFF,
+				  CBQRI_BC_MON_CTL_OP_READ_COUNTER, 0,
+				  &status, NULL);
+	if (err)
+		return err;
+
+	if (status == CBQRI_MON_CTL_STATUS_SUCCESS)
+		ctrl->mon_capable = true;
+
+	/* Probe allocation features */
+	err = cbqri_probe_feature(ctrl, CBQRI_BC_ALLOC_CTL_OFF,
+				  CBQRI_BC_ALLOC_CTL_OP_READ_LIMIT, 0,
+				  &status, &ctrl->bc.supports_alloc_at_code);
+	if (err)
+		return err;
+
+	if (status == CBQRI_BC_ALLOC_CTL_STATUS_SUCCESS) {
+		ctrl->alloc_capable = true;
+
+		/*
+		 * Per-RCID Rbwb and Mweight caches. The caches feed both
+		 * fields of bc_bw_alloc on every apply so the staging
+		 * register reflects authoritative software state, sidestepping
+		 * silent READ_LIMIT no-op corruption of the unmodified field.
+		 * rbwb_cache also lets cbqri_apply_rbwb() validate
+		 * sum(Rbwb) <= MRBWB without re-reading every RCID.
+		 */
+		ctrl->rbwb_cache = kcalloc(ctrl->rcid_count,
+					   sizeof(*ctrl->rbwb_cache),
+					   GFP_KERNEL);
+		if (!ctrl->rbwb_cache)
+			return -ENOMEM;
+
+		ctrl->mweight_cache = kcalloc(ctrl->rcid_count,
+					      sizeof(*ctrl->mweight_cache),
+					      GFP_KERNEL);
+		if (!ctrl->mweight_cache) {
+			kfree(ctrl->rbwb_cache);
+			ctrl->rbwb_cache = NULL;
+			return -ENOMEM;
+		}
+
+		/*
+		 * Seed mweight to the maximum, matching the resctrl-side
+		 * MB_WGHT default. cbqri_apply_bc_field() reads both halves
+		 * of bc_bw_alloc from the caches on every CONFIG_LIMIT, so
+		 * the first MB_MIN domain init (which writes Rbwb) would
+		 * otherwise commit Mweight=0 to every RCID. Per CBQRI 4.5
+		 * a weight of 0 implies the configured limit is a hard
+		 * limit and the use of unused or non-reserved bandwidth
+		 * is not allowed, which starves every RCID of opportunistic
+		 * bandwidth until the subsequent MB_WGHT domain init
+		 * catches up.
+		 */
+		for (i = 0; i < ctrl->rcid_count; i++)
+			ctrl->mweight_cache[i] =
+				FIELD_MAX(CBQRI_CONTROL_REGISTERS_MWEIGHT_MASK);
+	}
+
+	return 0;
+}
+
+static int cbqri_probe_controller(struct cbqri_controller *ctrl)
+{
+	int err;
+
+	pr_debug("controller info: type=%d addr=%pa size=%pa max-rcid=%u max-mcid=%u\n",
+		 ctrl->type, &ctrl->addr, &ctrl->size,
+		 ctrl->rcid_count, ctrl->mcid_count);
+
+	if (!ctrl->addr) {
+		pr_warn("controller has invalid addr=0x0, skipping\n");
+		return -EINVAL;
+	}
+
+	if (ctrl->size < CBQRI_CTRL_MIN_REG_SPAN) {
+		pr_warn("controller at %pa: size %pa < minimum 0x%x, skipping\n",
+			&ctrl->addr, &ctrl->size, CBQRI_CTRL_MIN_REG_SPAN);
+		return -EINVAL;
+	}
+
+	if (!request_mem_region(ctrl->addr, ctrl->size, "cbqri_controller")) {
+		pr_err("request_mem_region failed for %pa\n", &ctrl->addr);
+		return -EBUSY;
+	}
+
+	ctrl->base = ioremap(ctrl->addr, ctrl->size);
+	if (!ctrl->base) {
+		pr_err("ioremap failed for %pa\n", &ctrl->addr);
+		err = -ENOMEM;
+		goto err_release;
+	}
+
+	switch (ctrl->type) {
+	case CBQRI_CONTROLLER_TYPE_CAPACITY:
+		err = cbqri_probe_cc(ctrl);
+		break;
+	case CBQRI_CONTROLLER_TYPE_BANDWIDTH:
+		err = cbqri_probe_bc(ctrl);
+		break;
+	default:
+		pr_err("unknown controller type %d\n", ctrl->type);
+		err = -ENODEV;
+		break;
+	}
+
+	if (err)
+		goto err_iounmap;
+
+	return 0;
+
+err_iounmap:
+	iounmap(ctrl->base);
+	ctrl->base = NULL;
+err_release:
+	release_mem_region(ctrl->addr, ctrl->size);
+	return err;
+}
+
+/*
+ * Configure the Occupancy event on every MCID of a capacity controller,
+ * so later READ_COUNTER ops only snapshot the live counter instead of
+ * re-configuring the slot. The resctrl glue calls this once per CC from
+ * its cpuhp online path for the L3 monitoring domain. Stops at first error.
+ */
+int cbqri_init_mon_counters(struct cbqri_controller *ctrl)
+{
+	int mcid, ret = 0;
+
+	/* ctrl->lock is taken per MCID, not across the whole walk. */
+	for (mcid = 0; !ret && mcid < ctrl->mcid_count; mcid++) {
+		mutex_lock(&ctrl->lock);
+		ret = cbqri_mon_op(ctrl, CBQRI_CC_MON_CTL_OFF,
+				   CBQRI_CC_MON_CTL_OP_CONFIG_EVENT,
+				   mcid, CBQRI_CC_EVT_ID_OCCUPANCY, NULL);
+		mutex_unlock(&ctrl->lock);
+	}
+
+	return ret;
+}
+
+/*
+ * Delta between two 62-bit BC counter samples. Both inputs must already
+ * be masked to CBQRI_BC_MON_CTR_VAL_CTR_MASK. The difference is moved
+ * into the top 62 bits of a u64 and back again, which discards the
+ * borrow so a single wrap (cur < prev) still yields the right delta.
+ * Only one wrap can be recovered here; the caller is expected to use
+ * the hardware OVF bit (CBQRI 4.3) for anything beyond that.
+ */
+u64 cbqri_bc_mon_overflow(u64 prev_ctr, u64 cur_ctr)
+{
+	const unsigned int spare_bits = 64 - 62;
+	u64 delta = (cur_ctr - prev_ctr) << spare_bits;
+
+	return delta >> spare_bits;
+}
+
+/*
+ * One-time setup of bandwidth monitoring on bc: allocate one accumulator
+ * slot per MCID, then configure every MCID for the TOTAL_READ_WRITE
+ * event. A repeat call after success is a no-op that returns 0.
+ *
+ * Concurrent calls on the same BC are not serialized here; the caller
+ * must do that (cbqri_resctrl uses cbqri_domain_list_lock for this
+ * purpose).
+ */
+int cbqri_init_bc_mon_counters(struct cbqri_controller *bc)
+{
+	int mcid, ret = 0;
+
+	if (bc->mbm_total_states)
+		return 0;
+
+	bc->mbm_total_states = kcalloc(bc->mcid_count,
+				       sizeof(*bc->mbm_total_states),
+				       GFP_KERNEL);
+	if (!bc->mbm_total_states)
+		return -ENOMEM;
+
+	for (mcid = 0; !ret && mcid < bc->mcid_count; mcid++) {
+		mutex_lock(&bc->lock);
+		ret = cbqri_mon_op(bc, CBQRI_BC_MON_CTL_OFF,
+				   CBQRI_BC_MON_CTL_OP_CONFIG_EVENT,
+				   mcid, CBQRI_BC_EVT_ID_TOTAL_READ_WRITE, NULL);
+		mutex_unlock(&bc->lock);
+	}
+	/* Arming failed: free the accumulator so a later call retries. */
+	if (ret) {
+		kfree(bc->mbm_total_states);
+		bc->mbm_total_states = NULL;
+	}
+	return ret;
+}
+
+/*
+ * Return the sole mon-capable BC, or NULL when there is none or more than
+ * one. BC counters can only accurately surface as L3 mbm_total_bytes if
+ * every memory request flows through the same BC.
+ */
+struct cbqri_controller *cbqri_find_only_mon_bc(void)
+{
+	struct cbqri_controller *pos, *found = NULL;
+
+	list_for_each_entry(pos, &cbqri_controllers, list) {
+		if (pos->type != CBQRI_CONTROLLER_TYPE_BANDWIDTH ||
+		    !pos->mon_capable)
+			continue;
+		if (found)
+			return NULL;
+		found = pos;
+	}
+
+	return found;
+}
+
+void cbqri_controller_destroy(struct cbqri_controller *ctrl)
+{
+	/*
+	 * cbqri_probe_controller() unmaps, releases and clears ctrl->base on
+	 * its error paths, so only undo the mapping after a successful probe.
+	 */
+	if (ctrl->base) {
+		iounmap(ctrl->base);
+		release_mem_region(ctrl->addr, ctrl->size);
+	}
+	kfree(ctrl->mbm_total_states);
+	kfree(ctrl->mweight_cache);
+	kfree(ctrl->rbwb_cache);
+	mutex_destroy(&ctrl->lock);
+	kfree(ctrl);
+}
+
+/*
+ * Undo the n most recent successful riscv_cbqri_register_controller()
+ * calls by removing and destroying controllers from the tail of
+ * cbqri_controllers. Discovery layers call this when a later table entry
+ * is malformed and the whole parse has to be abandoned.
+ *
+ * No lock is taken: the intended callers are boot-time discovery paths
+ * (ACPI acpi_arch_init, future DT) that run single-threaded before
+ * late_initcall.
+ */
+void riscv_cbqri_unregister_last(unsigned int n)
+{
+	struct cbqri_controller *last;
+
+	/* Stop early if fewer than n controllers are registered. */
+	while (n && !list_empty(&cbqri_controllers)) {
+		last = list_last_entry(&cbqri_controllers,
+				       struct cbqri_controller, list);
+		list_del(&last->list);
+		cbqri_controller_destroy(last);
+		n--;
+	}
+}
+
+/*
+ * Allocate, populate, and add to cbqri_controllers a fresh controller
+ * descriptor based on info supplied by a discovery layer (ACPI RQSC,
+ * future DT). Resolves the cpumask via PPTT (capacity) so callers do
+ * not need to know about cacheinfo topology.
+ */
+int riscv_cbqri_register_controller(const struct cbqri_controller_info *info)
+{
+	struct cbqri_controller *ctrl;
+	int err;
+
+	if (!info->addr) {
+		pr_warn("skipping controller with invalid addr=0x0\n");
+		return -EINVAL;
+	}
+
+	ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL);
+	if (!ctrl)
+		return -ENOMEM;
+
+	mutex_init(&ctrl->lock);
+
+	ctrl->addr = info->addr;
+	ctrl->size = info->size;
+	ctrl->type = info->type;
+	ctrl->rcid_count = info->rcid_count;
+	ctrl->mcid_count = info->mcid_count;
+
+	/*
+	 * SRMCFG encodes RCID in 12 bits.  ACPI's acpi_parse_rqsc() already
+	 * caps info->rcid_count at CBQRI_MAX_RCID (1024) so this is unreachable
+	 * today, but a future DT discovery path or a malformed firmware table
+	 * routed through a different validator could bypass that ceiling.
+	 * Catch the violation here rather than silently truncating in every
+	 * FIELD_PREP(SRMCFG_RCID_MASK, closid) on the schedule-in fast path.
+	 */
+	if (WARN_ON_ONCE(ctrl->rcid_count > FIELD_MAX(SRMCFG_RCID_MASK) + 1)) {
+		cbqri_controller_destroy(ctrl);
+		return -EINVAL;
+	}
+
+	/*
+	 * mon_ctl encodes MCID in 12 bits. acpi_parse_rqsc() caps
+	 * info->mcid_count at CBQRI_MAX_MCID (1024), but a future discovery
+	 * path could bypass that. Reject an out-of-range count so
+	 * cbqri_init_mon_counters() iterates a trusted bound and no MCID
+	 * aliases another slot through FIELD_MODIFY(MON_CTL_MCID_MASK).
+	 */
+	if (WARN_ON_ONCE(ctrl->mcid_count > FIELD_MAX(CBQRI_MON_CTL_MCID_MASK) + 1)) {
+		cbqri_controller_destroy(ctrl);
+		return -EINVAL;
+	}
+
+	switch (info->type) {
+	case CBQRI_CONTROLLER_TYPE_CAPACITY: {
+		int level;
+
+		ctrl->cache.cache_id = info->cache_id;
+
+		level = find_acpi_cache_level_from_id(info->cache_id);
+		if (level < 0) {
+			pr_warn("Failed to resolve cache level for cache id 0x%x (%d), skipping\n",
+				info->cache_id, level);
+			cbqri_controller_destroy(ctrl);
+			return level;
+		}
+		ctrl->cache.cache_level = level;
+
+		/*
+		 * cache_size stays at 0 here. cacheinfo is not populated
+		 * yet at acpi_arch_init time. Filled lazily during probe
+		 * via get_cpu_cacheinfo_level().
+		 */
+
+		err = acpi_pptt_get_cpumask_from_cache_id(info->cache_id,
+							  &ctrl->cache.cpu_mask);
+		if (err) {
+			pr_warn("Failed to get cpumask for cache id 0x%x (%d), skipping\n",
+				info->cache_id, err);
+			cbqri_controller_destroy(ctrl);
+			return err;
+		}
+		break;
+	}
+	case CBQRI_CONTROLLER_TYPE_BANDWIDTH: {
+		struct cbqri_controller *other;
+		int node_id;
+
+		ctrl->mem.prox_dom = info->prox_dom;
+		node_id = pxm_to_node(info->prox_dom);
+		if (node_id == NUMA_NO_NODE) {
+			pr_warn("controller at %pa: proximity domain %u has no NUMA node, skipping\n",
+				&ctrl->addr, info->prox_dom);
+			cbqri_controller_destroy(ctrl);
+			return -ENODEV;
+		}
+		/*
+		 * cbqri_resctrl_dom tracks a single hw_ctrl per domain, so a
+		 * second BC sharing the same proximity domain would be
+		 * silently dropped when the resctrl glue resolves the cpu to
+		 * an existing domain. Reject the duplicate at register time
+		 * to keep the failure mode visible.
+		 */
+		list_for_each_entry(other, &cbqri_controllers, list) {
+			if (other->type != CBQRI_CONTROLLER_TYPE_BANDWIDTH)
+				continue;
+			if (other->mem.prox_dom != info->prox_dom)
+				continue;
+			pr_warn("controller at %pa: proximity domain %u already claimed by %pa, skipping\n",
+				&ctrl->addr, info->prox_dom, &other->addr);
+			cbqri_controller_destroy(ctrl);
+			return -EEXIST;
+		}
+		cpumask_copy(&ctrl->mem.cpu_mask, cpumask_of_node(node_id));
+		break;
+	}
+	default:
+		pr_warn("controller at %pa: unknown type %u, skipping\n",
+			&ctrl->addr, info->type);
+		cbqri_controller_destroy(ctrl);
+		return -EINVAL;
+	}
+
+	err = cbqri_probe_controller(ctrl);
+	if (err) {
+		cbqri_controller_destroy(ctrl);
+		return err;
+	}
+
+	list_add_tail(&ctrl->list, &cbqri_controllers);
+	return 0;
+}
diff --git a/drivers/resctrl/cbqri_internal.h b/drivers/resctrl/cbqri_internal.h
new file mode 100644
index 00000000000000..68a40f846f4039
--- /dev/null
+++ b/drivers/resctrl/cbqri_internal.h
@@ -0,0 +1,247 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef _DRIVERS_RESCTRL_CBQRI_INTERNAL_H
+#define _DRIVERS_RESCTRL_CBQRI_INTERNAL_H
+
+#include <linux/bitfield.h>
+#include <linux/riscv_cbqri.h>
+#include <linux/cpumask.h>
+#include <linux/list.h>
+#include <linux/mutex.h>
+#include <linux/types.h>
+
+/*
+ * Capacity Controller (CC) and Bandwidth Controller (BC) MMIO register offsets.
+ */
+#define CBQRI_CC_CAPABILITIES_OFF 0
+#define CBQRI_CC_MON_CTL_OFF      8
+#define CBQRI_CC_MON_CTL_VAL_OFF 16
+#define CBQRI_CC_ALLOC_CTL_OFF   24
+#define CBQRI_CC_BLOCK_MASK_OFF  32
+
+#define CBQRI_BC_CAPABILITIES_OFF 0
+#define CBQRI_BC_MON_CTL_OFF      8
+#define CBQRI_BC_MON_CTR_VAL_OFF 16
+#define CBQRI_BC_ALLOC_CTL_OFF   24
+#define CBQRI_BC_BW_ALLOC_OFF    32
+
+/*
+ * Smallest MMIO span the driver actually accesses: highest defined
+ * register offset (0x20) plus the 8-byte register width. Used by
+ * cbqri_probe_controller() to reject undersized firmware-supplied
+ * mappings before request_mem_region/ioremap, so a u64 access at
+ * BLOCK_MASK does not walk past the end of the mapping.
+ */
+#define CBQRI_CTRL_MIN_REG_SPAN  0x28u
+
+#define CBQRI_CC_CAPABILITIES_VER_MINOR_MASK  GENMASK_ULL(3, 0)
+#define CBQRI_CC_CAPABILITIES_VER_MAJOR_MASK  GENMASK_ULL(7, 4)
+#define CBQRI_CC_CAPABILITIES_NCBLKS_MASK     GENMASK_ULL(23, 8)
+
+#define CBQRI_BC_CAPABILITIES_VER_MINOR_MASK  GENMASK_ULL(3, 0)
+#define CBQRI_BC_CAPABILITIES_VER_MAJOR_MASK  GENMASK_ULL(7, 4)
+#define CBQRI_BC_CAPABILITIES_NBWBLKS_MASK    GENMASK_ULL(23, 8)
+#define CBQRI_BC_CAPABILITIES_MRBWB_MASK      GENMASK_ULL(47, 32)
+
+/*
+ * CC and BC control and mon registers are 64-bit. Keep every field mask
+ * GENMASK_ULL so FIELD_MODIFY() or ~mask on a u64 register never
+ * zero-extends a 32-bit mask and clobbers STATUS/BUSY/WPRI in bits 63:32
+ * if RV32 support is added in the future.
+ */
+#define CBQRI_CONTROL_REGISTERS_OP_MASK      GENMASK_ULL(4, 0)
+#define CBQRI_CONTROL_REGISTERS_AT_MASK      GENMASK_ULL(7, 5)
+#define CBQRI_CONTROL_REGISTERS_AT_DATA      0
+#define CBQRI_CONTROL_REGISTERS_AT_CODE      1
+#define CBQRI_CONTROL_REGISTERS_RCID_MASK    GENMASK_ULL(19, 8)
+#define CBQRI_CONTROL_REGISTERS_STATUS_MASK  GENMASK_ULL(38, 32)
+#define CBQRI_CONTROL_REGISTERS_BUSY_MASK    GENMASK_ULL(39, 39)
+#define CBQRI_CONTROL_REGISTERS_RBWB_MASK    GENMASK_ULL(15, 0)
+#define CBQRI_CONTROL_REGISTERS_MWEIGHT_MASK GENMASK_ULL(27, 20)
+
+#define CBQRI_CC_ALLOC_CTL_OP_CONFIG_LIMIT 1
+#define CBQRI_CC_ALLOC_CTL_OP_READ_LIMIT   2
+#define CBQRI_CC_ALLOC_CTL_STATUS_SUCCESS  1
+
+#define CBQRI_BC_ALLOC_CTL_OP_CONFIG_LIMIT 1
+#define CBQRI_BC_ALLOC_CTL_OP_READ_LIMIT   2
+#define CBQRI_BC_ALLOC_CTL_STATUS_SUCCESS  1
+
+#define CBQRI_CC_MON_CTL_OP_CONFIG_EVENT 1
+#define CBQRI_CC_MON_CTL_OP_READ_COUNTER 2
+
+#define CBQRI_BC_MON_CTL_OP_CONFIG_EVENT 1
+#define CBQRI_BC_MON_CTL_OP_READ_COUNTER 2
+
+/* Bandwidth usage monitoring event IDs (CBQRI spec Table 10) */
+#define CBQRI_BC_EVT_ID_TOTAL_READ_WRITE  1
+
+/* bc_mon_ctr_val layout (CBQRI spec section 4.3, Figure 7) */
+#define CBQRI_BC_MON_CTR_VAL_CTR_MASK    GENMASK_ULL(61, 0)
+#define CBQRI_BC_MON_CTR_VAL_INVALID     BIT_ULL(62)
+#define CBQRI_BC_MON_CTR_VAL_OVF         BIT_ULL(63)
+
+/* mon_ctl field masks (CC and BC share an identical OP/MCID/EVT_ID/STATUS layout) */
+#define CBQRI_MON_CTL_OP_MASK        GENMASK_ULL(4, 0)
+#define CBQRI_MON_CTL_MCID_MASK      GENMASK_ULL(19, 8)
+#define CBQRI_MON_CTL_EVT_ID_MASK    GENMASK_ULL(27, 20)
+#define CBQRI_MON_CTL_STATUS_MASK    GENMASK_ULL(38, 32)
+#define CBQRI_MON_CTL_STATUS_SUCCESS 1
+
+/* Capacity usage monitoring event IDs (CBQRI spec Table 4) */
+#define CBQRI_CC_EVT_ID_NONE         0
+#define CBQRI_CC_EVT_ID_OCCUPANCY    1
+
+/* Capacity Controller hardware capabilities */
+struct riscv_cbqri_capacity_caps {
+	u16 ncblks;
+	bool supports_alloc_at_code;
+};
+
+/* Bandwidth Controller hardware capabilities */
+struct riscv_cbqri_bandwidth_caps {
+	u16 nbwblks; /* number of bandwidth blocks */
+	u16 mrbwb;   /* max reserved bw blocks */
+
+	bool supports_alloc_at_code;
+};
+
+/**
+ * struct cbqri_bc_mon_state - per-MCID software accumulator for BC bandwidth
+ * @prev_ctr: previous 62-bit hardware snapshot (already masked to CTR field)
+ * @chunks:   accumulated 64-bit byte total across hardware wraparounds
+ *
+ * Updated in resctrl_arch_rmid_read() under cbqri_controller::lock and
+ * zeroed by resctrl_arch_reset_rmid().
+ */
+struct cbqri_bc_mon_state {
+	u64 prev_ctr;
+	u64 chunks;
+};
+
+/**
+ * enum cbqri_at - capacity controller access type for CDP
+ * @CBQRI_AT_DATA: data access (CBQRI Table 1, AT=0)
+ * @CBQRI_AT_CODE: code access (CBQRI Table 1, AT=1)
+ *
+ * Selects between data and code halves on controllers that advertise
+ * supports_alloc_at_code. The resctrl glue maps from CDP_DATA / CDP_CODE
+ * to this enum at the boundary so cbqri_devices.c stays free of fs/resctrl
+ * types.
+ */
+enum cbqri_at {
+	CBQRI_AT_DATA = CBQRI_CONTROL_REGISTERS_AT_DATA,
+	CBQRI_AT_CODE = CBQRI_CONTROL_REGISTERS_AT_CODE,
+};
+
+/**
+ * struct cbqri_cc_config - desired capacity allocation state for one rcid
+ * @cbm:         capacity block mask
+ * @at:          AT half (data or code) the @cbm applies to
+ * @cdp_enabled: when false and the controller supports AT, mirror @cbm
+ *               into the other AT half so both stay in sync
+ */
+struct cbqri_cc_config {
+	u64           cbm;
+	enum cbqri_at at;
+	bool          cdp_enabled;
+};
+
+struct cbqri_controller {
+	void __iomem *base;
+	/*
+	 * Serializes the write-then-poll-busy MMIO sequences on this
+	 * controller. Each CBQRI op may busy-wait up to 1 ms on slow
+	 * firmware, so use a sleeping mutex (paired with the sleeping
+	 * readq_poll_timeout() in cbqri_wait_busy_flag()) to keep
+	 * preemption enabled, which is required for PREEMPT_RT.
+	 * All resctrl-arch entry points run in process context.
+	 */
+	struct mutex lock;
+	/*
+	 * Set by cbqri_wait_busy_flag() on BUSY timeout, cleared on the
+	 * next successful wait. Informational only, used for diagnostics.
+	 */
+	bool faulted;
+
+	int ver_major;
+	int ver_minor;
+
+	struct riscv_cbqri_bandwidth_caps bc;
+	struct riscv_cbqri_capacity_caps cc;
+
+	bool alloc_capable;
+	bool mon_capable;
+
+	phys_addr_t addr;
+	phys_addr_t size;
+	enum cbqri_controller_type type;
+	u32 rcid_count;
+	u32 mcid_count;
+
+	/*
+	 * Per-RCID cache of the most recent Rbwb / Mweight values applied
+	 * via CONFIG_LIMIT. bc_bw_alloc packs both fields into one register,
+	 * so cbqri_apply_bc_field() seeds both halves from the authoritative
+	 * cache before CONFIG_LIMIT.
+	 */
+	u16 *rbwb_cache;
+	u8  *mweight_cache;
+
+	/*
+	 * Per-MCID 64-bit software accumulator for the BC's mbm_total_bytes
+	 * event. Allocated by cbqri_init_bc_mon_counters() when this BC is
+	 * paired with an L3 monitoring domain, sized by ->mcid_count. NULL
+	 * on capacity controllers and on BCs that are not mon-paired.
+	 * Protected by ->lock along with the surrounding MMIO sequence.
+	 */
+	struct cbqri_bc_mon_state *mbm_total_states;
+
+	struct list_head list;
+
+	struct cache_controller {
+		u32 cache_level;
+		u32 cache_size; /* in bytes */
+		struct cpumask cpu_mask;
+		/* Unique Cache ID from the PPTT table's Cache Type Structure */
+		u32 cache_id;
+	} cache;
+
+	struct mem_controller {
+		/* Proximity Domain from the SRAT table's Memory Affinity Structure */
+		u32 prox_dom;
+		struct cpumask cpu_mask;
+	} mem;
+};
+
+extern struct list_head cbqri_controllers;
+
+void cbqri_controller_destroy(struct cbqri_controller *ctrl);
+
+int cbqri_apply_cache_config(struct cbqri_controller *ctrl, u32 closid,
+			     const struct cbqri_cc_config *cfg);
+
+int cbqri_read_cache_config(struct cbqri_controller *ctrl, u32 closid,
+			    enum cbqri_at at, u32 *cbm_out);
+
+int cbqri_mon_op(struct cbqri_controller *ctrl, int reg_offset,
+		 int operation, int mcid, int evt_id, u64 *out_reg);
+
+int cbqri_init_mon_counters(struct cbqri_controller *ctrl);
+
+int cbqri_apply_rbwb(struct cbqri_controller *ctrl, u32 closid,
+		     u64 rbwb, bool check_sum);
+
+int cbqri_apply_mweight_config(struct cbqri_controller *ctrl, u32 closid,
+			       u64 mweight);
+
+int cbqri_read_rbwb(struct cbqri_controller *ctrl, u32 closid, u64 *rbwb_out);
+
+int cbqri_read_mweight(struct cbqri_controller *ctrl, u32 closid, u64 *mweight_out);
+
+u64 cbqri_bc_mon_overflow(u64 prev_ctr, u64 cur_ctr);
+
+int cbqri_init_bc_mon_counters(struct cbqri_controller *bc);
+
+struct cbqri_controller *cbqri_find_only_mon_bc(void);
+
+#endif /* _DRIVERS_RESCTRL_CBQRI_INTERNAL_H */
diff --git a/drivers/resctrl/cbqri_resctrl.c b/drivers/resctrl/cbqri_resctrl.c
new file mode 100644
index 00000000000000..efd75d241122f9
--- /dev/null
+++ b/drivers/resctrl/cbqri_resctrl.c
@@ -0,0 +1,1520 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#define pr_fmt(fmt) "%s:%s: " fmt, KBUILD_MODNAME, __func__
+
+#include <linux/bitfield.h>
+#include <linux/cacheinfo.h>
+#include <linux/riscv_cbqri.h>
+#include <linux/cpu.h>
+#include <linux/cpufeature.h>
+#include <linux/cpuhotplug.h>
+#include <linux/err.h>
+#include <linux/init.h>
+#include <linux/io.h>
+#include <linux/resctrl.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+
+#include <asm/csr.h>
+#include <asm/qos.h>
+
+#include "cbqri_internal.h"
+
+struct cbqri_resctrl_res {
+	struct cbqri_controller *ctrl;
+	struct rdt_resource     resctrl_res;
+	bool                    cdp_enabled;
+};
+
+struct cbqri_resctrl_dom {
+	struct rdt_ctrl_domain  resctrl_ctrl_dom;
+	struct cbqri_controller *hw_ctrl;
+	/*
+	 * For an L3 capacity controller paired with a bandwidth controller
+	 * of matching topology, paired_bc caches that BC so mbm_total_bytes
+	 * reads / resets don't have to walk cbqri_controllers on every hit.
+	 * NULL for non-L3 domains and L3s without a paired BC.
+	 */
+	struct cbqri_controller *paired_bc;
+};
+
+static struct cbqri_resctrl_res cbqri_resctrl_resources[RDT_NUM_RESOURCES];
+
+static bool exposed_alloc_capable;
+static bool exposed_mon_capable;
+
+/* Used by resctrl_arch_system_num_rmid_idx(). Narrowed by accumulate_caps. */
+static u32 max_rmid = U32_MAX;
+
+/* Protects ctrl_domain list mutations across CPU hotplug. */
+static DEFINE_MUTEX(cbqri_domain_list_lock);
+
+static struct rdt_ctrl_domain *
+cbqri_find_ctrl_domain(struct list_head *h, int id)
+{
+	struct rdt_domain_hdr *hdr = resctrl_find_domain(h, id, NULL);
+
+	return hdr ? container_of(hdr, struct rdt_ctrl_domain, hdr) : NULL;
+}
+
+static struct rdt_l3_mon_domain *
+cbqri_find_l3_mon_domain(struct list_head *h, int id)
+{
+	struct rdt_domain_hdr *hdr = resctrl_find_domain(h, id, NULL);
+
+	return hdr ? container_of(hdr, struct rdt_l3_mon_domain, hdr) : NULL;
+}
+
+static int cbqri_apply_cache_config_dom(struct cbqri_resctrl_dom *hw_dom,
+					struct rdt_resource *r,
+					u32 closid, enum resctrl_conf_type t,
+					u64 cbm)
+{
+	struct cbqri_cc_config request = { .cbm = cbm };
+
+	/* Translate the resctrl CDP slot into the CBQRI access type. */
+	request.at = (t == CDP_CODE) ? CBQRI_AT_CODE : CBQRI_AT_DATA;
+	/* The resource's current CDP state travels with the request. */
+	request.cdp_enabled =
+		container_of(r, struct cbqri_resctrl_res, resctrl_res)->cdp_enabled;
+
+	return cbqri_apply_cache_config(hw_dom->hw_ctrl, closid, &request);
+}
+
+bool resctrl_arch_alloc_capable(void)
+{
+	return exposed_alloc_capable;
+}
+
+bool resctrl_arch_mon_capable(void)
+{
+	return exposed_mon_capable;
+}
+
+bool resctrl_arch_get_cdp_enabled(enum resctrl_res_level rid)
+{
+	if (rid == RDT_RESOURCE_L2 || rid == RDT_RESOURCE_L3)
+		return cbqri_resctrl_resources[rid].cdp_enabled;
+	return false;
+}
+
+int resctrl_arch_set_cdp_enabled(enum resctrl_res_level rid, bool enable)
+{
+	bool is_cache = rid == RDT_RESOURCE_L2 || rid == RDT_RESOURCE_L3;
+	struct cbqri_resctrl_res *hw_res;
+
+	/* CDP only exists on cache resources that are marked cdp_capable. */
+	if (!is_cache)
+		return -ENODEV;
+	hw_res = &cbqri_resctrl_resources[rid];
+	if (!hw_res->resctrl_res.cdp_capable)
+		return -ENODEV;
+	hw_res->cdp_enabled = enable;
+	return 0;
+}
+
+struct rdt_resource *resctrl_arch_get_resource(enum resctrl_res_level l)
+{
+	/* Levels past the end of the resource table have no entry to return. */
+	if (l < RDT_NUM_RESOURCES)
+		return &cbqri_resctrl_resources[l].resctrl_res;
+	return NULL;
+}
+
+/*
+ * fs/resctrl unconditionally references the symbols below before checking
+ * mon_capable. They are stubs for features CBQRI does not yet support.
+ */
+bool resctrl_arch_is_evt_configurable(enum resctrl_event_id evt)
+{
+	return false;
+}
+
+void *resctrl_arch_mon_ctx_alloc(struct rdt_resource *r,
+				 enum resctrl_event_id evtid)
+{
+	return NULL;
+}
+
+void resctrl_arch_mon_ctx_free(struct rdt_resource *r,
+			       enum resctrl_event_id evtid, void *arch_mon_ctx)
+{
+}
+
+void resctrl_arch_config_cntr(struct rdt_resource *r, struct rdt_l3_mon_domain *d,
+			      enum resctrl_event_id evtid, u32 rmid, u32 closid,
+			      u32 cntr_id, bool assign)
+{
+}
+
+int resctrl_arch_cntr_read(struct rdt_resource *r, struct rdt_l3_mon_domain *d,
+			   u32 unused, u32 rmid, int cntr_id,
+			   enum resctrl_event_id eventid, u64 *val)
+{
+	return -EOPNOTSUPP;
+}
+
+bool resctrl_arch_mbm_cntr_assign_enabled(struct rdt_resource *r)
+{
+	return false;
+}
+
+int resctrl_arch_mbm_cntr_assign_set(struct rdt_resource *r, bool enable)
+{
+	return -EOPNOTSUPP;
+}
+
+void resctrl_arch_reset_cntr(struct rdt_resource *r, struct rdt_l3_mon_domain *d,
+			     u32 unused, u32 rmid, int cntr_id,
+			     enum resctrl_event_id eventid)
+{
+}
+
+bool resctrl_arch_get_io_alloc_enabled(struct rdt_resource *r)
+{
+	return false;
+}
+
+int resctrl_arch_io_alloc_enable(struct rdt_resource *r, bool enable)
+{
+	return -EOPNOTSUPP;
+}
+
+void resctrl_arch_mon_event_config_read(void *info)
+{
+}
+
+void resctrl_arch_mon_event_config_write(void *info)
+{
+}
+
+/*
+ * Reset the monitoring state of one rmid (MCID) for @eventid in the L3
+ * monitoring domain @d. Only QOS_L3_MBM_TOTAL_EVENT_ID has state to reset:
+ * the paired bandwidth controller's MCID is re-armed and its software
+ * accumulator zeroed. Silently does nothing when called with irqs disabled.
+ */
+void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_l3_mon_domain *d,
+			     u32 unused, u32 rmid, enum resctrl_event_id eventid)
+{
+	struct cbqri_resctrl_dom *hw_dom;
+	struct cbqri_controller *bc;
+	struct rdt_ctrl_domain *cd;
+
+	/* Both locks taken below are sleeping mutexes: bail out with irqs off. */
+	if (irqs_disabled())
+		return;
+
+	mutex_lock(&cbqri_domain_list_lock);
+
+	/*
+	 * Occupancy MCIDs are armed once by cbqri_init_mon_counters() and
+	 * free run thereafter, so only mbm_total_bytes needs a per-rmid reset.
+	 */
+	if (eventid != QOS_L3_MBM_TOTAL_EVENT_ID)
+		goto out_unlock;
+
+	cd = cbqri_find_ctrl_domain(&r->ctrl_domains, d->hdr.id);
+	if (!cd)
+		goto out_unlock;
+
+	/* Nothing to reset without a paired BC, its accumulators, or the MCID. */
+	hw_dom = container_of(cd, struct cbqri_resctrl_dom, resctrl_ctrl_dom);
+	bc = hw_dom->paired_bc;
+	if (!bc || WARN_ON_ONCE(!bc->mbm_total_states) || rmid >= bc->mcid_count)
+		goto out_unlock;
+
+	mutex_lock(&bc->lock);
+	/*
+	 * CONFIG_EVENT both resets and re-arms. Only zero the software
+	 * accumulator once that succeeded: a stale hardware counter X with
+	 * prev_ctr=0 would inject overflow(0, X) on the next read.
+	 */
+	if (cbqri_mon_op(bc, CBQRI_BC_MON_CTL_OFF,
+			 CBQRI_BC_MON_CTL_OP_CONFIG_EVENT, rmid,
+			 CBQRI_BC_EVT_ID_TOTAL_READ_WRITE, NULL) == 0)
+		memset(&bc->mbm_total_states[rmid], 0,
+		       sizeof(bc->mbm_total_states[rmid]));
+	mutex_unlock(&bc->lock);
+
+out_unlock:
+	mutex_unlock(&cbqri_domain_list_lock);
+}
+
+void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_l3_mon_domain *d)
+{
+	u32 rmid;	/* same type as max_rmid and the callee's rmid parameter */
+
+	/*
+	 * Occupancy counters free run and need no reset; only the
+	 * mbm_total_bytes accumulators are cleared. Bound by max_rmid
+	 * (system-wide minimum mcid_count).
+	 */
+	for (rmid = 0; rmid < max_rmid; rmid++)
+		resctrl_arch_reset_rmid(r, d, 0, rmid, QOS_L3_MBM_TOTAL_EVENT_ID);
+}
+
+int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain_hdr *hdr,
+			   u32 closid, u32 rmid, enum resctrl_event_id eventid,
+			   void *arch_priv, u64 *val, void *arch_mon_ctx)
+{
+	struct cbqri_resctrl_dom *hw_dom;
+	struct cbqri_controller *ctrl;
+	struct rdt_ctrl_domain *d;
+	u64 ctr_val;
+	int err = 0;
+
+	resctrl_arch_rmid_read_context_check();
+
+	/*
+	 * The locks taken below (cbqri_domain_list_lock, ctrl->lock) are
+	 * sleeping mutexes and cbqri_mon_op() polls BUSY for up to 1 ms,
+	 * neither of which is safe under irqs_disabled().
+	 */
+	if (irqs_disabled())
+		return -EIO;
+
+	/*
+	 * cbqri_domain_list_lock serialises the list walk against
+	 * cbqri_detach_cpu_from_ctrl_domains().
+	 */
+	mutex_lock(&cbqri_domain_list_lock);
+
+	switch (eventid) {
+	case QOS_L3_OCCUP_EVENT_ID:
+		d = cbqri_find_ctrl_domain(&r->ctrl_domains, hdr->id);
+		if (!d) {
+			err = -ENOENT;
+			break;
+		}
+
+		hw_dom = container_of(d, struct cbqri_resctrl_dom, resctrl_ctrl_dom);
+		ctrl = hw_dom->hw_ctrl;
+
+		mutex_lock(&ctrl->lock);
+
+		/*
+		 * MCIDs are armed with Occupancy once at init and free run.
+		 * Pass EVT_ID explicitly as the CBQRI spec does not guarantee
+		 * sticky-last-configured-event for READ_COUNTER.
+		 */
+		err = cbqri_mon_op(ctrl, CBQRI_CC_MON_CTL_OFF,
+				   CBQRI_CC_MON_CTL_OP_READ_COUNTER,
+				   rmid, CBQRI_CC_EVT_ID_OCCUPANCY, NULL);
+		if (!err) {
+			ctr_val = ioread64(ctrl->base + CBQRI_CC_MON_CTL_VAL_OFF);
+
+			/*
+			 * Capacity blocks to bytes. Multiply before divide
+			 * so a non-power-of-2 ncblks doesn't truncate.
+			 */
+			*val = (u64)ctrl->cache.cache_size * ctr_val /
+			       ctrl->cc.ncblks;
+		}
+		mutex_unlock(&ctrl->lock);
+		break;
+
+	case QOS_L3_MBM_TOTAL_EVENT_ID: {
+		struct cbqri_controller *bc;
+
+		/*
+		 * The L3 monitoring domain's id is the L3 cache id. The
+		 * matching ctrl domain's hw_dom->paired_bc was cached at
+		 * add time to avoid walking cbqri_controllers on every read.
+		 */
+		d = cbqri_find_ctrl_domain(&r->ctrl_domains, hdr->id);
+		if (!d) {
+			err = -ENOENT;
+			break;
+		}
+		hw_dom = container_of(d, struct cbqri_resctrl_dom, resctrl_ctrl_dom);
+		bc = hw_dom->paired_bc;
+		if (!bc) {
+			err = -ENOENT;
+			break;
+		}
+		if (WARN_ON_ONCE(!bc->mbm_total_states)) {
+			err = -EIO;
+			break;
+		}
+		if (rmid >= bc->mcid_count) {
+			err = -ERANGE;
+			break;
+		}
+
+		mutex_lock(&bc->lock);
+		/* Pass EVT_ID explicitly. Same reason as the CC path above. */
+		err = cbqri_mon_op(bc, CBQRI_BC_MON_CTL_OFF,
+				   CBQRI_BC_MON_CTL_OP_READ_COUNTER, rmid,
+				   CBQRI_BC_EVT_ID_TOTAL_READ_WRITE, NULL);
+		if (err)
+			goto out_bc;
+
+		ctr_val = ioread64(bc->base + CBQRI_BC_MON_CTR_VAL_OFF);
+
+		if (ctr_val & CBQRI_BC_MON_CTR_VAL_INVALID) {
+			/*
+			 * Return the last good total and leave prev_ctr so
+			 * the next valid sample resumes from there.
+			 */
+			*val = bc->mbm_total_states[rmid].chunks;
+		} else if (ctr_val & CBQRI_BC_MON_CTR_VAL_OVF) {
+			/*
+			 * OVF is sticky until next CONFIG_EVENT.
+			 * cbqri_bc_mon_overflow() can recover at most
+			 * one wrap. With OVF set, the count is unknown,
+			 * so re-arm and re-anchor prev_ctr=0.
+			 */
+			struct cbqri_bc_mon_state *s = &bc->mbm_total_states[rmid];
+
+			pr_warn_ratelimited("BC@%pa MCID %u: bandwidth counter overflow\n",
+					    &bc->addr, rmid);
+			err = cbqri_mon_op(bc, CBQRI_BC_MON_CTL_OFF,
+					   CBQRI_BC_MON_CTL_OP_CONFIG_EVENT, rmid,
+					   CBQRI_BC_EVT_ID_TOTAL_READ_WRITE, NULL);
+			if (err)
+				goto out_bc;
+
+			s->prev_ctr = 0;
+			*val = s->chunks;
+		} else {
+			struct cbqri_bc_mon_state *s = &bc->mbm_total_states[rmid];
+			u64 cur = ctr_val & CBQRI_BC_MON_CTR_VAL_CTR_MASK;
+
+			s->chunks  += cbqri_bc_mon_overflow(s->prev_ctr, cur);
+			s->prev_ctr = cur;
+			*val        = s->chunks;
+		}
+out_bc:
+		mutex_unlock(&bc->lock);
+		break;
+	}
+
+	default:
+		err = -EINVAL;
+		break;
+	}
+
+	mutex_unlock(&cbqri_domain_list_lock);
+	return err;
+}
+
+/*
+ * Note about terminology between x86 (Intel RDT/AMD QoS) and RISC-V:
+ *   CLOSID on x86 is RCID on RISC-V
+ *     RMID on x86 is MCID on RISC-V
+ */
+u32 resctrl_arch_get_num_closid(struct rdt_resource *res)
+{
+	struct cbqri_controller *picked;
+
+	picked = container_of(res, struct cbqri_resctrl_res, resctrl_res)->ctrl;
+
+	/*
+	 * A resource with no controller attached reports zero CLOSIDs.
+	 */
+	return picked ? picked->rcid_count : 0;
+}
+
+u32 resctrl_arch_system_num_rmid_idx(void)
+{
+	return max_rmid;
+}
+
+u32 resctrl_arch_rmid_idx_encode(u32 closid, u32 rmid)
+{
+	return rmid;
+}
+
+void resctrl_arch_rmid_idx_decode(u32 idx, u32 *closid, u32 *rmid)
+{
+	*closid = RISCV_RESCTRL_EMPTY_CLOSID;
+	*rmid = idx;
+}
+
+void resctrl_arch_set_cpu_default_closid_rmid(int cpu, u32 closid, u32 rmid)
+{
+	u32 rcid_bits = FIELD_PREP(SRMCFG_RCID_MASK, closid);
+	u32 mcid_bits = FIELD_PREP(SRMCFG_MCID_MASK, rmid);
+
+	WRITE_ONCE(per_cpu(cpu_srmcfg_default, cpu), rcid_bits | mcid_bits);
+}
+
+void resctrl_arch_sched_in(struct task_struct *tsk)
+{
+	__switch_to_srmcfg(tsk);
+}
+
+void resctrl_arch_set_closid_rmid(struct task_struct *tsk, u32 closid, u32 rmid)
+{
+	u32 rcid_bits = FIELD_PREP(SRMCFG_RCID_MASK, closid);
+	u32 mcid_bits = FIELD_PREP(SRMCFG_MCID_MASK, rmid);
+
+	WRITE_ONCE(tsk->thread.srmcfg, rcid_bits | mcid_bits);
+}
+
+void resctrl_arch_sync_cpu_closid_rmid(void *info)
+{
+	struct resctrl_cpu_defaults *defaults = info;
+
+	lockdep_assert_preemption_disabled();
+
+	/* A NULL @info leaves this CPU's defaults alone and only re-syncs. */
+	if (defaults)
+		resctrl_arch_set_cpu_default_closid_rmid(smp_processor_id(),
+							 defaults->closid,
+							 defaults->rmid);
+	resctrl_arch_sched_in(current);
+}
+
+bool resctrl_arch_match_closid(struct task_struct *tsk, u32 closid)
+{
+	return FIELD_GET(SRMCFG_RCID_MASK, READ_ONCE(tsk->thread.srmcfg)) == closid;
+}
+
+bool resctrl_arch_match_rmid(struct task_struct *tsk, u32 closid, u32 rmid)
+{
+	return FIELD_GET(SRMCFG_MCID_MASK, READ_ONCE(tsk->thread.srmcfg)) == rmid;
+}
+
+void resctrl_arch_pre_mount(void)
+{
+	/* All controllers discovered at boot via late_initcall. Nothing to do. */
+}
+
+int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_ctrl_domain *d,
+			    u32 closid, enum resctrl_conf_type t, u32 cfg_val)
+{
+	struct cbqri_resctrl_dom *hw_dom =
+		container_of(d, struct cbqri_resctrl_dom, resctrl_ctrl_dom);
+
+	/* Refuse updates on a resource that is not alloc_capable. */
+	if (!r->alloc_capable)
+		return -EINVAL;
+
+	if (r->rid == RDT_RESOURCE_L2 || r->rid == RDT_RESOURCE_L3)
+		return cbqri_apply_cache_config_dom(hw_dom, r, closid, t, cfg_val);
+
+	/* sum(Rbwb) <= MRBWB validation runs inside cbqri_apply_rbwb(). */
+	if (r->rid == RDT_RESOURCE_MB_MIN)
+		return cbqri_apply_rbwb(hw_dom->hw_ctrl, closid, cfg_val, true);
+
+	if (r->rid == RDT_RESOURCE_MB_WGHT)
+		return cbqri_apply_mweight_config(hw_dom->hw_ctrl, closid, cfg_val);
+
+	/* No other resource id is handled here. */
+	return -EINVAL;
+}
+
+int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid)
+{
+	struct resctrl_staged_config *staged;
+	struct rdt_ctrl_domain *dom;
+	enum resctrl_conf_type type;
+	int ret;
+
+	/* The cpus lock must be held so cpuhp cannot edit r->ctrl_domains. */
+	lockdep_assert_cpus_held();
+
+	list_for_each_entry(dom, &r->ctrl_domains, hdr.list) {
+		for (type = 0; type < CDP_NUM_TYPES; type++) {
+			staged = &dom->staged_config[type];
+			ret = staged->have_new_ctrl ?
+				resctrl_arch_update_one(r, dom, closid, type,
+							staged->new_ctrl) : 0;
+			if (ret)
+				return ret;
+		}
+	}
+	return 0;
+}
+
+/**
+ * resctrl_arch_get_config() - read back one control value from hardware
+ * @r:      resource the value belongs to
+ * @d:      control domain whose controller is queried
+ * @closid: RCID to read
+ * @type:   CDP slot; CDP_CODE selects the code half, anything else data
+ *
+ * Return: the value read from the controller, or the resource default when
+ * the resource is not alloc_capable, is not one handled here, or the read
+ * fails.
+ */
+u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_ctrl_domain *d,
+			    u32 closid, enum resctrl_conf_type type)
+{
+	struct cbqri_resctrl_dom *hw_dom =
+		container_of(d, struct cbqri_resctrl_dom, resctrl_ctrl_dom);
+	enum cbqri_at at = (type == CDP_CODE) ? CBQRI_AT_CODE : CBQRI_AT_DATA;
+	struct cbqri_controller *hw = hw_dom->hw_ctrl;
+	u32 fallback = resctrl_get_default_ctrl(r);
+	u32 cbm = fallback;
+	u64 bw;
+
+	if (!r->alloc_capable)
+		return fallback;
+
+	switch (r->rid) {
+	case RDT_RESOURCE_L2:
+	case RDT_RESOURCE_L3:
+		/* Only a negative return discards what was read into cbm. */
+		if (cbqri_read_cache_config(hw, closid, at, &cbm) < 0)
+			return fallback;
+		return cbm;
+	case RDT_RESOURCE_MB_MIN:
+		/* Rbwb is read as u64 and narrowed to the u32 schema value. */
+		if (cbqri_read_rbwb(hw, closid, &bw) == 0)
+			return (u32)bw;
+		return fallback;
+	case RDT_RESOURCE_MB_WGHT:
+		/* Same narrowing for Mweight. */
+		if (cbqri_read_mweight(hw, closid, &bw) == 0)
+			return (u32)bw;
+		return fallback;
+	default:
+		return fallback;
+	}
+}
+
+/*
+ * RCID 0 carries the remaining MRBWB after every other RCID is seeded with
+ * the minimum Rbwb of 1. cbqri_probe_bc() rejects a bandwidth controller
+ * with mrbwb < rcid_count, so this subtraction cannot underflow.
+ */
+static u64 cbqri_rcid0_rbwb(struct cbqri_controller *ctrl)
+{
+	bool too_small = WARN_ON_ONCE(ctrl->bc.mrbwb < ctrl->rcid_count);
+
+	return too_small ? 1 : ctrl->bc.mrbwb - (ctrl->rcid_count - 1);
+}
+
+void resctrl_arch_reset_all_ctrls(struct rdt_resource *r)
+{
+	struct cbqri_resctrl_res *hw_res;
+	struct cbqri_resctrl_dom *dom;
+	struct rdt_ctrl_domain *d;
+	enum resctrl_conf_type t;
+	u32 default_ctrl;
+	u32 i;
+
+	lockdep_assert_cpus_held();
+
+	hw_res = container_of(r, struct cbqri_resctrl_res, resctrl_res);
+	default_ctrl = resctrl_get_default_ctrl(r);
+
+	if (!hw_res->ctrl)
+		return;
+
+	list_for_each_entry(d, &r->ctrl_domains, hdr.list) {
+		dom = container_of(d, struct cbqri_resctrl_dom,
+				   resctrl_ctrl_dom);
+
+		switch (r->rid) {
+		case RDT_RESOURCE_MB_MIN:
+			/*
+			 * CBQRI section 4.5: Rbwb >= 1, sum(Rbwb) <= MRBWB.
+			 * Walk RCIDs 1..N-1 first so RCID 0 lands last with
+			 * the remaining budget.
+			 */
+			for (i = 0; i < hw_res->ctrl->rcid_count; i++) {
+				u32 rcid = (i + 1) % hw_res->ctrl->rcid_count;
+				u64 rbwb = (rcid == 0) ?
+					cbqri_rcid0_rbwb(dom->hw_ctrl) : 1;
+				int rerr;
+
+				rerr = cbqri_apply_rbwb(dom->hw_ctrl, rcid, rbwb, false);
+				if (rerr)
+					pr_err_ratelimited("RBWB reset RCID %u failed (%d)\n",
+							   rcid, rerr);
+			}
+			break;
+		case RDT_RESOURCE_MB_WGHT:
+			/* All RCIDs start at max weight (the new-group default). */
+			for (i = 0; i < hw_res->ctrl->rcid_count; i++) {
+				int rerr;
+
+				rerr = cbqri_apply_mweight_config(dom->hw_ctrl, i,
+								  default_ctrl);
+				if (rerr)
+					pr_err_ratelimited("Mweight reset RCID %u failed (%d)\n",
+							   i, rerr);
+			}
+			break;
+		default:
+			for (i = 0; i < hw_res->ctrl->rcid_count; i++) {
+				for (t = 0; t < CDP_NUM_TYPES; t++) {
+					int rerr;
+
+					rerr = resctrl_arch_update_one(r, d, i, t,
+								       default_ctrl);
+					if (rerr)
+						pr_err_ratelimited("rid=%d reset RCID %u type %u failed (%d)\n",
+								   r->rid, i, t, rerr);
+				}
+			}
+			break;
+		}
+	}
+}
+
+static struct rdt_ctrl_domain *cbqri_new_domain(struct cbqri_controller *ctrl)
+{
+	struct cbqri_resctrl_dom *wrapper = kzalloc_obj(*wrapper, GFP_KERNEL);
+
+	if (!wrapper)
+		return NULL;
+
+	/*
+	 * Hand back the embedded resctrl domain. Only the backing controller
+	 * and the list linkage are initialised here; paired_bc is left as
+	 * the allocator returned it.
+	 */
+	wrapper->hw_ctrl = ctrl;
+	INIT_LIST_HEAD(&wrapper->resctrl_ctrl_dom.hdr.list);
+	return &wrapper->resctrl_ctrl_dom;
+}
+
+static int cbqri_init_domain_ctrlval(struct rdt_resource *r, struct rdt_ctrl_domain *d)
+{
+	struct cbqri_resctrl_res *hw_res;
+	struct cbqri_resctrl_dom *dom;
+	enum resctrl_conf_type t;
+	int err = 0;
+	u64 rbwb;
+	int i;
+
+	hw_res = container_of(r, struct cbqri_resctrl_res, resctrl_res);
+	dom = container_of(d, struct cbqri_resctrl_dom, resctrl_ctrl_dom);
+
+	for (i = 0; i < hw_res->ctrl->rcid_count; i++) {
+		/*
+		 * For MB_MIN walk, RCIDs 1..N-1 then RCID 0 last so the sum
+		 * doesn't exceed MRBWB during the walk.
+		 */
+		u32 rcid = (r->rid == RDT_RESOURCE_MB_MIN) ?
+				((i + 1) % hw_res->ctrl->rcid_count) : i;
+
+		switch (r->rid) {
+		case RDT_RESOURCE_MB_MIN:
+			/*
+			 * CBQRI section 4.5: Rbwb >= 1, sum(Rbwb) <= MRBWB.
+			 * RCID 0 takes the remaining budget.
+			 */
+			rbwb = (rcid == 0) ? cbqri_rcid0_rbwb(dom->hw_ctrl) : 1;
+
+			err = cbqri_apply_rbwb(dom->hw_ctrl, rcid, rbwb, false);
+			break;
+		case RDT_RESOURCE_MB_WGHT:
+			/* Match the new-group default: equal weights across RCIDs. */
+			err = cbqri_apply_mweight_config(dom->hw_ctrl, i,
+							 resctrl_get_default_ctrl(r));
+			break;
+		default:
+			/*
+			 * Seed both DATA and CODE staged slots so a later
+			 * mount with -o cdp does not see stale CODE values.
+			 * On non-AT controllers cbqri_cc_alloc_op() masks
+			 * AT to 0, so all three iterations land on the same
+			 * hardware state. The redundant writes are harmless.
+			 */
+			for (t = 0; t < CDP_NUM_TYPES; t++) {
+				err = resctrl_arch_update_one(r, d, i, t,
+							      resctrl_get_default_ctrl(r));
+				if (err)
+					break;
+			}
+			break;
+		}
+		if (err)
+			return err;
+	}
+	return 0;
+}
+
+/*
+ * Walk cbqri_controllers and pick one capacity controller (CC) per cache
+ * level (L2/L3) to back the corresponding RDT_RESOURCE_L*. When more than
+ * one CC sits at the same level (e.g. one per socket), they must agree on
+ * rcid_count / ncblks / alloc_capable. A mismatch is fatal because resctrl
+ * exposes a single set of caps per rid. The first matching controller wins.
+ */
+static int cbqri_resctrl_pick_caches(void)
+{
+	struct cbqri_controller *ctrl;
+
+	list_for_each_entry(ctrl, &cbqri_controllers, list) {
+		struct cbqri_resctrl_res *cbqri_res;
+		enum resctrl_res_level rid;
+
+		if (ctrl->type != CBQRI_CONTROLLER_TYPE_CAPACITY)
+			continue;
+		if (!ctrl->alloc_capable) {
+			if (ctrl->mon_capable)
+				pr_warn_once("CC @%pa: monitor-only controllers aren't supported\n",
+					     &ctrl->addr);
+			continue;
+		}
+
+		if (ctrl->cache.cache_level == 2) {
+			rid = RDT_RESOURCE_L2;
+		} else if (ctrl->cache.cache_level == 3) {
+			rid = RDT_RESOURCE_L3;
+		} else {
+			pr_err("unknown cache level %u\n",
+			       ctrl->cache.cache_level);
+			return -ENODEV;
+		}
+
+		cbqri_res = &cbqri_resctrl_resources[rid];
+		if (cbqri_res->ctrl) {
+			/*
+			 * CCs at the same cache level must agree on every cap
+			 * resctrl exposes globally. Reject mismatches at pick
+			 * time so the inconsistency is visible at boot.
+			 */
+			if (cbqri_res->ctrl->rcid_count != ctrl->rcid_count ||
+			    cbqri_res->ctrl->cc.ncblks != ctrl->cc.ncblks ||
+			    cbqri_res->ctrl->cc.supports_alloc_at_code !=
+				    ctrl->cc.supports_alloc_at_code ||
+			    cbqri_res->ctrl->alloc_capable != ctrl->alloc_capable) {
+				pr_err("L%u controllers have mismatched capabilities\n",
+				       ctrl->cache.cache_level);
+				return -EINVAL;
+			}
+			continue;
+		}
+
+		cbqri_res->ctrl = ctrl;
+	}
+
+	return 0;
+}
+
+/*
+ * Fill the rdt_resource fields for one picked rid. An rid with no picked
+ * controller is left untouched so it stays out of resctrl_arch_get_resource().
+ */
+static int cbqri_resctrl_control_init(struct cbqri_resctrl_res *cbqri_res)
+{
+	struct rdt_resource *res = &cbqri_res->resctrl_res;
+	struct cbqri_controller *ctrl = cbqri_res->ctrl;
+	bool is_l2;
+
+	/* No controller was picked for this resource: nothing to set up. */
+	if (!ctrl)
+		return 0;
+
+	switch (res->rid) {
+	case RDT_RESOURCE_L2:
+	case RDT_RESOURCE_L3:
+		is_l2 = res->rid == RDT_RESOURCE_L2;
+
+		/* Cache allocation is expressed as a bitmap of cbm_len bits. */
+		res->name = is_l2 ? "L2" : "L3";
+		res->ctrl_scope = is_l2 ? RESCTRL_L2_CACHE : RESCTRL_L3_CACHE;
+		res->schema_fmt = RESCTRL_SCHEMA_BITMAP;
+		res->alloc_capable = ctrl->alloc_capable;
+		res->cdp_capable = ctrl->cc.supports_alloc_at_code;
+
+		res->cache.cbm_len = ctrl->cc.ncblks;
+		res->cache.min_cbm_bits = 1;
+		res->cache.shareable_bits = 0;
+		res->cache.arch_has_sparse_bitmasks = false;
+
+		INIT_LIST_HEAD(&res->ctrl_domains);
+		INIT_LIST_HEAD(&res->mon_domains);
+
+		/* Occupancy monitoring is only set up on the L3 resource. */
+		if (is_l2 || !ctrl->mon_capable)
+			break;
+
+		res->mon_scope = RESCTRL_L3_CACHE;
+		resctrl_enable_mon_event(QOS_L3_OCCUP_EVENT_ID, false, 0, NULL);
+		res->mon_capable = true;
+		break;
+
+	case RDT_RESOURCE_MB_MIN:
+		res->name = "MB_MIN";
+		res->schema_fmt = RESCTRL_SCHEMA_RANGE;
+		res->alloc_capable = ctrl->alloc_capable;
+		/*
+		 * resctrl wants a cache scope for MBA-style domains, so L3
+		 * stands in as a proxy until resctrl supports non-cache
+		 * scopes for bandwidth resources.
+		 */
+		res->ctrl_scope = RESCTRL_L3_CACHE;
+
+		/*
+		 * Rbwb is an integer block count rather than a percentage,
+		 * so there is no MBA delay_linear notion to describe.
+		 */
+		res->membw.throttle_mode = THREAD_THROTTLE_UNDEFINED;
+		res->membw.bw_gran = 1;
+		res->membw.min_bw = 1;
+		res->membw.max_bw = ctrl->bc.mrbwb;
+		/*
+		 * CBQRI section 4.5 caps sum(Rbwb) <= MRBWB. Starting new
+		 * groups at min_bw keeps mkdir from overflowing that sum.
+		 */
+		res->membw.default_to_min = true;
+
+		INIT_LIST_HEAD(&res->ctrl_domains);
+		INIT_LIST_HEAD(&res->mon_domains);
+		break;
+
+	case RDT_RESOURCE_MB_WGHT:
+		res->name = "MB_WGHT";
+		res->schema_fmt = RESCTRL_SCHEMA_RANGE;
+		res->alloc_capable = ctrl->alloc_capable;
+		res->ctrl_scope = RESCTRL_L3_CACHE;
+
+		/* Mweight is a dimensionless ratio: no delay/linear concept. */
+		res->membw.throttle_mode = THREAD_THROTTLE_UNDEFINED;
+		res->membw.bw_gran = 1;
+		/*
+		 * CBQRI section 4.5: Mweight ranges over 0-255, where 0
+		 * disables work-conserving. There is no sum constraint, so
+		 * default_to_min stays false and groups default to max_bw.
+		 */
+		res->membw.min_bw = 0;
+		res->membw.max_bw = 255;
+
+		INIT_LIST_HEAD(&res->ctrl_domains);
+		INIT_LIST_HEAD(&res->mon_domains);
+		break;
+
+	default:
+		/* Any other resource id is left exactly as it was. */
+		break;
+	}
+
+	return 0;
+}
+
+/*
+ * Choose the bandwidth controller that backs both MB_MIN and MB_WGHT.
+ *
+ * The first alloc-capable bandwidth controller on cbqri_controllers is
+ * picked for both resources. Every later alloc-capable bandwidth controller
+ * must report the same rcid_count and bc.mrbwb as the picked one.
+ *
+ * Returns 0 on success (including when nothing was picked), or -EINVAL when
+ * a later controller's capabilities differ.
+ */
+static int cbqri_resctrl_pick_bw_alloc(void)
+{
+	struct cbqri_resctrl_res *mb_wght = &cbqri_resctrl_resources[RDT_RESOURCE_MB_WGHT];
+	struct cbqri_resctrl_res *mb_min = &cbqri_resctrl_resources[RDT_RESOURCE_MB_MIN];
+	struct cbqri_controller *cand;
+
+	list_for_each_entry(cand, &cbqri_controllers, list) {
+		struct cbqri_controller *picked;
+
+		if (cand->type != CBQRI_CONTROLLER_TYPE_BANDWIDTH ||
+		    !cand->alloc_capable)
+			continue;
+
+		picked = mb_min->ctrl;
+		if (!picked) {
+			/* First usable controller backs both resources. */
+			mb_min->ctrl = cand;
+			mb_wght->ctrl = cand;
+			continue;
+		}
+
+		if (picked->rcid_count == cand->rcid_count &&
+		    picked->bc.mrbwb == cand->bc.mrbwb)
+			continue;
+
+		pr_err("BW controllers have mismatched capabilities\n");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/*
+ * Decide whether mbm_total_bytes can be offered.
+ *
+ * The event is enabled only when there is at most one distinct L3 cache id
+ * among the capacity controllers, the picked L3 controller is mon-capable,
+ * and cbqri_find_only_mon_bc() finds a bandwidth controller. Pairing one BC
+ * with several L3 domains would let userspace overcount system bandwidth by
+ * a factor equal to the L3 domain count, so that case only warns.
+ * resctrl_is_mon_event_enabled() later gates the BC pairing and rmid-space
+ * accounting. L3 occupancy is enabled by cbqri_resctrl_control_init().
+ */
+static void cbqri_resctrl_pick_counters(void)
+{
+	struct cbqri_resctrl_res *l3 = &cbqri_resctrl_resources[RDT_RESOURCE_L3];
+	struct cbqri_controller *ctrl, *prev;
+	unsigned int l3_count = 0;
+
+	/* Count distinct L3 cache_ids */
+	list_for_each_entry(ctrl, &cbqri_controllers, list) {
+		bool dup = false;
+
+		if (ctrl->type != CBQRI_CONTROLLER_TYPE_CAPACITY ||
+		    ctrl->cache.cache_level != 3)
+			continue;
+
+		/* Scan only the entries ahead of ctrl so each id counts once. */
+		list_for_each_entry(prev, &cbqri_controllers, list) {
+			if (prev == ctrl)
+				break;
+			if (prev->type == CBQRI_CONTROLLER_TYPE_CAPACITY &&
+			    prev->cache.cache_level == 3 &&
+			    prev->cache.cache_id == ctrl->cache.cache_id) {
+				dup = true;
+				break;
+			}
+		}
+
+		if (!dup)
+			l3_count++;
+	}
+
+	if (l3_count > 1) {
+		pr_warn_once("multiple L3 domains (%u) detected. mbm_total_bytes disabled\n",
+			     l3_count);
+		return;
+	}
+
+	/*
+	 * mbm_total_bytes is surfaced on the L3 monitoring domain, so a
+	 * mon-capable L3 cache controller is required in addition to a
+	 * single mon-capable bandwidth controller.
+	 */
+	if (!l3->ctrl || !l3->ctrl->mon_capable)
+		return;
+
+	if (cbqri_find_only_mon_bc())
+		resctrl_enable_mon_event(QOS_L3_MBM_TOTAL_EVENT_ID, false, 0, NULL);
+}
+
+/*
+ * Derive the driver-wide capability flags and the usable rmid space from
+ * the controllers picked into cbqri_resctrl_resources[].
+ *
+ * exposed_alloc_capable / exposed_mon_capable are set when any picked
+ * controller reports the matching capability. max_rmid is narrowed to the
+ * mcid_count of each counter source in use and then published as the L3
+ * resource's mon.num_rmid. When no picked controller is mon-capable,
+ * max_rmid is forced to 1 and num_rmid is not written.
+ *
+ * NOTE(review): exposed_mon_capable can be set by any picked controller
+ * (L2 or bandwidth included), while max_rmid is only narrowed by the L3 CC
+ * and the paired BC. Confirm max_rmid cannot reach num_rmid un-narrowed
+ * when only a non-L3 controller is mon-capable.
+ */
+static void cbqri_resctrl_accumulate_caps(void)
+{
+	struct cbqri_controller *l3_ctrl;
+	int rid;
+
+	/* A resource with no picked controller contributes nothing. */
+	for (rid = 0; rid < RDT_NUM_RESOURCES; rid++) {
+		struct cbqri_resctrl_res *hw_res = &cbqri_resctrl_resources[rid];
+
+		if (!hw_res->ctrl)
+			continue;
+		if (hw_res->ctrl->alloc_capable)
+			exposed_alloc_capable = true;
+		if (hw_res->ctrl->mon_capable)
+			exposed_mon_capable = true;
+	}
+
+	/*
+	 * Narrow max_rmid against the picked occupancy source (the L3 CC)
+	 * only. A mon-capable controller that is not exposed as a counter
+	 * source must not clamp the rmid space.
+	 */
+	l3_ctrl = cbqri_resctrl_resources[RDT_RESOURCE_L3].ctrl;
+	if (l3_ctrl && l3_ctrl->mon_capable)
+		max_rmid = min(max_rmid, l3_ctrl->mcid_count);
+
+	/*
+	 * When mbm_total_bytes is enabled, the paired BC is a second counter
+	 * source, so clamp against its mcid_count too. A BC left unpicked
+	 * because mbm_total_bytes is disabled must not clamp it.
+	 */
+	if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID)) {
+		struct cbqri_controller *bc = cbqri_find_only_mon_bc();
+
+		if (bc)
+			max_rmid = min(max_rmid, bc->mcid_count);
+	}
+
+	/* This override runs after the narrowing above and discards it. */
+	if (!exposed_mon_capable) {
+		max_rmid = 1;
+		return;
+	}
+
+	/*
+	 * num_rmid is the user-visible bound for the L3 monitoring rmid
+	 * space. Track max_rmid (the picked-source minimum) so userspace is
+	 * not told more RMIDs than can be allocated.
+	 */
+	cbqri_resctrl_resources[RDT_RESOURCE_L3].resctrl_res.mon.num_rmid = max_rmid;
+}
+
+/*
+ * Create, list-insert, and online a fresh ctrl_domain backing ctrl on
+ * resource res, seeded with cpu and identified by dom_id.
+ *
+ * Caller must hold cbqri_domain_list_lock and is expected to have checked
+ * that no ctrl_domain on res already carries dom_id. A duplicate id is
+ * still rejected here rather than trusted, because resctrl_find_domain()
+ * only reports an insertion position when the id is absent.
+ *
+ * Returns the new domain, or an ERR_PTR(): -ENOMEM when allocation fails,
+ * -EEXIST for a duplicate id, otherwise the error reported by
+ * cbqri_init_domain_ctrlval() or resctrl_online_ctrl_domain(). On error the
+ * domain is unlinked (if it had been inserted) and freed.
+ */
+static struct rdt_ctrl_domain *cbqri_create_ctrl_domain(struct cbqri_controller *ctrl,
+							struct rdt_resource *res,
+							unsigned int cpu, int dom_id)
+{
+	struct cbqri_resctrl_dom *hw_dom;
+	struct rdt_ctrl_domain *domain;
+	struct list_head *pos = NULL;
+	int err;
+
+	domain = cbqri_new_domain(ctrl);
+	if (!domain)
+		return ERR_PTR(-ENOMEM);
+
+	/* The rdt_ctrl_domain is embedded; the container is what gets freed. */
+	hw_dom = container_of(domain, struct cbqri_resctrl_dom, resctrl_ctrl_dom);
+
+	cpumask_set_cpu(cpu, &domain->hdr.cpu_mask);
+	domain->hdr.id = dom_id;
+	domain->hdr.type = RESCTRL_CTRL_DOMAIN;
+
+	err = cbqri_init_domain_ctrlval(res, domain);
+	if (err)
+		goto err_free;
+
+	/*
+	 * Insert sorted by id so user-visible ordering is deterministic.
+	 * A hit means the id is already listed and pos was not filled in,
+	 * so inserting would dereference a NULL list position.
+	 */
+	if (resctrl_find_domain(&res->ctrl_domains, dom_id, &pos)) {
+		pr_err("duplicate ctrl_domain id %d\n", dom_id);
+		err = -EEXIST;
+		goto err_free;
+	}
+	list_add_tail(&domain->hdr.list, pos);
+
+	/* Onlining can fail; do not hand back a domain resctrl rejected. */
+	err = resctrl_online_ctrl_domain(res, domain);
+	if (err)
+		goto err_listdel;
+
+	return domain;
+
+err_listdel:
+	list_del(&domain->hdr.list);
+err_free:
+	kfree(hw_dom);
+	return ERR_PTR(err);
+}
+
+/*
+ * Add cpu to the L3 monitoring domain whose id is ctrl's cache_id on
+ * resource res, creating and onlining that domain if it does not exist yet.
+ *
+ * A new mon_domain requires the ctrl_domain with the same id to be present
+ * already (-EINVAL otherwise). When mbm_total_bytes is enabled, that
+ * ctrl_domain is paired with the bandwidth controller returned by
+ * cbqri_find_only_mon_bc() and the controller's counters are initialised
+ * before the domain is onlined.
+ *
+ * Caller must hold cbqri_domain_list_lock.
+ *
+ * Returns 0 on success or a negative errno. On failure the new mon_domain
+ * is unlinked and freed.
+ *
+ * NOTE(review): hw_dom->paired_bc is cleared only when BC counter init
+ * fails. On the later failure paths (online, CC counter init) it stays
+ * set. Confirm that is intended.
+ */
+static int cbqri_attach_cpu_to_l3_mon(struct cbqri_controller *ctrl,
+				      struct rdt_resource *res, unsigned int cpu)
+{
+	struct rdt_l3_mon_domain *mon_dom;
+	struct rdt_ctrl_domain *ctrl_dom;
+	struct cbqri_resctrl_dom *hw_dom;
+	struct list_head *mon_pos = NULL;
+	int dom_id = ctrl->cache.cache_id;
+	int err;
+
+	lockdep_assert_held(&cbqri_domain_list_lock);
+
+	/* Existing domain: joining it is just a cpumask update. */
+	mon_dom = cbqri_find_l3_mon_domain(&res->mon_domains, dom_id);
+	if (mon_dom) {
+		cpumask_set_cpu(cpu, &mon_dom->hdr.cpu_mask);
+		return 0;
+	}
+
+	ctrl_dom = cbqri_find_ctrl_domain(&res->ctrl_domains, dom_id);
+	if (!ctrl_dom) {
+		pr_err("L3 mon attach for cpu %u: no ctrl_domain id %d\n",
+		       cpu, dom_id);
+		return -EINVAL;
+	}
+
+	mon_dom = kzalloc_obj(*mon_dom, GFP_KERNEL);
+	if (!mon_dom)
+		return -ENOMEM;
+
+	mon_dom->hdr.id = dom_id;
+	mon_dom->hdr.type = RESCTRL_MON_DOMAIN;
+	mon_dom->hdr.rid = RDT_RESOURCE_L3;
+	cpumask_set_cpu(cpu, &mon_dom->hdr.cpu_mask);
+	INIT_LIST_HEAD(&mon_dom->hdr.list);
+
+	/* Look up the sorted insertion point; a hit here means a duplicate id. */
+	if (resctrl_find_domain(&res->mon_domains, dom_id, &mon_pos)) {
+		pr_err("duplicate L3 mon_domain id %d\n", dom_id);
+		err = -EEXIST;
+		goto err_free;
+	}
+	if (mon_pos)
+		list_add_tail(&mon_dom->hdr.list, mon_pos);
+	else
+		list_add_tail(&mon_dom->hdr.list, &res->mon_domains);
+
+	/*
+	 * Pair this L3 domain with the system's mon-capable BC and
+	 * initialise the BC's per-MCID software accumulators before
+	 * resctrl_online_mon_domain() exposes the domain to userspace.
+	 * A concurrent sysfs read of mbm_total_bytes between online and
+	 * BC init would otherwise pass the !bc->mbm_total_states check
+	 * with a half-initialised pointer.
+	 */
+	hw_dom = container_of(ctrl_dom, struct cbqri_resctrl_dom, resctrl_ctrl_dom);
+
+	if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID))
+		hw_dom->paired_bc = cbqri_find_only_mon_bc();
+	if (hw_dom->paired_bc) {
+		err = cbqri_init_bc_mon_counters(hw_dom->paired_bc);
+		if (err) {
+			pr_err("BC @%pa: mon init failed (%d)\n", &hw_dom->paired_bc->addr, err);
+			hw_dom->paired_bc = NULL;
+			goto err_listdel;
+		}
+	}
+
+	err = resctrl_online_mon_domain(res, &mon_dom->hdr);
+	if (err)
+		goto err_listdel;
+
+	/* The L3 controller's own counters are initialised after onlining. */
+	err = cbqri_init_mon_counters(ctrl);
+	if (err)
+		goto err_offline;
+
+	return 0;
+
+err_offline:
+	/*
+	 * cancel_delayed_work avoids deadlocking against the cqm_limbo
+	 * worker which takes cpus_read_lock while this hotplug callback
+	 * already holds cpus_write_lock. mbm_over is only
+	 * INIT_DELAYED_WORK'd when MBM_TOTAL was enabled, so gate the
+	 * cancel on the same condition to avoid touching a zeroed work
+	 * struct.
+	 */
+	cancel_delayed_work(&mon_dom->cqm_limbo);
+	if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID))
+		cancel_delayed_work(&mon_dom->mbm_over);
+	resctrl_offline_mon_domain(res, &mon_dom->hdr);
+err_listdel:
+	list_del(&mon_dom->hdr.list);
+err_free:
+	kfree(mon_dom);
+	return err;
+}
+
+/*
+ * Attach cpu to the ctrl_domain of capacity controller ctrl and, for a
+ * mon-capable L3 controller, to the matching L3 monitoring domain.
+ *
+ * Controllers at a cache level other than 2 or 3, or whose resource has no
+ * picked controller, are ignored and report success. Returns 0 on success
+ * or a negative errno. When the monitoring attach fails, the ctrl_domain
+ * changes made by this call are undone.
+ */
+static int cbqri_attach_cpu_to_cap_ctrl(struct cbqri_controller *ctrl,
+					unsigned int cpu)
+{
+	struct cbqri_resctrl_res *hw_res;
+	struct rdt_ctrl_domain *dom;
+	struct rdt_resource *res;
+	bool created = false;
+	int id;
+	int ret;
+
+	switch (ctrl->cache.cache_level) {
+	case 2:
+		hw_res = &cbqri_resctrl_resources[RDT_RESOURCE_L2];
+		break;
+	case 3:
+		hw_res = &cbqri_resctrl_resources[RDT_RESOURCE_L3];
+		break;
+	default:
+		return 0;
+	}
+
+	if (!hw_res->ctrl)
+		return 0;
+
+	res = &hw_res->resctrl_res;
+	id = ctrl->cache.cache_id;
+
+	dom = cbqri_find_ctrl_domain(&res->ctrl_domains, id);
+	if (!dom) {
+		dom = cbqri_create_ctrl_domain(ctrl, res, cpu, id);
+		if (IS_ERR(dom))
+			return PTR_ERR(dom);
+		created = true;
+	} else {
+		cpumask_set_cpu(cpu, &dom->hdr.cpu_mask);
+	}
+
+	/* Only a mon-capable L3 controller also gets a monitoring domain. */
+	if (!ctrl->mon_capable || ctrl->cache.cache_level != 3)
+		return 0;
+
+	ret = cbqri_attach_cpu_to_l3_mon(ctrl, res, cpu);
+	if (!ret)
+		return 0;
+
+	/*
+	 * The cpuhp core rolls back only states whose startup succeeded.
+	 * This failure happens inside this state's own startup, so its
+	 * offline callback will not run. Clear the cpumask bit set above
+	 * and, if this call created the ctrl_domain, tear it down so that
+	 * a retry starts from a clean slate.
+	 */
+	cpumask_clear_cpu(cpu, &dom->hdr.cpu_mask);
+	if (created) {
+		resctrl_offline_ctrl_domain(res, dom);
+		list_del(&dom->hdr.list);
+		kfree(container_of(dom, struct cbqri_resctrl_dom,
+				   resctrl_ctrl_dom));
+	}
+	return ret;
+}
+
+/*
+ * Attach cpu to the ctrl_domain of bandwidth resource rid that is keyed by
+ * ctrl's proximity domain, creating the domain when it is missing.
+ *
+ * Returns 0 on success, including when rid has no picked controller, or
+ * the error from cbqri_create_ctrl_domain().
+ */
+static int cbqri_attach_cpu_to_one_bw_res(struct cbqri_controller *ctrl,
+					  enum resctrl_res_level rid,
+					  unsigned int cpu)
+{
+	struct cbqri_resctrl_res *hw_res = &cbqri_resctrl_resources[rid];
+	struct rdt_resource *res = &hw_res->resctrl_res;
+	int id = ctrl->mem.prox_dom;
+	struct rdt_ctrl_domain *dom;
+
+	if (!hw_res->ctrl)
+		return 0;
+
+	dom = cbqri_find_ctrl_domain(&res->ctrl_domains, id);
+	if (!dom) {
+		/* First CPU of this proximity domain: build the domain. */
+		dom = cbqri_create_ctrl_domain(ctrl, res, cpu, id);
+		return IS_ERR(dom) ? PTR_ERR(dom) : 0;
+	}
+
+	cpumask_set_cpu(cpu, &dom->hdr.cpu_mask);
+	return 0;
+}
+
+/*
+ * Remove cpu from the ctrl_domain of bandwidth resource rid that is keyed
+ * by ctrl's proximity domain. The domain is offlined, unlinked and freed
+ * once its cpumask becomes empty. Does nothing when rid has no picked
+ * controller, the domain is missing, or cpu is not in it.
+ *
+ * Caller must hold cbqri_domain_list_lock.
+ */
+static void cbqri_detach_cpu_from_one_bw_res(struct cbqri_controller *ctrl,
+					     enum resctrl_res_level rid,
+					     unsigned int cpu)
+{
+	struct cbqri_resctrl_res *hw_res = &cbqri_resctrl_resources[rid];
+	struct rdt_resource *res = &hw_res->resctrl_res;
+	int id = ctrl->mem.prox_dom;
+	struct rdt_ctrl_domain *dom;
+
+	lockdep_assert_held(&cbqri_domain_list_lock);
+
+	if (!hw_res->ctrl)
+		return;
+
+	dom = cbqri_find_ctrl_domain(&res->ctrl_domains, id);
+	if (!dom)
+		return;
+	if (!cpumask_test_cpu(cpu, &dom->hdr.cpu_mask))
+		return;
+
+	cpumask_clear_cpu(cpu, &dom->hdr.cpu_mask);
+	if (!cpumask_empty(&dom->hdr.cpu_mask))
+		return;
+
+	/* That was the last CPU: retire the domain. */
+	resctrl_offline_ctrl_domain(res, dom);
+	list_del(&dom->hdr.list);
+	kfree(container_of(dom, struct cbqri_resctrl_dom,
+			   resctrl_ctrl_dom));
+}
+
+/*
+ * Attach cpu to both bandwidth resources (MB_MIN, then MB_WGHT) backed by
+ * ctrl. If the MB_WGHT attach fails, the MB_MIN attach is undone first.
+ * Returns 0 on success or the first error encountered.
+ */
+static int cbqri_attach_cpu_to_bw_ctrl(struct cbqri_controller *ctrl,
+				       unsigned int cpu)
+{
+	int ret;
+
+	ret = cbqri_attach_cpu_to_one_bw_res(ctrl, RDT_RESOURCE_MB_MIN, cpu);
+	if (ret)
+		return ret;
+
+	ret = cbqri_attach_cpu_to_one_bw_res(ctrl, RDT_RESOURCE_MB_WGHT, cpu);
+	if (!ret)
+		return 0;
+
+	cbqri_detach_cpu_from_one_bw_res(ctrl, RDT_RESOURCE_MB_MIN, cpu);
+	return ret;
+}
+
+/*
+ * Remove cpu from every L3 monitoring domain of res that contains it. A
+ * domain whose cpumask becomes empty is offlined, unlinked and freed.
+ *
+ * Caller must hold cbqri_domain_list_lock.
+ */
+static void cbqri_detach_cpu_from_l3_mon(struct rdt_resource *res,
+					 unsigned int cpu)
+{
+	struct rdt_l3_mon_domain *dom, *next;
+
+	lockdep_assert_held(&cbqri_domain_list_lock);
+
+	list_for_each_entry_safe(dom, next, &res->mon_domains, hdr.list) {
+		if (!cpumask_test_cpu(cpu, &dom->hdr.cpu_mask))
+			continue;
+
+		cpumask_clear_cpu(cpu, &dom->hdr.cpu_mask);
+		if (!cpumask_empty(&dom->hdr.cpu_mask))
+			continue;
+
+		/*
+		 * This is a cpuhp offline callback running under
+		 * cpus_write_lock. The cqm_limbo and mbm_over workers take
+		 * cpus_read_lock before touching a domain, so neither can
+		 * run or re-queue at this point. A non-sync cancel therefore
+		 * reliably dequeues pending work ahead of the kfree, whereas
+		 * cancel_delayed_work_sync() would deadlock against that
+		 * cpus_read_lock.
+		 */
+		cancel_delayed_work(&dom->cqm_limbo);
+		if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID))
+			cancel_delayed_work(&dom->mbm_over);
+
+		resctrl_offline_mon_domain(res, &dom->hdr);
+		list_del(&dom->hdr.list);
+		kfree(dom);
+	}
+}
+
+/*
+ * Remove cpu from every ctrl_domain of res that contains it. A domain whose
+ * cpumask becomes empty is offlined, unlinked and freed.
+ */
+static void cbqri_detach_cpu_from_ctrl_domains(struct rdt_resource *res,
+					       unsigned int cpu)
+{
+	struct rdt_ctrl_domain *dom, *next;
+
+	list_for_each_entry_safe(dom, next, &res->ctrl_domains, hdr.list) {
+		if (!cpumask_test_cpu(cpu, &dom->hdr.cpu_mask))
+			continue;
+
+		cpumask_clear_cpu(cpu, &dom->hdr.cpu_mask);
+		if (!cpumask_empty(&dom->hdr.cpu_mask))
+			continue;
+
+		/* No CPU left in this domain: retire it. */
+		resctrl_offline_ctrl_domain(res, dom);
+		list_del(&dom->hdr.list);
+		kfree(container_of(dom, struct cbqri_resctrl_dom,
+				   resctrl_ctrl_dom));
+	}
+}
+
+/*
+ * Detach a CPU from every domain it belongs to, across all resources that
+ * have a picked controller. The per-resource helpers only act on domains
+ * whose mask contains the CPU, which makes this idempotent: it serves both
+ * to undo a partial online attach and to handle a full offline.
+ * Caller holds cbqri_domain_list_lock.
+ */
+static void cbqri_detach_cpu_from_all_ctrls(unsigned int cpu)
+{
+	int rid;
+
+	lockdep_assert_held(&cbqri_domain_list_lock);
+
+	for (rid = 0; rid < RDT_NUM_RESOURCES; rid++) {
+		struct cbqri_resctrl_res *hw_res = &cbqri_resctrl_resources[rid];
+		struct rdt_resource *res = &hw_res->resctrl_res;
+
+		if (hw_res->ctrl) {
+			cbqri_detach_cpu_from_ctrl_domains(res, cpu);
+
+			/* Monitoring domains exist only on a mon-capable L3. */
+			if (rid == RDT_RESOURCE_L3 && hw_res->ctrl->mon_capable)
+				cbqri_detach_cpu_from_l3_mon(res, cpu);
+		}
+	}
+}
+
+/*
+ * Attach a CPU to every alloc-capable controller whose cpu_mask contains
+ * it. If any attach fails, the CPU is detached from everything attached so
+ * far: the cpuhp core does not run this state's offline teardown when its
+ * startup fails, so a partial attach would otherwise linger in the domain
+ * cpu_masks. Returns 0 or the first error. Caller holds
+ * cbqri_domain_list_lock.
+ */
+static int cbqri_attach_cpu_to_all_ctrls(unsigned int cpu)
+{
+	struct cbqri_controller *ctrl;
+	int ret = 0;
+
+	lockdep_assert_held(&cbqri_domain_list_lock);
+
+	list_for_each_entry(ctrl, &cbqri_controllers, list) {
+		if (ctrl->type == CBQRI_CONTROLLER_TYPE_CAPACITY) {
+			if (!cpumask_test_cpu(cpu, &ctrl->cache.cpu_mask) ||
+			    !ctrl->alloc_capable)
+				continue;
+			ret = cbqri_attach_cpu_to_cap_ctrl(ctrl, cpu);
+		} else if (ctrl->type == CBQRI_CONTROLLER_TYPE_BANDWIDTH) {
+			if (!cpumask_test_cpu(cpu, &ctrl->mem.cpu_mask) ||
+			    !ctrl->alloc_capable)
+				continue;
+			ret = cbqri_attach_cpu_to_bw_ctrl(ctrl, cpu);
+		} else {
+			continue;
+		}
+
+		if (!ret)
+			continue;
+
+		/* Roll back whatever earlier iterations attached. */
+		cbqri_detach_cpu_from_all_ctrls(cpu);
+		break;
+	}
+
+	return ret;
+}
+
+/* True once resctrl_init() has succeeded and until teardown has run. */
+static bool cbqri_resctrl_inited;
+
+/*
+ * Drop every resource pick and put the capability state back to the values
+ * teardown leaves it at, so resctrl_arch_alloc_capable() / _mon_capable()
+ * do not lie to callers once resctrl is not (or no longer) initialised.
+ */
+static void cbqri_resctrl_reset_state(void)
+{
+	int rid;
+
+	for (rid = 0; rid < RDT_NUM_RESOURCES; rid++) {
+		struct cbqri_resctrl_res *hw_res = &cbqri_resctrl_resources[rid];
+
+		hw_res->ctrl = NULL;
+		hw_res->cdp_enabled = false;
+	}
+	exposed_alloc_capable = false;
+	exposed_mon_capable = false;
+	max_rmid = U32_MAX;
+}
+
+/*
+ * Undo a successful cbqri_resctrl_setup(): shut resctrl down and forget all
+ * picks. A no-op when setup never completed.
+ */
+static void cbqri_resctrl_teardown(void)
+{
+	if (!cbqri_resctrl_inited)
+		return;
+
+	resctrl_exit();
+	cbqri_resctrl_reset_state();
+	cbqri_resctrl_inited = false;
+}
+
+/*
+ * Pick the controllers backing each resctrl resource, initialise the
+ * resources, derive the exposed capabilities and register with resctrl.
+ *
+ * Returns 0 on success. Returns -ENODEV when no picked controller is
+ * alloc- or mon-capable, otherwise the error from the failing step. Every
+ * failure path resets the picks and capability state, so a failed setup
+ * never leaves a half-populated resource table behind.
+ */
+static int cbqri_resctrl_setup(void)
+{
+	int rid;
+	int err;
+
+	for (rid = 0; rid < RDT_NUM_RESOURCES; rid++)
+		cbqri_resctrl_resources[rid].resctrl_res.rid = rid;
+
+	err = cbqri_resctrl_pick_caches();
+	if (err)
+		goto err_reset;
+
+	err = cbqri_resctrl_pick_bw_alloc();
+	if (err)
+		goto err_reset;
+
+	cbqri_resctrl_pick_counters();
+
+	for (rid = 0; rid < RDT_NUM_RESOURCES; rid++) {
+		err = cbqri_resctrl_control_init(&cbqri_resctrl_resources[rid]);
+		if (err)
+			goto err_reset;
+	}
+
+	cbqri_resctrl_accumulate_caps();
+
+	if (!exposed_alloc_capable && !exposed_mon_capable) {
+		pr_debug("no resctrl-capable CBQRI controllers found\n");
+		err = -ENODEV;
+		goto err_reset;
+	}
+
+	err = resctrl_init();
+	if (err)
+		goto err_reset;
+
+	cbqri_resctrl_inited = true;
+	return 0;
+
+err_reset:
+	/*
+	 * cbqri_resctrl_inited is still false here, so
+	 * cbqri_resctrl_teardown() would no-op. Reset the state directly.
+	 * Monitoring events already enabled through
+	 * resctrl_enable_mon_event() are not undone by this path.
+	 */
+	cbqri_resctrl_reset_state();
+	return err;
+}
+
+/*
+ * cpuhp startup callback: attach cpu to its domains under
+ * cbqri_domain_list_lock, then hand the CPU to the resctrl core.
+ * Returns 0 on success or the attach error.
+ */
+static int cbqri_resctrl_online_cpu(unsigned int cpu)
+{
+	int ret;
+
+	mutex_lock(&cbqri_domain_list_lock);
+	ret = cbqri_attach_cpu_to_all_ctrls(cpu);
+	mutex_unlock(&cbqri_domain_list_lock);
+
+	if (!ret) {
+		/*
+		 * Start the CPU on the reserved (0, 0) RCID/MCID pair and
+		 * tell the resctrl core about it so the CPU is tracked in
+		 * the default group. Mirrors x86 resctrl_arch_online_cpu().
+		 */
+		resctrl_arch_set_cpu_default_closid_rmid(cpu, 0, 0);
+		resctrl_online_cpu(cpu);
+	}
+
+	return ret;
+}
+
+/*
+ * cpuhp teardown callback: notify the resctrl core first, then detach cpu
+ * from all of its domains under cbqri_domain_list_lock. Always returns 0.
+ */
+static int cbqri_resctrl_offline_cpu(unsigned int cpu)
+{
+	resctrl_offline_cpu(cpu);
+
+	mutex_lock(&cbqri_domain_list_lock);
+	cbqri_detach_cpu_from_all_ctrls(cpu);
+	mutex_unlock(&cbqri_domain_list_lock);
+	return 0;
+}
+
+/* Saved cpuhp slot from cpuhp_setup_state() for symmetric removal. */
+static enum cpuhp_state cbqri_cpuhp_state;
+
+/*
+ * Late initcall: bring up resctrl on top of the registered CBQRI
+ * controllers and install the CPU hotplug callbacks.
+ *
+ * Returns -ENODEV when the Ssqosid extension is not available, the setup
+ * error if cbqri_resctrl_setup() fails, or the cpuhp_setup_state() error
+ * (after tearing resctrl back down). Returns 0 on success.
+ */
+static int __init cbqri_arch_late_init(void)
+{
+	int state;
+	int err;
+
+	if (!riscv_isa_extension_available(NULL, SSQOSID))
+		return -ENODEV;
+
+	err = cbqri_resctrl_setup();
+	if (err)
+		return err;
+
+	state = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "cbqri:online",
+				  cbqri_resctrl_online_cpu,
+				  cbqri_resctrl_offline_cpu);
+	if (state < 0) {
+		cbqri_resctrl_teardown();
+		return state;
+	}
+
+	/* A non-negative return is the dynamically allocated state slot. */
+	cbqri_cpuhp_state = state;
+
+	return 0;
+}
+late_initcall(cbqri_arch_late_init);
diff --git a/fs/resctrl/ctrlmondata.c b/fs/resctrl/ctrlmondata.c
index 9a7dfc48cb2e29..d9f05270094155 100644
--- a/fs/resctrl/ctrlmondata.c
+++ b/fs/resctrl/ctrlmondata.c
@@ -245,8 +245,7 @@ static int parse_line(char *line, struct resctrl_schema *s,
 	if (WARN_ON_ONCE(!parse_ctrlval))
 		return -EINVAL;
 
-	if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP &&
-	    (r->rid == RDT_RESOURCE_MBA || r->rid == RDT_RESOURCE_SMBA)) {
+	if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP && resctrl_is_membw(r)) {
 		rdt_last_cmd_puts("Cannot pseudo-lock MBA resource\n");
 		return -EINVAL;
 	}
diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h
index 1a9b29119f88f8..76187987b2ee46 100644
--- a/fs/resctrl/internal.h
+++ b/fs/resctrl/internal.h
@@ -397,6 +397,8 @@ void mbm_handle_overflow(struct work_struct *work);
 
 bool is_mba_sc(struct rdt_resource *r);
 
+bool resctrl_is_membw(struct rdt_resource *r);
+
 void cqm_setup_limbo_handler(struct rdt_l3_mon_domain *dom, unsigned long delay_ms,
 			     int exclude_cpu);
 
diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c
index 5dfdaa6f9d8ff6..02733b11e115ea 100644
--- a/fs/resctrl/rdtgroup.c
+++ b/fs/resctrl/rdtgroup.c
@@ -1412,7 +1412,7 @@ static bool rdtgroup_mode_test_exclusive(struct rdtgroup *rdtgrp)
 
 	list_for_each_entry(s, &resctrl_schema_all, list) {
 		r = s->res;
-		if (r->rid == RDT_RESOURCE_MBA || r->rid == RDT_RESOURCE_SMBA)
+		if (resctrl_is_membw(r))
 			continue;
 		has_cache = true;
 		list_for_each_entry(d, &r->ctrl_domains, hdr.list) {
@@ -1555,6 +1555,12 @@ bool is_mba_sc(struct rdt_resource *r)
 	return r->membw.mba_sc;
 }
 
+/* RANGE schema is bandwidth (MBA/SMBA/MB_MIN/MB_WGHT). BITMAP is cache. */
+bool resctrl_is_membw(struct rdt_resource *r)
+{
+	return r->schema_fmt == RESCTRL_SCHEMA_RANGE;
+}
+
 /*
  * rdtgroup_size_show - Display size in bytes of allocated regions
  *
@@ -1616,8 +1622,7 @@ static int rdtgroup_size_show(struct kernfs_open_file *of,
 					ctrl = resctrl_arch_get_config(r, d,
 								       closid,
 								       type);
-				if (r->rid == RDT_RESOURCE_MBA ||
-				    r->rid == RDT_RESOURCE_SMBA)
+				if (resctrl_is_membw(r))
 					size = ctrl;
 				else
 					size = rdtgroup_cbm_to_size(r, d, ctrl);
@@ -2397,6 +2402,8 @@ static unsigned long fflags_from_resource(struct rdt_resource *r)
 		return RFTYPE_RES_CACHE;
 	case RDT_RESOURCE_MBA:
 	case RDT_RESOURCE_SMBA:
+	case RDT_RESOURCE_MB_MIN:
+	case RDT_RESOURCE_MB_WGHT:
 		return RFTYPE_RES_MB;
 	case RDT_RESOURCE_PERF_PKG:
 		return RFTYPE_RES_PERF_PKG;
@@ -3648,8 +3655,7 @@ static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp)
 
 	list_for_each_entry(s, &resctrl_schema_all, list) {
 		r = s->res;
-		if (r->rid == RDT_RESOURCE_MBA ||
-		    r->rid == RDT_RESOURCE_SMBA) {
+		if (resctrl_is_membw(r)) {
 			rdtgroup_init_mba(r, rdtgrp->closid);
 			if (is_mba_sc(r))
 				continue;
diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h
index 006e57fd7ca589..bcbc166412ef13 100644
--- a/include/linux/resctrl.h
+++ b/include/linux/resctrl.h
@@ -53,6 +53,8 @@ enum resctrl_res_level {
 	RDT_RESOURCE_L2,
 	RDT_RESOURCE_MBA,
 	RDT_RESOURCE_SMBA,
+	RDT_RESOURCE_MB_MIN,
+	RDT_RESOURCE_MB_WGHT,
 	RDT_RESOURCE_PERF_PKG,
 
 	/* Must be the last */
@@ -245,7 +247,13 @@ enum membw_throttle_mode {
 /**
  * struct resctrl_membw - Memory bandwidth allocation related data
  * @min_bw:		Minimum memory bandwidth percentage user can request
- * @max_bw:		Maximum memory bandwidth value, used as the reset value
+ * @max_bw:		Maximum memory bandwidth value a group can be
+ *			configured with
+ * @default_to_min:	When true, the default control value for new
+ *			groups and reset is @min_bw instead of @max_bw.
+ *			Drivers whose hardware enforces a sum constraint
+ *			across groups (e.g. CBQRI MB_MIN) set this so
+ *			mkdir does not overflow the sum.
  * @bw_gran:		Granularity at which the memory bandwidth is allocated
  * @delay_linear:	True if memory B/W delay is in linear scale
  * @arch_needs_linear:	True if we can't configure non-linear resources
@@ -257,6 +265,7 @@ enum membw_throttle_mode {
 struct resctrl_membw {
 	u32				min_bw;
 	u32				max_bw;
+	bool				default_to_min;
 	u32				bw_gran;
 	u32				delay_linear;
 	bool				arch_needs_linear;
@@ -403,7 +412,7 @@ static inline u32 resctrl_get_default_ctrl(struct rdt_resource *r)
 	case RESCTRL_SCHEMA_BITMAP:
 		return BIT_MASK(r->cache.cbm_len) - 1;
 	case RESCTRL_SCHEMA_RANGE:
-		return r->membw.max_bw;
+		return r->membw.default_to_min ? r->membw.min_bw : r->membw.max_bw;
 	}
 
 	return WARN_ON_ONCE(1);
diff --git a/include/linux/riscv_cbqri.h b/include/linux/riscv_cbqri.h
new file mode 100644
index 00000000000000..5863f0a65f6cc5
--- /dev/null
+++ b/include/linux/riscv_cbqri.h
@@ -0,0 +1,60 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Public registration API for the RISC-V Capacity and Bandwidth QoS
+ * Register Interface (CBQRI) driver. Discovery layers (ACPI RQSC, future
+ * device tree) call riscv_cbqri_register_controller() to hand a controller
+ * descriptor to the driver, which owns all subsequent state.
+ */
+#ifndef _LINUX_RISCV_CBQRI_H
+#define _LINUX_RISCV_CBQRI_H
+
+#include <linux/errno.h>
+#include <linux/types.h>
+
+/**
+ * enum cbqri_controller_type - kind of CBQRI controller being registered
+ * @CBQRI_CONTROLLER_TYPE_CAPACITY:  capacity controller; its descriptor
+ *                                   carries a PPTT cache id
+ * @CBQRI_CONTROLLER_TYPE_BANDWIDTH: bandwidth controller; its descriptor
+ *                                   carries an SRAT proximity domain
+ */
+enum cbqri_controller_type {
+	CBQRI_CONTROLLER_TYPE_CAPACITY,
+	CBQRI_CONTROLLER_TYPE_BANDWIDTH,
+};
+
+/* Sanity caps on per-controller RCID/MCID counts from firmware */
+#define CBQRI_MAX_RCID	1024
+#define CBQRI_MAX_MCID	1024
+
+/**
+ * struct cbqri_controller_info - registration descriptor
+ * @addr:        MMIO base address of the controller's register interface
+ * @size:        size of the MMIO region
+ * @type:        capacity or bandwidth controller
+ * @rcid_count:  number of supported RCIDs (per RQSC table)
+ * @mcid_count:  number of supported MCIDs (per RQSC table)
+ * @cache_id:    PPTT cache id. Only meaningful for CAPACITY controllers
+ * @prox_dom:    SRAT proximity domain. Only meaningful for BANDWIDTH
+ *               controllers
+ *
+ * Discovery layers populate one of @cache_id / @prox_dom according to
+ * @type. The CBQRI driver resolves the matching cpumask internally so
+ * callers do not need to know about cacheinfo/NUMA topology.
+ */
+struct cbqri_controller_info {
+	phys_addr_t			addr;
+	phys_addr_t			size;
+	enum cbqri_controller_type	type;
+	u32				rcid_count;
+	u32				mcid_count;
+	u32				cache_id;
+	u32				prox_dom;
+};
+
+#if IS_ENABLED(CONFIG_RISCV_CBQRI_DRIVER)
+int riscv_cbqri_register_controller(const struct cbqri_controller_info *info);
+void riscv_cbqri_unregister_last(unsigned int n);
+#else
+/* Driver not built: registration reports -ENODEV, unregistration is a no-op. */
+static inline int
+riscv_cbqri_register_controller(const struct cbqri_controller_info *info)
+{
+	return -ENODEV;
+}
+
+static inline void riscv_cbqri_unregister_last(unsigned int n) { }
+#endif
+
+#endif /* _LINUX_RISCV_CBQRI_H */