From b10b55483e1adf9a1c4906afdcac65099ec31651 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 24 Mar 2026 07:21:37 +0100
Subject: [PATCH 01/26] xor: assert that xor_blocks is not from preemptible
 user context

Most of the optimized xor_blocks versions require FPU/vector registers,
which generally are not supported in interrupt context.

Both callers already are in user context, so enforce this at the highest
level.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Linux RISC-V bot <linux.riscv.bot@gmail.com>
---
 crypto/xor.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/crypto/xor.c b/crypto/xor.c
index f39621a57bb33c..676d7401af430b 100644
--- a/crypto/xor.c
+++ b/crypto/xor.c
@@ -28,6 +28,8 @@ xor_blocks(unsigned int src_count, unsigned int bytes, void *dest, void **srcs)
 {
 	unsigned long *p1, *p2, *p3, *p4;
 
+	lockdep_assert_preemption_enabled();
+
 	p1 = (unsigned long *) srcs[0];
 	if (src_count == 1) {
 		active_template->do_2(bytes, dest, p1);

From eb7fb8397d5f324121971554b19392e28c542da5 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 24 Mar 2026 07:21:38 +0100
Subject: [PATCH 02/26] arm/xor: remove in_interrupt() handling

xor_blocks can't be called from interrupt context, so remove the
handling for that.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Linux RISC-V bot <linux.riscv.bot@gmail.com>
---
 arch/arm/include/asm/xor.h | 41 +++++++++++---------------------------
 1 file changed, 12 insertions(+), 29 deletions(-)

diff --git a/arch/arm/include/asm/xor.h b/arch/arm/include/asm/xor.h
index 934b549905f5c5..bca2a6514746e1 100644
--- a/arch/arm/include/asm/xor.h
+++ b/arch/arm/include/asm/xor.h
@@ -4,7 +4,6 @@
  *
  *  Copyright (C) 2001 Russell King
  */
-#include <linux/hardirq.h>
 #include <asm-generic/xor.h>
 #include <asm/hwcap.h>
 #include <asm/neon.h>
@@ -156,13 +155,9 @@ static void
 xor_neon_2(unsigned long bytes, unsigned long * __restrict p1,
 	   const unsigned long * __restrict p2)
 {
-	if (in_interrupt()) {
-		xor_arm4regs_2(bytes, p1, p2);
-	} else {
-		kernel_neon_begin();
-		xor_block_neon_inner.do_2(bytes, p1, p2);
-		kernel_neon_end();
-	}
+	kernel_neon_begin();
+	xor_block_neon_inner.do_2(bytes, p1, p2);
+	kernel_neon_end();
 }
 
 static void
@@ -170,13 +165,9 @@ xor_neon_3(unsigned long bytes, unsigned long * __restrict p1,
 	   const unsigned long * __restrict p2,
 	   const unsigned long * __restrict p3)
 {
-	if (in_interrupt()) {
-		xor_arm4regs_3(bytes, p1, p2, p3);
-	} else {
-		kernel_neon_begin();
-		xor_block_neon_inner.do_3(bytes, p1, p2, p3);
-		kernel_neon_end();
-	}
+	kernel_neon_begin();
+	xor_block_neon_inner.do_3(bytes, p1, p2, p3);
+	kernel_neon_end();
 }
 
 static void
@@ -185,13 +176,9 @@ xor_neon_4(unsigned long bytes, unsigned long * __restrict p1,
 	   const unsigned long * __restrict p3,
 	   const unsigned long * __restrict p4)
 {
-	if (in_interrupt()) {
-		xor_arm4regs_4(bytes, p1, p2, p3, p4);
-	} else {
-		kernel_neon_begin();
-		xor_block_neon_inner.do_4(bytes, p1, p2, p3, p4);
-		kernel_neon_end();
-	}
+	kernel_neon_begin();
+	xor_block_neon_inner.do_4(bytes, p1, p2, p3, p4);
+	kernel_neon_end();
 }
 
 static void
@@ -201,13 +188,9 @@ xor_neon_5(unsigned long bytes, unsigned long * __restrict p1,
 	   const unsigned long * __restrict p4,
 	   const unsigned long * __restrict p5)
 {
-	if (in_interrupt()) {
-		xor_arm4regs_5(bytes, p1, p2, p3, p4, p5);
-	} else {
-		kernel_neon_begin();
-		xor_block_neon_inner.do_5(bytes, p1, p2, p3, p4, p5);
-		kernel_neon_end();
-	}
+	kernel_neon_begin();
+	xor_block_neon_inner.do_5(bytes, p1, p2, p3, p4, p5);
+	kernel_neon_end();
 }
 
 static struct xor_block_template xor_block_neon = {

From 814b2e2a24f7b282ab1f463deb6dec1aa5269e27 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 24 Mar 2026 07:21:39 +0100
Subject: [PATCH 03/26] um/xor: cleanup xor.h

Since commit c055e3eae0f1 ("crypto: xor - use ktime for template
benchmarking") the benchmarking works just fine even for TT_MODE_INFCPU,
so drop the workarounds.  Note that for CPUs supporting AVX2, which
includes almost everything built in the last 10 years, the AVX2
implementation is forced anyway.

CONFIG_X86_32 is always correctly set for UM in arch/x86/um/Kconfig,
so don't override it either.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Richard Weinberger <richard@nod.at>
Signed-off-by: Linux RISC-V bot <linux.riscv.bot@gmail.com>
---
 arch/um/include/asm/xor.h | 16 ----------------
 1 file changed, 16 deletions(-)

diff --git a/arch/um/include/asm/xor.h b/arch/um/include/asm/xor.h
index 647fae200c5d34..99e5c7e1f47574 100644
--- a/arch/um/include/asm/xor.h
+++ b/arch/um/include/asm/xor.h
@@ -2,23 +2,7 @@
 #ifndef _ASM_UM_XOR_H
 #define _ASM_UM_XOR_H
 
-#ifdef CONFIG_64BIT
-#undef CONFIG_X86_32
-#define TT_CPU_INF_XOR_DEFAULT (AVX_SELECT(&xor_block_sse_pf64))
-#else
-#define CONFIG_X86_32 1
-#define TT_CPU_INF_XOR_DEFAULT (AVX_SELECT(&xor_block_8regs))
-#endif
-
 #include <asm/cpufeature.h>
 #include <../../x86/include/asm/xor.h>
-#include <linux/time-internal.h>
-
-#ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT
-#undef XOR_SELECT_TEMPLATE
-/* pick an arbitrary one - measuring isn't possible with inf-cpu */
-#define XOR_SELECT_TEMPLATE(x)	\
-	(time_travel_mode == TT_MODE_INFCPU ? TT_CPU_INF_XOR_DEFAULT : x)
-#endif
 
 #endif

From 27ad6ff3f6c9e2a3265e95c907dce27b8b1753a3 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 24 Mar 2026 07:21:40 +0100
Subject: [PATCH 04/26] xor: move to lib/raid/

Move the RAID XOR code to lib/raid/ as it has nothing to do with the
crypto API.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Linux RISC-V bot <linux.riscv.bot@gmail.com>
---
 crypto/Kconfig                          | 2 --
 crypto/Makefile                         | 1 -
 lib/Kconfig                             | 1 +
 lib/Makefile                            | 2 +-
 lib/raid/Kconfig                        | 4 ++++
 lib/raid/Makefile                       | 3 +++
 lib/raid/xor/Makefile                   | 5 +++++
 crypto/xor.c => lib/raid/xor/xor-core.c | 0
 8 files changed, 14 insertions(+), 4 deletions(-)
 create mode 100644 lib/raid/Kconfig
 create mode 100644 lib/raid/Makefile
 create mode 100644 lib/raid/xor/Makefile
 rename crypto/xor.c => lib/raid/xor/xor-core.c (100%)

diff --git a/crypto/Kconfig b/crypto/Kconfig
index e2b4106ac961eb..5cdb1b25ae875b 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -2,8 +2,6 @@
 #
 # Generic algorithms support
 #
-config XOR_BLOCKS
-	tristate
 
 #
 # async_tx api: hardware offloaded memory transfer/transform support
diff --git a/crypto/Makefile b/crypto/Makefile
index 04e269117589ac..795c2eea51fe6e 100644
--- a/crypto/Makefile
+++ b/crypto/Makefile
@@ -196,7 +196,6 @@ obj-$(CONFIG_CRYPTO_ECRDSA) += ecrdsa_generic.o
 #
 # generic algorithms and the async_tx api
 #
-obj-$(CONFIG_XOR_BLOCKS) += xor.o
 obj-$(CONFIG_ASYNC_CORE) += async_tx/
 obj-$(CONFIG_ASYMMETRIC_KEY_TYPE) += asymmetric_keys/
 crypto_simd-y := simd.o
diff --git a/lib/Kconfig b/lib/Kconfig
index 0f2fb96106476c..5be57adcd454ff 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -138,6 +138,7 @@ config TRACE_MMIO_ACCESS
 
 source "lib/crc/Kconfig"
 source "lib/crypto/Kconfig"
+source "lib/raid/Kconfig"
 
 config XXHASH
 	tristate
diff --git a/lib/Makefile b/lib/Makefile
index 1b9ee167517f32..84da412a044ff2 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -120,7 +120,7 @@ endif
 obj-$(CONFIG_DEBUG_INFO_REDUCED) += debug_info.o
 CFLAGS_debug_info.o += $(call cc-option, -femit-struct-debug-detailed=any)
 
-obj-y += math/ crc/ crypto/ tests/ vdso/
+obj-y += math/ crc/ crypto/ tests/ vdso/ raid/
 
 obj-$(CONFIG_GENERIC_IOMAP) += iomap.o
 obj-$(CONFIG_HAS_IOMEM) += iomap_copy.o devres.o
diff --git a/lib/raid/Kconfig b/lib/raid/Kconfig
new file mode 100644
index 00000000000000..01b73a1c303fee
--- /dev/null
+++ b/lib/raid/Kconfig
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
+
+config XOR_BLOCKS
+	tristate
diff --git a/lib/raid/Makefile b/lib/raid/Makefile
new file mode 100644
index 00000000000000..3540fe846dc427
--- /dev/null
+++ b/lib/raid/Makefile
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0
+
+obj-y				+= xor/
diff --git a/lib/raid/xor/Makefile b/lib/raid/xor/Makefile
new file mode 100644
index 00000000000000..7bca0ce8e90ac0
--- /dev/null
+++ b/lib/raid/xor/Makefile
@@ -0,0 +1,5 @@
+# SPDX-License-Identifier: GPL-2.0
+
+obj-$(CONFIG_XOR_BLOCKS)	+= xor.o
+
+xor-y				+= xor-core.o
diff --git a/crypto/xor.c b/lib/raid/xor/xor-core.c
similarity index 100%
rename from crypto/xor.c
rename to lib/raid/xor/xor-core.c

From b147b818d7b44bc260f4b0b3d0b57f4004a0a7a8 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 24 Mar 2026 07:21:41 +0100
Subject: [PATCH 05/26] xor: small cleanups

Update the to of file comment to be correct and non-redundant, and drop
the unused BH_TRACE define.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Linux RISC-V bot <linux.riscv.bot@gmail.com>
---
 lib/raid/xor/xor-core.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/lib/raid/xor/xor-core.c b/lib/raid/xor/xor-core.c
index 676d7401af430b..edb4e498da603c 100644
--- a/lib/raid/xor/xor-core.c
+++ b/lib/raid/xor/xor-core.c
@@ -1,14 +1,11 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
 /*
- * xor.c : Multiple Devices driver for Linux
- *
  * Copyright (C) 1996, 1997, 1998, 1999, 2000,
  * Ingo Molnar, Matti Aarnio, Jakub Jelinek, Richard Henderson.
  *
- * Dispatch optimized RAID-5 checksumming functions.
+ * Dispatch optimized XOR parity functions.
  */
 
-#define BH_TRACE 0
 #include <linux/module.h>
 #include <linux/gfp.h>
 #include <linux/raid/xor.h>

From 07a322617d82d93da694a427ffe54308c8543361 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 24 Mar 2026 07:21:42 +0100
Subject: [PATCH 06/26] xor: cleanup registration and probing

Originally, the XOR code benchmarked all algorithms at load time, but
it has since then been hacked multiple times to allow forcing an
algorithm, and then commit 524ccdbdfb52 ("crypto: xor - defer load time
benchmark to a later time") changed the logic to a two-step process
or registration and benchmarking, but only when built-in.

Rework this, so that the XOR_TRY_TEMPLATES macro magic now always just
deals with adding the templates to the list, and benchmarking is always
done in a second pass; for modular builds from module_init, and for the
built-in case using a separate init call level.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Linux RISC-V bot <linux.riscv.bot@gmail.com>
---
 lib/raid/xor/xor-core.c | 98 ++++++++++++++++++++---------------------
 1 file changed, 48 insertions(+), 50 deletions(-)

diff --git a/lib/raid/xor/xor-core.c b/lib/raid/xor/xor-core.c
index edb4e498da603c..88667a89b75b34 100644
--- a/lib/raid/xor/xor-core.c
+++ b/lib/raid/xor/xor-core.c
@@ -52,29 +52,14 @@ EXPORT_SYMBOL(xor_blocks);
 
 /* Set of all registered templates.  */
 static struct xor_block_template *__initdata template_list;
+static bool __initdata xor_forced = false;
 
-#ifndef MODULE
 static void __init do_xor_register(struct xor_block_template *tmpl)
 {
 	tmpl->next = template_list;
 	template_list = tmpl;
 }
 
-static int __init register_xor_blocks(void)
-{
-	active_template = XOR_SELECT_TEMPLATE(NULL);
-
-	if (!active_template) {
-#define xor_speed	do_xor_register
-		// register all the templates and pick the first as the default
-		XOR_TRY_TEMPLATES;
-#undef xor_speed
-		active_template = template_list;
-	}
-	return 0;
-}
-#endif
-
 #define BENCH_SIZE	4096
 #define REPS		800U
 
@@ -85,9 +70,6 @@ do_xor_speed(struct xor_block_template *tmpl, void *b1, void *b2)
 	unsigned long reps;
 	ktime_t min, start, t0;
 
-	tmpl->next = template_list;
-	template_list = tmpl;
-
 	preempt_disable();
 
 	reps = 0;
@@ -111,63 +93,79 @@ do_xor_speed(struct xor_block_template *tmpl, void *b1, void *b2)
 	pr_info("   %-16s: %5d MB/sec\n", tmpl->name, speed);
 }
 
-static int __init
-calibrate_xor_blocks(void)
+static int __init calibrate_xor_blocks(void)
 {
 	void *b1, *b2;
 	struct xor_block_template *f, *fastest;
 
-	fastest = XOR_SELECT_TEMPLATE(NULL);
-
-	if (fastest) {
-		printk(KERN_INFO "xor: automatically using best "
-				 "checksumming function   %-10s\n",
-		       fastest->name);
-		goto out;
-	}
+	if (xor_forced)
+		return 0;
 
 	b1 = (void *) __get_free_pages(GFP_KERNEL, 2);
 	if (!b1) {
-		printk(KERN_WARNING "xor: Yikes!  No memory available.\n");
+		pr_warn("xor: Yikes!  No memory available.\n");
 		return -ENOMEM;
 	}
 	b2 = b1 + 2*PAGE_SIZE + BENCH_SIZE;
 
-	/*
-	 * If this arch/cpu has a short-circuited selection, don't loop through
-	 * all the possible functions, just test the best one
-	 */
-
-#define xor_speed(templ)	do_xor_speed((templ), b1, b2)
-
-	printk(KERN_INFO "xor: measuring software checksum speed\n");
-	template_list = NULL;
-	XOR_TRY_TEMPLATES;
+	pr_info("xor: measuring software checksum speed\n");
 	fastest = template_list;
-	for (f = fastest; f; f = f->next)
+	for (f = template_list; f; f = f->next) {
+		do_xor_speed(f, b1, b2);
 		if (f->speed > fastest->speed)
 			fastest = f;
-
+	}
+	active_template = fastest;
 	pr_info("xor: using function: %s (%d MB/sec)\n",
 	       fastest->name, fastest->speed);
 
+	free_pages((unsigned long)b1, 2);
+	return 0;
+}
+
+static int __init xor_init(void)
+{
+	/*
+	 * If this arch/cpu has a short-circuited selection, don't loop through
+	 * all the possible functions, just use the best one.
+	 */
+	active_template = XOR_SELECT_TEMPLATE(NULL);
+	if (active_template) {
+		pr_info("xor: automatically using best checksumming function   %-10s\n",
+			active_template->name);
+		xor_forced = true;
+		return 0;
+	}
+
+#define xor_speed	do_xor_register
+	XOR_TRY_TEMPLATES;
 #undef xor_speed
 
-	free_pages((unsigned long)b1, 2);
-out:
-	active_template = fastest;
+#ifdef MODULE
+	return calibrate_xor_blocks();
+#else
+	/*
+	 * Pick the first template as the temporary default until calibration
+	 * happens.
+	 */
+	active_template = template_list;
 	return 0;
+#endif
 }
 
-static __exit void xor_exit(void) { }
+static __exit void xor_exit(void)
+{
+}
 
 MODULE_DESCRIPTION("RAID-5 checksumming functions");
 MODULE_LICENSE("GPL");
 
+/*
+ * When built-in we must register the default template before md, but we don't
+ * want calibration to run that early as that would delay the boot process.
+ */
 #ifndef MODULE
-/* when built-in xor.o must initialize before drivers/md/md.o */
-core_initcall(register_xor_blocks);
+__initcall(calibrate_xor_blocks);
 #endif
-
-module_init(calibrate_xor_blocks);
+core_initcall(xor_init);
 module_exit(xor_exit);

From df0cfd35c2df9add7a823c379bdc4b9c4e9c142d Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 24 Mar 2026 07:21:43 +0100
Subject: [PATCH 07/26] xor: split xor.h

Keep xor.h for the public API, and split the struct xor_block_template
definition that is only needed by the xor.ko core and
architecture-specific optimizations into a separate xor_impl.h header.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Linux RISC-V bot <linux.riscv.bot@gmail.com>
---
 arch/arm/lib/xor-neon.c       |  1 +
 arch/arm64/lib/xor-neon.c     |  1 +
 arch/s390/lib/xor.c           |  2 +-
 include/linux/raid/xor.h      | 22 +---------------------
 include/linux/raid/xor_impl.h | 25 +++++++++++++++++++++++++
 lib/raid/xor/xor-core.c       |  1 +
 6 files changed, 30 insertions(+), 22 deletions(-)
 create mode 100644 include/linux/raid/xor_impl.h

diff --git a/arch/arm/lib/xor-neon.c b/arch/arm/lib/xor-neon.c
index cf57fca979089a..282980b9bf2a22 100644
--- a/arch/arm/lib/xor-neon.c
+++ b/arch/arm/lib/xor-neon.c
@@ -6,6 +6,7 @@
  */
 
 #include <linux/raid/xor.h>
+#include <linux/raid/xor_impl.h>
 #include <linux/module.h>
 
 MODULE_DESCRIPTION("NEON accelerated XOR implementation");
diff --git a/arch/arm64/lib/xor-neon.c b/arch/arm64/lib/xor-neon.c
index 8fffebfa17b204..351aba92d9324c 100644
--- a/arch/arm64/lib/xor-neon.c
+++ b/arch/arm64/lib/xor-neon.c
@@ -7,6 +7,7 @@
  */
 
 #include <linux/raid/xor.h>
+#include <linux/raid/xor_impl.h>
 #include <linux/module.h>
 #include <asm/neon-intrinsics.h>
 
diff --git a/arch/s390/lib/xor.c b/arch/s390/lib/xor.c
index 1721b73b780369..4d5ed638d850fa 100644
--- a/arch/s390/lib/xor.c
+++ b/arch/s390/lib/xor.c
@@ -8,7 +8,7 @@
 
 #include <linux/types.h>
 #include <linux/export.h>
-#include <linux/raid/xor.h>
+#include <linux/raid/xor_impl.h>
 #include <asm/xor.h>
 
 static void xor_xc_2(unsigned long bytes, unsigned long * __restrict p1,
diff --git a/include/linux/raid/xor.h b/include/linux/raid/xor.h
index 51b811b623224f..02bda8d995347b 100644
--- a/include/linux/raid/xor.h
+++ b/include/linux/raid/xor.h
@@ -7,24 +7,4 @@
 extern void xor_blocks(unsigned int count, unsigned int bytes,
 	void *dest, void **srcs);
 
-struct xor_block_template {
-        struct xor_block_template *next;
-        const char *name;
-        int speed;
-	void (*do_2)(unsigned long, unsigned long * __restrict,
-		     const unsigned long * __restrict);
-	void (*do_3)(unsigned long, unsigned long * __restrict,
-		     const unsigned long * __restrict,
-		     const unsigned long * __restrict);
-	void (*do_4)(unsigned long, unsigned long * __restrict,
-		     const unsigned long * __restrict,
-		     const unsigned long * __restrict,
-		     const unsigned long * __restrict);
-	void (*do_5)(unsigned long, unsigned long * __restrict,
-		     const unsigned long * __restrict,
-		     const unsigned long * __restrict,
-		     const unsigned long * __restrict,
-		     const unsigned long * __restrict);
-};
-
-#endif
+#endif /* _XOR_H */
diff --git a/include/linux/raid/xor_impl.h b/include/linux/raid/xor_impl.h
new file mode 100644
index 00000000000000..a1890cd6681243
--- /dev/null
+++ b/include/linux/raid/xor_impl.h
@@ -0,0 +1,25 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _XOR_IMPL_H
+#define _XOR_IMPL_H
+
+struct xor_block_template {
+	struct xor_block_template *next;
+	const char *name;
+	int speed;
+	void (*do_2)(unsigned long, unsigned long * __restrict,
+		     const unsigned long * __restrict);
+	void (*do_3)(unsigned long, unsigned long * __restrict,
+		     const unsigned long * __restrict,
+		     const unsigned long * __restrict);
+	void (*do_4)(unsigned long, unsigned long * __restrict,
+		     const unsigned long * __restrict,
+		     const unsigned long * __restrict,
+		     const unsigned long * __restrict);
+	void (*do_5)(unsigned long, unsigned long * __restrict,
+		     const unsigned long * __restrict,
+		     const unsigned long * __restrict,
+		     const unsigned long * __restrict,
+		     const unsigned long * __restrict);
+};
+
+#endif /* _XOR_IMPL_H */
diff --git a/lib/raid/xor/xor-core.c b/lib/raid/xor/xor-core.c
index 88667a89b75b34..58f2d0c16420d4 100644
--- a/lib/raid/xor/xor-core.c
+++ b/lib/raid/xor/xor-core.c
@@ -9,6 +9,7 @@
 #include <linux/module.h>
 #include <linux/gfp.h>
 #include <linux/raid/xor.h>
+#include <linux/raid/xor_impl.h>
 #include <linux/jiffies.h>
 #include <linux/preempt.h>
 #include <asm/xor.h>

From f9df7e7ca8f4e690899b0ae773a6bf89bb51dac9 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 24 Mar 2026 07:21:44 +0100
Subject: [PATCH 08/26] xor: remove macro abuse for XOR implementation
 registrations

Drop the pretty confusing historic XOR_TRY_TEMPLATES and
XOR_SELECT_TEMPLATE, and instead let the architectures provide a
arch_xor_init that calls either xor_register to register candidates
or xor_force to force a specific implementation.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Linux RISC-V bot <linux.riscv.bot@gmail.com>
---
 arch/alpha/include/asm/xor.h     | 29 ++++++++++++----------
 arch/arm/include/asm/xor.h       | 25 +++++++++----------
 arch/arm64/include/asm/xor.h     | 18 +++++++-------
 arch/loongarch/include/asm/xor.h | 42 ++++++++++++--------------------
 arch/powerpc/include/asm/xor.h   | 31 ++++++++++-------------
 arch/riscv/include/asm/xor.h     | 19 ++++++++-------
 arch/s390/include/asm/xor.h      | 12 ++++-----
 arch/sparc/include/asm/xor_32.h  | 14 +++++------
 arch/sparc/include/asm/xor_64.h  | 31 +++++++++++------------
 arch/x86/include/asm/xor.h       |  3 ---
 arch/x86/include/asm/xor_32.h    | 36 ++++++++++++++-------------
 arch/x86/include/asm/xor_64.h    | 18 ++++++++------
 arch/x86/include/asm/xor_avx.h   |  9 -------
 include/asm-generic/xor.h        |  8 ------
 include/linux/raid/xor_impl.h    |  5 ++++
 lib/raid/xor/xor-core.c          | 41 +++++++++++++++++++++++--------
 16 files changed, 168 insertions(+), 173 deletions(-)

diff --git a/arch/alpha/include/asm/xor.h b/arch/alpha/include/asm/xor.h
index e0de0c233ab923..4c8085711df182 100644
--- a/arch/alpha/include/asm/xor.h
+++ b/arch/alpha/include/asm/xor.h
@@ -851,16 +851,19 @@ static struct xor_block_template xor_block_alpha_prefetch = {
 /* For grins, also test the generic routines.  */
 #include <asm-generic/xor.h>
 
-#undef XOR_TRY_TEMPLATES
-#define XOR_TRY_TEMPLATES				\
-	do {						\
-		xor_speed(&xor_block_8regs);		\
-		xor_speed(&xor_block_32regs);		\
-		xor_speed(&xor_block_alpha);		\
-		xor_speed(&xor_block_alpha_prefetch);	\
-	} while (0)
-
-/* Force the use of alpha_prefetch if EV6, as it is significantly
-   faster in the cold cache case.  */
-#define XOR_SELECT_TEMPLATE(FASTEST) \
-	(implver() == IMPLVER_EV6 ? &xor_block_alpha_prefetch : FASTEST)
+/*
+ * Force the use of alpha_prefetch if EV6, as it is significantly faster in the
+ * cold cache case.
+ */
+#define arch_xor_init arch_xor_init
+static __always_inline void __init arch_xor_init(void)
+{
+	if (implver() == IMPLVER_EV6) {
+		xor_force(&xor_block_alpha_prefetch);
+	} else {
+		xor_register(&xor_block_8regs);
+		xor_register(&xor_block_32regs);
+		xor_register(&xor_block_alpha);
+		xor_register(&xor_block_alpha_prefetch);
+	}
+}
diff --git a/arch/arm/include/asm/xor.h b/arch/arm/include/asm/xor.h
index bca2a6514746e1..b2dcd49186e2b9 100644
--- a/arch/arm/include/asm/xor.h
+++ b/arch/arm/include/asm/xor.h
@@ -138,15 +138,6 @@ static struct xor_block_template xor_block_arm4regs = {
 	.do_5	= xor_arm4regs_5,
 };
 
-#undef XOR_TRY_TEMPLATES
-#define XOR_TRY_TEMPLATES			\
-	do {					\
-		xor_speed(&xor_block_arm4regs);	\
-		xor_speed(&xor_block_8regs);	\
-		xor_speed(&xor_block_32regs);	\
-		NEON_TEMPLATES;			\
-	} while (0)
-
 #ifdef CONFIG_KERNEL_MODE_NEON
 
 extern struct xor_block_template const xor_block_neon_inner;
@@ -201,8 +192,16 @@ static struct xor_block_template xor_block_neon = {
 	.do_5	= xor_neon_5
 };
 
-#define NEON_TEMPLATES	\
-	do { if (cpu_has_neon()) xor_speed(&xor_block_neon); } while (0)
-#else
-#define NEON_TEMPLATES
+#endif /* CONFIG_KERNEL_MODE_NEON */
+
+#define arch_xor_init arch_xor_init
+static __always_inline void __init arch_xor_init(void)
+{
+	xor_register(&xor_block_arm4regs);
+	xor_register(&xor_block_8regs);
+	xor_register(&xor_block_32regs);
+#ifdef CONFIG_KERNEL_MODE_NEON
+	if (cpu_has_neon())
+		xor_register(&xor_block_neon);
 #endif
+}
diff --git a/arch/arm64/include/asm/xor.h b/arch/arm64/include/asm/xor.h
index c38e3d017a79ec..bfa6122f55cecd 100644
--- a/arch/arm64/include/asm/xor.h
+++ b/arch/arm64/include/asm/xor.h
@@ -60,14 +60,14 @@ static struct xor_block_template xor_block_arm64 = {
 	.do_4   = xor_neon_4,
 	.do_5	= xor_neon_5
 };
-#undef XOR_TRY_TEMPLATES
-#define XOR_TRY_TEMPLATES           \
-	do {        \
-		xor_speed(&xor_block_8regs);    \
-		xor_speed(&xor_block_32regs);    \
-		if (cpu_has_neon()) { \
-			xor_speed(&xor_block_arm64);\
-		} \
-	} while (0)
+
+#define arch_xor_init arch_xor_init
+static __always_inline void __init arch_xor_init(void)
+{
+	xor_register(&xor_block_8regs);
+	xor_register(&xor_block_32regs);
+	if (cpu_has_neon())
+		xor_register(&xor_block_arm64);
+}
 
 #endif /* ! CONFIG_KERNEL_MODE_NEON */
diff --git a/arch/loongarch/include/asm/xor.h b/arch/loongarch/include/asm/xor.h
index 12467fffee4687..d17c0e3b047f13 100644
--- a/arch/loongarch/include/asm/xor.h
+++ b/arch/loongarch/include/asm/xor.h
@@ -16,14 +16,6 @@ static struct xor_block_template xor_block_lsx = {
 	.do_4 = xor_lsx_4,
 	.do_5 = xor_lsx_5,
 };
-
-#define XOR_SPEED_LSX()					\
-	do {						\
-		if (cpu_has_lsx)			\
-			xor_speed(&xor_block_lsx);	\
-	} while (0)
-#else /* CONFIG_CPU_HAS_LSX */
-#define XOR_SPEED_LSX()
 #endif /* CONFIG_CPU_HAS_LSX */
 
 #ifdef CONFIG_CPU_HAS_LASX
@@ -34,14 +26,6 @@ static struct xor_block_template xor_block_lasx = {
 	.do_4 = xor_lasx_4,
 	.do_5 = xor_lasx_5,
 };
-
-#define XOR_SPEED_LASX()					\
-	do {							\
-		if (cpu_has_lasx)				\
-			xor_speed(&xor_block_lasx);		\
-	} while (0)
-#else /* CONFIG_CPU_HAS_LASX */
-#define XOR_SPEED_LASX()
 #endif /* CONFIG_CPU_HAS_LASX */
 
 /*
@@ -54,15 +38,21 @@ static struct xor_block_template xor_block_lasx = {
  */
 #include <asm-generic/xor.h>
 
-#undef XOR_TRY_TEMPLATES
-#define XOR_TRY_TEMPLATES				\
-do {							\
-	xor_speed(&xor_block_8regs);			\
-	xor_speed(&xor_block_8regs_p);			\
-	xor_speed(&xor_block_32regs);			\
-	xor_speed(&xor_block_32regs_p);			\
-	XOR_SPEED_LSX();				\
-	XOR_SPEED_LASX();				\
-} while (0)
+#define arch_xor_init arch_xor_init
+static __always_inline void __init arch_xor_init(void)
+{
+	xor_register(&xor_block_8regs);
+	xor_register(&xor_block_8regs_p);
+	xor_register(&xor_block_32regs);
+	xor_register(&xor_block_32regs_p);
+#ifdef CONFIG_CPU_HAS_LSX
+	if (cpu_has_lsx)
+		xor_register(&xor_block_lsx);
+#endif
+#ifdef CONFIG_CPU_HAS_LASX
+	if (cpu_has_lasx)
+		xor_register(&xor_block_lasx);
+#endif
+}
 
 #endif /* _ASM_LOONGARCH_XOR_H */
diff --git a/arch/powerpc/include/asm/xor.h b/arch/powerpc/include/asm/xor.h
index 37d05c11d09cda..30224c5279c4bf 100644
--- a/arch/powerpc/include/asm/xor.h
+++ b/arch/powerpc/include/asm/xor.h
@@ -21,27 +21,22 @@ static struct xor_block_template xor_block_altivec = {
 	.do_4 = xor_altivec_4,
 	.do_5 = xor_altivec_5,
 };
-
-#define XOR_SPEED_ALTIVEC()				\
-	do {						\
-		if (cpu_has_feature(CPU_FTR_ALTIVEC))	\
-			xor_speed(&xor_block_altivec);	\
-	} while (0)
-#else
-#define XOR_SPEED_ALTIVEC()
-#endif
+#endif /* CONFIG_ALTIVEC */
 
 /* Also try the generic routines. */
 #include <asm-generic/xor.h>
 
-#undef XOR_TRY_TEMPLATES
-#define XOR_TRY_TEMPLATES				\
-do {							\
-	xor_speed(&xor_block_8regs);			\
-	xor_speed(&xor_block_8regs_p);			\
-	xor_speed(&xor_block_32regs);			\
-	xor_speed(&xor_block_32regs_p);			\
-	XOR_SPEED_ALTIVEC();				\
-} while (0)
+#define arch_xor_init arch_xor_init
+static __always_inline void __init arch_xor_init(void)
+{
+	xor_register(&xor_block_8regs);
+	xor_register(&xor_block_8regs_p);
+	xor_register(&xor_block_32regs);
+	xor_register(&xor_block_32regs_p);
+#ifdef CONFIG_ALTIVEC
+	if (cpu_has_feature(CPU_FTR_ALTIVEC))
+		xor_register(&xor_block_altivec);
+#endif
+}
 
 #endif /* _ASM_POWERPC_XOR_H */
diff --git a/arch/riscv/include/asm/xor.h b/arch/riscv/include/asm/xor.h
index 96011861e46b4d..ed5f27903efc41 100644
--- a/arch/riscv/include/asm/xor.h
+++ b/arch/riscv/include/asm/xor.h
@@ -55,14 +55,15 @@ static struct xor_block_template xor_block_rvv = {
 	.do_4 = xor_vector_4,
 	.do_5 = xor_vector_5
 };
+#endif /* CONFIG_RISCV_ISA_V */
 
-#undef XOR_TRY_TEMPLATES
-#define XOR_TRY_TEMPLATES           \
-	do {        \
-		xor_speed(&xor_block_8regs);    \
-		xor_speed(&xor_block_32regs);    \
-		if (has_vector()) { \
-			xor_speed(&xor_block_rvv);\
-		} \
-	} while (0)
+#define arch_xor_init arch_xor_init
+static __always_inline void __init arch_xor_init(void)
+{
+	xor_register(&xor_block_8regs);
+	xor_register(&xor_block_32regs);
+#ifdef CONFIG_RISCV_ISA_V
+	if (has_vector())
+		xor_register(&xor_block_rvv);
 #endif
+}
diff --git a/arch/s390/include/asm/xor.h b/arch/s390/include/asm/xor.h
index 857d6759b67f0d..4e2233f64da98d 100644
--- a/arch/s390/include/asm/xor.h
+++ b/arch/s390/include/asm/xor.h
@@ -10,12 +10,10 @@
 
 extern struct xor_block_template xor_block_xc;
 
-#undef XOR_TRY_TEMPLATES
-#define XOR_TRY_TEMPLATES				\
-do {							\
-	xor_speed(&xor_block_xc);			\
-} while (0)
-
-#define XOR_SELECT_TEMPLATE(FASTEST)	(&xor_block_xc)
+#define arch_xor_init arch_xor_init
+static __always_inline void __init arch_xor_init(void)
+{
+	xor_force(&xor_block_xc);
+}
 
 #endif /* _ASM_S390_XOR_H */
diff --git a/arch/sparc/include/asm/xor_32.h b/arch/sparc/include/asm/xor_32.h
index 0351813cf3af5a..8fbf0c07ec2897 100644
--- a/arch/sparc/include/asm/xor_32.h
+++ b/arch/sparc/include/asm/xor_32.h
@@ -259,10 +259,10 @@ static struct xor_block_template xor_block_SPARC = {
 /* For grins, also test the generic routines.  */
 #include <asm-generic/xor.h>
 
-#undef XOR_TRY_TEMPLATES
-#define XOR_TRY_TEMPLATES				\
-	do {						\
-		xor_speed(&xor_block_8regs);		\
-		xor_speed(&xor_block_32regs);		\
-		xor_speed(&xor_block_SPARC);		\
-	} while (0)
+#define arch_xor_init arch_xor_init
+static __always_inline void __init arch_xor_init(void)
+{
+	xor_register(&xor_block_8regs);
+	xor_register(&xor_block_32regs);
+	xor_register(&xor_block_SPARC);
+}
diff --git a/arch/sparc/include/asm/xor_64.h b/arch/sparc/include/asm/xor_64.h
index caaddea8ad79dd..e0482ecc0a68bd 100644
--- a/arch/sparc/include/asm/xor_64.h
+++ b/arch/sparc/include/asm/xor_64.h
@@ -60,20 +60,17 @@ static struct xor_block_template xor_block_niagara = {
         .do_5	= xor_niagara_5,
 };
 
-#undef XOR_TRY_TEMPLATES
-#define XOR_TRY_TEMPLATES				\
-	do {						\
-		xor_speed(&xor_block_VIS);		\
-		xor_speed(&xor_block_niagara);		\
-	} while (0)
-
-/* For VIS for everything except Niagara.  */
-#define XOR_SELECT_TEMPLATE(FASTEST) \
-	((tlb_type == hypervisor && \
-	  (sun4v_chip_type == SUN4V_CHIP_NIAGARA1 || \
-	   sun4v_chip_type == SUN4V_CHIP_NIAGARA2 || \
-	   sun4v_chip_type == SUN4V_CHIP_NIAGARA3 || \
-	   sun4v_chip_type == SUN4V_CHIP_NIAGARA4 || \
-	   sun4v_chip_type == SUN4V_CHIP_NIAGARA5)) ? \
-	 &xor_block_niagara : \
-	 &xor_block_VIS)
+#define arch_xor_init arch_xor_init
+static __always_inline void __init arch_xor_init(void)
+{
+	/* Force VIS for everything except Niagara.  */
+	if (tlb_type == hypervisor &&
+	    (sun4v_chip_type == SUN4V_CHIP_NIAGARA1 ||
+	     sun4v_chip_type == SUN4V_CHIP_NIAGARA2 ||
+	     sun4v_chip_type == SUN4V_CHIP_NIAGARA3 ||
+	     sun4v_chip_type == SUN4V_CHIP_NIAGARA4 ||
+	     sun4v_chip_type == SUN4V_CHIP_NIAGARA5))
+		xor_force(&xor_block_niagara);
+	else
+		xor_force(&xor_block_VIS);
+}
diff --git a/arch/x86/include/asm/xor.h b/arch/x86/include/asm/xor.h
index 7b0307acc4103c..33f5620d8d691e 100644
--- a/arch/x86/include/asm/xor.h
+++ b/arch/x86/include/asm/xor.h
@@ -496,7 +496,4 @@ static struct xor_block_template xor_block_sse_pf64 = {
 # include <asm/xor_64.h>
 #endif
 
-#define XOR_SELECT_TEMPLATE(FASTEST) \
-	AVX_SELECT(FASTEST)
-
 #endif /* _ASM_X86_XOR_H */
diff --git a/arch/x86/include/asm/xor_32.h b/arch/x86/include/asm/xor_32.h
index 7a6b9474591e75..ee32d08c27bc59 100644
--- a/arch/x86/include/asm/xor_32.h
+++ b/arch/x86/include/asm/xor_32.h
@@ -552,22 +552,24 @@ static struct xor_block_template xor_block_pIII_sse = {
 /* We force the use of the SSE xor block because it can write around L2.
    We may also be able to load into the L1 only depending on how the cpu
    deals with a load to a line that is being prefetched.  */
-#undef XOR_TRY_TEMPLATES
-#define XOR_TRY_TEMPLATES				\
-do {							\
-	AVX_XOR_SPEED;					\
-	if (boot_cpu_has(X86_FEATURE_XMM)) {				\
-		xor_speed(&xor_block_pIII_sse);		\
-		xor_speed(&xor_block_sse_pf64);		\
-	} else if (boot_cpu_has(X86_FEATURE_MMX)) {	\
-		xor_speed(&xor_block_pII_mmx);		\
-		xor_speed(&xor_block_p5_mmx);		\
-	} else {					\
-		xor_speed(&xor_block_8regs);		\
-		xor_speed(&xor_block_8regs_p);		\
-		xor_speed(&xor_block_32regs);		\
-		xor_speed(&xor_block_32regs_p);		\
-	}						\
-} while (0)
+#define arch_xor_init arch_xor_init
+static __always_inline void __init arch_xor_init(void)
+{
+	if (boot_cpu_has(X86_FEATURE_AVX) &&
+	    boot_cpu_has(X86_FEATURE_OSXSAVE)) {
+		xor_force(&xor_block_avx);
+	} else if (boot_cpu_has(X86_FEATURE_XMM)) {
+		xor_register(&xor_block_pIII_sse);
+		xor_register(&xor_block_sse_pf64);
+	} else if (boot_cpu_has(X86_FEATURE_MMX)) {
+		xor_register(&xor_block_pII_mmx);
+		xor_register(&xor_block_p5_mmx);
+	} else {
+		xor_register(&xor_block_8regs);
+		xor_register(&xor_block_8regs_p);
+		xor_register(&xor_block_32regs);
+		xor_register(&xor_block_32regs_p);
+	}
+}
 
 #endif /* _ASM_X86_XOR_32_H */
diff --git a/arch/x86/include/asm/xor_64.h b/arch/x86/include/asm/xor_64.h
index 0307e4ec504405..2d2ceb2418665e 100644
--- a/arch/x86/include/asm/xor_64.h
+++ b/arch/x86/include/asm/xor_64.h
@@ -17,12 +17,16 @@ static struct xor_block_template xor_block_sse = {
 /* We force the use of the SSE xor block because it can write around L2.
    We may also be able to load into the L1 only depending on how the cpu
    deals with a load to a line that is being prefetched.  */
-#undef XOR_TRY_TEMPLATES
-#define XOR_TRY_TEMPLATES			\
-do {						\
-	AVX_XOR_SPEED;				\
-	xor_speed(&xor_block_sse_pf64);		\
-	xor_speed(&xor_block_sse);		\
-} while (0)
+#define arch_xor_init arch_xor_init
+static __always_inline void __init arch_xor_init(void)
+{
+	if (boot_cpu_has(X86_FEATURE_AVX) &&
+	    boot_cpu_has(X86_FEATURE_OSXSAVE)) {
+		xor_force(&xor_block_avx);
+	} else {
+		xor_register(&xor_block_sse_pf64);
+		xor_register(&xor_block_sse);
+	}
+}
 
 #endif /* _ASM_X86_XOR_64_H */
diff --git a/arch/x86/include/asm/xor_avx.h b/arch/x86/include/asm/xor_avx.h
index 7f81dd5897f417..c600888436bb90 100644
--- a/arch/x86/include/asm/xor_avx.h
+++ b/arch/x86/include/asm/xor_avx.h
@@ -166,13 +166,4 @@ static struct xor_block_template xor_block_avx = {
 	.do_5 = xor_avx_5,
 };
 
-#define AVX_XOR_SPEED \
-do { \
-	if (boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_OSXSAVE)) \
-		xor_speed(&xor_block_avx); \
-} while (0)
-
-#define AVX_SELECT(FASTEST) \
-	(boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_OSXSAVE) ? &xor_block_avx : FASTEST)
-
 #endif
diff --git a/include/asm-generic/xor.h b/include/asm-generic/xor.h
index 44509d48fca21e..79c0096aa9d957 100644
--- a/include/asm-generic/xor.h
+++ b/include/asm-generic/xor.h
@@ -728,11 +728,3 @@ static struct xor_block_template xor_block_32regs_p __maybe_unused = {
 	.do_4 = xor_32regs_p_4,
 	.do_5 = xor_32regs_p_5,
 };
-
-#define XOR_TRY_TEMPLATES			\
-	do {					\
-		xor_speed(&xor_block_8regs);	\
-		xor_speed(&xor_block_8regs_p);	\
-		xor_speed(&xor_block_32regs);	\
-		xor_speed(&xor_block_32regs_p);	\
-	} while (0)
diff --git a/include/linux/raid/xor_impl.h b/include/linux/raid/xor_impl.h
index a1890cd6681243..6ed4c445ab24ce 100644
--- a/include/linux/raid/xor_impl.h
+++ b/include/linux/raid/xor_impl.h
@@ -2,6 +2,8 @@
 #ifndef _XOR_IMPL_H
 #define _XOR_IMPL_H
 
+#include <linux/init.h>
+
 struct xor_block_template {
 	struct xor_block_template *next;
 	const char *name;
@@ -22,4 +24,7 @@ struct xor_block_template {
 		     const unsigned long * __restrict);
 };
 
+void __init xor_register(struct xor_block_template *tmpl);
+void __init xor_force(struct xor_block_template *tmpl);
+
 #endif /* _XOR_IMPL_H */
diff --git a/lib/raid/xor/xor-core.c b/lib/raid/xor/xor-core.c
index 58f2d0c16420d4..01a42995b7a528 100644
--- a/lib/raid/xor/xor-core.c
+++ b/lib/raid/xor/xor-core.c
@@ -14,10 +14,6 @@
 #include <linux/preempt.h>
 #include <asm/xor.h>
 
-#ifndef XOR_SELECT_TEMPLATE
-#define XOR_SELECT_TEMPLATE(x) (x)
-#endif
-
 /* The xor routines to use.  */
 static struct xor_block_template *active_template;
 
@@ -55,12 +51,33 @@ EXPORT_SYMBOL(xor_blocks);
 static struct xor_block_template *__initdata template_list;
 static bool __initdata xor_forced = false;
 
-static void __init do_xor_register(struct xor_block_template *tmpl)
+/**
+ * xor_register - register a XOR template
+ * @tmpl:	template to register
+ *
+ * Register a XOR implementation with the core.  Registered implementations
+ * will be measured by a trivial benchmark, and the fastest one is chosen
+ * unless an implementation is forced using xor_force().
+ */
+void __init xor_register(struct xor_block_template *tmpl)
 {
 	tmpl->next = template_list;
 	template_list = tmpl;
 }
 
+/**
+ * xor_force - force use of a XOR template
+ * @tmpl:	template to register
+ *
+ * Register a XOR implementation with the core and force using it.  Forcing
+ * an implementation will make the core ignore any template registered using
+ * xor_register(), or any previous implementation forced using xor_force().
+ */
+void __init xor_force(struct xor_block_template *tmpl)
+{
+	active_template = tmpl;
+}
+
 #define BENCH_SIZE	4096
 #define REPS		800U
 
@@ -126,11 +143,19 @@ static int __init calibrate_xor_blocks(void)
 
 static int __init xor_init(void)
 {
+#ifdef arch_xor_init
+	arch_xor_init();
+#else
+	xor_register(&xor_block_8regs);
+	xor_register(&xor_block_8regs_p);
+	xor_register(&xor_block_32regs);
+	xor_register(&xor_block_32regs_p);
+#endif
+
 	/*
 	 * If this arch/cpu has a short-circuited selection, don't loop through
 	 * all the possible functions, just use the best one.
 	 */
-	active_template = XOR_SELECT_TEMPLATE(NULL);
 	if (active_template) {
 		pr_info("xor: automatically using best checksumming function   %-10s\n",
 			active_template->name);
@@ -138,10 +163,6 @@ static int __init xor_init(void)
 		return 0;
 	}
 
-#define xor_speed	do_xor_register
-	XOR_TRY_TEMPLATES;
-#undef xor_speed
-
 #ifdef MODULE
 	return calibrate_xor_blocks();
 #else

From b22551ff41186c40b4c62a074475ce4255a46c96 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 24 Mar 2026 07:21:45 +0100
Subject: [PATCH 09/26] xor: move generic implementations out of
 asm-generic/xor.h

Move the generic implementations from asm-generic/xor.h to
per-implementaion .c files in lib/raid.  This will build them
unconditionally even when an architecture forces a specific
implementation, but as we'll need at least one generic version
for the static_call optimization later on we'll pay that price.

Note that this would cause the second xor_block_8regs instance created by
arch/arm/lib/xor-neon.c to be generated instead of discarded as dead
code, so add a NO_TEMPLATE symbol to disable it for this case.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Linux RISC-V bot <linux.riscv.bot@gmail.com>
---
 arch/arm/lib/xor-neon.c            |   4 +-
 include/asm-generic/xor.h          | 727 +----------------------------
 lib/raid/xor/Makefile              |   4 +
 lib/raid/xor/xor-32regs-prefetch.c | 268 +++++++++++
 lib/raid/xor/xor-32regs.c          | 219 +++++++++
 lib/raid/xor/xor-8regs-prefetch.c  | 146 ++++++
 lib/raid/xor/xor-8regs.c           | 105 +++++
 7 files changed, 748 insertions(+), 725 deletions(-)
 create mode 100644 lib/raid/xor/xor-32regs-prefetch.c
 create mode 100644 lib/raid/xor/xor-32regs.c
 create mode 100644 lib/raid/xor/xor-8regs-prefetch.c
 create mode 100644 lib/raid/xor/xor-8regs.c

diff --git a/arch/arm/lib/xor-neon.c b/arch/arm/lib/xor-neon.c
index 282980b9bf2a22..b5be50567991bc 100644
--- a/arch/arm/lib/xor-neon.c
+++ b/arch/arm/lib/xor-neon.c
@@ -26,8 +26,8 @@ MODULE_LICENSE("GPL");
 #pragma GCC optimize "tree-vectorize"
 #endif
 
-#pragma GCC diagnostic ignored "-Wunused-variable"
-#include <asm-generic/xor.h>
+#define NO_TEMPLATE
+#include "../../../lib/raid/xor/xor-8regs.c"
 
 struct xor_block_template const xor_block_neon_inner = {
 	.name	= "__inner_neon__",
diff --git a/include/asm-generic/xor.h b/include/asm-generic/xor.h
index 79c0096aa9d957..fc151fdc45ab09 100644
--- a/include/asm-generic/xor.h
+++ b/include/asm-generic/xor.h
@@ -5,726 +5,7 @@
  * Generic optimized RAID-5 checksumming functions.
  */
 
-#include <linux/prefetch.h>
-
-static void
-xor_8regs_2(unsigned long bytes, unsigned long * __restrict p1,
-	    const unsigned long * __restrict p2)
-{
-	long lines = bytes / (sizeof (long)) / 8;
-
-	do {
-		p1[0] ^= p2[0];
-		p1[1] ^= p2[1];
-		p1[2] ^= p2[2];
-		p1[3] ^= p2[3];
-		p1[4] ^= p2[4];
-		p1[5] ^= p2[5];
-		p1[6] ^= p2[6];
-		p1[7] ^= p2[7];
-		p1 += 8;
-		p2 += 8;
-	} while (--lines > 0);
-}
-
-static void
-xor_8regs_3(unsigned long bytes, unsigned long * __restrict p1,
-	    const unsigned long * __restrict p2,
-	    const unsigned long * __restrict p3)
-{
-	long lines = bytes / (sizeof (long)) / 8;
-
-	do {
-		p1[0] ^= p2[0] ^ p3[0];
-		p1[1] ^= p2[1] ^ p3[1];
-		p1[2] ^= p2[2] ^ p3[2];
-		p1[3] ^= p2[3] ^ p3[3];
-		p1[4] ^= p2[4] ^ p3[4];
-		p1[5] ^= p2[5] ^ p3[5];
-		p1[6] ^= p2[6] ^ p3[6];
-		p1[7] ^= p2[7] ^ p3[7];
-		p1 += 8;
-		p2 += 8;
-		p3 += 8;
-	} while (--lines > 0);
-}
-
-static void
-xor_8regs_4(unsigned long bytes, unsigned long * __restrict p1,
-	    const unsigned long * __restrict p2,
-	    const unsigned long * __restrict p3,
-	    const unsigned long * __restrict p4)
-{
-	long lines = bytes / (sizeof (long)) / 8;
-
-	do {
-		p1[0] ^= p2[0] ^ p3[0] ^ p4[0];
-		p1[1] ^= p2[1] ^ p3[1] ^ p4[1];
-		p1[2] ^= p2[2] ^ p3[2] ^ p4[2];
-		p1[3] ^= p2[3] ^ p3[3] ^ p4[3];
-		p1[4] ^= p2[4] ^ p3[4] ^ p4[4];
-		p1[5] ^= p2[5] ^ p3[5] ^ p4[5];
-		p1[6] ^= p2[6] ^ p3[6] ^ p4[6];
-		p1[7] ^= p2[7] ^ p3[7] ^ p4[7];
-		p1 += 8;
-		p2 += 8;
-		p3 += 8;
-		p4 += 8;
-	} while (--lines > 0);
-}
-
-static void
-xor_8regs_5(unsigned long bytes, unsigned long * __restrict p1,
-	    const unsigned long * __restrict p2,
-	    const unsigned long * __restrict p3,
-	    const unsigned long * __restrict p4,
-	    const unsigned long * __restrict p5)
-{
-	long lines = bytes / (sizeof (long)) / 8;
-
-	do {
-		p1[0] ^= p2[0] ^ p3[0] ^ p4[0] ^ p5[0];
-		p1[1] ^= p2[1] ^ p3[1] ^ p4[1] ^ p5[1];
-		p1[2] ^= p2[2] ^ p3[2] ^ p4[2] ^ p5[2];
-		p1[3] ^= p2[3] ^ p3[3] ^ p4[3] ^ p5[3];
-		p1[4] ^= p2[4] ^ p3[4] ^ p4[4] ^ p5[4];
-		p1[5] ^= p2[5] ^ p3[5] ^ p4[5] ^ p5[5];
-		p1[6] ^= p2[6] ^ p3[6] ^ p4[6] ^ p5[6];
-		p1[7] ^= p2[7] ^ p3[7] ^ p4[7] ^ p5[7];
-		p1 += 8;
-		p2 += 8;
-		p3 += 8;
-		p4 += 8;
-		p5 += 8;
-	} while (--lines > 0);
-}
-
-static void
-xor_32regs_2(unsigned long bytes, unsigned long * __restrict p1,
-	     const unsigned long * __restrict p2)
-{
-	long lines = bytes / (sizeof (long)) / 8;
-
-	do {
-		register long d0, d1, d2, d3, d4, d5, d6, d7;
-		d0 = p1[0];	/* Pull the stuff into registers	*/
-		d1 = p1[1];	/*  ... in bursts, if possible.		*/
-		d2 = p1[2];
-		d3 = p1[3];
-		d4 = p1[4];
-		d5 = p1[5];
-		d6 = p1[6];
-		d7 = p1[7];
-		d0 ^= p2[0];
-		d1 ^= p2[1];
-		d2 ^= p2[2];
-		d3 ^= p2[3];
-		d4 ^= p2[4];
-		d5 ^= p2[5];
-		d6 ^= p2[6];
-		d7 ^= p2[7];
-		p1[0] = d0;	/* Store the result (in bursts)		*/
-		p1[1] = d1;
-		p1[2] = d2;
-		p1[3] = d3;
-		p1[4] = d4;
-		p1[5] = d5;
-		p1[6] = d6;
-		p1[7] = d7;
-		p1 += 8;
-		p2 += 8;
-	} while (--lines > 0);
-}
-
-static void
-xor_32regs_3(unsigned long bytes, unsigned long * __restrict p1,
-	     const unsigned long * __restrict p2,
-	     const unsigned long * __restrict p3)
-{
-	long lines = bytes / (sizeof (long)) / 8;
-
-	do {
-		register long d0, d1, d2, d3, d4, d5, d6, d7;
-		d0 = p1[0];	/* Pull the stuff into registers	*/
-		d1 = p1[1];	/*  ... in bursts, if possible.		*/
-		d2 = p1[2];
-		d3 = p1[3];
-		d4 = p1[4];
-		d5 = p1[5];
-		d6 = p1[6];
-		d7 = p1[7];
-		d0 ^= p2[0];
-		d1 ^= p2[1];
-		d2 ^= p2[2];
-		d3 ^= p2[3];
-		d4 ^= p2[4];
-		d5 ^= p2[5];
-		d6 ^= p2[6];
-		d7 ^= p2[7];
-		d0 ^= p3[0];
-		d1 ^= p3[1];
-		d2 ^= p3[2];
-		d3 ^= p3[3];
-		d4 ^= p3[4];
-		d5 ^= p3[5];
-		d6 ^= p3[6];
-		d7 ^= p3[7];
-		p1[0] = d0;	/* Store the result (in bursts)		*/
-		p1[1] = d1;
-		p1[2] = d2;
-		p1[3] = d3;
-		p1[4] = d4;
-		p1[5] = d5;
-		p1[6] = d6;
-		p1[7] = d7;
-		p1 += 8;
-		p2 += 8;
-		p3 += 8;
-	} while (--lines > 0);
-}
-
-static void
-xor_32regs_4(unsigned long bytes, unsigned long * __restrict p1,
-	     const unsigned long * __restrict p2,
-	     const unsigned long * __restrict p3,
-	     const unsigned long * __restrict p4)
-{
-	long lines = bytes / (sizeof (long)) / 8;
-
-	do {
-		register long d0, d1, d2, d3, d4, d5, d6, d7;
-		d0 = p1[0];	/* Pull the stuff into registers	*/
-		d1 = p1[1];	/*  ... in bursts, if possible.		*/
-		d2 = p1[2];
-		d3 = p1[3];
-		d4 = p1[4];
-		d5 = p1[5];
-		d6 = p1[6];
-		d7 = p1[7];
-		d0 ^= p2[0];
-		d1 ^= p2[1];
-		d2 ^= p2[2];
-		d3 ^= p2[3];
-		d4 ^= p2[4];
-		d5 ^= p2[5];
-		d6 ^= p2[6];
-		d7 ^= p2[7];
-		d0 ^= p3[0];
-		d1 ^= p3[1];
-		d2 ^= p3[2];
-		d3 ^= p3[3];
-		d4 ^= p3[4];
-		d5 ^= p3[5];
-		d6 ^= p3[6];
-		d7 ^= p3[7];
-		d0 ^= p4[0];
-		d1 ^= p4[1];
-		d2 ^= p4[2];
-		d3 ^= p4[3];
-		d4 ^= p4[4];
-		d5 ^= p4[5];
-		d6 ^= p4[6];
-		d7 ^= p4[7];
-		p1[0] = d0;	/* Store the result (in bursts)		*/
-		p1[1] = d1;
-		p1[2] = d2;
-		p1[3] = d3;
-		p1[4] = d4;
-		p1[5] = d5;
-		p1[6] = d6;
-		p1[7] = d7;
-		p1 += 8;
-		p2 += 8;
-		p3 += 8;
-		p4 += 8;
-	} while (--lines > 0);
-}
-
-static void
-xor_32regs_5(unsigned long bytes, unsigned long * __restrict p1,
-	     const unsigned long * __restrict p2,
-	     const unsigned long * __restrict p3,
-	     const unsigned long * __restrict p4,
-	     const unsigned long * __restrict p5)
-{
-	long lines = bytes / (sizeof (long)) / 8;
-
-	do {
-		register long d0, d1, d2, d3, d4, d5, d6, d7;
-		d0 = p1[0];	/* Pull the stuff into registers	*/
-		d1 = p1[1];	/*  ... in bursts, if possible.		*/
-		d2 = p1[2];
-		d3 = p1[3];
-		d4 = p1[4];
-		d5 = p1[5];
-		d6 = p1[6];
-		d7 = p1[7];
-		d0 ^= p2[0];
-		d1 ^= p2[1];
-		d2 ^= p2[2];
-		d3 ^= p2[3];
-		d4 ^= p2[4];
-		d5 ^= p2[5];
-		d6 ^= p2[6];
-		d7 ^= p2[7];
-		d0 ^= p3[0];
-		d1 ^= p3[1];
-		d2 ^= p3[2];
-		d3 ^= p3[3];
-		d4 ^= p3[4];
-		d5 ^= p3[5];
-		d6 ^= p3[6];
-		d7 ^= p3[7];
-		d0 ^= p4[0];
-		d1 ^= p4[1];
-		d2 ^= p4[2];
-		d3 ^= p4[3];
-		d4 ^= p4[4];
-		d5 ^= p4[5];
-		d6 ^= p4[6];
-		d7 ^= p4[7];
-		d0 ^= p5[0];
-		d1 ^= p5[1];
-		d2 ^= p5[2];
-		d3 ^= p5[3];
-		d4 ^= p5[4];
-		d5 ^= p5[5];
-		d6 ^= p5[6];
-		d7 ^= p5[7];
-		p1[0] = d0;	/* Store the result (in bursts)		*/
-		p1[1] = d1;
-		p1[2] = d2;
-		p1[3] = d3;
-		p1[4] = d4;
-		p1[5] = d5;
-		p1[6] = d6;
-		p1[7] = d7;
-		p1 += 8;
-		p2 += 8;
-		p3 += 8;
-		p4 += 8;
-		p5 += 8;
-	} while (--lines > 0);
-}
-
-static void
-xor_8regs_p_2(unsigned long bytes, unsigned long * __restrict p1,
-	      const unsigned long * __restrict p2)
-{
-	long lines = bytes / (sizeof (long)) / 8 - 1;
-	prefetchw(p1);
-	prefetch(p2);
-
-	do {
-		prefetchw(p1+8);
-		prefetch(p2+8);
- once_more:
-		p1[0] ^= p2[0];
-		p1[1] ^= p2[1];
-		p1[2] ^= p2[2];
-		p1[3] ^= p2[3];
-		p1[4] ^= p2[4];
-		p1[5] ^= p2[5];
-		p1[6] ^= p2[6];
-		p1[7] ^= p2[7];
-		p1 += 8;
-		p2 += 8;
-	} while (--lines > 0);
-	if (lines == 0)
-		goto once_more;
-}
-
-static void
-xor_8regs_p_3(unsigned long bytes, unsigned long * __restrict p1,
-	      const unsigned long * __restrict p2,
-	      const unsigned long * __restrict p3)
-{
-	long lines = bytes / (sizeof (long)) / 8 - 1;
-	prefetchw(p1);
-	prefetch(p2);
-	prefetch(p3);
-
-	do {
-		prefetchw(p1+8);
-		prefetch(p2+8);
-		prefetch(p3+8);
- once_more:
-		p1[0] ^= p2[0] ^ p3[0];
-		p1[1] ^= p2[1] ^ p3[1];
-		p1[2] ^= p2[2] ^ p3[2];
-		p1[3] ^= p2[3] ^ p3[3];
-		p1[4] ^= p2[4] ^ p3[4];
-		p1[5] ^= p2[5] ^ p3[5];
-		p1[6] ^= p2[6] ^ p3[6];
-		p1[7] ^= p2[7] ^ p3[7];
-		p1 += 8;
-		p2 += 8;
-		p3 += 8;
-	} while (--lines > 0);
-	if (lines == 0)
-		goto once_more;
-}
-
-static void
-xor_8regs_p_4(unsigned long bytes, unsigned long * __restrict p1,
-	      const unsigned long * __restrict p2,
-	      const unsigned long * __restrict p3,
-	      const unsigned long * __restrict p4)
-{
-	long lines = bytes / (sizeof (long)) / 8 - 1;
-
-	prefetchw(p1);
-	prefetch(p2);
-	prefetch(p3);
-	prefetch(p4);
-
-	do {
-		prefetchw(p1+8);
-		prefetch(p2+8);
-		prefetch(p3+8);
-		prefetch(p4+8);
- once_more:
-		p1[0] ^= p2[0] ^ p3[0] ^ p4[0];
-		p1[1] ^= p2[1] ^ p3[1] ^ p4[1];
-		p1[2] ^= p2[2] ^ p3[2] ^ p4[2];
-		p1[3] ^= p2[3] ^ p3[3] ^ p4[3];
-		p1[4] ^= p2[4] ^ p3[4] ^ p4[4];
-		p1[5] ^= p2[5] ^ p3[5] ^ p4[5];
-		p1[6] ^= p2[6] ^ p3[6] ^ p4[6];
-		p1[7] ^= p2[7] ^ p3[7] ^ p4[7];
-		p1 += 8;
-		p2 += 8;
-		p3 += 8;
-		p4 += 8;
-	} while (--lines > 0);
-	if (lines == 0)
-		goto once_more;
-}
-
-static void
-xor_8regs_p_5(unsigned long bytes, unsigned long * __restrict p1,
-	      const unsigned long * __restrict p2,
-	      const unsigned long * __restrict p3,
-	      const unsigned long * __restrict p4,
-	      const unsigned long * __restrict p5)
-{
-	long lines = bytes / (sizeof (long)) / 8 - 1;
-
-	prefetchw(p1);
-	prefetch(p2);
-	prefetch(p3);
-	prefetch(p4);
-	prefetch(p5);
-
-	do {
-		prefetchw(p1+8);
-		prefetch(p2+8);
-		prefetch(p3+8);
-		prefetch(p4+8);
-		prefetch(p5+8);
- once_more:
-		p1[0] ^= p2[0] ^ p3[0] ^ p4[0] ^ p5[0];
-		p1[1] ^= p2[1] ^ p3[1] ^ p4[1] ^ p5[1];
-		p1[2] ^= p2[2] ^ p3[2] ^ p4[2] ^ p5[2];
-		p1[3] ^= p2[3] ^ p3[3] ^ p4[3] ^ p5[3];
-		p1[4] ^= p2[4] ^ p3[4] ^ p4[4] ^ p5[4];
-		p1[5] ^= p2[5] ^ p3[5] ^ p4[5] ^ p5[5];
-		p1[6] ^= p2[6] ^ p3[6] ^ p4[6] ^ p5[6];
-		p1[7] ^= p2[7] ^ p3[7] ^ p4[7] ^ p5[7];
-		p1 += 8;
-		p2 += 8;
-		p3 += 8;
-		p4 += 8;
-		p5 += 8;
-	} while (--lines > 0);
-	if (lines == 0)
-		goto once_more;
-}
-
-static void
-xor_32regs_p_2(unsigned long bytes, unsigned long * __restrict p1,
-	       const unsigned long * __restrict p2)
-{
-	long lines = bytes / (sizeof (long)) / 8 - 1;
-
-	prefetchw(p1);
-	prefetch(p2);
-
-	do {
-		register long d0, d1, d2, d3, d4, d5, d6, d7;
-
-		prefetchw(p1+8);
-		prefetch(p2+8);
- once_more:
-		d0 = p1[0];	/* Pull the stuff into registers	*/
-		d1 = p1[1];	/*  ... in bursts, if possible.		*/
-		d2 = p1[2];
-		d3 = p1[3];
-		d4 = p1[4];
-		d5 = p1[5];
-		d6 = p1[6];
-		d7 = p1[7];
-		d0 ^= p2[0];
-		d1 ^= p2[1];
-		d2 ^= p2[2];
-		d3 ^= p2[3];
-		d4 ^= p2[4];
-		d5 ^= p2[5];
-		d6 ^= p2[6];
-		d7 ^= p2[7];
-		p1[0] = d0;	/* Store the result (in bursts)		*/
-		p1[1] = d1;
-		p1[2] = d2;
-		p1[3] = d3;
-		p1[4] = d4;
-		p1[5] = d5;
-		p1[6] = d6;
-		p1[7] = d7;
-		p1 += 8;
-		p2 += 8;
-	} while (--lines > 0);
-	if (lines == 0)
-		goto once_more;
-}
-
-static void
-xor_32regs_p_3(unsigned long bytes, unsigned long * __restrict p1,
-	       const unsigned long * __restrict p2,
-	       const unsigned long * __restrict p3)
-{
-	long lines = bytes / (sizeof (long)) / 8 - 1;
-
-	prefetchw(p1);
-	prefetch(p2);
-	prefetch(p3);
-
-	do {
-		register long d0, d1, d2, d3, d4, d5, d6, d7;
-
-		prefetchw(p1+8);
-		prefetch(p2+8);
-		prefetch(p3+8);
- once_more:
-		d0 = p1[0];	/* Pull the stuff into registers	*/
-		d1 = p1[1];	/*  ... in bursts, if possible.		*/
-		d2 = p1[2];
-		d3 = p1[3];
-		d4 = p1[4];
-		d5 = p1[5];
-		d6 = p1[6];
-		d7 = p1[7];
-		d0 ^= p2[0];
-		d1 ^= p2[1];
-		d2 ^= p2[2];
-		d3 ^= p2[3];
-		d4 ^= p2[4];
-		d5 ^= p2[5];
-		d6 ^= p2[6];
-		d7 ^= p2[7];
-		d0 ^= p3[0];
-		d1 ^= p3[1];
-		d2 ^= p3[2];
-		d3 ^= p3[3];
-		d4 ^= p3[4];
-		d5 ^= p3[5];
-		d6 ^= p3[6];
-		d7 ^= p3[7];
-		p1[0] = d0;	/* Store the result (in bursts)		*/
-		p1[1] = d1;
-		p1[2] = d2;
-		p1[3] = d3;
-		p1[4] = d4;
-		p1[5] = d5;
-		p1[6] = d6;
-		p1[7] = d7;
-		p1 += 8;
-		p2 += 8;
-		p3 += 8;
-	} while (--lines > 0);
-	if (lines == 0)
-		goto once_more;
-}
-
-static void
-xor_32regs_p_4(unsigned long bytes, unsigned long * __restrict p1,
-	       const unsigned long * __restrict p2,
-	       const unsigned long * __restrict p3,
-	       const unsigned long * __restrict p4)
-{
-	long lines = bytes / (sizeof (long)) / 8 - 1;
-
-	prefetchw(p1);
-	prefetch(p2);
-	prefetch(p3);
-	prefetch(p4);
-
-	do {
-		register long d0, d1, d2, d3, d4, d5, d6, d7;
-
-		prefetchw(p1+8);
-		prefetch(p2+8);
-		prefetch(p3+8);
-		prefetch(p4+8);
- once_more:
-		d0 = p1[0];	/* Pull the stuff into registers	*/
-		d1 = p1[1];	/*  ... in bursts, if possible.		*/
-		d2 = p1[2];
-		d3 = p1[3];
-		d4 = p1[4];
-		d5 = p1[5];
-		d6 = p1[6];
-		d7 = p1[7];
-		d0 ^= p2[0];
-		d1 ^= p2[1];
-		d2 ^= p2[2];
-		d3 ^= p2[3];
-		d4 ^= p2[4];
-		d5 ^= p2[5];
-		d6 ^= p2[6];
-		d7 ^= p2[7];
-		d0 ^= p3[0];
-		d1 ^= p3[1];
-		d2 ^= p3[2];
-		d3 ^= p3[3];
-		d4 ^= p3[4];
-		d5 ^= p3[5];
-		d6 ^= p3[6];
-		d7 ^= p3[7];
-		d0 ^= p4[0];
-		d1 ^= p4[1];
-		d2 ^= p4[2];
-		d3 ^= p4[3];
-		d4 ^= p4[4];
-		d5 ^= p4[5];
-		d6 ^= p4[6];
-		d7 ^= p4[7];
-		p1[0] = d0;	/* Store the result (in bursts)		*/
-		p1[1] = d1;
-		p1[2] = d2;
-		p1[3] = d3;
-		p1[4] = d4;
-		p1[5] = d5;
-		p1[6] = d6;
-		p1[7] = d7;
-		p1 += 8;
-		p2 += 8;
-		p3 += 8;
-		p4 += 8;
-	} while (--lines > 0);
-	if (lines == 0)
-		goto once_more;
-}
-
-static void
-xor_32regs_p_5(unsigned long bytes, unsigned long * __restrict p1,
-	       const unsigned long * __restrict p2,
-	       const unsigned long * __restrict p3,
-	       const unsigned long * __restrict p4,
-	       const unsigned long * __restrict p5)
-{
-	long lines = bytes / (sizeof (long)) / 8 - 1;
-
-	prefetchw(p1);
-	prefetch(p2);
-	prefetch(p3);
-	prefetch(p4);
-	prefetch(p5);
-
-	do {
-		register long d0, d1, d2, d3, d4, d5, d6, d7;
-
-		prefetchw(p1+8);
-		prefetch(p2+8);
-		prefetch(p3+8);
-		prefetch(p4+8);
-		prefetch(p5+8);
- once_more:
-		d0 = p1[0];	/* Pull the stuff into registers	*/
-		d1 = p1[1];	/*  ... in bursts, if possible.		*/
-		d2 = p1[2];
-		d3 = p1[3];
-		d4 = p1[4];
-		d5 = p1[5];
-		d6 = p1[6];
-		d7 = p1[7];
-		d0 ^= p2[0];
-		d1 ^= p2[1];
-		d2 ^= p2[2];
-		d3 ^= p2[3];
-		d4 ^= p2[4];
-		d5 ^= p2[5];
-		d6 ^= p2[6];
-		d7 ^= p2[7];
-		d0 ^= p3[0];
-		d1 ^= p3[1];
-		d2 ^= p3[2];
-		d3 ^= p3[3];
-		d4 ^= p3[4];
-		d5 ^= p3[5];
-		d6 ^= p3[6];
-		d7 ^= p3[7];
-		d0 ^= p4[0];
-		d1 ^= p4[1];
-		d2 ^= p4[2];
-		d3 ^= p4[3];
-		d4 ^= p4[4];
-		d5 ^= p4[5];
-		d6 ^= p4[6];
-		d7 ^= p4[7];
-		d0 ^= p5[0];
-		d1 ^= p5[1];
-		d2 ^= p5[2];
-		d3 ^= p5[3];
-		d4 ^= p5[4];
-		d5 ^= p5[5];
-		d6 ^= p5[6];
-		d7 ^= p5[7];
-		p1[0] = d0;	/* Store the result (in bursts)		*/
-		p1[1] = d1;
-		p1[2] = d2;
-		p1[3] = d3;
-		p1[4] = d4;
-		p1[5] = d5;
-		p1[6] = d6;
-		p1[7] = d7;
-		p1 += 8;
-		p2 += 8;
-		p3 += 8;
-		p4 += 8;
-		p5 += 8;
-	} while (--lines > 0);
-	if (lines == 0)
-		goto once_more;
-}
-
-static struct xor_block_template xor_block_8regs = {
-	.name = "8regs",
-	.do_2 = xor_8regs_2,
-	.do_3 = xor_8regs_3,
-	.do_4 = xor_8regs_4,
-	.do_5 = xor_8regs_5,
-};
-
-static struct xor_block_template xor_block_32regs = {
-	.name = "32regs",
-	.do_2 = xor_32regs_2,
-	.do_3 = xor_32regs_3,
-	.do_4 = xor_32regs_4,
-	.do_5 = xor_32regs_5,
-};
-
-static struct xor_block_template xor_block_8regs_p __maybe_unused = {
-	.name = "8regs_prefetch",
-	.do_2 = xor_8regs_p_2,
-	.do_3 = xor_8regs_p_3,
-	.do_4 = xor_8regs_p_4,
-	.do_5 = xor_8regs_p_5,
-};
-
-static struct xor_block_template xor_block_32regs_p __maybe_unused = {
-	.name = "32regs_prefetch",
-	.do_2 = xor_32regs_p_2,
-	.do_3 = xor_32regs_p_3,
-	.do_4 = xor_32regs_p_4,
-	.do_5 = xor_32regs_p_5,
-};
+extern struct xor_block_template xor_block_8regs;
+extern struct xor_block_template xor_block_32regs;
+extern struct xor_block_template xor_block_8regs_p;
+extern struct xor_block_template xor_block_32regs_p;
diff --git a/lib/raid/xor/Makefile b/lib/raid/xor/Makefile
index 7bca0ce8e90ac0..89a944c9f99094 100644
--- a/lib/raid/xor/Makefile
+++ b/lib/raid/xor/Makefile
@@ -3,3 +3,7 @@
 obj-$(CONFIG_XOR_BLOCKS)	+= xor.o
 
 xor-y				+= xor-core.o
+xor-y				+= xor-8regs.o
+xor-y				+= xor-32regs.o
+xor-y				+= xor-8regs-prefetch.o
+xor-y				+= xor-32regs-prefetch.o
diff --git a/lib/raid/xor/xor-32regs-prefetch.c b/lib/raid/xor/xor-32regs-prefetch.c
new file mode 100644
index 00000000000000..8666c287f7775b
--- /dev/null
+++ b/lib/raid/xor/xor-32regs-prefetch.c
@@ -0,0 +1,268 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+#include <linux/prefetch.h>
+#include <linux/raid/xor_impl.h>
+#include <asm-generic/xor.h>
+
+static void
+xor_32regs_p_2(unsigned long bytes, unsigned long * __restrict p1,
+	       const unsigned long * __restrict p2)
+{
+	long lines = bytes / (sizeof (long)) / 8 - 1;
+
+	prefetchw(p1);
+	prefetch(p2);
+
+	do {
+		register long d0, d1, d2, d3, d4, d5, d6, d7;
+
+		prefetchw(p1+8);
+		prefetch(p2+8);
+ once_more:
+		d0 = p1[0];	/* Pull the stuff into registers	*/
+		d1 = p1[1];	/*  ... in bursts, if possible.		*/
+		d2 = p1[2];
+		d3 = p1[3];
+		d4 = p1[4];
+		d5 = p1[5];
+		d6 = p1[6];
+		d7 = p1[7];
+		d0 ^= p2[0];
+		d1 ^= p2[1];
+		d2 ^= p2[2];
+		d3 ^= p2[3];
+		d4 ^= p2[4];
+		d5 ^= p2[5];
+		d6 ^= p2[6];
+		d7 ^= p2[7];
+		p1[0] = d0;	/* Store the result (in bursts)		*/
+		p1[1] = d1;
+		p1[2] = d2;
+		p1[3] = d3;
+		p1[4] = d4;
+		p1[5] = d5;
+		p1[6] = d6;
+		p1[7] = d7;
+		p1 += 8;
+		p2 += 8;
+	} while (--lines > 0);
+	if (lines == 0)
+		goto once_more;
+}
+
+static void
+xor_32regs_p_3(unsigned long bytes, unsigned long * __restrict p1,
+	       const unsigned long * __restrict p2,
+	       const unsigned long * __restrict p3)
+{
+	long lines = bytes / (sizeof (long)) / 8 - 1;
+
+	prefetchw(p1);
+	prefetch(p2);
+	prefetch(p3);
+
+	do {
+		register long d0, d1, d2, d3, d4, d5, d6, d7;
+
+		prefetchw(p1+8);
+		prefetch(p2+8);
+		prefetch(p3+8);
+ once_more:
+		d0 = p1[0];	/* Pull the stuff into registers	*/
+		d1 = p1[1];	/*  ... in bursts, if possible.		*/
+		d2 = p1[2];
+		d3 = p1[3];
+		d4 = p1[4];
+		d5 = p1[5];
+		d6 = p1[6];
+		d7 = p1[7];
+		d0 ^= p2[0];
+		d1 ^= p2[1];
+		d2 ^= p2[2];
+		d3 ^= p2[3];
+		d4 ^= p2[4];
+		d5 ^= p2[5];
+		d6 ^= p2[6];
+		d7 ^= p2[7];
+		d0 ^= p3[0];
+		d1 ^= p3[1];
+		d2 ^= p3[2];
+		d3 ^= p3[3];
+		d4 ^= p3[4];
+		d5 ^= p3[5];
+		d6 ^= p3[6];
+		d7 ^= p3[7];
+		p1[0] = d0;	/* Store the result (in bursts)		*/
+		p1[1] = d1;
+		p1[2] = d2;
+		p1[3] = d3;
+		p1[4] = d4;
+		p1[5] = d5;
+		p1[6] = d6;
+		p1[7] = d7;
+		p1 += 8;
+		p2 += 8;
+		p3 += 8;
+	} while (--lines > 0);
+	if (lines == 0)
+		goto once_more;
+}
+
+static void
+xor_32regs_p_4(unsigned long bytes, unsigned long * __restrict p1,
+	       const unsigned long * __restrict p2,
+	       const unsigned long * __restrict p3,
+	       const unsigned long * __restrict p4)
+{
+	long lines = bytes / (sizeof (long)) / 8 - 1;
+
+	prefetchw(p1);
+	prefetch(p2);
+	prefetch(p3);
+	prefetch(p4);
+
+	do {
+		register long d0, d1, d2, d3, d4, d5, d6, d7;
+
+		prefetchw(p1+8);
+		prefetch(p2+8);
+		prefetch(p3+8);
+		prefetch(p4+8);
+ once_more:
+		d0 = p1[0];	/* Pull the stuff into registers	*/
+		d1 = p1[1];	/*  ... in bursts, if possible.		*/
+		d2 = p1[2];
+		d3 = p1[3];
+		d4 = p1[4];
+		d5 = p1[5];
+		d6 = p1[6];
+		d7 = p1[7];
+		d0 ^= p2[0];
+		d1 ^= p2[1];
+		d2 ^= p2[2];
+		d3 ^= p2[3];
+		d4 ^= p2[4];
+		d5 ^= p2[5];
+		d6 ^= p2[6];
+		d7 ^= p2[7];
+		d0 ^= p3[0];
+		d1 ^= p3[1];
+		d2 ^= p3[2];
+		d3 ^= p3[3];
+		d4 ^= p3[4];
+		d5 ^= p3[5];
+		d6 ^= p3[6];
+		d7 ^= p3[7];
+		d0 ^= p4[0];
+		d1 ^= p4[1];
+		d2 ^= p4[2];
+		d3 ^= p4[3];
+		d4 ^= p4[4];
+		d5 ^= p4[5];
+		d6 ^= p4[6];
+		d7 ^= p4[7];
+		p1[0] = d0;	/* Store the result (in bursts)		*/
+		p1[1] = d1;
+		p1[2] = d2;
+		p1[3] = d3;
+		p1[4] = d4;
+		p1[5] = d5;
+		p1[6] = d6;
+		p1[7] = d7;
+		p1 += 8;
+		p2 += 8;
+		p3 += 8;
+		p4 += 8;
+	} while (--lines > 0);
+	if (lines == 0)
+		goto once_more;
+}
+
+static void
+xor_32regs_p_5(unsigned long bytes, unsigned long * __restrict p1,
+	       const unsigned long * __restrict p2,
+	       const unsigned long * __restrict p3,
+	       const unsigned long * __restrict p4,
+	       const unsigned long * __restrict p5)
+{
+	long lines = bytes / (sizeof (long)) / 8 - 1;
+
+	prefetchw(p1);
+	prefetch(p2);
+	prefetch(p3);
+	prefetch(p4);
+	prefetch(p5);
+
+	do {
+		register long d0, d1, d2, d3, d4, d5, d6, d7;
+
+		prefetchw(p1+8);
+		prefetch(p2+8);
+		prefetch(p3+8);
+		prefetch(p4+8);
+		prefetch(p5+8);
+ once_more:
+		d0 = p1[0];	/* Pull the stuff into registers	*/
+		d1 = p1[1];	/*  ... in bursts, if possible.		*/
+		d2 = p1[2];
+		d3 = p1[3];
+		d4 = p1[4];
+		d5 = p1[5];
+		d6 = p1[6];
+		d7 = p1[7];
+		d0 ^= p2[0];
+		d1 ^= p2[1];
+		d2 ^= p2[2];
+		d3 ^= p2[3];
+		d4 ^= p2[4];
+		d5 ^= p2[5];
+		d6 ^= p2[6];
+		d7 ^= p2[7];
+		d0 ^= p3[0];
+		d1 ^= p3[1];
+		d2 ^= p3[2];
+		d3 ^= p3[3];
+		d4 ^= p3[4];
+		d5 ^= p3[5];
+		d6 ^= p3[6];
+		d7 ^= p3[7];
+		d0 ^= p4[0];
+		d1 ^= p4[1];
+		d2 ^= p4[2];
+		d3 ^= p4[3];
+		d4 ^= p4[4];
+		d5 ^= p4[5];
+		d6 ^= p4[6];
+		d7 ^= p4[7];
+		d0 ^= p5[0];
+		d1 ^= p5[1];
+		d2 ^= p5[2];
+		d3 ^= p5[3];
+		d4 ^= p5[4];
+		d5 ^= p5[5];
+		d6 ^= p5[6];
+		d7 ^= p5[7];
+		p1[0] = d0;	/* Store the result (in bursts)		*/
+		p1[1] = d1;
+		p1[2] = d2;
+		p1[3] = d3;
+		p1[4] = d4;
+		p1[5] = d5;
+		p1[6] = d6;
+		p1[7] = d7;
+		p1 += 8;
+		p2 += 8;
+		p3 += 8;
+		p4 += 8;
+		p5 += 8;
+	} while (--lines > 0);
+	if (lines == 0)
+		goto once_more;
+}
+
+struct xor_block_template xor_block_32regs_p = {
+	.name = "32regs_prefetch",
+	.do_2 = xor_32regs_p_2,
+	.do_3 = xor_32regs_p_3,
+	.do_4 = xor_32regs_p_4,
+	.do_5 = xor_32regs_p_5,
+};
diff --git a/lib/raid/xor/xor-32regs.c b/lib/raid/xor/xor-32regs.c
new file mode 100644
index 00000000000000..58d4fac43eb44c
--- /dev/null
+++ b/lib/raid/xor/xor-32regs.c
@@ -0,0 +1,219 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+#include <linux/raid/xor_impl.h>
+#include <asm-generic/xor.h>
+
+static void
+xor_32regs_2(unsigned long bytes, unsigned long * __restrict p1,
+	     const unsigned long * __restrict p2)
+{
+	long lines = bytes / (sizeof (long)) / 8;
+
+	do {
+		register long d0, d1, d2, d3, d4, d5, d6, d7;
+		d0 = p1[0];	/* Pull the stuff into registers	*/
+		d1 = p1[1];	/*  ... in bursts, if possible.		*/
+		d2 = p1[2];
+		d3 = p1[3];
+		d4 = p1[4];
+		d5 = p1[5];
+		d6 = p1[6];
+		d7 = p1[7];
+		d0 ^= p2[0];
+		d1 ^= p2[1];
+		d2 ^= p2[2];
+		d3 ^= p2[3];
+		d4 ^= p2[4];
+		d5 ^= p2[5];
+		d6 ^= p2[6];
+		d7 ^= p2[7];
+		p1[0] = d0;	/* Store the result (in bursts)		*/
+		p1[1] = d1;
+		p1[2] = d2;
+		p1[3] = d3;
+		p1[4] = d4;
+		p1[5] = d5;
+		p1[6] = d6;
+		p1[7] = d7;
+		p1 += 8;
+		p2 += 8;
+	} while (--lines > 0);
+}
+
+static void
+xor_32regs_3(unsigned long bytes, unsigned long * __restrict p1,
+	     const unsigned long * __restrict p2,
+	     const unsigned long * __restrict p3)
+{
+	long lines = bytes / (sizeof (long)) / 8;
+
+	do {
+		register long d0, d1, d2, d3, d4, d5, d6, d7;
+		d0 = p1[0];	/* Pull the stuff into registers	*/
+		d1 = p1[1];	/*  ... in bursts, if possible.		*/
+		d2 = p1[2];
+		d3 = p1[3];
+		d4 = p1[4];
+		d5 = p1[5];
+		d6 = p1[6];
+		d7 = p1[7];
+		d0 ^= p2[0];
+		d1 ^= p2[1];
+		d2 ^= p2[2];
+		d3 ^= p2[3];
+		d4 ^= p2[4];
+		d5 ^= p2[5];
+		d6 ^= p2[6];
+		d7 ^= p2[7];
+		d0 ^= p3[0];
+		d1 ^= p3[1];
+		d2 ^= p3[2];
+		d3 ^= p3[3];
+		d4 ^= p3[4];
+		d5 ^= p3[5];
+		d6 ^= p3[6];
+		d7 ^= p3[7];
+		p1[0] = d0;	/* Store the result (in bursts)		*/
+		p1[1] = d1;
+		p1[2] = d2;
+		p1[3] = d3;
+		p1[4] = d4;
+		p1[5] = d5;
+		p1[6] = d6;
+		p1[7] = d7;
+		p1 += 8;
+		p2 += 8;
+		p3 += 8;
+	} while (--lines > 0);
+}
+
+static void
+xor_32regs_4(unsigned long bytes, unsigned long * __restrict p1,
+	     const unsigned long * __restrict p2,
+	     const unsigned long * __restrict p3,
+	     const unsigned long * __restrict p4)
+{
+	long lines = bytes / (sizeof (long)) / 8;
+
+	do {
+		register long d0, d1, d2, d3, d4, d5, d6, d7;
+		d0 = p1[0];	/* Pull the stuff into registers	*/
+		d1 = p1[1];	/*  ... in bursts, if possible.		*/
+		d2 = p1[2];
+		d3 = p1[3];
+		d4 = p1[4];
+		d5 = p1[5];
+		d6 = p1[6];
+		d7 = p1[7];
+		d0 ^= p2[0];
+		d1 ^= p2[1];
+		d2 ^= p2[2];
+		d3 ^= p2[3];
+		d4 ^= p2[4];
+		d5 ^= p2[5];
+		d6 ^= p2[6];
+		d7 ^= p2[7];
+		d0 ^= p3[0];
+		d1 ^= p3[1];
+		d2 ^= p3[2];
+		d3 ^= p3[3];
+		d4 ^= p3[4];
+		d5 ^= p3[5];
+		d6 ^= p3[6];
+		d7 ^= p3[7];
+		d0 ^= p4[0];
+		d1 ^= p4[1];
+		d2 ^= p4[2];
+		d3 ^= p4[3];
+		d4 ^= p4[4];
+		d5 ^= p4[5];
+		d6 ^= p4[6];
+		d7 ^= p4[7];
+		p1[0] = d0;	/* Store the result (in bursts)		*/
+		p1[1] = d1;
+		p1[2] = d2;
+		p1[3] = d3;
+		p1[4] = d4;
+		p1[5] = d5;
+		p1[6] = d6;
+		p1[7] = d7;
+		p1 += 8;
+		p2 += 8;
+		p3 += 8;
+		p4 += 8;
+	} while (--lines > 0);
+}
+
+static void
+xor_32regs_5(unsigned long bytes, unsigned long * __restrict p1,
+	     const unsigned long * __restrict p2,
+	     const unsigned long * __restrict p3,
+	     const unsigned long * __restrict p4,
+	     const unsigned long * __restrict p5)
+{
+	long lines = bytes / (sizeof (long)) / 8;
+
+	do {
+		register long d0, d1, d2, d3, d4, d5, d6, d7;
+		d0 = p1[0];	/* Pull the stuff into registers	*/
+		d1 = p1[1];	/*  ... in bursts, if possible.		*/
+		d2 = p1[2];
+		d3 = p1[3];
+		d4 = p1[4];
+		d5 = p1[5];
+		d6 = p1[6];
+		d7 = p1[7];
+		d0 ^= p2[0];
+		d1 ^= p2[1];
+		d2 ^= p2[2];
+		d3 ^= p2[3];
+		d4 ^= p2[4];
+		d5 ^= p2[5];
+		d6 ^= p2[6];
+		d7 ^= p2[7];
+		d0 ^= p3[0];
+		d1 ^= p3[1];
+		d2 ^= p3[2];
+		d3 ^= p3[3];
+		d4 ^= p3[4];
+		d5 ^= p3[5];
+		d6 ^= p3[6];
+		d7 ^= p3[7];
+		d0 ^= p4[0];
+		d1 ^= p4[1];
+		d2 ^= p4[2];
+		d3 ^= p4[3];
+		d4 ^= p4[4];
+		d5 ^= p4[5];
+		d6 ^= p4[6];
+		d7 ^= p4[7];
+		d0 ^= p5[0];
+		d1 ^= p5[1];
+		d2 ^= p5[2];
+		d3 ^= p5[3];
+		d4 ^= p5[4];
+		d5 ^= p5[5];
+		d6 ^= p5[6];
+		d7 ^= p5[7];
+		p1[0] = d0;	/* Store the result (in bursts)		*/
+		p1[1] = d1;
+		p1[2] = d2;
+		p1[3] = d3;
+		p1[4] = d4;
+		p1[5] = d5;
+		p1[6] = d6;
+		p1[7] = d7;
+		p1 += 8;
+		p2 += 8;
+		p3 += 8;
+		p4 += 8;
+		p5 += 8;
+	} while (--lines > 0);
+}
+
+struct xor_block_template xor_block_32regs = {
+	.name = "32regs",
+	.do_2 = xor_32regs_2,
+	.do_3 = xor_32regs_3,
+	.do_4 = xor_32regs_4,
+	.do_5 = xor_32regs_5,
+};
diff --git a/lib/raid/xor/xor-8regs-prefetch.c b/lib/raid/xor/xor-8regs-prefetch.c
new file mode 100644
index 00000000000000..67061e35a0a67f
--- /dev/null
+++ b/lib/raid/xor/xor-8regs-prefetch.c
@@ -0,0 +1,146 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+#include <linux/prefetch.h>
+#include <linux/raid/xor_impl.h>
+#include <asm-generic/xor.h>
+
+static void
+xor_8regs_p_2(unsigned long bytes, unsigned long * __restrict p1,
+	      const unsigned long * __restrict p2)
+{
+	long lines = bytes / (sizeof (long)) / 8 - 1;
+	prefetchw(p1);
+	prefetch(p2);
+
+	do {
+		prefetchw(p1+8);
+		prefetch(p2+8);
+ once_more:
+		p1[0] ^= p2[0];
+		p1[1] ^= p2[1];
+		p1[2] ^= p2[2];
+		p1[3] ^= p2[3];
+		p1[4] ^= p2[4];
+		p1[5] ^= p2[5];
+		p1[6] ^= p2[6];
+		p1[7] ^= p2[7];
+		p1 += 8;
+		p2 += 8;
+	} while (--lines > 0);
+	if (lines == 0)
+		goto once_more;
+}
+
+static void
+xor_8regs_p_3(unsigned long bytes, unsigned long * __restrict p1,
+	      const unsigned long * __restrict p2,
+	      const unsigned long * __restrict p3)
+{
+	long lines = bytes / (sizeof (long)) / 8 - 1;
+	prefetchw(p1);
+	prefetch(p2);
+	prefetch(p3);
+
+	do {
+		prefetchw(p1+8);
+		prefetch(p2+8);
+		prefetch(p3+8);
+ once_more:
+		p1[0] ^= p2[0] ^ p3[0];
+		p1[1] ^= p2[1] ^ p3[1];
+		p1[2] ^= p2[2] ^ p3[2];
+		p1[3] ^= p2[3] ^ p3[3];
+		p1[4] ^= p2[4] ^ p3[4];
+		p1[5] ^= p2[5] ^ p3[5];
+		p1[6] ^= p2[6] ^ p3[6];
+		p1[7] ^= p2[7] ^ p3[7];
+		p1 += 8;
+		p2 += 8;
+		p3 += 8;
+	} while (--lines > 0);
+	if (lines == 0)
+		goto once_more;
+}
+
+static void
+xor_8regs_p_4(unsigned long bytes, unsigned long * __restrict p1,
+	      const unsigned long * __restrict p2,
+	      const unsigned long * __restrict p3,
+	      const unsigned long * __restrict p4)
+{
+	long lines = bytes / (sizeof (long)) / 8 - 1;
+
+	prefetchw(p1);
+	prefetch(p2);
+	prefetch(p3);
+	prefetch(p4);
+
+	do {
+		prefetchw(p1+8);
+		prefetch(p2+8);
+		prefetch(p3+8);
+		prefetch(p4+8);
+ once_more:
+		p1[0] ^= p2[0] ^ p3[0] ^ p4[0];
+		p1[1] ^= p2[1] ^ p3[1] ^ p4[1];
+		p1[2] ^= p2[2] ^ p3[2] ^ p4[2];
+		p1[3] ^= p2[3] ^ p3[3] ^ p4[3];
+		p1[4] ^= p2[4] ^ p3[4] ^ p4[4];
+		p1[5] ^= p2[5] ^ p3[5] ^ p4[5];
+		p1[6] ^= p2[6] ^ p3[6] ^ p4[6];
+		p1[7] ^= p2[7] ^ p3[7] ^ p4[7];
+		p1 += 8;
+		p2 += 8;
+		p3 += 8;
+		p4 += 8;
+	} while (--lines > 0);
+	if (lines == 0)
+		goto once_more;
+}
+
+static void
+xor_8regs_p_5(unsigned long bytes, unsigned long * __restrict p1,
+	      const unsigned long * __restrict p2,
+	      const unsigned long * __restrict p3,
+	      const unsigned long * __restrict p4,
+	      const unsigned long * __restrict p5)
+{
+	long lines = bytes / (sizeof (long)) / 8 - 1;
+
+	prefetchw(p1);
+	prefetch(p2);
+	prefetch(p3);
+	prefetch(p4);
+	prefetch(p5);
+
+	do {
+		prefetchw(p1+8);
+		prefetch(p2+8);
+		prefetch(p3+8);
+		prefetch(p4+8);
+		prefetch(p5+8);
+ once_more:
+		p1[0] ^= p2[0] ^ p3[0] ^ p4[0] ^ p5[0];
+		p1[1] ^= p2[1] ^ p3[1] ^ p4[1] ^ p5[1];
+		p1[2] ^= p2[2] ^ p3[2] ^ p4[2] ^ p5[2];
+		p1[3] ^= p2[3] ^ p3[3] ^ p4[3] ^ p5[3];
+		p1[4] ^= p2[4] ^ p3[4] ^ p4[4] ^ p5[4];
+		p1[5] ^= p2[5] ^ p3[5] ^ p4[5] ^ p5[5];
+		p1[6] ^= p2[6] ^ p3[6] ^ p4[6] ^ p5[6];
+		p1[7] ^= p2[7] ^ p3[7] ^ p4[7] ^ p5[7];
+		p1 += 8;
+		p2 += 8;
+		p3 += 8;
+		p4 += 8;
+		p5 += 8;
+	} while (--lines > 0);
+	if (lines == 0)
+		goto once_more;
+}
+
+struct xor_block_template xor_block_8regs_p = {
+	.name = "8regs_prefetch",
+	.do_2 = xor_8regs_p_2,
+	.do_3 = xor_8regs_p_3,
+	.do_4 = xor_8regs_p_4,
+	.do_5 = xor_8regs_p_5,
+};
diff --git a/lib/raid/xor/xor-8regs.c b/lib/raid/xor/xor-8regs.c
new file mode 100644
index 00000000000000..769f796ab2cf4b
--- /dev/null
+++ b/lib/raid/xor/xor-8regs.c
@@ -0,0 +1,105 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+#include <linux/raid/xor_impl.h>
+#include <asm-generic/xor.h>
+
+static void
+xor_8regs_2(unsigned long bytes, unsigned long * __restrict p1,
+	    const unsigned long * __restrict p2)
+{
+	long lines = bytes / (sizeof (long)) / 8;
+
+	do {
+		p1[0] ^= p2[0];
+		p1[1] ^= p2[1];
+		p1[2] ^= p2[2];
+		p1[3] ^= p2[3];
+		p1[4] ^= p2[4];
+		p1[5] ^= p2[5];
+		p1[6] ^= p2[6];
+		p1[7] ^= p2[7];
+		p1 += 8;
+		p2 += 8;
+	} while (--lines > 0);
+}
+
+static void
+xor_8regs_3(unsigned long bytes, unsigned long * __restrict p1,
+	    const unsigned long * __restrict p2,
+	    const unsigned long * __restrict p3)
+{
+	long lines = bytes / (sizeof (long)) / 8;
+
+	do {
+		p1[0] ^= p2[0] ^ p3[0];
+		p1[1] ^= p2[1] ^ p3[1];
+		p1[2] ^= p2[2] ^ p3[2];
+		p1[3] ^= p2[3] ^ p3[3];
+		p1[4] ^= p2[4] ^ p3[4];
+		p1[5] ^= p2[5] ^ p3[5];
+		p1[6] ^= p2[6] ^ p3[6];
+		p1[7] ^= p2[7] ^ p3[7];
+		p1 += 8;
+		p2 += 8;
+		p3 += 8;
+	} while (--lines > 0);
+}
+
+static void
+xor_8regs_4(unsigned long bytes, unsigned long * __restrict p1,
+	    const unsigned long * __restrict p2,
+	    const unsigned long * __restrict p3,
+	    const unsigned long * __restrict p4)
+{
+	long lines = bytes / (sizeof (long)) / 8;
+
+	do {
+		p1[0] ^= p2[0] ^ p3[0] ^ p4[0];
+		p1[1] ^= p2[1] ^ p3[1] ^ p4[1];
+		p1[2] ^= p2[2] ^ p3[2] ^ p4[2];
+		p1[3] ^= p2[3] ^ p3[3] ^ p4[3];
+		p1[4] ^= p2[4] ^ p3[4] ^ p4[4];
+		p1[5] ^= p2[5] ^ p3[5] ^ p4[5];
+		p1[6] ^= p2[6] ^ p3[6] ^ p4[6];
+		p1[7] ^= p2[7] ^ p3[7] ^ p4[7];
+		p1 += 8;
+		p2 += 8;
+		p3 += 8;
+		p4 += 8;
+	} while (--lines > 0);
+}
+
+static void
+xor_8regs_5(unsigned long bytes, unsigned long * __restrict p1,
+	    const unsigned long * __restrict p2,
+	    const unsigned long * __restrict p3,
+	    const unsigned long * __restrict p4,
+	    const unsigned long * __restrict p5)
+{
+	long lines = bytes / (sizeof (long)) / 8;
+
+	do {
+		p1[0] ^= p2[0] ^ p3[0] ^ p4[0] ^ p5[0];
+		p1[1] ^= p2[1] ^ p3[1] ^ p4[1] ^ p5[1];
+		p1[2] ^= p2[2] ^ p3[2] ^ p4[2] ^ p5[2];
+		p1[3] ^= p2[3] ^ p3[3] ^ p4[3] ^ p5[3];
+		p1[4] ^= p2[4] ^ p3[4] ^ p4[4] ^ p5[4];
+		p1[5] ^= p2[5] ^ p3[5] ^ p4[5] ^ p5[5];
+		p1[6] ^= p2[6] ^ p3[6] ^ p4[6] ^ p5[6];
+		p1[7] ^= p2[7] ^ p3[7] ^ p4[7] ^ p5[7];
+		p1 += 8;
+		p2 += 8;
+		p3 += 8;
+		p4 += 8;
+		p5 += 8;
+	} while (--lines > 0);
+}
+
+#ifndef NO_TEMPLATE
+struct xor_block_template xor_block_8regs = {
+	.name = "8regs",
+	.do_2 = xor_8regs_2,
+	.do_3 = xor_8regs_3,
+	.do_4 = xor_8regs_4,
+	.do_5 = xor_8regs_5,
+};
+#endif /* NO_TEMPLATE */

From 8d02871a2357d66cec1f494e81374a415f7dd252 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 24 Mar 2026 07:21:46 +0100
Subject: [PATCH 10/26] alpha: move the XOR code to lib/raid/

Move the optimized XOR code out of line into lib/raid.

Note that the giant inline assembly block might be better off as a
separate assembly source file now, but I'll leave that to the alpha
maintainers.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Magnus Lindholm <linmag7@gmail.com>
Tested-by: Magnus Lindholm <linmag7@gmail.com>
Signed-off-by: Linux RISC-V bot <linux.riscv.bot@gmail.com>
---
 arch/alpha/include/asm/xor.h | 853 +----------------------------------
 lib/raid/xor/Makefile        |   2 +
 lib/raid/xor/alpha/xor.c     | 849 ++++++++++++++++++++++++++++++++++
 3 files changed, 855 insertions(+), 849 deletions(-)
 create mode 100644 lib/raid/xor/alpha/xor.c

diff --git a/arch/alpha/include/asm/xor.h b/arch/alpha/include/asm/xor.h
index 4c8085711df182..e517be577a09ff 100644
--- a/arch/alpha/include/asm/xor.h
+++ b/arch/alpha/include/asm/xor.h
@@ -1,856 +1,11 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * include/asm-alpha/xor.h
- *
- * Optimized RAID-5 checksumming functions for alpha EV5 and EV6
- */
-
-extern void
-xor_alpha_2(unsigned long bytes, unsigned long * __restrict p1,
-	    const unsigned long * __restrict p2);
-extern void
-xor_alpha_3(unsigned long bytes, unsigned long * __restrict p1,
-	    const unsigned long * __restrict p2,
-	    const unsigned long * __restrict p3);
-extern void
-xor_alpha_4(unsigned long bytes, unsigned long * __restrict p1,
-	    const unsigned long * __restrict p2,
-	    const unsigned long * __restrict p3,
-	    const unsigned long * __restrict p4);
-extern void
-xor_alpha_5(unsigned long bytes, unsigned long * __restrict p1,
-	    const unsigned long * __restrict p2,
-	    const unsigned long * __restrict p3,
-	    const unsigned long * __restrict p4,
-	    const unsigned long * __restrict p5);
-
-extern void
-xor_alpha_prefetch_2(unsigned long bytes, unsigned long * __restrict p1,
-		     const unsigned long * __restrict p2);
-extern void
-xor_alpha_prefetch_3(unsigned long bytes, unsigned long * __restrict p1,
-		     const unsigned long * __restrict p2,
-		     const unsigned long * __restrict p3);
-extern void
-xor_alpha_prefetch_4(unsigned long bytes, unsigned long * __restrict p1,
-		     const unsigned long * __restrict p2,
-		     const unsigned long * __restrict p3,
-		     const unsigned long * __restrict p4);
-extern void
-xor_alpha_prefetch_5(unsigned long bytes, unsigned long * __restrict p1,
-		     const unsigned long * __restrict p2,
-		     const unsigned long * __restrict p3,
-		     const unsigned long * __restrict p4,
-		     const unsigned long * __restrict p5);
 
-asm("								\n\
-	.text							\n\
-	.align 3						\n\
-	.ent xor_alpha_2					\n\
-xor_alpha_2:							\n\
-	.prologue 0						\n\
-	srl $16, 6, $16						\n\
-	.align 4						\n\
-2:								\n\
-	ldq $0,0($17)						\n\
-	ldq $1,0($18)						\n\
-	ldq $2,8($17)						\n\
-	ldq $3,8($18)						\n\
-								\n\
-	ldq $4,16($17)						\n\
-	ldq $5,16($18)						\n\
-	ldq $6,24($17)						\n\
-	ldq $7,24($18)						\n\
-								\n\
-	ldq $19,32($17)						\n\
-	ldq $20,32($18)						\n\
-	ldq $21,40($17)						\n\
-	ldq $22,40($18)						\n\
-								\n\
-	ldq $23,48($17)						\n\
-	ldq $24,48($18)						\n\
-	ldq $25,56($17)						\n\
-	xor $0,$1,$0		# 7 cycles from $1 load		\n\
-								\n\
-	ldq $27,56($18)						\n\
-	xor $2,$3,$2						\n\
-	stq $0,0($17)						\n\
-	xor $4,$5,$4						\n\
-								\n\
-	stq $2,8($17)						\n\
-	xor $6,$7,$6						\n\
-	stq $4,16($17)						\n\
-	xor $19,$20,$19						\n\
-								\n\
-	stq $6,24($17)						\n\
-	xor $21,$22,$21						\n\
-	stq $19,32($17)						\n\
-	xor $23,$24,$23						\n\
-								\n\
-	stq $21,40($17)						\n\
-	xor $25,$27,$25						\n\
-	stq $23,48($17)						\n\
-	subq $16,1,$16						\n\
-								\n\
-	stq $25,56($17)						\n\
-	addq $17,64,$17						\n\
-	addq $18,64,$18						\n\
-	bgt $16,2b						\n\
-								\n\
-	ret							\n\
-	.end xor_alpha_2					\n\
-								\n\
-	.align 3						\n\
-	.ent xor_alpha_3					\n\
-xor_alpha_3:							\n\
-	.prologue 0						\n\
-	srl $16, 6, $16						\n\
-	.align 4						\n\
-3:								\n\
-	ldq $0,0($17)						\n\
-	ldq $1,0($18)						\n\
-	ldq $2,0($19)						\n\
-	ldq $3,8($17)						\n\
-								\n\
-	ldq $4,8($18)						\n\
-	ldq $6,16($17)						\n\
-	ldq $7,16($18)						\n\
-	ldq $21,24($17)						\n\
-								\n\
-	ldq $22,24($18)						\n\
-	ldq $24,32($17)						\n\
-	ldq $25,32($18)						\n\
-	ldq $5,8($19)						\n\
-								\n\
-	ldq $20,16($19)						\n\
-	ldq $23,24($19)						\n\
-	ldq $27,32($19)						\n\
-	nop							\n\
-								\n\
-	xor $0,$1,$1		# 8 cycles from $0 load		\n\
-	xor $3,$4,$4		# 6 cycles from $4 load		\n\
-	xor $6,$7,$7		# 6 cycles from $7 load		\n\
-	xor $21,$22,$22		# 5 cycles from $22 load	\n\
-								\n\
-	xor $1,$2,$2		# 9 cycles from $2 load		\n\
-	xor $24,$25,$25		# 5 cycles from $25 load	\n\
-	stq $2,0($17)						\n\
-	xor $4,$5,$5		# 6 cycles from $5 load		\n\
-								\n\
-	stq $5,8($17)						\n\
-	xor $7,$20,$20		# 7 cycles from $20 load	\n\
-	stq $20,16($17)						\n\
-	xor $22,$23,$23		# 7 cycles from $23 load	\n\
-								\n\
-	stq $23,24($17)						\n\
-	xor $25,$27,$27		# 7 cycles from $27 load	\n\
-	stq $27,32($17)						\n\
-	nop							\n\
-								\n\
-	ldq $0,40($17)						\n\
-	ldq $1,40($18)						\n\
-	ldq $3,48($17)						\n\
-	ldq $4,48($18)						\n\
-								\n\
-	ldq $6,56($17)						\n\
-	ldq $7,56($18)						\n\
-	ldq $2,40($19)						\n\
-	ldq $5,48($19)						\n\
-								\n\
-	ldq $20,56($19)						\n\
-	xor $0,$1,$1		# 4 cycles from $1 load		\n\
-	xor $3,$4,$4		# 5 cycles from $4 load		\n\
-	xor $6,$7,$7		# 5 cycles from $7 load		\n\
-								\n\
-	xor $1,$2,$2		# 4 cycles from $2 load		\n\
-	xor $4,$5,$5		# 5 cycles from $5 load		\n\
-	stq $2,40($17)						\n\
-	xor $7,$20,$20		# 4 cycles from $20 load	\n\
-								\n\
-	stq $5,48($17)						\n\
-	subq $16,1,$16						\n\
-	stq $20,56($17)						\n\
-	addq $19,64,$19						\n\
-								\n\
-	addq $18,64,$18						\n\
-	addq $17,64,$17						\n\
-	bgt $16,3b						\n\
-	ret							\n\
-	.end xor_alpha_3					\n\
-								\n\
-	.align 3						\n\
-	.ent xor_alpha_4					\n\
-xor_alpha_4:							\n\
-	.prologue 0						\n\
-	srl $16, 6, $16						\n\
-	.align 4						\n\
-4:								\n\
-	ldq $0,0($17)						\n\
-	ldq $1,0($18)						\n\
-	ldq $2,0($19)						\n\
-	ldq $3,0($20)						\n\
-								\n\
-	ldq $4,8($17)						\n\
-	ldq $5,8($18)						\n\
-	ldq $6,8($19)						\n\
-	ldq $7,8($20)						\n\
-								\n\
-	ldq $21,16($17)						\n\
-	ldq $22,16($18)						\n\
-	ldq $23,16($19)						\n\
-	ldq $24,16($20)						\n\
-								\n\
-	ldq $25,24($17)						\n\
-	xor $0,$1,$1		# 6 cycles from $1 load		\n\
-	ldq $27,24($18)						\n\
-	xor $2,$3,$3		# 6 cycles from $3 load		\n\
-								\n\
-	ldq $0,24($19)						\n\
-	xor $1,$3,$3						\n\
-	ldq $1,24($20)						\n\
-	xor $4,$5,$5		# 7 cycles from $5 load		\n\
-								\n\
-	stq $3,0($17)						\n\
-	xor $6,$7,$7						\n\
-	xor $21,$22,$22		# 7 cycles from $22 load	\n\
-	xor $5,$7,$7						\n\
-								\n\
-	stq $7,8($17)						\n\
-	xor $23,$24,$24		# 7 cycles from $24 load	\n\
-	ldq $2,32($17)						\n\
-	xor $22,$24,$24						\n\
-								\n\
-	ldq $3,32($18)						\n\
-	ldq $4,32($19)						\n\
-	ldq $5,32($20)						\n\
-	xor $25,$27,$27		# 8 cycles from $27 load	\n\
-								\n\
-	ldq $6,40($17)						\n\
-	ldq $7,40($18)						\n\
-	ldq $21,40($19)						\n\
-	ldq $22,40($20)						\n\
-								\n\
-	stq $24,16($17)						\n\
-	xor $0,$1,$1		# 9 cycles from $1 load		\n\
-	xor $2,$3,$3		# 5 cycles from $3 load		\n\
-	xor $27,$1,$1						\n\
-								\n\
-	stq $1,24($17)						\n\
-	xor $4,$5,$5		# 5 cycles from $5 load		\n\
-	ldq $23,48($17)						\n\
-	ldq $24,48($18)						\n\
-								\n\
-	ldq $25,48($19)						\n\
-	xor $3,$5,$5						\n\
-	ldq $27,48($20)						\n\
-	ldq $0,56($17)						\n\
-								\n\
-	ldq $1,56($18)						\n\
-	ldq $2,56($19)						\n\
-	xor $6,$7,$7		# 8 cycles from $6 load		\n\
-	ldq $3,56($20)						\n\
-								\n\
-	stq $5,32($17)						\n\
-	xor $21,$22,$22		# 8 cycles from $22 load	\n\
-	xor $7,$22,$22						\n\
-	xor $23,$24,$24		# 5 cycles from $24 load	\n\
-								\n\
-	stq $22,40($17)						\n\
-	xor $25,$27,$27		# 5 cycles from $27 load	\n\
-	xor $24,$27,$27						\n\
-	xor $0,$1,$1		# 5 cycles from $1 load		\n\
-								\n\
-	stq $27,48($17)						\n\
-	xor $2,$3,$3		# 4 cycles from $3 load		\n\
-	xor $1,$3,$3						\n\
-	subq $16,1,$16						\n\
-								\n\
-	stq $3,56($17)						\n\
-	addq $20,64,$20						\n\
-	addq $19,64,$19						\n\
-	addq $18,64,$18						\n\
-								\n\
-	addq $17,64,$17						\n\
-	bgt $16,4b						\n\
-	ret							\n\
-	.end xor_alpha_4					\n\
-								\n\
-	.align 3						\n\
-	.ent xor_alpha_5					\n\
-xor_alpha_5:							\n\
-	.prologue 0						\n\
-	srl $16, 6, $16						\n\
-	.align 4						\n\
-5:								\n\
-	ldq $0,0($17)						\n\
-	ldq $1,0($18)						\n\
-	ldq $2,0($19)						\n\
-	ldq $3,0($20)						\n\
-								\n\
-	ldq $4,0($21)						\n\
-	ldq $5,8($17)						\n\
-	ldq $6,8($18)						\n\
-	ldq $7,8($19)						\n\
-								\n\
-	ldq $22,8($20)						\n\
-	ldq $23,8($21)						\n\
-	ldq $24,16($17)						\n\
-	ldq $25,16($18)						\n\
-								\n\
-	ldq $27,16($19)						\n\
-	xor $0,$1,$1		# 6 cycles from $1 load		\n\
-	ldq $28,16($20)						\n\
-	xor $2,$3,$3		# 6 cycles from $3 load		\n\
-								\n\
-	ldq $0,16($21)						\n\
-	xor $1,$3,$3						\n\
-	ldq $1,24($17)						\n\
-	xor $3,$4,$4		# 7 cycles from $4 load		\n\
-								\n\
-	stq $4,0($17)						\n\
-	xor $5,$6,$6		# 7 cycles from $6 load		\n\
-	xor $7,$22,$22		# 7 cycles from $22 load	\n\
-	xor $6,$23,$23		# 7 cycles from $23 load	\n\
-								\n\
-	ldq $2,24($18)						\n\
-	xor $22,$23,$23						\n\
-	ldq $3,24($19)						\n\
-	xor $24,$25,$25		# 8 cycles from $25 load	\n\
-								\n\
-	stq $23,8($17)						\n\
-	xor $25,$27,$27		# 8 cycles from $27 load	\n\
-	ldq $4,24($20)						\n\
-	xor $28,$0,$0		# 7 cycles from $0 load		\n\
-								\n\
-	ldq $5,24($21)						\n\
-	xor $27,$0,$0						\n\
-	ldq $6,32($17)						\n\
-	ldq $7,32($18)						\n\
-								\n\
-	stq $0,16($17)						\n\
-	xor $1,$2,$2		# 6 cycles from $2 load		\n\
-	ldq $22,32($19)						\n\
-	xor $3,$4,$4		# 4 cycles from $4 load		\n\
-								\n\
-	ldq $23,32($20)						\n\
-	xor $2,$4,$4						\n\
-	ldq $24,32($21)						\n\
-	ldq $25,40($17)						\n\
-								\n\
-	ldq $27,40($18)						\n\
-	ldq $28,40($19)						\n\
-	ldq $0,40($20)						\n\
-	xor $4,$5,$5		# 7 cycles from $5 load		\n\
-								\n\
-	stq $5,24($17)						\n\
-	xor $6,$7,$7		# 7 cycles from $7 load		\n\
-	ldq $1,40($21)						\n\
-	ldq $2,48($17)						\n\
-								\n\
-	ldq $3,48($18)						\n\
-	xor $7,$22,$22		# 7 cycles from $22 load	\n\
-	ldq $4,48($19)						\n\
-	xor $23,$24,$24		# 6 cycles from $24 load	\n\
-								\n\
-	ldq $5,48($20)						\n\
-	xor $22,$24,$24						\n\
-	ldq $6,48($21)						\n\
-	xor $25,$27,$27		# 7 cycles from $27 load	\n\
-								\n\
-	stq $24,32($17)						\n\
-	xor $27,$28,$28		# 8 cycles from $28 load	\n\
-	ldq $7,56($17)						\n\
-	xor $0,$1,$1		# 6 cycles from $1 load		\n\
-								\n\
-	ldq $22,56($18)						\n\
-	ldq $23,56($19)						\n\
-	ldq $24,56($20)						\n\
-	ldq $25,56($21)						\n\
-								\n\
-	xor $28,$1,$1						\n\
-	xor $2,$3,$3		# 9 cycles from $3 load		\n\
-	xor $3,$4,$4		# 9 cycles from $4 load		\n\
-	xor $5,$6,$6		# 8 cycles from $6 load		\n\
-								\n\
-	stq $1,40($17)						\n\
-	xor $4,$6,$6						\n\
-	xor $7,$22,$22		# 7 cycles from $22 load	\n\
-	xor $23,$24,$24		# 6 cycles from $24 load	\n\
-								\n\
-	stq $6,48($17)						\n\
-	xor $22,$24,$24						\n\
-	subq $16,1,$16						\n\
-	xor $24,$25,$25		# 8 cycles from $25 load	\n\
-								\n\
-	stq $25,56($17)						\n\
-	addq $21,64,$21						\n\
-	addq $20,64,$20						\n\
-	addq $19,64,$19						\n\
-								\n\
-	addq $18,64,$18						\n\
-	addq $17,64,$17						\n\
-	bgt $16,5b						\n\
-	ret							\n\
-	.end xor_alpha_5					\n\
-								\n\
-	.align 3						\n\
-	.ent xor_alpha_prefetch_2				\n\
-xor_alpha_prefetch_2:						\n\
-	.prologue 0						\n\
-	srl $16, 6, $16						\n\
-								\n\
-	ldq $31, 0($17)						\n\
-	ldq $31, 0($18)						\n\
-								\n\
-	ldq $31, 64($17)					\n\
-	ldq $31, 64($18)					\n\
-								\n\
-	ldq $31, 128($17)					\n\
-	ldq $31, 128($18)					\n\
-								\n\
-	ldq $31, 192($17)					\n\
-	ldq $31, 192($18)					\n\
-	.align 4						\n\
-2:								\n\
-	ldq $0,0($17)						\n\
-	ldq $1,0($18)						\n\
-	ldq $2,8($17)						\n\
-	ldq $3,8($18)						\n\
-								\n\
-	ldq $4,16($17)						\n\
-	ldq $5,16($18)						\n\
-	ldq $6,24($17)						\n\
-	ldq $7,24($18)						\n\
-								\n\
-	ldq $19,32($17)						\n\
-	ldq $20,32($18)						\n\
-	ldq $21,40($17)						\n\
-	ldq $22,40($18)						\n\
-								\n\
-	ldq $23,48($17)						\n\
-	ldq $24,48($18)						\n\
-	ldq $25,56($17)						\n\
-	ldq $27,56($18)						\n\
-								\n\
-	ldq $31,256($17)					\n\
-	xor $0,$1,$0		# 8 cycles from $1 load		\n\
-	ldq $31,256($18)					\n\
-	xor $2,$3,$2						\n\
-								\n\
-	stq $0,0($17)						\n\
-	xor $4,$5,$4						\n\
-	stq $2,8($17)						\n\
-	xor $6,$7,$6						\n\
-								\n\
-	stq $4,16($17)						\n\
-	xor $19,$20,$19						\n\
-	stq $6,24($17)						\n\
-	xor $21,$22,$21						\n\
-								\n\
-	stq $19,32($17)						\n\
-	xor $23,$24,$23						\n\
-	stq $21,40($17)						\n\
-	xor $25,$27,$25						\n\
-								\n\
-	stq $23,48($17)						\n\
-	subq $16,1,$16						\n\
-	stq $25,56($17)						\n\
-	addq $17,64,$17						\n\
-								\n\
-	addq $18,64,$18						\n\
-	bgt $16,2b						\n\
-	ret							\n\
-	.end xor_alpha_prefetch_2				\n\
-								\n\
-	.align 3						\n\
-	.ent xor_alpha_prefetch_3				\n\
-xor_alpha_prefetch_3:						\n\
-	.prologue 0						\n\
-	srl $16, 6, $16						\n\
-								\n\
-	ldq $31, 0($17)						\n\
-	ldq $31, 0($18)						\n\
-	ldq $31, 0($19)						\n\
-								\n\
-	ldq $31, 64($17)					\n\
-	ldq $31, 64($18)					\n\
-	ldq $31, 64($19)					\n\
-								\n\
-	ldq $31, 128($17)					\n\
-	ldq $31, 128($18)					\n\
-	ldq $31, 128($19)					\n\
-								\n\
-	ldq $31, 192($17)					\n\
-	ldq $31, 192($18)					\n\
-	ldq $31, 192($19)					\n\
-	.align 4						\n\
-3:								\n\
-	ldq $0,0($17)						\n\
-	ldq $1,0($18)						\n\
-	ldq $2,0($19)						\n\
-	ldq $3,8($17)						\n\
-								\n\
-	ldq $4,8($18)						\n\
-	ldq $6,16($17)						\n\
-	ldq $7,16($18)						\n\
-	ldq $21,24($17)						\n\
-								\n\
-	ldq $22,24($18)						\n\
-	ldq $24,32($17)						\n\
-	ldq $25,32($18)						\n\
-	ldq $5,8($19)						\n\
-								\n\
-	ldq $20,16($19)						\n\
-	ldq $23,24($19)						\n\
-	ldq $27,32($19)						\n\
-	nop							\n\
-								\n\
-	xor $0,$1,$1		# 8 cycles from $0 load		\n\
-	xor $3,$4,$4		# 7 cycles from $4 load		\n\
-	xor $6,$7,$7		# 6 cycles from $7 load		\n\
-	xor $21,$22,$22		# 5 cycles from $22 load	\n\
-								\n\
-	xor $1,$2,$2		# 9 cycles from $2 load		\n\
-	xor $24,$25,$25		# 5 cycles from $25 load	\n\
-	stq $2,0($17)						\n\
-	xor $4,$5,$5		# 6 cycles from $5 load		\n\
-								\n\
-	stq $5,8($17)						\n\
-	xor $7,$20,$20		# 7 cycles from $20 load	\n\
-	stq $20,16($17)						\n\
-	xor $22,$23,$23		# 7 cycles from $23 load	\n\
-								\n\
-	stq $23,24($17)						\n\
-	xor $25,$27,$27		# 7 cycles from $27 load	\n\
-	stq $27,32($17)						\n\
-	nop							\n\
-								\n\
-	ldq $0,40($17)						\n\
-	ldq $1,40($18)						\n\
-	ldq $3,48($17)						\n\
-	ldq $4,48($18)						\n\
-								\n\
-	ldq $6,56($17)						\n\
-	ldq $7,56($18)						\n\
-	ldq $2,40($19)						\n\
-	ldq $5,48($19)						\n\
-								\n\
-	ldq $20,56($19)						\n\
-	ldq $31,256($17)					\n\
-	ldq $31,256($18)					\n\
-	ldq $31,256($19)					\n\
-								\n\
-	xor $0,$1,$1		# 6 cycles from $1 load		\n\
-	xor $3,$4,$4		# 5 cycles from $4 load		\n\
-	xor $6,$7,$7		# 5 cycles from $7 load		\n\
-	xor $1,$2,$2		# 4 cycles from $2 load		\n\
-								\n\
-	xor $4,$5,$5		# 5 cycles from $5 load		\n\
-	xor $7,$20,$20		# 4 cycles from $20 load	\n\
-	stq $2,40($17)						\n\
-	subq $16,1,$16						\n\
-								\n\
-	stq $5,48($17)						\n\
-	addq $19,64,$19						\n\
-	stq $20,56($17)						\n\
-	addq $18,64,$18						\n\
-								\n\
-	addq $17,64,$17						\n\
-	bgt $16,3b						\n\
-	ret							\n\
-	.end xor_alpha_prefetch_3				\n\
-								\n\
-	.align 3						\n\
-	.ent xor_alpha_prefetch_4				\n\
-xor_alpha_prefetch_4:						\n\
-	.prologue 0						\n\
-	srl $16, 6, $16						\n\
-								\n\
-	ldq $31, 0($17)						\n\
-	ldq $31, 0($18)						\n\
-	ldq $31, 0($19)						\n\
-	ldq $31, 0($20)						\n\
-								\n\
-	ldq $31, 64($17)					\n\
-	ldq $31, 64($18)					\n\
-	ldq $31, 64($19)					\n\
-	ldq $31, 64($20)					\n\
-								\n\
-	ldq $31, 128($17)					\n\
-	ldq $31, 128($18)					\n\
-	ldq $31, 128($19)					\n\
-	ldq $31, 128($20)					\n\
-								\n\
-	ldq $31, 192($17)					\n\
-	ldq $31, 192($18)					\n\
-	ldq $31, 192($19)					\n\
-	ldq $31, 192($20)					\n\
-	.align 4						\n\
-4:								\n\
-	ldq $0,0($17)						\n\
-	ldq $1,0($18)						\n\
-	ldq $2,0($19)						\n\
-	ldq $3,0($20)						\n\
-								\n\
-	ldq $4,8($17)						\n\
-	ldq $5,8($18)						\n\
-	ldq $6,8($19)						\n\
-	ldq $7,8($20)						\n\
-								\n\
-	ldq $21,16($17)						\n\
-	ldq $22,16($18)						\n\
-	ldq $23,16($19)						\n\
-	ldq $24,16($20)						\n\
-								\n\
-	ldq $25,24($17)						\n\
-	xor $0,$1,$1		# 6 cycles from $1 load		\n\
-	ldq $27,24($18)						\n\
-	xor $2,$3,$3		# 6 cycles from $3 load		\n\
-								\n\
-	ldq $0,24($19)						\n\
-	xor $1,$3,$3						\n\
-	ldq $1,24($20)						\n\
-	xor $4,$5,$5		# 7 cycles from $5 load		\n\
-								\n\
-	stq $3,0($17)						\n\
-	xor $6,$7,$7						\n\
-	xor $21,$22,$22		# 7 cycles from $22 load	\n\
-	xor $5,$7,$7						\n\
-								\n\
-	stq $7,8($17)						\n\
-	xor $23,$24,$24		# 7 cycles from $24 load	\n\
-	ldq $2,32($17)						\n\
-	xor $22,$24,$24						\n\
-								\n\
-	ldq $3,32($18)						\n\
-	ldq $4,32($19)						\n\
-	ldq $5,32($20)						\n\
-	xor $25,$27,$27		# 8 cycles from $27 load	\n\
-								\n\
-	ldq $6,40($17)						\n\
-	ldq $7,40($18)						\n\
-	ldq $21,40($19)						\n\
-	ldq $22,40($20)						\n\
-								\n\
-	stq $24,16($17)						\n\
-	xor $0,$1,$1		# 9 cycles from $1 load		\n\
-	xor $2,$3,$3		# 5 cycles from $3 load		\n\
-	xor $27,$1,$1						\n\
-								\n\
-	stq $1,24($17)						\n\
-	xor $4,$5,$5		# 5 cycles from $5 load		\n\
-	ldq $23,48($17)						\n\
-	xor $3,$5,$5						\n\
-								\n\
-	ldq $24,48($18)						\n\
-	ldq $25,48($19)						\n\
-	ldq $27,48($20)						\n\
-	ldq $0,56($17)						\n\
-								\n\
-	ldq $1,56($18)						\n\
-	ldq $2,56($19)						\n\
-	ldq $3,56($20)						\n\
-	xor $6,$7,$7		# 8 cycles from $6 load		\n\
-								\n\
-	ldq $31,256($17)					\n\
-	xor $21,$22,$22		# 8 cycles from $22 load	\n\
-	ldq $31,256($18)					\n\
-	xor $7,$22,$22						\n\
-								\n\
-	ldq $31,256($19)					\n\
-	xor $23,$24,$24		# 6 cycles from $24 load	\n\
-	ldq $31,256($20)					\n\
-	xor $25,$27,$27		# 6 cycles from $27 load	\n\
-								\n\
-	stq $5,32($17)						\n\
-	xor $24,$27,$27						\n\
-	xor $0,$1,$1		# 7 cycles from $1 load		\n\
-	xor $2,$3,$3		# 6 cycles from $3 load		\n\
-								\n\
-	stq $22,40($17)						\n\
-	xor $1,$3,$3						\n\
-	stq $27,48($17)						\n\
-	subq $16,1,$16						\n\
-								\n\
-	stq $3,56($17)						\n\
-	addq $20,64,$20						\n\
-	addq $19,64,$19						\n\
-	addq $18,64,$18						\n\
-								\n\
-	addq $17,64,$17						\n\
-	bgt $16,4b						\n\
-	ret							\n\
-	.end xor_alpha_prefetch_4				\n\
-								\n\
-	.align 3						\n\
-	.ent xor_alpha_prefetch_5				\n\
-xor_alpha_prefetch_5:						\n\
-	.prologue 0						\n\
-	srl $16, 6, $16						\n\
-								\n\
-	ldq $31, 0($17)						\n\
-	ldq $31, 0($18)						\n\
-	ldq $31, 0($19)						\n\
-	ldq $31, 0($20)						\n\
-	ldq $31, 0($21)						\n\
-								\n\
-	ldq $31, 64($17)					\n\
-	ldq $31, 64($18)					\n\
-	ldq $31, 64($19)					\n\
-	ldq $31, 64($20)					\n\
-	ldq $31, 64($21)					\n\
-								\n\
-	ldq $31, 128($17)					\n\
-	ldq $31, 128($18)					\n\
-	ldq $31, 128($19)					\n\
-	ldq $31, 128($20)					\n\
-	ldq $31, 128($21)					\n\
-								\n\
-	ldq $31, 192($17)					\n\
-	ldq $31, 192($18)					\n\
-	ldq $31, 192($19)					\n\
-	ldq $31, 192($20)					\n\
-	ldq $31, 192($21)					\n\
-	.align 4						\n\
-5:								\n\
-	ldq $0,0($17)						\n\
-	ldq $1,0($18)						\n\
-	ldq $2,0($19)						\n\
-	ldq $3,0($20)						\n\
-								\n\
-	ldq $4,0($21)						\n\
-	ldq $5,8($17)						\n\
-	ldq $6,8($18)						\n\
-	ldq $7,8($19)						\n\
-								\n\
-	ldq $22,8($20)						\n\
-	ldq $23,8($21)						\n\
-	ldq $24,16($17)						\n\
-	ldq $25,16($18)						\n\
-								\n\
-	ldq $27,16($19)						\n\
-	xor $0,$1,$1		# 6 cycles from $1 load		\n\
-	ldq $28,16($20)						\n\
-	xor $2,$3,$3		# 6 cycles from $3 load		\n\
-								\n\
-	ldq $0,16($21)						\n\
-	xor $1,$3,$3						\n\
-	ldq $1,24($17)						\n\
-	xor $3,$4,$4		# 7 cycles from $4 load		\n\
-								\n\
-	stq $4,0($17)						\n\
-	xor $5,$6,$6		# 7 cycles from $6 load		\n\
-	xor $7,$22,$22		# 7 cycles from $22 load	\n\
-	xor $6,$23,$23		# 7 cycles from $23 load	\n\
-								\n\
-	ldq $2,24($18)						\n\
-	xor $22,$23,$23						\n\
-	ldq $3,24($19)						\n\
-	xor $24,$25,$25		# 8 cycles from $25 load	\n\
-								\n\
-	stq $23,8($17)						\n\
-	xor $25,$27,$27		# 8 cycles from $27 load	\n\
-	ldq $4,24($20)						\n\
-	xor $28,$0,$0		# 7 cycles from $0 load		\n\
-								\n\
-	ldq $5,24($21)						\n\
-	xor $27,$0,$0						\n\
-	ldq $6,32($17)						\n\
-	ldq $7,32($18)						\n\
-								\n\
-	stq $0,16($17)						\n\
-	xor $1,$2,$2		# 6 cycles from $2 load		\n\
-	ldq $22,32($19)						\n\
-	xor $3,$4,$4		# 4 cycles from $4 load		\n\
-								\n\
-	ldq $23,32($20)						\n\
-	xor $2,$4,$4						\n\
-	ldq $24,32($21)						\n\
-	ldq $25,40($17)						\n\
-								\n\
-	ldq $27,40($18)						\n\
-	ldq $28,40($19)						\n\
-	ldq $0,40($20)						\n\
-	xor $4,$5,$5		# 7 cycles from $5 load		\n\
-								\n\
-	stq $5,24($17)						\n\
-	xor $6,$7,$7		# 7 cycles from $7 load		\n\
-	ldq $1,40($21)						\n\
-	ldq $2,48($17)						\n\
-								\n\
-	ldq $3,48($18)						\n\
-	xor $7,$22,$22		# 7 cycles from $22 load	\n\
-	ldq $4,48($19)						\n\
-	xor $23,$24,$24		# 6 cycles from $24 load	\n\
-								\n\
-	ldq $5,48($20)						\n\
-	xor $22,$24,$24						\n\
-	ldq $6,48($21)						\n\
-	xor $25,$27,$27		# 7 cycles from $27 load	\n\
-								\n\
-	stq $24,32($17)						\n\
-	xor $27,$28,$28		# 8 cycles from $28 load	\n\
-	ldq $7,56($17)						\n\
-	xor $0,$1,$1		# 6 cycles from $1 load		\n\
-								\n\
-	ldq $22,56($18)						\n\
-	ldq $23,56($19)						\n\
-	ldq $24,56($20)						\n\
-	ldq $25,56($21)						\n\
-								\n\
-	ldq $31,256($17)					\n\
-	xor $28,$1,$1						\n\
-	ldq $31,256($18)					\n\
-	xor $2,$3,$3		# 9 cycles from $3 load		\n\
-								\n\
-	ldq $31,256($19)					\n\
-	xor $3,$4,$4		# 9 cycles from $4 load		\n\
-	ldq $31,256($20)					\n\
-	xor $5,$6,$6		# 8 cycles from $6 load		\n\
-								\n\
-	stq $1,40($17)						\n\
-	xor $4,$6,$6						\n\
-	xor $7,$22,$22		# 7 cycles from $22 load	\n\
-	xor $23,$24,$24		# 6 cycles from $24 load	\n\
-								\n\
-	stq $6,48($17)						\n\
-	xor $22,$24,$24						\n\
-	ldq $31,256($21)					\n\
-	xor $24,$25,$25		# 8 cycles from $25 load	\n\
-								\n\
-	stq $25,56($17)						\n\
-	subq $16,1,$16						\n\
-	addq $21,64,$21						\n\
-	addq $20,64,$20						\n\
-								\n\
-	addq $19,64,$19						\n\
-	addq $18,64,$18						\n\
-	addq $17,64,$17						\n\
-	bgt $16,5b						\n\
-								\n\
-	ret							\n\
-	.end xor_alpha_prefetch_5				\n\
-");
-
-static struct xor_block_template xor_block_alpha = {
-	.name	= "alpha",
-	.do_2	= xor_alpha_2,
-	.do_3	= xor_alpha_3,
-	.do_4	= xor_alpha_4,
-	.do_5	= xor_alpha_5,
-};
-
-static struct xor_block_template xor_block_alpha_prefetch = {
-	.name	= "alpha prefetch",
-	.do_2	= xor_alpha_prefetch_2,
-	.do_3	= xor_alpha_prefetch_3,
-	.do_4	= xor_alpha_prefetch_4,
-	.do_5	= xor_alpha_prefetch_5,
-};
-
-/* For grins, also test the generic routines.  */
+#include <asm/special_insns.h>
 #include <asm-generic/xor.h>
 
+extern struct xor_block_template xor_block_alpha;
+extern struct xor_block_template xor_block_alpha_prefetch;
+
 /*
  * Force the use of alpha_prefetch if EV6, as it is significantly faster in the
  * cold cache case.
diff --git a/lib/raid/xor/Makefile b/lib/raid/xor/Makefile
index 89a944c9f99094..6d03c27c37c7d9 100644
--- a/lib/raid/xor/Makefile
+++ b/lib/raid/xor/Makefile
@@ -7,3 +7,5 @@ xor-y				+= xor-8regs.o
 xor-y				+= xor-32regs.o
 xor-y				+= xor-8regs-prefetch.o
 xor-y				+= xor-32regs-prefetch.o
+
+xor-$(CONFIG_ALPHA)		+= alpha/xor.o
diff --git a/lib/raid/xor/alpha/xor.c b/lib/raid/xor/alpha/xor.c
new file mode 100644
index 00000000000000..0964ac42060405
--- /dev/null
+++ b/lib/raid/xor/alpha/xor.c
@@ -0,0 +1,849 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Optimized XOR parity functions for alpha EV5 and EV6
+ */
+#include <linux/raid/xor_impl.h>
+#include <asm/xor.h>
+
+extern void
+xor_alpha_2(unsigned long bytes, unsigned long * __restrict p1,
+	    const unsigned long * __restrict p2);
+extern void
+xor_alpha_3(unsigned long bytes, unsigned long * __restrict p1,
+	    const unsigned long * __restrict p2,
+	    const unsigned long * __restrict p3);
+extern void
+xor_alpha_4(unsigned long bytes, unsigned long * __restrict p1,
+	    const unsigned long * __restrict p2,
+	    const unsigned long * __restrict p3,
+	    const unsigned long * __restrict p4);
+extern void
+xor_alpha_5(unsigned long bytes, unsigned long * __restrict p1,
+	    const unsigned long * __restrict p2,
+	    const unsigned long * __restrict p3,
+	    const unsigned long * __restrict p4,
+	    const unsigned long * __restrict p5);
+
+extern void
+xor_alpha_prefetch_2(unsigned long bytes, unsigned long * __restrict p1,
+		     const unsigned long * __restrict p2);
+extern void
+xor_alpha_prefetch_3(unsigned long bytes, unsigned long * __restrict p1,
+		     const unsigned long * __restrict p2,
+		     const unsigned long * __restrict p3);
+extern void
+xor_alpha_prefetch_4(unsigned long bytes, unsigned long * __restrict p1,
+		     const unsigned long * __restrict p2,
+		     const unsigned long * __restrict p3,
+		     const unsigned long * __restrict p4);
+extern void
+xor_alpha_prefetch_5(unsigned long bytes, unsigned long * __restrict p1,
+		     const unsigned long * __restrict p2,
+		     const unsigned long * __restrict p3,
+		     const unsigned long * __restrict p4,
+		     const unsigned long * __restrict p5);
+
+asm("								\n\
+	.text							\n\
+	.align 3						\n\
+	.ent xor_alpha_2					\n\
+xor_alpha_2:							\n\
+	.prologue 0						\n\
+	srl $16, 6, $16						\n\
+	.align 4						\n\
+2:								\n\
+	ldq $0,0($17)						\n\
+	ldq $1,0($18)						\n\
+	ldq $2,8($17)						\n\
+	ldq $3,8($18)						\n\
+								\n\
+	ldq $4,16($17)						\n\
+	ldq $5,16($18)						\n\
+	ldq $6,24($17)						\n\
+	ldq $7,24($18)						\n\
+								\n\
+	ldq $19,32($17)						\n\
+	ldq $20,32($18)						\n\
+	ldq $21,40($17)						\n\
+	ldq $22,40($18)						\n\
+								\n\
+	ldq $23,48($17)						\n\
+	ldq $24,48($18)						\n\
+	ldq $25,56($17)						\n\
+	xor $0,$1,$0		# 7 cycles from $1 load		\n\
+								\n\
+	ldq $27,56($18)						\n\
+	xor $2,$3,$2						\n\
+	stq $0,0($17)						\n\
+	xor $4,$5,$4						\n\
+								\n\
+	stq $2,8($17)						\n\
+	xor $6,$7,$6						\n\
+	stq $4,16($17)						\n\
+	xor $19,$20,$19						\n\
+								\n\
+	stq $6,24($17)						\n\
+	xor $21,$22,$21						\n\
+	stq $19,32($17)						\n\
+	xor $23,$24,$23						\n\
+								\n\
+	stq $21,40($17)						\n\
+	xor $25,$27,$25						\n\
+	stq $23,48($17)						\n\
+	subq $16,1,$16						\n\
+								\n\
+	stq $25,56($17)						\n\
+	addq $17,64,$17						\n\
+	addq $18,64,$18						\n\
+	bgt $16,2b						\n\
+								\n\
+	ret							\n\
+	.end xor_alpha_2					\n\
+								\n\
+	.align 3						\n\
+	.ent xor_alpha_3					\n\
+xor_alpha_3:							\n\
+	.prologue 0						\n\
+	srl $16, 6, $16						\n\
+	.align 4						\n\
+3:								\n\
+	ldq $0,0($17)						\n\
+	ldq $1,0($18)						\n\
+	ldq $2,0($19)						\n\
+	ldq $3,8($17)						\n\
+								\n\
+	ldq $4,8($18)						\n\
+	ldq $6,16($17)						\n\
+	ldq $7,16($18)						\n\
+	ldq $21,24($17)						\n\
+								\n\
+	ldq $22,24($18)						\n\
+	ldq $24,32($17)						\n\
+	ldq $25,32($18)						\n\
+	ldq $5,8($19)						\n\
+								\n\
+	ldq $20,16($19)						\n\
+	ldq $23,24($19)						\n\
+	ldq $27,32($19)						\n\
+	nop							\n\
+								\n\
+	xor $0,$1,$1		# 8 cycles from $0 load		\n\
+	xor $3,$4,$4		# 6 cycles from $4 load		\n\
+	xor $6,$7,$7		# 6 cycles from $7 load		\n\
+	xor $21,$22,$22		# 5 cycles from $22 load	\n\
+								\n\
+	xor $1,$2,$2		# 9 cycles from $2 load		\n\
+	xor $24,$25,$25		# 5 cycles from $25 load	\n\
+	stq $2,0($17)						\n\
+	xor $4,$5,$5		# 6 cycles from $5 load		\n\
+								\n\
+	stq $5,8($17)						\n\
+	xor $7,$20,$20		# 7 cycles from $20 load	\n\
+	stq $20,16($17)						\n\
+	xor $22,$23,$23		# 7 cycles from $23 load	\n\
+								\n\
+	stq $23,24($17)						\n\
+	xor $25,$27,$27		# 7 cycles from $27 load	\n\
+	stq $27,32($17)						\n\
+	nop							\n\
+								\n\
+	ldq $0,40($17)						\n\
+	ldq $1,40($18)						\n\
+	ldq $3,48($17)						\n\
+	ldq $4,48($18)						\n\
+								\n\
+	ldq $6,56($17)						\n\
+	ldq $7,56($18)						\n\
+	ldq $2,40($19)						\n\
+	ldq $5,48($19)						\n\
+								\n\
+	ldq $20,56($19)						\n\
+	xor $0,$1,$1		# 4 cycles from $1 load		\n\
+	xor $3,$4,$4		# 5 cycles from $4 load		\n\
+	xor $6,$7,$7		# 5 cycles from $7 load		\n\
+								\n\
+	xor $1,$2,$2		# 4 cycles from $2 load		\n\
+	xor $4,$5,$5		# 5 cycles from $5 load		\n\
+	stq $2,40($17)						\n\
+	xor $7,$20,$20		# 4 cycles from $20 load	\n\
+								\n\
+	stq $5,48($17)						\n\
+	subq $16,1,$16						\n\
+	stq $20,56($17)						\n\
+	addq $19,64,$19						\n\
+								\n\
+	addq $18,64,$18						\n\
+	addq $17,64,$17						\n\
+	bgt $16,3b						\n\
+	ret							\n\
+	.end xor_alpha_3					\n\
+								\n\
+	.align 3						\n\
+	.ent xor_alpha_4					\n\
+xor_alpha_4:							\n\
+	.prologue 0						\n\
+	srl $16, 6, $16						\n\
+	.align 4						\n\
+4:								\n\
+	ldq $0,0($17)						\n\
+	ldq $1,0($18)						\n\
+	ldq $2,0($19)						\n\
+	ldq $3,0($20)						\n\
+								\n\
+	ldq $4,8($17)						\n\
+	ldq $5,8($18)						\n\
+	ldq $6,8($19)						\n\
+	ldq $7,8($20)						\n\
+								\n\
+	ldq $21,16($17)						\n\
+	ldq $22,16($18)						\n\
+	ldq $23,16($19)						\n\
+	ldq $24,16($20)						\n\
+								\n\
+	ldq $25,24($17)						\n\
+	xor $0,$1,$1		# 6 cycles from $1 load		\n\
+	ldq $27,24($18)						\n\
+	xor $2,$3,$3		# 6 cycles from $3 load		\n\
+								\n\
+	ldq $0,24($19)						\n\
+	xor $1,$3,$3						\n\
+	ldq $1,24($20)						\n\
+	xor $4,$5,$5		# 7 cycles from $5 load		\n\
+								\n\
+	stq $3,0($17)						\n\
+	xor $6,$7,$7						\n\
+	xor $21,$22,$22		# 7 cycles from $22 load	\n\
+	xor $5,$7,$7						\n\
+								\n\
+	stq $7,8($17)						\n\
+	xor $23,$24,$24		# 7 cycles from $24 load	\n\
+	ldq $2,32($17)						\n\
+	xor $22,$24,$24						\n\
+								\n\
+	ldq $3,32($18)						\n\
+	ldq $4,32($19)						\n\
+	ldq $5,32($20)						\n\
+	xor $25,$27,$27		# 8 cycles from $27 load	\n\
+								\n\
+	ldq $6,40($17)						\n\
+	ldq $7,40($18)						\n\
+	ldq $21,40($19)						\n\
+	ldq $22,40($20)						\n\
+								\n\
+	stq $24,16($17)						\n\
+	xor $0,$1,$1		# 9 cycles from $1 load		\n\
+	xor $2,$3,$3		# 5 cycles from $3 load		\n\
+	xor $27,$1,$1						\n\
+								\n\
+	stq $1,24($17)						\n\
+	xor $4,$5,$5		# 5 cycles from $5 load		\n\
+	ldq $23,48($17)						\n\
+	ldq $24,48($18)						\n\
+								\n\
+	ldq $25,48($19)						\n\
+	xor $3,$5,$5						\n\
+	ldq $27,48($20)						\n\
+	ldq $0,56($17)						\n\
+								\n\
+	ldq $1,56($18)						\n\
+	ldq $2,56($19)						\n\
+	xor $6,$7,$7		# 8 cycles from $6 load		\n\
+	ldq $3,56($20)						\n\
+								\n\
+	stq $5,32($17)						\n\
+	xor $21,$22,$22		# 8 cycles from $22 load	\n\
+	xor $7,$22,$22						\n\
+	xor $23,$24,$24		# 5 cycles from $24 load	\n\
+								\n\
+	stq $22,40($17)						\n\
+	xor $25,$27,$27		# 5 cycles from $27 load	\n\
+	xor $24,$27,$27						\n\
+	xor $0,$1,$1		# 5 cycles from $1 load		\n\
+								\n\
+	stq $27,48($17)						\n\
+	xor $2,$3,$3		# 4 cycles from $3 load		\n\
+	xor $1,$3,$3						\n\
+	subq $16,1,$16						\n\
+								\n\
+	stq $3,56($17)						\n\
+	addq $20,64,$20						\n\
+	addq $19,64,$19						\n\
+	addq $18,64,$18						\n\
+								\n\
+	addq $17,64,$17						\n\
+	bgt $16,4b						\n\
+	ret							\n\
+	.end xor_alpha_4					\n\
+								\n\
+	.align 3						\n\
+	.ent xor_alpha_5					\n\
+xor_alpha_5:							\n\
+	.prologue 0						\n\
+	srl $16, 6, $16						\n\
+	.align 4						\n\
+5:								\n\
+	ldq $0,0($17)						\n\
+	ldq $1,0($18)						\n\
+	ldq $2,0($19)						\n\
+	ldq $3,0($20)						\n\
+								\n\
+	ldq $4,0($21)						\n\
+	ldq $5,8($17)						\n\
+	ldq $6,8($18)						\n\
+	ldq $7,8($19)						\n\
+								\n\
+	ldq $22,8($20)						\n\
+	ldq $23,8($21)						\n\
+	ldq $24,16($17)						\n\
+	ldq $25,16($18)						\n\
+								\n\
+	ldq $27,16($19)						\n\
+	xor $0,$1,$1		# 6 cycles from $1 load		\n\
+	ldq $28,16($20)						\n\
+	xor $2,$3,$3		# 6 cycles from $3 load		\n\
+								\n\
+	ldq $0,16($21)						\n\
+	xor $1,$3,$3						\n\
+	ldq $1,24($17)						\n\
+	xor $3,$4,$4		# 7 cycles from $4 load		\n\
+								\n\
+	stq $4,0($17)						\n\
+	xor $5,$6,$6		# 7 cycles from $6 load		\n\
+	xor $7,$22,$22		# 7 cycles from $22 load	\n\
+	xor $6,$23,$23		# 7 cycles from $23 load	\n\
+								\n\
+	ldq $2,24($18)						\n\
+	xor $22,$23,$23						\n\
+	ldq $3,24($19)						\n\
+	xor $24,$25,$25		# 8 cycles from $25 load	\n\
+								\n\
+	stq $23,8($17)						\n\
+	xor $25,$27,$27		# 8 cycles from $27 load	\n\
+	ldq $4,24($20)						\n\
+	xor $28,$0,$0		# 7 cycles from $0 load		\n\
+								\n\
+	ldq $5,24($21)						\n\
+	xor $27,$0,$0						\n\
+	ldq $6,32($17)						\n\
+	ldq $7,32($18)						\n\
+								\n\
+	stq $0,16($17)						\n\
+	xor $1,$2,$2		# 6 cycles from $2 load		\n\
+	ldq $22,32($19)						\n\
+	xor $3,$4,$4		# 4 cycles from $4 load		\n\
+								\n\
+	ldq $23,32($20)						\n\
+	xor $2,$4,$4						\n\
+	ldq $24,32($21)						\n\
+	ldq $25,40($17)						\n\
+								\n\
+	ldq $27,40($18)						\n\
+	ldq $28,40($19)						\n\
+	ldq $0,40($20)						\n\
+	xor $4,$5,$5		# 7 cycles from $5 load		\n\
+								\n\
+	stq $5,24($17)						\n\
+	xor $6,$7,$7		# 7 cycles from $7 load		\n\
+	ldq $1,40($21)						\n\
+	ldq $2,48($17)						\n\
+								\n\
+	ldq $3,48($18)						\n\
+	xor $7,$22,$22		# 7 cycles from $22 load	\n\
+	ldq $4,48($19)						\n\
+	xor $23,$24,$24		# 6 cycles from $24 load	\n\
+								\n\
+	ldq $5,48($20)						\n\
+	xor $22,$24,$24						\n\
+	ldq $6,48($21)						\n\
+	xor $25,$27,$27		# 7 cycles from $27 load	\n\
+								\n\
+	stq $24,32($17)						\n\
+	xor $27,$28,$28		# 8 cycles from $28 load	\n\
+	ldq $7,56($17)						\n\
+	xor $0,$1,$1		# 6 cycles from $1 load		\n\
+								\n\
+	ldq $22,56($18)						\n\
+	ldq $23,56($19)						\n\
+	ldq $24,56($20)						\n\
+	ldq $25,56($21)						\n\
+								\n\
+	xor $28,$1,$1						\n\
+	xor $2,$3,$3		# 9 cycles from $3 load		\n\
+	xor $3,$4,$4		# 9 cycles from $4 load		\n\
+	xor $5,$6,$6		# 8 cycles from $6 load		\n\
+								\n\
+	stq $1,40($17)						\n\
+	xor $4,$6,$6						\n\
+	xor $7,$22,$22		# 7 cycles from $22 load	\n\
+	xor $23,$24,$24		# 6 cycles from $24 load	\n\
+								\n\
+	stq $6,48($17)						\n\
+	xor $22,$24,$24						\n\
+	subq $16,1,$16						\n\
+	xor $24,$25,$25		# 8 cycles from $25 load	\n\
+								\n\
+	stq $25,56($17)						\n\
+	addq $21,64,$21						\n\
+	addq $20,64,$20						\n\
+	addq $19,64,$19						\n\
+								\n\
+	addq $18,64,$18						\n\
+	addq $17,64,$17						\n\
+	bgt $16,5b						\n\
+	ret							\n\
+	.end xor_alpha_5					\n\
+								\n\
+	.align 3						\n\
+	.ent xor_alpha_prefetch_2				\n\
+xor_alpha_prefetch_2:						\n\
+	.prologue 0						\n\
+	srl $16, 6, $16						\n\
+								\n\
+	ldq $31, 0($17)						\n\
+	ldq $31, 0($18)						\n\
+								\n\
+	ldq $31, 64($17)					\n\
+	ldq $31, 64($18)					\n\
+								\n\
+	ldq $31, 128($17)					\n\
+	ldq $31, 128($18)					\n\
+								\n\
+	ldq $31, 192($17)					\n\
+	ldq $31, 192($18)					\n\
+	.align 4						\n\
+2:								\n\
+	ldq $0,0($17)						\n\
+	ldq $1,0($18)						\n\
+	ldq $2,8($17)						\n\
+	ldq $3,8($18)						\n\
+								\n\
+	ldq $4,16($17)						\n\
+	ldq $5,16($18)						\n\
+	ldq $6,24($17)						\n\
+	ldq $7,24($18)						\n\
+								\n\
+	ldq $19,32($17)						\n\
+	ldq $20,32($18)						\n\
+	ldq $21,40($17)						\n\
+	ldq $22,40($18)						\n\
+								\n\
+	ldq $23,48($17)						\n\
+	ldq $24,48($18)						\n\
+	ldq $25,56($17)						\n\
+	ldq $27,56($18)						\n\
+								\n\
+	ldq $31,256($17)					\n\
+	xor $0,$1,$0		# 8 cycles from $1 load		\n\
+	ldq $31,256($18)					\n\
+	xor $2,$3,$2						\n\
+								\n\
+	stq $0,0($17)						\n\
+	xor $4,$5,$4						\n\
+	stq $2,8($17)						\n\
+	xor $6,$7,$6						\n\
+								\n\
+	stq $4,16($17)						\n\
+	xor $19,$20,$19						\n\
+	stq $6,24($17)						\n\
+	xor $21,$22,$21						\n\
+								\n\
+	stq $19,32($17)						\n\
+	xor $23,$24,$23						\n\
+	stq $21,40($17)						\n\
+	xor $25,$27,$25						\n\
+								\n\
+	stq $23,48($17)						\n\
+	subq $16,1,$16						\n\
+	stq $25,56($17)						\n\
+	addq $17,64,$17						\n\
+								\n\
+	addq $18,64,$18						\n\
+	bgt $16,2b						\n\
+	ret							\n\
+	.end xor_alpha_prefetch_2				\n\
+								\n\
+	.align 3						\n\
+	.ent xor_alpha_prefetch_3				\n\
+xor_alpha_prefetch_3:						\n\
+	.prologue 0						\n\
+	srl $16, 6, $16						\n\
+								\n\
+	ldq $31, 0($17)						\n\
+	ldq $31, 0($18)						\n\
+	ldq $31, 0($19)						\n\
+								\n\
+	ldq $31, 64($17)					\n\
+	ldq $31, 64($18)					\n\
+	ldq $31, 64($19)					\n\
+								\n\
+	ldq $31, 128($17)					\n\
+	ldq $31, 128($18)					\n\
+	ldq $31, 128($19)					\n\
+								\n\
+	ldq $31, 192($17)					\n\
+	ldq $31, 192($18)					\n\
+	ldq $31, 192($19)					\n\
+	.align 4						\n\
+3:								\n\
+	ldq $0,0($17)						\n\
+	ldq $1,0($18)						\n\
+	ldq $2,0($19)						\n\
+	ldq $3,8($17)						\n\
+								\n\
+	ldq $4,8($18)						\n\
+	ldq $6,16($17)						\n\
+	ldq $7,16($18)						\n\
+	ldq $21,24($17)						\n\
+								\n\
+	ldq $22,24($18)						\n\
+	ldq $24,32($17)						\n\
+	ldq $25,32($18)						\n\
+	ldq $5,8($19)						\n\
+								\n\
+	ldq $20,16($19)						\n\
+	ldq $23,24($19)						\n\
+	ldq $27,32($19)						\n\
+	nop							\n\
+								\n\
+	xor $0,$1,$1		# 8 cycles from $0 load		\n\
+	xor $3,$4,$4		# 7 cycles from $4 load		\n\
+	xor $6,$7,$7		# 6 cycles from $7 load		\n\
+	xor $21,$22,$22		# 5 cycles from $22 load	\n\
+								\n\
+	xor $1,$2,$2		# 9 cycles from $2 load		\n\
+	xor $24,$25,$25		# 5 cycles from $25 load	\n\
+	stq $2,0($17)						\n\
+	xor $4,$5,$5		# 6 cycles from $5 load		\n\
+								\n\
+	stq $5,8($17)						\n\
+	xor $7,$20,$20		# 7 cycles from $20 load	\n\
+	stq $20,16($17)						\n\
+	xor $22,$23,$23		# 7 cycles from $23 load	\n\
+								\n\
+	stq $23,24($17)						\n\
+	xor $25,$27,$27		# 7 cycles from $27 load	\n\
+	stq $27,32($17)						\n\
+	nop							\n\
+								\n\
+	ldq $0,40($17)						\n\
+	ldq $1,40($18)						\n\
+	ldq $3,48($17)						\n\
+	ldq $4,48($18)						\n\
+								\n\
+	ldq $6,56($17)						\n\
+	ldq $7,56($18)						\n\
+	ldq $2,40($19)						\n\
+	ldq $5,48($19)						\n\
+								\n\
+	ldq $20,56($19)						\n\
+	ldq $31,256($17)					\n\
+	ldq $31,256($18)					\n\
+	ldq $31,256($19)					\n\
+								\n\
+	xor $0,$1,$1		# 6 cycles from $1 load		\n\
+	xor $3,$4,$4		# 5 cycles from $4 load		\n\
+	xor $6,$7,$7		# 5 cycles from $7 load		\n\
+	xor $1,$2,$2		# 4 cycles from $2 load		\n\
+								\n\
+	xor $4,$5,$5		# 5 cycles from $5 load		\n\
+	xor $7,$20,$20		# 4 cycles from $20 load	\n\
+	stq $2,40($17)						\n\
+	subq $16,1,$16						\n\
+								\n\
+	stq $5,48($17)						\n\
+	addq $19,64,$19						\n\
+	stq $20,56($17)						\n\
+	addq $18,64,$18						\n\
+								\n\
+	addq $17,64,$17						\n\
+	bgt $16,3b						\n\
+	ret							\n\
+	.end xor_alpha_prefetch_3				\n\
+								\n\
+	.align 3						\n\
+	.ent xor_alpha_prefetch_4				\n\
+xor_alpha_prefetch_4:						\n\
+	.prologue 0						\n\
+	srl $16, 6, $16						\n\
+								\n\
+	ldq $31, 0($17)						\n\
+	ldq $31, 0($18)						\n\
+	ldq $31, 0($19)						\n\
+	ldq $31, 0($20)						\n\
+								\n\
+	ldq $31, 64($17)					\n\
+	ldq $31, 64($18)					\n\
+	ldq $31, 64($19)					\n\
+	ldq $31, 64($20)					\n\
+								\n\
+	ldq $31, 128($17)					\n\
+	ldq $31, 128($18)					\n\
+	ldq $31, 128($19)					\n\
+	ldq $31, 128($20)					\n\
+								\n\
+	ldq $31, 192($17)					\n\
+	ldq $31, 192($18)					\n\
+	ldq $31, 192($19)					\n\
+	ldq $31, 192($20)					\n\
+	.align 4						\n\
+4:								\n\
+	ldq $0,0($17)						\n\
+	ldq $1,0($18)						\n\
+	ldq $2,0($19)						\n\
+	ldq $3,0($20)						\n\
+								\n\
+	ldq $4,8($17)						\n\
+	ldq $5,8($18)						\n\
+	ldq $6,8($19)						\n\
+	ldq $7,8($20)						\n\
+								\n\
+	ldq $21,16($17)						\n\
+	ldq $22,16($18)						\n\
+	ldq $23,16($19)						\n\
+	ldq $24,16($20)						\n\
+								\n\
+	ldq $25,24($17)						\n\
+	xor $0,$1,$1		# 6 cycles from $1 load		\n\
+	ldq $27,24($18)						\n\
+	xor $2,$3,$3		# 6 cycles from $3 load		\n\
+								\n\
+	ldq $0,24($19)						\n\
+	xor $1,$3,$3						\n\
+	ldq $1,24($20)						\n\
+	xor $4,$5,$5		# 7 cycles from $5 load		\n\
+								\n\
+	stq $3,0($17)						\n\
+	xor $6,$7,$7						\n\
+	xor $21,$22,$22		# 7 cycles from $22 load	\n\
+	xor $5,$7,$7						\n\
+								\n\
+	stq $7,8($17)						\n\
+	xor $23,$24,$24		# 7 cycles from $24 load	\n\
+	ldq $2,32($17)						\n\
+	xor $22,$24,$24						\n\
+								\n\
+	ldq $3,32($18)						\n\
+	ldq $4,32($19)						\n\
+	ldq $5,32($20)						\n\
+	xor $25,$27,$27		# 8 cycles from $27 load	\n\
+								\n\
+	ldq $6,40($17)						\n\
+	ldq $7,40($18)						\n\
+	ldq $21,40($19)						\n\
+	ldq $22,40($20)						\n\
+								\n\
+	stq $24,16($17)						\n\
+	xor $0,$1,$1		# 9 cycles from $1 load		\n\
+	xor $2,$3,$3		# 5 cycles from $3 load		\n\
+	xor $27,$1,$1						\n\
+								\n\
+	stq $1,24($17)						\n\
+	xor $4,$5,$5		# 5 cycles from $5 load		\n\
+	ldq $23,48($17)						\n\
+	xor $3,$5,$5						\n\
+								\n\
+	ldq $24,48($18)						\n\
+	ldq $25,48($19)						\n\
+	ldq $27,48($20)						\n\
+	ldq $0,56($17)						\n\
+								\n\
+	ldq $1,56($18)						\n\
+	ldq $2,56($19)						\n\
+	ldq $3,56($20)						\n\
+	xor $6,$7,$7		# 8 cycles from $6 load		\n\
+								\n\
+	ldq $31,256($17)					\n\
+	xor $21,$22,$22		# 8 cycles from $22 load	\n\
+	ldq $31,256($18)					\n\
+	xor $7,$22,$22						\n\
+								\n\
+	ldq $31,256($19)					\n\
+	xor $23,$24,$24		# 6 cycles from $24 load	\n\
+	ldq $31,256($20)					\n\
+	xor $25,$27,$27		# 6 cycles from $27 load	\n\
+								\n\
+	stq $5,32($17)						\n\
+	xor $24,$27,$27						\n\
+	xor $0,$1,$1		# 7 cycles from $1 load		\n\
+	xor $2,$3,$3		# 6 cycles from $3 load		\n\
+								\n\
+	stq $22,40($17)						\n\
+	xor $1,$3,$3						\n\
+	stq $27,48($17)						\n\
+	subq $16,1,$16						\n\
+								\n\
+	stq $3,56($17)						\n\
+	addq $20,64,$20						\n\
+	addq $19,64,$19						\n\
+	addq $18,64,$18						\n\
+								\n\
+	addq $17,64,$17						\n\
+	bgt $16,4b						\n\
+	ret							\n\
+	.end xor_alpha_prefetch_4				\n\
+								\n\
+	.align 3						\n\
+	.ent xor_alpha_prefetch_5				\n\
+xor_alpha_prefetch_5:						\n\
+	.prologue 0						\n\
+	srl $16, 6, $16						\n\
+								\n\
+	ldq $31, 0($17)						\n\
+	ldq $31, 0($18)						\n\
+	ldq $31, 0($19)						\n\
+	ldq $31, 0($20)						\n\
+	ldq $31, 0($21)						\n\
+								\n\
+	ldq $31, 64($17)					\n\
+	ldq $31, 64($18)					\n\
+	ldq $31, 64($19)					\n\
+	ldq $31, 64($20)					\n\
+	ldq $31, 64($21)					\n\
+								\n\
+	ldq $31, 128($17)					\n\
+	ldq $31, 128($18)					\n\
+	ldq $31, 128($19)					\n\
+	ldq $31, 128($20)					\n\
+	ldq $31, 128($21)					\n\
+								\n\
+	ldq $31, 192($17)					\n\
+	ldq $31, 192($18)					\n\
+	ldq $31, 192($19)					\n\
+	ldq $31, 192($20)					\n\
+	ldq $31, 192($21)					\n\
+	.align 4						\n\
+5:								\n\
+	ldq $0,0($17)						\n\
+	ldq $1,0($18)						\n\
+	ldq $2,0($19)						\n\
+	ldq $3,0($20)						\n\
+								\n\
+	ldq $4,0($21)						\n\
+	ldq $5,8($17)						\n\
+	ldq $6,8($18)						\n\
+	ldq $7,8($19)						\n\
+								\n\
+	ldq $22,8($20)						\n\
+	ldq $23,8($21)						\n\
+	ldq $24,16($17)						\n\
+	ldq $25,16($18)						\n\
+								\n\
+	ldq $27,16($19)						\n\
+	xor $0,$1,$1		# 6 cycles from $1 load		\n\
+	ldq $28,16($20)						\n\
+	xor $2,$3,$3		# 6 cycles from $3 load		\n\
+								\n\
+	ldq $0,16($21)						\n\
+	xor $1,$3,$3						\n\
+	ldq $1,24($17)						\n\
+	xor $3,$4,$4		# 7 cycles from $4 load		\n\
+								\n\
+	stq $4,0($17)						\n\
+	xor $5,$6,$6		# 7 cycles from $6 load		\n\
+	xor $7,$22,$22		# 7 cycles from $22 load	\n\
+	xor $6,$23,$23		# 7 cycles from $23 load	\n\
+								\n\
+	ldq $2,24($18)						\n\
+	xor $22,$23,$23						\n\
+	ldq $3,24($19)						\n\
+	xor $24,$25,$25		# 8 cycles from $25 load	\n\
+								\n\
+	stq $23,8($17)						\n\
+	xor $25,$27,$27		# 8 cycles from $27 load	\n\
+	ldq $4,24($20)						\n\
+	xor $28,$0,$0		# 7 cycles from $0 load		\n\
+								\n\
+	ldq $5,24($21)						\n\
+	xor $27,$0,$0						\n\
+	ldq $6,32($17)						\n\
+	ldq $7,32($18)						\n\
+								\n\
+	stq $0,16($17)						\n\
+	xor $1,$2,$2		# 6 cycles from $2 load		\n\
+	ldq $22,32($19)						\n\
+	xor $3,$4,$4		# 4 cycles from $4 load		\n\
+								\n\
+	ldq $23,32($20)						\n\
+	xor $2,$4,$4						\n\
+	ldq $24,32($21)						\n\
+	ldq $25,40($17)						\n\
+								\n\
+	ldq $27,40($18)						\n\
+	ldq $28,40($19)						\n\
+	ldq $0,40($20)						\n\
+	xor $4,$5,$5		# 7 cycles from $5 load		\n\
+								\n\
+	stq $5,24($17)						\n\
+	xor $6,$7,$7		# 7 cycles from $7 load		\n\
+	ldq $1,40($21)						\n\
+	ldq $2,48($17)						\n\
+								\n\
+	ldq $3,48($18)						\n\
+	xor $7,$22,$22		# 7 cycles from $22 load	\n\
+	ldq $4,48($19)						\n\
+	xor $23,$24,$24		# 6 cycles from $24 load	\n\
+								\n\
+	ldq $5,48($20)						\n\
+	xor $22,$24,$24						\n\
+	ldq $6,48($21)						\n\
+	xor $25,$27,$27		# 7 cycles from $27 load	\n\
+								\n\
+	stq $24,32($17)						\n\
+	xor $27,$28,$28		# 8 cycles from $28 load	\n\
+	ldq $7,56($17)						\n\
+	xor $0,$1,$1		# 6 cycles from $1 load		\n\
+								\n\
+	ldq $22,56($18)						\n\
+	ldq $23,56($19)						\n\
+	ldq $24,56($20)						\n\
+	ldq $25,56($21)						\n\
+								\n\
+	ldq $31,256($17)					\n\
+	xor $28,$1,$1						\n\
+	ldq $31,256($18)					\n\
+	xor $2,$3,$3		# 9 cycles from $3 load		\n\
+								\n\
+	ldq $31,256($19)					\n\
+	xor $3,$4,$4		# 9 cycles from $4 load		\n\
+	ldq $31,256($20)					\n\
+	xor $5,$6,$6		# 8 cycles from $6 load		\n\
+								\n\
+	stq $1,40($17)						\n\
+	xor $4,$6,$6						\n\
+	xor $7,$22,$22		# 7 cycles from $22 load	\n\
+	xor $23,$24,$24		# 6 cycles from $24 load	\n\
+								\n\
+	stq $6,48($17)						\n\
+	xor $22,$24,$24						\n\
+	ldq $31,256($21)					\n\
+	xor $24,$25,$25		# 8 cycles from $25 load	\n\
+								\n\
+	stq $25,56($17)						\n\
+	subq $16,1,$16						\n\
+	addq $21,64,$21						\n\
+	addq $20,64,$20						\n\
+								\n\
+	addq $19,64,$19						\n\
+	addq $18,64,$18						\n\
+	addq $17,64,$17						\n\
+	bgt $16,5b						\n\
+								\n\
+	ret							\n\
+	.end xor_alpha_prefetch_5				\n\
+");
+
+struct xor_block_template xor_block_alpha = {
+	.name	= "alpha",
+	.do_2	= xor_alpha_2,
+	.do_3	= xor_alpha_3,
+	.do_4	= xor_alpha_4,
+	.do_5	= xor_alpha_5,
+};
+
+struct xor_block_template xor_block_alpha_prefetch = {
+	.name	= "alpha prefetch",
+	.do_2	= xor_alpha_prefetch_2,
+	.do_3	= xor_alpha_prefetch_3,
+	.do_4	= xor_alpha_prefetch_4,
+	.do_5	= xor_alpha_prefetch_5,
+};

From 36efb00257bca851a7a2e4b1268c796b2f7f7ad3 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 24 Mar 2026 07:21:47 +0100
Subject: [PATCH 11/26] arm: move the XOR code to lib/raid/

Move the optimized XOR into lib/raid and include it it in the main
xor.ko instead of building a separate module for it.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Linux RISC-V bot <linux.riscv.bot@gmail.com>
---
 arch/arm/include/asm/xor.h                    | 190 +-----------------
 arch/arm/lib/Makefile                         |   5 -
 lib/raid/xor/Makefile                         |   8 +
 lib/raid/xor/arm/xor-neon-glue.c              |  58 ++++++
 {arch/arm/lib => lib/raid/xor/arm}/xor-neon.c |  10 +-
 lib/raid/xor/arm/xor.c                        | 136 +++++++++++++
 6 files changed, 205 insertions(+), 202 deletions(-)
 create mode 100644 lib/raid/xor/arm/xor-neon-glue.c
 rename {arch/arm/lib => lib/raid/xor/arm}/xor-neon.c (74%)
 create mode 100644 lib/raid/xor/arm/xor.c

diff --git a/arch/arm/include/asm/xor.h b/arch/arm/include/asm/xor.h
index b2dcd49186e2b9..989c55872ef6a5 100644
--- a/arch/arm/include/asm/xor.h
+++ b/arch/arm/include/asm/xor.h
@@ -1,198 +1,12 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
 /*
- *  arch/arm/include/asm/xor.h
- *
  *  Copyright (C) 2001 Russell King
  */
 #include <asm-generic/xor.h>
-#include <asm/hwcap.h>
 #include <asm/neon.h>
 
-#define __XOR(a1, a2) a1 ^= a2
-
-#define GET_BLOCK_2(dst) \
-	__asm__("ldmia	%0, {%1, %2}" \
-		: "=r" (dst), "=r" (a1), "=r" (a2) \
-		: "0" (dst))
-
-#define GET_BLOCK_4(dst) \
-	__asm__("ldmia	%0, {%1, %2, %3, %4}" \
-		: "=r" (dst), "=r" (a1), "=r" (a2), "=r" (a3), "=r" (a4) \
-		: "0" (dst))
-
-#define XOR_BLOCK_2(src) \
-	__asm__("ldmia	%0!, {%1, %2}" \
-		: "=r" (src), "=r" (b1), "=r" (b2) \
-		: "0" (src)); \
-	__XOR(a1, b1); __XOR(a2, b2);
-
-#define XOR_BLOCK_4(src) \
-	__asm__("ldmia	%0!, {%1, %2, %3, %4}" \
-		: "=r" (src), "=r" (b1), "=r" (b2), "=r" (b3), "=r" (b4) \
-		: "0" (src)); \
-	__XOR(a1, b1); __XOR(a2, b2); __XOR(a3, b3); __XOR(a4, b4)
-
-#define PUT_BLOCK_2(dst) \
-	__asm__ __volatile__("stmia	%0!, {%2, %3}" \
-		: "=r" (dst) \
-		: "0" (dst), "r" (a1), "r" (a2))
-
-#define PUT_BLOCK_4(dst) \
-	__asm__ __volatile__("stmia	%0!, {%2, %3, %4, %5}" \
-		: "=r" (dst) \
-		: "0" (dst), "r" (a1), "r" (a2), "r" (a3), "r" (a4))
-
-static void
-xor_arm4regs_2(unsigned long bytes, unsigned long * __restrict p1,
-	       const unsigned long * __restrict p2)
-{
-	unsigned int lines = bytes / sizeof(unsigned long) / 4;
-	register unsigned int a1 __asm__("r4");
-	register unsigned int a2 __asm__("r5");
-	register unsigned int a3 __asm__("r6");
-	register unsigned int a4 __asm__("r10");
-	register unsigned int b1 __asm__("r8");
-	register unsigned int b2 __asm__("r9");
-	register unsigned int b3 __asm__("ip");
-	register unsigned int b4 __asm__("lr");
-
-	do {
-		GET_BLOCK_4(p1);
-		XOR_BLOCK_4(p2);
-		PUT_BLOCK_4(p1);
-	} while (--lines);
-}
-
-static void
-xor_arm4regs_3(unsigned long bytes, unsigned long * __restrict p1,
-	       const unsigned long * __restrict p2,
-	       const unsigned long * __restrict p3)
-{
-	unsigned int lines = bytes / sizeof(unsigned long) / 4;
-	register unsigned int a1 __asm__("r4");
-	register unsigned int a2 __asm__("r5");
-	register unsigned int a3 __asm__("r6");
-	register unsigned int a4 __asm__("r10");
-	register unsigned int b1 __asm__("r8");
-	register unsigned int b2 __asm__("r9");
-	register unsigned int b3 __asm__("ip");
-	register unsigned int b4 __asm__("lr");
-
-	do {
-		GET_BLOCK_4(p1);
-		XOR_BLOCK_4(p2);
-		XOR_BLOCK_4(p3);
-		PUT_BLOCK_4(p1);
-	} while (--lines);
-}
-
-static void
-xor_arm4regs_4(unsigned long bytes, unsigned long * __restrict p1,
-	       const unsigned long * __restrict p2,
-	       const unsigned long * __restrict p3,
-	       const unsigned long * __restrict p4)
-{
-	unsigned int lines = bytes / sizeof(unsigned long) / 2;
-	register unsigned int a1 __asm__("r8");
-	register unsigned int a2 __asm__("r9");
-	register unsigned int b1 __asm__("ip");
-	register unsigned int b2 __asm__("lr");
-
-	do {
-		GET_BLOCK_2(p1);
-		XOR_BLOCK_2(p2);
-		XOR_BLOCK_2(p3);
-		XOR_BLOCK_2(p4);
-		PUT_BLOCK_2(p1);
-	} while (--lines);
-}
-
-static void
-xor_arm4regs_5(unsigned long bytes, unsigned long * __restrict p1,
-	       const unsigned long * __restrict p2,
-	       const unsigned long * __restrict p3,
-	       const unsigned long * __restrict p4,
-	       const unsigned long * __restrict p5)
-{
-	unsigned int lines = bytes / sizeof(unsigned long) / 2;
-	register unsigned int a1 __asm__("r8");
-	register unsigned int a2 __asm__("r9");
-	register unsigned int b1 __asm__("ip");
-	register unsigned int b2 __asm__("lr");
-
-	do {
-		GET_BLOCK_2(p1);
-		XOR_BLOCK_2(p2);
-		XOR_BLOCK_2(p3);
-		XOR_BLOCK_2(p4);
-		XOR_BLOCK_2(p5);
-		PUT_BLOCK_2(p1);
-	} while (--lines);
-}
-
-static struct xor_block_template xor_block_arm4regs = {
-	.name	= "arm4regs",
-	.do_2	= xor_arm4regs_2,
-	.do_3	= xor_arm4regs_3,
-	.do_4	= xor_arm4regs_4,
-	.do_5	= xor_arm4regs_5,
-};
-
-#ifdef CONFIG_KERNEL_MODE_NEON
-
-extern struct xor_block_template const xor_block_neon_inner;
-
-static void
-xor_neon_2(unsigned long bytes, unsigned long * __restrict p1,
-	   const unsigned long * __restrict p2)
-{
-	kernel_neon_begin();
-	xor_block_neon_inner.do_2(bytes, p1, p2);
-	kernel_neon_end();
-}
-
-static void
-xor_neon_3(unsigned long bytes, unsigned long * __restrict p1,
-	   const unsigned long * __restrict p2,
-	   const unsigned long * __restrict p3)
-{
-	kernel_neon_begin();
-	xor_block_neon_inner.do_3(bytes, p1, p2, p3);
-	kernel_neon_end();
-}
-
-static void
-xor_neon_4(unsigned long bytes, unsigned long * __restrict p1,
-	   const unsigned long * __restrict p2,
-	   const unsigned long * __restrict p3,
-	   const unsigned long * __restrict p4)
-{
-	kernel_neon_begin();
-	xor_block_neon_inner.do_4(bytes, p1, p2, p3, p4);
-	kernel_neon_end();
-}
-
-static void
-xor_neon_5(unsigned long bytes, unsigned long * __restrict p1,
-	   const unsigned long * __restrict p2,
-	   const unsigned long * __restrict p3,
-	   const unsigned long * __restrict p4,
-	   const unsigned long * __restrict p5)
-{
-	kernel_neon_begin();
-	xor_block_neon_inner.do_5(bytes, p1, p2, p3, p4, p5);
-	kernel_neon_end();
-}
-
-static struct xor_block_template xor_block_neon = {
-	.name	= "neon",
-	.do_2	= xor_neon_2,
-	.do_3	= xor_neon_3,
-	.do_4	= xor_neon_4,
-	.do_5	= xor_neon_5
-};
-
-#endif /* CONFIG_KERNEL_MODE_NEON */
+extern struct xor_block_template xor_block_arm4regs;
+extern struct xor_block_template xor_block_neon;
 
 #define arch_xor_init arch_xor_init
 static __always_inline void __init arch_xor_init(void)
diff --git a/arch/arm/lib/Makefile b/arch/arm/lib/Makefile
index 0ca5aae1bcc3e3..9295055cdfc92b 100644
--- a/arch/arm/lib/Makefile
+++ b/arch/arm/lib/Makefile
@@ -39,9 +39,4 @@ endif
 $(obj)/csumpartialcopy.o:	$(obj)/csumpartialcopygeneric.S
 $(obj)/csumpartialcopyuser.o:	$(obj)/csumpartialcopygeneric.S
 
-ifeq ($(CONFIG_KERNEL_MODE_NEON),y)
-  CFLAGS_xor-neon.o		+= $(CC_FLAGS_FPU)
-  obj-$(CONFIG_XOR_BLOCKS)	+= xor-neon.o
-endif
-
 obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o
diff --git a/lib/raid/xor/Makefile b/lib/raid/xor/Makefile
index 6d03c27c37c7d9..fb760edae54b5c 100644
--- a/lib/raid/xor/Makefile
+++ b/lib/raid/xor/Makefile
@@ -9,3 +9,11 @@ xor-y				+= xor-8regs-prefetch.o
 xor-y				+= xor-32regs-prefetch.o
 
 xor-$(CONFIG_ALPHA)		+= alpha/xor.o
+xor-$(CONFIG_ARM)		+= arm/xor.o
+ifeq ($(CONFIG_ARM),y)
+xor-$(CONFIG_KERNEL_MODE_NEON)	+= arm/xor-neon.o arm/xor-neon-glue.o
+endif
+
+
+CFLAGS_arm/xor-neon.o		+= $(CC_FLAGS_FPU)
+CFLAGS_REMOVE_arm/xor-neon.o	+= $(CC_FLAGS_NO_FPU)
diff --git a/lib/raid/xor/arm/xor-neon-glue.c b/lib/raid/xor/arm/xor-neon-glue.c
new file mode 100644
index 00000000000000..c7b162b383a25b
--- /dev/null
+++ b/lib/raid/xor/arm/xor-neon-glue.c
@@ -0,0 +1,58 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ *  Copyright (C) 2001 Russell King
+ */
+#include <linux/raid/xor_impl.h>
+#include <asm/xor.h>
+
+extern struct xor_block_template const xor_block_neon_inner;
+
+static void
+xor_neon_2(unsigned long bytes, unsigned long * __restrict p1,
+	   const unsigned long * __restrict p2)
+{
+	kernel_neon_begin();
+	xor_block_neon_inner.do_2(bytes, p1, p2);
+	kernel_neon_end();
+}
+
+static void
+xor_neon_3(unsigned long bytes, unsigned long * __restrict p1,
+	   const unsigned long * __restrict p2,
+	   const unsigned long * __restrict p3)
+{
+	kernel_neon_begin();
+	xor_block_neon_inner.do_3(bytes, p1, p2, p3);
+	kernel_neon_end();
+}
+
+static void
+xor_neon_4(unsigned long bytes, unsigned long * __restrict p1,
+	   const unsigned long * __restrict p2,
+	   const unsigned long * __restrict p3,
+	   const unsigned long * __restrict p4)
+{
+	kernel_neon_begin();
+	xor_block_neon_inner.do_4(bytes, p1, p2, p3, p4);
+	kernel_neon_end();
+}
+
+static void
+xor_neon_5(unsigned long bytes, unsigned long * __restrict p1,
+	   const unsigned long * __restrict p2,
+	   const unsigned long * __restrict p3,
+	   const unsigned long * __restrict p4,
+	   const unsigned long * __restrict p5)
+{
+	kernel_neon_begin();
+	xor_block_neon_inner.do_5(bytes, p1, p2, p3, p4, p5);
+	kernel_neon_end();
+}
+
+struct xor_block_template xor_block_neon = {
+	.name	= "neon",
+	.do_2	= xor_neon_2,
+	.do_3	= xor_neon_3,
+	.do_4	= xor_neon_4,
+	.do_5	= xor_neon_5
+};
diff --git a/arch/arm/lib/xor-neon.c b/lib/raid/xor/arm/xor-neon.c
similarity index 74%
rename from arch/arm/lib/xor-neon.c
rename to lib/raid/xor/arm/xor-neon.c
index b5be50567991bc..c9d4378b0f0ec2 100644
--- a/arch/arm/lib/xor-neon.c
+++ b/lib/raid/xor/arm/xor-neon.c
@@ -1,16 +1,9 @@
 // SPDX-License-Identifier: GPL-2.0-only
 /*
- * linux/arch/arm/lib/xor-neon.c
- *
  * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org>
  */
 
-#include <linux/raid/xor.h>
 #include <linux/raid/xor_impl.h>
-#include <linux/module.h>
-
-MODULE_DESCRIPTION("NEON accelerated XOR implementation");
-MODULE_LICENSE("GPL");
 
 #ifndef __ARM_NEON__
 #error You should compile this file with '-march=armv7-a -mfloat-abi=softfp -mfpu=neon'
@@ -27,7 +20,7 @@ MODULE_LICENSE("GPL");
 #endif
 
 #define NO_TEMPLATE
-#include "../../../lib/raid/xor/xor-8regs.c"
+#include "../xor-8regs.c"
 
 struct xor_block_template const xor_block_neon_inner = {
 	.name	= "__inner_neon__",
@@ -36,4 +29,3 @@ struct xor_block_template const xor_block_neon_inner = {
 	.do_4	= xor_8regs_4,
 	.do_5	= xor_8regs_5,
 };
-EXPORT_SYMBOL(xor_block_neon_inner);
diff --git a/lib/raid/xor/arm/xor.c b/lib/raid/xor/arm/xor.c
new file mode 100644
index 00000000000000..2263341dbbcdf9
--- /dev/null
+++ b/lib/raid/xor/arm/xor.c
@@ -0,0 +1,136 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ *  Copyright (C) 2001 Russell King
+ */
+#include <linux/raid/xor_impl.h>
+#include <asm/xor.h>
+
+#define __XOR(a1, a2) a1 ^= a2
+
+#define GET_BLOCK_2(dst) \
+	__asm__("ldmia	%0, {%1, %2}" \
+		: "=r" (dst), "=r" (a1), "=r" (a2) \
+		: "0" (dst))
+
+#define GET_BLOCK_4(dst) \
+	__asm__("ldmia	%0, {%1, %2, %3, %4}" \
+		: "=r" (dst), "=r" (a1), "=r" (a2), "=r" (a3), "=r" (a4) \
+		: "0" (dst))
+
+#define XOR_BLOCK_2(src) \
+	__asm__("ldmia	%0!, {%1, %2}" \
+		: "=r" (src), "=r" (b1), "=r" (b2) \
+		: "0" (src)); \
+	__XOR(a1, b1); __XOR(a2, b2);
+
+#define XOR_BLOCK_4(src) \
+	__asm__("ldmia	%0!, {%1, %2, %3, %4}" \
+		: "=r" (src), "=r" (b1), "=r" (b2), "=r" (b3), "=r" (b4) \
+		: "0" (src)); \
+	__XOR(a1, b1); __XOR(a2, b2); __XOR(a3, b3); __XOR(a4, b4)
+
+#define PUT_BLOCK_2(dst) \
+	__asm__ __volatile__("stmia	%0!, {%2, %3}" \
+		: "=r" (dst) \
+		: "0" (dst), "r" (a1), "r" (a2))
+
+#define PUT_BLOCK_4(dst) \
+	__asm__ __volatile__("stmia	%0!, {%2, %3, %4, %5}" \
+		: "=r" (dst) \
+		: "0" (dst), "r" (a1), "r" (a2), "r" (a3), "r" (a4))
+
+static void
+xor_arm4regs_2(unsigned long bytes, unsigned long * __restrict p1,
+	       const unsigned long * __restrict p2)
+{
+	unsigned int lines = bytes / sizeof(unsigned long) / 4;
+	register unsigned int a1 __asm__("r4");
+	register unsigned int a2 __asm__("r5");
+	register unsigned int a3 __asm__("r6");
+	register unsigned int a4 __asm__("r10");
+	register unsigned int b1 __asm__("r8");
+	register unsigned int b2 __asm__("r9");
+	register unsigned int b3 __asm__("ip");
+	register unsigned int b4 __asm__("lr");
+
+	do {
+		GET_BLOCK_4(p1);
+		XOR_BLOCK_4(p2);
+		PUT_BLOCK_4(p1);
+	} while (--lines);
+}
+
+static void
+xor_arm4regs_3(unsigned long bytes, unsigned long * __restrict p1,
+	       const unsigned long * __restrict p2,
+	       const unsigned long * __restrict p3)
+{
+	unsigned int lines = bytes / sizeof(unsigned long) / 4;
+	register unsigned int a1 __asm__("r4");
+	register unsigned int a2 __asm__("r5");
+	register unsigned int a3 __asm__("r6");
+	register unsigned int a4 __asm__("r10");
+	register unsigned int b1 __asm__("r8");
+	register unsigned int b2 __asm__("r9");
+	register unsigned int b3 __asm__("ip");
+	register unsigned int b4 __asm__("lr");
+
+	do {
+		GET_BLOCK_4(p1);
+		XOR_BLOCK_4(p2);
+		XOR_BLOCK_4(p3);
+		PUT_BLOCK_4(p1);
+	} while (--lines);
+}
+
+static void
+xor_arm4regs_4(unsigned long bytes, unsigned long * __restrict p1,
+	       const unsigned long * __restrict p2,
+	       const unsigned long * __restrict p3,
+	       const unsigned long * __restrict p4)
+{
+	unsigned int lines = bytes / sizeof(unsigned long) / 2;
+	register unsigned int a1 __asm__("r8");
+	register unsigned int a2 __asm__("r9");
+	register unsigned int b1 __asm__("ip");
+	register unsigned int b2 __asm__("lr");
+
+	do {
+		GET_BLOCK_2(p1);
+		XOR_BLOCK_2(p2);
+		XOR_BLOCK_2(p3);
+		XOR_BLOCK_2(p4);
+		PUT_BLOCK_2(p1);
+	} while (--lines);
+}
+
+static void
+xor_arm4regs_5(unsigned long bytes, unsigned long * __restrict p1,
+	       const unsigned long * __restrict p2,
+	       const unsigned long * __restrict p3,
+	       const unsigned long * __restrict p4,
+	       const unsigned long * __restrict p5)
+{
+	unsigned int lines = bytes / sizeof(unsigned long) / 2;
+	register unsigned int a1 __asm__("r8");
+	register unsigned int a2 __asm__("r9");
+	register unsigned int b1 __asm__("ip");
+	register unsigned int b2 __asm__("lr");
+
+	do {
+		GET_BLOCK_2(p1);
+		XOR_BLOCK_2(p2);
+		XOR_BLOCK_2(p3);
+		XOR_BLOCK_2(p4);
+		XOR_BLOCK_2(p5);
+		PUT_BLOCK_2(p1);
+	} while (--lines);
+}
+
+struct xor_block_template xor_block_arm4regs = {
+	.name	= "arm4regs",
+	.do_2	= xor_arm4regs_2,
+	.do_3	= xor_arm4regs_3,
+	.do_4	= xor_arm4regs_4,
+	.do_5	= xor_arm4regs_5,
+};

From 561b704f4a5e30f2bffdb1a844bb9516ba882ad6 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 24 Mar 2026 07:21:48 +0100
Subject: [PATCH 12/26] arm64: move the XOR code to lib/raid/

Move the optimized XOR into lib/raid and include it it in the main
xor.ko instead of building a separate module for it.

Note that this drops the CONFIG_KERNEL_MODE_NEON dependency, as that is
always set for arm64.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Linux RISC-V bot <linux.riscv.bot@gmail.com>
---
 arch/arm64/include/asm/xor.h                  | 58 +------------------
 arch/arm64/lib/Makefile                       |  6 --
 lib/raid/xor/Makefile                         |  4 ++
 lib/raid/xor/arm64/xor-neon-glue.c            | 57 ++++++++++++++++++
 .../lib => lib/raid/xor/arm64}/xor-neon.c     | 20 +------
 5 files changed, 67 insertions(+), 78 deletions(-)
 create mode 100644 lib/raid/xor/arm64/xor-neon-glue.c
 rename {arch/arm64/lib => lib/raid/xor/arm64}/xor-neon.c (95%)

diff --git a/arch/arm64/include/asm/xor.h b/arch/arm64/include/asm/xor.h
index bfa6122f55cecd..81718f010761d8 100644
--- a/arch/arm64/include/asm/xor.h
+++ b/arch/arm64/include/asm/xor.h
@@ -1,73 +1,21 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
 /*
- * arch/arm64/include/asm/xor.h
- *
  * Authors: Jackie Liu <liuyun01@kylinos.cn>
  * Copyright (C) 2018,Tianjin KYLIN Information Technology Co., Ltd.
  */
 
-#include <linux/hardirq.h>
 #include <asm-generic/xor.h>
-#include <asm/hwcap.h>
 #include <asm/simd.h>
 
-#ifdef CONFIG_KERNEL_MODE_NEON
-
-extern struct xor_block_template const xor_block_inner_neon;
-
-static void
-xor_neon_2(unsigned long bytes, unsigned long * __restrict p1,
-	   const unsigned long * __restrict p2)
-{
-	scoped_ksimd()
-		xor_block_inner_neon.do_2(bytes, p1, p2);
-}
-
-static void
-xor_neon_3(unsigned long bytes, unsigned long * __restrict p1,
-	   const unsigned long * __restrict p2,
-	   const unsigned long * __restrict p3)
-{
-	scoped_ksimd()
-		xor_block_inner_neon.do_3(bytes, p1, p2, p3);
-}
-
-static void
-xor_neon_4(unsigned long bytes, unsigned long * __restrict p1,
-	   const unsigned long * __restrict p2,
-	   const unsigned long * __restrict p3,
-	   const unsigned long * __restrict p4)
-{
-	scoped_ksimd()
-		xor_block_inner_neon.do_4(bytes, p1, p2, p3, p4);
-}
-
-static void
-xor_neon_5(unsigned long bytes, unsigned long * __restrict p1,
-	   const unsigned long * __restrict p2,
-	   const unsigned long * __restrict p3,
-	   const unsigned long * __restrict p4,
-	   const unsigned long * __restrict p5)
-{
-	scoped_ksimd()
-		xor_block_inner_neon.do_5(bytes, p1, p2, p3, p4, p5);
-}
-
-static struct xor_block_template xor_block_arm64 = {
-	.name   = "arm64_neon",
-	.do_2   = xor_neon_2,
-	.do_3   = xor_neon_3,
-	.do_4   = xor_neon_4,
-	.do_5	= xor_neon_5
-};
+extern struct xor_block_template xor_block_arm64;
+void __init xor_neon_init(void);
 
 #define arch_xor_init arch_xor_init
 static __always_inline void __init arch_xor_init(void)
 {
+	xor_neon_init();
 	xor_register(&xor_block_8regs);
 	xor_register(&xor_block_32regs);
 	if (cpu_has_neon())
 		xor_register(&xor_block_arm64);
 }
-
-#endif /* ! CONFIG_KERNEL_MODE_NEON */
diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile
index 633e5223d944d7..448c917494f305 100644
--- a/arch/arm64/lib/Makefile
+++ b/arch/arm64/lib/Makefile
@@ -5,12 +5,6 @@ lib-y		:= clear_user.o delay.o copy_from_user.o		\
 		   memset.o memcmp.o strcmp.o strncmp.o strlen.o	\
 		   strnlen.o strchr.o strrchr.o tishift.o
 
-ifeq ($(CONFIG_KERNEL_MODE_NEON), y)
-obj-$(CONFIG_XOR_BLOCKS)	+= xor-neon.o
-CFLAGS_xor-neon.o		+= $(CC_FLAGS_FPU)
-CFLAGS_REMOVE_xor-neon.o	+= $(CC_FLAGS_NO_FPU)
-endif
-
 lib-$(CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE) += uaccess_flushcache.o
 
 obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o
diff --git a/lib/raid/xor/Makefile b/lib/raid/xor/Makefile
index fb760edae54b5c..4ab0e7411ff769 100644
--- a/lib/raid/xor/Makefile
+++ b/lib/raid/xor/Makefile
@@ -13,7 +13,11 @@ xor-$(CONFIG_ARM)		+= arm/xor.o
 ifeq ($(CONFIG_ARM),y)
 xor-$(CONFIG_KERNEL_MODE_NEON)	+= arm/xor-neon.o arm/xor-neon-glue.o
 endif
+xor-$(CONFIG_ARM64)		+= arm64/xor-neon.o arm64/xor-neon-glue.o
 
 
 CFLAGS_arm/xor-neon.o		+= $(CC_FLAGS_FPU)
 CFLAGS_REMOVE_arm/xor-neon.o	+= $(CC_FLAGS_NO_FPU)
+
+CFLAGS_arm64/xor-neon.o		+= $(CC_FLAGS_FPU)
+CFLAGS_REMOVE_arm64/xor-neon.o	+= $(CC_FLAGS_NO_FPU)
diff --git a/lib/raid/xor/arm64/xor-neon-glue.c b/lib/raid/xor/arm64/xor-neon-glue.c
new file mode 100644
index 00000000000000..067a2095659a45
--- /dev/null
+++ b/lib/raid/xor/arm64/xor-neon-glue.c
@@ -0,0 +1,57 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Authors: Jackie Liu <liuyun01@kylinos.cn>
+ * Copyright (C) 2018,Tianjin KYLIN Information Technology Co., Ltd.
+ */
+
+#include <linux/raid/xor_impl.h>
+#include <asm/simd.h>
+#include <asm/xor.h>
+
+extern struct xor_block_template const xor_block_inner_neon;
+
+static void
+xor_neon_2(unsigned long bytes, unsigned long * __restrict p1,
+	   const unsigned long * __restrict p2)
+{
+	scoped_ksimd()
+		xor_block_inner_neon.do_2(bytes, p1, p2);
+}
+
+static void
+xor_neon_3(unsigned long bytes, unsigned long * __restrict p1,
+	   const unsigned long * __restrict p2,
+	   const unsigned long * __restrict p3)
+{
+	scoped_ksimd()
+		xor_block_inner_neon.do_3(bytes, p1, p2, p3);
+}
+
+static void
+xor_neon_4(unsigned long bytes, unsigned long * __restrict p1,
+	   const unsigned long * __restrict p2,
+	   const unsigned long * __restrict p3,
+	   const unsigned long * __restrict p4)
+{
+	scoped_ksimd()
+		xor_block_inner_neon.do_4(bytes, p1, p2, p3, p4);
+}
+
+static void
+xor_neon_5(unsigned long bytes, unsigned long * __restrict p1,
+	   const unsigned long * __restrict p2,
+	   const unsigned long * __restrict p3,
+	   const unsigned long * __restrict p4,
+	   const unsigned long * __restrict p5)
+{
+	scoped_ksimd()
+		xor_block_inner_neon.do_5(bytes, p1, p2, p3, p4, p5);
+}
+
+struct xor_block_template xor_block_arm64 = {
+	.name   = "arm64_neon",
+	.do_2   = xor_neon_2,
+	.do_3   = xor_neon_3,
+	.do_4   = xor_neon_4,
+	.do_5	= xor_neon_5
+};
diff --git a/arch/arm64/lib/xor-neon.c b/lib/raid/xor/arm64/xor-neon.c
similarity index 95%
rename from arch/arm64/lib/xor-neon.c
rename to lib/raid/xor/arm64/xor-neon.c
index 351aba92d9324c..8d2d185090db76 100644
--- a/arch/arm64/lib/xor-neon.c
+++ b/lib/raid/xor/arm64/xor-neon.c
@@ -1,15 +1,13 @@
 // SPDX-License-Identifier: GPL-2.0-only
 /*
- * arch/arm64/lib/xor-neon.c
- *
  * Authors: Jackie Liu <liuyun01@kylinos.cn>
  * Copyright (C) 2018,Tianjin KYLIN Information Technology Co., Ltd.
  */
 
-#include <linux/raid/xor.h>
 #include <linux/raid/xor_impl.h>
-#include <linux/module.h>
+#include <linux/cache.h>
 #include <asm/neon-intrinsics.h>
+#include <asm/xor.h>
 
 static void xor_arm64_neon_2(unsigned long bytes, unsigned long * __restrict p1,
 	const unsigned long * __restrict p2)
@@ -180,7 +178,6 @@ struct xor_block_template xor_block_inner_neon __ro_after_init = {
 	.do_4	= xor_arm64_neon_4,
 	.do_5	= xor_arm64_neon_5,
 };
-EXPORT_SYMBOL(xor_block_inner_neon);
 
 static inline uint64x2_t eor3(uint64x2_t p, uint64x2_t q, uint64x2_t r)
 {
@@ -318,22 +315,11 @@ static void xor_arm64_eor3_5(unsigned long bytes,
 	} while (--lines > 0);
 }
 
-static int __init xor_neon_init(void)
+void __init xor_neon_init(void)
 {
 	if (cpu_have_named_feature(SHA3)) {
 		xor_block_inner_neon.do_3 = xor_arm64_eor3_3;
 		xor_block_inner_neon.do_4 = xor_arm64_eor3_4;
 		xor_block_inner_neon.do_5 = xor_arm64_eor3_5;
 	}
-	return 0;
 }
-module_init(xor_neon_init);
-
-static void __exit xor_neon_exit(void)
-{
-}
-module_exit(xor_neon_exit);
-
-MODULE_AUTHOR("Jackie Liu <liuyun01@kylinos.cn>");
-MODULE_DESCRIPTION("ARMv8 XOR Extensions");
-MODULE_LICENSE("GPL");

From 7676c3a731c73b3038728cf87287bcffc0836f98 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 24 Mar 2026 07:21:49 +0100
Subject: [PATCH 13/26] loongarch: move the XOR code to lib/raid/

Move the optimized XOR into lib/raid and include it it in xor.ko
instead of always building it into the main kernel image.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Linux RISC-V bot <linux.riscv.bot@gmail.com>
---
 arch/loongarch/include/asm/xor.h              | 24 ++----------
 arch/loongarch/include/asm/xor_simd.h         | 34 ----------------
 arch/loongarch/lib/Makefile                   |  2 -
 lib/raid/xor/Makefile                         |  2 +
 .../lib => lib/raid/xor/loongarch}/xor_simd.c |  0
 .../lib => lib/raid/xor/loongarch}/xor_simd.h |  0
 .../raid/xor/loongarch}/xor_simd_glue.c       | 39 +++++++++++--------
 .../raid/xor/loongarch}/xor_template.c        |  0
 8 files changed, 27 insertions(+), 74 deletions(-)
 delete mode 100644 arch/loongarch/include/asm/xor_simd.h
 rename {arch/loongarch/lib => lib/raid/xor/loongarch}/xor_simd.c (100%)
 rename {arch/loongarch/lib => lib/raid/xor/loongarch}/xor_simd.h (100%)
 rename {arch/loongarch/lib => lib/raid/xor/loongarch}/xor_simd_glue.c (64%)
 rename {arch/loongarch/lib => lib/raid/xor/loongarch}/xor_template.c (100%)

diff --git a/arch/loongarch/include/asm/xor.h b/arch/loongarch/include/asm/xor.h
index d17c0e3b047f13..7e32f72f8b036d 100644
--- a/arch/loongarch/include/asm/xor.h
+++ b/arch/loongarch/include/asm/xor.h
@@ -6,27 +6,6 @@
 #define _ASM_LOONGARCH_XOR_H
 
 #include <asm/cpu-features.h>
-#include <asm/xor_simd.h>
-
-#ifdef CONFIG_CPU_HAS_LSX
-static struct xor_block_template xor_block_lsx = {
-	.name = "lsx",
-	.do_2 = xor_lsx_2,
-	.do_3 = xor_lsx_3,
-	.do_4 = xor_lsx_4,
-	.do_5 = xor_lsx_5,
-};
-#endif /* CONFIG_CPU_HAS_LSX */
-
-#ifdef CONFIG_CPU_HAS_LASX
-static struct xor_block_template xor_block_lasx = {
-	.name = "lasx",
-	.do_2 = xor_lasx_2,
-	.do_3 = xor_lasx_3,
-	.do_4 = xor_lasx_4,
-	.do_5 = xor_lasx_5,
-};
-#endif /* CONFIG_CPU_HAS_LASX */
 
 /*
  * For grins, also test the generic routines.
@@ -38,6 +17,9 @@ static struct xor_block_template xor_block_lasx = {
  */
 #include <asm-generic/xor.h>
 
+extern struct xor_block_template xor_block_lsx;
+extern struct xor_block_template xor_block_lasx;
+
 #define arch_xor_init arch_xor_init
 static __always_inline void __init arch_xor_init(void)
 {
diff --git a/arch/loongarch/include/asm/xor_simd.h b/arch/loongarch/include/asm/xor_simd.h
deleted file mode 100644
index 471b96332f3817..00000000000000
--- a/arch/loongarch/include/asm/xor_simd.h
+++ /dev/null
@@ -1,34 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * Copyright (C) 2023 WANG Xuerui <git@xen0n.name>
- */
-#ifndef _ASM_LOONGARCH_XOR_SIMD_H
-#define _ASM_LOONGARCH_XOR_SIMD_H
-
-#ifdef CONFIG_CPU_HAS_LSX
-void xor_lsx_2(unsigned long bytes, unsigned long * __restrict p1,
-	       const unsigned long * __restrict p2);
-void xor_lsx_3(unsigned long bytes, unsigned long * __restrict p1,
-	       const unsigned long * __restrict p2, const unsigned long * __restrict p3);
-void xor_lsx_4(unsigned long bytes, unsigned long * __restrict p1,
-	       const unsigned long * __restrict p2, const unsigned long * __restrict p3,
-	       const unsigned long * __restrict p4);
-void xor_lsx_5(unsigned long bytes, unsigned long * __restrict p1,
-	       const unsigned long * __restrict p2, const unsigned long * __restrict p3,
-	       const unsigned long * __restrict p4, const unsigned long * __restrict p5);
-#endif /* CONFIG_CPU_HAS_LSX */
-
-#ifdef CONFIG_CPU_HAS_LASX
-void xor_lasx_2(unsigned long bytes, unsigned long * __restrict p1,
-	        const unsigned long * __restrict p2);
-void xor_lasx_3(unsigned long bytes, unsigned long * __restrict p1,
-	        const unsigned long * __restrict p2, const unsigned long * __restrict p3);
-void xor_lasx_4(unsigned long bytes, unsigned long * __restrict p1,
-	        const unsigned long * __restrict p2, const unsigned long * __restrict p3,
-	        const unsigned long * __restrict p4);
-void xor_lasx_5(unsigned long bytes, unsigned long * __restrict p1,
-	        const unsigned long * __restrict p2, const unsigned long * __restrict p3,
-	        const unsigned long * __restrict p4, const unsigned long * __restrict p5);
-#endif /* CONFIG_CPU_HAS_LASX */
-
-#endif /* _ASM_LOONGARCH_XOR_SIMD_H */
diff --git a/arch/loongarch/lib/Makefile b/arch/loongarch/lib/Makefile
index ccea3bbd435313..827a88529a425c 100644
--- a/arch/loongarch/lib/Makefile
+++ b/arch/loongarch/lib/Makefile
@@ -8,6 +8,4 @@ lib-y	+= delay.o memset.o memcpy.o memmove.o \
 
 obj-$(CONFIG_ARCH_SUPPORTS_INT128) += tishift.o
 
-obj-$(CONFIG_CPU_HAS_LSX) += xor_simd.o xor_simd_glue.o
-
 obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o
diff --git a/lib/raid/xor/Makefile b/lib/raid/xor/Makefile
index 4ab0e7411ff769..e8868f5fc39642 100644
--- a/lib/raid/xor/Makefile
+++ b/lib/raid/xor/Makefile
@@ -14,6 +14,8 @@ ifeq ($(CONFIG_ARM),y)
 xor-$(CONFIG_KERNEL_MODE_NEON)	+= arm/xor-neon.o arm/xor-neon-glue.o
 endif
 xor-$(CONFIG_ARM64)		+= arm64/xor-neon.o arm64/xor-neon-glue.o
+xor-$(CONFIG_CPU_HAS_LSX)	+= loongarch/xor_simd.o
+xor-$(CONFIG_CPU_HAS_LSX)	+= loongarch/xor_simd_glue.o
 
 
 CFLAGS_arm/xor-neon.o		+= $(CC_FLAGS_FPU)
diff --git a/arch/loongarch/lib/xor_simd.c b/lib/raid/xor/loongarch/xor_simd.c
similarity index 100%
rename from arch/loongarch/lib/xor_simd.c
rename to lib/raid/xor/loongarch/xor_simd.c
diff --git a/arch/loongarch/lib/xor_simd.h b/lib/raid/xor/loongarch/xor_simd.h
similarity index 100%
rename from arch/loongarch/lib/xor_simd.h
rename to lib/raid/xor/loongarch/xor_simd.h
diff --git a/arch/loongarch/lib/xor_simd_glue.c b/lib/raid/xor/loongarch/xor_simd_glue.c
similarity index 64%
rename from arch/loongarch/lib/xor_simd_glue.c
rename to lib/raid/xor/loongarch/xor_simd_glue.c
index 393f689dbcf678..11fa3b47ba83a3 100644
--- a/arch/loongarch/lib/xor_simd_glue.c
+++ b/lib/raid/xor/loongarch/xor_simd_glue.c
@@ -5,24 +5,23 @@
  * Copyright (C) 2023 WANG Xuerui <git@xen0n.name>
  */
 
-#include <linux/export.h>
 #include <linux/sched.h>
+#include <linux/raid/xor_impl.h>
 #include <asm/fpu.h>
-#include <asm/xor_simd.h>
+#include <asm/xor.h>
 #include "xor_simd.h"
 
 #define MAKE_XOR_GLUE_2(flavor)							\
-void xor_##flavor##_2(unsigned long bytes, unsigned long * __restrict p1,	\
+static void xor_##flavor##_2(unsigned long bytes, unsigned long * __restrict p1,\
 		      const unsigned long * __restrict p2)			\
 {										\
 	kernel_fpu_begin();							\
 	__xor_##flavor##_2(bytes, p1, p2);					\
 	kernel_fpu_end();							\
 }										\
-EXPORT_SYMBOL_GPL(xor_##flavor##_2)
 
 #define MAKE_XOR_GLUE_3(flavor)							\
-void xor_##flavor##_3(unsigned long bytes, unsigned long * __restrict p1,	\
+static void xor_##flavor##_3(unsigned long bytes, unsigned long * __restrict p1,\
 		      const unsigned long * __restrict p2,			\
 		      const unsigned long * __restrict p3)			\
 {										\
@@ -30,10 +29,9 @@ void xor_##flavor##_3(unsigned long bytes, unsigned long * __restrict p1,	\
 	__xor_##flavor##_3(bytes, p1, p2, p3);					\
 	kernel_fpu_end();							\
 }										\
-EXPORT_SYMBOL_GPL(xor_##flavor##_3)
 
 #define MAKE_XOR_GLUE_4(flavor)							\
-void xor_##flavor##_4(unsigned long bytes, unsigned long * __restrict p1,	\
+static void xor_##flavor##_4(unsigned long bytes, unsigned long * __restrict p1,\
 		      const unsigned long * __restrict p2,			\
 		      const unsigned long * __restrict p3,			\
 		      const unsigned long * __restrict p4)			\
@@ -42,10 +40,9 @@ void xor_##flavor##_4(unsigned long bytes, unsigned long * __restrict p1,	\
 	__xor_##flavor##_4(bytes, p1, p2, p3, p4);				\
 	kernel_fpu_end();							\
 }										\
-EXPORT_SYMBOL_GPL(xor_##flavor##_4)
 
 #define MAKE_XOR_GLUE_5(flavor)							\
-void xor_##flavor##_5(unsigned long bytes, unsigned long * __restrict p1,	\
+static void xor_##flavor##_5(unsigned long bytes, unsigned long * __restrict p1,\
 		      const unsigned long * __restrict p2,			\
 		      const unsigned long * __restrict p3,			\
 		      const unsigned long * __restrict p4,			\
@@ -55,18 +52,26 @@ void xor_##flavor##_5(unsigned long bytes, unsigned long * __restrict p1,	\
 	__xor_##flavor##_5(bytes, p1, p2, p3, p4, p5);				\
 	kernel_fpu_end();							\
 }										\
-EXPORT_SYMBOL_GPL(xor_##flavor##_5)
 
-#define MAKE_XOR_GLUES(flavor)		\
-	MAKE_XOR_GLUE_2(flavor);	\
-	MAKE_XOR_GLUE_3(flavor);	\
-	MAKE_XOR_GLUE_4(flavor);	\
-	MAKE_XOR_GLUE_5(flavor)
+#define MAKE_XOR_GLUES(flavor)				\
+	MAKE_XOR_GLUE_2(flavor);			\
+	MAKE_XOR_GLUE_3(flavor);			\
+	MAKE_XOR_GLUE_4(flavor);			\
+	MAKE_XOR_GLUE_5(flavor);			\
+							\
+struct xor_block_template xor_block_##flavor = {	\
+	.name = __stringify(flavor),			\
+	.do_2 = xor_##flavor##_2,			\
+	.do_3 = xor_##flavor##_3,			\
+	.do_4 = xor_##flavor##_4,			\
+	.do_5 = xor_##flavor##_5,			\
+}
+
 
 #ifdef CONFIG_CPU_HAS_LSX
 MAKE_XOR_GLUES(lsx);
-#endif
+#endif /* CONFIG_CPU_HAS_LSX */
 
 #ifdef CONFIG_CPU_HAS_LASX
 MAKE_XOR_GLUES(lasx);
-#endif
+#endif /* CONFIG_CPU_HAS_LASX */
diff --git a/arch/loongarch/lib/xor_template.c b/lib/raid/xor/loongarch/xor_template.c
similarity index 100%
rename from arch/loongarch/lib/xor_template.c
rename to lib/raid/xor/loongarch/xor_template.c

From cece8b2ed4c7dad5db4b6f9680172b47465d63f1 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 24 Mar 2026 07:21:50 +0100
Subject: [PATCH 14/26] powerpc: move the XOR code to lib/raid/

Move the optimized XOR into lib/raid and include it it in xor.ko
instead of always building it into the main kernel image.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Linux RISC-V bot <linux.riscv.bot@gmail.com>
---
 arch/powerpc/include/asm/xor.h                | 17 +----
 arch/powerpc/include/asm/xor_altivec.h        | 22 ------
 arch/powerpc/lib/Makefile                     |  5 --
 arch/powerpc/lib/xor_vmx_glue.c               | 63 -----------------
 lib/raid/xor/Makefile                         |  5 ++
 .../lib => lib/raid/xor/powerpc}/xor_vmx.c    |  0
 .../lib => lib/raid/xor/powerpc}/xor_vmx.h    |  0
 lib/raid/xor/powerpc/xor_vmx_glue.c           | 67 +++++++++++++++++++
 8 files changed, 74 insertions(+), 105 deletions(-)
 delete mode 100644 arch/powerpc/include/asm/xor_altivec.h
 delete mode 100644 arch/powerpc/lib/xor_vmx_glue.c
 rename {arch/powerpc/lib => lib/raid/xor/powerpc}/xor_vmx.c (100%)
 rename {arch/powerpc/lib => lib/raid/xor/powerpc}/xor_vmx.h (100%)
 create mode 100644 lib/raid/xor/powerpc/xor_vmx_glue.c

diff --git a/arch/powerpc/include/asm/xor.h b/arch/powerpc/include/asm/xor.h
index 30224c5279c4bf..3293ac87181c33 100644
--- a/arch/powerpc/include/asm/xor.h
+++ b/arch/powerpc/include/asm/xor.h
@@ -8,24 +8,11 @@
 #ifndef _ASM_POWERPC_XOR_H
 #define _ASM_POWERPC_XOR_H
 
-#ifdef CONFIG_ALTIVEC
-
-#include <asm/cputable.h>
 #include <asm/cpu_has_feature.h>
-#include <asm/xor_altivec.h>
-
-static struct xor_block_template xor_block_altivec = {
-	.name = "altivec",
-	.do_2 = xor_altivec_2,
-	.do_3 = xor_altivec_3,
-	.do_4 = xor_altivec_4,
-	.do_5 = xor_altivec_5,
-};
-#endif /* CONFIG_ALTIVEC */
-
-/* Also try the generic routines. */
 #include <asm-generic/xor.h>
 
+extern struct xor_block_template xor_block_altivec;
+
 #define arch_xor_init arch_xor_init
 static __always_inline void __init arch_xor_init(void)
 {
diff --git a/arch/powerpc/include/asm/xor_altivec.h b/arch/powerpc/include/asm/xor_altivec.h
deleted file mode 100644
index 294620a25f8025..00000000000000
--- a/arch/powerpc/include/asm/xor_altivec.h
+++ /dev/null
@@ -1,22 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_POWERPC_XOR_ALTIVEC_H
-#define _ASM_POWERPC_XOR_ALTIVEC_H
-
-#ifdef CONFIG_ALTIVEC
-void xor_altivec_2(unsigned long bytes, unsigned long * __restrict p1,
-		   const unsigned long * __restrict p2);
-void xor_altivec_3(unsigned long bytes, unsigned long * __restrict p1,
-		   const unsigned long * __restrict p2,
-		   const unsigned long * __restrict p3);
-void xor_altivec_4(unsigned long bytes, unsigned long * __restrict p1,
-		   const unsigned long * __restrict p2,
-		   const unsigned long * __restrict p3,
-		   const unsigned long * __restrict p4);
-void xor_altivec_5(unsigned long bytes, unsigned long * __restrict p1,
-		   const unsigned long * __restrict p2,
-		   const unsigned long * __restrict p3,
-		   const unsigned long * __restrict p4,
-		   const unsigned long * __restrict p5);
-
-#endif
-#endif /* _ASM_POWERPC_XOR_ALTIVEC_H */
diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile
index f14ecab674a340..002edc3f01d5c5 100644
--- a/arch/powerpc/lib/Makefile
+++ b/arch/powerpc/lib/Makefile
@@ -73,9 +73,4 @@ obj-$(CONFIG_PPC_LIB_RHEAP) += rheap.o
 
 obj-$(CONFIG_FTR_FIXUP_SELFTEST) += feature-fixups-test.o
 
-obj-$(CONFIG_ALTIVEC)	+= xor_vmx.o xor_vmx_glue.o
-CFLAGS_xor_vmx.o += -mhard-float -maltivec $(call cc-option,-mabi=altivec)
-# Enable <altivec.h>
-CFLAGS_xor_vmx.o += -isystem $(shell $(CC) -print-file-name=include)
-
 obj-$(CONFIG_PPC64) += $(obj64-y)
diff --git a/arch/powerpc/lib/xor_vmx_glue.c b/arch/powerpc/lib/xor_vmx_glue.c
deleted file mode 100644
index 35d917ece4d1e4..00000000000000
--- a/arch/powerpc/lib/xor_vmx_glue.c
+++ /dev/null
@@ -1,63 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * Altivec XOR operations
- *
- * Copyright 2017 IBM Corp.
- */
-
-#include <linux/preempt.h>
-#include <linux/export.h>
-#include <linux/sched.h>
-#include <asm/switch_to.h>
-#include <asm/xor_altivec.h>
-#include "xor_vmx.h"
-
-void xor_altivec_2(unsigned long bytes, unsigned long * __restrict p1,
-		   const unsigned long * __restrict p2)
-{
-	preempt_disable();
-	enable_kernel_altivec();
-	__xor_altivec_2(bytes, p1, p2);
-	disable_kernel_altivec();
-	preempt_enable();
-}
-EXPORT_SYMBOL(xor_altivec_2);
-
-void xor_altivec_3(unsigned long bytes, unsigned long * __restrict p1,
-		   const unsigned long * __restrict p2,
-		   const unsigned long * __restrict p3)
-{
-	preempt_disable();
-	enable_kernel_altivec();
-	__xor_altivec_3(bytes, p1, p2, p3);
-	disable_kernel_altivec();
-	preempt_enable();
-}
-EXPORT_SYMBOL(xor_altivec_3);
-
-void xor_altivec_4(unsigned long bytes, unsigned long * __restrict p1,
-		   const unsigned long * __restrict p2,
-		   const unsigned long * __restrict p3,
-		   const unsigned long * __restrict p4)
-{
-	preempt_disable();
-	enable_kernel_altivec();
-	__xor_altivec_4(bytes, p1, p2, p3, p4);
-	disable_kernel_altivec();
-	preempt_enable();
-}
-EXPORT_SYMBOL(xor_altivec_4);
-
-void xor_altivec_5(unsigned long bytes, unsigned long * __restrict p1,
-		   const unsigned long * __restrict p2,
-		   const unsigned long * __restrict p3,
-		   const unsigned long * __restrict p4,
-		   const unsigned long * __restrict p5)
-{
-	preempt_disable();
-	enable_kernel_altivec();
-	__xor_altivec_5(bytes, p1, p2, p3, p4, p5);
-	disable_kernel_altivec();
-	preempt_enable();
-}
-EXPORT_SYMBOL(xor_altivec_5);
diff --git a/lib/raid/xor/Makefile b/lib/raid/xor/Makefile
index e8868f5fc39642..006b44ce46bf56 100644
--- a/lib/raid/xor/Makefile
+++ b/lib/raid/xor/Makefile
@@ -16,6 +16,7 @@ endif
 xor-$(CONFIG_ARM64)		+= arm64/xor-neon.o arm64/xor-neon-glue.o
 xor-$(CONFIG_CPU_HAS_LSX)	+= loongarch/xor_simd.o
 xor-$(CONFIG_CPU_HAS_LSX)	+= loongarch/xor_simd_glue.o
+xor-$(CONFIG_ALTIVEC)		+= powerpc/xor_vmx.o powerpc/xor_vmx_glue.o
 
 
 CFLAGS_arm/xor-neon.o		+= $(CC_FLAGS_FPU)
@@ -23,3 +24,7 @@ CFLAGS_REMOVE_arm/xor-neon.o	+= $(CC_FLAGS_NO_FPU)
 
 CFLAGS_arm64/xor-neon.o		+= $(CC_FLAGS_FPU)
 CFLAGS_REMOVE_arm64/xor-neon.o	+= $(CC_FLAGS_NO_FPU)
+
+CFLAGS_powerpc/xor_vmx.o	+= -mhard-float -maltivec \
+				   $(call cc-option,-mabi=altivec) \
+				   -isystem $(shell $(CC) -print-file-name=include)
diff --git a/arch/powerpc/lib/xor_vmx.c b/lib/raid/xor/powerpc/xor_vmx.c
similarity index 100%
rename from arch/powerpc/lib/xor_vmx.c
rename to lib/raid/xor/powerpc/xor_vmx.c
diff --git a/arch/powerpc/lib/xor_vmx.h b/lib/raid/xor/powerpc/xor_vmx.h
similarity index 100%
rename from arch/powerpc/lib/xor_vmx.h
rename to lib/raid/xor/powerpc/xor_vmx.h
diff --git a/lib/raid/xor/powerpc/xor_vmx_glue.c b/lib/raid/xor/powerpc/xor_vmx_glue.c
new file mode 100644
index 00000000000000..c41e3834070042
--- /dev/null
+++ b/lib/raid/xor/powerpc/xor_vmx_glue.c
@@ -0,0 +1,67 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Altivec XOR operations
+ *
+ * Copyright 2017 IBM Corp.
+ */
+
+#include <linux/preempt.h>
+#include <linux/sched.h>
+#include <linux/raid/xor_impl.h>
+#include <asm/switch_to.h>
+#include <asm/xor.h>
+#include "xor_vmx.h"
+
+static void xor_altivec_2(unsigned long bytes, unsigned long * __restrict p1,
+		const unsigned long * __restrict p2)
+{
+	preempt_disable();
+	enable_kernel_altivec();
+	__xor_altivec_2(bytes, p1, p2);
+	disable_kernel_altivec();
+	preempt_enable();
+}
+
+static void xor_altivec_3(unsigned long bytes, unsigned long * __restrict p1,
+		const unsigned long * __restrict p2,
+		const unsigned long * __restrict p3)
+{
+	preempt_disable();
+	enable_kernel_altivec();
+	__xor_altivec_3(bytes, p1, p2, p3);
+	disable_kernel_altivec();
+	preempt_enable();
+}
+
+static void xor_altivec_4(unsigned long bytes, unsigned long * __restrict p1,
+		const unsigned long * __restrict p2,
+		const unsigned long * __restrict p3,
+		const unsigned long * __restrict p4)
+{
+	preempt_disable();
+	enable_kernel_altivec();
+	__xor_altivec_4(bytes, p1, p2, p3, p4);
+	disable_kernel_altivec();
+	preempt_enable();
+}
+
+static void xor_altivec_5(unsigned long bytes, unsigned long * __restrict p1,
+		const unsigned long * __restrict p2,
+		const unsigned long * __restrict p3,
+		const unsigned long * __restrict p4,
+		const unsigned long * __restrict p5)
+{
+	preempt_disable();
+	enable_kernel_altivec();
+	__xor_altivec_5(bytes, p1, p2, p3, p4, p5);
+	disable_kernel_altivec();
+	preempt_enable();
+}
+
+struct xor_block_template xor_block_altivec = {
+	.name = "altivec",
+	.do_2 = xor_altivec_2,
+	.do_3 = xor_altivec_3,
+	.do_4 = xor_altivec_4,
+	.do_5 = xor_altivec_5,
+};

From 1f260a34af6af0c656177fceb5acd5c01173b89e Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 24 Mar 2026 07:21:51 +0100
Subject: [PATCH 15/26] riscv: move the XOR code to lib/raid/

Move the optimized XOR into lib/raid and include it it in xor.ko
instead of always building it into the main kernel image.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Linux RISC-V bot <linux.riscv.bot@gmail.com>
---
 arch/riscv/include/asm/xor.h                 | 54 +------------------
 arch/riscv/lib/Makefile                      |  1 -
 lib/raid/xor/Makefile                        |  1 +
 lib/raid/xor/riscv/xor-glue.c                | 56 ++++++++++++++++++++
 {arch/riscv/lib => lib/raid/xor/riscv}/xor.S |  4 --
 5 files changed, 59 insertions(+), 57 deletions(-)
 create mode 100644 lib/raid/xor/riscv/xor-glue.c
 rename {arch/riscv/lib => lib/raid/xor/riscv}/xor.S (92%)

diff --git a/arch/riscv/include/asm/xor.h b/arch/riscv/include/asm/xor.h
index ed5f27903efc41..614d9209d07823 100644
--- a/arch/riscv/include/asm/xor.h
+++ b/arch/riscv/include/asm/xor.h
@@ -2,60 +2,10 @@
 /*
  * Copyright (C) 2021 SiFive
  */
-
-#include <linux/hardirq.h>
-#include <asm-generic/xor.h>
-#ifdef CONFIG_RISCV_ISA_V
 #include <asm/vector.h>
-#include <asm/switch_to.h>
-#include <asm/asm-prototypes.h>
-
-static void xor_vector_2(unsigned long bytes, unsigned long *__restrict p1,
-			 const unsigned long *__restrict p2)
-{
-	kernel_vector_begin();
-	xor_regs_2_(bytes, p1, p2);
-	kernel_vector_end();
-}
-
-static void xor_vector_3(unsigned long bytes, unsigned long *__restrict p1,
-			 const unsigned long *__restrict p2,
-			 const unsigned long *__restrict p3)
-{
-	kernel_vector_begin();
-	xor_regs_3_(bytes, p1, p2, p3);
-	kernel_vector_end();
-}
-
-static void xor_vector_4(unsigned long bytes, unsigned long *__restrict p1,
-			 const unsigned long *__restrict p2,
-			 const unsigned long *__restrict p3,
-			 const unsigned long *__restrict p4)
-{
-	kernel_vector_begin();
-	xor_regs_4_(bytes, p1, p2, p3, p4);
-	kernel_vector_end();
-}
-
-static void xor_vector_5(unsigned long bytes, unsigned long *__restrict p1,
-			 const unsigned long *__restrict p2,
-			 const unsigned long *__restrict p3,
-			 const unsigned long *__restrict p4,
-			 const unsigned long *__restrict p5)
-{
-	kernel_vector_begin();
-	xor_regs_5_(bytes, p1, p2, p3, p4, p5);
-	kernel_vector_end();
-}
+#include <asm-generic/xor.h>
 
-static struct xor_block_template xor_block_rvv = {
-	.name = "rvv",
-	.do_2 = xor_vector_2,
-	.do_3 = xor_vector_3,
-	.do_4 = xor_vector_4,
-	.do_5 = xor_vector_5
-};
-#endif /* CONFIG_RISCV_ISA_V */
+extern struct xor_block_template xor_block_rvv;
 
 #define arch_xor_init arch_xor_init
 static __always_inline void __init arch_xor_init(void)
diff --git a/arch/riscv/lib/Makefile b/arch/riscv/lib/Makefile
index bbc031124974e9..e220c35764ebd6 100644
--- a/arch/riscv/lib/Makefile
+++ b/arch/riscv/lib/Makefile
@@ -16,5 +16,4 @@ lib-$(CONFIG_MMU)	+= uaccess.o
 lib-$(CONFIG_64BIT)	+= tishift.o
 lib-$(CONFIG_RISCV_ISA_ZICBOZ)	+= clear_page.o
 obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o
-lib-$(CONFIG_RISCV_ISA_V)	+= xor.o
 lib-$(CONFIG_RISCV_ISA_V)	+= riscv_v_helpers.o
diff --git a/lib/raid/xor/Makefile b/lib/raid/xor/Makefile
index 006b44ce46bf56..9e729b50e775e2 100644
--- a/lib/raid/xor/Makefile
+++ b/lib/raid/xor/Makefile
@@ -17,6 +17,7 @@ xor-$(CONFIG_ARM64)		+= arm64/xor-neon.o arm64/xor-neon-glue.o
 xor-$(CONFIG_CPU_HAS_LSX)	+= loongarch/xor_simd.o
 xor-$(CONFIG_CPU_HAS_LSX)	+= loongarch/xor_simd_glue.o
 xor-$(CONFIG_ALTIVEC)		+= powerpc/xor_vmx.o powerpc/xor_vmx_glue.o
+xor-$(CONFIG_RISCV_ISA_V)	+= riscv/xor.o riscv/xor-glue.o
 
 
 CFLAGS_arm/xor-neon.o		+= $(CC_FLAGS_FPU)
diff --git a/lib/raid/xor/riscv/xor-glue.c b/lib/raid/xor/riscv/xor-glue.c
new file mode 100644
index 00000000000000..11666a4b6b6872
--- /dev/null
+++ b/lib/raid/xor/riscv/xor-glue.c
@@ -0,0 +1,56 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2021 SiFive
+ */
+
+#include <linux/raid/xor_impl.h>
+#include <asm/vector.h>
+#include <asm/switch_to.h>
+#include <asm/asm-prototypes.h>
+#include <asm/xor.h>
+
+static void xor_vector_2(unsigned long bytes, unsigned long *__restrict p1,
+			 const unsigned long *__restrict p2)
+{
+	kernel_vector_begin();
+	xor_regs_2_(bytes, p1, p2);
+	kernel_vector_end();
+}
+
+static void xor_vector_3(unsigned long bytes, unsigned long *__restrict p1,
+			 const unsigned long *__restrict p2,
+			 const unsigned long *__restrict p3)
+{
+	kernel_vector_begin();
+	xor_regs_3_(bytes, p1, p2, p3);
+	kernel_vector_end();
+}
+
+static void xor_vector_4(unsigned long bytes, unsigned long *__restrict p1,
+			 const unsigned long *__restrict p2,
+			 const unsigned long *__restrict p3,
+			 const unsigned long *__restrict p4)
+{
+	kernel_vector_begin();
+	xor_regs_4_(bytes, p1, p2, p3, p4);
+	kernel_vector_end();
+}
+
+static void xor_vector_5(unsigned long bytes, unsigned long *__restrict p1,
+			 const unsigned long *__restrict p2,
+			 const unsigned long *__restrict p3,
+			 const unsigned long *__restrict p4,
+			 const unsigned long *__restrict p5)
+{
+	kernel_vector_begin();
+	xor_regs_5_(bytes, p1, p2, p3, p4, p5);
+	kernel_vector_end();
+}
+
+struct xor_block_template xor_block_rvv = {
+	.name = "rvv",
+	.do_2 = xor_vector_2,
+	.do_3 = xor_vector_3,
+	.do_4 = xor_vector_4,
+	.do_5 = xor_vector_5
+};
diff --git a/arch/riscv/lib/xor.S b/lib/raid/xor/riscv/xor.S
similarity index 92%
rename from arch/riscv/lib/xor.S
rename to lib/raid/xor/riscv/xor.S
index b28f2430e52fa5..56fb7fc1e2cd8d 100644
--- a/arch/riscv/lib/xor.S
+++ b/lib/raid/xor/riscv/xor.S
@@ -18,7 +18,6 @@ SYM_FUNC_START(xor_regs_2_)
 	bnez a0, xor_regs_2_
 	ret
 SYM_FUNC_END(xor_regs_2_)
-EXPORT_SYMBOL(xor_regs_2_)
 
 SYM_FUNC_START(xor_regs_3_)
 	vsetvli a4, a0, e8, m8, ta, ma
@@ -35,7 +34,6 @@ SYM_FUNC_START(xor_regs_3_)
 	bnez a0, xor_regs_3_
 	ret
 SYM_FUNC_END(xor_regs_3_)
-EXPORT_SYMBOL(xor_regs_3_)
 
 SYM_FUNC_START(xor_regs_4_)
 	vsetvli a5, a0, e8, m8, ta, ma
@@ -55,7 +53,6 @@ SYM_FUNC_START(xor_regs_4_)
 	bnez a0, xor_regs_4_
 	ret
 SYM_FUNC_END(xor_regs_4_)
-EXPORT_SYMBOL(xor_regs_4_)
 
 SYM_FUNC_START(xor_regs_5_)
 	vsetvli a6, a0, e8, m8, ta, ma
@@ -78,4 +75,3 @@ SYM_FUNC_START(xor_regs_5_)
 	bnez a0, xor_regs_5_
 	ret
 SYM_FUNC_END(xor_regs_5_)
-EXPORT_SYMBOL(xor_regs_5_)

From ec8d2c65658c7c23f019578aec6b1cb5e866fb52 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 24 Mar 2026 07:21:52 +0100
Subject: [PATCH 16/26] sparc: move the XOR code to lib/raid/

Move the optimized XOR into lib/raid and include it it in xor.ko
instead of always building it into the main kernel image.

The code should probably be split into separate files for the two
implementations, but for now this just does the trivial move.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Linux RISC-V bot <linux.riscv.bot@gmail.com>
---
 arch/sparc/include/asm/asm-prototypes.h       |  1 -
 arch/sparc/include/asm/xor.h                  | 45 ++++++++++++++++---
 arch/sparc/lib/Makefile                       |  2 +-
 lib/raid/xor/Makefile                         |  2 +
 .../raid/xor/sparc/xor-sparc32.c              | 23 ++--------
 .../raid/xor/sparc/xor-sparc64-glue.c         | 26 +++--------
 .../xor.S => lib/raid/xor/sparc/xor-sparc64.S | 10 -----
 7 files changed, 52 insertions(+), 57 deletions(-)
 rename arch/sparc/include/asm/xor_32.h => lib/raid/xor/sparc/xor-sparc32.c (93%)
 rename arch/sparc/include/asm/xor_64.h => lib/raid/xor/sparc/xor-sparc64-glue.c (74%)
 rename arch/sparc/lib/xor.S => lib/raid/xor/sparc/xor-sparc64.S (98%)

diff --git a/arch/sparc/include/asm/asm-prototypes.h b/arch/sparc/include/asm/asm-prototypes.h
index 08810808ca6dec..bbd1a8afaabf8b 100644
--- a/arch/sparc/include/asm/asm-prototypes.h
+++ b/arch/sparc/include/asm/asm-prototypes.h
@@ -14,7 +14,6 @@
 #include <asm/oplib.h>
 #include <asm/pgtable.h>
 #include <asm/trap_block.h>
-#include <asm/xor.h>
 
 void *__memscan_zero(void *, size_t);
 void *__memscan_generic(void *, int, size_t);
diff --git a/arch/sparc/include/asm/xor.h b/arch/sparc/include/asm/xor.h
index f4c651e203c438..f923b009fc24fa 100644
--- a/arch/sparc/include/asm/xor.h
+++ b/arch/sparc/include/asm/xor.h
@@ -1,9 +1,44 @@
 /* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 1997, 1999 Jakub Jelinek (jj@ultra.linux.cz)
+ * Copyright (C) 2006 David S. Miller <davem@davemloft.net>
+ */
 #ifndef ___ASM_SPARC_XOR_H
 #define ___ASM_SPARC_XOR_H
+
 #if defined(__sparc__) && defined(__arch64__)
-#include <asm/xor_64.h>
-#else
-#include <asm/xor_32.h>
-#endif
-#endif
+#include <asm/spitfire.h>
+
+extern struct xor_block_template xor_block_VIS;
+extern struct xor_block_template xor_block_niagara;
+
+#define arch_xor_init arch_xor_init
+static __always_inline void __init arch_xor_init(void)
+{
+	/* Force VIS for everything except Niagara.  */
+	if (tlb_type == hypervisor &&
+	    (sun4v_chip_type == SUN4V_CHIP_NIAGARA1 ||
+	     sun4v_chip_type == SUN4V_CHIP_NIAGARA2 ||
+	     sun4v_chip_type == SUN4V_CHIP_NIAGARA3 ||
+	     sun4v_chip_type == SUN4V_CHIP_NIAGARA4 ||
+	     sun4v_chip_type == SUN4V_CHIP_NIAGARA5))
+		xor_force(&xor_block_niagara);
+	else
+		xor_force(&xor_block_VIS);
+}
+#else /* sparc64 */
+
+/* For grins, also test the generic routines.  */
+#include <asm-generic/xor.h>
+
+extern struct xor_block_template xor_block_SPARC;
+
+#define arch_xor_init arch_xor_init
+static __always_inline void __init arch_xor_init(void)
+{
+	xor_register(&xor_block_8regs);
+	xor_register(&xor_block_32regs);
+	xor_register(&xor_block_SPARC);
+}
+#endif /* !sparc64 */
+#endif /* ___ASM_SPARC_XOR_H */
diff --git a/arch/sparc/lib/Makefile b/arch/sparc/lib/Makefile
index 783bdec0d7be05..dd10cdd6f0623d 100644
--- a/arch/sparc/lib/Makefile
+++ b/arch/sparc/lib/Makefile
@@ -48,7 +48,7 @@ lib-$(CONFIG_SPARC64) += GENmemcpy.o GENcopy_from_user.o GENcopy_to_user.o
 lib-$(CONFIG_SPARC64) += GENpatch.o GENpage.o GENbzero.o
 
 lib-$(CONFIG_SPARC64) += copy_in_user.o memmove.o
-lib-$(CONFIG_SPARC64) += mcount.o ipcsum.o xor.o hweight.o ffs.o
+lib-$(CONFIG_SPARC64) += mcount.o ipcsum.o hweight.o ffs.o
 
 obj-$(CONFIG_SPARC64) += iomap.o
 obj-$(CONFIG_SPARC32) += atomic32.o
diff --git a/lib/raid/xor/Makefile b/lib/raid/xor/Makefile
index 9e729b50e775e2..3a7c887d08ee9f 100644
--- a/lib/raid/xor/Makefile
+++ b/lib/raid/xor/Makefile
@@ -18,6 +18,8 @@ xor-$(CONFIG_CPU_HAS_LSX)	+= loongarch/xor_simd.o
 xor-$(CONFIG_CPU_HAS_LSX)	+= loongarch/xor_simd_glue.o
 xor-$(CONFIG_ALTIVEC)		+= powerpc/xor_vmx.o powerpc/xor_vmx_glue.o
 xor-$(CONFIG_RISCV_ISA_V)	+= riscv/xor.o riscv/xor-glue.o
+xor-$(CONFIG_SPARC32)		+= sparc/xor-sparc32.o
+xor-$(CONFIG_SPARC64)		+= sparc/xor-sparc64.o sparc/xor-sparc64-glue.o
 
 
 CFLAGS_arm/xor-neon.o		+= $(CC_FLAGS_FPU)
diff --git a/arch/sparc/include/asm/xor_32.h b/lib/raid/xor/sparc/xor-sparc32.c
similarity index 93%
rename from arch/sparc/include/asm/xor_32.h
rename to lib/raid/xor/sparc/xor-sparc32.c
index 8fbf0c07ec2897..b65a75a6e59d8f 100644
--- a/arch/sparc/include/asm/xor_32.h
+++ b/lib/raid/xor/sparc/xor-sparc32.c
@@ -1,16 +1,12 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * include/asm/xor.h
- *
- * Optimized RAID-5 checksumming functions for 32-bit Sparc.
- */
-
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
  * High speed xor_block operation for RAID4/5 utilizing the
  * ldd/std SPARC instructions.
  *
  * Copyright (C) 1999 Jakub Jelinek (jj@ultra.linux.cz)
  */
+#include <linux/raid/xor_impl.h>
+#include <asm/xor.h>
 
 static void
 sparc_2(unsigned long bytes, unsigned long * __restrict p1,
@@ -248,21 +244,10 @@ sparc_5(unsigned long bytes, unsigned long * __restrict p1,
 	} while (--lines > 0);
 }
 
-static struct xor_block_template xor_block_SPARC = {
+struct xor_block_template xor_block_SPARC = {
 	.name	= "SPARC",
 	.do_2	= sparc_2,
 	.do_3	= sparc_3,
 	.do_4	= sparc_4,
 	.do_5	= sparc_5,
 };
-
-/* For grins, also test the generic routines.  */
-#include <asm-generic/xor.h>
-
-#define arch_xor_init arch_xor_init
-static __always_inline void __init arch_xor_init(void)
-{
-	xor_register(&xor_block_8regs);
-	xor_register(&xor_block_32regs);
-	xor_register(&xor_block_SPARC);
-}
diff --git a/arch/sparc/include/asm/xor_64.h b/lib/raid/xor/sparc/xor-sparc64-glue.c
similarity index 74%
rename from arch/sparc/include/asm/xor_64.h
rename to lib/raid/xor/sparc/xor-sparc64-glue.c
index e0482ecc0a68bd..3c67c8c3a0e821 100644
--- a/arch/sparc/include/asm/xor_64.h
+++ b/lib/raid/xor/sparc/xor-sparc64-glue.c
@@ -1,7 +1,5 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
- * include/asm/xor.h
- *
  * High speed xor_block operation for RAID4/5 utilizing the
  * UltraSparc Visual Instruction Set and Niagara block-init
  * twin-load instructions.
@@ -10,7 +8,8 @@
  * Copyright (C) 2006 David S. Miller <davem@davemloft.net>
  */
 
-#include <asm/spitfire.h>
+#include <linux/raid/xor_impl.h>
+#include <asm/xor.h>
 
 void xor_vis_2(unsigned long bytes, unsigned long * __restrict p1,
 	       const unsigned long * __restrict p2);
@@ -29,7 +28,7 @@ void xor_vis_5(unsigned long bytes, unsigned long * __restrict p1,
 
 /* XXX Ugh, write cheetah versions... -DaveM */
 
-static struct xor_block_template xor_block_VIS = {
+struct xor_block_template xor_block_VIS = {
         .name	= "VIS",
         .do_2	= xor_vis_2,
         .do_3	= xor_vis_3,
@@ -52,25 +51,10 @@ void xor_niagara_5(unsigned long bytes, unsigned long * __restrict p1,
 		   const unsigned long * __restrict p4,
 		   const unsigned long * __restrict p5);
 
-static struct xor_block_template xor_block_niagara = {
+struct xor_block_template xor_block_niagara = {
         .name	= "Niagara",
         .do_2	= xor_niagara_2,
         .do_3	= xor_niagara_3,
         .do_4	= xor_niagara_4,
         .do_5	= xor_niagara_5,
 };
-
-#define arch_xor_init arch_xor_init
-static __always_inline void __init arch_xor_init(void)
-{
-	/* Force VIS for everything except Niagara.  */
-	if (tlb_type == hypervisor &&
-	    (sun4v_chip_type == SUN4V_CHIP_NIAGARA1 ||
-	     sun4v_chip_type == SUN4V_CHIP_NIAGARA2 ||
-	     sun4v_chip_type == SUN4V_CHIP_NIAGARA3 ||
-	     sun4v_chip_type == SUN4V_CHIP_NIAGARA4 ||
-	     sun4v_chip_type == SUN4V_CHIP_NIAGARA5))
-		xor_force(&xor_block_niagara);
-	else
-		xor_force(&xor_block_VIS);
-}
diff --git a/arch/sparc/lib/xor.S b/lib/raid/xor/sparc/xor-sparc64.S
similarity index 98%
rename from arch/sparc/lib/xor.S
rename to lib/raid/xor/sparc/xor-sparc64.S
index 35461e3b2a9b14..a7b74d473bd47d 100644
--- a/arch/sparc/lib/xor.S
+++ b/lib/raid/xor/sparc/xor-sparc64.S
@@ -1,7 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 /*
- * arch/sparc64/lib/xor.S
- *
  * High speed xor_block operation for RAID4/5 utilizing the
  * UltraSparc Visual Instruction Set and Niagara store-init/twin-load.
  *
@@ -92,7 +90,6 @@ ENTRY(xor_vis_2)
 	retl
 	  wr	%g0, 0, %fprs
 ENDPROC(xor_vis_2)
-EXPORT_SYMBOL(xor_vis_2)
 
 ENTRY(xor_vis_3)
 	rd	%fprs, %o5
@@ -159,7 +156,6 @@ ENTRY(xor_vis_3)
 	retl
 	 wr	%g0, 0, %fprs
 ENDPROC(xor_vis_3)
-EXPORT_SYMBOL(xor_vis_3)
 
 ENTRY(xor_vis_4)
 	rd	%fprs, %o5
@@ -245,7 +241,6 @@ ENTRY(xor_vis_4)
 	retl
 	 wr	%g0, 0, %fprs
 ENDPROC(xor_vis_4)
-EXPORT_SYMBOL(xor_vis_4)
 
 ENTRY(xor_vis_5)
 	save	%sp, -192, %sp
@@ -352,7 +347,6 @@ ENTRY(xor_vis_5)
 	ret
 	 restore
 ENDPROC(xor_vis_5)
-EXPORT_SYMBOL(xor_vis_5)
 
 	/* Niagara versions. */
 ENTRY(xor_niagara_2) /* %o0=bytes, %o1=dest, %o2=src */
@@ -399,7 +393,6 @@ ENTRY(xor_niagara_2) /* %o0=bytes, %o1=dest, %o2=src */
 	ret
 	 restore
 ENDPROC(xor_niagara_2)
-EXPORT_SYMBOL(xor_niagara_2)
 
 ENTRY(xor_niagara_3) /* %o0=bytes, %o1=dest, %o2=src1, %o3=src2 */
 	save		%sp, -192, %sp
@@ -461,7 +454,6 @@ ENTRY(xor_niagara_3) /* %o0=bytes, %o1=dest, %o2=src1, %o3=src2 */
 	ret
 	 restore
 ENDPROC(xor_niagara_3)
-EXPORT_SYMBOL(xor_niagara_3)
 
 ENTRY(xor_niagara_4) /* %o0=bytes, %o1=dest, %o2=src1, %o3=src2, %o4=src3 */
 	save		%sp, -192, %sp
@@ -544,7 +536,6 @@ ENTRY(xor_niagara_4) /* %o0=bytes, %o1=dest, %o2=src1, %o3=src2, %o4=src3 */
 	ret
 	 restore
 ENDPROC(xor_niagara_4)
-EXPORT_SYMBOL(xor_niagara_4)
 
 ENTRY(xor_niagara_5) /* %o0=bytes, %o1=dest, %o2=src1, %o3=src2, %o4=src3, %o5=src4 */
 	save		%sp, -192, %sp
@@ -643,4 +634,3 @@ ENTRY(xor_niagara_5) /* %o0=bytes, %o1=dest, %o2=src1, %o3=src2, %o4=src3, %o5=s
 	ret
 	 restore
 ENDPROC(xor_niagara_5)
-EXPORT_SYMBOL(xor_niagara_5)

From 7ce454148b7151ca7d2522aad5309387fa4ab293 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 24 Mar 2026 07:21:53 +0100
Subject: [PATCH 17/26] s390: move the XOR code to lib/raid/

Move the optimized XOR into lib/raid and include it it in xor.ko
instead of unconditionally building it into the main kernel image.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Heiko Carstens <hca@linux.ibm.com>
Signed-off-by: Linux RISC-V bot <linux.riscv.bot@gmail.com>
---
 arch/s390/lib/Makefile                     | 2 +-
 lib/raid/xor/Makefile                      | 1 +
 {arch/s390/lib => lib/raid/xor/s390}/xor.c | 2 --
 3 files changed, 2 insertions(+), 3 deletions(-)
 rename {arch/s390/lib => lib/raid/xor/s390}/xor.c (98%)

diff --git a/arch/s390/lib/Makefile b/arch/s390/lib/Makefile
index f43f897d3fc027..2bf47204f6abd9 100644
--- a/arch/s390/lib/Makefile
+++ b/arch/s390/lib/Makefile
@@ -5,7 +5,7 @@
 
 lib-y += delay.o string.o uaccess.o find.o spinlock.o tishift.o
 lib-y += csum-partial.o
-obj-y += mem.o xor.o
+obj-y += mem.o
 lib-$(CONFIG_KPROBES) += probes.o
 lib-$(CONFIG_UPROBES) += probes.o
 obj-$(CONFIG_S390_KPROBES_SANITY_TEST) += test_kprobes_s390.o
diff --git a/lib/raid/xor/Makefile b/lib/raid/xor/Makefile
index 3a7c887d08ee9f..3db6c2b2f26a5c 100644
--- a/lib/raid/xor/Makefile
+++ b/lib/raid/xor/Makefile
@@ -20,6 +20,7 @@ xor-$(CONFIG_ALTIVEC)		+= powerpc/xor_vmx.o powerpc/xor_vmx_glue.o
 xor-$(CONFIG_RISCV_ISA_V)	+= riscv/xor.o riscv/xor-glue.o
 xor-$(CONFIG_SPARC32)		+= sparc/xor-sparc32.o
 xor-$(CONFIG_SPARC64)		+= sparc/xor-sparc64.o sparc/xor-sparc64-glue.o
+xor-$(CONFIG_S390)		+= s390/xor.o
 
 
 CFLAGS_arm/xor-neon.o		+= $(CC_FLAGS_FPU)
diff --git a/arch/s390/lib/xor.c b/lib/raid/xor/s390/xor.c
similarity index 98%
rename from arch/s390/lib/xor.c
rename to lib/raid/xor/s390/xor.c
index 4d5ed638d850fa..2d5e33eca2a8af 100644
--- a/arch/s390/lib/xor.c
+++ b/lib/raid/xor/s390/xor.c
@@ -7,7 +7,6 @@
  */
 
 #include <linux/types.h>
-#include <linux/export.h>
 #include <linux/raid/xor_impl.h>
 #include <asm/xor.h>
 
@@ -134,4 +133,3 @@ struct xor_block_template xor_block_xc = {
 	.do_4 = xor_xc_4,
 	.do_5 = xor_xc_5,
 };
-EXPORT_SYMBOL(xor_block_xc);

From 80900d2ae87a3f8f35aab3cedfa3e1f4b0112756 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 24 Mar 2026 07:21:54 +0100
Subject: [PATCH 18/26] x86: move the XOR code to lib/raid/

Move the optimized XOR code out of line into lib/raid.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Linux RISC-V bot <linux.riscv.bot@gmail.com>
---
 arch/x86/include/asm/xor.h                    | 518 ++----------------
 arch/x86/include/asm/xor_64.h                 |  32 --
 lib/raid/xor/Makefile                         |   2 +
 .../xor_avx.h => lib/raid/xor/x86/xor-avx.c   |  14 +-
 .../xor_32.h => lib/raid/xor/x86/xor-mmx.c    |  60 +-
 lib/raid/xor/x86/xor-sse.c                    | 476 ++++++++++++++++
 6 files changed, 522 insertions(+), 580 deletions(-)
 delete mode 100644 arch/x86/include/asm/xor_64.h
 rename arch/x86/include/asm/xor_avx.h => lib/raid/xor/x86/xor-avx.c (95%)
 rename arch/x86/include/asm/xor_32.h => lib/raid/xor/x86/xor-mmx.c (90%)
 create mode 100644 lib/raid/xor/x86/xor-sse.c

diff --git a/arch/x86/include/asm/xor.h b/arch/x86/include/asm/xor.h
index 33f5620d8d691e..d1aab8275908e7 100644
--- a/arch/x86/include/asm/xor.h
+++ b/arch/x86/include/asm/xor.h
@@ -2,498 +2,42 @@
 #ifndef _ASM_X86_XOR_H
 #define _ASM_X86_XOR_H
 
-/*
- * Optimized RAID-5 checksumming functions for SSE.
- */
-
-/*
- * Cache avoiding checksumming functions utilizing KNI instructions
- * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
- */
+#include <asm/cpufeature.h>
+#include <asm-generic/xor.h>
 
-/*
- * Based on
- * High-speed RAID5 checksumming functions utilizing SSE instructions.
- * Copyright (C) 1998 Ingo Molnar.
- */
+extern struct xor_block_template xor_block_pII_mmx;
+extern struct xor_block_template xor_block_p5_mmx;
+extern struct xor_block_template xor_block_sse;
+extern struct xor_block_template xor_block_sse_pf64;
+extern struct xor_block_template xor_block_avx;
 
 /*
- * x86-64 changes / gcc fixes from Andi Kleen.
- * Copyright 2002 Andi Kleen, SuSE Labs.
+ * When SSE is available, use it as it can write around L2.  We may also be able
+ * to load into the L1 only depending on how the cpu deals with a load to a line
+ * that is being prefetched.
+ *
+ * When AVX2 is available, force using it as it is better by all measures.
  *
- * This hasn't been optimized for the hammer yet, but there are likely
- * no advantages to be gotten from x86-64 here anyways.
+ * 32-bit without MMX can fall back to the generic routines.
  */
-
-#include <asm/fpu/api.h>
-
-#ifdef CONFIG_X86_32
-/* reduce register pressure */
-# define XOR_CONSTANT_CONSTRAINT "i"
-#else
-# define XOR_CONSTANT_CONSTRAINT "re"
-#endif
-
-#define OFFS(x)		"16*("#x")"
-#define PF_OFFS(x)	"256+16*("#x")"
-#define PF0(x)		"	prefetchnta "PF_OFFS(x)"(%[p1])		;\n"
-#define LD(x, y)	"	movaps "OFFS(x)"(%[p1]), %%xmm"#y"	;\n"
-#define ST(x, y)	"	movaps %%xmm"#y", "OFFS(x)"(%[p1])	;\n"
-#define PF1(x)		"	prefetchnta "PF_OFFS(x)"(%[p2])		;\n"
-#define PF2(x)		"	prefetchnta "PF_OFFS(x)"(%[p3])		;\n"
-#define PF3(x)		"	prefetchnta "PF_OFFS(x)"(%[p4])		;\n"
-#define PF4(x)		"	prefetchnta "PF_OFFS(x)"(%[p5])		;\n"
-#define XO1(x, y)	"	xorps "OFFS(x)"(%[p2]), %%xmm"#y"	;\n"
-#define XO2(x, y)	"	xorps "OFFS(x)"(%[p3]), %%xmm"#y"	;\n"
-#define XO3(x, y)	"	xorps "OFFS(x)"(%[p4]), %%xmm"#y"	;\n"
-#define XO4(x, y)	"	xorps "OFFS(x)"(%[p5]), %%xmm"#y"	;\n"
-#define NOP(x)
-
-#define BLK64(pf, op, i)				\
-		pf(i)					\
-		op(i, 0)				\
-			op(i + 1, 1)			\
-				op(i + 2, 2)		\
-					op(i + 3, 3)
-
-static void
-xor_sse_2(unsigned long bytes, unsigned long * __restrict p1,
-	  const unsigned long * __restrict p2)
-{
-	unsigned long lines = bytes >> 8;
-
-	kernel_fpu_begin();
-
-	asm volatile(
-#undef BLOCK
-#define BLOCK(i)					\
-		LD(i, 0)				\
-			LD(i + 1, 1)			\
-		PF1(i)					\
-				PF1(i + 2)		\
-				LD(i + 2, 2)		\
-					LD(i + 3, 3)	\
-		PF0(i + 4)				\
-				PF0(i + 6)		\
-		XO1(i, 0)				\
-			XO1(i + 1, 1)			\
-				XO1(i + 2, 2)		\
-					XO1(i + 3, 3)	\
-		ST(i, 0)				\
-			ST(i + 1, 1)			\
-				ST(i + 2, 2)		\
-					ST(i + 3, 3)	\
-
-
-		PF0(0)
-				PF0(2)
-
-	" .align 32			;\n"
-	" 1:                            ;\n"
-
-		BLOCK(0)
-		BLOCK(4)
-		BLOCK(8)
-		BLOCK(12)
-
-	"       add %[inc], %[p1]       ;\n"
-	"       add %[inc], %[p2]       ;\n"
-	"       dec %[cnt]              ;\n"
-	"       jnz 1b                  ;\n"
-	: [cnt] "+r" (lines),
-	  [p1] "+r" (p1), [p2] "+r" (p2)
-	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
-	: "memory");
-
-	kernel_fpu_end();
-}
-
-static void
-xor_sse_2_pf64(unsigned long bytes, unsigned long * __restrict p1,
-	       const unsigned long * __restrict p2)
-{
-	unsigned long lines = bytes >> 8;
-
-	kernel_fpu_begin();
-
-	asm volatile(
-#undef BLOCK
-#define BLOCK(i)			\
-		BLK64(PF0, LD, i)	\
-		BLK64(PF1, XO1, i)	\
-		BLK64(NOP, ST, i)	\
-
-	" .align 32			;\n"
-	" 1:                            ;\n"
-
-		BLOCK(0)
-		BLOCK(4)
-		BLOCK(8)
-		BLOCK(12)
-
-	"       add %[inc], %[p1]       ;\n"
-	"       add %[inc], %[p2]       ;\n"
-	"       dec %[cnt]              ;\n"
-	"       jnz 1b                  ;\n"
-	: [cnt] "+r" (lines),
-	  [p1] "+r" (p1), [p2] "+r" (p2)
-	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
-	: "memory");
-
-	kernel_fpu_end();
-}
-
-static void
-xor_sse_3(unsigned long bytes, unsigned long * __restrict p1,
-	  const unsigned long * __restrict p2,
-	  const unsigned long * __restrict p3)
-{
-	unsigned long lines = bytes >> 8;
-
-	kernel_fpu_begin();
-
-	asm volatile(
-#undef BLOCK
-#define BLOCK(i) \
-		PF1(i)					\
-				PF1(i + 2)		\
-		LD(i, 0)				\
-			LD(i + 1, 1)			\
-				LD(i + 2, 2)		\
-					LD(i + 3, 3)	\
-		PF2(i)					\
-				PF2(i + 2)		\
-		PF0(i + 4)				\
-				PF0(i + 6)		\
-		XO1(i, 0)				\
-			XO1(i + 1, 1)			\
-				XO1(i + 2, 2)		\
-					XO1(i + 3, 3)	\
-		XO2(i, 0)				\
-			XO2(i + 1, 1)			\
-				XO2(i + 2, 2)		\
-					XO2(i + 3, 3)	\
-		ST(i, 0)				\
-			ST(i + 1, 1)			\
-				ST(i + 2, 2)		\
-					ST(i + 3, 3)	\
-
-
-		PF0(0)
-				PF0(2)
-
-	" .align 32			;\n"
-	" 1:                            ;\n"
-
-		BLOCK(0)
-		BLOCK(4)
-		BLOCK(8)
-		BLOCK(12)
-
-	"       add %[inc], %[p1]       ;\n"
-	"       add %[inc], %[p2]       ;\n"
-	"       add %[inc], %[p3]       ;\n"
-	"       dec %[cnt]              ;\n"
-	"       jnz 1b                  ;\n"
-	: [cnt] "+r" (lines),
-	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
-	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
-	: "memory");
-
-	kernel_fpu_end();
+#define arch_xor_init arch_xor_init
+static __always_inline void __init arch_xor_init(void)
+{
+	if (boot_cpu_has(X86_FEATURE_AVX) &&
+	    boot_cpu_has(X86_FEATURE_OSXSAVE)) {
+		xor_force(&xor_block_avx);
+	} else if (IS_ENABLED(CONFIG_X86_64) || boot_cpu_has(X86_FEATURE_XMM)) {
+		xor_register(&xor_block_sse);
+		xor_register(&xor_block_sse_pf64);
+	} else if (boot_cpu_has(X86_FEATURE_MMX)) {
+		xor_register(&xor_block_pII_mmx);
+		xor_register(&xor_block_p5_mmx);
+	} else {
+		xor_register(&xor_block_8regs);
+		xor_register(&xor_block_8regs_p);
+		xor_register(&xor_block_32regs);
+		xor_register(&xor_block_32regs_p);
+	}
 }
 
-static void
-xor_sse_3_pf64(unsigned long bytes, unsigned long * __restrict p1,
-	       const unsigned long * __restrict p2,
-	       const unsigned long * __restrict p3)
-{
-	unsigned long lines = bytes >> 8;
-
-	kernel_fpu_begin();
-
-	asm volatile(
-#undef BLOCK
-#define BLOCK(i)			\
-		BLK64(PF0, LD, i)	\
-		BLK64(PF1, XO1, i)	\
-		BLK64(PF2, XO2, i)	\
-		BLK64(NOP, ST, i)	\
-
-	" .align 32			;\n"
-	" 1:                            ;\n"
-
-		BLOCK(0)
-		BLOCK(4)
-		BLOCK(8)
-		BLOCK(12)
-
-	"       add %[inc], %[p1]       ;\n"
-	"       add %[inc], %[p2]       ;\n"
-	"       add %[inc], %[p3]       ;\n"
-	"       dec %[cnt]              ;\n"
-	"       jnz 1b                  ;\n"
-	: [cnt] "+r" (lines),
-	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
-	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
-	: "memory");
-
-	kernel_fpu_end();
-}
-
-static void
-xor_sse_4(unsigned long bytes, unsigned long * __restrict p1,
-	  const unsigned long * __restrict p2,
-	  const unsigned long * __restrict p3,
-	  const unsigned long * __restrict p4)
-{
-	unsigned long lines = bytes >> 8;
-
-	kernel_fpu_begin();
-
-	asm volatile(
-#undef BLOCK
-#define BLOCK(i) \
-		PF1(i)					\
-				PF1(i + 2)		\
-		LD(i, 0)				\
-			LD(i + 1, 1)			\
-				LD(i + 2, 2)		\
-					LD(i + 3, 3)	\
-		PF2(i)					\
-				PF2(i + 2)		\
-		XO1(i, 0)				\
-			XO1(i + 1, 1)			\
-				XO1(i + 2, 2)		\
-					XO1(i + 3, 3)	\
-		PF3(i)					\
-				PF3(i + 2)		\
-		PF0(i + 4)				\
-				PF0(i + 6)		\
-		XO2(i, 0)				\
-			XO2(i + 1, 1)			\
-				XO2(i + 2, 2)		\
-					XO2(i + 3, 3)	\
-		XO3(i, 0)				\
-			XO3(i + 1, 1)			\
-				XO3(i + 2, 2)		\
-					XO3(i + 3, 3)	\
-		ST(i, 0)				\
-			ST(i + 1, 1)			\
-				ST(i + 2, 2)		\
-					ST(i + 3, 3)	\
-
-
-		PF0(0)
-				PF0(2)
-
-	" .align 32			;\n"
-	" 1:                            ;\n"
-
-		BLOCK(0)
-		BLOCK(4)
-		BLOCK(8)
-		BLOCK(12)
-
-	"       add %[inc], %[p1]       ;\n"
-	"       add %[inc], %[p2]       ;\n"
-	"       add %[inc], %[p3]       ;\n"
-	"       add %[inc], %[p4]       ;\n"
-	"       dec %[cnt]              ;\n"
-	"       jnz 1b                  ;\n"
-	: [cnt] "+r" (lines), [p1] "+r" (p1),
-	  [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
-	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
-	: "memory");
-
-	kernel_fpu_end();
-}
-
-static void
-xor_sse_4_pf64(unsigned long bytes, unsigned long * __restrict p1,
-	       const unsigned long * __restrict p2,
-	       const unsigned long * __restrict p3,
-	       const unsigned long * __restrict p4)
-{
-	unsigned long lines = bytes >> 8;
-
-	kernel_fpu_begin();
-
-	asm volatile(
-#undef BLOCK
-#define BLOCK(i)			\
-		BLK64(PF0, LD, i)	\
-		BLK64(PF1, XO1, i)	\
-		BLK64(PF2, XO2, i)	\
-		BLK64(PF3, XO3, i)	\
-		BLK64(NOP, ST, i)	\
-
-	" .align 32			;\n"
-	" 1:                            ;\n"
-
-		BLOCK(0)
-		BLOCK(4)
-		BLOCK(8)
-		BLOCK(12)
-
-	"       add %[inc], %[p1]       ;\n"
-	"       add %[inc], %[p2]       ;\n"
-	"       add %[inc], %[p3]       ;\n"
-	"       add %[inc], %[p4]       ;\n"
-	"       dec %[cnt]              ;\n"
-	"       jnz 1b                  ;\n"
-	: [cnt] "+r" (lines), [p1] "+r" (p1),
-	  [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
-	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
-	: "memory");
-
-	kernel_fpu_end();
-}
-
-static void
-xor_sse_5(unsigned long bytes, unsigned long * __restrict p1,
-	  const unsigned long * __restrict p2,
-	  const unsigned long * __restrict p3,
-	  const unsigned long * __restrict p4,
-	  const unsigned long * __restrict p5)
-{
-	unsigned long lines = bytes >> 8;
-
-	kernel_fpu_begin();
-
-	asm volatile(
-#undef BLOCK
-#define BLOCK(i) \
-		PF1(i)					\
-				PF1(i + 2)		\
-		LD(i, 0)				\
-			LD(i + 1, 1)			\
-				LD(i + 2, 2)		\
-					LD(i + 3, 3)	\
-		PF2(i)					\
-				PF2(i + 2)		\
-		XO1(i, 0)				\
-			XO1(i + 1, 1)			\
-				XO1(i + 2, 2)		\
-					XO1(i + 3, 3)	\
-		PF3(i)					\
-				PF3(i + 2)		\
-		XO2(i, 0)				\
-			XO2(i + 1, 1)			\
-				XO2(i + 2, 2)		\
-					XO2(i + 3, 3)	\
-		PF4(i)					\
-				PF4(i + 2)		\
-		PF0(i + 4)				\
-				PF0(i + 6)		\
-		XO3(i, 0)				\
-			XO3(i + 1, 1)			\
-				XO3(i + 2, 2)		\
-					XO3(i + 3, 3)	\
-		XO4(i, 0)				\
-			XO4(i + 1, 1)			\
-				XO4(i + 2, 2)		\
-					XO4(i + 3, 3)	\
-		ST(i, 0)				\
-			ST(i + 1, 1)			\
-				ST(i + 2, 2)		\
-					ST(i + 3, 3)	\
-
-
-		PF0(0)
-				PF0(2)
-
-	" .align 32			;\n"
-	" 1:                            ;\n"
-
-		BLOCK(0)
-		BLOCK(4)
-		BLOCK(8)
-		BLOCK(12)
-
-	"       add %[inc], %[p1]       ;\n"
-	"       add %[inc], %[p2]       ;\n"
-	"       add %[inc], %[p3]       ;\n"
-	"       add %[inc], %[p4]       ;\n"
-	"       add %[inc], %[p5]       ;\n"
-	"       dec %[cnt]              ;\n"
-	"       jnz 1b                  ;\n"
-	: [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
-	  [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
-	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
-	: "memory");
-
-	kernel_fpu_end();
-}
-
-static void
-xor_sse_5_pf64(unsigned long bytes, unsigned long * __restrict p1,
-	       const unsigned long * __restrict p2,
-	       const unsigned long * __restrict p3,
-	       const unsigned long * __restrict p4,
-	       const unsigned long * __restrict p5)
-{
-	unsigned long lines = bytes >> 8;
-
-	kernel_fpu_begin();
-
-	asm volatile(
-#undef BLOCK
-#define BLOCK(i)			\
-		BLK64(PF0, LD, i)	\
-		BLK64(PF1, XO1, i)	\
-		BLK64(PF2, XO2, i)	\
-		BLK64(PF3, XO3, i)	\
-		BLK64(PF4, XO4, i)	\
-		BLK64(NOP, ST, i)	\
-
-	" .align 32			;\n"
-	" 1:                            ;\n"
-
-		BLOCK(0)
-		BLOCK(4)
-		BLOCK(8)
-		BLOCK(12)
-
-	"       add %[inc], %[p1]       ;\n"
-	"       add %[inc], %[p2]       ;\n"
-	"       add %[inc], %[p3]       ;\n"
-	"       add %[inc], %[p4]       ;\n"
-	"       add %[inc], %[p5]       ;\n"
-	"       dec %[cnt]              ;\n"
-	"       jnz 1b                  ;\n"
-	: [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
-	  [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
-	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
-	: "memory");
-
-	kernel_fpu_end();
-}
-
-static struct xor_block_template xor_block_sse_pf64 = {
-	.name = "prefetch64-sse",
-	.do_2 = xor_sse_2_pf64,
-	.do_3 = xor_sse_3_pf64,
-	.do_4 = xor_sse_4_pf64,
-	.do_5 = xor_sse_5_pf64,
-};
-
-#undef LD
-#undef XO1
-#undef XO2
-#undef XO3
-#undef XO4
-#undef ST
-#undef NOP
-#undef BLK64
-#undef BLOCK
-
-#undef XOR_CONSTANT_CONSTRAINT
-
-#ifdef CONFIG_X86_32
-# include <asm/xor_32.h>
-#else
-# include <asm/xor_64.h>
-#endif
-
 #endif /* _ASM_X86_XOR_H */
diff --git a/arch/x86/include/asm/xor_64.h b/arch/x86/include/asm/xor_64.h
deleted file mode 100644
index 2d2ceb2418665e..00000000000000
--- a/arch/x86/include/asm/xor_64.h
+++ /dev/null
@@ -1,32 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_X86_XOR_64_H
-#define _ASM_X86_XOR_64_H
-
-static struct xor_block_template xor_block_sse = {
-	.name = "generic_sse",
-	.do_2 = xor_sse_2,
-	.do_3 = xor_sse_3,
-	.do_4 = xor_sse_4,
-	.do_5 = xor_sse_5,
-};
-
-
-/* Also try the AVX routines */
-#include <asm/xor_avx.h>
-
-/* We force the use of the SSE xor block because it can write around L2.
-   We may also be able to load into the L1 only depending on how the cpu
-   deals with a load to a line that is being prefetched.  */
-#define arch_xor_init arch_xor_init
-static __always_inline void __init arch_xor_init(void)
-{
-	if (boot_cpu_has(X86_FEATURE_AVX) &&
-	    boot_cpu_has(X86_FEATURE_OSXSAVE)) {
-		xor_force(&xor_block_avx);
-	} else {
-		xor_register(&xor_block_sse_pf64);
-		xor_register(&xor_block_sse);
-	}
-}
-
-#endif /* _ASM_X86_XOR_64_H */
diff --git a/lib/raid/xor/Makefile b/lib/raid/xor/Makefile
index 3db6c2b2f26a5c..05aca96041b3ac 100644
--- a/lib/raid/xor/Makefile
+++ b/lib/raid/xor/Makefile
@@ -21,6 +21,8 @@ xor-$(CONFIG_RISCV_ISA_V)	+= riscv/xor.o riscv/xor-glue.o
 xor-$(CONFIG_SPARC32)		+= sparc/xor-sparc32.o
 xor-$(CONFIG_SPARC64)		+= sparc/xor-sparc64.o sparc/xor-sparc64-glue.o
 xor-$(CONFIG_S390)		+= s390/xor.o
+xor-$(CONFIG_X86_32)		+= x86/xor-avx.o x86/xor-sse.o x86/xor-mmx.o
+xor-$(CONFIG_X86_64)		+= x86/xor-avx.o x86/xor-sse.o
 
 
 CFLAGS_arm/xor-neon.o		+= $(CC_FLAGS_FPU)
diff --git a/arch/x86/include/asm/xor_avx.h b/lib/raid/xor/x86/xor-avx.c
similarity index 95%
rename from arch/x86/include/asm/xor_avx.h
rename to lib/raid/xor/x86/xor-avx.c
index c600888436bb90..b49cb5199e709e 100644
--- a/arch/x86/include/asm/xor_avx.h
+++ b/lib/raid/xor/x86/xor-avx.c
@@ -1,18 +1,16 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-#ifndef _ASM_X86_XOR_AVX_H
-#define _ASM_X86_XOR_AVX_H
-
+// SPDX-License-Identifier: GPL-2.0-only
 /*
- * Optimized RAID-5 checksumming functions for AVX
+ * Optimized XOR parity functions for AVX
  *
  * Copyright (C) 2012 Intel Corporation
  * Author: Jim Kukunas <james.t.kukunas@linux.intel.com>
  *
  * Based on Ingo Molnar and Zach Brown's respective MMX and SSE routines
  */
-
 #include <linux/compiler.h>
+#include <linux/raid/xor_impl.h>
 #include <asm/fpu/api.h>
+#include <asm/xor.h>
 
 #define BLOCK4(i) \
 		BLOCK(32 * i, 0) \
@@ -158,12 +156,10 @@ do { \
 	kernel_fpu_end();
 }
 
-static struct xor_block_template xor_block_avx = {
+struct xor_block_template xor_block_avx = {
 	.name = "avx",
 	.do_2 = xor_avx_2,
 	.do_3 = xor_avx_3,
 	.do_4 = xor_avx_4,
 	.do_5 = xor_avx_5,
 };
-
-#endif
diff --git a/arch/x86/include/asm/xor_32.h b/lib/raid/xor/x86/xor-mmx.c
similarity index 90%
rename from arch/x86/include/asm/xor_32.h
rename to lib/raid/xor/x86/xor-mmx.c
index ee32d08c27bc59..cf0fafea33b72b 100644
--- a/arch/x86/include/asm/xor_32.h
+++ b/lib/raid/xor/x86/xor-mmx.c
@@ -1,15 +1,12 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-#ifndef _ASM_X86_XOR_32_H
-#define _ASM_X86_XOR_32_H
-
-/*
- * Optimized RAID-5 checksumming functions for MMX.
- */
-
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
- * High-speed RAID5 checksumming functions utilizing MMX instructions.
+ * Optimized XOR parity functions for MMX.
+ *
  * Copyright (C) 1998 Ingo Molnar.
  */
+#include <linux/raid/xor_impl.h>
+#include <asm/fpu/api.h>
+#include <asm/xor.h>
 
 #define LD(x, y)	"       movq   8*("#x")(%1), %%mm"#y"   ;\n"
 #define ST(x, y)	"       movq %%mm"#y",   8*("#x")(%1)   ;\n"
@@ -18,8 +15,6 @@
 #define XO3(x, y)	"       pxor   8*("#x")(%4), %%mm"#y"   ;\n"
 #define XO4(x, y)	"       pxor   8*("#x")(%5), %%mm"#y"   ;\n"
 
-#include <asm/fpu/api.h>
-
 static void
 xor_pII_mmx_2(unsigned long bytes, unsigned long * __restrict p1,
 	      const unsigned long * __restrict p2)
@@ -519,7 +514,7 @@ xor_p5_mmx_5(unsigned long bytes, unsigned long * __restrict p1,
 	kernel_fpu_end();
 }
 
-static struct xor_block_template xor_block_pII_mmx = {
+struct xor_block_template xor_block_pII_mmx = {
 	.name = "pII_mmx",
 	.do_2 = xor_pII_mmx_2,
 	.do_3 = xor_pII_mmx_3,
@@ -527,49 +522,10 @@ static struct xor_block_template xor_block_pII_mmx = {
 	.do_5 = xor_pII_mmx_5,
 };
 
-static struct xor_block_template xor_block_p5_mmx = {
+struct xor_block_template xor_block_p5_mmx = {
 	.name = "p5_mmx",
 	.do_2 = xor_p5_mmx_2,
 	.do_3 = xor_p5_mmx_3,
 	.do_4 = xor_p5_mmx_4,
 	.do_5 = xor_p5_mmx_5,
 };
-
-static struct xor_block_template xor_block_pIII_sse = {
-	.name = "pIII_sse",
-	.do_2 = xor_sse_2,
-	.do_3 = xor_sse_3,
-	.do_4 = xor_sse_4,
-	.do_5 = xor_sse_5,
-};
-
-/* Also try the AVX routines */
-#include <asm/xor_avx.h>
-
-/* Also try the generic routines.  */
-#include <asm-generic/xor.h>
-
-/* We force the use of the SSE xor block because it can write around L2.
-   We may also be able to load into the L1 only depending on how the cpu
-   deals with a load to a line that is being prefetched.  */
-#define arch_xor_init arch_xor_init
-static __always_inline void __init arch_xor_init(void)
-{
-	if (boot_cpu_has(X86_FEATURE_AVX) &&
-	    boot_cpu_has(X86_FEATURE_OSXSAVE)) {
-		xor_force(&xor_block_avx);
-	} else if (boot_cpu_has(X86_FEATURE_XMM)) {
-		xor_register(&xor_block_pIII_sse);
-		xor_register(&xor_block_sse_pf64);
-	} else if (boot_cpu_has(X86_FEATURE_MMX)) {
-		xor_register(&xor_block_pII_mmx);
-		xor_register(&xor_block_p5_mmx);
-	} else {
-		xor_register(&xor_block_8regs);
-		xor_register(&xor_block_8regs_p);
-		xor_register(&xor_block_32regs);
-		xor_register(&xor_block_32regs_p);
-	}
-}
-
-#endif /* _ASM_X86_XOR_32_H */
diff --git a/lib/raid/xor/x86/xor-sse.c b/lib/raid/xor/x86/xor-sse.c
new file mode 100644
index 00000000000000..0e727ced8b0051
--- /dev/null
+++ b/lib/raid/xor/x86/xor-sse.c
@@ -0,0 +1,476 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Optimized XOR parity functions for SSE.
+ *
+ * Cache avoiding checksumming functions utilizing KNI instructions
+ * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
+ *
+ * Based on
+ * High-speed RAID5 checksumming functions utilizing SSE instructions.
+ * Copyright (C) 1998 Ingo Molnar.
+ *
+ * x86-64 changes / gcc fixes from Andi Kleen.
+ * Copyright 2002 Andi Kleen, SuSE Labs.
+ */
+#include <linux/raid/xor_impl.h>
+#include <asm/fpu/api.h>
+#include <asm/xor.h>
+
+#ifdef CONFIG_X86_32
+/* reduce register pressure */
+# define XOR_CONSTANT_CONSTRAINT "i"
+#else
+# define XOR_CONSTANT_CONSTRAINT "re"
+#endif
+
+#define OFFS(x)		"16*("#x")"
+#define PF_OFFS(x)	"256+16*("#x")"
+#define PF0(x)		"	prefetchnta "PF_OFFS(x)"(%[p1])		;\n"
+#define LD(x, y)	"	movaps "OFFS(x)"(%[p1]), %%xmm"#y"	;\n"
+#define ST(x, y)	"	movaps %%xmm"#y", "OFFS(x)"(%[p1])	;\n"
+#define PF1(x)		"	prefetchnta "PF_OFFS(x)"(%[p2])		;\n"
+#define PF2(x)		"	prefetchnta "PF_OFFS(x)"(%[p3])		;\n"
+#define PF3(x)		"	prefetchnta "PF_OFFS(x)"(%[p4])		;\n"
+#define PF4(x)		"	prefetchnta "PF_OFFS(x)"(%[p5])		;\n"
+#define XO1(x, y)	"	xorps "OFFS(x)"(%[p2]), %%xmm"#y"	;\n"
+#define XO2(x, y)	"	xorps "OFFS(x)"(%[p3]), %%xmm"#y"	;\n"
+#define XO3(x, y)	"	xorps "OFFS(x)"(%[p4]), %%xmm"#y"	;\n"
+#define XO4(x, y)	"	xorps "OFFS(x)"(%[p5]), %%xmm"#y"	;\n"
+#define NOP(x)
+
+#define BLK64(pf, op, i)				\
+		pf(i)					\
+		op(i, 0)				\
+			op(i + 1, 1)			\
+				op(i + 2, 2)		\
+					op(i + 3, 3)
+
+static void
+xor_sse_2(unsigned long bytes, unsigned long * __restrict p1,
+	  const unsigned long * __restrict p2)
+{
+	unsigned long lines = bytes >> 8;
+
+	kernel_fpu_begin();
+
+	asm volatile(
+#undef BLOCK
+#define BLOCK(i)					\
+		LD(i, 0)				\
+			LD(i + 1, 1)			\
+		PF1(i)					\
+				PF1(i + 2)		\
+				LD(i + 2, 2)		\
+					LD(i + 3, 3)	\
+		PF0(i + 4)				\
+				PF0(i + 6)		\
+		XO1(i, 0)				\
+			XO1(i + 1, 1)			\
+				XO1(i + 2, 2)		\
+					XO1(i + 3, 3)	\
+		ST(i, 0)				\
+			ST(i + 1, 1)			\
+				ST(i + 2, 2)		\
+					ST(i + 3, 3)	\
+
+
+		PF0(0)
+				PF0(2)
+
+	" .align 32			;\n"
+	" 1:                            ;\n"
+
+		BLOCK(0)
+		BLOCK(4)
+		BLOCK(8)
+		BLOCK(12)
+
+	"       add %[inc], %[p1]       ;\n"
+	"       add %[inc], %[p2]       ;\n"
+	"       dec %[cnt]              ;\n"
+	"       jnz 1b                  ;\n"
+	: [cnt] "+r" (lines),
+	  [p1] "+r" (p1), [p2] "+r" (p2)
+	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
+	: "memory");
+
+	kernel_fpu_end();
+}
+
+static void
+xor_sse_2_pf64(unsigned long bytes, unsigned long * __restrict p1,
+	       const unsigned long * __restrict p2)
+{
+	unsigned long lines = bytes >> 8;
+
+	kernel_fpu_begin();
+
+	asm volatile(
+#undef BLOCK
+#define BLOCK(i)			\
+		BLK64(PF0, LD, i)	\
+		BLK64(PF1, XO1, i)	\
+		BLK64(NOP, ST, i)	\
+
+	" .align 32			;\n"
+	" 1:                            ;\n"
+
+		BLOCK(0)
+		BLOCK(4)
+		BLOCK(8)
+		BLOCK(12)
+
+	"       add %[inc], %[p1]       ;\n"
+	"       add %[inc], %[p2]       ;\n"
+	"       dec %[cnt]              ;\n"
+	"       jnz 1b                  ;\n"
+	: [cnt] "+r" (lines),
+	  [p1] "+r" (p1), [p2] "+r" (p2)
+	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
+	: "memory");
+
+	kernel_fpu_end();
+}
+
+static void
+xor_sse_3(unsigned long bytes, unsigned long * __restrict p1,
+	  const unsigned long * __restrict p2,
+	  const unsigned long * __restrict p3)
+{
+	unsigned long lines = bytes >> 8;
+
+	kernel_fpu_begin();
+
+	asm volatile(
+#undef BLOCK
+#define BLOCK(i) \
+		PF1(i)					\
+				PF1(i + 2)		\
+		LD(i, 0)				\
+			LD(i + 1, 1)			\
+				LD(i + 2, 2)		\
+					LD(i + 3, 3)	\
+		PF2(i)					\
+				PF2(i + 2)		\
+		PF0(i + 4)				\
+				PF0(i + 6)		\
+		XO1(i, 0)				\
+			XO1(i + 1, 1)			\
+				XO1(i + 2, 2)		\
+					XO1(i + 3, 3)	\
+		XO2(i, 0)				\
+			XO2(i + 1, 1)			\
+				XO2(i + 2, 2)		\
+					XO2(i + 3, 3)	\
+		ST(i, 0)				\
+			ST(i + 1, 1)			\
+				ST(i + 2, 2)		\
+					ST(i + 3, 3)	\
+
+
+		PF0(0)
+				PF0(2)
+
+	" .align 32			;\n"
+	" 1:                            ;\n"
+
+		BLOCK(0)
+		BLOCK(4)
+		BLOCK(8)
+		BLOCK(12)
+
+	"       add %[inc], %[p1]       ;\n"
+	"       add %[inc], %[p2]       ;\n"
+	"       add %[inc], %[p3]       ;\n"
+	"       dec %[cnt]              ;\n"
+	"       jnz 1b                  ;\n"
+	: [cnt] "+r" (lines),
+	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
+	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
+	: "memory");
+
+	kernel_fpu_end();
+}
+
+static void
+xor_sse_3_pf64(unsigned long bytes, unsigned long * __restrict p1,
+	       const unsigned long * __restrict p2,
+	       const unsigned long * __restrict p3)
+{
+	unsigned long lines = bytes >> 8;
+
+	kernel_fpu_begin();
+
+	asm volatile(
+#undef BLOCK
+#define BLOCK(i)			\
+		BLK64(PF0, LD, i)	\
+		BLK64(PF1, XO1, i)	\
+		BLK64(PF2, XO2, i)	\
+		BLK64(NOP, ST, i)	\
+
+	" .align 32			;\n"
+	" 1:                            ;\n"
+
+		BLOCK(0)
+		BLOCK(4)
+		BLOCK(8)
+		BLOCK(12)
+
+	"       add %[inc], %[p1]       ;\n"
+	"       add %[inc], %[p2]       ;\n"
+	"       add %[inc], %[p3]       ;\n"
+	"       dec %[cnt]              ;\n"
+	"       jnz 1b                  ;\n"
+	: [cnt] "+r" (lines),
+	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
+	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
+	: "memory");
+
+	kernel_fpu_end();
+}
+
+static void
+xor_sse_4(unsigned long bytes, unsigned long * __restrict p1,
+	  const unsigned long * __restrict p2,
+	  const unsigned long * __restrict p3,
+	  const unsigned long * __restrict p4)
+{
+	unsigned long lines = bytes >> 8;
+
+	kernel_fpu_begin();
+
+	asm volatile(
+#undef BLOCK
+#define BLOCK(i) \
+		PF1(i)					\
+				PF1(i + 2)		\
+		LD(i, 0)				\
+			LD(i + 1, 1)			\
+				LD(i + 2, 2)		\
+					LD(i + 3, 3)	\
+		PF2(i)					\
+				PF2(i + 2)		\
+		XO1(i, 0)				\
+			XO1(i + 1, 1)			\
+				XO1(i + 2, 2)		\
+					XO1(i + 3, 3)	\
+		PF3(i)					\
+				PF3(i + 2)		\
+		PF0(i + 4)				\
+				PF0(i + 6)		\
+		XO2(i, 0)				\
+			XO2(i + 1, 1)			\
+				XO2(i + 2, 2)		\
+					XO2(i + 3, 3)	\
+		XO3(i, 0)				\
+			XO3(i + 1, 1)			\
+				XO3(i + 2, 2)		\
+					XO3(i + 3, 3)	\
+		ST(i, 0)				\
+			ST(i + 1, 1)			\
+				ST(i + 2, 2)		\
+					ST(i + 3, 3)	\
+
+
+		PF0(0)
+				PF0(2)
+
+	" .align 32			;\n"
+	" 1:                            ;\n"
+
+		BLOCK(0)
+		BLOCK(4)
+		BLOCK(8)
+		BLOCK(12)
+
+	"       add %[inc], %[p1]       ;\n"
+	"       add %[inc], %[p2]       ;\n"
+	"       add %[inc], %[p3]       ;\n"
+	"       add %[inc], %[p4]       ;\n"
+	"       dec %[cnt]              ;\n"
+	"       jnz 1b                  ;\n"
+	: [cnt] "+r" (lines), [p1] "+r" (p1),
+	  [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
+	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
+	: "memory");
+
+	kernel_fpu_end();
+}
+
+static void
+xor_sse_4_pf64(unsigned long bytes, unsigned long * __restrict p1,
+	       const unsigned long * __restrict p2,
+	       const unsigned long * __restrict p3,
+	       const unsigned long * __restrict p4)
+{
+	unsigned long lines = bytes >> 8;
+
+	kernel_fpu_begin();
+
+	asm volatile(
+#undef BLOCK
+#define BLOCK(i)			\
+		BLK64(PF0, LD, i)	\
+		BLK64(PF1, XO1, i)	\
+		BLK64(PF2, XO2, i)	\
+		BLK64(PF3, XO3, i)	\
+		BLK64(NOP, ST, i)	\
+
+	" .align 32			;\n"
+	" 1:                            ;\n"
+
+		BLOCK(0)
+		BLOCK(4)
+		BLOCK(8)
+		BLOCK(12)
+
+	"       add %[inc], %[p1]       ;\n"
+	"       add %[inc], %[p2]       ;\n"
+	"       add %[inc], %[p3]       ;\n"
+	"       add %[inc], %[p4]       ;\n"
+	"       dec %[cnt]              ;\n"
+	"       jnz 1b                  ;\n"
+	: [cnt] "+r" (lines), [p1] "+r" (p1),
+	  [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
+	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
+	: "memory");
+
+	kernel_fpu_end();
+}
+
+static void
+xor_sse_5(unsigned long bytes, unsigned long * __restrict p1,
+	  const unsigned long * __restrict p2,
+	  const unsigned long * __restrict p3,
+	  const unsigned long * __restrict p4,
+	  const unsigned long * __restrict p5)
+{
+	unsigned long lines = bytes >> 8;
+
+	kernel_fpu_begin();
+
+	asm volatile(
+#undef BLOCK
+#define BLOCK(i) \
+		PF1(i)					\
+				PF1(i + 2)		\
+		LD(i, 0)				\
+			LD(i + 1, 1)			\
+				LD(i + 2, 2)		\
+					LD(i + 3, 3)	\
+		PF2(i)					\
+				PF2(i + 2)		\
+		XO1(i, 0)				\
+			XO1(i + 1, 1)			\
+				XO1(i + 2, 2)		\
+					XO1(i + 3, 3)	\
+		PF3(i)					\
+				PF3(i + 2)		\
+		XO2(i, 0)				\
+			XO2(i + 1, 1)			\
+				XO2(i + 2, 2)		\
+					XO2(i + 3, 3)	\
+		PF4(i)					\
+				PF4(i + 2)		\
+		PF0(i + 4)				\
+				PF0(i + 6)		\
+		XO3(i, 0)				\
+			XO3(i + 1, 1)			\
+				XO3(i + 2, 2)		\
+					XO3(i + 3, 3)	\
+		XO4(i, 0)				\
+			XO4(i + 1, 1)			\
+				XO4(i + 2, 2)		\
+					XO4(i + 3, 3)	\
+		ST(i, 0)				\
+			ST(i + 1, 1)			\
+				ST(i + 2, 2)		\
+					ST(i + 3, 3)	\
+
+
+		PF0(0)
+				PF0(2)
+
+	" .align 32			;\n"
+	" 1:                            ;\n"
+
+		BLOCK(0)
+		BLOCK(4)
+		BLOCK(8)
+		BLOCK(12)
+
+	"       add %[inc], %[p1]       ;\n"
+	"       add %[inc], %[p2]       ;\n"
+	"       add %[inc], %[p3]       ;\n"
+	"       add %[inc], %[p4]       ;\n"
+	"       add %[inc], %[p5]       ;\n"
+	"       dec %[cnt]              ;\n"
+	"       jnz 1b                  ;\n"
+	: [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
+	  [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
+	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
+	: "memory");
+
+	kernel_fpu_end();
+}
+
+static void
+xor_sse_5_pf64(unsigned long bytes, unsigned long * __restrict p1,
+	       const unsigned long * __restrict p2,
+	       const unsigned long * __restrict p3,
+	       const unsigned long * __restrict p4,
+	       const unsigned long * __restrict p5)
+{
+	unsigned long lines = bytes >> 8;
+
+	kernel_fpu_begin();
+
+	asm volatile(
+#undef BLOCK
+#define BLOCK(i)			\
+		BLK64(PF0, LD, i)	\
+		BLK64(PF1, XO1, i)	\
+		BLK64(PF2, XO2, i)	\
+		BLK64(PF3, XO3, i)	\
+		BLK64(PF4, XO4, i)	\
+		BLK64(NOP, ST, i)	\
+
+	" .align 32			;\n"
+	" 1:                            ;\n"
+
+		BLOCK(0)
+		BLOCK(4)
+		BLOCK(8)
+		BLOCK(12)
+
+	"       add %[inc], %[p1]       ;\n"
+	"       add %[inc], %[p2]       ;\n"
+	"       add %[inc], %[p3]       ;\n"
+	"       add %[inc], %[p4]       ;\n"
+	"       add %[inc], %[p5]       ;\n"
+	"       dec %[cnt]              ;\n"
+	"       jnz 1b                  ;\n"
+	: [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
+	  [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
+	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
+	: "memory");
+
+	kernel_fpu_end();
+}
+
+struct xor_block_template xor_block_sse = {
+	.name = "sse",
+	.do_2 = xor_sse_2,
+	.do_3 = xor_sse_3,
+	.do_4 = xor_sse_4,
+	.do_5 = xor_sse_5,
+};
+
+struct xor_block_template xor_block_sse_pf64 = {
+	.name = "prefetch64-sse",
+	.do_2 = xor_sse_2_pf64,
+	.do_3 = xor_sse_3_pf64,
+	.do_4 = xor_sse_4_pf64,
+	.do_5 = xor_sse_5_pf64,
+};

From 1da7d8fd154b38a67effa2920ae25bc95548c442 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 24 Mar 2026 07:21:55 +0100
Subject: [PATCH 19/26] xor: avoid indirect calls for arm64-optimized ops

Remove the inner xor_block_templates, and instead have two separate
actual template that call into the neon-enabled compilation unit.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Linux RISC-V bot <linux.riscv.bot@gmail.com>
---
 arch/arm64/include/asm/xor.h       | 13 ++--
 lib/raid/xor/arm64/xor-neon-glue.c | 95 +++++++++++++++---------------
 lib/raid/xor/arm64/xor-neon.c      | 73 +++++++++--------------
 lib/raid/xor/arm64/xor-neon.h      | 30 ++++++++++
 4 files changed, 114 insertions(+), 97 deletions(-)
 create mode 100644 lib/raid/xor/arm64/xor-neon.h

diff --git a/arch/arm64/include/asm/xor.h b/arch/arm64/include/asm/xor.h
index 81718f010761d8..4782c760bcac22 100644
--- a/arch/arm64/include/asm/xor.h
+++ b/arch/arm64/include/asm/xor.h
@@ -7,15 +7,18 @@
 #include <asm-generic/xor.h>
 #include <asm/simd.h>
 
-extern struct xor_block_template xor_block_arm64;
-void __init xor_neon_init(void);
+extern struct xor_block_template xor_block_neon;
+extern struct xor_block_template xor_block_eor3;
 
 #define arch_xor_init arch_xor_init
 static __always_inline void __init arch_xor_init(void)
 {
-	xor_neon_init();
 	xor_register(&xor_block_8regs);
 	xor_register(&xor_block_32regs);
-	if (cpu_has_neon())
-		xor_register(&xor_block_arm64);
+	if (cpu_has_neon()) {
+		if (cpu_have_named_feature(SHA3))
+			xor_register(&xor_block_eor3);
+		else
+			xor_register(&xor_block_neon);
+	}
 }
diff --git a/lib/raid/xor/arm64/xor-neon-glue.c b/lib/raid/xor/arm64/xor-neon-glue.c
index 067a2095659a45..08c3e3573388d4 100644
--- a/lib/raid/xor/arm64/xor-neon-glue.c
+++ b/lib/raid/xor/arm64/xor-neon-glue.c
@@ -7,51 +7,54 @@
 #include <linux/raid/xor_impl.h>
 #include <asm/simd.h>
 #include <asm/xor.h>
+#include "xor-neon.h"
 
-extern struct xor_block_template const xor_block_inner_neon;
-
-static void
-xor_neon_2(unsigned long bytes, unsigned long * __restrict p1,
-	   const unsigned long * __restrict p2)
-{
-	scoped_ksimd()
-		xor_block_inner_neon.do_2(bytes, p1, p2);
-}
-
-static void
-xor_neon_3(unsigned long bytes, unsigned long * __restrict p1,
-	   const unsigned long * __restrict p2,
-	   const unsigned long * __restrict p3)
-{
-	scoped_ksimd()
-		xor_block_inner_neon.do_3(bytes, p1, p2, p3);
-}
-
-static void
-xor_neon_4(unsigned long bytes, unsigned long * __restrict p1,
-	   const unsigned long * __restrict p2,
-	   const unsigned long * __restrict p3,
-	   const unsigned long * __restrict p4)
-{
-	scoped_ksimd()
-		xor_block_inner_neon.do_4(bytes, p1, p2, p3, p4);
-}
-
-static void
-xor_neon_5(unsigned long bytes, unsigned long * __restrict p1,
-	   const unsigned long * __restrict p2,
-	   const unsigned long * __restrict p3,
-	   const unsigned long * __restrict p4,
-	   const unsigned long * __restrict p5)
-{
-	scoped_ksimd()
-		xor_block_inner_neon.do_5(bytes, p1, p2, p3, p4, p5);
-}
-
-struct xor_block_template xor_block_arm64 = {
-	.name   = "arm64_neon",
-	.do_2   = xor_neon_2,
-	.do_3   = xor_neon_3,
-	.do_4   = xor_neon_4,
-	.do_5	= xor_neon_5
+#define XOR_TEMPLATE(_name)						\
+static void								\
+xor_##_name##_2(unsigned long bytes, unsigned long * __restrict p1,	\
+	   const unsigned long * __restrict p2)				\
+{									\
+	scoped_ksimd()							\
+		__xor_##_name##_2(bytes, p1, p2);			\
+}									\
+									\
+static void								\
+xor_##_name##_3(unsigned long bytes, unsigned long * __restrict p1,	\
+	   const unsigned long * __restrict p2,				\
+	   const unsigned long * __restrict p3)				\
+{									\
+	scoped_ksimd()							\
+		__xor_##_name##_3(bytes, p1, p2, p3);			\
+}									\
+									\
+static void								\
+xor_##_name##_4(unsigned long bytes, unsigned long * __restrict p1,	\
+	   const unsigned long * __restrict p2,				\
+	   const unsigned long * __restrict p3,				\
+	   const unsigned long * __restrict p4)				\
+{									\
+	scoped_ksimd()							\
+		__xor_##_name##_4(bytes, p1, p2, p3, p4);		\
+}									\
+									\
+static void								\
+xor_##_name##_5(unsigned long bytes, unsigned long * __restrict p1,	\
+	   const unsigned long * __restrict p2,				\
+	   const unsigned long * __restrict p3,				\
+	   const unsigned long * __restrict p4,				\
+	   const unsigned long * __restrict p5)				\
+{									\
+	scoped_ksimd()							\
+		__xor_##_name##_5(bytes, p1, p2, p3, p4, p5);		\
+}									\
+									\
+struct xor_block_template xor_block_##_name = {				\
+	.name   = __stringify(_name),					\
+	.do_2   = xor_##_name##_2,					\
+	.do_3   = xor_##_name##_3,					\
+	.do_4   = xor_##_name##_4,					\
+	.do_5	= xor_##_name##_5					\
 };
+
+XOR_TEMPLATE(neon);
+XOR_TEMPLATE(eor3);
diff --git a/lib/raid/xor/arm64/xor-neon.c b/lib/raid/xor/arm64/xor-neon.c
index 8d2d185090db76..61194c292917e5 100644
--- a/lib/raid/xor/arm64/xor-neon.c
+++ b/lib/raid/xor/arm64/xor-neon.c
@@ -8,9 +8,10 @@
 #include <linux/cache.h>
 #include <asm/neon-intrinsics.h>
 #include <asm/xor.h>
+#include "xor-neon.h"
 
-static void xor_arm64_neon_2(unsigned long bytes, unsigned long * __restrict p1,
-	const unsigned long * __restrict p2)
+void __xor_neon_2(unsigned long bytes, unsigned long * __restrict p1,
+		const unsigned long * __restrict p2)
 {
 	uint64_t *dp1 = (uint64_t *)p1;
 	uint64_t *dp2 = (uint64_t *)p2;
@@ -36,9 +37,9 @@ static void xor_arm64_neon_2(unsigned long bytes, unsigned long * __restrict p1,
 	} while (--lines > 0);
 }
 
-static void xor_arm64_neon_3(unsigned long bytes, unsigned long * __restrict p1,
-	const unsigned long * __restrict p2,
-	const unsigned long * __restrict p3)
+void __xor_neon_3(unsigned long bytes, unsigned long * __restrict p1,
+		const unsigned long * __restrict p2,
+		const unsigned long * __restrict p3)
 {
 	uint64_t *dp1 = (uint64_t *)p1;
 	uint64_t *dp2 = (uint64_t *)p2;
@@ -72,10 +73,10 @@ static void xor_arm64_neon_3(unsigned long bytes, unsigned long * __restrict p1,
 	} while (--lines > 0);
 }
 
-static void xor_arm64_neon_4(unsigned long bytes, unsigned long * __restrict p1,
-	const unsigned long * __restrict p2,
-	const unsigned long * __restrict p3,
-	const unsigned long * __restrict p4)
+void __xor_neon_4(unsigned long bytes, unsigned long * __restrict p1,
+		const unsigned long * __restrict p2,
+		const unsigned long * __restrict p3,
+		const unsigned long * __restrict p4)
 {
 	uint64_t *dp1 = (uint64_t *)p1;
 	uint64_t *dp2 = (uint64_t *)p2;
@@ -117,11 +118,11 @@ static void xor_arm64_neon_4(unsigned long bytes, unsigned long * __restrict p1,
 	} while (--lines > 0);
 }
 
-static void xor_arm64_neon_5(unsigned long bytes, unsigned long * __restrict p1,
-	const unsigned long * __restrict p2,
-	const unsigned long * __restrict p3,
-	const unsigned long * __restrict p4,
-	const unsigned long * __restrict p5)
+void __xor_neon_5(unsigned long bytes, unsigned long * __restrict p1,
+		const unsigned long * __restrict p2,
+		const unsigned long * __restrict p3,
+		const unsigned long * __restrict p4,
+		const unsigned long * __restrict p5)
 {
 	uint64_t *dp1 = (uint64_t *)p1;
 	uint64_t *dp2 = (uint64_t *)p2;
@@ -171,14 +172,6 @@ static void xor_arm64_neon_5(unsigned long bytes, unsigned long * __restrict p1,
 	} while (--lines > 0);
 }
 
-struct xor_block_template xor_block_inner_neon __ro_after_init = {
-	.name	= "__inner_neon__",
-	.do_2	= xor_arm64_neon_2,
-	.do_3	= xor_arm64_neon_3,
-	.do_4	= xor_arm64_neon_4,
-	.do_5	= xor_arm64_neon_5,
-};
-
 static inline uint64x2_t eor3(uint64x2_t p, uint64x2_t q, uint64x2_t r)
 {
 	uint64x2_t res;
@@ -189,10 +182,9 @@ static inline uint64x2_t eor3(uint64x2_t p, uint64x2_t q, uint64x2_t r)
 	return res;
 }
 
-static void xor_arm64_eor3_3(unsigned long bytes,
-	unsigned long * __restrict p1,
-	const unsigned long * __restrict p2,
-	const unsigned long * __restrict p3)
+void __xor_eor3_3(unsigned long bytes, unsigned long * __restrict p1,
+		const unsigned long * __restrict p2,
+		const unsigned long * __restrict p3)
 {
 	uint64_t *dp1 = (uint64_t *)p1;
 	uint64_t *dp2 = (uint64_t *)p2;
@@ -224,11 +216,10 @@ static void xor_arm64_eor3_3(unsigned long bytes,
 	} while (--lines > 0);
 }
 
-static void xor_arm64_eor3_4(unsigned long bytes,
-	unsigned long * __restrict p1,
-	const unsigned long * __restrict p2,
-	const unsigned long * __restrict p3,
-	const unsigned long * __restrict p4)
+void __xor_eor3_4(unsigned long bytes, unsigned long * __restrict p1,
+		const unsigned long * __restrict p2,
+		const unsigned long * __restrict p3,
+		const unsigned long * __restrict p4)
 {
 	uint64_t *dp1 = (uint64_t *)p1;
 	uint64_t *dp2 = (uint64_t *)p2;
@@ -268,12 +259,11 @@ static void xor_arm64_eor3_4(unsigned long bytes,
 	} while (--lines > 0);
 }
 
-static void xor_arm64_eor3_5(unsigned long bytes,
-	unsigned long * __restrict p1,
-	const unsigned long * __restrict p2,
-	const unsigned long * __restrict p3,
-	const unsigned long * __restrict p4,
-	const unsigned long * __restrict p5)
+void __xor_eor3_5(unsigned long bytes, unsigned long * __restrict p1,
+		const unsigned long * __restrict p2,
+		const unsigned long * __restrict p3,
+		const unsigned long * __restrict p4,
+		const unsigned long * __restrict p5)
 {
 	uint64_t *dp1 = (uint64_t *)p1;
 	uint64_t *dp2 = (uint64_t *)p2;
@@ -314,12 +304,3 @@ static void xor_arm64_eor3_5(unsigned long bytes,
 		dp5 += 8;
 	} while (--lines > 0);
 }
-
-void __init xor_neon_init(void)
-{
-	if (cpu_have_named_feature(SHA3)) {
-		xor_block_inner_neon.do_3 = xor_arm64_eor3_3;
-		xor_block_inner_neon.do_4 = xor_arm64_eor3_4;
-		xor_block_inner_neon.do_5 = xor_arm64_eor3_5;
-	}
-}
diff --git a/lib/raid/xor/arm64/xor-neon.h b/lib/raid/xor/arm64/xor-neon.h
new file mode 100644
index 00000000000000..cec0ac846feabd
--- /dev/null
+++ b/lib/raid/xor/arm64/xor-neon.h
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+void __xor_neon_2(unsigned long bytes, unsigned long * __restrict p1,
+		const unsigned long * __restrict p2);
+void __xor_neon_3(unsigned long bytes, unsigned long * __restrict p1,
+		const unsigned long * __restrict p2,
+		const unsigned long * __restrict p3);
+void __xor_neon_4(unsigned long bytes, unsigned long * __restrict p1,
+		const unsigned long * __restrict p2,
+		const unsigned long * __restrict p3,
+		const unsigned long * __restrict p4);
+void __xor_neon_5(unsigned long bytes, unsigned long * __restrict p1,
+		const unsigned long * __restrict p2,
+		const unsigned long * __restrict p3,
+		const unsigned long * __restrict p4,
+		const unsigned long * __restrict p5);
+
+#define __xor_eor3_2	__xor_neon_2
+void __xor_eor3_3(unsigned long bytes, unsigned long * __restrict p1,
+		const unsigned long * __restrict p2,
+		const unsigned long * __restrict p3);
+void __xor_eor3_4(unsigned long bytes, unsigned long * __restrict p1,
+		const unsigned long * __restrict p2,
+		const unsigned long * __restrict p3,
+		const unsigned long * __restrict p4);
+void __xor_eor3_5(unsigned long bytes, unsigned long * __restrict p1,
+		const unsigned long * __restrict p2,
+		const unsigned long * __restrict p3,
+		const unsigned long * __restrict p4,
+		const unsigned long * __restrict p5);

From b1c0b148f509111a7243cb2a416c9e629f04d803 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 24 Mar 2026 07:21:56 +0100
Subject: [PATCH 20/26] xor: make xor.ko self-contained in lib/raid/

Move the asm/xor.h headers to lib/raid/xor/$(SRCARCH)/xor_arch.h and
include/linux/raid/xor_impl.h to lib/raid/xor/xor_impl.h so that the
xor.ko module implementation is self-contained in lib/raid/.

As this remove the asm-generic mechanism a new kconfig symbol is
added to indicate that a architecture-specific implementations
exists, and xor_arch.h should be included.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Linux RISC-V bot <linux.riscv.bot@gmail.com>
---
 arch/um/include/asm/xor.h                      |  8 --------
 include/asm-generic/Kbuild                     |  1 -
 include/asm-generic/xor.h                      | 11 -----------
 lib/raid/Kconfig                               | 15 +++++++++++++++
 lib/raid/xor/Makefile                          |  6 ++++++
 lib/raid/xor/alpha/xor.c                       |  4 ++--
 .../asm/xor.h => lib/raid/xor/alpha/xor_arch.h |  2 --
 lib/raid/xor/arm/xor-neon-glue.c               |  4 ++--
 lib/raid/xor/arm/xor-neon.c                    |  2 +-
 lib/raid/xor/arm/xor.c                         |  4 ++--
 .../asm/xor.h => lib/raid/xor/arm/xor_arch.h   |  2 --
 lib/raid/xor/arm64/xor-neon-glue.c             |  4 ++--
 lib/raid/xor/arm64/xor-neon.c                  |  4 ++--
 .../asm/xor.h => lib/raid/xor/arm64/xor_arch.h |  3 ---
 .../xor.h => lib/raid/xor/loongarch/xor_arch.h |  7 -------
 lib/raid/xor/loongarch/xor_simd_glue.c         |  4 ++--
 .../xor.h => lib/raid/xor/powerpc/xor_arch.h   |  7 -------
 lib/raid/xor/powerpc/xor_vmx_glue.c            |  4 ++--
 lib/raid/xor/riscv/xor-glue.c                  |  4 ++--
 .../asm/xor.h => lib/raid/xor/riscv/xor_arch.h |  2 --
 lib/raid/xor/s390/xor.c                        |  4 ++--
 .../asm/xor.h => lib/raid/xor/s390/xor_arch.h  |  6 ------
 lib/raid/xor/sparc/xor-sparc32.c               |  4 ++--
 lib/raid/xor/sparc/xor-sparc64-glue.c          |  4 ++--
 .../asm/xor.h => lib/raid/xor/sparc/xor_arch.h |  9 ---------
 lib/raid/xor/um/xor_arch.h                     |  2 ++
 lib/raid/xor/x86/xor-avx.c                     |  4 ++--
 lib/raid/xor/x86/xor-mmx.c                     |  4 ++--
 lib/raid/xor/x86/xor-sse.c                     |  4 ++--
 .../asm/xor.h => lib/raid/xor/x86/xor_arch.h   |  7 -------
 lib/raid/xor/xor-32regs-prefetch.c             |  3 +--
 lib/raid/xor/xor-32regs.c                      |  3 +--
 lib/raid/xor/xor-8regs-prefetch.c              |  3 +--
 lib/raid/xor/xor-8regs.c                       |  3 +--
 lib/raid/xor/xor-core.c                        | 18 +++++++++++-------
 .../linux/raid => lib/raid/xor}/xor_impl.h     |  6 ++++++
 36 files changed, 73 insertions(+), 109 deletions(-)
 delete mode 100644 arch/um/include/asm/xor.h
 delete mode 100644 include/asm-generic/xor.h
 rename arch/alpha/include/asm/xor.h => lib/raid/xor/alpha/xor_arch.h (90%)
 rename arch/arm/include/asm/xor.h => lib/raid/xor/arm/xor_arch.h (87%)
 rename arch/arm64/include/asm/xor.h => lib/raid/xor/arm64/xor_arch.h (89%)
 rename arch/loongarch/include/asm/xor.h => lib/raid/xor/loongarch/xor_arch.h (85%)
 rename arch/powerpc/include/asm/xor.h => lib/raid/xor/powerpc/xor_arch.h (77%)
 rename arch/riscv/include/asm/xor.h => lib/raid/xor/riscv/xor_arch.h (84%)
 rename arch/s390/include/asm/xor.h => lib/raid/xor/s390/xor_arch.h (71%)
 rename arch/sparc/include/asm/xor.h => lib/raid/xor/sparc/xor_arch.h (81%)
 create mode 100644 lib/raid/xor/um/xor_arch.h
 rename arch/x86/include/asm/xor.h => lib/raid/xor/x86/xor_arch.h (89%)
 rename {include/linux/raid => lib/raid/xor}/xor_impl.h (80%)

diff --git a/arch/um/include/asm/xor.h b/arch/um/include/asm/xor.h
deleted file mode 100644
index 99e5c7e1f47574..00000000000000
--- a/arch/um/include/asm/xor.h
+++ /dev/null
@@ -1,8 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_UM_XOR_H
-#define _ASM_UM_XOR_H
-
-#include <asm/cpufeature.h>
-#include <../../x86/include/asm/xor.h>
-
-#endif
diff --git a/include/asm-generic/Kbuild b/include/asm-generic/Kbuild
index 9aff61e7b8f27c..2c53a1e0b76041 100644
--- a/include/asm-generic/Kbuild
+++ b/include/asm-generic/Kbuild
@@ -65,4 +65,3 @@ mandatory-y += vermagic.h
 mandatory-y += vga.h
 mandatory-y += video.h
 mandatory-y += word-at-a-time.h
-mandatory-y += xor.h
diff --git a/include/asm-generic/xor.h b/include/asm-generic/xor.h
deleted file mode 100644
index fc151fdc45ab09..00000000000000
--- a/include/asm-generic/xor.h
+++ /dev/null
@@ -1,11 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * include/asm-generic/xor.h
- *
- * Generic optimized RAID-5 checksumming functions.
- */
-
-extern struct xor_block_template xor_block_8regs;
-extern struct xor_block_template xor_block_32regs;
-extern struct xor_block_template xor_block_8regs_p;
-extern struct xor_block_template xor_block_32regs_p;
diff --git a/lib/raid/Kconfig b/lib/raid/Kconfig
index 01b73a1c303fee..2ccf0c60de01f0 100644
--- a/lib/raid/Kconfig
+++ b/lib/raid/Kconfig
@@ -2,3 +2,18 @@
 
 config XOR_BLOCKS
 	tristate
+
+# selected by architectures that provide an optimized XOR implementation
+config XOR_BLOCKS_ARCH
+	depends on XOR_BLOCKS
+	default y if ALPHA
+	default y if ARM && KERNEL_MODE_NEON
+	default y if ARM64
+	default y if CPU_HAS_LSX		# loongarch
+	default y if ALTIVEC			# powerpc
+	default y if RISCV_ISA_V
+	default y if SPARC
+	default y if S390
+	default y if X86_32
+	default y if X86_64
+	bool
diff --git a/lib/raid/xor/Makefile b/lib/raid/xor/Makefile
index 05aca96041b3ac..df55823c4d827c 100644
--- a/lib/raid/xor/Makefile
+++ b/lib/raid/xor/Makefile
@@ -1,5 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0
 
+ccflags-y			+= -I $(src)
+
 obj-$(CONFIG_XOR_BLOCKS)	+= xor.o
 
 xor-y				+= xor-core.o
@@ -8,6 +10,10 @@ xor-y				+= xor-32regs.o
 xor-y				+= xor-8regs-prefetch.o
 xor-y				+= xor-32regs-prefetch.o
 
+ifeq ($(CONFIG_XOR_BLOCKS_ARCH),y)
+CFLAGS_xor-core.o		+= -I$(src)/$(SRCARCH)
+endif
+
 xor-$(CONFIG_ALPHA)		+= alpha/xor.o
 xor-$(CONFIG_ARM)		+= arm/xor.o
 ifeq ($(CONFIG_ARM),y)
diff --git a/lib/raid/xor/alpha/xor.c b/lib/raid/xor/alpha/xor.c
index 0964ac42060405..90694cc47395b9 100644
--- a/lib/raid/xor/alpha/xor.c
+++ b/lib/raid/xor/alpha/xor.c
@@ -2,8 +2,8 @@
 /*
  * Optimized XOR parity functions for alpha EV5 and EV6
  */
-#include <linux/raid/xor_impl.h>
-#include <asm/xor.h>
+#include "xor_impl.h"
+#include "xor_arch.h"
 
 extern void
 xor_alpha_2(unsigned long bytes, unsigned long * __restrict p1,
diff --git a/arch/alpha/include/asm/xor.h b/lib/raid/xor/alpha/xor_arch.h
similarity index 90%
rename from arch/alpha/include/asm/xor.h
rename to lib/raid/xor/alpha/xor_arch.h
index e517be577a09ff..0dcfea578a488a 100644
--- a/arch/alpha/include/asm/xor.h
+++ b/lib/raid/xor/alpha/xor_arch.h
@@ -1,7 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
 
 #include <asm/special_insns.h>
-#include <asm-generic/xor.h>
 
 extern struct xor_block_template xor_block_alpha;
 extern struct xor_block_template xor_block_alpha_prefetch;
@@ -10,7 +9,6 @@ extern struct xor_block_template xor_block_alpha_prefetch;
  * Force the use of alpha_prefetch if EV6, as it is significantly faster in the
  * cold cache case.
  */
-#define arch_xor_init arch_xor_init
 static __always_inline void __init arch_xor_init(void)
 {
 	if (implver() == IMPLVER_EV6) {
diff --git a/lib/raid/xor/arm/xor-neon-glue.c b/lib/raid/xor/arm/xor-neon-glue.c
index c7b162b383a25b..7afd6294464bc5 100644
--- a/lib/raid/xor/arm/xor-neon-glue.c
+++ b/lib/raid/xor/arm/xor-neon-glue.c
@@ -2,8 +2,8 @@
 /*
  *  Copyright (C) 2001 Russell King
  */
-#include <linux/raid/xor_impl.h>
-#include <asm/xor.h>
+#include "xor_impl.h"
+#include "xor_arch.h"
 
 extern struct xor_block_template const xor_block_neon_inner;
 
diff --git a/lib/raid/xor/arm/xor-neon.c b/lib/raid/xor/arm/xor-neon.c
index c9d4378b0f0ec2..806a42c5952c50 100644
--- a/lib/raid/xor/arm/xor-neon.c
+++ b/lib/raid/xor/arm/xor-neon.c
@@ -3,7 +3,7 @@
  * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org>
  */
 
-#include <linux/raid/xor_impl.h>
+#include "xor_impl.h"
 
 #ifndef __ARM_NEON__
 #error You should compile this file with '-march=armv7-a -mfloat-abi=softfp -mfpu=neon'
diff --git a/lib/raid/xor/arm/xor.c b/lib/raid/xor/arm/xor.c
index 2263341dbbcdf9..5bd5f048bbe9ea 100644
--- a/lib/raid/xor/arm/xor.c
+++ b/lib/raid/xor/arm/xor.c
@@ -2,8 +2,8 @@
 /*
  *  Copyright (C) 2001 Russell King
  */
-#include <linux/raid/xor_impl.h>
-#include <asm/xor.h>
+#include "xor_impl.h"
+#include "xor_arch.h"
 
 #define __XOR(a1, a2) a1 ^= a2
 
diff --git a/arch/arm/include/asm/xor.h b/lib/raid/xor/arm/xor_arch.h
similarity index 87%
rename from arch/arm/include/asm/xor.h
rename to lib/raid/xor/arm/xor_arch.h
index 989c55872ef6a5..5a7eedb48fbb94 100644
--- a/arch/arm/include/asm/xor.h
+++ b/lib/raid/xor/arm/xor_arch.h
@@ -2,13 +2,11 @@
 /*
  *  Copyright (C) 2001 Russell King
  */
-#include <asm-generic/xor.h>
 #include <asm/neon.h>
 
 extern struct xor_block_template xor_block_arm4regs;
 extern struct xor_block_template xor_block_neon;
 
-#define arch_xor_init arch_xor_init
 static __always_inline void __init arch_xor_init(void)
 {
 	xor_register(&xor_block_arm4regs);
diff --git a/lib/raid/xor/arm64/xor-neon-glue.c b/lib/raid/xor/arm64/xor-neon-glue.c
index 08c3e3573388d4..3db0a318cf5bb5 100644
--- a/lib/raid/xor/arm64/xor-neon-glue.c
+++ b/lib/raid/xor/arm64/xor-neon-glue.c
@@ -4,9 +4,9 @@
  * Copyright (C) 2018,Tianjin KYLIN Information Technology Co., Ltd.
  */
 
-#include <linux/raid/xor_impl.h>
 #include <asm/simd.h>
-#include <asm/xor.h>
+#include "xor_impl.h"
+#include "xor_arch.h"
 #include "xor-neon.h"
 
 #define XOR_TEMPLATE(_name)						\
diff --git a/lib/raid/xor/arm64/xor-neon.c b/lib/raid/xor/arm64/xor-neon.c
index 61194c292917e5..61f00c4fee4952 100644
--- a/lib/raid/xor/arm64/xor-neon.c
+++ b/lib/raid/xor/arm64/xor-neon.c
@@ -4,10 +4,10 @@
  * Copyright (C) 2018,Tianjin KYLIN Information Technology Co., Ltd.
  */
 
-#include <linux/raid/xor_impl.h>
 #include <linux/cache.h>
 #include <asm/neon-intrinsics.h>
-#include <asm/xor.h>
+#include "xor_impl.h"
+#include "xor_arch.h"
 #include "xor-neon.h"
 
 void __xor_neon_2(unsigned long bytes, unsigned long * __restrict p1,
diff --git a/arch/arm64/include/asm/xor.h b/lib/raid/xor/arm64/xor_arch.h
similarity index 89%
rename from arch/arm64/include/asm/xor.h
rename to lib/raid/xor/arm64/xor_arch.h
index 4782c760bcac22..5dbb40319501b7 100644
--- a/arch/arm64/include/asm/xor.h
+++ b/lib/raid/xor/arm64/xor_arch.h
@@ -3,14 +3,11 @@
  * Authors: Jackie Liu <liuyun01@kylinos.cn>
  * Copyright (C) 2018,Tianjin KYLIN Information Technology Co., Ltd.
  */
-
-#include <asm-generic/xor.h>
 #include <asm/simd.h>
 
 extern struct xor_block_template xor_block_neon;
 extern struct xor_block_template xor_block_eor3;
 
-#define arch_xor_init arch_xor_init
 static __always_inline void __init arch_xor_init(void)
 {
 	xor_register(&xor_block_8regs);
diff --git a/arch/loongarch/include/asm/xor.h b/lib/raid/xor/loongarch/xor_arch.h
similarity index 85%
rename from arch/loongarch/include/asm/xor.h
rename to lib/raid/xor/loongarch/xor_arch.h
index 7e32f72f8b036d..fe5e8244fd0ebd 100644
--- a/arch/loongarch/include/asm/xor.h
+++ b/lib/raid/xor/loongarch/xor_arch.h
@@ -2,9 +2,6 @@
 /*
  * Copyright (C) 2023 WANG Xuerui <git@xen0n.name>
  */
-#ifndef _ASM_LOONGARCH_XOR_H
-#define _ASM_LOONGARCH_XOR_H
-
 #include <asm/cpu-features.h>
 
 /*
@@ -15,12 +12,10 @@
  * the scalar ones, maybe for errata or micro-op reasons. It may be
  * appropriate to revisit this after one or two more uarch generations.
  */
-#include <asm-generic/xor.h>
 
 extern struct xor_block_template xor_block_lsx;
 extern struct xor_block_template xor_block_lasx;
 
-#define arch_xor_init arch_xor_init
 static __always_inline void __init arch_xor_init(void)
 {
 	xor_register(&xor_block_8regs);
@@ -36,5 +31,3 @@ static __always_inline void __init arch_xor_init(void)
 		xor_register(&xor_block_lasx);
 #endif
 }
-
-#endif /* _ASM_LOONGARCH_XOR_H */
diff --git a/lib/raid/xor/loongarch/xor_simd_glue.c b/lib/raid/xor/loongarch/xor_simd_glue.c
index 11fa3b47ba83a3..b387aa0213b471 100644
--- a/lib/raid/xor/loongarch/xor_simd_glue.c
+++ b/lib/raid/xor/loongarch/xor_simd_glue.c
@@ -6,9 +6,9 @@
  */
 
 #include <linux/sched.h>
-#include <linux/raid/xor_impl.h>
 #include <asm/fpu.h>
-#include <asm/xor.h>
+#include "xor_impl.h"
+#include "xor_arch.h"
 #include "xor_simd.h"
 
 #define MAKE_XOR_GLUE_2(flavor)							\
diff --git a/arch/powerpc/include/asm/xor.h b/lib/raid/xor/powerpc/xor_arch.h
similarity index 77%
rename from arch/powerpc/include/asm/xor.h
rename to lib/raid/xor/powerpc/xor_arch.h
index 3293ac87181c33..3b00a4a2fd67cc 100644
--- a/arch/powerpc/include/asm/xor.h
+++ b/lib/raid/xor/powerpc/xor_arch.h
@@ -5,15 +5,10 @@
  *
  * Author: Anton Blanchard <anton@au.ibm.com>
  */
-#ifndef _ASM_POWERPC_XOR_H
-#define _ASM_POWERPC_XOR_H
-
 #include <asm/cpu_has_feature.h>
-#include <asm-generic/xor.h>
 
 extern struct xor_block_template xor_block_altivec;
 
-#define arch_xor_init arch_xor_init
 static __always_inline void __init arch_xor_init(void)
 {
 	xor_register(&xor_block_8regs);
@@ -25,5 +20,3 @@ static __always_inline void __init arch_xor_init(void)
 		xor_register(&xor_block_altivec);
 #endif
 }
-
-#endif /* _ASM_POWERPC_XOR_H */
diff --git a/lib/raid/xor/powerpc/xor_vmx_glue.c b/lib/raid/xor/powerpc/xor_vmx_glue.c
index c41e3834070042..56e99ddfb64f61 100644
--- a/lib/raid/xor/powerpc/xor_vmx_glue.c
+++ b/lib/raid/xor/powerpc/xor_vmx_glue.c
@@ -7,9 +7,9 @@
 
 #include <linux/preempt.h>
 #include <linux/sched.h>
-#include <linux/raid/xor_impl.h>
 #include <asm/switch_to.h>
-#include <asm/xor.h>
+#include "xor_impl.h"
+#include "xor_arch.h"
 #include "xor_vmx.h"
 
 static void xor_altivec_2(unsigned long bytes, unsigned long * __restrict p1,
diff --git a/lib/raid/xor/riscv/xor-glue.c b/lib/raid/xor/riscv/xor-glue.c
index 11666a4b6b6872..060e5f22ebcce1 100644
--- a/lib/raid/xor/riscv/xor-glue.c
+++ b/lib/raid/xor/riscv/xor-glue.c
@@ -3,11 +3,11 @@
  * Copyright (C) 2021 SiFive
  */
 
-#include <linux/raid/xor_impl.h>
 #include <asm/vector.h>
 #include <asm/switch_to.h>
 #include <asm/asm-prototypes.h>
-#include <asm/xor.h>
+#include "xor_impl.h"
+#include "xor_arch.h"
 
 static void xor_vector_2(unsigned long bytes, unsigned long *__restrict p1,
 			 const unsigned long *__restrict p2)
diff --git a/arch/riscv/include/asm/xor.h b/lib/raid/xor/riscv/xor_arch.h
similarity index 84%
rename from arch/riscv/include/asm/xor.h
rename to lib/raid/xor/riscv/xor_arch.h
index 614d9209d07823..9240857d760b26 100644
--- a/arch/riscv/include/asm/xor.h
+++ b/lib/raid/xor/riscv/xor_arch.h
@@ -3,11 +3,9 @@
  * Copyright (C) 2021 SiFive
  */
 #include <asm/vector.h>
-#include <asm-generic/xor.h>
 
 extern struct xor_block_template xor_block_rvv;
 
-#define arch_xor_init arch_xor_init
 static __always_inline void __init arch_xor_init(void)
 {
 	xor_register(&xor_block_8regs);
diff --git a/lib/raid/xor/s390/xor.c b/lib/raid/xor/s390/xor.c
index 2d5e33eca2a8af..48b8cdc684a34b 100644
--- a/lib/raid/xor/s390/xor.c
+++ b/lib/raid/xor/s390/xor.c
@@ -7,8 +7,8 @@
  */
 
 #include <linux/types.h>
-#include <linux/raid/xor_impl.h>
-#include <asm/xor.h>
+#include "xor_impl.h"
+#include "xor_arch.h"
 
 static void xor_xc_2(unsigned long bytes, unsigned long * __restrict p1,
 		     const unsigned long * __restrict p2)
diff --git a/arch/s390/include/asm/xor.h b/lib/raid/xor/s390/xor_arch.h
similarity index 71%
rename from arch/s390/include/asm/xor.h
rename to lib/raid/xor/s390/xor_arch.h
index 4e2233f64da98d..4a233ed2b97a6a 100644
--- a/arch/s390/include/asm/xor.h
+++ b/lib/raid/xor/s390/xor_arch.h
@@ -5,15 +5,9 @@
  * Copyright IBM Corp. 2016
  * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
  */
-#ifndef _ASM_S390_XOR_H
-#define _ASM_S390_XOR_H
-
 extern struct xor_block_template xor_block_xc;
 
-#define arch_xor_init arch_xor_init
 static __always_inline void __init arch_xor_init(void)
 {
 	xor_force(&xor_block_xc);
 }
-
-#endif /* _ASM_S390_XOR_H */
diff --git a/lib/raid/xor/sparc/xor-sparc32.c b/lib/raid/xor/sparc/xor-sparc32.c
index b65a75a6e59d8f..307c4a84f535d4 100644
--- a/lib/raid/xor/sparc/xor-sparc32.c
+++ b/lib/raid/xor/sparc/xor-sparc32.c
@@ -5,8 +5,8 @@
  *
  * Copyright (C) 1999 Jakub Jelinek (jj@ultra.linux.cz)
  */
-#include <linux/raid/xor_impl.h>
-#include <asm/xor.h>
+#include "xor_impl.h"
+#include "xor_arch.h"
 
 static void
 sparc_2(unsigned long bytes, unsigned long * __restrict p1,
diff --git a/lib/raid/xor/sparc/xor-sparc64-glue.c b/lib/raid/xor/sparc/xor-sparc64-glue.c
index 3c67c8c3a0e821..5f90c2460b54cf 100644
--- a/lib/raid/xor/sparc/xor-sparc64-glue.c
+++ b/lib/raid/xor/sparc/xor-sparc64-glue.c
@@ -8,8 +8,8 @@
  * Copyright (C) 2006 David S. Miller <davem@davemloft.net>
  */
 
-#include <linux/raid/xor_impl.h>
-#include <asm/xor.h>
+#include "xor_impl.h"
+#include "xor_arch.h"
 
 void xor_vis_2(unsigned long bytes, unsigned long * __restrict p1,
 	       const unsigned long * __restrict p2);
diff --git a/arch/sparc/include/asm/xor.h b/lib/raid/xor/sparc/xor_arch.h
similarity index 81%
rename from arch/sparc/include/asm/xor.h
rename to lib/raid/xor/sparc/xor_arch.h
index f923b009fc24fa..af288abe4e9176 100644
--- a/arch/sparc/include/asm/xor.h
+++ b/lib/raid/xor/sparc/xor_arch.h
@@ -3,16 +3,12 @@
  * Copyright (C) 1997, 1999 Jakub Jelinek (jj@ultra.linux.cz)
  * Copyright (C) 2006 David S. Miller <davem@davemloft.net>
  */
-#ifndef ___ASM_SPARC_XOR_H
-#define ___ASM_SPARC_XOR_H
-
 #if defined(__sparc__) && defined(__arch64__)
 #include <asm/spitfire.h>
 
 extern struct xor_block_template xor_block_VIS;
 extern struct xor_block_template xor_block_niagara;
 
-#define arch_xor_init arch_xor_init
 static __always_inline void __init arch_xor_init(void)
 {
 	/* Force VIS for everything except Niagara.  */
@@ -28,12 +24,8 @@ static __always_inline void __init arch_xor_init(void)
 }
 #else /* sparc64 */
 
-/* For grins, also test the generic routines.  */
-#include <asm-generic/xor.h>
-
 extern struct xor_block_template xor_block_SPARC;
 
-#define arch_xor_init arch_xor_init
 static __always_inline void __init arch_xor_init(void)
 {
 	xor_register(&xor_block_8regs);
@@ -41,4 +33,3 @@ static __always_inline void __init arch_xor_init(void)
 	xor_register(&xor_block_SPARC);
 }
 #endif /* !sparc64 */
-#endif /* ___ASM_SPARC_XOR_H */
diff --git a/lib/raid/xor/um/xor_arch.h b/lib/raid/xor/um/xor_arch.h
new file mode 100644
index 00000000000000..a33e57a26c5ed7
--- /dev/null
+++ b/lib/raid/xor/um/xor_arch.h
@@ -0,0 +1,2 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <../x86/xor_arch.h>
diff --git a/lib/raid/xor/x86/xor-avx.c b/lib/raid/xor/x86/xor-avx.c
index b49cb5199e709e..d411efa1ff4355 100644
--- a/lib/raid/xor/x86/xor-avx.c
+++ b/lib/raid/xor/x86/xor-avx.c
@@ -8,9 +8,9 @@
  * Based on Ingo Molnar and Zach Brown's respective MMX and SSE routines
  */
 #include <linux/compiler.h>
-#include <linux/raid/xor_impl.h>
 #include <asm/fpu/api.h>
-#include <asm/xor.h>
+#include "xor_impl.h"
+#include "xor_arch.h"
 
 #define BLOCK4(i) \
 		BLOCK(32 * i, 0) \
diff --git a/lib/raid/xor/x86/xor-mmx.c b/lib/raid/xor/x86/xor-mmx.c
index cf0fafea33b72b..e48c58f9287408 100644
--- a/lib/raid/xor/x86/xor-mmx.c
+++ b/lib/raid/xor/x86/xor-mmx.c
@@ -4,9 +4,9 @@
  *
  * Copyright (C) 1998 Ingo Molnar.
  */
-#include <linux/raid/xor_impl.h>
 #include <asm/fpu/api.h>
-#include <asm/xor.h>
+#include "xor_impl.h"
+#include "xor_arch.h"
 
 #define LD(x, y)	"       movq   8*("#x")(%1), %%mm"#y"   ;\n"
 #define ST(x, y)	"       movq %%mm"#y",   8*("#x")(%1)   ;\n"
diff --git a/lib/raid/xor/x86/xor-sse.c b/lib/raid/xor/x86/xor-sse.c
index 0e727ced8b0051..5993ed688c1553 100644
--- a/lib/raid/xor/x86/xor-sse.c
+++ b/lib/raid/xor/x86/xor-sse.c
@@ -12,9 +12,9 @@
  * x86-64 changes / gcc fixes from Andi Kleen.
  * Copyright 2002 Andi Kleen, SuSE Labs.
  */
-#include <linux/raid/xor_impl.h>
 #include <asm/fpu/api.h>
-#include <asm/xor.h>
+#include "xor_impl.h"
+#include "xor_arch.h"
 
 #ifdef CONFIG_X86_32
 /* reduce register pressure */
diff --git a/arch/x86/include/asm/xor.h b/lib/raid/xor/x86/xor_arch.h
similarity index 89%
rename from arch/x86/include/asm/xor.h
rename to lib/raid/xor/x86/xor_arch.h
index d1aab8275908e7..99fe85a213c669 100644
--- a/arch/x86/include/asm/xor.h
+++ b/lib/raid/xor/x86/xor_arch.h
@@ -1,9 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
-#ifndef _ASM_X86_XOR_H
-#define _ASM_X86_XOR_H
-
 #include <asm/cpufeature.h>
-#include <asm-generic/xor.h>
 
 extern struct xor_block_template xor_block_pII_mmx;
 extern struct xor_block_template xor_block_p5_mmx;
@@ -20,7 +16,6 @@ extern struct xor_block_template xor_block_avx;
  *
  * 32-bit without MMX can fall back to the generic routines.
  */
-#define arch_xor_init arch_xor_init
 static __always_inline void __init arch_xor_init(void)
 {
 	if (boot_cpu_has(X86_FEATURE_AVX) &&
@@ -39,5 +34,3 @@ static __always_inline void __init arch_xor_init(void)
 		xor_register(&xor_block_32regs_p);
 	}
 }
-
-#endif /* _ASM_X86_XOR_H */
diff --git a/lib/raid/xor/xor-32regs-prefetch.c b/lib/raid/xor/xor-32regs-prefetch.c
index 8666c287f7775b..2856a8e50cb840 100644
--- a/lib/raid/xor/xor-32regs-prefetch.c
+++ b/lib/raid/xor/xor-32regs-prefetch.c
@@ -1,7 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
 #include <linux/prefetch.h>
-#include <linux/raid/xor_impl.h>
-#include <asm-generic/xor.h>
+#include "xor_impl.h"
 
 static void
 xor_32regs_p_2(unsigned long bytes, unsigned long * __restrict p1,
diff --git a/lib/raid/xor/xor-32regs.c b/lib/raid/xor/xor-32regs.c
index 58d4fac43eb44c..cc44d64032fa77 100644
--- a/lib/raid/xor/xor-32regs.c
+++ b/lib/raid/xor/xor-32regs.c
@@ -1,6 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-#include <linux/raid/xor_impl.h>
-#include <asm-generic/xor.h>
+#include "xor_impl.h"
 
 static void
 xor_32regs_2(unsigned long bytes, unsigned long * __restrict p1,
diff --git a/lib/raid/xor/xor-8regs-prefetch.c b/lib/raid/xor/xor-8regs-prefetch.c
index 67061e35a0a67f..1d53aec50d27b6 100644
--- a/lib/raid/xor/xor-8regs-prefetch.c
+++ b/lib/raid/xor/xor-8regs-prefetch.c
@@ -1,7 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
 #include <linux/prefetch.h>
-#include <linux/raid/xor_impl.h>
-#include <asm-generic/xor.h>
+#include "xor_impl.h"
 
 static void
 xor_8regs_p_2(unsigned long bytes, unsigned long * __restrict p1,
diff --git a/lib/raid/xor/xor-8regs.c b/lib/raid/xor/xor-8regs.c
index 769f796ab2cf4b..72a44e898c5513 100644
--- a/lib/raid/xor/xor-8regs.c
+++ b/lib/raid/xor/xor-8regs.c
@@ -1,6 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-#include <linux/raid/xor_impl.h>
-#include <asm-generic/xor.h>
+#include "xor_impl.h"
 
 static void
 xor_8regs_2(unsigned long bytes, unsigned long * __restrict p1,
diff --git a/lib/raid/xor/xor-core.c b/lib/raid/xor/xor-core.c
index 01a42995b7a528..47e09ae954b2b5 100644
--- a/lib/raid/xor/xor-core.c
+++ b/lib/raid/xor/xor-core.c
@@ -9,10 +9,9 @@
 #include <linux/module.h>
 #include <linux/gfp.h>
 #include <linux/raid/xor.h>
-#include <linux/raid/xor_impl.h>
 #include <linux/jiffies.h>
 #include <linux/preempt.h>
-#include <asm/xor.h>
+#include "xor_impl.h"
 
 /* The xor routines to use.  */
 static struct xor_block_template *active_template;
@@ -141,16 +140,21 @@ static int __init calibrate_xor_blocks(void)
 	return 0;
 }
 
-static int __init xor_init(void)
-{
-#ifdef arch_xor_init
-	arch_xor_init();
+#ifdef CONFIG_XOR_BLOCKS_ARCH
+#include "xor_arch.h" /* $SRCARCH/xor_arch.h */
 #else
+static void __init arch_xor_init(void)
+{
 	xor_register(&xor_block_8regs);
 	xor_register(&xor_block_8regs_p);
 	xor_register(&xor_block_32regs);
 	xor_register(&xor_block_32regs_p);
-#endif
+}
+#endif /* CONFIG_XOR_BLOCKS_ARCH */
+
+static int __init xor_init(void)
+{
+	arch_xor_init();
 
 	/*
 	 * If this arch/cpu has a short-circuited selection, don't loop through
diff --git a/include/linux/raid/xor_impl.h b/lib/raid/xor/xor_impl.h
similarity index 80%
rename from include/linux/raid/xor_impl.h
rename to lib/raid/xor/xor_impl.h
index 6ed4c445ab24ce..44b6c99e2093e1 100644
--- a/include/linux/raid/xor_impl.h
+++ b/lib/raid/xor/xor_impl.h
@@ -24,6 +24,12 @@ struct xor_block_template {
 		     const unsigned long * __restrict);
 };
 
+/* generic implementations */
+extern struct xor_block_template xor_block_8regs;
+extern struct xor_block_template xor_block_32regs;
+extern struct xor_block_template xor_block_8regs_p;
+extern struct xor_block_template xor_block_32regs_p;
+
 void __init xor_register(struct xor_block_template *tmpl);
 void __init xor_force(struct xor_block_template *tmpl);
 

From a475882cbef76a1238d37eb02656c53d7c34fe08 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 24 Mar 2026 07:21:57 +0100
Subject: [PATCH 21/26] xor: add a better public API

xor_blocks is very annoying to use, because it is limited to 4 + 1
sources / destinations, has an odd argument order and is completely
undocumented.

Lift the code that loops around it from btrfs and async_tx/async_xor into
common code under the name xor_gen and properly document it.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Linux RISC-V bot <linux.riscv.bot@gmail.com>
---
 include/linux/raid/xor.h |  2 ++
 lib/raid/xor/xor-core.c  | 32 ++++++++++++++++++++++++++++++++
 2 files changed, 34 insertions(+)

diff --git a/include/linux/raid/xor.h b/include/linux/raid/xor.h
index 02bda8d995347b..6d9a39fd85ddab 100644
--- a/include/linux/raid/xor.h
+++ b/include/linux/raid/xor.h
@@ -7,4 +7,6 @@
 extern void xor_blocks(unsigned int count, unsigned int bytes,
 	void *dest, void **srcs);
 
+void xor_gen(void *dest, void **srcs, unsigned int src_cnt, unsigned int bytes);
+
 #endif /* _XOR_H */
diff --git a/lib/raid/xor/xor-core.c b/lib/raid/xor/xor-core.c
index 47e09ae954b2b5..fdd0cf5584e7be 100644
--- a/lib/raid/xor/xor-core.c
+++ b/lib/raid/xor/xor-core.c
@@ -46,6 +46,38 @@ xor_blocks(unsigned int src_count, unsigned int bytes, void *dest, void **srcs)
 }
 EXPORT_SYMBOL(xor_blocks);
 
+/**
+ * xor_gen - generate RAID-style XOR information
+ * @dest:	destination vector
+ * @srcs:	source vectors
+ * @src_cnt:	number of source vectors
+ * @bytes:	length in bytes of each vector
+ *
+ * Performs bit-wise XOR operation into @dest for each of the @src_cnt vectors
+ * in @srcs for a length of @bytes bytes.  @src_count must be non-zero, and the
+ * memory pointed to by @dest and each member of @srcs must be at least 64-byte
+ * aligned.  @bytes must be non-zero and a multiple of 512.
+ *
+ * Note: for typical RAID uses, @dest either needs to be zeroed, or filled with
+ * the first disk, which then needs to be removed from @srcs.
+ */
+void xor_gen(void *dest, void **srcs, unsigned int src_cnt, unsigned int bytes)
+{
+	unsigned int src_off = 0;
+
+	WARN_ON_ONCE(bytes & 511);
+
+	while (src_cnt > 0) {
+		unsigned int this_cnt = min(src_cnt, MAX_XOR_BLOCKS);
+
+		xor_blocks(this_cnt, bytes, dest, srcs + src_off);
+
+		src_cnt -= this_cnt;
+		src_off += this_cnt;
+	}
+}
+EXPORT_SYMBOL(xor_gen);
+
 /* Set of all registered templates.  */
 static struct xor_block_template *__initdata template_list;
 static bool __initdata xor_forced = false;

From 9e069b9e02b4e9f269cb7b891553511136aaed59 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 24 Mar 2026 07:21:58 +0100
Subject: [PATCH 22/26] async_xor: use xor_gen

Replace use of the loop around xor_blocks with the easier to use xor_gen
API.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Linux RISC-V bot <linux.riscv.bot@gmail.com>
---
 crypto/async_tx/async_xor.c | 34 ++++++++++------------------------
 1 file changed, 10 insertions(+), 24 deletions(-)

diff --git a/crypto/async_tx/async_xor.c b/crypto/async_tx/async_xor.c
index 2c499654a36c85..84458375b202b5 100644
--- a/crypto/async_tx/async_xor.c
+++ b/crypto/async_tx/async_xor.c
@@ -103,7 +103,6 @@ do_sync_xor_offs(struct page *dest, unsigned int offset,
 {
 	int i;
 	int xor_src_cnt = 0;
-	int src_off = 0;
 	void *dest_buf;
 	void **srcs;
 
@@ -117,23 +116,12 @@ do_sync_xor_offs(struct page *dest, unsigned int offset,
 		if (src_list[i])
 			srcs[xor_src_cnt++] = page_address(src_list[i]) +
 				(src_offs ? src_offs[i] : offset);
-	src_cnt = xor_src_cnt;
+
 	/* set destination address */
 	dest_buf = page_address(dest) + offset;
-
 	if (submit->flags & ASYNC_TX_XOR_ZERO_DST)
 		memset(dest_buf, 0, len);
-
-	while (src_cnt > 0) {
-		/* process up to 'MAX_XOR_BLOCKS' sources */
-		xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
-		xor_blocks(xor_src_cnt, len, dest_buf, &srcs[src_off]);
-
-		/* drop completed sources */
-		src_cnt -= xor_src_cnt;
-		src_off += xor_src_cnt;
-	}
-
+	xor_gen(dest_buf, srcs, xor_src_cnt, len);
 	async_tx_sync_epilog(submit);
 }
 
@@ -168,11 +156,10 @@ dma_xor_aligned_offsets(struct dma_device *device, unsigned int offset,
  *
  * honored flags: ASYNC_TX_ACK, ASYNC_TX_XOR_ZERO_DST, ASYNC_TX_XOR_DROP_DST
  *
- * xor_blocks always uses the dest as a source so the
- * ASYNC_TX_XOR_ZERO_DST flag must be set to not include dest data in
- * the calculation.  The assumption with dma engines is that they only
- * use the destination buffer as a source when it is explicitly specified
- * in the source list.
+ * xor_gen always uses the dest as a source so the ASYNC_TX_XOR_ZERO_DST flag
+ * must be set to not include dest data in the calculation.  The assumption with
+ * dma engines is that they only use the destination buffer as a source when it
+ * is explicitly specified in the source list.
  *
  * src_list note: if the dest is also a source it must be at index zero.
  * The contents of this array will be overwritten if a scribble region
@@ -259,11 +246,10 @@ EXPORT_SYMBOL_GPL(async_xor_offs);
  *
  * honored flags: ASYNC_TX_ACK, ASYNC_TX_XOR_ZERO_DST, ASYNC_TX_XOR_DROP_DST
  *
- * xor_blocks always uses the dest as a source so the
- * ASYNC_TX_XOR_ZERO_DST flag must be set to not include dest data in
- * the calculation.  The assumption with dma engines is that they only
- * use the destination buffer as a source when it is explicitly specified
- * in the source list.
+ * xor_gen always uses the dest as a source so the ASYNC_TX_XOR_ZERO_DST flag
+ * must be set to not include dest data in the calculation.  The assumption with
+ * dma engines is that they only use the destination buffer as a source when it
+ * is explicitly specified in the source list.
  *
  * src_list note: if the dest is also a source it must be at index zero.
  * The contents of this array will be overwritten if a scribble region

From 3eb428ef17e16ee9bcb7b09f6e7010fa192d48b7 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 24 Mar 2026 07:21:59 +0100
Subject: [PATCH 23/26] btrfs: use xor_gen

Use the new xor_gen helper instead of open coding the loop around
xor_blocks.  This helper is very similar to the existing run_xor helper
in btrfs, except that the destination buffer is passed explicitly.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: David Sterba <dsterba@suse.com>
Signed-off-by: Linux RISC-V bot <linux.riscv.bot@gmail.com>
---
 fs/btrfs/raid56.c | 27 ++++-----------------------
 1 file changed, 4 insertions(+), 23 deletions(-)

diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index baadaaa189c05d..7e326d8e63ae67 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -617,26 +617,6 @@ static void cache_rbio(struct btrfs_raid_bio *rbio)
 	spin_unlock(&table->cache_lock);
 }
 
-/*
- * helper function to run the xor_blocks api.  It is only
- * able to do MAX_XOR_BLOCKS at a time, so we need to
- * loop through.
- */
-static void run_xor(void **pages, int src_cnt, ssize_t len)
-{
-	int src_off = 0;
-	int xor_src_cnt = 0;
-	void *dest = pages[src_cnt];
-
-	while(src_cnt > 0) {
-		xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
-		xor_blocks(xor_src_cnt, len, dest, pages + src_off);
-
-		src_cnt -= xor_src_cnt;
-		src_off += xor_src_cnt;
-	}
-}
-
 /*
  * Returns true if the bio list inside this rbio covers an entire stripe (no
  * rmw required).
@@ -1432,7 +1412,8 @@ static void generate_pq_vertical_step(struct btrfs_raid_bio *rbio, unsigned int
 	} else {
 		/* raid5 */
 		memcpy(pointers[rbio->nr_data], pointers[0], step);
-		run_xor(pointers + 1, rbio->nr_data - 1, step);
+		xor_gen(pointers[rbio->nr_data], pointers + 1, rbio->nr_data - 1,
+				step);
 	}
 	for (stripe = stripe - 1; stripe >= 0; stripe--)
 		kunmap_local(pointers[stripe]);
@@ -2032,7 +2013,7 @@ static void recover_vertical_step(struct btrfs_raid_bio *rbio,
 		pointers[rbio->nr_data - 1] = p;
 
 		/* Xor in the rest */
-		run_xor(pointers, rbio->nr_data - 1, step);
+		xor_gen(p, pointers, rbio->nr_data - 1, step);
 	}
 
 cleanup:
@@ -2662,7 +2643,7 @@ static bool verify_one_parity_step(struct btrfs_raid_bio *rbio,
 	} else {
 		/* RAID5. */
 		memcpy(pointers[nr_data], pointers[0], step);
-		run_xor(pointers + 1, nr_data - 1, step);
+		xor_gen(pointers[nr_data], pointers + 1, nr_data - 1, step);
 	}
 
 	/* Check scrubbing parity and repair it. */

From 1a3bfff7fe844db0b7a625ceab82612c74272717 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 24 Mar 2026 07:22:00 +0100
Subject: [PATCH 24/26] xor: pass the entire operation to the low-level ops

Currently the high-level xor code chunks up all operations into small
units for only up to 1 + 4 vectors, and passes it to four different
methods.  This means the FPU/vector context is entered and left a lot
for wide stripes, and a lot of indirect expensive indirect calls are
performed.  Switch to passing the entire gen_xor request to the
low-level ops, and provide a macro to dispatch it to the existing
helper.

This reduce the number of indirect calls and FPU/vector context switches
by a factor approaching nr_stripes / 4, and also reduces source and
binary code size.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Linux RISC-V bot <linux.riscv.bot@gmail.com>
---
 include/linux/raid/xor.h               |  5 --
 lib/raid/xor/alpha/xor.c               | 19 ++++----
 lib/raid/xor/arm/xor-neon-glue.c       | 49 ++------------------
 lib/raid/xor/arm/xor-neon.c            |  9 +---
 lib/raid/xor/arm/xor.c                 | 10 ++--
 lib/raid/xor/arm/xor_arch.h            |  3 ++
 lib/raid/xor/arm64/xor-neon-glue.c     | 44 ++----------------
 lib/raid/xor/arm64/xor-neon.c          | 20 +++++---
 lib/raid/xor/arm64/xor-neon.h          | 32 ++-----------
 lib/raid/xor/loongarch/xor_simd_glue.c | 62 +++++--------------------
 lib/raid/xor/powerpc/xor_vmx.c         | 40 ++++++++--------
 lib/raid/xor/powerpc/xor_vmx.h         | 16 +------
 lib/raid/xor/powerpc/xor_vmx_glue.c    | 49 ++------------------
 lib/raid/xor/riscv/xor-glue.c          | 43 +++--------------
 lib/raid/xor/s390/xor.c                |  9 ++--
 lib/raid/xor/sparc/xor-sparc32.c       |  9 ++--
 lib/raid/xor/sparc/xor-sparc64-glue.c  | 19 ++++----
 lib/raid/xor/x86/xor-avx.c             | 29 ++++--------
 lib/raid/xor/x86/xor-mmx.c             | 64 ++++++++++----------------
 lib/raid/xor/x86/xor-sse.c             | 63 +++++++++----------------
 lib/raid/xor/xor-32regs-prefetch.c     | 10 ++--
 lib/raid/xor/xor-32regs.c              |  9 ++--
 lib/raid/xor/xor-8regs-prefetch.c      | 11 +++--
 lib/raid/xor/xor-8regs.c               |  9 ++--
 lib/raid/xor/xor-core.c                | 47 ++-----------------
 lib/raid/xor/xor_impl.h                | 48 +++++++++++++------
 26 files changed, 224 insertions(+), 504 deletions(-)

diff --git a/include/linux/raid/xor.h b/include/linux/raid/xor.h
index 6d9a39fd85ddab..870558c9d36ee3 100644
--- a/include/linux/raid/xor.h
+++ b/include/linux/raid/xor.h
@@ -2,11 +2,6 @@
 #ifndef _XOR_H
 #define _XOR_H
 
-#define MAX_XOR_BLOCKS 4
-
-extern void xor_blocks(unsigned int count, unsigned int bytes,
-	void *dest, void **srcs);
-
 void xor_gen(void *dest, void **srcs, unsigned int src_cnt, unsigned int bytes);
 
 #endif /* _XOR_H */
diff --git a/lib/raid/xor/alpha/xor.c b/lib/raid/xor/alpha/xor.c
index 90694cc47395b9..a8f72f2dd3a5e2 100644
--- a/lib/raid/xor/alpha/xor.c
+++ b/lib/raid/xor/alpha/xor.c
@@ -832,18 +832,17 @@ xor_alpha_prefetch_5:						\n\
 	.end xor_alpha_prefetch_5				\n\
 ");
 
+DO_XOR_BLOCKS(alpha, xor_alpha_2, xor_alpha_3, xor_alpha_4, xor_alpha_5);
+
 struct xor_block_template xor_block_alpha = {
-	.name	= "alpha",
-	.do_2	= xor_alpha_2,
-	.do_3	= xor_alpha_3,
-	.do_4	= xor_alpha_4,
-	.do_5	= xor_alpha_5,
+	.name		= "alpha",
+	.xor_gen	= xor_gen_alpha,
 };
 
+DO_XOR_BLOCKS(alpha_prefetch, xor_alpha_prefetch_2, xor_alpha_prefetch_3,
+		xor_alpha_prefetch_4, xor_alpha_prefetch_5);
+
 struct xor_block_template xor_block_alpha_prefetch = {
-	.name	= "alpha prefetch",
-	.do_2	= xor_alpha_prefetch_2,
-	.do_3	= xor_alpha_prefetch_3,
-	.do_4	= xor_alpha_prefetch_4,
-	.do_5	= xor_alpha_prefetch_5,
+	.name		= "alpha prefetch",
+	.xor_gen	= xor_gen_alpha_prefetch,
 };
diff --git a/lib/raid/xor/arm/xor-neon-glue.c b/lib/raid/xor/arm/xor-neon-glue.c
index 7afd6294464bc5..cea39e0199048e 100644
--- a/lib/raid/xor/arm/xor-neon-glue.c
+++ b/lib/raid/xor/arm/xor-neon-glue.c
@@ -5,54 +5,15 @@
 #include "xor_impl.h"
 #include "xor_arch.h"
 
-extern struct xor_block_template const xor_block_neon_inner;
-
-static void
-xor_neon_2(unsigned long bytes, unsigned long * __restrict p1,
-	   const unsigned long * __restrict p2)
-{
-	kernel_neon_begin();
-	xor_block_neon_inner.do_2(bytes, p1, p2);
-	kernel_neon_end();
-}
-
-static void
-xor_neon_3(unsigned long bytes, unsigned long * __restrict p1,
-	   const unsigned long * __restrict p2,
-	   const unsigned long * __restrict p3)
-{
-	kernel_neon_begin();
-	xor_block_neon_inner.do_3(bytes, p1, p2, p3);
-	kernel_neon_end();
-}
-
-static void
-xor_neon_4(unsigned long bytes, unsigned long * __restrict p1,
-	   const unsigned long * __restrict p2,
-	   const unsigned long * __restrict p3,
-	   const unsigned long * __restrict p4)
-{
-	kernel_neon_begin();
-	xor_block_neon_inner.do_4(bytes, p1, p2, p3, p4);
-	kernel_neon_end();
-}
-
-static void
-xor_neon_5(unsigned long bytes, unsigned long * __restrict p1,
-	   const unsigned long * __restrict p2,
-	   const unsigned long * __restrict p3,
-	   const unsigned long * __restrict p4,
-	   const unsigned long * __restrict p5)
+static void xor_gen_neon(void *dest, void **srcs, unsigned int src_cnt,
+		unsigned int bytes)
 {
 	kernel_neon_begin();
-	xor_block_neon_inner.do_5(bytes, p1, p2, p3, p4, p5);
+	xor_gen_neon_inner(dest, srcs, src_cnt, bytes);
 	kernel_neon_end();
 }
 
 struct xor_block_template xor_block_neon = {
-	.name	= "neon",
-	.do_2	= xor_neon_2,
-	.do_3	= xor_neon_3,
-	.do_4	= xor_neon_4,
-	.do_5	= xor_neon_5
+	.name		= "neon",
+	.xor_gen	= xor_gen_neon,
 };
diff --git a/lib/raid/xor/arm/xor-neon.c b/lib/raid/xor/arm/xor-neon.c
index 806a42c5952c50..23147e3a79044f 100644
--- a/lib/raid/xor/arm/xor-neon.c
+++ b/lib/raid/xor/arm/xor-neon.c
@@ -4,6 +4,7 @@
  */
 
 #include "xor_impl.h"
+#include "xor_arch.h"
 
 #ifndef __ARM_NEON__
 #error You should compile this file with '-march=armv7-a -mfloat-abi=softfp -mfpu=neon'
@@ -22,10 +23,4 @@
 #define NO_TEMPLATE
 #include "../xor-8regs.c"
 
-struct xor_block_template const xor_block_neon_inner = {
-	.name	= "__inner_neon__",
-	.do_2	= xor_8regs_2,
-	.do_3	= xor_8regs_3,
-	.do_4	= xor_8regs_4,
-	.do_5	= xor_8regs_5,
-};
+__DO_XOR_BLOCKS(neon_inner, xor_8regs_2, xor_8regs_3, xor_8regs_4, xor_8regs_5);
diff --git a/lib/raid/xor/arm/xor.c b/lib/raid/xor/arm/xor.c
index 5bd5f048bbe9ea..45139b6c55eaa8 100644
--- a/lib/raid/xor/arm/xor.c
+++ b/lib/raid/xor/arm/xor.c
@@ -127,10 +127,10 @@ xor_arm4regs_5(unsigned long bytes, unsigned long * __restrict p1,
 	} while (--lines);
 }
 
+DO_XOR_BLOCKS(arm4regs, xor_arm4regs_2, xor_arm4regs_3, xor_arm4regs_4,
+		xor_arm4regs_5);
+
 struct xor_block_template xor_block_arm4regs = {
-	.name	= "arm4regs",
-	.do_2	= xor_arm4regs_2,
-	.do_3	= xor_arm4regs_3,
-	.do_4	= xor_arm4regs_4,
-	.do_5	= xor_arm4regs_5,
+	.name		= "arm4regs",
+	.xor_gen	= xor_gen_arm4regs,
 };
diff --git a/lib/raid/xor/arm/xor_arch.h b/lib/raid/xor/arm/xor_arch.h
index 5a7eedb48fbb94..775ff835df656e 100644
--- a/lib/raid/xor/arm/xor_arch.h
+++ b/lib/raid/xor/arm/xor_arch.h
@@ -7,6 +7,9 @@
 extern struct xor_block_template xor_block_arm4regs;
 extern struct xor_block_template xor_block_neon;
 
+void xor_gen_neon_inner(void *dest, void **srcs, unsigned int src_cnt,
+		unsigned int bytes);
+
 static __always_inline void __init arch_xor_init(void)
 {
 	xor_register(&xor_block_arm4regs);
diff --git a/lib/raid/xor/arm64/xor-neon-glue.c b/lib/raid/xor/arm64/xor-neon-glue.c
index 3db0a318cf5bb5..f0284f86feb4c8 100644
--- a/lib/raid/xor/arm64/xor-neon-glue.c
+++ b/lib/raid/xor/arm64/xor-neon-glue.c
@@ -10,50 +10,16 @@
 #include "xor-neon.h"
 
 #define XOR_TEMPLATE(_name)						\
-static void								\
-xor_##_name##_2(unsigned long bytes, unsigned long * __restrict p1,	\
-	   const unsigned long * __restrict p2)				\
+static void xor_gen_##_name(void *dest, void **srcs, unsigned int src_cnt, \
+		unsigned int bytes)					\
 {									\
 	scoped_ksimd()							\
-		__xor_##_name##_2(bytes, p1, p2);			\
-}									\
-									\
-static void								\
-xor_##_name##_3(unsigned long bytes, unsigned long * __restrict p1,	\
-	   const unsigned long * __restrict p2,				\
-	   const unsigned long * __restrict p3)				\
-{									\
-	scoped_ksimd()							\
-		__xor_##_name##_3(bytes, p1, p2, p3);			\
-}									\
-									\
-static void								\
-xor_##_name##_4(unsigned long bytes, unsigned long * __restrict p1,	\
-	   const unsigned long * __restrict p2,				\
-	   const unsigned long * __restrict p3,				\
-	   const unsigned long * __restrict p4)				\
-{									\
-	scoped_ksimd()							\
-		__xor_##_name##_4(bytes, p1, p2, p3, p4);		\
-}									\
-									\
-static void								\
-xor_##_name##_5(unsigned long bytes, unsigned long * __restrict p1,	\
-	   const unsigned long * __restrict p2,				\
-	   const unsigned long * __restrict p3,				\
-	   const unsigned long * __restrict p4,				\
-	   const unsigned long * __restrict p5)				\
-{									\
-	scoped_ksimd()							\
-		__xor_##_name##_5(bytes, p1, p2, p3, p4, p5);		\
+		xor_gen_##_name##_inner(dest, srcs, src_cnt, bytes);	\
 }									\
 									\
 struct xor_block_template xor_block_##_name = {				\
-	.name   = __stringify(_name),					\
-	.do_2   = xor_##_name##_2,					\
-	.do_3   = xor_##_name##_3,					\
-	.do_4   = xor_##_name##_4,					\
-	.do_5	= xor_##_name##_5					\
+	.name   	= __stringify(_name),				\
+	.xor_gen	= xor_gen_##_name,				\
 };
 
 XOR_TEMPLATE(neon);
diff --git a/lib/raid/xor/arm64/xor-neon.c b/lib/raid/xor/arm64/xor-neon.c
index 61f00c4fee4952..97ef3cb924968d 100644
--- a/lib/raid/xor/arm64/xor-neon.c
+++ b/lib/raid/xor/arm64/xor-neon.c
@@ -10,7 +10,7 @@
 #include "xor_arch.h"
 #include "xor-neon.h"
 
-void __xor_neon_2(unsigned long bytes, unsigned long * __restrict p1,
+static void __xor_neon_2(unsigned long bytes, unsigned long * __restrict p1,
 		const unsigned long * __restrict p2)
 {
 	uint64_t *dp1 = (uint64_t *)p1;
@@ -37,7 +37,7 @@ void __xor_neon_2(unsigned long bytes, unsigned long * __restrict p1,
 	} while (--lines > 0);
 }
 
-void __xor_neon_3(unsigned long bytes, unsigned long * __restrict p1,
+static void __xor_neon_3(unsigned long bytes, unsigned long * __restrict p1,
 		const unsigned long * __restrict p2,
 		const unsigned long * __restrict p3)
 {
@@ -73,7 +73,7 @@ void __xor_neon_3(unsigned long bytes, unsigned long * __restrict p1,
 	} while (--lines > 0);
 }
 
-void __xor_neon_4(unsigned long bytes, unsigned long * __restrict p1,
+static void __xor_neon_4(unsigned long bytes, unsigned long * __restrict p1,
 		const unsigned long * __restrict p2,
 		const unsigned long * __restrict p3,
 		const unsigned long * __restrict p4)
@@ -118,7 +118,7 @@ void __xor_neon_4(unsigned long bytes, unsigned long * __restrict p1,
 	} while (--lines > 0);
 }
 
-void __xor_neon_5(unsigned long bytes, unsigned long * __restrict p1,
+static void __xor_neon_5(unsigned long bytes, unsigned long * __restrict p1,
 		const unsigned long * __restrict p2,
 		const unsigned long * __restrict p3,
 		const unsigned long * __restrict p4,
@@ -172,6 +172,9 @@ void __xor_neon_5(unsigned long bytes, unsigned long * __restrict p1,
 	} while (--lines > 0);
 }
 
+__DO_XOR_BLOCKS(neon_inner, __xor_neon_2, __xor_neon_3, __xor_neon_4,
+		__xor_neon_5);
+
 static inline uint64x2_t eor3(uint64x2_t p, uint64x2_t q, uint64x2_t r)
 {
 	uint64x2_t res;
@@ -182,7 +185,7 @@ static inline uint64x2_t eor3(uint64x2_t p, uint64x2_t q, uint64x2_t r)
 	return res;
 }
 
-void __xor_eor3_3(unsigned long bytes, unsigned long * __restrict p1,
+static void __xor_eor3_3(unsigned long bytes, unsigned long * __restrict p1,
 		const unsigned long * __restrict p2,
 		const unsigned long * __restrict p3)
 {
@@ -216,7 +219,7 @@ void __xor_eor3_3(unsigned long bytes, unsigned long * __restrict p1,
 	} while (--lines > 0);
 }
 
-void __xor_eor3_4(unsigned long bytes, unsigned long * __restrict p1,
+static void __xor_eor3_4(unsigned long bytes, unsigned long * __restrict p1,
 		const unsigned long * __restrict p2,
 		const unsigned long * __restrict p3,
 		const unsigned long * __restrict p4)
@@ -259,7 +262,7 @@ void __xor_eor3_4(unsigned long bytes, unsigned long * __restrict p1,
 	} while (--lines > 0);
 }
 
-void __xor_eor3_5(unsigned long bytes, unsigned long * __restrict p1,
+static void __xor_eor3_5(unsigned long bytes, unsigned long * __restrict p1,
 		const unsigned long * __restrict p2,
 		const unsigned long * __restrict p3,
 		const unsigned long * __restrict p4,
@@ -304,3 +307,6 @@ void __xor_eor3_5(unsigned long bytes, unsigned long * __restrict p1,
 		dp5 += 8;
 	} while (--lines > 0);
 }
+
+__DO_XOR_BLOCKS(eor3_inner, __xor_neon_2, __xor_eor3_3, __xor_eor3_4,
+		__xor_eor3_5);
diff --git a/lib/raid/xor/arm64/xor-neon.h b/lib/raid/xor/arm64/xor-neon.h
index cec0ac846feabd..514699ba8f5f8e 100644
--- a/lib/raid/xor/arm64/xor-neon.h
+++ b/lib/raid/xor/arm64/xor-neon.h
@@ -1,30 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
 
-void __xor_neon_2(unsigned long bytes, unsigned long * __restrict p1,
-		const unsigned long * __restrict p2);
-void __xor_neon_3(unsigned long bytes, unsigned long * __restrict p1,
-		const unsigned long * __restrict p2,
-		const unsigned long * __restrict p3);
-void __xor_neon_4(unsigned long bytes, unsigned long * __restrict p1,
-		const unsigned long * __restrict p2,
-		const unsigned long * __restrict p3,
-		const unsigned long * __restrict p4);
-void __xor_neon_5(unsigned long bytes, unsigned long * __restrict p1,
-		const unsigned long * __restrict p2,
-		const unsigned long * __restrict p3,
-		const unsigned long * __restrict p4,
-		const unsigned long * __restrict p5);
-
-#define __xor_eor3_2	__xor_neon_2
-void __xor_eor3_3(unsigned long bytes, unsigned long * __restrict p1,
-		const unsigned long * __restrict p2,
-		const unsigned long * __restrict p3);
-void __xor_eor3_4(unsigned long bytes, unsigned long * __restrict p1,
-		const unsigned long * __restrict p2,
-		const unsigned long * __restrict p3,
-		const unsigned long * __restrict p4);
-void __xor_eor3_5(unsigned long bytes, unsigned long * __restrict p1,
-		const unsigned long * __restrict p2,
-		const unsigned long * __restrict p3,
-		const unsigned long * __restrict p4,
-		const unsigned long * __restrict p5);
+void xor_gen_neon_inner(void *dest, void **srcs, unsigned int src_cnt,
+		unsigned int bytes);
+void xor_gen_eor3_inner(void *dest, void **srcs, unsigned int src_cnt,
+		unsigned int bytes);
diff --git a/lib/raid/xor/loongarch/xor_simd_glue.c b/lib/raid/xor/loongarch/xor_simd_glue.c
index b387aa0213b471..7f324d924f8791 100644
--- a/lib/raid/xor/loongarch/xor_simd_glue.c
+++ b/lib/raid/xor/loongarch/xor_simd_glue.c
@@ -11,63 +11,23 @@
 #include "xor_arch.h"
 #include "xor_simd.h"
 
-#define MAKE_XOR_GLUE_2(flavor)							\
-static void xor_##flavor##_2(unsigned long bytes, unsigned long * __restrict p1,\
-		      const unsigned long * __restrict p2)			\
+#define MAKE_XOR_GLUES(flavor)							\
+DO_XOR_BLOCKS(flavor##_inner, __xor_##flavor##_2, __xor_##flavor##_3,		\
+		__xor_##flavor##_4, __xor_##flavor##_5);			\
+										\
+static void xor_gen_##flavor(void *dest, void **srcs, unsigned int src_cnt,	\
+		unsigned int bytes)						\
 {										\
 	kernel_fpu_begin();							\
-	__xor_##flavor##_2(bytes, p1, p2);					\
+	xor_gen_##flavor##_inner(dest, srcs, src_cnt, bytes);			\
 	kernel_fpu_end();							\
 }										\
-
-#define MAKE_XOR_GLUE_3(flavor)							\
-static void xor_##flavor##_3(unsigned long bytes, unsigned long * __restrict p1,\
-		      const unsigned long * __restrict p2,			\
-		      const unsigned long * __restrict p3)			\
-{										\
-	kernel_fpu_begin();							\
-	__xor_##flavor##_3(bytes, p1, p2, p3);					\
-	kernel_fpu_end();							\
-}										\
-
-#define MAKE_XOR_GLUE_4(flavor)							\
-static void xor_##flavor##_4(unsigned long bytes, unsigned long * __restrict p1,\
-		      const unsigned long * __restrict p2,			\
-		      const unsigned long * __restrict p3,			\
-		      const unsigned long * __restrict p4)			\
-{										\
-	kernel_fpu_begin();							\
-	__xor_##flavor##_4(bytes, p1, p2, p3, p4);				\
-	kernel_fpu_end();							\
-}										\
-
-#define MAKE_XOR_GLUE_5(flavor)							\
-static void xor_##flavor##_5(unsigned long bytes, unsigned long * __restrict p1,\
-		      const unsigned long * __restrict p2,			\
-		      const unsigned long * __restrict p3,			\
-		      const unsigned long * __restrict p4,			\
-		      const unsigned long * __restrict p5)			\
-{										\
-	kernel_fpu_begin();							\
-	__xor_##flavor##_5(bytes, p1, p2, p3, p4, p5);				\
-	kernel_fpu_end();							\
-}										\
-
-#define MAKE_XOR_GLUES(flavor)				\
-	MAKE_XOR_GLUE_2(flavor);			\
-	MAKE_XOR_GLUE_3(flavor);			\
-	MAKE_XOR_GLUE_4(flavor);			\
-	MAKE_XOR_GLUE_5(flavor);			\
-							\
-struct xor_block_template xor_block_##flavor = {	\
-	.name = __stringify(flavor),			\
-	.do_2 = xor_##flavor##_2,			\
-	.do_3 = xor_##flavor##_3,			\
-	.do_4 = xor_##flavor##_4,			\
-	.do_5 = xor_##flavor##_5,			\
+										\
+struct xor_block_template xor_block_##flavor = {				\
+	.name		= __stringify(flavor),					\
+	.xor_gen	= xor_gen_##flavor					\
 }
 
-
 #ifdef CONFIG_CPU_HAS_LSX
 MAKE_XOR_GLUES(lsx);
 #endif /* CONFIG_CPU_HAS_LSX */
diff --git a/lib/raid/xor/powerpc/xor_vmx.c b/lib/raid/xor/powerpc/xor_vmx.c
index aab49d056d1883..09bed98c1bc723 100644
--- a/lib/raid/xor/powerpc/xor_vmx.c
+++ b/lib/raid/xor/powerpc/xor_vmx.c
@@ -10,6 +10,7 @@
  * Sparse (as at v0.5.0) gets very, very confused by this file.
  * Make it a bit simpler for it.
  */
+#include "xor_impl.h"
 #if !defined(__CHECKER__)
 #include <altivec.h>
 #else
@@ -49,9 +50,9 @@ typedef vector signed char unative_t;
 		V1##_3 = vec_xor(V1##_3, V2##_3);	\
 	} while (0)
 
-void __xor_altivec_2(unsigned long bytes,
-		     unsigned long * __restrict v1_in,
-		     const unsigned long * __restrict v2_in)
+static void __xor_altivec_2(unsigned long bytes,
+		unsigned long * __restrict v1_in,
+		const unsigned long * __restrict v2_in)
 {
 	DEFINE(v1);
 	DEFINE(v2);
@@ -68,10 +69,10 @@ void __xor_altivec_2(unsigned long bytes,
 	} while (--lines > 0);
 }
 
-void __xor_altivec_3(unsigned long bytes,
-		     unsigned long * __restrict v1_in,
-		     const unsigned long * __restrict v2_in,
-		     const unsigned long * __restrict v3_in)
+static void __xor_altivec_3(unsigned long bytes,
+		unsigned long * __restrict v1_in,
+		const unsigned long * __restrict v2_in,
+		const unsigned long * __restrict v3_in)
 {
 	DEFINE(v1);
 	DEFINE(v2);
@@ -92,11 +93,11 @@ void __xor_altivec_3(unsigned long bytes,
 	} while (--lines > 0);
 }
 
-void __xor_altivec_4(unsigned long bytes,
-		     unsigned long * __restrict v1_in,
-		     const unsigned long * __restrict v2_in,
-		     const unsigned long * __restrict v3_in,
-		     const unsigned long * __restrict v4_in)
+static void __xor_altivec_4(unsigned long bytes,
+		unsigned long * __restrict v1_in,
+		const unsigned long * __restrict v2_in,
+		const unsigned long * __restrict v3_in,
+		const unsigned long * __restrict v4_in)
 {
 	DEFINE(v1);
 	DEFINE(v2);
@@ -121,12 +122,12 @@ void __xor_altivec_4(unsigned long bytes,
 	} while (--lines > 0);
 }
 
-void __xor_altivec_5(unsigned long bytes,
-		     unsigned long * __restrict v1_in,
-		     const unsigned long * __restrict v2_in,
-		     const unsigned long * __restrict v3_in,
-		     const unsigned long * __restrict v4_in,
-		     const unsigned long * __restrict v5_in)
+static void __xor_altivec_5(unsigned long bytes,
+		unsigned long * __restrict v1_in,
+		const unsigned long * __restrict v2_in,
+		const unsigned long * __restrict v3_in,
+		const unsigned long * __restrict v4_in,
+		const unsigned long * __restrict v5_in)
 {
 	DEFINE(v1);
 	DEFINE(v2);
@@ -154,3 +155,6 @@ void __xor_altivec_5(unsigned long bytes,
 		v5 += 4;
 	} while (--lines > 0);
 }
+
+__DO_XOR_BLOCKS(altivec_inner, __xor_altivec_2, __xor_altivec_3,
+		__xor_altivec_4, __xor_altivec_5);
diff --git a/lib/raid/xor/powerpc/xor_vmx.h b/lib/raid/xor/powerpc/xor_vmx.h
index 573c41d90dac52..1d26c1133a8685 100644
--- a/lib/raid/xor/powerpc/xor_vmx.h
+++ b/lib/raid/xor/powerpc/xor_vmx.h
@@ -6,17 +6,5 @@
  * outside of the enable/disable altivec block.
  */
 
-void __xor_altivec_2(unsigned long bytes, unsigned long * __restrict p1,
-		     const unsigned long * __restrict p2);
-void __xor_altivec_3(unsigned long bytes, unsigned long * __restrict p1,
-		     const unsigned long * __restrict p2,
-		     const unsigned long * __restrict p3);
-void __xor_altivec_4(unsigned long bytes, unsigned long * __restrict p1,
-		     const unsigned long * __restrict p2,
-		     const unsigned long * __restrict p3,
-		     const unsigned long * __restrict p4);
-void __xor_altivec_5(unsigned long bytes, unsigned long * __restrict p1,
-		     const unsigned long * __restrict p2,
-		     const unsigned long * __restrict p3,
-		     const unsigned long * __restrict p4,
-		     const unsigned long * __restrict p5);
+void xor_gen_altivec_inner(void *dest, void **srcs, unsigned int src_cnt,
+		unsigned int bytes);
diff --git a/lib/raid/xor/powerpc/xor_vmx_glue.c b/lib/raid/xor/powerpc/xor_vmx_glue.c
index 56e99ddfb64f61..dbfbb5cadc36af 100644
--- a/lib/raid/xor/powerpc/xor_vmx_glue.c
+++ b/lib/raid/xor/powerpc/xor_vmx_glue.c
@@ -12,56 +12,17 @@
 #include "xor_arch.h"
 #include "xor_vmx.h"
 
-static void xor_altivec_2(unsigned long bytes, unsigned long * __restrict p1,
-		const unsigned long * __restrict p2)
+static void xor_gen_altivec(void *dest, void **srcs, unsigned int src_cnt,
+		unsigned int bytes)
 {
 	preempt_disable();
 	enable_kernel_altivec();
-	__xor_altivec_2(bytes, p1, p2);
-	disable_kernel_altivec();
-	preempt_enable();
-}
-
-static void xor_altivec_3(unsigned long bytes, unsigned long * __restrict p1,
-		const unsigned long * __restrict p2,
-		const unsigned long * __restrict p3)
-{
-	preempt_disable();
-	enable_kernel_altivec();
-	__xor_altivec_3(bytes, p1, p2, p3);
-	disable_kernel_altivec();
-	preempt_enable();
-}
-
-static void xor_altivec_4(unsigned long bytes, unsigned long * __restrict p1,
-		const unsigned long * __restrict p2,
-		const unsigned long * __restrict p3,
-		const unsigned long * __restrict p4)
-{
-	preempt_disable();
-	enable_kernel_altivec();
-	__xor_altivec_4(bytes, p1, p2, p3, p4);
-	disable_kernel_altivec();
-	preempt_enable();
-}
-
-static void xor_altivec_5(unsigned long bytes, unsigned long * __restrict p1,
-		const unsigned long * __restrict p2,
-		const unsigned long * __restrict p3,
-		const unsigned long * __restrict p4,
-		const unsigned long * __restrict p5)
-{
-	preempt_disable();
-	enable_kernel_altivec();
-	__xor_altivec_5(bytes, p1, p2, p3, p4, p5);
+	xor_gen_altivec_inner(dest, srcs, src_cnt, bytes);
 	disable_kernel_altivec();
 	preempt_enable();
 }
 
 struct xor_block_template xor_block_altivec = {
-	.name = "altivec",
-	.do_2 = xor_altivec_2,
-	.do_3 = xor_altivec_3,
-	.do_4 = xor_altivec_4,
-	.do_5 = xor_altivec_5,
+	.name		= "altivec",
+	.xor_gen	= xor_gen_altivec,
 };
diff --git a/lib/raid/xor/riscv/xor-glue.c b/lib/raid/xor/riscv/xor-glue.c
index 060e5f22ebcce1..2e4c1b05d998fa 100644
--- a/lib/raid/xor/riscv/xor-glue.c
+++ b/lib/raid/xor/riscv/xor-glue.c
@@ -9,48 +9,17 @@
 #include "xor_impl.h"
 #include "xor_arch.h"
 
-static void xor_vector_2(unsigned long bytes, unsigned long *__restrict p1,
-			 const unsigned long *__restrict p2)
-{
-	kernel_vector_begin();
-	xor_regs_2_(bytes, p1, p2);
-	kernel_vector_end();
-}
-
-static void xor_vector_3(unsigned long bytes, unsigned long *__restrict p1,
-			 const unsigned long *__restrict p2,
-			 const unsigned long *__restrict p3)
-{
-	kernel_vector_begin();
-	xor_regs_3_(bytes, p1, p2, p3);
-	kernel_vector_end();
-}
-
-static void xor_vector_4(unsigned long bytes, unsigned long *__restrict p1,
-			 const unsigned long *__restrict p2,
-			 const unsigned long *__restrict p3,
-			 const unsigned long *__restrict p4)
-{
-	kernel_vector_begin();
-	xor_regs_4_(bytes, p1, p2, p3, p4);
-	kernel_vector_end();
-}
+DO_XOR_BLOCKS(vector_inner, xor_regs_2_, xor_regs_3_, xor_regs_4_, xor_regs_5_);
 
-static void xor_vector_5(unsigned long bytes, unsigned long *__restrict p1,
-			 const unsigned long *__restrict p2,
-			 const unsigned long *__restrict p3,
-			 const unsigned long *__restrict p4,
-			 const unsigned long *__restrict p5)
+static void xor_gen_vector(void *dest, void **srcs, unsigned int src_cnt,
+		unsigned int bytes)
 {
 	kernel_vector_begin();
-	xor_regs_5_(bytes, p1, p2, p3, p4, p5);
+	xor_gen_vector_inner(dest, srcs, src_cnt, bytes);
 	kernel_vector_end();
 }
 
 struct xor_block_template xor_block_rvv = {
-	.name = "rvv",
-	.do_2 = xor_vector_2,
-	.do_3 = xor_vector_3,
-	.do_4 = xor_vector_4,
-	.do_5 = xor_vector_5
+	.name		= "rvv",
+	.xor_gen	= xor_gen_vector,
 };
diff --git a/lib/raid/xor/s390/xor.c b/lib/raid/xor/s390/xor.c
index 48b8cdc684a34b..d8a62a70db6c3a 100644
--- a/lib/raid/xor/s390/xor.c
+++ b/lib/raid/xor/s390/xor.c
@@ -126,10 +126,9 @@ static void xor_xc_5(unsigned long bytes, unsigned long * __restrict p1,
 		: : "0", "cc", "memory");
 }
 
+DO_XOR_BLOCKS(xc, xor_xc_2, xor_xc_3, xor_xc_4, xor_xc_5);
+
 struct xor_block_template xor_block_xc = {
-	.name = "xc",
-	.do_2 = xor_xc_2,
-	.do_3 = xor_xc_3,
-	.do_4 = xor_xc_4,
-	.do_5 = xor_xc_5,
+	.name		= "xc",
+	.xor_gen	= xor_gen_xc,
 };
diff --git a/lib/raid/xor/sparc/xor-sparc32.c b/lib/raid/xor/sparc/xor-sparc32.c
index 307c4a84f535d4..fb37631e90e697 100644
--- a/lib/raid/xor/sparc/xor-sparc32.c
+++ b/lib/raid/xor/sparc/xor-sparc32.c
@@ -244,10 +244,9 @@ sparc_5(unsigned long bytes, unsigned long * __restrict p1,
 	} while (--lines > 0);
 }
 
+DO_XOR_BLOCKS(sparc32, sparc_2, sparc_3, sparc_4, sparc_5);
+
 struct xor_block_template xor_block_SPARC = {
-	.name	= "SPARC",
-	.do_2	= sparc_2,
-	.do_3	= sparc_3,
-	.do_4	= sparc_4,
-	.do_5	= sparc_5,
+	.name		= "SPARC",
+	.xor_gen	= xor_gen_sparc32,
 };
diff --git a/lib/raid/xor/sparc/xor-sparc64-glue.c b/lib/raid/xor/sparc/xor-sparc64-glue.c
index 5f90c2460b54cf..a8a686e0d25830 100644
--- a/lib/raid/xor/sparc/xor-sparc64-glue.c
+++ b/lib/raid/xor/sparc/xor-sparc64-glue.c
@@ -28,12 +28,11 @@ void xor_vis_5(unsigned long bytes, unsigned long * __restrict p1,
 
 /* XXX Ugh, write cheetah versions... -DaveM */
 
+DO_XOR_BLOCKS(vis, xor_vis_2, xor_vis_3, xor_vis_4, xor_vis_5);
+
 struct xor_block_template xor_block_VIS = {
-        .name	= "VIS",
-        .do_2	= xor_vis_2,
-        .do_3	= xor_vis_3,
-        .do_4	= xor_vis_4,
-        .do_5	= xor_vis_5,
+        .name		= "VIS",
+	.xor_gen	= xor_gen_vis,
 };
 
 void xor_niagara_2(unsigned long bytes, unsigned long * __restrict p1,
@@ -51,10 +50,10 @@ void xor_niagara_5(unsigned long bytes, unsigned long * __restrict p1,
 		   const unsigned long * __restrict p4,
 		   const unsigned long * __restrict p5);
 
+DO_XOR_BLOCKS(niagara, xor_niagara_2, xor_niagara_3, xor_niagara_4,
+		xor_niagara_5);
+
 struct xor_block_template xor_block_niagara = {
-        .name	= "Niagara",
-        .do_2	= xor_niagara_2,
-        .do_3	= xor_niagara_3,
-        .do_4	= xor_niagara_4,
-        .do_5	= xor_niagara_5,
+        .name		= "Niagara",
+	.xor_gen	= xor_gen_niagara,
 };
diff --git a/lib/raid/xor/x86/xor-avx.c b/lib/raid/xor/x86/xor-avx.c
index d411efa1ff4355..f7777d7aa269bd 100644
--- a/lib/raid/xor/x86/xor-avx.c
+++ b/lib/raid/xor/x86/xor-avx.c
@@ -29,8 +29,6 @@ static void xor_avx_2(unsigned long bytes, unsigned long * __restrict p0,
 {
 	unsigned long lines = bytes >> 9;
 
-	kernel_fpu_begin();
-
 	while (lines--) {
 #undef BLOCK
 #define BLOCK(i, reg) \
@@ -47,8 +45,6 @@ do { \
 		p0 = (unsigned long *)((uintptr_t)p0 + 512);
 		p1 = (unsigned long *)((uintptr_t)p1 + 512);
 	}
-
-	kernel_fpu_end();
 }
 
 static void xor_avx_3(unsigned long bytes, unsigned long * __restrict p0,
@@ -57,8 +53,6 @@ static void xor_avx_3(unsigned long bytes, unsigned long * __restrict p0,
 {
 	unsigned long lines = bytes >> 9;
 
-	kernel_fpu_begin();
-
 	while (lines--) {
 #undef BLOCK
 #define BLOCK(i, reg) \
@@ -78,8 +72,6 @@ do { \
 		p1 = (unsigned long *)((uintptr_t)p1 + 512);
 		p2 = (unsigned long *)((uintptr_t)p2 + 512);
 	}
-
-	kernel_fpu_end();
 }
 
 static void xor_avx_4(unsigned long bytes, unsigned long * __restrict p0,
@@ -89,8 +81,6 @@ static void xor_avx_4(unsigned long bytes, unsigned long * __restrict p0,
 {
 	unsigned long lines = bytes >> 9;
 
-	kernel_fpu_begin();
-
 	while (lines--) {
 #undef BLOCK
 #define BLOCK(i, reg) \
@@ -113,8 +103,6 @@ do { \
 		p2 = (unsigned long *)((uintptr_t)p2 + 512);
 		p3 = (unsigned long *)((uintptr_t)p3 + 512);
 	}
-
-	kernel_fpu_end();
 }
 
 static void xor_avx_5(unsigned long bytes, unsigned long * __restrict p0,
@@ -125,8 +113,6 @@ static void xor_avx_5(unsigned long bytes, unsigned long * __restrict p0,
 {
 	unsigned long lines = bytes >> 9;
 
-	kernel_fpu_begin();
-
 	while (lines--) {
 #undef BLOCK
 #define BLOCK(i, reg) \
@@ -152,14 +138,19 @@ do { \
 		p3 = (unsigned long *)((uintptr_t)p3 + 512);
 		p4 = (unsigned long *)((uintptr_t)p4 + 512);
 	}
+}
+
+DO_XOR_BLOCKS(avx_inner, xor_avx_2, xor_avx_3, xor_avx_4, xor_avx_5);
 
+static void xor_gen_avx(void *dest, void **srcs, unsigned int src_cnt,
+			unsigned int bytes)
+{
+	kernel_fpu_begin();
+	xor_gen_avx_inner(dest, srcs, src_cnt, bytes);
 	kernel_fpu_end();
 }
 
 struct xor_block_template xor_block_avx = {
-	.name = "avx",
-	.do_2 = xor_avx_2,
-	.do_3 = xor_avx_3,
-	.do_4 = xor_avx_4,
-	.do_5 = xor_avx_5,
+	.name		= "avx",
+	.xor_gen	= xor_gen_avx,
 };
diff --git a/lib/raid/xor/x86/xor-mmx.c b/lib/raid/xor/x86/xor-mmx.c
index e48c58f9287408..63a8b0444fcef1 100644
--- a/lib/raid/xor/x86/xor-mmx.c
+++ b/lib/raid/xor/x86/xor-mmx.c
@@ -21,8 +21,6 @@ xor_pII_mmx_2(unsigned long bytes, unsigned long * __restrict p1,
 {
 	unsigned long lines = bytes >> 7;
 
-	kernel_fpu_begin();
-
 	asm volatile(
 #undef BLOCK
 #define BLOCK(i)				\
@@ -55,8 +53,6 @@ xor_pII_mmx_2(unsigned long bytes, unsigned long * __restrict p1,
 	  "+r" (p1), "+r" (p2)
 	:
 	: "memory");
-
-	kernel_fpu_end();
 }
 
 static void
@@ -66,8 +62,6 @@ xor_pII_mmx_3(unsigned long bytes, unsigned long * __restrict p1,
 {
 	unsigned long lines = bytes >> 7;
 
-	kernel_fpu_begin();
-
 	asm volatile(
 #undef BLOCK
 #define BLOCK(i)				\
@@ -105,8 +99,6 @@ xor_pII_mmx_3(unsigned long bytes, unsigned long * __restrict p1,
 	  "+r" (p1), "+r" (p2), "+r" (p3)
 	:
 	: "memory");
-
-	kernel_fpu_end();
 }
 
 static void
@@ -117,8 +109,6 @@ xor_pII_mmx_4(unsigned long bytes, unsigned long * __restrict p1,
 {
 	unsigned long lines = bytes >> 7;
 
-	kernel_fpu_begin();
-
 	asm volatile(
 #undef BLOCK
 #define BLOCK(i)				\
@@ -161,8 +151,6 @@ xor_pII_mmx_4(unsigned long bytes, unsigned long * __restrict p1,
 	  "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
 	:
 	: "memory");
-
-	kernel_fpu_end();
 }
 
 
@@ -175,8 +163,6 @@ xor_pII_mmx_5(unsigned long bytes, unsigned long * __restrict p1,
 {
 	unsigned long lines = bytes >> 7;
 
-	kernel_fpu_begin();
-
 	/* Make sure GCC forgets anything it knows about p4 or p5,
 	   such that it won't pass to the asm volatile below a
 	   register that is shared with any other variable.  That's
@@ -237,8 +223,6 @@ xor_pII_mmx_5(unsigned long bytes, unsigned long * __restrict p1,
 	   Clobber them just to be sure nobody does something stupid
 	   like assuming they have some legal value.  */
 	asm("" : "=r" (p4), "=r" (p5));
-
-	kernel_fpu_end();
 }
 
 #undef LD
@@ -255,8 +239,6 @@ xor_p5_mmx_2(unsigned long bytes, unsigned long * __restrict p1,
 {
 	unsigned long lines = bytes >> 6;
 
-	kernel_fpu_begin();
-
 	asm volatile(
 	" .align 32	             ;\n"
 	" 1:                         ;\n"
@@ -293,8 +275,6 @@ xor_p5_mmx_2(unsigned long bytes, unsigned long * __restrict p1,
 	  "+r" (p1), "+r" (p2)
 	:
 	: "memory");
-
-	kernel_fpu_end();
 }
 
 static void
@@ -304,8 +284,6 @@ xor_p5_mmx_3(unsigned long bytes, unsigned long * __restrict p1,
 {
 	unsigned long lines = bytes >> 6;
 
-	kernel_fpu_begin();
-
 	asm volatile(
 	" .align 32,0x90             ;\n"
 	" 1:                         ;\n"
@@ -351,8 +329,6 @@ xor_p5_mmx_3(unsigned long bytes, unsigned long * __restrict p1,
 	  "+r" (p1), "+r" (p2), "+r" (p3)
 	:
 	: "memory" );
-
-	kernel_fpu_end();
 }
 
 static void
@@ -363,8 +339,6 @@ xor_p5_mmx_4(unsigned long bytes, unsigned long * __restrict p1,
 {
 	unsigned long lines = bytes >> 6;
 
-	kernel_fpu_begin();
-
 	asm volatile(
 	" .align 32,0x90             ;\n"
 	" 1:                         ;\n"
@@ -419,8 +393,6 @@ xor_p5_mmx_4(unsigned long bytes, unsigned long * __restrict p1,
 	  "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
 	:
 	: "memory");
-
-	kernel_fpu_end();
 }
 
 static void
@@ -432,8 +404,6 @@ xor_p5_mmx_5(unsigned long bytes, unsigned long * __restrict p1,
 {
 	unsigned long lines = bytes >> 6;
 
-	kernel_fpu_begin();
-
 	/* Make sure GCC forgets anything it knows about p4 or p5,
 	   such that it won't pass to the asm volatile below a
 	   register that is shared with any other variable.  That's
@@ -510,22 +480,36 @@ xor_p5_mmx_5(unsigned long bytes, unsigned long * __restrict p1,
 	   Clobber them just to be sure nobody does something stupid
 	   like assuming they have some legal value.  */
 	asm("" : "=r" (p4), "=r" (p5));
+}
+
+DO_XOR_BLOCKS(pII_mmx_inner, xor_pII_mmx_2, xor_pII_mmx_3, xor_pII_mmx_4,
+		xor_pII_mmx_5);
 
+static void xor_gen_pII_mmx(void *dest, void **srcs, unsigned int src_cnt,
+		unsigned int bytes)
+{
+	kernel_fpu_begin();
+	xor_gen_pII_mmx_inner(dest, srcs, src_cnt, bytes);
 	kernel_fpu_end();
 }
 
 struct xor_block_template xor_block_pII_mmx = {
-	.name = "pII_mmx",
-	.do_2 = xor_pII_mmx_2,
-	.do_3 = xor_pII_mmx_3,
-	.do_4 = xor_pII_mmx_4,
-	.do_5 = xor_pII_mmx_5,
+	.name		= "pII_mmx",
+	.xor_gen	= xor_gen_pII_mmx,
 };
 
+DO_XOR_BLOCKS(p5_mmx_inner, xor_p5_mmx_2, xor_p5_mmx_3, xor_p5_mmx_4,
+		xor_p5_mmx_5);
+
+static void xor_gen_p5_mmx(void *dest, void **srcs, unsigned int src_cnt,
+		unsigned int bytes)
+{
+	kernel_fpu_begin();
+	xor_gen_p5_mmx_inner(dest, srcs, src_cnt, bytes);
+	kernel_fpu_end();
+}
+
 struct xor_block_template xor_block_p5_mmx = {
-	.name = "p5_mmx",
-	.do_2 = xor_p5_mmx_2,
-	.do_3 = xor_p5_mmx_3,
-	.do_4 = xor_p5_mmx_4,
-	.do_5 = xor_p5_mmx_5,
+	.name		= "p5_mmx",
+	.xor_gen	= xor_gen_p5_mmx,
 };
diff --git a/lib/raid/xor/x86/xor-sse.c b/lib/raid/xor/x86/xor-sse.c
index 5993ed688c1553..c6626ecae6ba5d 100644
--- a/lib/raid/xor/x86/xor-sse.c
+++ b/lib/raid/xor/x86/xor-sse.c
@@ -51,8 +51,6 @@ xor_sse_2(unsigned long bytes, unsigned long * __restrict p1,
 {
 	unsigned long lines = bytes >> 8;
 
-	kernel_fpu_begin();
-
 	asm volatile(
 #undef BLOCK
 #define BLOCK(i)					\
@@ -93,8 +91,6 @@ xor_sse_2(unsigned long bytes, unsigned long * __restrict p1,
 	  [p1] "+r" (p1), [p2] "+r" (p2)
 	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
 	: "memory");
-
-	kernel_fpu_end();
 }
 
 static void
@@ -103,8 +99,6 @@ xor_sse_2_pf64(unsigned long bytes, unsigned long * __restrict p1,
 {
 	unsigned long lines = bytes >> 8;
 
-	kernel_fpu_begin();
-
 	asm volatile(
 #undef BLOCK
 #define BLOCK(i)			\
@@ -128,8 +122,6 @@ xor_sse_2_pf64(unsigned long bytes, unsigned long * __restrict p1,
 	  [p1] "+r" (p1), [p2] "+r" (p2)
 	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
 	: "memory");
-
-	kernel_fpu_end();
 }
 
 static void
@@ -139,8 +131,6 @@ xor_sse_3(unsigned long bytes, unsigned long * __restrict p1,
 {
 	unsigned long lines = bytes >> 8;
 
-	kernel_fpu_begin();
-
 	asm volatile(
 #undef BLOCK
 #define BLOCK(i) \
@@ -188,8 +178,6 @@ xor_sse_3(unsigned long bytes, unsigned long * __restrict p1,
 	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
 	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
 	: "memory");
-
-	kernel_fpu_end();
 }
 
 static void
@@ -199,8 +187,6 @@ xor_sse_3_pf64(unsigned long bytes, unsigned long * __restrict p1,
 {
 	unsigned long lines = bytes >> 8;
 
-	kernel_fpu_begin();
-
 	asm volatile(
 #undef BLOCK
 #define BLOCK(i)			\
@@ -226,8 +212,6 @@ xor_sse_3_pf64(unsigned long bytes, unsigned long * __restrict p1,
 	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
 	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
 	: "memory");
-
-	kernel_fpu_end();
 }
 
 static void
@@ -238,8 +222,6 @@ xor_sse_4(unsigned long bytes, unsigned long * __restrict p1,
 {
 	unsigned long lines = bytes >> 8;
 
-	kernel_fpu_begin();
-
 	asm volatile(
 #undef BLOCK
 #define BLOCK(i) \
@@ -294,8 +276,6 @@ xor_sse_4(unsigned long bytes, unsigned long * __restrict p1,
 	  [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
 	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
 	: "memory");
-
-	kernel_fpu_end();
 }
 
 static void
@@ -306,8 +286,6 @@ xor_sse_4_pf64(unsigned long bytes, unsigned long * __restrict p1,
 {
 	unsigned long lines = bytes >> 8;
 
-	kernel_fpu_begin();
-
 	asm volatile(
 #undef BLOCK
 #define BLOCK(i)			\
@@ -335,8 +313,6 @@ xor_sse_4_pf64(unsigned long bytes, unsigned long * __restrict p1,
 	  [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
 	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
 	: "memory");
-
-	kernel_fpu_end();
 }
 
 static void
@@ -348,8 +324,6 @@ xor_sse_5(unsigned long bytes, unsigned long * __restrict p1,
 {
 	unsigned long lines = bytes >> 8;
 
-	kernel_fpu_begin();
-
 	asm volatile(
 #undef BLOCK
 #define BLOCK(i) \
@@ -411,8 +385,6 @@ xor_sse_5(unsigned long bytes, unsigned long * __restrict p1,
 	  [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
 	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
 	: "memory");
-
-	kernel_fpu_end();
 }
 
 static void
@@ -424,8 +396,6 @@ xor_sse_5_pf64(unsigned long bytes, unsigned long * __restrict p1,
 {
 	unsigned long lines = bytes >> 8;
 
-	kernel_fpu_begin();
-
 	asm volatile(
 #undef BLOCK
 #define BLOCK(i)			\
@@ -455,22 +425,35 @@ xor_sse_5_pf64(unsigned long bytes, unsigned long * __restrict p1,
 	  [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
 	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
 	: "memory");
+}
+
+DO_XOR_BLOCKS(sse_inner, xor_sse_2, xor_sse_3, xor_sse_4, xor_sse_5);
 
+static void xor_gen_sse(void *dest, void **srcs, unsigned int src_cnt,
+			unsigned int bytes)
+{
+	kernel_fpu_begin();
+	xor_gen_sse_inner(dest, srcs, src_cnt, bytes);
 	kernel_fpu_end();
 }
 
 struct xor_block_template xor_block_sse = {
-	.name = "sse",
-	.do_2 = xor_sse_2,
-	.do_3 = xor_sse_3,
-	.do_4 = xor_sse_4,
-	.do_5 = xor_sse_5,
+	.name		= "sse",
+	.xor_gen	= xor_gen_sse,
 };
 
+DO_XOR_BLOCKS(sse_pf64_inner, xor_sse_2_pf64, xor_sse_3_pf64, xor_sse_4_pf64,
+		xor_sse_5_pf64);
+
+static void xor_gen_sse_pf64(void *dest, void **srcs, unsigned int src_cnt,
+			unsigned int bytes)
+{
+	kernel_fpu_begin();
+	xor_gen_sse_pf64_inner(dest, srcs, src_cnt, bytes);
+	kernel_fpu_end();
+}
+
 struct xor_block_template xor_block_sse_pf64 = {
-	.name = "prefetch64-sse",
-	.do_2 = xor_sse_2_pf64,
-	.do_3 = xor_sse_3_pf64,
-	.do_4 = xor_sse_4_pf64,
-	.do_5 = xor_sse_5_pf64,
+	.name		= "prefetch64-sse",
+	.xor_gen	= xor_gen_sse_pf64,
 };
diff --git a/lib/raid/xor/xor-32regs-prefetch.c b/lib/raid/xor/xor-32regs-prefetch.c
index 2856a8e50cb840..ade2a7d8cbe2ae 100644
--- a/lib/raid/xor/xor-32regs-prefetch.c
+++ b/lib/raid/xor/xor-32regs-prefetch.c
@@ -258,10 +258,10 @@ xor_32regs_p_5(unsigned long bytes, unsigned long * __restrict p1,
 		goto once_more;
 }
 
+DO_XOR_BLOCKS(32regs_p, xor_32regs_p_2, xor_32regs_p_3, xor_32regs_p_4,
+		xor_32regs_p_5);
+
 struct xor_block_template xor_block_32regs_p = {
-	.name = "32regs_prefetch",
-	.do_2 = xor_32regs_p_2,
-	.do_3 = xor_32regs_p_3,
-	.do_4 = xor_32regs_p_4,
-	.do_5 = xor_32regs_p_5,
+	.name		= "32regs_prefetch",
+	.xor_gen	= xor_gen_32regs_p,
 };
diff --git a/lib/raid/xor/xor-32regs.c b/lib/raid/xor/xor-32regs.c
index cc44d64032fa77..acb4a10d1e95bd 100644
--- a/lib/raid/xor/xor-32regs.c
+++ b/lib/raid/xor/xor-32regs.c
@@ -209,10 +209,9 @@ xor_32regs_5(unsigned long bytes, unsigned long * __restrict p1,
 	} while (--lines > 0);
 }
 
+DO_XOR_BLOCKS(32regs, xor_32regs_2, xor_32regs_3, xor_32regs_4, xor_32regs_5);
+
 struct xor_block_template xor_block_32regs = {
-	.name = "32regs",
-	.do_2 = xor_32regs_2,
-	.do_3 = xor_32regs_3,
-	.do_4 = xor_32regs_4,
-	.do_5 = xor_32regs_5,
+	.name		= "32regs",
+	.xor_gen	= xor_gen_32regs,
 };
diff --git a/lib/raid/xor/xor-8regs-prefetch.c b/lib/raid/xor/xor-8regs-prefetch.c
index 1d53aec50d27b6..451527a951b1a2 100644
--- a/lib/raid/xor/xor-8regs-prefetch.c
+++ b/lib/raid/xor/xor-8regs-prefetch.c
@@ -136,10 +136,11 @@ xor_8regs_p_5(unsigned long bytes, unsigned long * __restrict p1,
 		goto once_more;
 }
 
+
+DO_XOR_BLOCKS(8regs_p, xor_8regs_p_2, xor_8regs_p_3, xor_8regs_p_4,
+		xor_8regs_p_5);
+
 struct xor_block_template xor_block_8regs_p = {
-	.name = "8regs_prefetch",
-	.do_2 = xor_8regs_p_2,
-	.do_3 = xor_8regs_p_3,
-	.do_4 = xor_8regs_p_4,
-	.do_5 = xor_8regs_p_5,
+	.name		= "8regs_prefetch",
+	.xor_gen	= xor_gen_8regs_p,
 };
diff --git a/lib/raid/xor/xor-8regs.c b/lib/raid/xor/xor-8regs.c
index 72a44e898c5513..1edaed8acffe60 100644
--- a/lib/raid/xor/xor-8regs.c
+++ b/lib/raid/xor/xor-8regs.c
@@ -94,11 +94,10 @@ xor_8regs_5(unsigned long bytes, unsigned long * __restrict p1,
 }
 
 #ifndef NO_TEMPLATE
+DO_XOR_BLOCKS(8regs, xor_8regs_2, xor_8regs_3, xor_8regs_4, xor_8regs_5);
+
 struct xor_block_template xor_block_8regs = {
-	.name = "8regs",
-	.do_2 = xor_8regs_2,
-	.do_3 = xor_8regs_3,
-	.do_4 = xor_8regs_4,
-	.do_5 = xor_8regs_5,
+	.name		= "8regs",
+	.xor_gen	= xor_gen_8regs,
 };
 #endif /* NO_TEMPLATE */
diff --git a/lib/raid/xor/xor-core.c b/lib/raid/xor/xor-core.c
index fdd0cf5584e7be..a9b984d47d8c1b 100644
--- a/lib/raid/xor/xor-core.c
+++ b/lib/raid/xor/xor-core.c
@@ -13,39 +13,9 @@
 #include <linux/preempt.h>
 #include "xor_impl.h"
 
-/* The xor routines to use.  */
+/* The xor routine to use.  */
 static struct xor_block_template *active_template;
 
-void
-xor_blocks(unsigned int src_count, unsigned int bytes, void *dest, void **srcs)
-{
-	unsigned long *p1, *p2, *p3, *p4;
-
-	lockdep_assert_preemption_enabled();
-
-	p1 = (unsigned long *) srcs[0];
-	if (src_count == 1) {
-		active_template->do_2(bytes, dest, p1);
-		return;
-	}
-
-	p2 = (unsigned long *) srcs[1];
-	if (src_count == 2) {
-		active_template->do_3(bytes, dest, p1, p2);
-		return;
-	}
-
-	p3 = (unsigned long *) srcs[2];
-	if (src_count == 3) {
-		active_template->do_4(bytes, dest, p1, p2, p3);
-		return;
-	}
-
-	p4 = (unsigned long *) srcs[3];
-	active_template->do_5(bytes, dest, p1, p2, p3, p4);
-}
-EXPORT_SYMBOL(xor_blocks);
-
 /**
  * xor_gen - generate RAID-style XOR information
  * @dest:	destination vector
@@ -63,18 +33,10 @@ EXPORT_SYMBOL(xor_blocks);
  */
 void xor_gen(void *dest, void **srcs, unsigned int src_cnt, unsigned int bytes)
 {
-	unsigned int src_off = 0;
-
+	lockdep_assert_preemption_enabled();
 	WARN_ON_ONCE(bytes & 511);
 
-	while (src_cnt > 0) {
-		unsigned int this_cnt = min(src_cnt, MAX_XOR_BLOCKS);
-
-		xor_blocks(this_cnt, bytes, dest, srcs + src_off);
-
-		src_cnt -= this_cnt;
-		src_off += this_cnt;
-	}
+	active_template->xor_gen(dest, srcs, src_cnt, bytes);
 }
 EXPORT_SYMBOL(xor_gen);
 
@@ -118,6 +80,7 @@ do_xor_speed(struct xor_block_template *tmpl, void *b1, void *b2)
 	int speed;
 	unsigned long reps;
 	ktime_t min, start, t0;
+	void *srcs[1] = { b2 };
 
 	preempt_disable();
 
@@ -128,7 +91,7 @@ do_xor_speed(struct xor_block_template *tmpl, void *b1, void *b2)
 		cpu_relax();
 	do {
 		mb(); /* prevent loop optimization */
-		tmpl->do_2(BENCH_SIZE, b1, b2);
+		tmpl->xor_gen(b1, srcs, 1, BENCH_SIZE);
 		mb();
 	} while (reps++ < REPS || (t0 = ktime_get()) == start);
 	min = ktime_sub(t0, start);
diff --git a/lib/raid/xor/xor_impl.h b/lib/raid/xor/xor_impl.h
index 44b6c99e2093e1..09ae2916f71ecb 100644
--- a/lib/raid/xor/xor_impl.h
+++ b/lib/raid/xor/xor_impl.h
@@ -3,27 +3,47 @@
 #define _XOR_IMPL_H
 
 #include <linux/init.h>
+#include <linux/minmax.h>
 
 struct xor_block_template {
 	struct xor_block_template *next;
 	const char *name;
 	int speed;
-	void (*do_2)(unsigned long, unsigned long * __restrict,
-		     const unsigned long * __restrict);
-	void (*do_3)(unsigned long, unsigned long * __restrict,
-		     const unsigned long * __restrict,
-		     const unsigned long * __restrict);
-	void (*do_4)(unsigned long, unsigned long * __restrict,
-		     const unsigned long * __restrict,
-		     const unsigned long * __restrict,
-		     const unsigned long * __restrict);
-	void (*do_5)(unsigned long, unsigned long * __restrict,
-		     const unsigned long * __restrict,
-		     const unsigned long * __restrict,
-		     const unsigned long * __restrict,
-		     const unsigned long * __restrict);
+	void (*xor_gen)(void *dest, void **srcs, unsigned int src_cnt,
+			unsigned int bytes);
 };
 
+#define __DO_XOR_BLOCKS(_name, _handle1, _handle2, _handle3, _handle4)	\
+void								\
+xor_gen_##_name(void *dest, void **srcs, unsigned int src_cnt,		\
+		unsigned int bytes)					\
+{									\
+	unsigned int src_off = 0;					\
+									\
+	while (src_cnt > 0) {						\
+		unsigned int this_cnt = min(src_cnt, 4);		\
+									\
+		if (this_cnt == 1)					\
+			_handle1(bytes, dest, srcs[src_off]);		\
+		else if (this_cnt == 2)					\
+			_handle2(bytes, dest, srcs[src_off],		\
+				srcs[src_off + 1]);			\
+		else if (this_cnt == 3)					\
+			_handle3(bytes, dest, srcs[src_off],		\
+				srcs[src_off + 1], srcs[src_off + 2]);	\
+		else							\
+			_handle4(bytes, dest, srcs[src_off],		\
+				srcs[src_off + 1], srcs[src_off + 2],	\
+				srcs[src_off + 3]);			\
+									\
+		src_cnt -= this_cnt;					\
+		src_off += this_cnt;					\
+	}								\
+}
+
+#define DO_XOR_BLOCKS(_name, _handle1, _handle2, _handle3, _handle4)	\
+	static __DO_XOR_BLOCKS(_name, _handle1, _handle2, _handle3, _handle4)
+
 /* generic implementations */
 extern struct xor_block_template xor_block_8regs;
 extern struct xor_block_template xor_block_32regs;

From e11e13d9a7619e77f71ca602392806df08fad182 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 24 Mar 2026 07:22:01 +0100
Subject: [PATCH 25/26] xor: use static_call for xor_gen

Avoid the indirect call for xor_generation by using a static_call.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Linux RISC-V bot <linux.riscv.bot@gmail.com>
---
 lib/raid/xor/xor-core.c | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/lib/raid/xor/xor-core.c b/lib/raid/xor/xor-core.c
index a9b984d47d8c1b..67dc8ade7f0bed 100644
--- a/lib/raid/xor/xor-core.c
+++ b/lib/raid/xor/xor-core.c
@@ -11,10 +11,10 @@
 #include <linux/raid/xor.h>
 #include <linux/jiffies.h>
 #include <linux/preempt.h>
+#include <linux/static_call.h>
 #include "xor_impl.h"
 
-/* The xor routine to use.  */
-static struct xor_block_template *active_template;
+DEFINE_STATIC_CALL_NULL(xor_gen_impl, *xor_block_8regs.xor_gen);
 
 /**
  * xor_gen - generate RAID-style XOR information
@@ -36,13 +36,13 @@ void xor_gen(void *dest, void **srcs, unsigned int src_cnt, unsigned int bytes)
 	lockdep_assert_preemption_enabled();
 	WARN_ON_ONCE(bytes & 511);
 
-	active_template->xor_gen(dest, srcs, src_cnt, bytes);
+	static_call(xor_gen_impl)(dest, srcs, src_cnt, bytes);
 }
 EXPORT_SYMBOL(xor_gen);
 
 /* Set of all registered templates.  */
 static struct xor_block_template *__initdata template_list;
-static bool __initdata xor_forced = false;
+static struct xor_block_template *forced_template;
 
 /**
  * xor_register - register a XOR template
@@ -68,7 +68,7 @@ void __init xor_register(struct xor_block_template *tmpl)
  */
 void __init xor_force(struct xor_block_template *tmpl)
 {
-	active_template = tmpl;
+	forced_template = tmpl;
 }
 
 #define BENCH_SIZE	4096
@@ -110,7 +110,7 @@ static int __init calibrate_xor_blocks(void)
 	void *b1, *b2;
 	struct xor_block_template *f, *fastest;
 
-	if (xor_forced)
+	if (forced_template)
 		return 0;
 
 	b1 = (void *) __get_free_pages(GFP_KERNEL, 2);
@@ -127,7 +127,7 @@ static int __init calibrate_xor_blocks(void)
 		if (f->speed > fastest->speed)
 			fastest = f;
 	}
-	active_template = fastest;
+	static_call_update(xor_gen_impl, fastest->xor_gen);
 	pr_info("xor: using function: %s (%d MB/sec)\n",
 	       fastest->name, fastest->speed);
 
@@ -155,10 +155,10 @@ static int __init xor_init(void)
 	 * If this arch/cpu has a short-circuited selection, don't loop through
 	 * all the possible functions, just use the best one.
 	 */
-	if (active_template) {
+	if (forced_template) {
 		pr_info("xor: automatically using best checksumming function   %-10s\n",
-			active_template->name);
-		xor_forced = true;
+			forced_template->name);
+		static_call_update(xor_gen_impl, forced_template->xor_gen);
 		return 0;
 	}
 
@@ -169,7 +169,7 @@ static int __init xor_init(void)
 	 * Pick the first template as the temporary default until calibration
 	 * happens.
 	 */
-	active_template = template_list;
+	static_call_update(xor_gen_impl, template_list->xor_gen);
 	return 0;
 #endif
 }

From c0fe2371448effdccd89e41f21e125f697ace439 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 24 Mar 2026 07:22:02 +0100
Subject: [PATCH 26/26] xor: add a kunit test case

Add a test case for the XOR routines loosely based on the CRC kunit
test.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Linux RISC-V bot <linux.riscv.bot@gmail.com>
---
 lib/raid/.kunitconfig          |   3 +
 lib/raid/Kconfig               |  11 ++
 lib/raid/xor/Makefile          |   2 +-
 lib/raid/xor/tests/Makefile    |   3 +
 lib/raid/xor/tests/xor_kunit.c | 187 +++++++++++++++++++++++++++++++++
 5 files changed, 205 insertions(+), 1 deletion(-)
 create mode 100644 lib/raid/.kunitconfig
 create mode 100644 lib/raid/xor/tests/Makefile
 create mode 100644 lib/raid/xor/tests/xor_kunit.c

diff --git a/lib/raid/.kunitconfig b/lib/raid/.kunitconfig
new file mode 100644
index 00000000000000..351d22ed19540d
--- /dev/null
+++ b/lib/raid/.kunitconfig
@@ -0,0 +1,3 @@
+CONFIG_KUNIT=y
+CONFIG_BTRFS_FS=y
+CONFIG_XOR_KUNIT_TEST=y
diff --git a/lib/raid/Kconfig b/lib/raid/Kconfig
index 2ccf0c60de01f0..1fc4b00e0d71de 100644
--- a/lib/raid/Kconfig
+++ b/lib/raid/Kconfig
@@ -17,3 +17,14 @@ config XOR_BLOCKS_ARCH
 	default y if X86_32
 	default y if X86_64
 	bool
+
+config XOR_KUNIT_TEST
+	tristate "KUnit tests for xor_gen" if !KUNIT_ALL_TESTS
+	depends on KUNIT
+	depends on XOR_BLOCKS
+	default KUNIT_ALL_TESTS
+	help
+	  Unit tests for the XOR library functions.
+
+	  This is intended to help people writing architecture-specific
+	  optimized versions.  If unsure, say N.
diff --git a/lib/raid/xor/Makefile b/lib/raid/xor/Makefile
index df55823c4d827c..4d633dfd5b90cf 100644
--- a/lib/raid/xor/Makefile
+++ b/lib/raid/xor/Makefile
@@ -29,7 +29,7 @@ xor-$(CONFIG_SPARC64)		+= sparc/xor-sparc64.o sparc/xor-sparc64-glue.o
 xor-$(CONFIG_S390)		+= s390/xor.o
 xor-$(CONFIG_X86_32)		+= x86/xor-avx.o x86/xor-sse.o x86/xor-mmx.o
 xor-$(CONFIG_X86_64)		+= x86/xor-avx.o x86/xor-sse.o
-
+obj-y				+= tests/
 
 CFLAGS_arm/xor-neon.o		+= $(CC_FLAGS_FPU)
 CFLAGS_REMOVE_arm/xor-neon.o	+= $(CC_FLAGS_NO_FPU)
diff --git a/lib/raid/xor/tests/Makefile b/lib/raid/xor/tests/Makefile
new file mode 100644
index 00000000000000..661e8f6ffd1f35
--- /dev/null
+++ b/lib/raid/xor/tests/Makefile
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+obj-$(CONFIG_XOR_KUNIT_TEST) += xor_kunit.o
diff --git a/lib/raid/xor/tests/xor_kunit.c b/lib/raid/xor/tests/xor_kunit.c
new file mode 100644
index 00000000000000..01cbdf44f6b033
--- /dev/null
+++ b/lib/raid/xor/tests/xor_kunit.c
@@ -0,0 +1,187 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Unit test the XOR library functions.
+ *
+ * Copyright 2024 Google LLC
+ * Copyright 2026 Christoph Hellwig
+ *
+ * Based on the CRC tests by Eric Biggers <ebiggers@google.com>.
+ */
+#include <kunit/test.h>
+#include <linux/prandom.h>
+#include <linux/string_choices.h>
+#include <linux/vmalloc.h>
+#include <linux/raid/xor.h>
+
+#define XOR_KUNIT_SEED			42
+#define XOR_KUNIT_MAX_BYTES		16384
+#define XOR_KUNIT_MAX_BUFFERS		64
+#define XOR_KUNIT_NUM_TEST_ITERS	1000
+
+static struct rnd_state rng;
+static void *test_buffers[XOR_KUNIT_MAX_BUFFERS];
+static void *test_dest;
+static void *test_ref;
+static size_t test_buflen;
+
+static u32 rand32(void)
+{
+	return prandom_u32_state(&rng);
+}
+
+/* Reference implementation using dumb byte-wise XOR */
+static void xor_ref(void *dest, void **srcs, unsigned int src_cnt,
+		unsigned int bytes)
+{
+	unsigned int off, idx;
+	u8 *d = dest;
+
+	for (off = 0; off < bytes; off++) {
+		for (idx = 0; idx < src_cnt; idx++) {
+			u8 *src = srcs[idx];
+
+			d[off] ^= src[off];
+		}
+	}
+}
+
+/* Generate a random length that is a multiple of 512. */
+static unsigned int random_length(unsigned int max_length)
+{
+	return (rand32() % (max_length + 1)) & ~511;
+}
+
+/* Generate a random alignment that is a multiple of 64. */
+static unsigned int random_alignment(unsigned int max_alignment)
+{
+	return (rand32() % (max_alignment + 1)) & ~63;
+}
+
+static void xor_generate_random_data(void)
+{
+	int i;
+
+	prandom_bytes_state(&rng, test_dest, test_buflen);
+	memcpy(test_ref, test_dest, test_buflen);
+	for (i = 0; i < XOR_KUNIT_MAX_BUFFERS; i++)
+		prandom_bytes_state(&rng, test_buffers[i], test_buflen);
+}
+
+/* Test that xor_gen gives the same result as a reference implementation. */
+static void xor_test(struct kunit *test)
+{
+	void *aligned_buffers[XOR_KUNIT_MAX_BUFFERS];
+	size_t i;
+
+	for (i = 0; i < XOR_KUNIT_NUM_TEST_ITERS; i++) {
+		unsigned int nr_buffers =
+			(rand32() % XOR_KUNIT_MAX_BUFFERS) + 1;
+		unsigned int len = random_length(XOR_KUNIT_MAX_BYTES);
+		unsigned int max_alignment, align = 0;
+		void *buffers;
+
+		if (rand32() % 8 == 0)
+			/* Refresh the data occasionally. */
+			xor_generate_random_data();
+
+		/*
+		 * If we're not using the entire buffer size, inject randomize
+		 * alignment into the buffer.
+		 */
+		max_alignment = XOR_KUNIT_MAX_BYTES - len;
+		if (max_alignment == 0) {
+			buffers = test_buffers;
+		} else if (rand32() % 2 == 0) {
+			/* Use random alignments mod 64 */
+			int j;
+
+			for (j = 0; j < nr_buffers; j++)
+				aligned_buffers[j] = test_buffers[j] +
+					random_alignment(max_alignment);
+			buffers = aligned_buffers;
+			align = random_alignment(max_alignment);
+		} else {
+			/* Go up to the guard page, to catch buffer overreads */
+			int j;
+
+			align = test_buflen - len;
+			for (j = 0; j < nr_buffers; j++)
+				aligned_buffers[j] = test_buffers[j] + align;
+			buffers = aligned_buffers;
+		}
+
+		/*
+		 * Compute the XOR, and verify that it equals the XOR computed
+		 * by a simple byte-at-a-time reference implementation.
+		 */
+		xor_ref(test_ref + align, buffers, nr_buffers, len);
+		xor_gen(test_dest + align, buffers, nr_buffers, len);
+		KUNIT_EXPECT_MEMEQ_MSG(test, test_ref + align,
+				test_dest + align, len,
+				"Wrong result with buffers=%u, len=%u, unaligned=%s, at_end=%s",
+				nr_buffers, len,
+				str_yes_no(max_alignment),
+				str_yes_no(align + len == test_buflen));
+	}
+}
+
+static struct kunit_case xor_test_cases[] = {
+	KUNIT_CASE(xor_test),
+	{},
+};
+
+static int xor_suite_init(struct kunit_suite *suite)
+{
+	int i;
+
+	/*
+	 * Allocate the test buffer using vmalloc() with a page-aligned length
+	 * so that it is immediately followed by a guard page.  This allows
+	 * buffer overreads to be detected, even in assembly code.
+	 */
+	test_buflen = round_up(XOR_KUNIT_MAX_BYTES, PAGE_SIZE);
+	test_ref = vmalloc(test_buflen);
+	if (!test_ref)
+		return -ENOMEM;
+	test_dest = vmalloc(test_buflen);
+	if (!test_dest)
+		goto out_free_ref;
+	for (i = 0; i < XOR_KUNIT_MAX_BUFFERS; i++) {
+		test_buffers[i] = vmalloc(test_buflen);
+		if (!test_buffers[i])
+			goto out_free_buffers;
+	}
+
+	prandom_seed_state(&rng, XOR_KUNIT_SEED);
+	xor_generate_random_data();
+	return 0;
+
+out_free_buffers:
+	while (--i >= 0)
+		vfree(test_buffers[i]);
+	vfree(test_dest);
+out_free_ref:
+	vfree(test_ref);
+	return -ENOMEM;
+}
+
+static void xor_suite_exit(struct kunit_suite *suite)
+{
+	int i;
+
+	vfree(test_ref);
+	vfree(test_dest);
+	for (i = 0; i < XOR_KUNIT_MAX_BUFFERS; i++)
+		vfree(test_buffers[i]);
+}
+
+static struct kunit_suite xor_test_suite = {
+	.name		= "xor",
+	.test_cases	= xor_test_cases,
+	.suite_init	= xor_suite_init,
+	.suite_exit	= xor_suite_exit,
+};
+kunit_test_suite(xor_test_suite);
+
+MODULE_DESCRIPTION("Unit test for the XOR library functions");
+MODULE_LICENSE("GPL");