diff --git a/arch/arm/lib/Makefile b/arch/arm/lib/Makefile index 0ca5aae1bcc3e3..9295055cdfc92b 100644 --- a/arch/arm/lib/Makefile +++ b/arch/arm/lib/Makefile @@ -39,9 +39,4 @@ endif $(obj)/csumpartialcopy.o: $(obj)/csumpartialcopygeneric.S $(obj)/csumpartialcopyuser.o: $(obj)/csumpartialcopygeneric.S -ifeq ($(CONFIG_KERNEL_MODE_NEON),y) - CFLAGS_xor-neon.o += $(CC_FLAGS_FPU) - obj-$(CONFIG_XOR_BLOCKS) += xor-neon.o -endif - obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o diff --git a/arch/arm64/include/asm/xor.h b/arch/arm64/include/asm/xor.h deleted file mode 100644 index c38e3d017a79ec..00000000000000 --- a/arch/arm64/include/asm/xor.h +++ /dev/null @@ -1,73 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * arch/arm64/include/asm/xor.h - * - * Authors: Jackie Liu - * Copyright (C) 2018,Tianjin KYLIN Information Technology Co., Ltd. - */ - -#include -#include -#include -#include - -#ifdef CONFIG_KERNEL_MODE_NEON - -extern struct xor_block_template const xor_block_inner_neon; - -static void -xor_neon_2(unsigned long bytes, unsigned long * __restrict p1, - const unsigned long * __restrict p2) -{ - scoped_ksimd() - xor_block_inner_neon.do_2(bytes, p1, p2); -} - -static void -xor_neon_3(unsigned long bytes, unsigned long * __restrict p1, - const unsigned long * __restrict p2, - const unsigned long * __restrict p3) -{ - scoped_ksimd() - xor_block_inner_neon.do_3(bytes, p1, p2, p3); -} - -static void -xor_neon_4(unsigned long bytes, unsigned long * __restrict p1, - const unsigned long * __restrict p2, - const unsigned long * __restrict p3, - const unsigned long * __restrict p4) -{ - scoped_ksimd() - xor_block_inner_neon.do_4(bytes, p1, p2, p3, p4); -} - -static void -xor_neon_5(unsigned long bytes, unsigned long * __restrict p1, - const unsigned long * __restrict p2, - const unsigned long * __restrict p3, - const unsigned long * __restrict p4, - const unsigned long * __restrict p5) -{ - scoped_ksimd() - xor_block_inner_neon.do_5(bytes, p1, p2, p3, 
p4, p5); -} - -static struct xor_block_template xor_block_arm64 = { - .name = "arm64_neon", - .do_2 = xor_neon_2, - .do_3 = xor_neon_3, - .do_4 = xor_neon_4, - .do_5 = xor_neon_5 -}; -#undef XOR_TRY_TEMPLATES -#define XOR_TRY_TEMPLATES \ - do { \ - xor_speed(&xor_block_8regs); \ - xor_speed(&xor_block_32regs); \ - if (cpu_has_neon()) { \ - xor_speed(&xor_block_arm64);\ - } \ - } while (0) - -#endif /* ! CONFIG_KERNEL_MODE_NEON */ diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile index 633e5223d944d7..448c917494f305 100644 --- a/arch/arm64/lib/Makefile +++ b/arch/arm64/lib/Makefile @@ -5,12 +5,6 @@ lib-y := clear_user.o delay.o copy_from_user.o \ memset.o memcmp.o strcmp.o strncmp.o strlen.o \ strnlen.o strchr.o strrchr.o tishift.o -ifeq ($(CONFIG_KERNEL_MODE_NEON), y) -obj-$(CONFIG_XOR_BLOCKS) += xor-neon.o -CFLAGS_xor-neon.o += $(CC_FLAGS_FPU) -CFLAGS_REMOVE_xor-neon.o += $(CC_FLAGS_NO_FPU) -endif - lib-$(CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE) += uaccess_flushcache.o obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o diff --git a/arch/loongarch/include/asm/xor.h b/arch/loongarch/include/asm/xor.h deleted file mode 100644 index 12467fffee4687..00000000000000 --- a/arch/loongarch/include/asm/xor.h +++ /dev/null @@ -1,68 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * Copyright (C) 2023 WANG Xuerui - */ -#ifndef _ASM_LOONGARCH_XOR_H -#define _ASM_LOONGARCH_XOR_H - -#include -#include - -#ifdef CONFIG_CPU_HAS_LSX -static struct xor_block_template xor_block_lsx = { - .name = "lsx", - .do_2 = xor_lsx_2, - .do_3 = xor_lsx_3, - .do_4 = xor_lsx_4, - .do_5 = xor_lsx_5, -}; - -#define XOR_SPEED_LSX() \ - do { \ - if (cpu_has_lsx) \ - xor_speed(&xor_block_lsx); \ - } while (0) -#else /* CONFIG_CPU_HAS_LSX */ -#define XOR_SPEED_LSX() -#endif /* CONFIG_CPU_HAS_LSX */ - -#ifdef CONFIG_CPU_HAS_LASX -static struct xor_block_template xor_block_lasx = { - .name = "lasx", - .do_2 = xor_lasx_2, - .do_3 = xor_lasx_3, - .do_4 = xor_lasx_4, - .do_5 = 
xor_lasx_5, -}; - -#define XOR_SPEED_LASX() \ - do { \ - if (cpu_has_lasx) \ - xor_speed(&xor_block_lasx); \ - } while (0) -#else /* CONFIG_CPU_HAS_LASX */ -#define XOR_SPEED_LASX() -#endif /* CONFIG_CPU_HAS_LASX */ - -/* - * For grins, also test the generic routines. - * - * More importantly: it cannot be ruled out at this point of time, that some - * future (maybe reduced) models could run the vector algorithms slower than - * the scalar ones, maybe for errata or micro-op reasons. It may be - * appropriate to revisit this after one or two more uarch generations. - */ -#include - -#undef XOR_TRY_TEMPLATES -#define XOR_TRY_TEMPLATES \ -do { \ - xor_speed(&xor_block_8regs); \ - xor_speed(&xor_block_8regs_p); \ - xor_speed(&xor_block_32regs); \ - xor_speed(&xor_block_32regs_p); \ - XOR_SPEED_LSX(); \ - XOR_SPEED_LASX(); \ -} while (0) - -#endif /* _ASM_LOONGARCH_XOR_H */ diff --git a/arch/loongarch/include/asm/xor_simd.h b/arch/loongarch/include/asm/xor_simd.h deleted file mode 100644 index 471b96332f3817..00000000000000 --- a/arch/loongarch/include/asm/xor_simd.h +++ /dev/null @@ -1,34 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * Copyright (C) 2023 WANG Xuerui - */ -#ifndef _ASM_LOONGARCH_XOR_SIMD_H -#define _ASM_LOONGARCH_XOR_SIMD_H - -#ifdef CONFIG_CPU_HAS_LSX -void xor_lsx_2(unsigned long bytes, unsigned long * __restrict p1, - const unsigned long * __restrict p2); -void xor_lsx_3(unsigned long bytes, unsigned long * __restrict p1, - const unsigned long * __restrict p2, const unsigned long * __restrict p3); -void xor_lsx_4(unsigned long bytes, unsigned long * __restrict p1, - const unsigned long * __restrict p2, const unsigned long * __restrict p3, - const unsigned long * __restrict p4); -void xor_lsx_5(unsigned long bytes, unsigned long * __restrict p1, - const unsigned long * __restrict p2, const unsigned long * __restrict p3, - const unsigned long * __restrict p4, const unsigned long * __restrict p5); -#endif /* CONFIG_CPU_HAS_LSX */ - 
-#ifdef CONFIG_CPU_HAS_LASX -void xor_lasx_2(unsigned long bytes, unsigned long * __restrict p1, - const unsigned long * __restrict p2); -void xor_lasx_3(unsigned long bytes, unsigned long * __restrict p1, - const unsigned long * __restrict p2, const unsigned long * __restrict p3); -void xor_lasx_4(unsigned long bytes, unsigned long * __restrict p1, - const unsigned long * __restrict p2, const unsigned long * __restrict p3, - const unsigned long * __restrict p4); -void xor_lasx_5(unsigned long bytes, unsigned long * __restrict p1, - const unsigned long * __restrict p2, const unsigned long * __restrict p3, - const unsigned long * __restrict p4, const unsigned long * __restrict p5); -#endif /* CONFIG_CPU_HAS_LASX */ - -#endif /* _ASM_LOONGARCH_XOR_SIMD_H */ diff --git a/arch/loongarch/lib/Makefile b/arch/loongarch/lib/Makefile index ccea3bbd435313..827a88529a425c 100644 --- a/arch/loongarch/lib/Makefile +++ b/arch/loongarch/lib/Makefile @@ -8,6 +8,4 @@ lib-y += delay.o memset.o memcpy.o memmove.o \ obj-$(CONFIG_ARCH_SUPPORTS_INT128) += tishift.o -obj-$(CONFIG_CPU_HAS_LSX) += xor_simd.o xor_simd_glue.o - obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o diff --git a/arch/loongarch/lib/xor_simd_glue.c b/arch/loongarch/lib/xor_simd_glue.c deleted file mode 100644 index 393f689dbcf678..00000000000000 --- a/arch/loongarch/lib/xor_simd_glue.c +++ /dev/null @@ -1,72 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * LoongArch SIMD XOR operations - * - * Copyright (C) 2023 WANG Xuerui - */ - -#include -#include -#include -#include -#include "xor_simd.h" - -#define MAKE_XOR_GLUE_2(flavor) \ -void xor_##flavor##_2(unsigned long bytes, unsigned long * __restrict p1, \ - const unsigned long * __restrict p2) \ -{ \ - kernel_fpu_begin(); \ - __xor_##flavor##_2(bytes, p1, p2); \ - kernel_fpu_end(); \ -} \ -EXPORT_SYMBOL_GPL(xor_##flavor##_2) - -#define MAKE_XOR_GLUE_3(flavor) \ -void xor_##flavor##_3(unsigned long bytes, unsigned long * __restrict p1, \ - const 
unsigned long * __restrict p2, \ - const unsigned long * __restrict p3) \ -{ \ - kernel_fpu_begin(); \ - __xor_##flavor##_3(bytes, p1, p2, p3); \ - kernel_fpu_end(); \ -} \ -EXPORT_SYMBOL_GPL(xor_##flavor##_3) - -#define MAKE_XOR_GLUE_4(flavor) \ -void xor_##flavor##_4(unsigned long bytes, unsigned long * __restrict p1, \ - const unsigned long * __restrict p2, \ - const unsigned long * __restrict p3, \ - const unsigned long * __restrict p4) \ -{ \ - kernel_fpu_begin(); \ - __xor_##flavor##_4(bytes, p1, p2, p3, p4); \ - kernel_fpu_end(); \ -} \ -EXPORT_SYMBOL_GPL(xor_##flavor##_4) - -#define MAKE_XOR_GLUE_5(flavor) \ -void xor_##flavor##_5(unsigned long bytes, unsigned long * __restrict p1, \ - const unsigned long * __restrict p2, \ - const unsigned long * __restrict p3, \ - const unsigned long * __restrict p4, \ - const unsigned long * __restrict p5) \ -{ \ - kernel_fpu_begin(); \ - __xor_##flavor##_5(bytes, p1, p2, p3, p4, p5); \ - kernel_fpu_end(); \ -} \ -EXPORT_SYMBOL_GPL(xor_##flavor##_5) - -#define MAKE_XOR_GLUES(flavor) \ - MAKE_XOR_GLUE_2(flavor); \ - MAKE_XOR_GLUE_3(flavor); \ - MAKE_XOR_GLUE_4(flavor); \ - MAKE_XOR_GLUE_5(flavor) - -#ifdef CONFIG_CPU_HAS_LSX -MAKE_XOR_GLUES(lsx); -#endif - -#ifdef CONFIG_CPU_HAS_LASX -MAKE_XOR_GLUES(lasx); -#endif diff --git a/arch/powerpc/include/asm/xor.h b/arch/powerpc/include/asm/xor.h deleted file mode 100644 index 37d05c11d09cda..00000000000000 --- a/arch/powerpc/include/asm/xor.h +++ /dev/null @@ -1,47 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * - * Copyright (C) IBM Corporation, 2012 - * - * Author: Anton Blanchard - */ -#ifndef _ASM_POWERPC_XOR_H -#define _ASM_POWERPC_XOR_H - -#ifdef CONFIG_ALTIVEC - -#include -#include -#include - -static struct xor_block_template xor_block_altivec = { - .name = "altivec", - .do_2 = xor_altivec_2, - .do_3 = xor_altivec_3, - .do_4 = xor_altivec_4, - .do_5 = xor_altivec_5, -}; - -#define XOR_SPEED_ALTIVEC() \ - do { \ - if (cpu_has_feature(CPU_FTR_ALTIVEC)) \ 
- xor_speed(&xor_block_altivec); \ - } while (0) -#else -#define XOR_SPEED_ALTIVEC() -#endif - -/* Also try the generic routines. */ -#include - -#undef XOR_TRY_TEMPLATES -#define XOR_TRY_TEMPLATES \ -do { \ - xor_speed(&xor_block_8regs); \ - xor_speed(&xor_block_8regs_p); \ - xor_speed(&xor_block_32regs); \ - xor_speed(&xor_block_32regs_p); \ - XOR_SPEED_ALTIVEC(); \ -} while (0) - -#endif /* _ASM_POWERPC_XOR_H */ diff --git a/arch/powerpc/include/asm/xor_altivec.h b/arch/powerpc/include/asm/xor_altivec.h deleted file mode 100644 index 294620a25f8025..00000000000000 --- a/arch/powerpc/include/asm/xor_altivec.h +++ /dev/null @@ -1,22 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_POWERPC_XOR_ALTIVEC_H -#define _ASM_POWERPC_XOR_ALTIVEC_H - -#ifdef CONFIG_ALTIVEC -void xor_altivec_2(unsigned long bytes, unsigned long * __restrict p1, - const unsigned long * __restrict p2); -void xor_altivec_3(unsigned long bytes, unsigned long * __restrict p1, - const unsigned long * __restrict p2, - const unsigned long * __restrict p3); -void xor_altivec_4(unsigned long bytes, unsigned long * __restrict p1, - const unsigned long * __restrict p2, - const unsigned long * __restrict p3, - const unsigned long * __restrict p4); -void xor_altivec_5(unsigned long bytes, unsigned long * __restrict p1, - const unsigned long * __restrict p2, - const unsigned long * __restrict p3, - const unsigned long * __restrict p4, - const unsigned long * __restrict p5); - -#endif -#endif /* _ASM_POWERPC_XOR_ALTIVEC_H */ diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile index f14ecab674a340..002edc3f01d5c5 100644 --- a/arch/powerpc/lib/Makefile +++ b/arch/powerpc/lib/Makefile @@ -73,9 +73,4 @@ obj-$(CONFIG_PPC_LIB_RHEAP) += rheap.o obj-$(CONFIG_FTR_FIXUP_SELFTEST) += feature-fixups-test.o -obj-$(CONFIG_ALTIVEC) += xor_vmx.o xor_vmx_glue.o -CFLAGS_xor_vmx.o += -mhard-float -maltivec $(call cc-option,-mabi=altivec) -# Enable -CFLAGS_xor_vmx.o += -isystem $(shell $(CC) 
-print-file-name=include) - obj-$(CONFIG_PPC64) += $(obj64-y) diff --git a/arch/powerpc/lib/xor_vmx.h b/arch/powerpc/lib/xor_vmx.h deleted file mode 100644 index 573c41d90dac52..00000000000000 --- a/arch/powerpc/lib/xor_vmx.h +++ /dev/null @@ -1,22 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Simple interface to link xor_vmx.c and xor_vmx_glue.c - * - * Separating these file ensures that no altivec instructions are run - * outside of the enable/disable altivec block. - */ - -void __xor_altivec_2(unsigned long bytes, unsigned long * __restrict p1, - const unsigned long * __restrict p2); -void __xor_altivec_3(unsigned long bytes, unsigned long * __restrict p1, - const unsigned long * __restrict p2, - const unsigned long * __restrict p3); -void __xor_altivec_4(unsigned long bytes, unsigned long * __restrict p1, - const unsigned long * __restrict p2, - const unsigned long * __restrict p3, - const unsigned long * __restrict p4); -void __xor_altivec_5(unsigned long bytes, unsigned long * __restrict p1, - const unsigned long * __restrict p2, - const unsigned long * __restrict p3, - const unsigned long * __restrict p4, - const unsigned long * __restrict p5); diff --git a/arch/powerpc/lib/xor_vmx_glue.c b/arch/powerpc/lib/xor_vmx_glue.c deleted file mode 100644 index 35d917ece4d1e4..00000000000000 --- a/arch/powerpc/lib/xor_vmx_glue.c +++ /dev/null @@ -1,63 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Altivec XOR operations - * - * Copyright 2017 IBM Corp. 
- */ - -#include -#include -#include -#include -#include -#include "xor_vmx.h" - -void xor_altivec_2(unsigned long bytes, unsigned long * __restrict p1, - const unsigned long * __restrict p2) -{ - preempt_disable(); - enable_kernel_altivec(); - __xor_altivec_2(bytes, p1, p2); - disable_kernel_altivec(); - preempt_enable(); -} -EXPORT_SYMBOL(xor_altivec_2); - -void xor_altivec_3(unsigned long bytes, unsigned long * __restrict p1, - const unsigned long * __restrict p2, - const unsigned long * __restrict p3) -{ - preempt_disable(); - enable_kernel_altivec(); - __xor_altivec_3(bytes, p1, p2, p3); - disable_kernel_altivec(); - preempt_enable(); -} -EXPORT_SYMBOL(xor_altivec_3); - -void xor_altivec_4(unsigned long bytes, unsigned long * __restrict p1, - const unsigned long * __restrict p2, - const unsigned long * __restrict p3, - const unsigned long * __restrict p4) -{ - preempt_disable(); - enable_kernel_altivec(); - __xor_altivec_4(bytes, p1, p2, p3, p4); - disable_kernel_altivec(); - preempt_enable(); -} -EXPORT_SYMBOL(xor_altivec_4); - -void xor_altivec_5(unsigned long bytes, unsigned long * __restrict p1, - const unsigned long * __restrict p2, - const unsigned long * __restrict p3, - const unsigned long * __restrict p4, - const unsigned long * __restrict p5) -{ - preempt_disable(); - enable_kernel_altivec(); - __xor_altivec_5(bytes, p1, p2, p3, p4, p5); - disable_kernel_altivec(); - preempt_enable(); -} -EXPORT_SYMBOL(xor_altivec_5); diff --git a/arch/riscv/include/asm/xor.h b/arch/riscv/include/asm/xor.h deleted file mode 100644 index 96011861e46b4d..00000000000000 --- a/arch/riscv/include/asm/xor.h +++ /dev/null @@ -1,68 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * Copyright (C) 2021 SiFive - */ - -#include -#include -#ifdef CONFIG_RISCV_ISA_V -#include -#include -#include - -static void xor_vector_2(unsigned long bytes, unsigned long *__restrict p1, - const unsigned long *__restrict p2) -{ - kernel_vector_begin(); - xor_regs_2_(bytes, p1, p2); 
- kernel_vector_end(); -} - -static void xor_vector_3(unsigned long bytes, unsigned long *__restrict p1, - const unsigned long *__restrict p2, - const unsigned long *__restrict p3) -{ - kernel_vector_begin(); - xor_regs_3_(bytes, p1, p2, p3); - kernel_vector_end(); -} - -static void xor_vector_4(unsigned long bytes, unsigned long *__restrict p1, - const unsigned long *__restrict p2, - const unsigned long *__restrict p3, - const unsigned long *__restrict p4) -{ - kernel_vector_begin(); - xor_regs_4_(bytes, p1, p2, p3, p4); - kernel_vector_end(); -} - -static void xor_vector_5(unsigned long bytes, unsigned long *__restrict p1, - const unsigned long *__restrict p2, - const unsigned long *__restrict p3, - const unsigned long *__restrict p4, - const unsigned long *__restrict p5) -{ - kernel_vector_begin(); - xor_regs_5_(bytes, p1, p2, p3, p4, p5); - kernel_vector_end(); -} - -static struct xor_block_template xor_block_rvv = { - .name = "rvv", - .do_2 = xor_vector_2, - .do_3 = xor_vector_3, - .do_4 = xor_vector_4, - .do_5 = xor_vector_5 -}; - -#undef XOR_TRY_TEMPLATES -#define XOR_TRY_TEMPLATES \ - do { \ - xor_speed(&xor_block_8regs); \ - xor_speed(&xor_block_32regs); \ - if (has_vector()) { \ - xor_speed(&xor_block_rvv);\ - } \ - } while (0) -#endif diff --git a/arch/riscv/lib/Makefile b/arch/riscv/lib/Makefile index bbc031124974e9..e220c35764ebd6 100644 --- a/arch/riscv/lib/Makefile +++ b/arch/riscv/lib/Makefile @@ -16,5 +16,4 @@ lib-$(CONFIG_MMU) += uaccess.o lib-$(CONFIG_64BIT) += tishift.o lib-$(CONFIG_RISCV_ISA_ZICBOZ) += clear_page.o obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o -lib-$(CONFIG_RISCV_ISA_V) += xor.o lib-$(CONFIG_RISCV_ISA_V) += riscv_v_helpers.o diff --git a/arch/s390/include/asm/xor.h b/arch/s390/include/asm/xor.h deleted file mode 100644 index 857d6759b67f0d..00000000000000 --- a/arch/s390/include/asm/xor.h +++ /dev/null @@ -1,21 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Optimited xor routines - * - * Copyright IBM Corp. 
2016 - * Author(s): Martin Schwidefsky - */ -#ifndef _ASM_S390_XOR_H -#define _ASM_S390_XOR_H - -extern struct xor_block_template xor_block_xc; - -#undef XOR_TRY_TEMPLATES -#define XOR_TRY_TEMPLATES \ -do { \ - xor_speed(&xor_block_xc); \ -} while (0) - -#define XOR_SELECT_TEMPLATE(FASTEST) (&xor_block_xc) - -#endif /* _ASM_S390_XOR_H */ diff --git a/arch/s390/lib/Makefile b/arch/s390/lib/Makefile index f43f897d3fc027..2bf47204f6abd9 100644 --- a/arch/s390/lib/Makefile +++ b/arch/s390/lib/Makefile @@ -5,7 +5,7 @@ lib-y += delay.o string.o uaccess.o find.o spinlock.o tishift.o lib-y += csum-partial.o -obj-y += mem.o xor.o +obj-y += mem.o lib-$(CONFIG_KPROBES) += probes.o lib-$(CONFIG_UPROBES) += probes.o obj-$(CONFIG_S390_KPROBES_SANITY_TEST) += test_kprobes_s390.o diff --git a/arch/sparc/include/asm/asm-prototypes.h b/arch/sparc/include/asm/asm-prototypes.h index 08810808ca6dec..bbd1a8afaabf8b 100644 --- a/arch/sparc/include/asm/asm-prototypes.h +++ b/arch/sparc/include/asm/asm-prototypes.h @@ -14,7 +14,6 @@ #include #include #include -#include void *__memscan_zero(void *, size_t); void *__memscan_generic(void *, int, size_t); diff --git a/arch/sparc/include/asm/xor.h b/arch/sparc/include/asm/xor.h deleted file mode 100644 index f4c651e203c438..00000000000000 --- a/arch/sparc/include/asm/xor.h +++ /dev/null @@ -1,9 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef ___ASM_SPARC_XOR_H -#define ___ASM_SPARC_XOR_H -#if defined(__sparc__) && defined(__arch64__) -#include -#else -#include -#endif -#endif diff --git a/arch/sparc/lib/Makefile b/arch/sparc/lib/Makefile index 783bdec0d7be05..dd10cdd6f0623d 100644 --- a/arch/sparc/lib/Makefile +++ b/arch/sparc/lib/Makefile @@ -48,7 +48,7 @@ lib-$(CONFIG_SPARC64) += GENmemcpy.o GENcopy_from_user.o GENcopy_to_user.o lib-$(CONFIG_SPARC64) += GENpatch.o GENpage.o GENbzero.o lib-$(CONFIG_SPARC64) += copy_in_user.o memmove.o -lib-$(CONFIG_SPARC64) += mcount.o ipcsum.o xor.o hweight.o ffs.o +lib-$(CONFIG_SPARC64) += mcount.o 
ipcsum.o hweight.o ffs.o obj-$(CONFIG_SPARC64) += iomap.o obj-$(CONFIG_SPARC32) += atomic32.o diff --git a/arch/um/include/asm/xor.h b/arch/um/include/asm/xor.h deleted file mode 100644 index 647fae200c5d34..00000000000000 --- a/arch/um/include/asm/xor.h +++ /dev/null @@ -1,24 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_UM_XOR_H -#define _ASM_UM_XOR_H - -#ifdef CONFIG_64BIT -#undef CONFIG_X86_32 -#define TT_CPU_INF_XOR_DEFAULT (AVX_SELECT(&xor_block_sse_pf64)) -#else -#define CONFIG_X86_32 1 -#define TT_CPU_INF_XOR_DEFAULT (AVX_SELECT(&xor_block_8regs)) -#endif - -#include -#include <../../x86/include/asm/xor.h> -#include - -#ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT -#undef XOR_SELECT_TEMPLATE -/* pick an arbitrary one - measuring isn't possible with inf-cpu */ -#define XOR_SELECT_TEMPLATE(x) \ - (time_travel_mode == TT_MODE_INFCPU ? TT_CPU_INF_XOR_DEFAULT : x) -#endif - -#endif diff --git a/arch/x86/include/asm/xor_64.h b/arch/x86/include/asm/xor_64.h deleted file mode 100644 index 0307e4ec504405..00000000000000 --- a/arch/x86/include/asm/xor_64.h +++ /dev/null @@ -1,28 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_X86_XOR_64_H -#define _ASM_X86_XOR_64_H - -static struct xor_block_template xor_block_sse = { - .name = "generic_sse", - .do_2 = xor_sse_2, - .do_3 = xor_sse_3, - .do_4 = xor_sse_4, - .do_5 = xor_sse_5, -}; - - -/* Also try the AVX routines */ -#include - -/* We force the use of the SSE xor block because it can write around L2. - We may also be able to load into the L1 only depending on how the cpu - deals with a load to a line that is being prefetched. 
*/ -#undef XOR_TRY_TEMPLATES -#define XOR_TRY_TEMPLATES \ -do { \ - AVX_XOR_SPEED; \ - xor_speed(&xor_block_sse_pf64); \ - xor_speed(&xor_block_sse); \ -} while (0) - -#endif /* _ASM_X86_XOR_64_H */ diff --git a/crypto/Kconfig b/crypto/Kconfig index e2b4106ac961eb..5cdb1b25ae875b 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -2,8 +2,6 @@ # # Generic algorithms support # -config XOR_BLOCKS - tristate # # async_tx api: hardware offloaded memory transfer/transform support diff --git a/crypto/Makefile b/crypto/Makefile index 04e269117589ac..795c2eea51fe6e 100644 --- a/crypto/Makefile +++ b/crypto/Makefile @@ -196,7 +196,6 @@ obj-$(CONFIG_CRYPTO_ECRDSA) += ecrdsa_generic.o # # generic algorithms and the async_tx api # -obj-$(CONFIG_XOR_BLOCKS) += xor.o obj-$(CONFIG_ASYNC_CORE) += async_tx/ obj-$(CONFIG_ASYMMETRIC_KEY_TYPE) += asymmetric_keys/ crypto_simd-y := simd.o diff --git a/crypto/async_tx/async_xor.c b/crypto/async_tx/async_xor.c index 2c499654a36c85..84458375b202b5 100644 --- a/crypto/async_tx/async_xor.c +++ b/crypto/async_tx/async_xor.c @@ -103,7 +103,6 @@ do_sync_xor_offs(struct page *dest, unsigned int offset, { int i; int xor_src_cnt = 0; - int src_off = 0; void *dest_buf; void **srcs; @@ -117,23 +116,12 @@ do_sync_xor_offs(struct page *dest, unsigned int offset, if (src_list[i]) srcs[xor_src_cnt++] = page_address(src_list[i]) + (src_offs ? 
src_offs[i] : offset); - src_cnt = xor_src_cnt; + /* set destination address */ dest_buf = page_address(dest) + offset; - if (submit->flags & ASYNC_TX_XOR_ZERO_DST) memset(dest_buf, 0, len); - - while (src_cnt > 0) { - /* process up to 'MAX_XOR_BLOCKS' sources */ - xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS); - xor_blocks(xor_src_cnt, len, dest_buf, &srcs[src_off]); - - /* drop completed sources */ - src_cnt -= xor_src_cnt; - src_off += xor_src_cnt; - } - + xor_gen(dest_buf, srcs, xor_src_cnt, len); async_tx_sync_epilog(submit); } @@ -168,11 +156,10 @@ dma_xor_aligned_offsets(struct dma_device *device, unsigned int offset, * * honored flags: ASYNC_TX_ACK, ASYNC_TX_XOR_ZERO_DST, ASYNC_TX_XOR_DROP_DST * - * xor_blocks always uses the dest as a source so the - * ASYNC_TX_XOR_ZERO_DST flag must be set to not include dest data in - * the calculation. The assumption with dma engines is that they only - * use the destination buffer as a source when it is explicitly specified - * in the source list. + * xor_gen always uses the dest as a source so the ASYNC_TX_XOR_ZERO_DST flag + * must be set to not include dest data in the calculation. The assumption with + * dma engines is that they only use the destination buffer as a source when it + * is explicitly specified in the source list. * * src_list note: if the dest is also a source it must be at index zero. * The contents of this array will be overwritten if a scribble region @@ -259,11 +246,10 @@ EXPORT_SYMBOL_GPL(async_xor_offs); * * honored flags: ASYNC_TX_ACK, ASYNC_TX_XOR_ZERO_DST, ASYNC_TX_XOR_DROP_DST * - * xor_blocks always uses the dest as a source so the - * ASYNC_TX_XOR_ZERO_DST flag must be set to not include dest data in - * the calculation. The assumption with dma engines is that they only - * use the destination buffer as a source when it is explicitly specified - * in the source list. 
+ * xor_gen always uses the dest as a source so the ASYNC_TX_XOR_ZERO_DST flag + * must be set to not include dest data in the calculation. The assumption with + * dma engines is that they only use the destination buffer as a source when it + * is explicitly specified in the source list. * * src_list note: if the dest is also a source it must be at index zero. * The contents of this array will be overwritten if a scribble region diff --git a/crypto/xor.c b/crypto/xor.c deleted file mode 100644 index f39621a57bb33c..00000000000000 --- a/crypto/xor.c +++ /dev/null @@ -1,174 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * xor.c : Multiple Devices driver for Linux - * - * Copyright (C) 1996, 1997, 1998, 1999, 2000, - * Ingo Molnar, Matti Aarnio, Jakub Jelinek, Richard Henderson. - * - * Dispatch optimized RAID-5 checksumming functions. - */ - -#define BH_TRACE 0 -#include -#include -#include -#include -#include -#include - -#ifndef XOR_SELECT_TEMPLATE -#define XOR_SELECT_TEMPLATE(x) (x) -#endif - -/* The xor routines to use. */ -static struct xor_block_template *active_template; - -void -xor_blocks(unsigned int src_count, unsigned int bytes, void *dest, void **srcs) -{ - unsigned long *p1, *p2, *p3, *p4; - - p1 = (unsigned long *) srcs[0]; - if (src_count == 1) { - active_template->do_2(bytes, dest, p1); - return; - } - - p2 = (unsigned long *) srcs[1]; - if (src_count == 2) { - active_template->do_3(bytes, dest, p1, p2); - return; - } - - p3 = (unsigned long *) srcs[2]; - if (src_count == 3) { - active_template->do_4(bytes, dest, p1, p2, p3); - return; - } - - p4 = (unsigned long *) srcs[3]; - active_template->do_5(bytes, dest, p1, p2, p3, p4); -} -EXPORT_SYMBOL(xor_blocks); - -/* Set of all registered templates. 
*/ -static struct xor_block_template *__initdata template_list; - -#ifndef MODULE -static void __init do_xor_register(struct xor_block_template *tmpl) -{ - tmpl->next = template_list; - template_list = tmpl; -} - -static int __init register_xor_blocks(void) -{ - active_template = XOR_SELECT_TEMPLATE(NULL); - - if (!active_template) { -#define xor_speed do_xor_register - // register all the templates and pick the first as the default - XOR_TRY_TEMPLATES; -#undef xor_speed - active_template = template_list; - } - return 0; -} -#endif - -#define BENCH_SIZE 4096 -#define REPS 800U - -static void __init -do_xor_speed(struct xor_block_template *tmpl, void *b1, void *b2) -{ - int speed; - unsigned long reps; - ktime_t min, start, t0; - - tmpl->next = template_list; - template_list = tmpl; - - preempt_disable(); - - reps = 0; - t0 = ktime_get(); - /* delay start until time has advanced */ - while ((start = ktime_get()) == t0) - cpu_relax(); - do { - mb(); /* prevent loop optimization */ - tmpl->do_2(BENCH_SIZE, b1, b2); - mb(); - } while (reps++ < REPS || (t0 = ktime_get()) == start); - min = ktime_sub(t0, start); - - preempt_enable(); - - // bytes/ns == GB/s, multiply by 1000 to get MB/s [not MiB/s] - speed = (1000 * reps * BENCH_SIZE) / (unsigned int)ktime_to_ns(min); - tmpl->speed = speed; - - pr_info(" %-16s: %5d MB/sec\n", tmpl->name, speed); -} - -static int __init -calibrate_xor_blocks(void) -{ - void *b1, *b2; - struct xor_block_template *f, *fastest; - - fastest = XOR_SELECT_TEMPLATE(NULL); - - if (fastest) { - printk(KERN_INFO "xor: automatically using best " - "checksumming function %-10s\n", - fastest->name); - goto out; - } - - b1 = (void *) __get_free_pages(GFP_KERNEL, 2); - if (!b1) { - printk(KERN_WARNING "xor: Yikes! 
No memory available.\n"); - return -ENOMEM; - } - b2 = b1 + 2*PAGE_SIZE + BENCH_SIZE; - - /* - * If this arch/cpu has a short-circuited selection, don't loop through - * all the possible functions, just test the best one - */ - -#define xor_speed(templ) do_xor_speed((templ), b1, b2) - - printk(KERN_INFO "xor: measuring software checksum speed\n"); - template_list = NULL; - XOR_TRY_TEMPLATES; - fastest = template_list; - for (f = fastest; f; f = f->next) - if (f->speed > fastest->speed) - fastest = f; - - pr_info("xor: using function: %s (%d MB/sec)\n", - fastest->name, fastest->speed); - -#undef xor_speed - - free_pages((unsigned long)b1, 2); -out: - active_template = fastest; - return 0; -} - -static __exit void xor_exit(void) { } - -MODULE_DESCRIPTION("RAID-5 checksumming functions"); -MODULE_LICENSE("GPL"); - -#ifndef MODULE -/* when built-in xor.o must initialize before drivers/md/md.o */ -core_initcall(register_xor_blocks); -#endif - -module_init(calibrate_xor_blocks); -module_exit(xor_exit); diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index baadaaa189c05d..7e326d8e63ae67 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -617,26 +617,6 @@ static void cache_rbio(struct btrfs_raid_bio *rbio) spin_unlock(&table->cache_lock); } -/* - * helper function to run the xor_blocks api. It is only - * able to do MAX_XOR_BLOCKS at a time, so we need to - * loop through. - */ -static void run_xor(void **pages, int src_cnt, ssize_t len) -{ - int src_off = 0; - int xor_src_cnt = 0; - void *dest = pages[src_cnt]; - - while(src_cnt > 0) { - xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS); - xor_blocks(xor_src_cnt, len, dest, pages + src_off); - - src_cnt -= xor_src_cnt; - src_off += xor_src_cnt; - } -} - /* * Returns true if the bio list inside this rbio covers an entire stripe (no * rmw required). 
@@ -1432,7 +1412,8 @@ static void generate_pq_vertical_step(struct btrfs_raid_bio *rbio, unsigned int } else { /* raid5 */ memcpy(pointers[rbio->nr_data], pointers[0], step); - run_xor(pointers + 1, rbio->nr_data - 1, step); + xor_gen(pointers[rbio->nr_data], pointers + 1, rbio->nr_data - 1, + step); } for (stripe = stripe - 1; stripe >= 0; stripe--) kunmap_local(pointers[stripe]); @@ -2032,7 +2013,7 @@ static void recover_vertical_step(struct btrfs_raid_bio *rbio, pointers[rbio->nr_data - 1] = p; /* Xor in the rest */ - run_xor(pointers, rbio->nr_data - 1, step); + xor_gen(p, pointers, rbio->nr_data - 1, step); } cleanup: @@ -2662,7 +2643,7 @@ static bool verify_one_parity_step(struct btrfs_raid_bio *rbio, } else { /* RAID5. */ memcpy(pointers[nr_data], pointers[0], step); - run_xor(pointers + 1, nr_data - 1, step); + xor_gen(pointers[nr_data], pointers + 1, nr_data - 1, step); } /* Check scrubbing parity and repair it. */ diff --git a/include/asm-generic/Kbuild b/include/asm-generic/Kbuild index 9aff61e7b8f27c..2c53a1e0b76041 100644 --- a/include/asm-generic/Kbuild +++ b/include/asm-generic/Kbuild @@ -65,4 +65,3 @@ mandatory-y += vermagic.h mandatory-y += vga.h mandatory-y += video.h mandatory-y += word-at-a-time.h -mandatory-y += xor.h diff --git a/include/asm-generic/xor.h b/include/asm-generic/xor.h deleted file mode 100644 index 44509d48fca21e..00000000000000 --- a/include/asm-generic/xor.h +++ /dev/null @@ -1,738 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * include/asm-generic/xor.h - * - * Generic optimized RAID-5 checksumming functions. 
- */ - -#include - -static void -xor_8regs_2(unsigned long bytes, unsigned long * __restrict p1, - const unsigned long * __restrict p2) -{ - long lines = bytes / (sizeof (long)) / 8; - - do { - p1[0] ^= p2[0]; - p1[1] ^= p2[1]; - p1[2] ^= p2[2]; - p1[3] ^= p2[3]; - p1[4] ^= p2[4]; - p1[5] ^= p2[5]; - p1[6] ^= p2[6]; - p1[7] ^= p2[7]; - p1 += 8; - p2 += 8; - } while (--lines > 0); -} - -static void -xor_8regs_3(unsigned long bytes, unsigned long * __restrict p1, - const unsigned long * __restrict p2, - const unsigned long * __restrict p3) -{ - long lines = bytes / (sizeof (long)) / 8; - - do { - p1[0] ^= p2[0] ^ p3[0]; - p1[1] ^= p2[1] ^ p3[1]; - p1[2] ^= p2[2] ^ p3[2]; - p1[3] ^= p2[3] ^ p3[3]; - p1[4] ^= p2[4] ^ p3[4]; - p1[5] ^= p2[5] ^ p3[5]; - p1[6] ^= p2[6] ^ p3[6]; - p1[7] ^= p2[7] ^ p3[7]; - p1 += 8; - p2 += 8; - p3 += 8; - } while (--lines > 0); -} - -static void -xor_8regs_4(unsigned long bytes, unsigned long * __restrict p1, - const unsigned long * __restrict p2, - const unsigned long * __restrict p3, - const unsigned long * __restrict p4) -{ - long lines = bytes / (sizeof (long)) / 8; - - do { - p1[0] ^= p2[0] ^ p3[0] ^ p4[0]; - p1[1] ^= p2[1] ^ p3[1] ^ p4[1]; - p1[2] ^= p2[2] ^ p3[2] ^ p4[2]; - p1[3] ^= p2[3] ^ p3[3] ^ p4[3]; - p1[4] ^= p2[4] ^ p3[4] ^ p4[4]; - p1[5] ^= p2[5] ^ p3[5] ^ p4[5]; - p1[6] ^= p2[6] ^ p3[6] ^ p4[6]; - p1[7] ^= p2[7] ^ p3[7] ^ p4[7]; - p1 += 8; - p2 += 8; - p3 += 8; - p4 += 8; - } while (--lines > 0); -} - -static void -xor_8regs_5(unsigned long bytes, unsigned long * __restrict p1, - const unsigned long * __restrict p2, - const unsigned long * __restrict p3, - const unsigned long * __restrict p4, - const unsigned long * __restrict p5) -{ - long lines = bytes / (sizeof (long)) / 8; - - do { - p1[0] ^= p2[0] ^ p3[0] ^ p4[0] ^ p5[0]; - p1[1] ^= p2[1] ^ p3[1] ^ p4[1] ^ p5[1]; - p1[2] ^= p2[2] ^ p3[2] ^ p4[2] ^ p5[2]; - p1[3] ^= p2[3] ^ p3[3] ^ p4[3] ^ p5[3]; - p1[4] ^= p2[4] ^ p3[4] ^ p4[4] ^ p5[4]; - p1[5] ^= p2[5] ^ p3[5] ^ 
p4[5] ^ p5[5]; - p1[6] ^= p2[6] ^ p3[6] ^ p4[6] ^ p5[6]; - p1[7] ^= p2[7] ^ p3[7] ^ p4[7] ^ p5[7]; - p1 += 8; - p2 += 8; - p3 += 8; - p4 += 8; - p5 += 8; - } while (--lines > 0); -} - -static void -xor_32regs_2(unsigned long bytes, unsigned long * __restrict p1, - const unsigned long * __restrict p2) -{ - long lines = bytes / (sizeof (long)) / 8; - - do { - register long d0, d1, d2, d3, d4, d5, d6, d7; - d0 = p1[0]; /* Pull the stuff into registers */ - d1 = p1[1]; /* ... in bursts, if possible. */ - d2 = p1[2]; - d3 = p1[3]; - d4 = p1[4]; - d5 = p1[5]; - d6 = p1[6]; - d7 = p1[7]; - d0 ^= p2[0]; - d1 ^= p2[1]; - d2 ^= p2[2]; - d3 ^= p2[3]; - d4 ^= p2[4]; - d5 ^= p2[5]; - d6 ^= p2[6]; - d7 ^= p2[7]; - p1[0] = d0; /* Store the result (in bursts) */ - p1[1] = d1; - p1[2] = d2; - p1[3] = d3; - p1[4] = d4; - p1[5] = d5; - p1[6] = d6; - p1[7] = d7; - p1 += 8; - p2 += 8; - } while (--lines > 0); -} - -static void -xor_32regs_3(unsigned long bytes, unsigned long * __restrict p1, - const unsigned long * __restrict p2, - const unsigned long * __restrict p3) -{ - long lines = bytes / (sizeof (long)) / 8; - - do { - register long d0, d1, d2, d3, d4, d5, d6, d7; - d0 = p1[0]; /* Pull the stuff into registers */ - d1 = p1[1]; /* ... in bursts, if possible. 
*/ - d2 = p1[2]; - d3 = p1[3]; - d4 = p1[4]; - d5 = p1[5]; - d6 = p1[6]; - d7 = p1[7]; - d0 ^= p2[0]; - d1 ^= p2[1]; - d2 ^= p2[2]; - d3 ^= p2[3]; - d4 ^= p2[4]; - d5 ^= p2[5]; - d6 ^= p2[6]; - d7 ^= p2[7]; - d0 ^= p3[0]; - d1 ^= p3[1]; - d2 ^= p3[2]; - d3 ^= p3[3]; - d4 ^= p3[4]; - d5 ^= p3[5]; - d6 ^= p3[6]; - d7 ^= p3[7]; - p1[0] = d0; /* Store the result (in bursts) */ - p1[1] = d1; - p1[2] = d2; - p1[3] = d3; - p1[4] = d4; - p1[5] = d5; - p1[6] = d6; - p1[7] = d7; - p1 += 8; - p2 += 8; - p3 += 8; - } while (--lines > 0); -} - -static void -xor_32regs_4(unsigned long bytes, unsigned long * __restrict p1, - const unsigned long * __restrict p2, - const unsigned long * __restrict p3, - const unsigned long * __restrict p4) -{ - long lines = bytes / (sizeof (long)) / 8; - - do { - register long d0, d1, d2, d3, d4, d5, d6, d7; - d0 = p1[0]; /* Pull the stuff into registers */ - d1 = p1[1]; /* ... in bursts, if possible. */ - d2 = p1[2]; - d3 = p1[3]; - d4 = p1[4]; - d5 = p1[5]; - d6 = p1[6]; - d7 = p1[7]; - d0 ^= p2[0]; - d1 ^= p2[1]; - d2 ^= p2[2]; - d3 ^= p2[3]; - d4 ^= p2[4]; - d5 ^= p2[5]; - d6 ^= p2[6]; - d7 ^= p2[7]; - d0 ^= p3[0]; - d1 ^= p3[1]; - d2 ^= p3[2]; - d3 ^= p3[3]; - d4 ^= p3[4]; - d5 ^= p3[5]; - d6 ^= p3[6]; - d7 ^= p3[7]; - d0 ^= p4[0]; - d1 ^= p4[1]; - d2 ^= p4[2]; - d3 ^= p4[3]; - d4 ^= p4[4]; - d5 ^= p4[5]; - d6 ^= p4[6]; - d7 ^= p4[7]; - p1[0] = d0; /* Store the result (in bursts) */ - p1[1] = d1; - p1[2] = d2; - p1[3] = d3; - p1[4] = d4; - p1[5] = d5; - p1[6] = d6; - p1[7] = d7; - p1 += 8; - p2 += 8; - p3 += 8; - p4 += 8; - } while (--lines > 0); -} - -static void -xor_32regs_5(unsigned long bytes, unsigned long * __restrict p1, - const unsigned long * __restrict p2, - const unsigned long * __restrict p3, - const unsigned long * __restrict p4, - const unsigned long * __restrict p5) -{ - long lines = bytes / (sizeof (long)) / 8; - - do { - register long d0, d1, d2, d3, d4, d5, d6, d7; - d0 = p1[0]; /* Pull the stuff into registers */ - d1 = 
p1[1]; /* ... in bursts, if possible. */ - d2 = p1[2]; - d3 = p1[3]; - d4 = p1[4]; - d5 = p1[5]; - d6 = p1[6]; - d7 = p1[7]; - d0 ^= p2[0]; - d1 ^= p2[1]; - d2 ^= p2[2]; - d3 ^= p2[3]; - d4 ^= p2[4]; - d5 ^= p2[5]; - d6 ^= p2[6]; - d7 ^= p2[7]; - d0 ^= p3[0]; - d1 ^= p3[1]; - d2 ^= p3[2]; - d3 ^= p3[3]; - d4 ^= p3[4]; - d5 ^= p3[5]; - d6 ^= p3[6]; - d7 ^= p3[7]; - d0 ^= p4[0]; - d1 ^= p4[1]; - d2 ^= p4[2]; - d3 ^= p4[3]; - d4 ^= p4[4]; - d5 ^= p4[5]; - d6 ^= p4[6]; - d7 ^= p4[7]; - d0 ^= p5[0]; - d1 ^= p5[1]; - d2 ^= p5[2]; - d3 ^= p5[3]; - d4 ^= p5[4]; - d5 ^= p5[5]; - d6 ^= p5[6]; - d7 ^= p5[7]; - p1[0] = d0; /* Store the result (in bursts) */ - p1[1] = d1; - p1[2] = d2; - p1[3] = d3; - p1[4] = d4; - p1[5] = d5; - p1[6] = d6; - p1[7] = d7; - p1 += 8; - p2 += 8; - p3 += 8; - p4 += 8; - p5 += 8; - } while (--lines > 0); -} - -static void -xor_8regs_p_2(unsigned long bytes, unsigned long * __restrict p1, - const unsigned long * __restrict p2) -{ - long lines = bytes / (sizeof (long)) / 8 - 1; - prefetchw(p1); - prefetch(p2); - - do { - prefetchw(p1+8); - prefetch(p2+8); - once_more: - p1[0] ^= p2[0]; - p1[1] ^= p2[1]; - p1[2] ^= p2[2]; - p1[3] ^= p2[3]; - p1[4] ^= p2[4]; - p1[5] ^= p2[5]; - p1[6] ^= p2[6]; - p1[7] ^= p2[7]; - p1 += 8; - p2 += 8; - } while (--lines > 0); - if (lines == 0) - goto once_more; -} - -static void -xor_8regs_p_3(unsigned long bytes, unsigned long * __restrict p1, - const unsigned long * __restrict p2, - const unsigned long * __restrict p3) -{ - long lines = bytes / (sizeof (long)) / 8 - 1; - prefetchw(p1); - prefetch(p2); - prefetch(p3); - - do { - prefetchw(p1+8); - prefetch(p2+8); - prefetch(p3+8); - once_more: - p1[0] ^= p2[0] ^ p3[0]; - p1[1] ^= p2[1] ^ p3[1]; - p1[2] ^= p2[2] ^ p3[2]; - p1[3] ^= p2[3] ^ p3[3]; - p1[4] ^= p2[4] ^ p3[4]; - p1[5] ^= p2[5] ^ p3[5]; - p1[6] ^= p2[6] ^ p3[6]; - p1[7] ^= p2[7] ^ p3[7]; - p1 += 8; - p2 += 8; - p3 += 8; - } while (--lines > 0); - if (lines == 0) - goto once_more; -} - -static void 
-xor_8regs_p_4(unsigned long bytes, unsigned long * __restrict p1, - const unsigned long * __restrict p2, - const unsigned long * __restrict p3, - const unsigned long * __restrict p4) -{ - long lines = bytes / (sizeof (long)) / 8 - 1; - - prefetchw(p1); - prefetch(p2); - prefetch(p3); - prefetch(p4); - - do { - prefetchw(p1+8); - prefetch(p2+8); - prefetch(p3+8); - prefetch(p4+8); - once_more: - p1[0] ^= p2[0] ^ p3[0] ^ p4[0]; - p1[1] ^= p2[1] ^ p3[1] ^ p4[1]; - p1[2] ^= p2[2] ^ p3[2] ^ p4[2]; - p1[3] ^= p2[3] ^ p3[3] ^ p4[3]; - p1[4] ^= p2[4] ^ p3[4] ^ p4[4]; - p1[5] ^= p2[5] ^ p3[5] ^ p4[5]; - p1[6] ^= p2[6] ^ p3[6] ^ p4[6]; - p1[7] ^= p2[7] ^ p3[7] ^ p4[7]; - p1 += 8; - p2 += 8; - p3 += 8; - p4 += 8; - } while (--lines > 0); - if (lines == 0) - goto once_more; -} - -static void -xor_8regs_p_5(unsigned long bytes, unsigned long * __restrict p1, - const unsigned long * __restrict p2, - const unsigned long * __restrict p3, - const unsigned long * __restrict p4, - const unsigned long * __restrict p5) -{ - long lines = bytes / (sizeof (long)) / 8 - 1; - - prefetchw(p1); - prefetch(p2); - prefetch(p3); - prefetch(p4); - prefetch(p5); - - do { - prefetchw(p1+8); - prefetch(p2+8); - prefetch(p3+8); - prefetch(p4+8); - prefetch(p5+8); - once_more: - p1[0] ^= p2[0] ^ p3[0] ^ p4[0] ^ p5[0]; - p1[1] ^= p2[1] ^ p3[1] ^ p4[1] ^ p5[1]; - p1[2] ^= p2[2] ^ p3[2] ^ p4[2] ^ p5[2]; - p1[3] ^= p2[3] ^ p3[3] ^ p4[3] ^ p5[3]; - p1[4] ^= p2[4] ^ p3[4] ^ p4[4] ^ p5[4]; - p1[5] ^= p2[5] ^ p3[5] ^ p4[5] ^ p5[5]; - p1[6] ^= p2[6] ^ p3[6] ^ p4[6] ^ p5[6]; - p1[7] ^= p2[7] ^ p3[7] ^ p4[7] ^ p5[7]; - p1 += 8; - p2 += 8; - p3 += 8; - p4 += 8; - p5 += 8; - } while (--lines > 0); - if (lines == 0) - goto once_more; -} - -static void -xor_32regs_p_2(unsigned long bytes, unsigned long * __restrict p1, - const unsigned long * __restrict p2) -{ - long lines = bytes / (sizeof (long)) / 8 - 1; - - prefetchw(p1); - prefetch(p2); - - do { - register long d0, d1, d2, d3, d4, d5, d6, d7; - - 
prefetchw(p1+8); - prefetch(p2+8); - once_more: - d0 = p1[0]; /* Pull the stuff into registers */ - d1 = p1[1]; /* ... in bursts, if possible. */ - d2 = p1[2]; - d3 = p1[3]; - d4 = p1[4]; - d5 = p1[5]; - d6 = p1[6]; - d7 = p1[7]; - d0 ^= p2[0]; - d1 ^= p2[1]; - d2 ^= p2[2]; - d3 ^= p2[3]; - d4 ^= p2[4]; - d5 ^= p2[5]; - d6 ^= p2[6]; - d7 ^= p2[7]; - p1[0] = d0; /* Store the result (in bursts) */ - p1[1] = d1; - p1[2] = d2; - p1[3] = d3; - p1[4] = d4; - p1[5] = d5; - p1[6] = d6; - p1[7] = d7; - p1 += 8; - p2 += 8; - } while (--lines > 0); - if (lines == 0) - goto once_more; -} - -static void -xor_32regs_p_3(unsigned long bytes, unsigned long * __restrict p1, - const unsigned long * __restrict p2, - const unsigned long * __restrict p3) -{ - long lines = bytes / (sizeof (long)) / 8 - 1; - - prefetchw(p1); - prefetch(p2); - prefetch(p3); - - do { - register long d0, d1, d2, d3, d4, d5, d6, d7; - - prefetchw(p1+8); - prefetch(p2+8); - prefetch(p3+8); - once_more: - d0 = p1[0]; /* Pull the stuff into registers */ - d1 = p1[1]; /* ... in bursts, if possible. 
*/ - d2 = p1[2]; - d3 = p1[3]; - d4 = p1[4]; - d5 = p1[5]; - d6 = p1[6]; - d7 = p1[7]; - d0 ^= p2[0]; - d1 ^= p2[1]; - d2 ^= p2[2]; - d3 ^= p2[3]; - d4 ^= p2[4]; - d5 ^= p2[5]; - d6 ^= p2[6]; - d7 ^= p2[7]; - d0 ^= p3[0]; - d1 ^= p3[1]; - d2 ^= p3[2]; - d3 ^= p3[3]; - d4 ^= p3[4]; - d5 ^= p3[5]; - d6 ^= p3[6]; - d7 ^= p3[7]; - p1[0] = d0; /* Store the result (in bursts) */ - p1[1] = d1; - p1[2] = d2; - p1[3] = d3; - p1[4] = d4; - p1[5] = d5; - p1[6] = d6; - p1[7] = d7; - p1 += 8; - p2 += 8; - p3 += 8; - } while (--lines > 0); - if (lines == 0) - goto once_more; -} - -static void -xor_32regs_p_4(unsigned long bytes, unsigned long * __restrict p1, - const unsigned long * __restrict p2, - const unsigned long * __restrict p3, - const unsigned long * __restrict p4) -{ - long lines = bytes / (sizeof (long)) / 8 - 1; - - prefetchw(p1); - prefetch(p2); - prefetch(p3); - prefetch(p4); - - do { - register long d0, d1, d2, d3, d4, d5, d6, d7; - - prefetchw(p1+8); - prefetch(p2+8); - prefetch(p3+8); - prefetch(p4+8); - once_more: - d0 = p1[0]; /* Pull the stuff into registers */ - d1 = p1[1]; /* ... in bursts, if possible. 
*/ - d2 = p1[2]; - d3 = p1[3]; - d4 = p1[4]; - d5 = p1[5]; - d6 = p1[6]; - d7 = p1[7]; - d0 ^= p2[0]; - d1 ^= p2[1]; - d2 ^= p2[2]; - d3 ^= p2[3]; - d4 ^= p2[4]; - d5 ^= p2[5]; - d6 ^= p2[6]; - d7 ^= p2[7]; - d0 ^= p3[0]; - d1 ^= p3[1]; - d2 ^= p3[2]; - d3 ^= p3[3]; - d4 ^= p3[4]; - d5 ^= p3[5]; - d6 ^= p3[6]; - d7 ^= p3[7]; - d0 ^= p4[0]; - d1 ^= p4[1]; - d2 ^= p4[2]; - d3 ^= p4[3]; - d4 ^= p4[4]; - d5 ^= p4[5]; - d6 ^= p4[6]; - d7 ^= p4[7]; - p1[0] = d0; /* Store the result (in bursts) */ - p1[1] = d1; - p1[2] = d2; - p1[3] = d3; - p1[4] = d4; - p1[5] = d5; - p1[6] = d6; - p1[7] = d7; - p1 += 8; - p2 += 8; - p3 += 8; - p4 += 8; - } while (--lines > 0); - if (lines == 0) - goto once_more; -} - -static void -xor_32regs_p_5(unsigned long bytes, unsigned long * __restrict p1, - const unsigned long * __restrict p2, - const unsigned long * __restrict p3, - const unsigned long * __restrict p4, - const unsigned long * __restrict p5) -{ - long lines = bytes / (sizeof (long)) / 8 - 1; - - prefetchw(p1); - prefetch(p2); - prefetch(p3); - prefetch(p4); - prefetch(p5); - - do { - register long d0, d1, d2, d3, d4, d5, d6, d7; - - prefetchw(p1+8); - prefetch(p2+8); - prefetch(p3+8); - prefetch(p4+8); - prefetch(p5+8); - once_more: - d0 = p1[0]; /* Pull the stuff into registers */ - d1 = p1[1]; /* ... in bursts, if possible. 
*/ - d2 = p1[2]; - d3 = p1[3]; - d4 = p1[4]; - d5 = p1[5]; - d6 = p1[6]; - d7 = p1[7]; - d0 ^= p2[0]; - d1 ^= p2[1]; - d2 ^= p2[2]; - d3 ^= p2[3]; - d4 ^= p2[4]; - d5 ^= p2[5]; - d6 ^= p2[6]; - d7 ^= p2[7]; - d0 ^= p3[0]; - d1 ^= p3[1]; - d2 ^= p3[2]; - d3 ^= p3[3]; - d4 ^= p3[4]; - d5 ^= p3[5]; - d6 ^= p3[6]; - d7 ^= p3[7]; - d0 ^= p4[0]; - d1 ^= p4[1]; - d2 ^= p4[2]; - d3 ^= p4[3]; - d4 ^= p4[4]; - d5 ^= p4[5]; - d6 ^= p4[6]; - d7 ^= p4[7]; - d0 ^= p5[0]; - d1 ^= p5[1]; - d2 ^= p5[2]; - d3 ^= p5[3]; - d4 ^= p5[4]; - d5 ^= p5[5]; - d6 ^= p5[6]; - d7 ^= p5[7]; - p1[0] = d0; /* Store the result (in bursts) */ - p1[1] = d1; - p1[2] = d2; - p1[3] = d3; - p1[4] = d4; - p1[5] = d5; - p1[6] = d6; - p1[7] = d7; - p1 += 8; - p2 += 8; - p3 += 8; - p4 += 8; - p5 += 8; - } while (--lines > 0); - if (lines == 0) - goto once_more; -} - -static struct xor_block_template xor_block_8regs = { - .name = "8regs", - .do_2 = xor_8regs_2, - .do_3 = xor_8regs_3, - .do_4 = xor_8regs_4, - .do_5 = xor_8regs_5, -}; - -static struct xor_block_template xor_block_32regs = { - .name = "32regs", - .do_2 = xor_32regs_2, - .do_3 = xor_32regs_3, - .do_4 = xor_32regs_4, - .do_5 = xor_32regs_5, -}; - -static struct xor_block_template xor_block_8regs_p __maybe_unused = { - .name = "8regs_prefetch", - .do_2 = xor_8regs_p_2, - .do_3 = xor_8regs_p_3, - .do_4 = xor_8regs_p_4, - .do_5 = xor_8regs_p_5, -}; - -static struct xor_block_template xor_block_32regs_p __maybe_unused = { - .name = "32regs_prefetch", - .do_2 = xor_32regs_p_2, - .do_3 = xor_32regs_p_3, - .do_4 = xor_32regs_p_4, - .do_5 = xor_32regs_p_5, -}; - -#define XOR_TRY_TEMPLATES \ - do { \ - xor_speed(&xor_block_8regs); \ - xor_speed(&xor_block_8regs_p); \ - xor_speed(&xor_block_32regs); \ - xor_speed(&xor_block_32regs_p); \ - } while (0) diff --git a/include/linux/raid/xor.h b/include/linux/raid/xor.h index 51b811b623224f..870558c9d36ee3 100644 --- a/include/linux/raid/xor.h +++ b/include/linux/raid/xor.h @@ -2,29 +2,6 @@ #ifndef _XOR_H #define 
_XOR_H -#define MAX_XOR_BLOCKS 4 +void xor_gen(void *dest, void **srcs, unsigned int src_cnt, unsigned int bytes); -extern void xor_blocks(unsigned int count, unsigned int bytes, - void *dest, void **srcs); - -struct xor_block_template { - struct xor_block_template *next; - const char *name; - int speed; - void (*do_2)(unsigned long, unsigned long * __restrict, - const unsigned long * __restrict); - void (*do_3)(unsigned long, unsigned long * __restrict, - const unsigned long * __restrict, - const unsigned long * __restrict); - void (*do_4)(unsigned long, unsigned long * __restrict, - const unsigned long * __restrict, - const unsigned long * __restrict, - const unsigned long * __restrict); - void (*do_5)(unsigned long, unsigned long * __restrict, - const unsigned long * __restrict, - const unsigned long * __restrict, - const unsigned long * __restrict, - const unsigned long * __restrict); -}; - -#endif +#endif /* _XOR_H */ diff --git a/lib/Kconfig b/lib/Kconfig index 0f2fb96106476c..5be57adcd454ff 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -138,6 +138,7 @@ config TRACE_MMIO_ACCESS source "lib/crc/Kconfig" source "lib/crypto/Kconfig" +source "lib/raid/Kconfig" config XXHASH tristate diff --git a/lib/Makefile b/lib/Makefile index 1b9ee167517f32..84da412a044ff2 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -120,7 +120,7 @@ endif obj-$(CONFIG_DEBUG_INFO_REDUCED) += debug_info.o CFLAGS_debug_info.o += $(call cc-option, -femit-struct-debug-detailed=any) -obj-y += math/ crc/ crypto/ tests/ vdso/ +obj-y += math/ crc/ crypto/ tests/ vdso/ raid/ obj-$(CONFIG_GENERIC_IOMAP) += iomap.o obj-$(CONFIG_HAS_IOMEM) += iomap_copy.o devres.o diff --git a/lib/raid/.kunitconfig b/lib/raid/.kunitconfig new file mode 100644 index 00000000000000..351d22ed19540d --- /dev/null +++ b/lib/raid/.kunitconfig @@ -0,0 +1,3 @@ +CONFIG_KUNIT=y +CONFIG_BTRFS_FS=y +CONFIG_XOR_KUNIT_TEST=y diff --git a/lib/raid/Kconfig b/lib/raid/Kconfig new file mode 100644 index 00000000000000..1fc4b00e0d71de 
--- /dev/null +++ b/lib/raid/Kconfig @@ -0,0 +1,30 @@ +# SPDX-License-Identifier: GPL-2.0 + +config XOR_BLOCKS + tristate + +# selected by architectures that provide an optimized XOR implementation +config XOR_BLOCKS_ARCH + depends on XOR_BLOCKS + default y if ALPHA + default y if ARM && KERNEL_MODE_NEON + default y if ARM64 + default y if CPU_HAS_LSX # loongarch + default y if ALTIVEC # powerpc + default y if RISCV_ISA_V + default y if SPARC + default y if S390 + default y if X86_32 + default y if X86_64 + bool + +config XOR_KUNIT_TEST + tristate "KUnit tests for xor_gen" if !KUNIT_ALL_TESTS + depends on KUNIT + depends on XOR_BLOCKS + default KUNIT_ALL_TESTS + help + Unit tests for the XOR library functions. + + This is intended to help people writing architecture-specific + optimized versions. If unsure, say N. diff --git a/lib/raid/Makefile b/lib/raid/Makefile new file mode 100644 index 00000000000000..3540fe846dc427 --- /dev/null +++ b/lib/raid/Makefile @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0 + +obj-y += xor/ diff --git a/lib/raid/xor/Makefile b/lib/raid/xor/Makefile new file mode 100644 index 00000000000000..4d633dfd5b90cf --- /dev/null +++ b/lib/raid/xor/Makefile @@ -0,0 +1,42 @@ +# SPDX-License-Identifier: GPL-2.0 + +ccflags-y += -I $(src) + +obj-$(CONFIG_XOR_BLOCKS) += xor.o + +xor-y += xor-core.o +xor-y += xor-8regs.o +xor-y += xor-32regs.o +xor-y += xor-8regs-prefetch.o +xor-y += xor-32regs-prefetch.o + +ifeq ($(CONFIG_XOR_BLOCKS_ARCH),y) +CFLAGS_xor-core.o += -I$(src)/$(SRCARCH) +endif + +xor-$(CONFIG_ALPHA) += alpha/xor.o +xor-$(CONFIG_ARM) += arm/xor.o +ifeq ($(CONFIG_ARM),y) +xor-$(CONFIG_KERNEL_MODE_NEON) += arm/xor-neon.o arm/xor-neon-glue.o +endif +xor-$(CONFIG_ARM64) += arm64/xor-neon.o arm64/xor-neon-glue.o +xor-$(CONFIG_CPU_HAS_LSX) += loongarch/xor_simd.o +xor-$(CONFIG_CPU_HAS_LSX) += loongarch/xor_simd_glue.o +xor-$(CONFIG_ALTIVEC) += powerpc/xor_vmx.o powerpc/xor_vmx_glue.o +xor-$(CONFIG_RISCV_ISA_V) += riscv/xor.o 
riscv/xor-glue.o +xor-$(CONFIG_SPARC32) += sparc/xor-sparc32.o +xor-$(CONFIG_SPARC64) += sparc/xor-sparc64.o sparc/xor-sparc64-glue.o +xor-$(CONFIG_S390) += s390/xor.o +xor-$(CONFIG_X86_32) += x86/xor-avx.o x86/xor-sse.o x86/xor-mmx.o +xor-$(CONFIG_X86_64) += x86/xor-avx.o x86/xor-sse.o +obj-y += tests/ + +CFLAGS_arm/xor-neon.o += $(CC_FLAGS_FPU) +CFLAGS_REMOVE_arm/xor-neon.o += $(CC_FLAGS_NO_FPU) + +CFLAGS_arm64/xor-neon.o += $(CC_FLAGS_FPU) +CFLAGS_REMOVE_arm64/xor-neon.o += $(CC_FLAGS_NO_FPU) + +CFLAGS_powerpc/xor_vmx.o += -mhard-float -maltivec \ + $(call cc-option,-mabi=altivec) \ + -isystem $(shell $(CC) -print-file-name=include) diff --git a/arch/alpha/include/asm/xor.h b/lib/raid/xor/alpha/xor.c similarity index 95% rename from arch/alpha/include/asm/xor.h rename to lib/raid/xor/alpha/xor.c index e0de0c233ab923..a8f72f2dd3a5e2 100644 --- a/arch/alpha/include/asm/xor.h +++ b/lib/raid/xor/alpha/xor.c @@ -1,9 +1,9 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * include/asm-alpha/xor.h - * - * Optimized RAID-5 checksumming functions for alpha EV5 and EV6 + * Optimized XOR parity functions for alpha EV5 and EV6 */ +#include "xor_impl.h" +#include "xor_arch.h" extern void xor_alpha_2(unsigned long bytes, unsigned long * __restrict p1, @@ -832,35 +832,17 @@ xor_alpha_prefetch_5: \n\ .end xor_alpha_prefetch_5 \n\ "); -static struct xor_block_template xor_block_alpha = { - .name = "alpha", - .do_2 = xor_alpha_2, - .do_3 = xor_alpha_3, - .do_4 = xor_alpha_4, - .do_5 = xor_alpha_5, -}; +DO_XOR_BLOCKS(alpha, xor_alpha_2, xor_alpha_3, xor_alpha_4, xor_alpha_5); -static struct xor_block_template xor_block_alpha_prefetch = { - .name = "alpha prefetch", - .do_2 = xor_alpha_prefetch_2, - .do_3 = xor_alpha_prefetch_3, - .do_4 = xor_alpha_prefetch_4, - .do_5 = xor_alpha_prefetch_5, +struct xor_block_template xor_block_alpha = { + .name = "alpha", + .xor_gen = xor_gen_alpha, }; -/* For grins, also test the generic 
routines. */ -#include - -#undef XOR_TRY_TEMPLATES -#define XOR_TRY_TEMPLATES \ - do { \ - xor_speed(&xor_block_8regs); \ - xor_speed(&xor_block_32regs); \ - xor_speed(&xor_block_alpha); \ - xor_speed(&xor_block_alpha_prefetch); \ - } while (0) +DO_XOR_BLOCKS(alpha_prefetch, xor_alpha_prefetch_2, xor_alpha_prefetch_3, + xor_alpha_prefetch_4, xor_alpha_prefetch_5); -/* Force the use of alpha_prefetch if EV6, as it is significantly - faster in the cold cache case. */ -#define XOR_SELECT_TEMPLATE(FASTEST) \ - (implver() == IMPLVER_EV6 ? &xor_block_alpha_prefetch : FASTEST) +struct xor_block_template xor_block_alpha_prefetch = { + .name = "alpha prefetch", + .xor_gen = xor_gen_alpha_prefetch, +}; diff --git a/lib/raid/xor/alpha/xor_arch.h b/lib/raid/xor/alpha/xor_arch.h new file mode 100644 index 00000000000000..0dcfea578a488a --- /dev/null +++ b/lib/raid/xor/alpha/xor_arch.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#include + +extern struct xor_block_template xor_block_alpha; +extern struct xor_block_template xor_block_alpha_prefetch; + +/* + * Force the use of alpha_prefetch if EV6, as it is significantly faster in the + * cold cache case. 
+ */ +static __always_inline void __init arch_xor_init(void) +{ + if (implver() == IMPLVER_EV6) { + xor_force(&xor_block_alpha_prefetch); + } else { + xor_register(&xor_block_8regs); + xor_register(&xor_block_32regs); + xor_register(&xor_block_alpha); + xor_register(&xor_block_alpha_prefetch); + } +} diff --git a/lib/raid/xor/arm/xor-neon-glue.c b/lib/raid/xor/arm/xor-neon-glue.c new file mode 100644 index 00000000000000..cea39e0199048e --- /dev/null +++ b/lib/raid/xor/arm/xor-neon-glue.c @@ -0,0 +1,19 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2001 Russell King + */ +#include "xor_impl.h" +#include "xor_arch.h" + +static void xor_gen_neon(void *dest, void **srcs, unsigned int src_cnt, + unsigned int bytes) +{ + kernel_neon_begin(); + xor_gen_neon_inner(dest, srcs, src_cnt, bytes); + kernel_neon_end(); +} + +struct xor_block_template xor_block_neon = { + .name = "neon", + .xor_gen = xor_gen_neon, +}; diff --git a/arch/arm/lib/xor-neon.c b/lib/raid/xor/arm/xor-neon.c similarity index 53% rename from arch/arm/lib/xor-neon.c rename to lib/raid/xor/arm/xor-neon.c index cf57fca979089a..23147e3a79044f 100644 --- a/arch/arm/lib/xor-neon.c +++ b/lib/raid/xor/arm/xor-neon.c @@ -1,15 +1,10 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * linux/arch/arm/lib/xor-neon.c - * * Copyright (C) 2013 Linaro Ltd */ -#include -#include - -MODULE_DESCRIPTION("NEON accelerated XOR implementation"); -MODULE_LICENSE("GPL"); +#include "xor_impl.h" +#include "xor_arch.h" #ifndef __ARM_NEON__ #error You should compile this file with '-march=armv7-a -mfloat-abi=softfp -mfpu=neon' @@ -25,14 +20,7 @@ MODULE_LICENSE("GPL"); #pragma GCC optimize "tree-vectorize" #endif -#pragma GCC diagnostic ignored "-Wunused-variable" -#include +#define NO_TEMPLATE +#include "../xor-8regs.c" -struct xor_block_template const xor_block_neon_inner = { - .name = "__inner_neon__", - .do_2 = xor_8regs_2, - .do_3 = xor_8regs_3, - .do_4 = xor_8regs_4, - .do_5 = xor_8regs_5, -}; 
-EXPORT_SYMBOL(xor_block_neon_inner); +__DO_XOR_BLOCKS(neon_inner, xor_8regs_2, xor_8regs_3, xor_8regs_4, xor_8regs_5); diff --git a/arch/arm/include/asm/xor.h b/lib/raid/xor/arm/xor.c similarity index 59% rename from arch/arm/include/asm/xor.h rename to lib/raid/xor/arm/xor.c index 934b549905f5c5..45139b6c55eaa8 100644 --- a/arch/arm/include/asm/xor.h +++ b/lib/raid/xor/arm/xor.c @@ -1,13 +1,9 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ +// SPDX-License-Identifier: GPL-2.0-only /* - * arch/arm/include/asm/xor.h - * * Copyright (C) 2001 Russell King */ -#include -#include -#include -#include +#include "xor_impl.h" +#include "xor_arch.h" #define __XOR(a1, a2) a1 ^= a2 @@ -131,95 +127,10 @@ xor_arm4regs_5(unsigned long bytes, unsigned long * __restrict p1, } while (--lines); } -static struct xor_block_template xor_block_arm4regs = { - .name = "arm4regs", - .do_2 = xor_arm4regs_2, - .do_3 = xor_arm4regs_3, - .do_4 = xor_arm4regs_4, - .do_5 = xor_arm4regs_5, -}; - -#undef XOR_TRY_TEMPLATES -#define XOR_TRY_TEMPLATES \ - do { \ - xor_speed(&xor_block_arm4regs); \ - xor_speed(&xor_block_8regs); \ - xor_speed(&xor_block_32regs); \ - NEON_TEMPLATES; \ - } while (0) - -#ifdef CONFIG_KERNEL_MODE_NEON +DO_XOR_BLOCKS(arm4regs, xor_arm4regs_2, xor_arm4regs_3, xor_arm4regs_4, + xor_arm4regs_5); -extern struct xor_block_template const xor_block_neon_inner; - -static void -xor_neon_2(unsigned long bytes, unsigned long * __restrict p1, - const unsigned long * __restrict p2) -{ - if (in_interrupt()) { - xor_arm4regs_2(bytes, p1, p2); - } else { - kernel_neon_begin(); - xor_block_neon_inner.do_2(bytes, p1, p2); - kernel_neon_end(); - } -} - -static void -xor_neon_3(unsigned long bytes, unsigned long * __restrict p1, - const unsigned long * __restrict p2, - const unsigned long * __restrict p3) -{ - if (in_interrupt()) { - xor_arm4regs_3(bytes, p1, p2, p3); - } else { - kernel_neon_begin(); - xor_block_neon_inner.do_3(bytes, p1, p2, p3); - kernel_neon_end(); - } -} - -static void 
-xor_neon_4(unsigned long bytes, unsigned long * __restrict p1, - const unsigned long * __restrict p2, - const unsigned long * __restrict p3, - const unsigned long * __restrict p4) -{ - if (in_interrupt()) { - xor_arm4regs_4(bytes, p1, p2, p3, p4); - } else { - kernel_neon_begin(); - xor_block_neon_inner.do_4(bytes, p1, p2, p3, p4); - kernel_neon_end(); - } -} - -static void -xor_neon_5(unsigned long bytes, unsigned long * __restrict p1, - const unsigned long * __restrict p2, - const unsigned long * __restrict p3, - const unsigned long * __restrict p4, - const unsigned long * __restrict p5) -{ - if (in_interrupt()) { - xor_arm4regs_5(bytes, p1, p2, p3, p4, p5); - } else { - kernel_neon_begin(); - xor_block_neon_inner.do_5(bytes, p1, p2, p3, p4, p5); - kernel_neon_end(); - } -} - -static struct xor_block_template xor_block_neon = { - .name = "neon", - .do_2 = xor_neon_2, - .do_3 = xor_neon_3, - .do_4 = xor_neon_4, - .do_5 = xor_neon_5 +struct xor_block_template xor_block_arm4regs = { + .name = "arm4regs", + .xor_gen = xor_gen_arm4regs, }; - -#define NEON_TEMPLATES \ - do { if (cpu_has_neon()) xor_speed(&xor_block_neon); } while (0) -#else -#define NEON_TEMPLATES -#endif diff --git a/lib/raid/xor/arm/xor_arch.h b/lib/raid/xor/arm/xor_arch.h new file mode 100644 index 00000000000000..775ff835df656e --- /dev/null +++ b/lib/raid/xor/arm/xor_arch.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2001 Russell King + */ +#include + +extern struct xor_block_template xor_block_arm4regs; +extern struct xor_block_template xor_block_neon; + +void xor_gen_neon_inner(void *dest, void **srcs, unsigned int src_cnt, + unsigned int bytes); + +static __always_inline void __init arch_xor_init(void) +{ + xor_register(&xor_block_arm4regs); + xor_register(&xor_block_8regs); + xor_register(&xor_block_32regs); +#ifdef CONFIG_KERNEL_MODE_NEON + if (cpu_has_neon()) + xor_register(&xor_block_neon); +#endif +} diff --git a/lib/raid/xor/arm64/xor-neon-glue.c 
b/lib/raid/xor/arm64/xor-neon-glue.c new file mode 100644 index 00000000000000..f0284f86feb4c8 --- /dev/null +++ b/lib/raid/xor/arm64/xor-neon-glue.c @@ -0,0 +1,26 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Authors: Jackie Liu + * Copyright (C) 2018,Tianjin KYLIN Information Technology Co., Ltd. + */ + +#include +#include "xor_impl.h" +#include "xor_arch.h" +#include "xor-neon.h" + +#define XOR_TEMPLATE(_name) \ +static void xor_gen_##_name(void *dest, void **srcs, unsigned int src_cnt, \ + unsigned int bytes) \ +{ \ + scoped_ksimd() \ + xor_gen_##_name##_inner(dest, srcs, src_cnt, bytes); \ +} \ + \ +struct xor_block_template xor_block_##_name = { \ + .name = __stringify(_name), \ + .xor_gen = xor_gen_##_name, \ +}; + +XOR_TEMPLATE(neon); +XOR_TEMPLATE(eor3); diff --git a/arch/arm64/lib/xor-neon.c b/lib/raid/xor/arm64/xor-neon.c similarity index 76% rename from arch/arm64/lib/xor-neon.c rename to lib/raid/xor/arm64/xor-neon.c index 8fffebfa17b204..97ef3cb924968d 100644 --- a/arch/arm64/lib/xor-neon.c +++ b/lib/raid/xor/arm64/xor-neon.c @@ -1,17 +1,17 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * arch/arm64/lib/xor-neon.c - * * Authors: Jackie Liu * Copyright (C) 2018,Tianjin KYLIN Information Technology Co., Ltd. 
*/ -#include -#include +#include #include +#include "xor_impl.h" +#include "xor_arch.h" +#include "xor-neon.h" -static void xor_arm64_neon_2(unsigned long bytes, unsigned long * __restrict p1, - const unsigned long * __restrict p2) +static void __xor_neon_2(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2) { uint64_t *dp1 = (uint64_t *)p1; uint64_t *dp2 = (uint64_t *)p2; @@ -37,9 +37,9 @@ static void xor_arm64_neon_2(unsigned long bytes, unsigned long * __restrict p1, } while (--lines > 0); } -static void xor_arm64_neon_3(unsigned long bytes, unsigned long * __restrict p1, - const unsigned long * __restrict p2, - const unsigned long * __restrict p3) +static void __xor_neon_3(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3) { uint64_t *dp1 = (uint64_t *)p1; uint64_t *dp2 = (uint64_t *)p2; @@ -73,10 +73,10 @@ static void xor_arm64_neon_3(unsigned long bytes, unsigned long * __restrict p1, } while (--lines > 0); } -static void xor_arm64_neon_4(unsigned long bytes, unsigned long * __restrict p1, - const unsigned long * __restrict p2, - const unsigned long * __restrict p3, - const unsigned long * __restrict p4) +static void __xor_neon_4(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3, + const unsigned long * __restrict p4) { uint64_t *dp1 = (uint64_t *)p1; uint64_t *dp2 = (uint64_t *)p2; @@ -118,11 +118,11 @@ static void xor_arm64_neon_4(unsigned long bytes, unsigned long * __restrict p1, } while (--lines > 0); } -static void xor_arm64_neon_5(unsigned long bytes, unsigned long * __restrict p1, - const unsigned long * __restrict p2, - const unsigned long * __restrict p3, - const unsigned long * __restrict p4, - const unsigned long * __restrict p5) +static void __xor_neon_5(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const 
unsigned long * __restrict p3, + const unsigned long * __restrict p4, + const unsigned long * __restrict p5) { uint64_t *dp1 = (uint64_t *)p1; uint64_t *dp2 = (uint64_t *)p2; @@ -172,14 +172,8 @@ static void xor_arm64_neon_5(unsigned long bytes, unsigned long * __restrict p1, } while (--lines > 0); } -struct xor_block_template xor_block_inner_neon __ro_after_init = { - .name = "__inner_neon__", - .do_2 = xor_arm64_neon_2, - .do_3 = xor_arm64_neon_3, - .do_4 = xor_arm64_neon_4, - .do_5 = xor_arm64_neon_5, -}; -EXPORT_SYMBOL(xor_block_inner_neon); +__DO_XOR_BLOCKS(neon_inner, __xor_neon_2, __xor_neon_3, __xor_neon_4, + __xor_neon_5); static inline uint64x2_t eor3(uint64x2_t p, uint64x2_t q, uint64x2_t r) { @@ -191,10 +185,9 @@ static inline uint64x2_t eor3(uint64x2_t p, uint64x2_t q, uint64x2_t r) return res; } -static void xor_arm64_eor3_3(unsigned long bytes, - unsigned long * __restrict p1, - const unsigned long * __restrict p2, - const unsigned long * __restrict p3) +static void __xor_eor3_3(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3) { uint64_t *dp1 = (uint64_t *)p1; uint64_t *dp2 = (uint64_t *)p2; @@ -226,11 +219,10 @@ static void xor_arm64_eor3_3(unsigned long bytes, } while (--lines > 0); } -static void xor_arm64_eor3_4(unsigned long bytes, - unsigned long * __restrict p1, - const unsigned long * __restrict p2, - const unsigned long * __restrict p3, - const unsigned long * __restrict p4) +static void __xor_eor3_4(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3, + const unsigned long * __restrict p4) { uint64_t *dp1 = (uint64_t *)p1; uint64_t *dp2 = (uint64_t *)p2; @@ -270,12 +262,11 @@ static void xor_arm64_eor3_4(unsigned long bytes, } while (--lines > 0); } -static void xor_arm64_eor3_5(unsigned long bytes, - unsigned long * __restrict p1, - const unsigned long * __restrict p2, - const 
unsigned long * __restrict p3, - const unsigned long * __restrict p4, - const unsigned long * __restrict p5) +static void __xor_eor3_5(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3, + const unsigned long * __restrict p4, + const unsigned long * __restrict p5) { uint64_t *dp1 = (uint64_t *)p1; uint64_t *dp2 = (uint64_t *)p2; @@ -317,22 +308,5 @@ static void xor_arm64_eor3_5(unsigned long bytes, } while (--lines > 0); } -static int __init xor_neon_init(void) -{ - if (cpu_have_named_feature(SHA3)) { - xor_block_inner_neon.do_3 = xor_arm64_eor3_3; - xor_block_inner_neon.do_4 = xor_arm64_eor3_4; - xor_block_inner_neon.do_5 = xor_arm64_eor3_5; - } - return 0; -} -module_init(xor_neon_init); - -static void __exit xor_neon_exit(void) -{ -} -module_exit(xor_neon_exit); - -MODULE_AUTHOR("Jackie Liu "); -MODULE_DESCRIPTION("ARMv8 XOR Extensions"); -MODULE_LICENSE("GPL"); +__DO_XOR_BLOCKS(eor3_inner, __xor_neon_2, __xor_eor3_3, __xor_eor3_4, + __xor_eor3_5); diff --git a/lib/raid/xor/arm64/xor-neon.h b/lib/raid/xor/arm64/xor-neon.h new file mode 100644 index 00000000000000..514699ba8f5f8e --- /dev/null +++ b/lib/raid/xor/arm64/xor-neon.h @@ -0,0 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +void xor_gen_neon_inner(void *dest, void **srcs, unsigned int src_cnt, + unsigned int bytes); +void xor_gen_eor3_inner(void *dest, void **srcs, unsigned int src_cnt, + unsigned int bytes); diff --git a/lib/raid/xor/arm64/xor_arch.h b/lib/raid/xor/arm64/xor_arch.h new file mode 100644 index 00000000000000..5dbb40319501b7 --- /dev/null +++ b/lib/raid/xor/arm64/xor_arch.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Authors: Jackie Liu + * Copyright (C) 2018,Tianjin KYLIN Information Technology Co., Ltd. 
+ */ +#include + +extern struct xor_block_template xor_block_neon; +extern struct xor_block_template xor_block_eor3; + +static __always_inline void __init arch_xor_init(void) +{ + xor_register(&xor_block_8regs); + xor_register(&xor_block_32regs); + if (cpu_has_neon()) { + if (cpu_have_named_feature(SHA3)) + xor_register(&xor_block_eor3); + else + xor_register(&xor_block_neon); + } +} diff --git a/lib/raid/xor/loongarch/xor_arch.h b/lib/raid/xor/loongarch/xor_arch.h new file mode 100644 index 00000000000000..fe5e8244fd0ebd --- /dev/null +++ b/lib/raid/xor/loongarch/xor_arch.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) 2023 WANG Xuerui + */ +#include + +/* + * For grins, also test the generic routines. + * + * More importantly: it cannot be ruled out at this point of time, that some + * future (maybe reduced) models could run the vector algorithms slower than + * the scalar ones, maybe for errata or micro-op reasons. It may be + * appropriate to revisit this after one or two more uarch generations. 
+ */ + +extern struct xor_block_template xor_block_lsx; +extern struct xor_block_template xor_block_lasx; + +static __always_inline void __init arch_xor_init(void) +{ + xor_register(&xor_block_8regs); + xor_register(&xor_block_8regs_p); + xor_register(&xor_block_32regs); + xor_register(&xor_block_32regs_p); +#ifdef CONFIG_CPU_HAS_LSX + if (cpu_has_lsx) + xor_register(&xor_block_lsx); +#endif +#ifdef CONFIG_CPU_HAS_LASX + if (cpu_has_lasx) + xor_register(&xor_block_lasx); +#endif +} diff --git a/arch/loongarch/lib/xor_simd.c b/lib/raid/xor/loongarch/xor_simd.c similarity index 100% rename from arch/loongarch/lib/xor_simd.c rename to lib/raid/xor/loongarch/xor_simd.c diff --git a/arch/loongarch/lib/xor_simd.h b/lib/raid/xor/loongarch/xor_simd.h similarity index 100% rename from arch/loongarch/lib/xor_simd.h rename to lib/raid/xor/loongarch/xor_simd.h diff --git a/lib/raid/xor/loongarch/xor_simd_glue.c b/lib/raid/xor/loongarch/xor_simd_glue.c new file mode 100644 index 00000000000000..7f324d924f8791 --- /dev/null +++ b/lib/raid/xor/loongarch/xor_simd_glue.c @@ -0,0 +1,37 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * LoongArch SIMD XOR operations + * + * Copyright (C) 2023 WANG Xuerui + */ + +#include +#include +#include "xor_impl.h" +#include "xor_arch.h" +#include "xor_simd.h" + +#define MAKE_XOR_GLUES(flavor) \ +DO_XOR_BLOCKS(flavor##_inner, __xor_##flavor##_2, __xor_##flavor##_3, \ + __xor_##flavor##_4, __xor_##flavor##_5); \ + \ +static void xor_gen_##flavor(void *dest, void **srcs, unsigned int src_cnt, \ + unsigned int bytes) \ +{ \ + kernel_fpu_begin(); \ + xor_gen_##flavor##_inner(dest, srcs, src_cnt, bytes); \ + kernel_fpu_end(); \ +} \ + \ +struct xor_block_template xor_block_##flavor = { \ + .name = __stringify(flavor), \ + .xor_gen = xor_gen_##flavor \ +} + +#ifdef CONFIG_CPU_HAS_LSX +MAKE_XOR_GLUES(lsx); +#endif /* CONFIG_CPU_HAS_LSX */ + +#ifdef CONFIG_CPU_HAS_LASX +MAKE_XOR_GLUES(lasx); +#endif /* CONFIG_CPU_HAS_LASX */ diff --git 
a/arch/loongarch/lib/xor_template.c b/lib/raid/xor/loongarch/xor_template.c similarity index 100% rename from arch/loongarch/lib/xor_template.c rename to lib/raid/xor/loongarch/xor_template.c diff --git a/lib/raid/xor/powerpc/xor_arch.h b/lib/raid/xor/powerpc/xor_arch.h new file mode 100644 index 00000000000000..3b00a4a2fd67cc --- /dev/null +++ b/lib/raid/xor/powerpc/xor_arch.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * + * Copyright (C) IBM Corporation, 2012 + * + * Author: Anton Blanchard + */ +#include + +extern struct xor_block_template xor_block_altivec; + +static __always_inline void __init arch_xor_init(void) +{ + xor_register(&xor_block_8regs); + xor_register(&xor_block_8regs_p); + xor_register(&xor_block_32regs); + xor_register(&xor_block_32regs_p); +#ifdef CONFIG_ALTIVEC + if (cpu_has_feature(CPU_FTR_ALTIVEC)) + xor_register(&xor_block_altivec); +#endif +} diff --git a/arch/powerpc/lib/xor_vmx.c b/lib/raid/xor/powerpc/xor_vmx.c similarity index 69% rename from arch/powerpc/lib/xor_vmx.c rename to lib/raid/xor/powerpc/xor_vmx.c index aab49d056d1883..09bed98c1bc723 100644 --- a/arch/powerpc/lib/xor_vmx.c +++ b/lib/raid/xor/powerpc/xor_vmx.c @@ -10,6 +10,7 @@ * Sparse (as at v0.5.0) gets very, very confused by this file. * Make it a bit simpler for it. 
*/ +#include "xor_impl.h" #if !defined(__CHECKER__) #include #else @@ -49,9 +50,9 @@ typedef vector signed char unative_t; V1##_3 = vec_xor(V1##_3, V2##_3); \ } while (0) -void __xor_altivec_2(unsigned long bytes, - unsigned long * __restrict v1_in, - const unsigned long * __restrict v2_in) +static void __xor_altivec_2(unsigned long bytes, + unsigned long * __restrict v1_in, + const unsigned long * __restrict v2_in) { DEFINE(v1); DEFINE(v2); @@ -68,10 +69,10 @@ void __xor_altivec_2(unsigned long bytes, } while (--lines > 0); } -void __xor_altivec_3(unsigned long bytes, - unsigned long * __restrict v1_in, - const unsigned long * __restrict v2_in, - const unsigned long * __restrict v3_in) +static void __xor_altivec_3(unsigned long bytes, + unsigned long * __restrict v1_in, + const unsigned long * __restrict v2_in, + const unsigned long * __restrict v3_in) { DEFINE(v1); DEFINE(v2); @@ -92,11 +93,11 @@ void __xor_altivec_3(unsigned long bytes, } while (--lines > 0); } -void __xor_altivec_4(unsigned long bytes, - unsigned long * __restrict v1_in, - const unsigned long * __restrict v2_in, - const unsigned long * __restrict v3_in, - const unsigned long * __restrict v4_in) +static void __xor_altivec_4(unsigned long bytes, + unsigned long * __restrict v1_in, + const unsigned long * __restrict v2_in, + const unsigned long * __restrict v3_in, + const unsigned long * __restrict v4_in) { DEFINE(v1); DEFINE(v2); @@ -121,12 +122,12 @@ void __xor_altivec_4(unsigned long bytes, } while (--lines > 0); } -void __xor_altivec_5(unsigned long bytes, - unsigned long * __restrict v1_in, - const unsigned long * __restrict v2_in, - const unsigned long * __restrict v3_in, - const unsigned long * __restrict v4_in, - const unsigned long * __restrict v5_in) +static void __xor_altivec_5(unsigned long bytes, + unsigned long * __restrict v1_in, + const unsigned long * __restrict v2_in, + const unsigned long * __restrict v3_in, + const unsigned long * __restrict v4_in, + const unsigned long * 
__restrict v5_in) { DEFINE(v1); DEFINE(v2); @@ -154,3 +155,6 @@ void __xor_altivec_5(unsigned long bytes, v5 += 4; } while (--lines > 0); } + +__DO_XOR_BLOCKS(altivec_inner, __xor_altivec_2, __xor_altivec_3, + __xor_altivec_4, __xor_altivec_5); diff --git a/lib/raid/xor/powerpc/xor_vmx.h b/lib/raid/xor/powerpc/xor_vmx.h new file mode 100644 index 00000000000000..1d26c1133a8685 --- /dev/null +++ b/lib/raid/xor/powerpc/xor_vmx.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Simple interface to link xor_vmx.c and xor_vmx_glue.c + * + * Separating these files ensures that no altivec instructions are run + * outside of the enable/disable altivec block. + */ + +void xor_gen_altivec_inner(void *dest, void **srcs, unsigned int src_cnt, + unsigned int bytes); diff --git a/lib/raid/xor/powerpc/xor_vmx_glue.c b/lib/raid/xor/powerpc/xor_vmx_glue.c new file mode 100644 index 00000000000000..dbfbb5cadc36af --- /dev/null +++ b/lib/raid/xor/powerpc/xor_vmx_glue.c @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Altivec XOR operations + * + * Copyright 2017 IBM Corp. 
+ */ + +#include +#include +#include +#include "xor_impl.h" +#include "xor_arch.h" +#include "xor_vmx.h" + +static void xor_gen_altivec(void *dest, void **srcs, unsigned int src_cnt, + unsigned int bytes) +{ + preempt_disable(); + enable_kernel_altivec(); + xor_gen_altivec_inner(dest, srcs, src_cnt, bytes); + disable_kernel_altivec(); + preempt_enable(); +} + +struct xor_block_template xor_block_altivec = { + .name = "altivec", + .xor_gen = xor_gen_altivec, +}; diff --git a/lib/raid/xor/riscv/xor-glue.c b/lib/raid/xor/riscv/xor-glue.c new file mode 100644 index 00000000000000..2e4c1b05d998fa --- /dev/null +++ b/lib/raid/xor/riscv/xor-glue.c @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2021 SiFive + */ + +#include +#include +#include +#include "xor_impl.h" +#include "xor_arch.h" + +DO_XOR_BLOCKS(vector_inner, xor_regs_2_, xor_regs_3_, xor_regs_4_, xor_regs_5_); + +static void xor_gen_vector(void *dest, void **srcs, unsigned int src_cnt, + unsigned int bytes) +{ + kernel_vector_begin(); + xor_gen_vector_inner(dest, srcs, src_cnt, bytes); + kernel_vector_end(); +} + +struct xor_block_template xor_block_rvv = { + .name = "rvv", + .xor_gen = xor_gen_vector, +}; diff --git a/arch/riscv/lib/xor.S b/lib/raid/xor/riscv/xor.S similarity index 92% rename from arch/riscv/lib/xor.S rename to lib/raid/xor/riscv/xor.S index b28f2430e52fa5..56fb7fc1e2cd8d 100644 --- a/arch/riscv/lib/xor.S +++ b/lib/raid/xor/riscv/xor.S @@ -18,7 +18,6 @@ SYM_FUNC_START(xor_regs_2_) bnez a0, xor_regs_2_ ret SYM_FUNC_END(xor_regs_2_) -EXPORT_SYMBOL(xor_regs_2_) SYM_FUNC_START(xor_regs_3_) vsetvli a4, a0, e8, m8, ta, ma @@ -35,7 +34,6 @@ SYM_FUNC_START(xor_regs_3_) bnez a0, xor_regs_3_ ret SYM_FUNC_END(xor_regs_3_) -EXPORT_SYMBOL(xor_regs_3_) SYM_FUNC_START(xor_regs_4_) vsetvli a5, a0, e8, m8, ta, ma @@ -55,7 +53,6 @@ SYM_FUNC_START(xor_regs_4_) bnez a0, xor_regs_4_ ret SYM_FUNC_END(xor_regs_4_) -EXPORT_SYMBOL(xor_regs_4_) SYM_FUNC_START(xor_regs_5_) vsetvli a6, 
a0, e8, m8, ta, ma @@ -78,4 +75,3 @@ SYM_FUNC_START(xor_regs_5_) bnez a0, xor_regs_5_ ret SYM_FUNC_END(xor_regs_5_) -EXPORT_SYMBOL(xor_regs_5_) diff --git a/lib/raid/xor/riscv/xor_arch.h b/lib/raid/xor/riscv/xor_arch.h new file mode 100644 index 00000000000000..9240857d760b26 --- /dev/null +++ b/lib/raid/xor/riscv/xor_arch.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) 2021 SiFive + */ +#include + +extern struct xor_block_template xor_block_rvv; + +static __always_inline void __init arch_xor_init(void) +{ + xor_register(&xor_block_8regs); + xor_register(&xor_block_32regs); +#ifdef CONFIG_RISCV_ISA_V + if (has_vector()) + xor_register(&xor_block_rvv); +#endif +} diff --git a/arch/s390/lib/xor.c b/lib/raid/xor/s390/xor.c similarity index 93% rename from arch/s390/lib/xor.c rename to lib/raid/xor/s390/xor.c index 1721b73b780369..d8a62a70db6c3a 100644 --- a/arch/s390/lib/xor.c +++ b/lib/raid/xor/s390/xor.c @@ -7,9 +7,8 @@ */ #include -#include -#include -#include +#include "xor_impl.h" +#include "xor_arch.h" static void xor_xc_2(unsigned long bytes, unsigned long * __restrict p1, const unsigned long * __restrict p2) @@ -127,11 +126,9 @@ static void xor_xc_5(unsigned long bytes, unsigned long * __restrict p1, : : "0", "cc", "memory"); } +DO_XOR_BLOCKS(xc, xor_xc_2, xor_xc_3, xor_xc_4, xor_xc_5); + struct xor_block_template xor_block_xc = { - .name = "xc", - .do_2 = xor_xc_2, - .do_3 = xor_xc_3, - .do_4 = xor_xc_4, - .do_5 = xor_xc_5, + .name = "xc", + .xor_gen = xor_gen_xc, }; -EXPORT_SYMBOL(xor_block_xc); diff --git a/lib/raid/xor/s390/xor_arch.h b/lib/raid/xor/s390/xor_arch.h new file mode 100644 index 00000000000000..4a233ed2b97a6a --- /dev/null +++ b/lib/raid/xor/s390/xor_arch.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Optimized xor routines + * + * Copyright IBM Corp. 
2016 + * Author(s): Martin Schwidefsky + */ +extern struct xor_block_template xor_block_xc; + +static __always_inline void __init arch_xor_init(void) +{ + xor_force(&xor_block_xc); +} diff --git a/arch/sparc/include/asm/xor_32.h b/lib/raid/xor/sparc/xor-sparc32.c similarity index 92% rename from arch/sparc/include/asm/xor_32.h rename to lib/raid/xor/sparc/xor-sparc32.c index 0351813cf3af5a..fb37631e90e697 100644 --- a/arch/sparc/include/asm/xor_32.h +++ b/lib/raid/xor/sparc/xor-sparc32.c @@ -1,16 +1,12 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * include/asm/xor.h - * - * Optimized RAID-5 checksumming functions for 32-bit Sparc. - */ - +// SPDX-License-Identifier: GPL-2.0-or-later /* * High speed xor_block operation for RAID4/5 utilizing the * ldd/std SPARC instructions. * * Copyright (C) 1999 Jakub Jelinek (jj@ultra.linux.cz) */ +#include "xor_impl.h" +#include "xor_arch.h" static void sparc_2(unsigned long bytes, unsigned long * __restrict p1, @@ -248,21 +244,9 @@ sparc_5(unsigned long bytes, unsigned long * __restrict p1, } while (--lines > 0); } -static struct xor_block_template xor_block_SPARC = { - .name = "SPARC", - .do_2 = sparc_2, - .do_3 = sparc_3, - .do_4 = sparc_4, - .do_5 = sparc_5, -}; - -/* For grins, also test the generic routines. 
*/ -#include +DO_XOR_BLOCKS(sparc32, sparc_2, sparc_3, sparc_4, sparc_5); -#undef XOR_TRY_TEMPLATES -#define XOR_TRY_TEMPLATES \ - do { \ - xor_speed(&xor_block_8regs); \ - xor_speed(&xor_block_32regs); \ - xor_speed(&xor_block_SPARC); \ - } while (0) +struct xor_block_template xor_block_SPARC = { + .name = "SPARC", + .xor_gen = xor_gen_sparc32, +}; diff --git a/arch/sparc/include/asm/xor_64.h b/lib/raid/xor/sparc/xor-sparc64-glue.c similarity index 63% rename from arch/sparc/include/asm/xor_64.h rename to lib/raid/xor/sparc/xor-sparc64-glue.c index caaddea8ad79dd..a8a686e0d25830 100644 --- a/arch/sparc/include/asm/xor_64.h +++ b/lib/raid/xor/sparc/xor-sparc64-glue.c @@ -1,7 +1,5 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * include/asm/xor.h - * * High speed xor_block operation for RAID4/5 utilizing the * UltraSparc Visual Instruction Set and Niagara block-init * twin-load instructions. @@ -10,7 +8,8 @@ * Copyright (C) 2006 David S. Miller */ -#include +#include "xor_impl.h" +#include "xor_arch.h" void xor_vis_2(unsigned long bytes, unsigned long * __restrict p1, const unsigned long * __restrict p2); @@ -29,12 +28,11 @@ void xor_vis_5(unsigned long bytes, unsigned long * __restrict p1, /* XXX Ugh, write cheetah versions... 
-DaveM */ -static struct xor_block_template xor_block_VIS = { - .name = "VIS", - .do_2 = xor_vis_2, - .do_3 = xor_vis_3, - .do_4 = xor_vis_4, - .do_5 = xor_vis_5, +DO_XOR_BLOCKS(vis, xor_vis_2, xor_vis_3, xor_vis_4, xor_vis_5); + +struct xor_block_template xor_block_VIS = { + .name = "VIS", + .xor_gen = xor_gen_vis, }; void xor_niagara_2(unsigned long bytes, unsigned long * __restrict p1, @@ -52,28 +50,10 @@ void xor_niagara_5(unsigned long bytes, unsigned long * __restrict p1, const unsigned long * __restrict p4, const unsigned long * __restrict p5); -static struct xor_block_template xor_block_niagara = { - .name = "Niagara", - .do_2 = xor_niagara_2, - .do_3 = xor_niagara_3, - .do_4 = xor_niagara_4, - .do_5 = xor_niagara_5, -}; +DO_XOR_BLOCKS(niagara, xor_niagara_2, xor_niagara_3, xor_niagara_4, + xor_niagara_5); -#undef XOR_TRY_TEMPLATES -#define XOR_TRY_TEMPLATES \ - do { \ - xor_speed(&xor_block_VIS); \ - xor_speed(&xor_block_niagara); \ - } while (0) - -/* For VIS for everything except Niagara. */ -#define XOR_SELECT_TEMPLATE(FASTEST) \ - ((tlb_type == hypervisor && \ - (sun4v_chip_type == SUN4V_CHIP_NIAGARA1 || \ - sun4v_chip_type == SUN4V_CHIP_NIAGARA2 || \ - sun4v_chip_type == SUN4V_CHIP_NIAGARA3 || \ - sun4v_chip_type == SUN4V_CHIP_NIAGARA4 || \ - sun4v_chip_type == SUN4V_CHIP_NIAGARA5)) ? \ - &xor_block_niagara : \ - &xor_block_VIS) +struct xor_block_template xor_block_niagara = { + .name = "Niagara", + .xor_gen = xor_gen_niagara, +}; diff --git a/arch/sparc/lib/xor.S b/lib/raid/xor/sparc/xor-sparc64.S similarity index 98% rename from arch/sparc/lib/xor.S rename to lib/raid/xor/sparc/xor-sparc64.S index 35461e3b2a9b14..a7b74d473bd47d 100644 --- a/arch/sparc/lib/xor.S +++ b/lib/raid/xor/sparc/xor-sparc64.S @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* - * arch/sparc64/lib/xor.S - * * High speed xor_block operation for RAID4/5 utilizing the * UltraSparc Visual Instruction Set and Niagara store-init/twin-load. 
* @@ -92,7 +90,6 @@ ENTRY(xor_vis_2) retl wr %g0, 0, %fprs ENDPROC(xor_vis_2) -EXPORT_SYMBOL(xor_vis_2) ENTRY(xor_vis_3) rd %fprs, %o5 @@ -159,7 +156,6 @@ ENTRY(xor_vis_3) retl wr %g0, 0, %fprs ENDPROC(xor_vis_3) -EXPORT_SYMBOL(xor_vis_3) ENTRY(xor_vis_4) rd %fprs, %o5 @@ -245,7 +241,6 @@ ENTRY(xor_vis_4) retl wr %g0, 0, %fprs ENDPROC(xor_vis_4) -EXPORT_SYMBOL(xor_vis_4) ENTRY(xor_vis_5) save %sp, -192, %sp @@ -352,7 +347,6 @@ ENTRY(xor_vis_5) ret restore ENDPROC(xor_vis_5) -EXPORT_SYMBOL(xor_vis_5) /* Niagara versions. */ ENTRY(xor_niagara_2) /* %o0=bytes, %o1=dest, %o2=src */ @@ -399,7 +393,6 @@ ENTRY(xor_niagara_2) /* %o0=bytes, %o1=dest, %o2=src */ ret restore ENDPROC(xor_niagara_2) -EXPORT_SYMBOL(xor_niagara_2) ENTRY(xor_niagara_3) /* %o0=bytes, %o1=dest, %o2=src1, %o3=src2 */ save %sp, -192, %sp @@ -461,7 +454,6 @@ ENTRY(xor_niagara_3) /* %o0=bytes, %o1=dest, %o2=src1, %o3=src2 */ ret restore ENDPROC(xor_niagara_3) -EXPORT_SYMBOL(xor_niagara_3) ENTRY(xor_niagara_4) /* %o0=bytes, %o1=dest, %o2=src1, %o3=src2, %o4=src3 */ save %sp, -192, %sp @@ -544,7 +536,6 @@ ENTRY(xor_niagara_4) /* %o0=bytes, %o1=dest, %o2=src1, %o3=src2, %o4=src3 */ ret restore ENDPROC(xor_niagara_4) -EXPORT_SYMBOL(xor_niagara_4) ENTRY(xor_niagara_5) /* %o0=bytes, %o1=dest, %o2=src1, %o3=src2, %o4=src3, %o5=src4 */ save %sp, -192, %sp @@ -643,4 +634,3 @@ ENTRY(xor_niagara_5) /* %o0=bytes, %o1=dest, %o2=src1, %o3=src2, %o4=src3, %o5=s ret restore ENDPROC(xor_niagara_5) -EXPORT_SYMBOL(xor_niagara_5) diff --git a/lib/raid/xor/sparc/xor_arch.h b/lib/raid/xor/sparc/xor_arch.h new file mode 100644 index 00000000000000..af288abe4e9176 --- /dev/null +++ b/lib/raid/xor/sparc/xor_arch.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 1997, 1999 Jakub Jelinek (jj@ultra.linux.cz) + * Copyright (C) 2006 David S. 
Miller + */ +#if defined(__sparc__) && defined(__arch64__) +#include + +extern struct xor_block_template xor_block_VIS; +extern struct xor_block_template xor_block_niagara; + +static __always_inline void __init arch_xor_init(void) +{ + /* Force VIS for everything except Niagara. */ + if (tlb_type == hypervisor && + (sun4v_chip_type == SUN4V_CHIP_NIAGARA1 || + sun4v_chip_type == SUN4V_CHIP_NIAGARA2 || + sun4v_chip_type == SUN4V_CHIP_NIAGARA3 || + sun4v_chip_type == SUN4V_CHIP_NIAGARA4 || + sun4v_chip_type == SUN4V_CHIP_NIAGARA5)) + xor_force(&xor_block_niagara); + else + xor_force(&xor_block_VIS); +} +#else /* sparc64 */ + +extern struct xor_block_template xor_block_SPARC; + +static __always_inline void __init arch_xor_init(void) +{ + xor_register(&xor_block_8regs); + xor_register(&xor_block_32regs); + xor_register(&xor_block_SPARC); +} +#endif /* !sparc64 */ diff --git a/lib/raid/xor/tests/Makefile b/lib/raid/xor/tests/Makefile new file mode 100644 index 00000000000000..661e8f6ffd1f35 --- /dev/null +++ b/lib/raid/xor/tests/Makefile @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0-only + +obj-$(CONFIG_XOR_KUNIT_TEST) += xor_kunit.o diff --git a/lib/raid/xor/tests/xor_kunit.c b/lib/raid/xor/tests/xor_kunit.c new file mode 100644 index 00000000000000..01cbdf44f6b033 --- /dev/null +++ b/lib/raid/xor/tests/xor_kunit.c @@ -0,0 +1,187 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Unit test the XOR library functions. + * + * Copyright 2024 Google LLC + * Copyright 2026 Christoph Hellwig + * + * Based on the CRC tests by Eric Biggers . 
+ */ +#include +#include +#include +#include +#include + +#define XOR_KUNIT_SEED 42 +#define XOR_KUNIT_MAX_BYTES 16384 +#define XOR_KUNIT_MAX_BUFFERS 64 +#define XOR_KUNIT_NUM_TEST_ITERS 1000 + +static struct rnd_state rng; +static void *test_buffers[XOR_KUNIT_MAX_BUFFERS]; +static void *test_dest; +static void *test_ref; +static size_t test_buflen; + +static u32 rand32(void) +{ + return prandom_u32_state(&rng); +} + +/* Reference implementation using dumb byte-wise XOR */ +static void xor_ref(void *dest, void **srcs, unsigned int src_cnt, + unsigned int bytes) +{ + unsigned int off, idx; + u8 *d = dest; + + for (off = 0; off < bytes; off++) { + for (idx = 0; idx < src_cnt; idx++) { + u8 *src = srcs[idx]; + + d[off] ^= src[off]; + } + } +} + +/* Generate a random length that is a multiple of 512. */ +static unsigned int random_length(unsigned int max_length) +{ + return (rand32() % (max_length + 1)) & ~511; +} + +/* Generate a random alignment that is a multiple of 64. */ +static unsigned int random_alignment(unsigned int max_alignment) +{ + return (rand32() % (max_alignment + 1)) & ~63; +} + +static void xor_generate_random_data(void) +{ + int i; + + prandom_bytes_state(&rng, test_dest, test_buflen); + memcpy(test_ref, test_dest, test_buflen); + for (i = 0; i < XOR_KUNIT_MAX_BUFFERS; i++) + prandom_bytes_state(&rng, test_buffers[i], test_buflen); +} + +/* Test that xor_gen gives the same result as a reference implementation. */ +static void xor_test(struct kunit *test) +{ + void *aligned_buffers[XOR_KUNIT_MAX_BUFFERS]; + size_t i; + + for (i = 0; i < XOR_KUNIT_NUM_TEST_ITERS; i++) { + unsigned int nr_buffers = + (rand32() % XOR_KUNIT_MAX_BUFFERS) + 1; + unsigned int len = random_length(XOR_KUNIT_MAX_BYTES); + unsigned int max_alignment, align = 0; + void *buffers; + + if (rand32() % 8 == 0) + /* Refresh the data occasionally. */ + xor_generate_random_data(); + + /* + * If we're not using the entire buffer size, inject randomized + * alignment into the buffer. 
+ */ + max_alignment = XOR_KUNIT_MAX_BYTES - len; + if (max_alignment == 0) { + buffers = test_buffers; + } else if (rand32() % 2 == 0) { + /* Use random alignments mod 64 */ + int j; + + for (j = 0; j < nr_buffers; j++) + aligned_buffers[j] = test_buffers[j] + + random_alignment(max_alignment); + buffers = aligned_buffers; + align = random_alignment(max_alignment); + } else { + /* Go up to the guard page, to catch buffer overreads */ + int j; + + align = test_buflen - len; + for (j = 0; j < nr_buffers; j++) + aligned_buffers[j] = test_buffers[j] + align; + buffers = aligned_buffers; + } + + /* + * Compute the XOR, and verify that it equals the XOR computed + * by a simple byte-at-a-time reference implementation. + */ + xor_ref(test_ref + align, buffers, nr_buffers, len); + xor_gen(test_dest + align, buffers, nr_buffers, len); + KUNIT_EXPECT_MEMEQ_MSG(test, test_ref + align, + test_dest + align, len, + "Wrong result with buffers=%u, len=%u, unaligned=%s, at_end=%s", + nr_buffers, len, + str_yes_no(max_alignment), + str_yes_no(align + len == test_buflen)); + } +} + +static struct kunit_case xor_test_cases[] = { + KUNIT_CASE(xor_test), + {}, +}; + +static int xor_suite_init(struct kunit_suite *suite) +{ + int i; + + /* + * Allocate the test buffer using vmalloc() with a page-aligned length + * so that it is immediately followed by a guard page. This allows + * buffer overreads to be detected, even in assembly code. 
+ */ + test_buflen = round_up(XOR_KUNIT_MAX_BYTES, PAGE_SIZE); + test_ref = vmalloc(test_buflen); + if (!test_ref) + return -ENOMEM; + test_dest = vmalloc(test_buflen); + if (!test_dest) + goto out_free_ref; + for (i = 0; i < XOR_KUNIT_MAX_BUFFERS; i++) { + test_buffers[i] = vmalloc(test_buflen); + if (!test_buffers[i]) + goto out_free_buffers; + } + + prandom_seed_state(&rng, XOR_KUNIT_SEED); + xor_generate_random_data(); + return 0; + +out_free_buffers: + while (--i >= 0) + vfree(test_buffers[i]); + vfree(test_dest); +out_free_ref: + vfree(test_ref); + return -ENOMEM; +} + +static void xor_suite_exit(struct kunit_suite *suite) +{ + int i; + + vfree(test_ref); + vfree(test_dest); + for (i = 0; i < XOR_KUNIT_MAX_BUFFERS; i++) + vfree(test_buffers[i]); +} + +static struct kunit_suite xor_test_suite = { + .name = "xor", + .test_cases = xor_test_cases, + .suite_init = xor_suite_init, + .suite_exit = xor_suite_exit, +}; +kunit_test_suite(xor_test_suite); + +MODULE_DESCRIPTION("Unit test for the XOR library functions"); +MODULE_LICENSE("GPL"); diff --git a/lib/raid/xor/um/xor_arch.h b/lib/raid/xor/um/xor_arch.h new file mode 100644 index 00000000000000..a33e57a26c5ed7 --- /dev/null +++ b/lib/raid/xor/um/xor_arch.h @@ -0,0 +1,2 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include <../x86/xor_arch.h> diff --git a/arch/x86/include/asm/xor_avx.h b/lib/raid/xor/x86/xor-avx.c similarity index 84% rename from arch/x86/include/asm/xor_avx.h rename to lib/raid/xor/x86/xor-avx.c index 7f81dd5897f417..f7777d7aa269bd 100644 --- a/arch/x86/include/asm/xor_avx.h +++ b/lib/raid/xor/x86/xor-avx.c @@ -1,18 +1,16 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -#ifndef _ASM_X86_XOR_AVX_H -#define _ASM_X86_XOR_AVX_H - +// SPDX-License-Identifier: GPL-2.0-only /* - * Optimized RAID-5 checksumming functions for AVX + * Optimized XOR parity functions for AVX * * Copyright (C) 2012 Intel Corporation * Author: Jim Kukunas * * Based on Ingo Molnar and Zach Brown's respective MMX and SSE 
routines */ - #include #include +#include "xor_impl.h" +#include "xor_arch.h" #define BLOCK4(i) \ BLOCK(32 * i, 0) \ @@ -31,8 +29,6 @@ static void xor_avx_2(unsigned long bytes, unsigned long * __restrict p0, { unsigned long lines = bytes >> 9; - kernel_fpu_begin(); - while (lines--) { #undef BLOCK #define BLOCK(i, reg) \ @@ -49,8 +45,6 @@ do { \ p0 = (unsigned long *)((uintptr_t)p0 + 512); p1 = (unsigned long *)((uintptr_t)p1 + 512); } - - kernel_fpu_end(); } static void xor_avx_3(unsigned long bytes, unsigned long * __restrict p0, @@ -59,8 +53,6 @@ static void xor_avx_3(unsigned long bytes, unsigned long * __restrict p0, { unsigned long lines = bytes >> 9; - kernel_fpu_begin(); - while (lines--) { #undef BLOCK #define BLOCK(i, reg) \ @@ -80,8 +72,6 @@ do { \ p1 = (unsigned long *)((uintptr_t)p1 + 512); p2 = (unsigned long *)((uintptr_t)p2 + 512); } - - kernel_fpu_end(); } static void xor_avx_4(unsigned long bytes, unsigned long * __restrict p0, @@ -91,8 +81,6 @@ static void xor_avx_4(unsigned long bytes, unsigned long * __restrict p0, { unsigned long lines = bytes >> 9; - kernel_fpu_begin(); - while (lines--) { #undef BLOCK #define BLOCK(i, reg) \ @@ -115,8 +103,6 @@ do { \ p2 = (unsigned long *)((uintptr_t)p2 + 512); p3 = (unsigned long *)((uintptr_t)p3 + 512); } - - kernel_fpu_end(); } static void xor_avx_5(unsigned long bytes, unsigned long * __restrict p0, @@ -127,8 +113,6 @@ static void xor_avx_5(unsigned long bytes, unsigned long * __restrict p0, { unsigned long lines = bytes >> 9; - kernel_fpu_begin(); - while (lines--) { #undef BLOCK #define BLOCK(i, reg) \ @@ -154,25 +138,19 @@ do { \ p3 = (unsigned long *)((uintptr_t)p3 + 512); p4 = (unsigned long *)((uintptr_t)p4 + 512); } +} + +DO_XOR_BLOCKS(avx_inner, xor_avx_2, xor_avx_3, xor_avx_4, xor_avx_5); +static void xor_gen_avx(void *dest, void **srcs, unsigned int src_cnt, + unsigned int bytes) +{ + kernel_fpu_begin(); + xor_gen_avx_inner(dest, srcs, src_cnt, bytes); kernel_fpu_end(); } -static struct 
xor_block_template xor_block_avx = { - .name = "avx", - .do_2 = xor_avx_2, - .do_3 = xor_avx_3, - .do_4 = xor_avx_4, - .do_5 = xor_avx_5, +struct xor_block_template xor_block_avx = { + .name = "avx", + .xor_gen = xor_gen_avx, }; - -#define AVX_XOR_SPEED \ -do { \ - if (boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_OSXSAVE)) \ - xor_speed(&xor_block_avx); \ -} while (0) - -#define AVX_SELECT(FASTEST) \ - (boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_OSXSAVE) ? &xor_block_avx : FASTEST) - -#endif diff --git a/arch/x86/include/asm/xor_32.h b/lib/raid/xor/x86/xor-mmx.c similarity index 87% rename from arch/x86/include/asm/xor_32.h rename to lib/raid/xor/x86/xor-mmx.c index 7a6b9474591e75..63a8b0444fcef1 100644 --- a/arch/x86/include/asm/xor_32.h +++ b/lib/raid/xor/x86/xor-mmx.c @@ -1,15 +1,12 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -#ifndef _ASM_X86_XOR_32_H -#define _ASM_X86_XOR_32_H - -/* - * Optimized RAID-5 checksumming functions for MMX. - */ - +// SPDX-License-Identifier: GPL-2.0-or-later /* - * High-speed RAID5 checksumming functions utilizing MMX instructions. + * Optimized XOR parity functions for MMX. + * * Copyright (C) 1998 Ingo Molnar. 
*/ +#include +#include "xor_impl.h" +#include "xor_arch.h" #define LD(x, y) " movq 8*("#x")(%1), %%mm"#y" ;\n" #define ST(x, y) " movq %%mm"#y", 8*("#x")(%1) ;\n" @@ -18,16 +15,12 @@ #define XO3(x, y) " pxor 8*("#x")(%4), %%mm"#y" ;\n" #define XO4(x, y) " pxor 8*("#x")(%5), %%mm"#y" ;\n" -#include - static void xor_pII_mmx_2(unsigned long bytes, unsigned long * __restrict p1, const unsigned long * __restrict p2) { unsigned long lines = bytes >> 7; - kernel_fpu_begin(); - asm volatile( #undef BLOCK #define BLOCK(i) \ @@ -60,8 +53,6 @@ xor_pII_mmx_2(unsigned long bytes, unsigned long * __restrict p1, "+r" (p1), "+r" (p2) : : "memory"); - - kernel_fpu_end(); } static void @@ -71,8 +62,6 @@ xor_pII_mmx_3(unsigned long bytes, unsigned long * __restrict p1, { unsigned long lines = bytes >> 7; - kernel_fpu_begin(); - asm volatile( #undef BLOCK #define BLOCK(i) \ @@ -110,8 +99,6 @@ xor_pII_mmx_3(unsigned long bytes, unsigned long * __restrict p1, "+r" (p1), "+r" (p2), "+r" (p3) : : "memory"); - - kernel_fpu_end(); } static void @@ -122,8 +109,6 @@ xor_pII_mmx_4(unsigned long bytes, unsigned long * __restrict p1, { unsigned long lines = bytes >> 7; - kernel_fpu_begin(); - asm volatile( #undef BLOCK #define BLOCK(i) \ @@ -166,8 +151,6 @@ xor_pII_mmx_4(unsigned long bytes, unsigned long * __restrict p1, "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4) : : "memory"); - - kernel_fpu_end(); } @@ -180,8 +163,6 @@ xor_pII_mmx_5(unsigned long bytes, unsigned long * __restrict p1, { unsigned long lines = bytes >> 7; - kernel_fpu_begin(); - /* Make sure GCC forgets anything it knows about p4 or p5, such that it won't pass to the asm volatile below a register that is shared with any other variable. That's @@ -242,8 +223,6 @@ xor_pII_mmx_5(unsigned long bytes, unsigned long * __restrict p1, Clobber them just to be sure nobody does something stupid like assuming they have some legal value. 
*/ asm("" : "=r" (p4), "=r" (p5)); - - kernel_fpu_end(); } #undef LD @@ -260,8 +239,6 @@ xor_p5_mmx_2(unsigned long bytes, unsigned long * __restrict p1, { unsigned long lines = bytes >> 6; - kernel_fpu_begin(); - asm volatile( " .align 32 ;\n" " 1: ;\n" @@ -298,8 +275,6 @@ xor_p5_mmx_2(unsigned long bytes, unsigned long * __restrict p1, "+r" (p1), "+r" (p2) : : "memory"); - - kernel_fpu_end(); } static void @@ -309,8 +284,6 @@ xor_p5_mmx_3(unsigned long bytes, unsigned long * __restrict p1, { unsigned long lines = bytes >> 6; - kernel_fpu_begin(); - asm volatile( " .align 32,0x90 ;\n" " 1: ;\n" @@ -356,8 +329,6 @@ xor_p5_mmx_3(unsigned long bytes, unsigned long * __restrict p1, "+r" (p1), "+r" (p2), "+r" (p3) : : "memory" ); - - kernel_fpu_end(); } static void @@ -368,8 +339,6 @@ xor_p5_mmx_4(unsigned long bytes, unsigned long * __restrict p1, { unsigned long lines = bytes >> 6; - kernel_fpu_begin(); - asm volatile( " .align 32,0x90 ;\n" " 1: ;\n" @@ -424,8 +393,6 @@ xor_p5_mmx_4(unsigned long bytes, unsigned long * __restrict p1, "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4) : : "memory"); - - kernel_fpu_end(); } static void @@ -437,8 +404,6 @@ xor_p5_mmx_5(unsigned long bytes, unsigned long * __restrict p1, { unsigned long lines = bytes >> 6; - kernel_fpu_begin(); - /* Make sure GCC forgets anything it knows about p4 or p5, such that it won't pass to the asm volatile below a register that is shared with any other variable. That's @@ -515,59 +480,36 @@ xor_p5_mmx_5(unsigned long bytes, unsigned long * __restrict p1, Clobber them just to be sure nobody does something stupid like assuming they have some legal value. 
*/ asm("" : "=r" (p4), "=r" (p5)); +} + +DO_XOR_BLOCKS(pII_mmx_inner, xor_pII_mmx_2, xor_pII_mmx_3, xor_pII_mmx_4, + xor_pII_mmx_5); +static void xor_gen_pII_mmx(void *dest, void **srcs, unsigned int src_cnt, + unsigned int bytes) +{ + kernel_fpu_begin(); + xor_gen_pII_mmx_inner(dest, srcs, src_cnt, bytes); kernel_fpu_end(); } -static struct xor_block_template xor_block_pII_mmx = { - .name = "pII_mmx", - .do_2 = xor_pII_mmx_2, - .do_3 = xor_pII_mmx_3, - .do_4 = xor_pII_mmx_4, - .do_5 = xor_pII_mmx_5, +struct xor_block_template xor_block_pII_mmx = { + .name = "pII_mmx", + .xor_gen = xor_gen_pII_mmx, }; -static struct xor_block_template xor_block_p5_mmx = { - .name = "p5_mmx", - .do_2 = xor_p5_mmx_2, - .do_3 = xor_p5_mmx_3, - .do_4 = xor_p5_mmx_4, - .do_5 = xor_p5_mmx_5, -}; +DO_XOR_BLOCKS(p5_mmx_inner, xor_p5_mmx_2, xor_p5_mmx_3, xor_p5_mmx_4, + xor_p5_mmx_5); -static struct xor_block_template xor_block_pIII_sse = { - .name = "pIII_sse", - .do_2 = xor_sse_2, - .do_3 = xor_sse_3, - .do_4 = xor_sse_4, - .do_5 = xor_sse_5, -}; +static void xor_gen_p5_mmx(void *dest, void **srcs, unsigned int src_cnt, + unsigned int bytes) +{ + kernel_fpu_begin(); + xor_gen_p5_mmx_inner(dest, srcs, src_cnt, bytes); + kernel_fpu_end(); +} -/* Also try the AVX routines */ -#include - -/* Also try the generic routines. */ -#include - -/* We force the use of the SSE xor block because it can write around L2. - We may also be able to load into the L1 only depending on how the cpu - deals with a load to a line that is being prefetched. 
*/ -#undef XOR_TRY_TEMPLATES -#define XOR_TRY_TEMPLATES \ -do { \ - AVX_XOR_SPEED; \ - if (boot_cpu_has(X86_FEATURE_XMM)) { \ - xor_speed(&xor_block_pIII_sse); \ - xor_speed(&xor_block_sse_pf64); \ - } else if (boot_cpu_has(X86_FEATURE_MMX)) { \ - xor_speed(&xor_block_pII_mmx); \ - xor_speed(&xor_block_p5_mmx); \ - } else { \ - xor_speed(&xor_block_8regs); \ - xor_speed(&xor_block_8regs_p); \ - xor_speed(&xor_block_32regs); \ - xor_speed(&xor_block_32regs_p); \ - } \ -} while (0) - -#endif /* _ASM_X86_XOR_32_H */ +struct xor_block_template xor_block_p5_mmx = { + .name = "p5_mmx", + .xor_gen = xor_gen_p5_mmx, +}; diff --git a/arch/x86/include/asm/xor.h b/lib/raid/xor/x86/xor-sse.c similarity index 90% rename from arch/x86/include/asm/xor.h rename to lib/raid/xor/x86/xor-sse.c index 7b0307acc4103c..c6626ecae6ba5d 100644 --- a/arch/x86/include/asm/xor.h +++ b/lib/raid/xor/x86/xor-sse.c @@ -1,31 +1,20 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -#ifndef _ASM_X86_XOR_H -#define _ASM_X86_XOR_H - -/* - * Optimized RAID-5 checksumming functions for SSE. - */ - +// SPDX-License-Identifier: GPL-2.0-or-later /* + * Optimized XOR parity functions for SSE. + * * Cache avoiding checksumming functions utilizing KNI instructions * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo) - */ - -/* + * * Based on * High-speed RAID5 checksumming functions utilizing SSE instructions. * Copyright (C) 1998 Ingo Molnar. - */ - -/* + * * x86-64 changes / gcc fixes from Andi Kleen. * Copyright 2002 Andi Kleen, SuSE Labs. - * - * This hasn't been optimized for the hammer yet, but there are likely - * no advantages to be gotten from x86-64 here anyways. 
*/ - #include +#include "xor_impl.h" +#include "xor_arch.h" #ifdef CONFIG_X86_32 /* reduce register pressure */ @@ -62,8 +51,6 @@ xor_sse_2(unsigned long bytes, unsigned long * __restrict p1, { unsigned long lines = bytes >> 8; - kernel_fpu_begin(); - asm volatile( #undef BLOCK #define BLOCK(i) \ @@ -104,8 +91,6 @@ xor_sse_2(unsigned long bytes, unsigned long * __restrict p1, [p1] "+r" (p1), [p2] "+r" (p2) : [inc] XOR_CONSTANT_CONSTRAINT (256UL) : "memory"); - - kernel_fpu_end(); } static void @@ -114,8 +99,6 @@ xor_sse_2_pf64(unsigned long bytes, unsigned long * __restrict p1, { unsigned long lines = bytes >> 8; - kernel_fpu_begin(); - asm volatile( #undef BLOCK #define BLOCK(i) \ @@ -139,8 +122,6 @@ xor_sse_2_pf64(unsigned long bytes, unsigned long * __restrict p1, [p1] "+r" (p1), [p2] "+r" (p2) : [inc] XOR_CONSTANT_CONSTRAINT (256UL) : "memory"); - - kernel_fpu_end(); } static void @@ -150,8 +131,6 @@ xor_sse_3(unsigned long bytes, unsigned long * __restrict p1, { unsigned long lines = bytes >> 8; - kernel_fpu_begin(); - asm volatile( #undef BLOCK #define BLOCK(i) \ @@ -199,8 +178,6 @@ xor_sse_3(unsigned long bytes, unsigned long * __restrict p1, [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3) : [inc] XOR_CONSTANT_CONSTRAINT (256UL) : "memory"); - - kernel_fpu_end(); } static void @@ -210,8 +187,6 @@ xor_sse_3_pf64(unsigned long bytes, unsigned long * __restrict p1, { unsigned long lines = bytes >> 8; - kernel_fpu_begin(); - asm volatile( #undef BLOCK #define BLOCK(i) \ @@ -237,8 +212,6 @@ xor_sse_3_pf64(unsigned long bytes, unsigned long * __restrict p1, [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3) : [inc] XOR_CONSTANT_CONSTRAINT (256UL) : "memory"); - - kernel_fpu_end(); } static void @@ -249,8 +222,6 @@ xor_sse_4(unsigned long bytes, unsigned long * __restrict p1, { unsigned long lines = bytes >> 8; - kernel_fpu_begin(); - asm volatile( #undef BLOCK #define BLOCK(i) \ @@ -305,8 +276,6 @@ xor_sse_4(unsigned long bytes, unsigned long * __restrict p1, [p2] "+r" 
(p2), [p3] "+r" (p3), [p4] "+r" (p4) : [inc] XOR_CONSTANT_CONSTRAINT (256UL) : "memory"); - - kernel_fpu_end(); } static void @@ -317,8 +286,6 @@ xor_sse_4_pf64(unsigned long bytes, unsigned long * __restrict p1, { unsigned long lines = bytes >> 8; - kernel_fpu_begin(); - asm volatile( #undef BLOCK #define BLOCK(i) \ @@ -346,8 +313,6 @@ xor_sse_4_pf64(unsigned long bytes, unsigned long * __restrict p1, [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4) : [inc] XOR_CONSTANT_CONSTRAINT (256UL) : "memory"); - - kernel_fpu_end(); } static void @@ -359,8 +324,6 @@ xor_sse_5(unsigned long bytes, unsigned long * __restrict p1, { unsigned long lines = bytes >> 8; - kernel_fpu_begin(); - asm volatile( #undef BLOCK #define BLOCK(i) \ @@ -422,8 +385,6 @@ xor_sse_5(unsigned long bytes, unsigned long * __restrict p1, [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5) : [inc] XOR_CONSTANT_CONSTRAINT (256UL) : "memory"); - - kernel_fpu_end(); } static void @@ -435,8 +396,6 @@ xor_sse_5_pf64(unsigned long bytes, unsigned long * __restrict p1, { unsigned long lines = bytes >> 8; - kernel_fpu_begin(); - asm volatile( #undef BLOCK #define BLOCK(i) \ @@ -466,37 +425,35 @@ xor_sse_5_pf64(unsigned long bytes, unsigned long * __restrict p1, [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5) : [inc] XOR_CONSTANT_CONSTRAINT (256UL) : "memory"); +} + +DO_XOR_BLOCKS(sse_inner, xor_sse_2, xor_sse_3, xor_sse_4, xor_sse_5); +static void xor_gen_sse(void *dest, void **srcs, unsigned int src_cnt, + unsigned int bytes) +{ + kernel_fpu_begin(); + xor_gen_sse_inner(dest, srcs, src_cnt, bytes); kernel_fpu_end(); } -static struct xor_block_template xor_block_sse_pf64 = { - .name = "prefetch64-sse", - .do_2 = xor_sse_2_pf64, - .do_3 = xor_sse_3_pf64, - .do_4 = xor_sse_4_pf64, - .do_5 = xor_sse_5_pf64, +struct xor_block_template xor_block_sse = { + .name = "sse", + .xor_gen = xor_gen_sse, }; -#undef LD -#undef XO1 -#undef XO2 -#undef XO3 -#undef XO4 -#undef ST -#undef NOP -#undef BLK64 -#undef BLOCK - -#undef 
XOR_CONSTANT_CONSTRAINT +DO_XOR_BLOCKS(sse_pf64_inner, xor_sse_2_pf64, xor_sse_3_pf64, xor_sse_4_pf64, + xor_sse_5_pf64); -#ifdef CONFIG_X86_32 -# include -#else -# include -#endif - -#define XOR_SELECT_TEMPLATE(FASTEST) \ - AVX_SELECT(FASTEST) +static void xor_gen_sse_pf64(void *dest, void **srcs, unsigned int src_cnt, + unsigned int bytes) +{ + kernel_fpu_begin(); + xor_gen_sse_pf64_inner(dest, srcs, src_cnt, bytes); + kernel_fpu_end(); +} -#endif /* _ASM_X86_XOR_H */ +struct xor_block_template xor_block_sse_pf64 = { + .name = "prefetch64-sse", + .xor_gen = xor_gen_sse_pf64, +}; diff --git a/lib/raid/xor/x86/xor_arch.h b/lib/raid/xor/x86/xor_arch.h new file mode 100644 index 00000000000000..99fe85a213c669 --- /dev/null +++ b/lib/raid/xor/x86/xor_arch.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#include + +extern struct xor_block_template xor_block_pII_mmx; +extern struct xor_block_template xor_block_p5_mmx; +extern struct xor_block_template xor_block_sse; +extern struct xor_block_template xor_block_sse_pf64; +extern struct xor_block_template xor_block_avx; + +/* + * When SSE is available, use it as it can write around L2. We may also be able + * to load into the L1 only depending on how the cpu deals with a load to a line + * that is being prefetched. + * + * When AVX is available, force using it as it is better by all measures. + * + * 32-bit without MMX can fall back to the generic routines.
+ */ +static __always_inline void __init arch_xor_init(void) +{ + if (boot_cpu_has(X86_FEATURE_AVX) && + boot_cpu_has(X86_FEATURE_OSXSAVE)) { + xor_force(&xor_block_avx); + } else if (IS_ENABLED(CONFIG_X86_64) || boot_cpu_has(X86_FEATURE_XMM)) { + xor_register(&xor_block_sse); + xor_register(&xor_block_sse_pf64); + } else if (boot_cpu_has(X86_FEATURE_MMX)) { + xor_register(&xor_block_pII_mmx); + xor_register(&xor_block_p5_mmx); + } else { + xor_register(&xor_block_8regs); + xor_register(&xor_block_8regs_p); + xor_register(&xor_block_32regs); + xor_register(&xor_block_32regs_p); + } +} diff --git a/lib/raid/xor/xor-32regs-prefetch.c b/lib/raid/xor/xor-32regs-prefetch.c new file mode 100644 index 00000000000000..ade2a7d8cbe2ae --- /dev/null +++ b/lib/raid/xor/xor-32regs-prefetch.c @@ -0,0 +1,267 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#include +#include "xor_impl.h" + +static void +xor_32regs_p_2(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2) +{ + long lines = bytes / (sizeof (long)) / 8 - 1; + + prefetchw(p1); + prefetch(p2); + + do { + register long d0, d1, d2, d3, d4, d5, d6, d7; + + prefetchw(p1+8); + prefetch(p2+8); + once_more: + d0 = p1[0]; /* Pull the stuff into registers */ + d1 = p1[1]; /* ... in bursts, if possible. 
*/ + d2 = p1[2]; + d3 = p1[3]; + d4 = p1[4]; + d5 = p1[5]; + d6 = p1[6]; + d7 = p1[7]; + d0 ^= p2[0]; + d1 ^= p2[1]; + d2 ^= p2[2]; + d3 ^= p2[3]; + d4 ^= p2[4]; + d5 ^= p2[5]; + d6 ^= p2[6]; + d7 ^= p2[7]; + p1[0] = d0; /* Store the result (in bursts) */ + p1[1] = d1; + p1[2] = d2; + p1[3] = d3; + p1[4] = d4; + p1[5] = d5; + p1[6] = d6; + p1[7] = d7; + p1 += 8; + p2 += 8; + } while (--lines > 0); + if (lines == 0) + goto once_more; +} + +static void +xor_32regs_p_3(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3) +{ + long lines = bytes / (sizeof (long)) / 8 - 1; + + prefetchw(p1); + prefetch(p2); + prefetch(p3); + + do { + register long d0, d1, d2, d3, d4, d5, d6, d7; + + prefetchw(p1+8); + prefetch(p2+8); + prefetch(p3+8); + once_more: + d0 = p1[0]; /* Pull the stuff into registers */ + d1 = p1[1]; /* ... in bursts, if possible. */ + d2 = p1[2]; + d3 = p1[3]; + d4 = p1[4]; + d5 = p1[5]; + d6 = p1[6]; + d7 = p1[7]; + d0 ^= p2[0]; + d1 ^= p2[1]; + d2 ^= p2[2]; + d3 ^= p2[3]; + d4 ^= p2[4]; + d5 ^= p2[5]; + d6 ^= p2[6]; + d7 ^= p2[7]; + d0 ^= p3[0]; + d1 ^= p3[1]; + d2 ^= p3[2]; + d3 ^= p3[3]; + d4 ^= p3[4]; + d5 ^= p3[5]; + d6 ^= p3[6]; + d7 ^= p3[7]; + p1[0] = d0; /* Store the result (in bursts) */ + p1[1] = d1; + p1[2] = d2; + p1[3] = d3; + p1[4] = d4; + p1[5] = d5; + p1[6] = d6; + p1[7] = d7; + p1 += 8; + p2 += 8; + p3 += 8; + } while (--lines > 0); + if (lines == 0) + goto once_more; +} + +static void +xor_32regs_p_4(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3, + const unsigned long * __restrict p4) +{ + long lines = bytes / (sizeof (long)) / 8 - 1; + + prefetchw(p1); + prefetch(p2); + prefetch(p3); + prefetch(p4); + + do { + register long d0, d1, d2, d3, d4, d5, d6, d7; + + prefetchw(p1+8); + prefetch(p2+8); + prefetch(p3+8); + prefetch(p4+8); + once_more: + d0 = p1[0]; /* Pull the stuff into 
registers */ + d1 = p1[1]; /* ... in bursts, if possible. */ + d2 = p1[2]; + d3 = p1[3]; + d4 = p1[4]; + d5 = p1[5]; + d6 = p1[6]; + d7 = p1[7]; + d0 ^= p2[0]; + d1 ^= p2[1]; + d2 ^= p2[2]; + d3 ^= p2[3]; + d4 ^= p2[4]; + d5 ^= p2[5]; + d6 ^= p2[6]; + d7 ^= p2[7]; + d0 ^= p3[0]; + d1 ^= p3[1]; + d2 ^= p3[2]; + d3 ^= p3[3]; + d4 ^= p3[4]; + d5 ^= p3[5]; + d6 ^= p3[6]; + d7 ^= p3[7]; + d0 ^= p4[0]; + d1 ^= p4[1]; + d2 ^= p4[2]; + d3 ^= p4[3]; + d4 ^= p4[4]; + d5 ^= p4[5]; + d6 ^= p4[6]; + d7 ^= p4[7]; + p1[0] = d0; /* Store the result (in bursts) */ + p1[1] = d1; + p1[2] = d2; + p1[3] = d3; + p1[4] = d4; + p1[5] = d5; + p1[6] = d6; + p1[7] = d7; + p1 += 8; + p2 += 8; + p3 += 8; + p4 += 8; + } while (--lines > 0); + if (lines == 0) + goto once_more; +} + +static void +xor_32regs_p_5(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3, + const unsigned long * __restrict p4, + const unsigned long * __restrict p5) +{ + long lines = bytes / (sizeof (long)) / 8 - 1; + + prefetchw(p1); + prefetch(p2); + prefetch(p3); + prefetch(p4); + prefetch(p5); + + do { + register long d0, d1, d2, d3, d4, d5, d6, d7; + + prefetchw(p1+8); + prefetch(p2+8); + prefetch(p3+8); + prefetch(p4+8); + prefetch(p5+8); + once_more: + d0 = p1[0]; /* Pull the stuff into registers */ + d1 = p1[1]; /* ... in bursts, if possible. 
*/ + d2 = p1[2]; + d3 = p1[3]; + d4 = p1[4]; + d5 = p1[5]; + d6 = p1[6]; + d7 = p1[7]; + d0 ^= p2[0]; + d1 ^= p2[1]; + d2 ^= p2[2]; + d3 ^= p2[3]; + d4 ^= p2[4]; + d5 ^= p2[5]; + d6 ^= p2[6]; + d7 ^= p2[7]; + d0 ^= p3[0]; + d1 ^= p3[1]; + d2 ^= p3[2]; + d3 ^= p3[3]; + d4 ^= p3[4]; + d5 ^= p3[5]; + d6 ^= p3[6]; + d7 ^= p3[7]; + d0 ^= p4[0]; + d1 ^= p4[1]; + d2 ^= p4[2]; + d3 ^= p4[3]; + d4 ^= p4[4]; + d5 ^= p4[5]; + d6 ^= p4[6]; + d7 ^= p4[7]; + d0 ^= p5[0]; + d1 ^= p5[1]; + d2 ^= p5[2]; + d3 ^= p5[3]; + d4 ^= p5[4]; + d5 ^= p5[5]; + d6 ^= p5[6]; + d7 ^= p5[7]; + p1[0] = d0; /* Store the result (in bursts) */ + p1[1] = d1; + p1[2] = d2; + p1[3] = d3; + p1[4] = d4; + p1[5] = d5; + p1[6] = d6; + p1[7] = d7; + p1 += 8; + p2 += 8; + p3 += 8; + p4 += 8; + p5 += 8; + } while (--lines > 0); + if (lines == 0) + goto once_more; +} + +DO_XOR_BLOCKS(32regs_p, xor_32regs_p_2, xor_32regs_p_3, xor_32regs_p_4, + xor_32regs_p_5); + +struct xor_block_template xor_block_32regs_p = { + .name = "32regs_prefetch", + .xor_gen = xor_gen_32regs_p, +}; diff --git a/lib/raid/xor/xor-32regs.c b/lib/raid/xor/xor-32regs.c new file mode 100644 index 00000000000000..acb4a10d1e95bd --- /dev/null +++ b/lib/raid/xor/xor-32regs.c @@ -0,0 +1,217 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#include "xor_impl.h" + +static void +xor_32regs_2(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2) +{ + long lines = bytes / (sizeof (long)) / 8; + + do { + register long d0, d1, d2, d3, d4, d5, d6, d7; + d0 = p1[0]; /* Pull the stuff into registers */ + d1 = p1[1]; /* ... in bursts, if possible. 
*/ + d2 = p1[2]; + d3 = p1[3]; + d4 = p1[4]; + d5 = p1[5]; + d6 = p1[6]; + d7 = p1[7]; + d0 ^= p2[0]; + d1 ^= p2[1]; + d2 ^= p2[2]; + d3 ^= p2[3]; + d4 ^= p2[4]; + d5 ^= p2[5]; + d6 ^= p2[6]; + d7 ^= p2[7]; + p1[0] = d0; /* Store the result (in bursts) */ + p1[1] = d1; + p1[2] = d2; + p1[3] = d3; + p1[4] = d4; + p1[5] = d5; + p1[6] = d6; + p1[7] = d7; + p1 += 8; + p2 += 8; + } while (--lines > 0); +} + +static void +xor_32regs_3(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3) +{ + long lines = bytes / (sizeof (long)) / 8; + + do { + register long d0, d1, d2, d3, d4, d5, d6, d7; + d0 = p1[0]; /* Pull the stuff into registers */ + d1 = p1[1]; /* ... in bursts, if possible. */ + d2 = p1[2]; + d3 = p1[3]; + d4 = p1[4]; + d5 = p1[5]; + d6 = p1[6]; + d7 = p1[7]; + d0 ^= p2[0]; + d1 ^= p2[1]; + d2 ^= p2[2]; + d3 ^= p2[3]; + d4 ^= p2[4]; + d5 ^= p2[5]; + d6 ^= p2[6]; + d7 ^= p2[7]; + d0 ^= p3[0]; + d1 ^= p3[1]; + d2 ^= p3[2]; + d3 ^= p3[3]; + d4 ^= p3[4]; + d5 ^= p3[5]; + d6 ^= p3[6]; + d7 ^= p3[7]; + p1[0] = d0; /* Store the result (in bursts) */ + p1[1] = d1; + p1[2] = d2; + p1[3] = d3; + p1[4] = d4; + p1[5] = d5; + p1[6] = d6; + p1[7] = d7; + p1 += 8; + p2 += 8; + p3 += 8; + } while (--lines > 0); +} + +static void +xor_32regs_4(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3, + const unsigned long * __restrict p4) +{ + long lines = bytes / (sizeof (long)) / 8; + + do { + register long d0, d1, d2, d3, d4, d5, d6, d7; + d0 = p1[0]; /* Pull the stuff into registers */ + d1 = p1[1]; /* ... in bursts, if possible. 
*/ + d2 = p1[2]; + d3 = p1[3]; + d4 = p1[4]; + d5 = p1[5]; + d6 = p1[6]; + d7 = p1[7]; + d0 ^= p2[0]; + d1 ^= p2[1]; + d2 ^= p2[2]; + d3 ^= p2[3]; + d4 ^= p2[4]; + d5 ^= p2[5]; + d6 ^= p2[6]; + d7 ^= p2[7]; + d0 ^= p3[0]; + d1 ^= p3[1]; + d2 ^= p3[2]; + d3 ^= p3[3]; + d4 ^= p3[4]; + d5 ^= p3[5]; + d6 ^= p3[6]; + d7 ^= p3[7]; + d0 ^= p4[0]; + d1 ^= p4[1]; + d2 ^= p4[2]; + d3 ^= p4[3]; + d4 ^= p4[4]; + d5 ^= p4[5]; + d6 ^= p4[6]; + d7 ^= p4[7]; + p1[0] = d0; /* Store the result (in bursts) */ + p1[1] = d1; + p1[2] = d2; + p1[3] = d3; + p1[4] = d4; + p1[5] = d5; + p1[6] = d6; + p1[7] = d7; + p1 += 8; + p2 += 8; + p3 += 8; + p4 += 8; + } while (--lines > 0); +} + +static void +xor_32regs_5(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3, + const unsigned long * __restrict p4, + const unsigned long * __restrict p5) +{ + long lines = bytes / (sizeof (long)) / 8; + + do { + register long d0, d1, d2, d3, d4, d5, d6, d7; + d0 = p1[0]; /* Pull the stuff into registers */ + d1 = p1[1]; /* ... in bursts, if possible. 
*/ + d2 = p1[2]; + d3 = p1[3]; + d4 = p1[4]; + d5 = p1[5]; + d6 = p1[6]; + d7 = p1[7]; + d0 ^= p2[0]; + d1 ^= p2[1]; + d2 ^= p2[2]; + d3 ^= p2[3]; + d4 ^= p2[4]; + d5 ^= p2[5]; + d6 ^= p2[6]; + d7 ^= p2[7]; + d0 ^= p3[0]; + d1 ^= p3[1]; + d2 ^= p3[2]; + d3 ^= p3[3]; + d4 ^= p3[4]; + d5 ^= p3[5]; + d6 ^= p3[6]; + d7 ^= p3[7]; + d0 ^= p4[0]; + d1 ^= p4[1]; + d2 ^= p4[2]; + d3 ^= p4[3]; + d4 ^= p4[4]; + d5 ^= p4[5]; + d6 ^= p4[6]; + d7 ^= p4[7]; + d0 ^= p5[0]; + d1 ^= p5[1]; + d2 ^= p5[2]; + d3 ^= p5[3]; + d4 ^= p5[4]; + d5 ^= p5[5]; + d6 ^= p5[6]; + d7 ^= p5[7]; + p1[0] = d0; /* Store the result (in bursts) */ + p1[1] = d1; + p1[2] = d2; + p1[3] = d3; + p1[4] = d4; + p1[5] = d5; + p1[6] = d6; + p1[7] = d7; + p1 += 8; + p2 += 8; + p3 += 8; + p4 += 8; + p5 += 8; + } while (--lines > 0); +} + +DO_XOR_BLOCKS(32regs, xor_32regs_2, xor_32regs_3, xor_32regs_4, xor_32regs_5); + +struct xor_block_template xor_block_32regs = { + .name = "32regs", + .xor_gen = xor_gen_32regs, +}; diff --git a/lib/raid/xor/xor-8regs-prefetch.c b/lib/raid/xor/xor-8regs-prefetch.c new file mode 100644 index 00000000000000..451527a951b1a2 --- /dev/null +++ b/lib/raid/xor/xor-8regs-prefetch.c @@ -0,0 +1,146 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#include +#include "xor_impl.h" + +static void +xor_8regs_p_2(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2) +{ + long lines = bytes / (sizeof (long)) / 8 - 1; + prefetchw(p1); + prefetch(p2); + + do { + prefetchw(p1+8); + prefetch(p2+8); + once_more: + p1[0] ^= p2[0]; + p1[1] ^= p2[1]; + p1[2] ^= p2[2]; + p1[3] ^= p2[3]; + p1[4] ^= p2[4]; + p1[5] ^= p2[5]; + p1[6] ^= p2[6]; + p1[7] ^= p2[7]; + p1 += 8; + p2 += 8; + } while (--lines > 0); + if (lines == 0) + goto once_more; +} + +static void +xor_8regs_p_3(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3) +{ + long lines = bytes / (sizeof (long)) / 8 - 1; + prefetchw(p1); + 
prefetch(p2); + prefetch(p3); + + do { + prefetchw(p1+8); + prefetch(p2+8); + prefetch(p3+8); + once_more: + p1[0] ^= p2[0] ^ p3[0]; + p1[1] ^= p2[1] ^ p3[1]; + p1[2] ^= p2[2] ^ p3[2]; + p1[3] ^= p2[3] ^ p3[3]; + p1[4] ^= p2[4] ^ p3[4]; + p1[5] ^= p2[5] ^ p3[5]; + p1[6] ^= p2[6] ^ p3[6]; + p1[7] ^= p2[7] ^ p3[7]; + p1 += 8; + p2 += 8; + p3 += 8; + } while (--lines > 0); + if (lines == 0) + goto once_more; +} + +static void +xor_8regs_p_4(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3, + const unsigned long * __restrict p4) +{ + long lines = bytes / (sizeof (long)) / 8 - 1; + + prefetchw(p1); + prefetch(p2); + prefetch(p3); + prefetch(p4); + + do { + prefetchw(p1+8); + prefetch(p2+8); + prefetch(p3+8); + prefetch(p4+8); + once_more: + p1[0] ^= p2[0] ^ p3[0] ^ p4[0]; + p1[1] ^= p2[1] ^ p3[1] ^ p4[1]; + p1[2] ^= p2[2] ^ p3[2] ^ p4[2]; + p1[3] ^= p2[3] ^ p3[3] ^ p4[3]; + p1[4] ^= p2[4] ^ p3[4] ^ p4[4]; + p1[5] ^= p2[5] ^ p3[5] ^ p4[5]; + p1[6] ^= p2[6] ^ p3[6] ^ p4[6]; + p1[7] ^= p2[7] ^ p3[7] ^ p4[7]; + p1 += 8; + p2 += 8; + p3 += 8; + p4 += 8; + } while (--lines > 0); + if (lines == 0) + goto once_more; +} + +static void +xor_8regs_p_5(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3, + const unsigned long * __restrict p4, + const unsigned long * __restrict p5) +{ + long lines = bytes / (sizeof (long)) / 8 - 1; + + prefetchw(p1); + prefetch(p2); + prefetch(p3); + prefetch(p4); + prefetch(p5); + + do { + prefetchw(p1+8); + prefetch(p2+8); + prefetch(p3+8); + prefetch(p4+8); + prefetch(p5+8); + once_more: + p1[0] ^= p2[0] ^ p3[0] ^ p4[0] ^ p5[0]; + p1[1] ^= p2[1] ^ p3[1] ^ p4[1] ^ p5[1]; + p1[2] ^= p2[2] ^ p3[2] ^ p4[2] ^ p5[2]; + p1[3] ^= p2[3] ^ p3[3] ^ p4[3] ^ p5[3]; + p1[4] ^= p2[4] ^ p3[4] ^ p4[4] ^ p5[4]; + p1[5] ^= p2[5] ^ p3[5] ^ p4[5] ^ p5[5]; + p1[6] ^= p2[6] ^ p3[6] ^ p4[6] ^ p5[6]; + p1[7] ^= p2[7] 
^ p3[7] ^ p4[7] ^ p5[7]; + p1 += 8; + p2 += 8; + p3 += 8; + p4 += 8; + p5 += 8; + } while (--lines > 0); + if (lines == 0) + goto once_more; +} + + +DO_XOR_BLOCKS(8regs_p, xor_8regs_p_2, xor_8regs_p_3, xor_8regs_p_4, + xor_8regs_p_5); + +struct xor_block_template xor_block_8regs_p = { + .name = "8regs_prefetch", + .xor_gen = xor_gen_8regs_p, +}; diff --git a/lib/raid/xor/xor-8regs.c b/lib/raid/xor/xor-8regs.c new file mode 100644 index 00000000000000..1edaed8acffe60 --- /dev/null +++ b/lib/raid/xor/xor-8regs.c @@ -0,0 +1,103 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#include "xor_impl.h" + +static void +xor_8regs_2(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2) +{ + long lines = bytes / (sizeof (long)) / 8; + + do { + p1[0] ^= p2[0]; + p1[1] ^= p2[1]; + p1[2] ^= p2[2]; + p1[3] ^= p2[3]; + p1[4] ^= p2[4]; + p1[5] ^= p2[5]; + p1[6] ^= p2[6]; + p1[7] ^= p2[7]; + p1 += 8; + p2 += 8; + } while (--lines > 0); +} + +static void +xor_8regs_3(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3) +{ + long lines = bytes / (sizeof (long)) / 8; + + do { + p1[0] ^= p2[0] ^ p3[0]; + p1[1] ^= p2[1] ^ p3[1]; + p1[2] ^= p2[2] ^ p3[2]; + p1[3] ^= p2[3] ^ p3[3]; + p1[4] ^= p2[4] ^ p3[4]; + p1[5] ^= p2[5] ^ p3[5]; + p1[6] ^= p2[6] ^ p3[6]; + p1[7] ^= p2[7] ^ p3[7]; + p1 += 8; + p2 += 8; + p3 += 8; + } while (--lines > 0); +} + +static void +xor_8regs_4(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3, + const unsigned long * __restrict p4) +{ + long lines = bytes / (sizeof (long)) / 8; + + do { + p1[0] ^= p2[0] ^ p3[0] ^ p4[0]; + p1[1] ^= p2[1] ^ p3[1] ^ p4[1]; + p1[2] ^= p2[2] ^ p3[2] ^ p4[2]; + p1[3] ^= p2[3] ^ p3[3] ^ p4[3]; + p1[4] ^= p2[4] ^ p3[4] ^ p4[4]; + p1[5] ^= p2[5] ^ p3[5] ^ p4[5]; + p1[6] ^= p2[6] ^ p3[6] ^ p4[6]; + p1[7] ^= p2[7] ^ p3[7] ^ p4[7]; + p1 += 8; + p2 += 
8; + p3 += 8; + p4 += 8; + } while (--lines > 0); +} + +static void +xor_8regs_5(unsigned long bytes, unsigned long * __restrict p1, + const unsigned long * __restrict p2, + const unsigned long * __restrict p3, + const unsigned long * __restrict p4, + const unsigned long * __restrict p5) +{ + long lines = bytes / (sizeof (long)) / 8; + + do { + p1[0] ^= p2[0] ^ p3[0] ^ p4[0] ^ p5[0]; + p1[1] ^= p2[1] ^ p3[1] ^ p4[1] ^ p5[1]; + p1[2] ^= p2[2] ^ p3[2] ^ p4[2] ^ p5[2]; + p1[3] ^= p2[3] ^ p3[3] ^ p4[3] ^ p5[3]; + p1[4] ^= p2[4] ^ p3[4] ^ p4[4] ^ p5[4]; + p1[5] ^= p2[5] ^ p3[5] ^ p4[5] ^ p5[5]; + p1[6] ^= p2[6] ^ p3[6] ^ p4[6] ^ p5[6]; + p1[7] ^= p2[7] ^ p3[7] ^ p4[7] ^ p5[7]; + p1 += 8; + p2 += 8; + p3 += 8; + p4 += 8; + p5 += 8; + } while (--lines > 0); +} + +#ifndef NO_TEMPLATE +DO_XOR_BLOCKS(8regs, xor_8regs_2, xor_8regs_3, xor_8regs_4, xor_8regs_5); + +struct xor_block_template xor_block_8regs = { + .name = "8regs", + .xor_gen = xor_gen_8regs, +}; +#endif /* NO_TEMPLATE */ diff --git a/lib/raid/xor/xor-core.c b/lib/raid/xor/xor-core.c new file mode 100644 index 00000000000000..67dc8ade7f0bed --- /dev/null +++ b/lib/raid/xor/xor-core.c @@ -0,0 +1,192 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 1996, 1997, 1998, 1999, 2000, + * Ingo Molnar, Matti Aarnio, Jakub Jelinek, Richard Henderson. + * + * Dispatch optimized XOR parity functions. + */ + +#include +#include +#include +#include +#include +#include +#include "xor_impl.h" + +DEFINE_STATIC_CALL_NULL(xor_gen_impl, *xor_block_8regs.xor_gen); + +/** + * xor_gen - generate RAID-style XOR information + * @dest: destination vector + * @srcs: source vectors + * @src_cnt: number of source vectors + * @bytes: length in bytes of each vector + * + * Performs bit-wise XOR operation into @dest for each of the @src_cnt vectors + * in @srcs for a length of @bytes bytes. @src_cnt must be non-zero, and the + * memory pointed to by @dest and each member of @srcs must be at least 64-byte + * aligned.
@bytes must be non-zero and a multiple of 512. + * + * Note: for typical RAID uses, @dest either needs to be zeroed, or filled with + * the first disk, which then needs to be removed from @srcs. + */ +void xor_gen(void *dest, void **srcs, unsigned int src_cnt, unsigned int bytes) +{ + lockdep_assert_preemption_enabled(); + WARN_ON_ONCE(bytes & 511); + + static_call(xor_gen_impl)(dest, srcs, src_cnt, bytes); +} +EXPORT_SYMBOL(xor_gen); + +/* Set of all registered templates. */ +static struct xor_block_template *__initdata template_list; +static struct xor_block_template *forced_template; + +/** + * xor_register - register a XOR template + * @tmpl: template to register + * + * Register a XOR implementation with the core. Registered implementations + * will be measured by a trivial benchmark, and the fastest one is chosen + * unless an implementation is forced using xor_force(). + */ +void __init xor_register(struct xor_block_template *tmpl) +{ + tmpl->next = template_list; + template_list = tmpl; +} + +/** + * xor_force - force use of a XOR template + * @tmpl: template to force + * + * Register a XOR implementation with the core and force using it. Forcing + * an implementation will make the core ignore any template registered using + * xor_register(), or any previous implementation forced using xor_force().
+ */ +void __init xor_force(struct xor_block_template *tmpl) +{ + forced_template = tmpl; +} + +#define BENCH_SIZE 4096 +#define REPS 800U + +static void __init +do_xor_speed(struct xor_block_template *tmpl, void *b1, void *b2) +{ + int speed; + unsigned long reps; + ktime_t min, start, t0; + void *srcs[1] = { b2 }; + + preempt_disable(); + + reps = 0; + t0 = ktime_get(); + /* delay start until time has advanced */ + while ((start = ktime_get()) == t0) + cpu_relax(); + do { + mb(); /* prevent loop optimization */ + tmpl->xor_gen(b1, srcs, 1, BENCH_SIZE); + mb(); + } while (reps++ < REPS || (t0 = ktime_get()) == start); + min = ktime_sub(t0, start); + + preempt_enable(); + + // bytes/ns == GB/s, multiply by 1000 to get MB/s [not MiB/s] + speed = (1000 * reps * BENCH_SIZE) / (unsigned int)ktime_to_ns(min); + tmpl->speed = speed; + + pr_info(" %-16s: %5d MB/sec\n", tmpl->name, speed); +} + +static int __init calibrate_xor_blocks(void) +{ + void *b1, *b2; + struct xor_block_template *f, *fastest; + + if (forced_template) + return 0; + + b1 = (void *) __get_free_pages(GFP_KERNEL, 2); + if (!b1) { + pr_warn("xor: Yikes! 
No memory available.\n"); + return -ENOMEM; + } + b2 = b1 + 2*PAGE_SIZE + BENCH_SIZE; + + pr_info("xor: measuring software checksum speed\n"); + fastest = template_list; + for (f = template_list; f; f = f->next) { + do_xor_speed(f, b1, b2); + if (f->speed > fastest->speed) + fastest = f; + } + static_call_update(xor_gen_impl, fastest->xor_gen); + pr_info("xor: using function: %s (%d MB/sec)\n", + fastest->name, fastest->speed); + + free_pages((unsigned long)b1, 2); + return 0; +} + +#ifdef CONFIG_XOR_BLOCKS_ARCH +#include "xor_arch.h" /* $SRCARCH/xor_arch.h */ +#else +static void __init arch_xor_init(void) +{ + xor_register(&xor_block_8regs); + xor_register(&xor_block_8regs_p); + xor_register(&xor_block_32regs); + xor_register(&xor_block_32regs_p); +} +#endif /* CONFIG_XOR_BLOCKS_ARCH */ + +static int __init xor_init(void) +{ + arch_xor_init(); + + /* + * If this arch/cpu has a short-circuited selection, don't loop through + * all the possible functions, just use the best one. + */ + if (forced_template) { + pr_info("xor: automatically using best checksumming function %-10s\n", + forced_template->name); + static_call_update(xor_gen_impl, forced_template->xor_gen); + return 0; + } + +#ifdef MODULE + return calibrate_xor_blocks(); +#else + /* + * Pick the first template as the temporary default until calibration + * happens. + */ + static_call_update(xor_gen_impl, template_list->xor_gen); + return 0; +#endif +} + +static __exit void xor_exit(void) +{ +} + +MODULE_DESCRIPTION("RAID-5 checksumming functions"); +MODULE_LICENSE("GPL"); + +/* + * When built-in we must register the default template before md, but we don't + * want calibration to run that early as that would delay the boot process. 
+ */ +#ifndef MODULE +__initcall(calibrate_xor_blocks); +#endif +core_initcall(xor_init); +module_exit(xor_exit); diff --git a/lib/raid/xor/xor_impl.h b/lib/raid/xor/xor_impl.h new file mode 100644 index 00000000000000..09ae2916f71ecb --- /dev/null +++ b/lib/raid/xor/xor_impl.h @@ -0,0 +1,56 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _XOR_IMPL_H +#define _XOR_IMPL_H + +#include +#include + +struct xor_block_template { + struct xor_block_template *next; + const char *name; + int speed; + void (*xor_gen)(void *dest, void **srcs, unsigned int src_cnt, + unsigned int bytes); +}; + +#define __DO_XOR_BLOCKS(_name, _handle1, _handle2, _handle3, _handle4) \ +void \ +xor_gen_##_name(void *dest, void **srcs, unsigned int src_cnt, \ + unsigned int bytes) \ +{ \ + unsigned int src_off = 0; \ + \ + while (src_cnt > 0) { \ + unsigned int this_cnt = min(src_cnt, 4); \ + \ + if (this_cnt == 1) \ + _handle1(bytes, dest, srcs[src_off]); \ + else if (this_cnt == 2) \ + _handle2(bytes, dest, srcs[src_off], \ + srcs[src_off + 1]); \ + else if (this_cnt == 3) \ + _handle3(bytes, dest, srcs[src_off], \ + srcs[src_off + 1], srcs[src_off + 2]); \ + else \ + _handle4(bytes, dest, srcs[src_off], \ + srcs[src_off + 1], srcs[src_off + 2], \ + srcs[src_off + 3]); \ + \ + src_cnt -= this_cnt; \ + src_off += this_cnt; \ + } \ +} + +#define DO_XOR_BLOCKS(_name, _handle1, _handle2, _handle3, _handle4) \ + static __DO_XOR_BLOCKS(_name, _handle1, _handle2, _handle3, _handle4) + +/* generic implementations */ +extern struct xor_block_template xor_block_8regs; +extern struct xor_block_template xor_block_32regs; +extern struct xor_block_template xor_block_8regs_p; +extern struct xor_block_template xor_block_32regs_p; + +void __init xor_register(struct xor_block_template *tmpl); +void __init xor_force(struct xor_block_template *tmpl); + +#endif /* _XOR_IMPL_H */