diff --git a/.github/workflows/checks.yaml b/.github/workflows/checks.yaml index 0ddeac8f..4feb029a 100644 --- a/.github/workflows/checks.yaml +++ b/.github/workflows/checks.yaml @@ -31,6 +31,10 @@ jobs: # powerpc64le-unknown-linux-gnu CARGO_TARGET_POWERPC64LE_UNKNOWN_LINUX_GNU_LINKER: powerpc64le-linux-gnu-gcc CARGO_TARGET_POWERPC64LE_UNKNOWN_LINUX_GNU_RUNNER: qemu-ppc64le -L /usr/powerpc64le-linux-gnu + # riscv64gc-unknown-linux-gnu + CARGO_TARGET_RISCV64GC_UNKNOWN_LINUX_GNU_LINKER: riscv64-linux-gnu-gcc + CARGO_TARGET_RISCV64GC_UNKNOWN_LINUX_GNU_RUNNER: qemu-riscv64 -L /usr/riscv64-linux-gnu -cpu rv64,zbkc=true + CARGO_TARGET_RISCV64GC_UNKNOWN_LINUX_GNU_RUSTFLAGS: "-Ctarget-feature=+zbkc" # wasm32-wasip1 (std for wasip2 is unstable) WASI_SDK_PATH: /tmp/wasi-sdk-24.0-x86_64-linux CC_wasm32_wasip1: /tmp/wasi-sdk-24.0-x86_64-linux/bin/clang @@ -82,6 +86,11 @@ jobs: codecov: false packages: gcc-powerpc64le-linux-gnu g++-powerpc64le-linux-gnu qemu-user qemu-user-static + - target: "riscv64gc-unknown-linux-gnu" + os: ubuntu-latest + codecov: false + packages: gcc-riscv64-linux-gnu g++-riscv64-linux-gnu qemu-user qemu-user-static + - target: "wasm32-wasip1" os: ubuntu-latest flags: "-p zlib-rs -p libz-rs-sys -p test-libz-rs-sys" diff --git a/zlib-rs/src/cpu_features.rs b/zlib-rs/src/cpu_features.rs index f442d49c..8eca4e9d 100644 --- a/zlib-rs/src/cpu_features.rs +++ b/zlib-rs/src/cpu_features.rs @@ -108,3 +108,13 @@ pub fn is_enabled_simd128() -> bool { false } + +#[inline(always)] +pub fn is_enabled_zbkc() -> bool { + // FIXME: std::arch::is_riscv_feature_detected! is stable since Rust 1.78. Switch to runtime + // feature detection once MSRV is bumped. Until then, zbkc support is compile-time only. 
+ #[cfg(target_arch = "riscv64")] + return cfg!(target_feature = "zbkc"); + + false +} diff --git a/zlib-rs/src/crc32.rs b/zlib-rs/src/crc32.rs index cf4acfc7..b570a56d 100644 --- a/zlib-rs/src/crc32.rs +++ b/zlib-rs/src/crc32.rs @@ -13,6 +13,8 @@ mod pclmulqdq; #[cfg(target_arch = "x86_64")] #[cfg(feature = "vpclmulqdq")] mod vpclmulqdq; +#[cfg(target_arch = "riscv64")] +mod zbkc; pub use combine::{crc32_combine, crc32_combine_gen, crc32_combine_op}; @@ -83,6 +85,12 @@ impl Crc32Fold { return; } + #[cfg(target_arch = "riscv64")] + if crate::cpu_features::is_enabled_zbkc() { + self.value = unsafe { self::zbkc::crc32_zbkc_riscv64(self.value, src) }; + return; + } + #[cfg(target_arch = "loongarch64")] { self.value = self::loongarch::crc32_loongarch64(self.value, src); diff --git a/zlib-rs/src/crc32/zbkc.rs b/zlib-rs/src/crc32/zbkc.rs new file mode 100644 index 00000000..9f4948f9 --- /dev/null +++ b/zlib-rs/src/crc32/zbkc.rs @@ -0,0 +1,108 @@ +//! crc32 implementation using the riscv64 zbkc ISA extension. Derived from +//! zlib-ng's implementation, see +//! https://github.com/zlib-ng/zlib-ng/blob/da22434b657578c41af1bdf06b27304e4aceb00f/arch/riscv/crc32_zbc.c +//! +//! # Safety +//! +//! The functions in this module must only be executed on a riscv64 system with +//! the zbkc feature. 
+ +use crate::crc32::zbkc::asm::{clmul, clmulh}; + +use super::crc32_braid; + +// Inputs shorter than CLMUL_MIN_LEN bytes are handed to crc32_braid in full; longer inputs are +// split so that the clmul path sees a whole number of CLMUL_CHUNK_LEN-byte chunks. +const CLMUL_MIN_LEN: usize = 16; +const CLMUL_CHUNK_LEN: usize = 16; + +// Multipliers and mask used by the fold and reduction steps in crc32_zbkc_riscv64_impl. +const CONSTANT_R3: u64 = 0x1751997D0; +const CONSTANT_R4: u64 = 0x0CCAA009E; +const CONSTANT_R5: u64 = 0x163CD6124; +const MASK32: u64 = 0xFFFFFFFF; +const CRCPOLY_TRUE_LE_FULL: u64 = 0x1DB710641; +const CONSTANT_RU: u64 = 0x1F7011641; + +/// Returns `crc` updated with the bytes of `buf`. +/// +/// Inputs shorter than `CLMUL_MIN_LEN` bytes are processed entirely by `crc32_braid`. For longer +/// inputs the leading `buf.len() % CLMUL_CHUNK_LEN` bytes go through `crc32_braid` and the rest, +/// a whole number of 16-byte chunks, goes through the carry-less multiplication path. +/// +/// # Safety +/// +/// Must only be called on a riscv64 system that supports the zbkc extension. +pub unsafe fn crc32_zbkc_riscv64(mut crc: u32, buf: &[u8]) -> u32 { + if buf.len() < CLMUL_MIN_LEN { + return crc32_braid(crc, buf); + } + + let unaligned_len = buf.len() % CLMUL_CHUNK_LEN; + if unaligned_len > 0 { + crc = crc32_braid(crc, &buf[..unaligned_len]); + } + + // What remains is a non-empty multiple of CLMUL_CHUNK_LEN bytes. The crc is bit-inverted on + // the way into and out of the clmul implementation. + // SAFETY: the caller guarantees that the zbkc extension is available. + !unsafe { crc32_zbkc_riscv64_impl(!crc, &buf[unaligned_len..]) } +} + +/// Folds `buf` 16 bytes at a time with `clmul`/`clmulh` and reduces the result to 32 bits. +/// +/// `crc` is xor-ed into the first 8 bytes of the input. Bytes after the last whole 16-byte +/// chunk are not read. +/// +/// # Panics +/// +/// Panics if `buf` is shorter than 16 bytes. +/// +/// # Safety +/// +/// Must only be called on a riscv64 system that supports the zbkc extension. +unsafe fn crc32_zbkc_riscv64_impl(crc: u32, buf: &[u8]) -> u32 { + // SAFETY (every clmul/clmulh call in this function): the caller guarantees that the zbkc + // extension is available. + // + // The slicing cannot panic because crc32_zbkc_riscv64 guarantees at least 16 bytes of + // input; the unwraps cannot fail because each slice is exactly 8 bytes long. + let mut low = u64::from_le_bytes(buf[..8].try_into().unwrap()) ^ crc as u64; + let mut high = u64::from_le_bytes(buf[8..16].try_into().unwrap()); + + // The first 16 bytes are already in low/high; fold in each following 16-byte chunk. + buf.chunks_exact(16).skip(1).for_each(|chunk| { + let t2 = unsafe { clmul(CONSTANT_R4, high) }; + let t3 = unsafe { clmulh(CONSTANT_R4, high) }; + let t0_new = unsafe { clmul(CONSTANT_R3, low) }; + let t1_new = unsafe { clmulh(CONSTANT_R3, low) }; + low = t0_new ^ t2; + high = t1_new ^ t3; + low ^= u64::from_le_bytes(chunk[..8].try_into().unwrap()); + high ^= u64::from_le_bytes(chunk[8..].try_into().unwrap()); + }); + + // Fold the 128-bit result into 64 bits + let fold_t3 = unsafe { clmulh(low, CONSTANT_R4) }; + let fold_t2 = unsafe { clmul(low, CONSTANT_R4) }; + low = high ^ fold_t2; + high = fold_t3; + + // Combine the low and high parts and perform polynomial reduction + let combined = (low >> 32) | ((high & MASK32) << 32); + let reduced_low = unsafe { clmul(low & MASK32, CONSTANT_R5) } ^ combined; + + // Barrett reduction step + let mut barrett = unsafe { clmul(reduced_low & MASK32, CONSTANT_RU) & MASK32 }; + barrett = unsafe { 
+ clmul(barrett, CRCPOLY_TRUE_LE_FULL) }; + let ret = barrett ^ reduced_low; + + (ret >> 32) as u32 +} + +/// Inline assembly for required instructions, since the stdarch intrinsics are +/// currently unstable. +mod asm { + // Returns the lower half of carryless multiplication of rs1 and rs2. + // See https://riscv.github.io/riscv-isa-manual/snapshot/spec/#insns-clmul + // + // Safety: must only be executed on a system with the zbkc feature. + #[target_feature(enable = "zbkc")] + pub unsafe fn clmul(rs1: u64, rs2: u64) -> u64 { + let rd; + unsafe { + core::arch::asm!( + "clmul {rd}, {rs1}, {rs2}", + rs1 = in(reg) rs1, + rs2 = in(reg) rs2, + rd = out(reg) rd, + options(pure, nomem, nostack) + ); + } + rd + } + + // Returns the upper half of carryless multiplication of rs1 and rs2. + // See https://riscv.github.io/riscv-isa-manual/snapshot/spec/#insns-clmulh + // + // Safety: must only be executed on a system with the zbkc feature. + #[target_feature(enable = "zbkc")] + pub unsafe fn clmulh(rs1: u64, rs2: u64) -> u64 { + let rd; + unsafe { + core::arch::asm!( + "clmulh {rd}, {rs1}, {rs2}", + rs1 = in(reg) rs1, + rs2 = in(reg) rs2, + rd = out(reg) rd, + options(pure, nomem, nostack) + ); + } + rd + } +}