Skip to content
34 changes: 34 additions & 0 deletions compiler/rustc_codegen_llvm/src/intrinsic.rs
Original file line number Diff line number Diff line change
Expand Up @@ -387,6 +387,27 @@ impl<'ll, 'tcx> IntrinsicCallBuilderMethods<'tcx> for Builder<'_, 'll, 'tcx> {
let pair = self.insert_value(pair, high, 1);
pair
}

// FIXME move into the branch below when LLVM 22 is the lowest version we support.
sym::carryless_mul if crate::llvm_util::get_version() >= (22, 0, 0) => {
let ty = args[0].layout.ty;
if !ty.is_integral() {
tcx.dcx().emit_err(InvalidMonomorphization::BasicIntegerType {
span,
name,
ty,
});
return Ok(());
}
let (size, _) = ty.int_size_and_signed(self.tcx);
let width = size.bits();
let llty = self.type_ix(width);

let lhs = args[0].immediate();
let rhs = args[1].immediate();
self.call_intrinsic("llvm.clmul", &[llty], &[lhs, rhs])
}

sym::ctlz
| sym::ctlz_nonzero
| sym::cttz
Expand Down Expand Up @@ -2763,6 +2784,7 @@ fn generic_simd_intrinsic<'ll, 'tcx>(
| sym::simd_ctlz
| sym::simd_ctpop
| sym::simd_cttz
| sym::simd_carryless_mul
| sym::simd_funnel_shl
| sym::simd_funnel_shr
) {
Expand All @@ -2787,6 +2809,7 @@ fn generic_simd_intrinsic<'ll, 'tcx>(
sym::simd_cttz => "llvm.cttz",
sym::simd_funnel_shl => "llvm.fshl",
sym::simd_funnel_shr => "llvm.fshr",
sym::simd_carryless_mul => "llvm.clmul",
_ => unreachable!(),
};
let int_size = in_elem.int_size_and_signed(bx.tcx()).0.bits();
Expand All @@ -2812,6 +2835,17 @@ fn generic_simd_intrinsic<'ll, 'tcx>(
&[vec_ty],
&[args[0].immediate(), args[1].immediate(), args[2].immediate()],
)),
sym::simd_carryless_mul => {
if crate::llvm_util::get_version() >= (22, 0, 0) {
Ok(bx.call_intrinsic(
llvm_intrinsic,
&[vec_ty],
&[args[0].immediate(), args[1].immediate()],
))
} else {
span_bug!(span, "`simd_carryless_mul` needs LLVM 22 or higher");
}
}
_ => unreachable!(),
};
}
Expand Down
9 changes: 8 additions & 1 deletion compiler/rustc_codegen_llvm/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -345,7 +345,14 @@ impl CodegenBackend for LlvmCodegenBackend {
}

fn replaced_intrinsics(&self) -> Vec<Symbol> {
vec![sym::unchecked_funnel_shl, sym::unchecked_funnel_shr, sym::carrying_mul_add]
let mut will_not_use_fallback =
vec![sym::unchecked_funnel_shl, sym::unchecked_funnel_shr, sym::carrying_mul_add];

if llvm_util::get_version() >= (22, 0, 0) {
will_not_use_fallback.push(sym::carryless_mul);
}

will_not_use_fallback
}

fn codegen_crate<'tcx>(&self, tcx: TyCtxt<'tcx>) -> Box<dyn Any> {
Expand Down
27 changes: 27 additions & 0 deletions compiler/rustc_const_eval/src/interpret/intrinsics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -733,6 +733,33 @@ impl<'tcx, M: Machine<'tcx>> InterpCx<'tcx, M> {
sym::fmuladdf128 => {
self.float_muladd_intrinsic::<Quad>(args, dest, MulAddType::Nondeterministic)?
}
sym::carryless_mul => {
let size = dest.layout.size;

let left = self.read_scalar(&args[0])?.to_bits(size)?;
let right = self.read_scalar(&args[1])?.to_bits(size)?;

// Perform carry-less multiplication.
//
// This operation is like long multiplication, but it ignores the carries.
// That idea corresponds to the XOR operator, which is used in the implementation.
//
// Wikipedia has a worked example: https://en.wikipedia.org/wiki/Carry-less_product#Example
Comment on lines +742 to +747
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please use proper spelling and capitalization.

let mut result: u128 = 0;

for i in 0..size.bits() {
// if the i-th bit in right is set
if (right >> i) & 1 != 0 {
// xor result with `left` shifted to the left by i positions
result ^= left << i;
}
}
Comment on lines +750 to +756
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please explain why it is okay to run this algorithm on u128 no matter the actual type. In particular, couldn't the underlying type be signed...?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

there is no signed carryless-mul, if it exists on a signed type, it always returns the same results as the corresponding unsigned version (the definition using shifting assumes unsigned types where that matters).

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That seems worth an assertion in the code.


// Only return the lower bits.
result &= u128::MAX >> (128 - size.bits());
Comment on lines +758 to +759
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please use https://doc.rust-lang.org/nightly/nightly-rustc/rustc_middle/ty/struct.ScalarInt.html#method.truncate_from_uint to get a ScalarInt with implicit truncation which you can then turn into a Scalar.


self.write_scalar(Scalar::from_uint(result, dest.layout.size), dest)?;
}

// Unsupported intrinsic: skip the return_to_block below.
_ => return interp_ok(false),
Expand Down
5 changes: 4 additions & 1 deletion compiler/rustc_hir_analysis/src/check/intrinsic.rs
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ fn intrinsic_operation_unsafety(tcx: TyCtxt<'_>, intrinsic_id: LocalDefId) -> hi
| sym::bswap
| sym::caller_location
| sym::carrying_mul_add
| sym::carryless_mul
| sym::ceilf16
| sym::ceilf32
| sym::ceilf64
Expand Down Expand Up @@ -564,6 +565,7 @@ pub(crate) fn check_intrinsic_type(
(1, 0, vec![param(0), param(0)], param(0))
}
sym::saturating_add | sym::saturating_sub => (1, 0, vec![param(0), param(0)], param(0)),
sym::carryless_mul => (1, 0, vec![param(0), param(0)], param(0)),
sym::fadd_fast | sym::fsub_fast | sym::fmul_fast | sym::fdiv_fast | sym::frem_fast => {
(1, 0, vec![param(0), param(0)], param(0))
}
Expand Down Expand Up @@ -711,7 +713,8 @@ pub(crate) fn check_intrinsic_type(
| sym::simd_fmin
| sym::simd_fmax
| sym::simd_saturating_add
| sym::simd_saturating_sub => (1, 0, vec![param(0), param(0)], param(0)),
| sym::simd_saturating_sub
| sym::simd_carryless_mul => (1, 0, vec![param(0), param(0)], param(0)),
sym::simd_arith_offset => (2, 0, vec![param(0), param(1)], param(0)),
sym::simd_neg
| sym::simd_bswap
Expand Down
2 changes: 2 additions & 0 deletions compiler/rustc_span/src/symbol.rs
Original file line number Diff line number Diff line change
Expand Up @@ -642,6 +642,7 @@ symbols! {
caller_location,
capture_disjoint_fields,
carrying_mul_add,
carryless_mul,
catch_unwind,
cause,
cdylib,
Expand Down Expand Up @@ -2083,6 +2084,7 @@ symbols! {
simd_bitmask,
simd_bitreverse,
simd_bswap,
simd_carryless_mul,
simd_cast,
simd_cast_ptr,
simd_ceil,
Expand Down
98 changes: 98 additions & 0 deletions library/core/src/intrinsics/fallback.rs
Original file line number Diff line number Diff line change
Expand Up @@ -218,3 +218,101 @@ macro_rules! impl_funnel_shifts {
// Instantiate the funnel-shift fallbacks for every unsigned integer primitive.
impl_funnel_shifts! { u8, u16, u32, u64, u128, usize }

/// Fallback support trait for [`super::carryless_mul`].
///
/// Implemented for the unsigned integer primitives; each impl provides a
/// software carry-less multiply for backends (or backend versions) that
/// do not lower the intrinsic natively.
#[rustc_const_unstable(feature = "core_intrinsics_fallbacks", issue = "none")]
pub const trait CarrylessMul: Copy + 'static {
    /// See [`super::carryless_mul`]; we just need the trait indirection to handle
    /// different types since calling intrinsics with generics doesn't work.
    fn carryless_mul(self, rhs: Self) -> Self;
}

macro_rules! impl_carryless_mul{
($($type:ident),*) => {$(
/// This approach uses a bitmask of the form `0b100010001...0001` to avoid carry spilling.
/// When carries do occur, they wind up in a "hole" of zeros and are subsequently masked
/// out of the result.
Comment on lines +231 to +233
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This approach with 4-bit digits works up to integers with 4 * 15 = 60 bits. Past that, one digit can overflow to the next.

For u64, it does actually work for this "non-widening" operation, since the top digit may be computed as 16, but there is no next digit that would be affected. The wide result would be erroneous however. E.g. x.carryless_mul(x) with x = MASK as u64 as u128.

The impl for u128::carryless_mul is currently incorrect for that reason. You could probably extend the approach to use 5-bit digits, but it's likely better to just implement it in terms of u64::carryless_mul.

some tests against a naive impl: playground

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we add a const { assert!(std::mem::size_of::<Self>() <= 8); } here or so to make it less likely we accidentally merge this incorrectly in the future?

#[rustc_const_unstable(feature = "core_intrinsics_fallbacks", issue = "none")]
impl const CarrylessMul for $type {
#[inline]
fn carryless_mul(self, rhs: Self) -> Self {
use crate::num::Wrapping;

// i.e. 0b100010001...0001 in binary.
const MASK: u64 = 0x1111_1111_1111_1111u64;

const M0: $type = MASK as $type;
const M1: $type = M0 << 1;
const M2: $type = M1 << 1;
const M3: $type = M2 << 1;

let x = self;
let y = rhs;

let x0 = Wrapping(x & M0);
let x1 = Wrapping(x & M1);
let x2 = Wrapping(x & M2);
let x3 = Wrapping(x & M3);

let y0 = Wrapping(y & M0);
let y1 = Wrapping(y & M1);
let y2 = Wrapping(y & M2);
let y3 = Wrapping(y & M3);

let z0 = (x0 * y0) ^ (x1 * y3) ^ (x2 * y2) ^ (x3 * y1);
let z1 = (x0 * y1) ^ (x1 * y0) ^ (x2 * y3) ^ (x3 * y2);
let z2 = (x0 * y2) ^ (x1 * y1) ^ (x2 * y0) ^ (x3 * y3);
let z3 = (x0 * y3) ^ (x1 * y2) ^ (x2 * y1) ^ (x3 * y0);

(z0.0 & M0) | (z1.0 & M1) | (z2.0 & M2) | (z3.0 & M3)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see that Wikipedia and the relevant C++ paper (https://isocpp.org/files/papers/P3642R3.html) have much simpler implementations. We also provide a simpler implementation as the 'simple implementation' in the description of the public function (I think the same one C++ paper provides). I think this manual unrolling may help performance, but do we have evidence of that? Does that performance matter since in practice distros ~only ship stable toolchains and Rust will not stabilize this on LLVM < 22? Maybe we should use the simple implementation as fallback too?

Copy link
Contributor

@tarcieri tarcieri Feb 8, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If I'm reading the implementation from that C++ paper correctly, it's doing one integer multiply for every bit of the input, whereas this does 16 multiply ops. For a u64, that should be 1/4 the multiplies. It's effectively batching up the work so it can do more with each multiply.

The C++ example really seems more like a naive/idealized algorithm description designed to lower to optimized intrinsics as opposed to something you'd actually want to practically deploy as a portable implementation.

Note: the original LLVM RFC originally proposed using the above method "If the CPU does not have a dedication clmul operation, it can be lowered to regular multiplication, by using holes to avoid carrys" but it sounds like the actual portable codegen out of LLVM isn't using it (yet)?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it's doing one integer multiply for every bit of the input

Technically yes, but note that each of those multiplies is of the form x * (1 << i), which is just x << i (where i is constant if unrolled), so the comparison isn't so clear.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Right, I understand that this is batching that work. But given that this code is unreachable except on gcc + cranelift in Rust-distributed builds, it's not obvious to me how much that matters. I'd also maybe expect that llvm can optimize the naive form a bit closer to this? Not sure there.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it's reachable on builds with older llvm

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Right, but it seems unlikely this will stabilize before those are largely phased out of our support? In any case, I think the main thing is adding more thorough test coverage.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we care enough about older LLVM to add non-trivial fast-paths for those versions?

}
}
)*};
}

// Instantiate the masked-multiply fallback for types of at most 64 bits;
// u128 gets a dedicated impl below.
impl_carryless_mul! { u8, u16, u32, u64, usize }

#[rustc_const_unstable(feature = "core_intrinsics_fallbacks", issue = "none")]
impl const CarrylessMul for u128 {
    /// Truncating 128-bit carry-less multiply assembled from 64-bit pieces.
    ///
    /// Writing `self = a_hi:a_lo` and `rhs = b_hi:b_lo`, the low half of the
    /// product is `clmul(a_lo, b_lo)` and the high half is
    /// `clmul(a_lo, b_hi) ^ clmul(a_hi, b_lo) ^ clmul_high(a_lo, b_lo)`.
    /// The `a_hi * b_hi` term lies entirely above bit 127 and is discarded.
    #[inline]
    fn carryless_mul(self, rhs: Self) -> Self {
        let a_lo = self as u64;
        let a_hi = (self >> 64) as u64;
        let b_lo = rhs as u64;
        let b_hi = (rhs >> 64) as u64;

        let low = u64::carryless_mul(a_lo, b_lo);
        let cross = u64::carryless_mul(a_lo, b_hi) ^ u64::carryless_mul(a_hi, b_lo);
        let high = cross ^ carryless_mul_high(a_lo, b_lo);

        ((high as u128) << 64) | (low as u128)
    }
}

/// Computes the upper 64 bits of the 128-bit carry-less product of `x` and `y`.
///
/// Strategy: each operand is split into four interleaved "digit" streams via a
/// one-bit-per-nibble mask. Within a masked operand every nibble holds at most
/// one set bit, so an ordinary widening multiply accumulates per-nibble overlap
/// counts without carries crossing nibble boundaries — provided each count stays
/// below 16. Masking the product back down then extracts the low bit of each
/// nibble, i.e. the count's parity, which is exactly the XOR (carry-less) sum.
#[rustc_const_unstable(feature = "core_intrinsics_fallbacks", issue = "none")]
#[inline]
const fn carryless_mul_high(x: u64, y: u64) -> u64 {
    // i.e. 0b100010001...0001 in binary.
    const MASK: u64 = 0x1111_1111_1111_1111u64;

    const M0: u64 = MASK;
    const M1: u64 = M0 << 1;
    const M2: u64 = M1 << 1;
    const M3: u64 = M2 << 1;

    // Masked widening multiply of the `$x_mask_shift`-th digit stream of `x`
    // with the `$y_mask_shift`-th digit stream of `y`, keeping the high word.
    macro_rules! mul {
        ($x_mask_shift:literal, $y_mask_shift:literal) => {{
            let x = x & (MASK << $x_mask_shift);
            let y = y & (MASK << $y_mask_shift);
            // A per-nibble count can only reach 16 (and thus carry into the next
            // nibble) when every nibble of both masked operands is populated,
            // i.e. when both equal their full mask; that one case is special-cased
            // with the precomputed correct high word.
            crate::hint::select_unpredictable(
                x == MASK << $x_mask_shift && y == MASK << $y_mask_shift,
                // only case where the multiply overflows the 4-bit parts
                0x0101_0101_0101_0101u64 << ($x_mask_shift + $y_mask_shift),
                x.carrying_mul(y, 0).1,
            )
        }};
    }

    // Stream `k` of the result collects the (i, j) shift pairs with
    // (i + j) mod 4 == k; XOR-combining them reassembles the carry-less sum.
    let z0 = mul!(0, 0) ^ mul!(1, 3) ^ mul!(2, 2) ^ mul!(3, 1);
    let z1 = mul!(0, 1) ^ mul!(1, 0) ^ mul!(2, 3) ^ mul!(3, 2);
    let z2 = mul!(0, 2) ^ mul!(1, 1) ^ mul!(2, 0) ^ mul!(3, 3);
    let z3 = mul!(0, 3) ^ mul!(1, 2) ^ mul!(2, 1) ^ mul!(3, 0);

    (z0 & M0) | (z1 & M1) | (z2 & M2) | (z3 & M3)
}
14 changes: 14 additions & 0 deletions library/core/src/intrinsics/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2179,6 +2179,20 @@ pub const unsafe fn unchecked_funnel_shr<T: [const] fallback::FunnelShift>(
unsafe { a.unchecked_funnel_shr(b, shift) }
}

/// Computes the carry-less product of `a` and `b`: like long multiplication,
/// but every carry is discarded. Equivalently, this is multiplication in
/// `GF(2)[X]`, the polynomial ring over `GF(2)`, truncated to the width of `T`.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this meant to be a self-contained description? To me these words mean nothing.^^

///
/// Safe versions of this intrinsic are available on the integer primitives
/// via the `carryless_mul` method. For example, [`u32::carryless_mul`].
#[rustc_intrinsic]
#[rustc_nounwind]
#[rustc_const_unstable(feature = "uint_carryless_mul", issue = "152080")]
#[unstable(feature = "uint_carryless_mul", issue = "152080")]
pub const fn carryless_mul<T: [const] fallback::CarrylessMul>(a: T, b: T) -> T {
// NOTE: while this implementation could serve as the specification, rustc_const_eval
// actually implements a simpler but less efficient variant as the specification.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why? As I ask above, maybe our fallback could be the simple spec?

a.carryless_mul(b)
}

/// This is an implementation detail of [`crate::ptr::read`] and should
/// not be used anywhere else. See its comments for why this exists.
///
Expand Down
12 changes: 12 additions & 0 deletions library/core/src/intrinsics/simd.rs
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,18 @@ pub const unsafe fn simd_funnel_shl<T>(a: T, b: T, shift: T) -> T;
#[rustc_nounwind]
pub const unsafe fn simd_funnel_shr<T>(a: T, b: T, shift: T) -> T;

/// Compute the carry-less product.
///
/// This is similar to long multiplication except that the carry is discarded.
///
/// Each lane of the result is the truncated carry-less product of the
/// corresponding lanes of `a` and `b`.
///
/// This operation can be used to model multiplication in `GF(2)[X]`, the polynomial
/// ring over `GF(2)`.
///
/// `T` must be a vector of integers.
// NOTE(review): no `# Safety` section yet — confirm whether an invalid `T` is a
// monomorphization-time error (as for the sibling simd intrinsics) or UB, and
// document accordingly.
#[rustc_intrinsic]
#[rustc_nounwind]
pub unsafe fn simd_carryless_mul<T>(a: T, b: T) -> T;

/// "And"s vectors elementwise.
///
/// `T` must be a vector of integers.
Expand Down
1 change: 1 addition & 0 deletions library/core/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,7 @@
#![feature(trait_alias)]
#![feature(transparent_unions)]
#![feature(try_blocks)]
#![feature(uint_carryless_mul)]
#![feature(unboxed_closures)]
#![feature(unsized_fn_params)]
#![feature(with_negative_coherence)]
Expand Down
Loading
Loading