Describe what you are looking for
NEON extensions have been back-ported to Arm v8.2, which still ships in hundreds of millions of devices worldwide. I've started considering adding such a compatibility layer for older Android devices, but for now it's low priority.
What's truly ARM64-only
Only two things have no ARM32 equivalent:
float64x2_t — the type doesn't exist on AArch32 (no 64-bit float SIMD)
vrndnq_f32 — no NEON rounding instruction on AArch32
Everything else has a portable helper — including vqtbl1q_u8 (via vtbl2) and all horizontal reductions (via vpadd cascades). Functions that accumulate in f64 for precision (like nk_dot_f32_neon) use nk_b128_vec_t as a portable accumulator: SIMD f64 FMA on ARM64, NEON f32 multiply + scalar f64 accumulation via VFP on ARM32.
Portable helpers
There are several helpers we can add to keep the kernel code clean, replacing pure intrinsic usage.
Reduction helpers in reduce/neon.h
| Helper | Replaces | ARM32 implementation |
|---|---|---|
| nk_f32x4_reduce_sum_neon_ | vaddvq_f32 | vpadd_f32 cascade |
| nk_i32x4_reduce_sum_neon_ | vaddvq_s32 | vpadd_s32 cascade |
| nk_u32x4_reduce_sum_neon_ | vaddvq_u32 | vpadd_u32 cascade |
| nk_f64x2_reduce_sum_neon_ | vaddvq_f64 | ARM64-only, guarded |
| nk_u8x16_reduce_sum_neon_ | vaddlvq_u8 | vpaddlq cascade (u8→u16→u32→u64) |
| nk_f32x4_reduce_max_neon_ | vmaxvq_f32 | vpmax_f32 cascade |
| nk_f32x4_reduce_min_neon_ | vminvq_f32 | vpmin_f32 cascade |
| nk_f32x4_pairwise_add_neon_ | vpaddq_f32 | two vpadd_f32 + vcombine_f32 |
| nk_i32x4_pairwise_add_neon_ | vpaddq_s32 | two vpadd_s32 + vcombine_s32 |
| nk_u32x4_pairwise_add_neon_ | vpaddq_u32 | two vpadd_u32 + vcombine_u32 |
Conversion helpers in cast/neon.h
| Helper | Replaces | ARM32 implementation |
|---|---|---|
| nk_f16x8_high_to_f32x4_neon_ | vcvt_high_f32_f16 | vcvt_f32_f16(vget_high_f16(...)) |
| nk_f64x2_from_f32x4_high_neon_ | vcvt_high_f64_f32 | ARM64-only, guarded |
| nk_u16x8_shll_high_16_neon_ | vshll_high_n_u16(v, 16) | vshll_n_u16(vget_high_u16(...), 16) |
| nk_u8x16_tbl1_neon_ | vqtbl1q_u8 | vtbl2_u8 with split halves |
f64-precision accumulator in dot/neon.h
For nk_dot_f32_neon and similar functions that accumulate in f64 for numerical stability.
Uses nk_b128_vec_t as the portable accumulator type.
/// Reset both 128-bit dot-product accumulators to zero.
/// On ARM64 the accumulators are true f64x2 vectors; on ARM32 (no f64 SIMD)
/// they are pairs of scalar f64 lanes inside the portable nk_b128_vec_t union.
NK_INTERNAL void nk_f32_dot_acc_init_neon_(nk_b128_vec_t *acc_low_vec, nk_b128_vec_t *acc_high_vec) {
#if NK_TARGET_ARM64_
    // Materialize one zero vector and broadcast it into both accumulators.
    float64x2_t zeros_f64x2 = vdupq_n_f64(0);
    acc_low_vec->f64x2 = zeros_f64x2;
    acc_high_vec->f64x2 = zeros_f64x2;
#else
    // AArch32 path: clear the scalar f64 lanes one by one.
    for (int lane = 0; lane < 2; ++lane) {
        acc_low_vec->f64s[lane] = 0;
        acc_high_vec->f64s[lane] = 0;
    }
#endif
}
/// Multiply two f32x4 registers element-wise and fold the products into the
/// f64 accumulators: lanes 0-1 into `acc_low_vec`, lanes 2-3 into `acc_high_vec`.
/// ARM64 widens each f32 half to f64 and uses a fused multiply-add; ARM32 does
/// the multiply in NEON f32 and accumulates each lane in scalar f64 (VFP).
NK_INTERNAL void nk_f32_dot_acc_fma_neon_(
    nk_b128_vec_t *acc_low_vec, nk_b128_vec_t *acc_high_vec,
    float32x4_t a_f32x4, float32x4_t b_f32x4) {
#if NK_TARGET_ARM64_
    // Widen both halves of each operand before the fused multiply-accumulate.
    float64x2_t a_low_f64x2 = vcvt_f64_f32(vget_low_f32(a_f32x4));
    float64x2_t b_low_f64x2 = vcvt_f64_f32(vget_low_f32(b_f32x4));
    float64x2_t a_high_f64x2 = nk_f64x2_from_f32x4_high_neon_(a_f32x4);
    float64x2_t b_high_f64x2 = nk_f64x2_from_f32x4_high_neon_(b_f32x4);
    acc_low_vec->f64x2 = vfmaq_f64(acc_low_vec->f64x2, a_low_f64x2, b_low_f64x2);
    acc_high_vec->f64x2 = vfmaq_f64(acc_high_vec->f64x2, a_high_f64x2, b_high_f64x2);
#else
    // NEON f32 multiply, then spill the products and widen lane by lane on the
    // VFP scalar side. Accumulation order matches lanes 0, 1, 2, 3.
    float32x4_t products_f32x4 = vmulq_f32(a_f32x4, b_f32x4);
    float products_f32s[4];
    vst1q_f32(products_f32s, products_f32x4);
    acc_low_vec->f64s[0] += (nk_f64_t)products_f32s[0];
    acc_low_vec->f64s[1] += (nk_f64_t)products_f32s[1];
    acc_high_vec->f64s[0] += (nk_f64_t)products_f32s[2];
    acc_high_vec->f64s[1] += (nk_f64_t)products_f32s[3];
#endif
}
/// Collapse the two f64 accumulators into a single scalar sum.
/// ARM64 adds the vectors first and horizontally reduces; ARM32 sums the four
/// scalar lanes left to right (same association as the vector path's inputs).
NK_INTERNAL nk_f64_t nk_f32_dot_acc_reduce_neon_(nk_b128_vec_t acc_low_vec, nk_b128_vec_t acc_high_vec) {
#if NK_TARGET_ARM64_
    float64x2_t total_f64x2 = vaddq_f64(acc_low_vec.f64x2, acc_high_vec.f64x2);
    return nk_f64x2_reduce_sum_neon_(total_f64x2);
#else
    // Keep strict left-to-right accumulation: low[0], low[1], high[0], high[1].
    nk_f64_t total = acc_low_vec.f64s[0];
    total += acc_low_vec.f64s[1];
    total += acc_high_vec.f64s[0];
    total += acc_high_vec.f64s[1];
    return total;
#endif
}
Once done, we'll need to change many of the #if NK_TARGET_ARM64_ guards to #if NK_TARGET_ARM64_ || NK_TARGET_ARM32_ and change pragmas:
- All widened files:
target("arch=armv8-a+simd") → target("neon").
- GCC:
target("arch=armv8-a+simd") → target("fpu=neon-vfpv4").
Can you contribute to the implementation?
Is your feature request specific to a certain interface?
It applies to everything
Contact Details
No response
Is there an existing issue for this?
Code of Conduct
Describe what you are looking for
NEON extensions have been back-ported to Arm v8.2, which still ships in hundreds of millions of devices worldwide. I've started considering adding such a compatibility layer for older Android devices, but for now it's low priority.
What's truly ARM64-only
Only two things have no ARM32 equivalent:
float64x2_t — the type doesn't exist on AArch32 (no 64-bit float SIMD); vrndnq_f32 — no NEON rounding instruction on AArch32. Everything else has a portable helper — including
vqtbl1q_u8 (via vtbl2) and all horizontal reductions (via vpadd cascades). Functions that accumulate in f64 for precision (like nk_dot_f32_neon) use nk_b128_vec_t as a portable accumulator: SIMD f64 FMA on ARM64, NEON f32 multiply + scalar f64 accumulation via VFP on ARM32. Portable helpers
There are several helpers we can add to keep the kernel code clean, replacing pure intrinsic usage.
Reduction helpers in
reduce/neon.h: nk_f32x4_reduce_sum_neon_ replaces vaddvq_f32 (ARM32: vpadd_f32 cascade); nk_i32x4_reduce_sum_neon_ replaces vaddvq_s32 (ARM32: vpadd_s32 cascade); nk_u32x4_reduce_sum_neon_ replaces vaddvq_u32 (ARM32: vpadd_u32 cascade); nk_f64x2_reduce_sum_neon_ replaces vaddvq_f64 (ARM64-only, guarded); nk_u8x16_reduce_sum_neon_ replaces vaddlvq_u8 (ARM32: vpaddlq cascade, u8→u16→u32→u64); nk_f32x4_reduce_max_neon_ replaces vmaxvq_f32 (ARM32: vpmax_f32 cascade); nk_f32x4_reduce_min_neon_ replaces vminvq_f32 (ARM32: vpmin_f32 cascade); nk_f32x4_pairwise_add_neon_ replaces vpaddq_f32 (ARM32: two vpadd_f32 + vcombine_f32); nk_i32x4_pairwise_add_neon_ replaces vpaddq_s32 (ARM32: two vpadd_s32 + vcombine_s32); nk_u32x4_pairwise_add_neon_ replaces vpaddq_u32 (ARM32: two vpadd_u32 + vcombine_u32). Conversion helpers in
cast/neon.h: nk_f16x8_high_to_f32x4_neon_ replaces vcvt_high_f32_f16 (ARM32: vcvt_f32_f16(vget_high_f16(...))); nk_f64x2_from_f32x4_high_neon_ replaces vcvt_high_f64_f32 (ARM64-only, guarded); nk_u16x8_shll_high_16_neon_ replaces vshll_high_n_u16(v, 16) (ARM32: vshll_n_u16(vget_high_u16(...), 16)); nk_u8x16_tbl1_neon_ replaces vqtbl1q_u8 (ARM32: vtbl2_u8 with split halves). f64-precision accumulator in
dot/neon.h. For nk_dot_f32_neon and similar functions that accumulate in f64 for numerical stability.
Uses nk_b128_vec_t as the portable accumulator type.
Once done, we'll need to change many of the
#if NK_TARGET_ARM64_ guards to #if NK_TARGET_ARM64_ || NK_TARGET_ARM32_ and change pragmas: for all widened files, target("arch=armv8-a+simd") → target("neon"); for GCC, target("arch=armv8-a+simd") → target("fpu=neon-vfpv4"). Can you contribute to the implementation?
Is your feature request specific to a certain interface?
It applies to everything
Contact Details
No response
Is there an existing issue for this?
Code of Conduct