Skip to content

Commit 2a84742

Browse files
authored
Replace ray permutation computation with a look-up table (AVX512) (#660)
Small speedup.

VSTC
Elo   | 1.49 +- 1.19 (95%)
SPRT  | 4.0+0.04s Threads=1 Hash=16MB
LLR   | 3.01 (-2.25, 2.89) [0.00, 3.00]
Games | N: 83942 W: 21477 L: 21117 D: 41348
Penta | [316, 9246, 22551, 9478, 380]
https://recklesschess.space/test/10491/

No functional change.

Bench: 3715752
1 parent fe9a816 commit 2a84742

1 file changed

Lines changed: 24 additions & 11 deletions

File tree

src/nnue/rays.rs

Lines changed: 24 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,8 @@ use std::arch::x86_64::*;
33
use crate::types::{Piece, Square};
44

55
pub fn ray_permuation(focus: Square) -> (__m512i, u64) {
6-
unsafe {
7-
// We use the 0x88 board representation here for intermediate calculations.
8-
// We convert to and from this representation to avoid a 4KiB LUT.
9-
let offsets: [u8; 64] = [
6+
const PERMS: [[u8; 64]; 64] = {
7+
const OFFSETS: [u8; 64] = [
108
0x1F, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, // N
119
0x21, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, // NE
1210
0x12, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, // E
@@ -16,15 +14,30 @@ pub fn ray_permuation(focus: Square) -> (__m512i, u64) {
1614
0xEE, 0xFF, 0xFE, 0xFD, 0xFC, 0xFB, 0xFA, 0xF9, // W
1715
0x0E, 0x0F, 0x1E, 0x2D, 0x3C, 0x4B, 0x5A, 0x69, // NW
1816
];
19-
let offsets = _mm512_loadu_si512(offsets.as_ptr().cast());
20-
21-
let focus = focus as u8;
22-
let focus = focus + (focus & 0x38);
2317

24-
let coords = _mm512_add_epi8(offsets, _mm512_set1_epi8(focus as i8));
18+
let mut perms = [[0u8; 64]; 64];
19+
20+
let mut sq = 0;
21+
while sq < 64 {
22+
let focus = sq as u8;
23+
let focus = focus + (focus & 0x38);
24+
let mut i = 0;
25+
while i < 64 {
26+
let wide_result = OFFSETS[i].wrapping_add(focus);
27+
let valid = wide_result & 0x88 == 0;
28+
let narrow_result = ((wide_result & 0x70) >> 1) + (wide_result & 0x07);
29+
perms[sq][i] = if valid { narrow_result } else { 0x80 };
30+
i += 1;
31+
}
32+
sq += 1;
33+
}
34+
35+
perms
36+
};
2537

26-
let perm = _mm512_gf2p8affine_epi64_epi8(coords, _mm512_set1_epi64(0x0102041020400000), 0);
27-
let mask = _mm512_testn_epi8_mask(coords, _mm512_set1_epi8(0x88u8 as i8));
38+
unsafe {
39+
let perm = _mm512_loadu_si512(PERMS.get_unchecked(focus as usize).as_ptr().cast());
40+
let mask = _mm512_testn_epi8_mask(perm, _mm512_set1_epi8(0x80u8 as i8));
2841

2942
(perm, mask)
3043
}

0 commit comments

Comments
 (0)