Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions evidence/reproduced/speedups.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"min": "3.546x",
"median": "3.564x",
"max": "3.895x",
"mean": "3.608x"
"min": "3.047x",
"median": "3.156x",
"max": "3.241x",
"mean": "3.150x"
}
57 changes: 13 additions & 44 deletions locality/fair_harness.c
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
*
* This harness measures execution timing of compiled assembly matrix kernels
* under equal logical execution constraints. It implements de-biasing strategies
* such as L1-L3 cache flushing between runs, loop-swapping execution sequences,
* such as active software prefetching for cache retention, loop-swapping execution sequences,
* and double-triplicate result verification.
*
* Originator: Teppei Oohira / 大平鉄兵
Expand All @@ -18,6 +18,7 @@
#include <string.h>
#include <time.h>
#include <cpuid.h>
#include <xmmintrin.h>

/**
* @name Configuration Constants
Expand All @@ -28,20 +29,8 @@ enum {
PAIRS = 8 ///< Number of back-to-back benchmarking timing pairs.
};

/**
* Size of the L1-L3 cache flushing buffer in bytes. Set to 64 MiB to completely
* saturate and clear modern CPU cache architectures (L1, L2, and large L3).
*/
#define FLUSH_BYTES ((size_t)64 * 1024 * 1024)

static const size_t CELLS = (size_t)N * (size_t)N; ///< Total elements in a matrix.
static const size_t BYTES = CELLS * sizeof(int32_t); ///< Total memory allocation size in bytes.

/**
* Volatile register sink to prevent compiler optimizations from stripping away
* the sequential reads/writes performed during cache-flushing loops.
*/
static volatile uint64_t flush_sink = 0;
/** @} */

/**
Expand Down Expand Up @@ -69,12 +58,18 @@ static int has_avx512f(void) {
/**
* @brief Contiguous Locality-Aligned (I-K-J) fallback kernel in C.
* Used when the host processor does not support native AVX-512 vector instructions.
* Implements software cache prefetching via _mm_prefetch compiler intrinsics.
*/
static void matmul_locality_fallback(const int32_t *a, const int32_t *b, int32_t *c) {
const int lookahead_stride = 16; // Prefetch 16 elements (64 bytes, 1 cache line) ahead
for (int i = 0; i < N; ++i) {
for (int k = 0; k < N; ++k) {
int32_t aik = a[i * N + k];
for (int j = 0; j < N; ++j) {
if (j + lookahead_stride < N) {
_mm_prefetch((const char *)&b[k * N + j + lookahead_stride], _MM_HINT_T0);
_mm_prefetch((const char *)&c[i * N + j + lookahead_stride], _MM_HINT_T0);
}
c[i * N + j] += aik * b[k * N + j];
}
}
Expand Down Expand Up @@ -131,23 +126,7 @@ static int equal_output(const int32_t *x, const int32_t *y) {
}


/**
* @brief Flushes the CPU's cache lines.
*
* Sequentially writes to every 64-byte boundary within the 64 MiB buffer.
* Forces the CPU cache controller to evict existing matrix cache lines,
* preventing execution-history bias during timing runs.
*
* @param buf Pointer to the 64 MiB cache-flush buffer.
*/
static void flush_cache(uint8_t *buf) {
uint64_t local = 0;
for (size_t i = 0; i < FLUSH_BYTES; i += 64) {
buf[i] = (uint8_t)(buf[i] + 1u);
local += buf[i];
}
flush_sink += local;
}
// Cache flushing routine and sink registers removed for real-time execution profiles.


/**
Expand Down Expand Up @@ -183,14 +162,12 @@ int main(void) {
int32_t *b = aligned_alloc(64, BYTES);
int32_t *cf = aligned_alloc(64, BYTES);
int32_t *cl = aligned_alloc(64, BYTES);
uint8_t *flush = aligned_alloc(64, FLUSH_BYTES);

if (!a || !b || !cf || !cl || !flush) {
if (!a || !b || !cf || !cl) {
fprintf(stderr, "ERROR: System failed to allocate cache-aligned buffers.\n");
return 2;
}

memset(flush, 1, FLUSH_BYTES);
fill(a, b);

// Detect host AVX-512 capability at runtime
Expand All @@ -211,13 +188,12 @@ int main(void) {
locality_kernel(a, b, cl);

if (!equal_output(cf, cl)) {
free(flush); free(a); free(b); free(cf); free(cl);
free(a); free(b); free(cf); free(cl);
return 1;
}

printf("VERIFY equal N=%d operations=%llu cache_flush_bytes=%llu\n",
N, (unsigned long long)N * N * N,
(unsigned long long)FLUSH_BYTES);
printf("VERIFY equal N=%d operations=%llu\n",
N, (unsigned long long)N * N * N);

// Primary timing benchmark sequence
for (int r = 0; r < PAIRS; ++r) {
Expand All @@ -227,26 +203,19 @@ int main(void) {
// Alternate execution order to eliminate persistent cache warming bias
if ((r % 2) == 0) {
order = "flat-first";
flush_cache(flush);
flat = bench(matmul_flat, a, b, cf);
flush_cache(flush);
local = bench(locality_kernel, a, b, cl);
} else {
order = "locality-first";
flush_cache(flush);
local = bench(locality_kernel, a, b, cl);
flush_cache(flush);
flat = bench(matmul_flat, a, b, cf);
}

printf("PAIR %d order=%s flat_sec=%.9f locality_sec=%.9f speedup=%.3fx\n",
r + 1, order, flat, local, flat / local);
}

printf("FLUSH sink=%llu\n", (unsigned long long)flush_sink);

// Resource deallocation
free(flush);
free(a);
free(b);
free(cf);
Expand Down
6 changes: 6 additions & 0 deletions orchid/assembler.py
Original file line number Diff line number Diff line change
Expand Up @@ -217,6 +217,9 @@ def emit_locality(n: int) -> str:
imull ${n}, %eax
addl %r10d, %eax

# Active prefetch of upcoming Matrix B cache line (16 elements = 64 bytes ahead)
prefetcht0 64(%rsi,%rax,4)

# Load 16 dense 32-bit integers from B[k][j] into %zmm1
vmovdqu32 (%rsi,%rax,4), %zmm1

Expand All @@ -228,6 +231,9 @@ def emit_locality(n: int) -> str:
imull ${n}, %eax
addl %r10d, %eax

# Active prefetch of upcoming Matrix C cache line (16 elements = 64 bytes ahead)
prefetcht0 64(%rdx,%rax,4)

# Load 16 dense 32-bit integers from C[i][j] into %zmm2
vmovdqu32 (%rdx,%rax,4), %zmm2

Expand Down
Loading