14 changes: 14 additions & 0 deletions .gitignore
@@ -0,0 +1,14 @@
# Build artifacts
benchmark
*.o
*.out

# Temporary files
*.swp
*.swo
*~

# IDE files
.vscode/
.idea/
*.iml
7 changes: 4 additions & 3 deletions Dockerfile
@@ -15,16 +15,17 @@ COPY *.h ./
# Copy all C++ source files
COPY *.cpp ./

# Build the application with optimizations
# SSE2 intrinsics are used in the code for x86-64 platforms
# Build the application with optimizations for the target architecture
# Supports both x86-64 (with SSE2) and ARM64 (with NEON) architectures
RUN g++ -O2 -o benchmark \
main.cpp \
matrix_operations.cpp \
hash_operations.cpp \
string_search.cpp \
memory_operations.cpp \
polynomial_eval.cpp \
-std=c++11
-std=c++11 \
-march=native

# Create a startup script
COPY start.sh .
31 changes: 22 additions & 9 deletions README.md
@@ -1,6 +1,6 @@
# Compute Benchmark Suite

A high-performance compute benchmark application optimized for x86-64 architecture with SSE2 SIMD instructions.
A high-performance compute benchmark application optimized for both x86-64 and ARM64 architectures with SIMD instructions.

## Overview

@@ -11,7 +11,7 @@ This benchmark suite tests various compute-intensive operations including:
- Memory operations (50MB copy operations)
- Polynomial evaluation (10M iterations)

The code is optimized using x86 SSE2 SIMD intrinsics for maximum performance on Intel and AMD processors.
The code is optimized using architecture-specific SIMD intrinsics for maximum performance on Intel/AMD (x86-64) and ARM processors.

## Building with Docker

@@ -21,6 +21,12 @@ Build the Docker image:
docker build -t benchmark-suite .
```

For multi-architecture builds:

```bash
docker buildx build --platform linux/amd64,linux/arm64 -t benchmark-suite .
```
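
Cross-building the non-native platform with `buildx` generally requires QEMU emulation to be registered on the build host. A minimal setup sketch, assuming Docker with BuildKit/buildx available (the builder name below is arbitrary):

```bash
# One-time host setup: register QEMU emulators so buildx can cross-build
# the non-native platform (e.g. linux/arm64 images on an x86-64 host).
docker run --privileged --rm tonistiigi/binfmt --install all

# Create and select a builder capable of multi-platform builds.
docker buildx create --use --name multiarch-builder
```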

## Running the Benchmark

Run the benchmark suite:
@@ -33,16 +39,17 @@ This will execute all benchmark tests and display timing results for each operation.

## Architecture Notes

- **Optimized for**: x86-64 architecture with SSE2 support
- **SIMD Instructions**: Uses SSE2 intrinsics (`__m128d`, `__m128i`) for vectorized operations
- **Fallback**: Includes scalar fallback implementation for non-x86 platforms
- **Optimized for**: x86-64 with SSE2 support and ARM64 with NEON support
- **x86-64 SIMD**: Uses SSE2 intrinsics (`__m128d`, `__m128i`) for vectorized operations
- **ARM64 SIMD**: Uses NEON intrinsics (`float64x2_t`, `uint8x16_t`) for vectorized operations
- **Fallback**: Includes scalar fallback implementation for platforms without SIMD support
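
All modules pick the code path at compile time with the same preprocessor pattern (visible in the source diffs below); a minimal sketch of that dispatch:

```cpp
// Minimal sketch of the compile-time dispatch used by each module in this PR.
#ifdef __x86_64__
    #include <immintrin.h>   // SSE2 intrinsics (__m128d, __m128i)
    #define USE_X86_SIMD 1
    #define USE_ARM_NEON 0
#elif defined(__aarch64__) || defined(__ARM_NEON)
    #include <arm_neon.h>    // NEON intrinsics (float64x2_t, uint8x16_t)
    #define USE_X86_SIMD 0
    #define USE_ARM_NEON 1
#else
    #define USE_X86_SIMD 0   // scalar fallback
    #define USE_ARM_NEON 0
#endif
```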

## Output Example

```
========================================
Compute Benchmark Suite
x86-64 with SSE2 Optimizations
ARM64 with NEON Optimizations
========================================

=== Matrix Multiplication Benchmark ===
@@ -63,10 +70,16 @@ Hash: 0xbfd8e92e2fb01505
The benchmark suite is organized into separate modules:

- `main.cpp` - Main entry point and benchmark orchestration
- `matrix_operations.{h,cpp}` - Matrix multiplication with SSE2 optimizations
- `matrix_operations.{h,cpp}` - Matrix multiplication with SSE2/NEON optimizations
- `hash_operations.{h,cpp}` - Cryptographic hashing with SIMD acceleration
- `string_search.{h,cpp}` - String pattern matching using SSE2
- `string_search.{h,cpp}` - String pattern matching using SSE2/NEON
- `memory_operations.{h,cpp}` - Fast memory copy operations
- `polynomial_eval.{h,cpp}` - Vectorized polynomial evaluation

Each module uses C++11 standard library and x86 SSE2 intrinsics where applicable.
Each module uses C++11 standard library and architecture-specific SIMD intrinsics where applicable.

## Supported Architectures

- **x86-64**: Intel and AMD processors with SSE2 support (standard on all 64-bit x86 CPUs)
- **ARM64**: ARM processors with NEON support (ARMv8-A and later)
- **Generic**: Scalar fallback for other architectures
36 changes: 27 additions & 9 deletions hash_operations.cpp
@@ -7,8 +7,14 @@
#ifdef __x86_64__
#include <immintrin.h>
#define USE_X86_SIMD 1
#define USE_ARM_NEON 0
#elif defined(__aarch64__) || defined(__ARM_NEON)
#include <arm_neon.h>
#define USE_X86_SIMD 0
#define USE_ARM_NEON 1
#else
#define USE_X86_SIMD 0
#define USE_ARM_NEON 0
#endif

unsigned long long compute_hash(const char* data, size_t len) {
@@ -20,20 +26,32 @@ unsigned long long compute_hash(const char* data, size_t len) {
for (; i + 16 <= len; i += 16) {
__m128i chunk = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data + i));

// Extract bytes and update hash
for (int j = 0; j < 16; j++) {
unsigned char byte = _mm_extract_epi16(chunk, j / 2);
if (j % 2 == 0) {
byte = byte & 0xFF;
} else {
byte = (byte >> 8) & 0xFF;
}
hash = ((hash << 5) + hash) + byte;
}

// Store to array and extract bytes
alignas(16) unsigned char bytes[16];
_mm_storeu_si128(reinterpret_cast<__m128i*>(bytes), chunk);

// Update hash for each byte
for (int j = 0; j < 16; j++) {
hash = ((hash << 5) + hash) + bytes[j];
}
}
#elif USE_ARM_NEON
// ARM64 optimized path using NEON
for (; i + 16 <= len; i += 16) {
uint8x16_t chunk = vld1q_u8(reinterpret_cast<const uint8_t*>(data + i));

// Store to array and extract bytes
alignas(16) uint8_t bytes[16];
vst1q_u8(bytes, chunk);

// Update hash for each byte
for (int j = 0; j < 16; j++) {
hash = ((hash << 5) + hash) + bytes[j];
}
}
#endif

// Process remaining bytes (or all bytes on non-x86)
// Process remaining bytes (or all bytes on non-SIMD)
for (; i < len; i++) {
hash = ((hash << 5) + hash) + data[i];
}
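For reference, every path in `compute_hash` applies the same djb2-style update, since `((hash << 5) + hash) + byte` is simply `hash * 33 + byte`. A scalar equivalent of the loop above, as a sketch (the seed shown is assumed; the actual initialization sits outside this excerpt):

```cpp
#include <cstddef>

// Scalar reference for the update used by the SSE2, NEON and fallback paths.
// The 5381 seed is the classic djb2 value and is only an assumption here.
unsigned long long compute_hash_scalar(const char* data, std::size_t len) {
    unsigned long long hash = 5381;
    for (std::size_t i = 0; i < len; i++) {
        hash = hash * 33 + data[i];   // same as ((hash << 5) + hash) + data[i]
    }
    return hash;
}
```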
38 changes: 38 additions & 0 deletions invocation_reasons.yaml
@@ -0,0 +1,38 @@
---
id: eccf8992-d5d0-498b-ad8c-c5465015f9ff
timestamp: '2025-10-28T20:29:17.393715+00:00'
tool: check_image
args:
image: ubuntu:22.04
reason: Checking if the ubuntu:22.04 base image in the Dockerfile supports ARM architecture
---
id: 52295671-de6f-4a9e-a0ac-04ef83bd8029
timestamp: '2025-10-28T20:29:36.859509+00:00'
tool: migrate_ease_scan
args:
scanner: cpp
arch: armv8-a
git_repo: null
output_format: json
extra_args: null
reason: Scanning the C++ codebase in /workspace for x86-specific code that needs to
be migrated to ARM, including SSE2 intrinsics and architecture-specific optimizations
---
id: 2aa34fa2-957e-4aaf-9a50-be7a03830cf6
timestamp: '2025-10-28T20:29:53.389796+00:00'
tool: knowledge_base_search
args:
query: How to convert SSE2 intrinsics to ARM NEON intrinsics for matrix multiplication,
hashing, string search, memory copy operations
reason: Looking for documentation on converting x86 SSE2 intrinsics (_mm_loadu_pd,
_mm_mul_pd, _mm_add_pd, _mm_loadu_si128, _mm_storeu_si128, _mm_cmpeq_epi8, etc.)
to ARM NEON equivalents
---
id: 1e41f86a-59c8-453c-9f8b-12967a3ef6fc
timestamp: '2025-10-28T20:30:07.381002+00:00'
tool: knowledge_base_search
args:
query: SSE2 NEON conversion _mm_loadu_pd _mm_storeu_si128 _mm_cmpeq_epi8 _mm_movemask_epi8
arm_neon.h intrinsics
reason: Looking for specific SSE2 to NEON conversion patterns for double precision
operations, integer operations, and memory copy operations
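
The intrinsics named in these queries map onto well-known NEON equivalents, which is what the code changes in this PR apply; a rough reference (not exhaustive, and `_mm_movemask_epi8` notably has no single-instruction NEON counterpart):

```cpp
// Rough SSE2 -> NEON reference for the intrinsics named above.
//   _mm_loadu_pd                       -> vld1q_f64             (2 x double load)
//   _mm_mul_pd / _mm_add_pd            -> vmulq_f64 / vaddq_f64
//   _mm_add_pd(acc, _mm_mul_pd(a, b))  -> vmlaq_f64(acc, a, b)  (fused form used in this PR)
//   _mm_loadu_si128 / _mm_storeu_si128 -> vld1q_u8 / vst1q_u8   (16 x uint8, as used here)
//   _mm_cmpeq_epi8                     -> vceqq_u8
//   _mm_movemask_epi8                  -> no direct equivalent; commonly rebuilt with a
//                                         narrowing shift (vshrn_n_u16) or a lane
//                                         reduction such as vmaxvq_u8
```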
10 changes: 8 additions & 2 deletions main.cpp
@@ -1,6 +1,6 @@
/*
* High-Performance Compute Benchmark Suite
* Optimized for x86-64 architecture with SSE/AVX SIMD instructions
* Optimized for x86-64 and ARM64 architectures with SIMD instructions
*/

#include <iostream>
@@ -12,18 +12,24 @@

#ifdef __x86_64__
#define USE_X86_SIMD 1
#define USE_ARM_NEON 0
#elif defined(__aarch64__) || defined(__ARM_NEON)
#define USE_X86_SIMD 0
#define USE_ARM_NEON 1
#else
#define USE_X86_SIMD 0
#define USE_ARM_NEON 0
#endif

int main() {
std::cout << "========================================" << std::endl;
std::cout << " Compute Benchmark Suite" << std::endl;
#if USE_X86_SIMD
std::cout << " x86-64 with SSE2 Optimizations" << std::endl;
#elif USE_ARM_NEON
std::cout << " ARM64 with NEON Optimizations" << std::endl;
#else
std::cout << " Generic Build (No SIMD)" << std::endl;
std::cout << " NOTE: This code is optimized for x86-64" << std::endl;
#endif
std::cout << "========================================" << std::endl;

33 changes: 33 additions & 0 deletions matrix_operations.cpp
@@ -7,8 +7,14 @@
#ifdef __x86_64__
#include <immintrin.h>
#define USE_X86_SIMD 1
#define USE_ARM_NEON 0
#elif defined(__aarch64__) || defined(__ARM_NEON)
#include <arm_neon.h>
#define USE_X86_SIMD 0
#define USE_ARM_NEON 1
#else
#define USE_X86_SIMD 0
#define USE_ARM_NEON 0
#endif

Matrix::Matrix(size_t r, size_t c) : rows(r), cols(c) {
@@ -58,6 +64,33 @@ Matrix Matrix::multiply(const Matrix& other) const {
sum += data[i][k] * other.data[k][j];
}

result.data[i][j] = sum;
}
}
#elif USE_ARM_NEON
// ARM64 optimized path using NEON
for (size_t i = 0; i < rows; i++) {
for (size_t j = 0; j < other.cols; j++) {
float64x2_t sum_vec = vdupq_n_f64(0.0);
size_t k = 0;

// Process 2 elements at a time with NEON
for (; k + 1 < cols; k += 2) {
float64x2_t a_vec = vld1q_f64(&data[i][k]);
float64x2_t b_vec;
double b_arr[2] = {other.data[k][j], other.data[k+1][j]};
b_vec = vld1q_f64(b_arr);
sum_vec = vmlaq_f64(sum_vec, a_vec, b_vec);
}

// Horizontal add
double sum = vgetq_lane_f64(sum_vec, 0) + vgetq_lane_f64(sum_vec, 1);

// Handle remaining element
if (k < cols) {
sum += data[i][k] * other.data[k][j];
}

result.data[i][j] = sum;
}
}
14 changes: 13 additions & 1 deletion memory_operations.cpp
@@ -6,8 +6,14 @@
#ifdef __x86_64__
#include <immintrin.h>
#define USE_X86_SIMD 1
#define USE_ARM_NEON 0
#elif defined(__aarch64__) || defined(__ARM_NEON)
#include <arm_neon.h>
#define USE_X86_SIMD 0
#define USE_ARM_NEON 1
#else
#define USE_X86_SIMD 0
#define USE_ARM_NEON 0
#endif

void fast_memcpy(void* dest, const void* src, size_t n) {
@@ -21,9 +27,15 @@ void fast_memcpy(void* dest, const void* src, size_t n) {
__m128i chunk = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s + i));
_mm_storeu_si128(reinterpret_cast<__m128i*>(d + i), chunk);
}
#elif USE_ARM_NEON
// ARM64 optimized path using NEON
for (; i + 16 <= n; i += 16) {
uint8x16_t chunk = vld1q_u8(reinterpret_cast<const uint8_t*>(s + i));
vst1q_u8(reinterpret_cast<uint8_t*>(d + i), chunk);
}
#endif

// Copy remaining bytes (or all on non-x86)
// Copy remaining bytes (or all on non-SIMD)
for (; i < n; i++) {
d[i] = s[i];
}
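A minimal usage sketch of the `fast_memcpy` shown above, assuming `memory_operations.h` declares it with the signature used in this file (the buffer size mirrors the 50MB copies mentioned in the README):

```cpp
#include <vector>
#include "memory_operations.h"   // assumed to declare fast_memcpy(void*, const void*, size_t)

int main() {
    std::vector<char> src(50 * 1024 * 1024, 'x');  // 50MB source, as in the benchmark
    std::vector<char> dst(src.size());
    fast_memcpy(dst.data(), src.data(), src.size());
    return 0;
}
```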
35 changes: 35 additions & 0 deletions polynomial_eval.cpp
@@ -5,8 +5,14 @@
#ifdef __x86_64__
#include <immintrin.h>
#define USE_X86_SIMD 1
#define USE_ARM_NEON 0
#elif defined(__aarch64__) || defined(__ARM_NEON)
#include <arm_neon.h>
#define USE_X86_SIMD 0
#define USE_ARM_NEON 1
#else
#define USE_X86_SIMD 0
#define USE_ARM_NEON 0
#endif

double polynomial_eval_sse(double x, const std::vector<double>& coeffs) {
@@ -39,6 +45,35 @@ double polynomial_eval_sse(double x, const std::vector<double>& coeffs) {
result += coeffs[i] * power_arr[0];
}

return result;
#elif USE_ARM_NEON
// ARM64 optimized path using NEON
float64x2_t result_vec = vdupq_n_f64(0.0);
float64x2_t power_vec;
double power_arr[2] = {1.0, x};
power_vec = vld1q_f64(power_arr);
float64x2_t power_mult = vdupq_n_f64(x * x);

size_t i = 0;

// Process 2 coefficients at a time
for (; i + 1 < coeffs.size(); i += 2) {
float64x2_t coeff_vec;
double coeff_arr[2] = {coeffs[i], coeffs[i + 1]};
coeff_vec = vld1q_f64(coeff_arr);
float64x2_t term = vmulq_f64(coeff_vec, power_vec);
result_vec = vaddq_f64(result_vec, term);
power_vec = vmulq_f64(power_vec, power_mult);
}

// Horizontal add
double result = vgetq_lane_f64(result_vec, 0) + vgetq_lane_f64(result_vec, 1);

// Handle remaining coefficient
if (i < coeffs.size()) {
result += coeffs[i] * vgetq_lane_f64(power_vec, 0);
}

return result;
#else
// Fallback scalar implementation