2 changes: 1 addition & 1 deletion Dockerfile
@@ -16,7 +16,7 @@ COPY *.h ./
COPY *.cpp ./

# Build the application with optimizations
# SSE2 intrinsics are used in the code for x86-64 platforms
# The code now supports both x86-64 (SSE2) and ARM64 (NEON) optimizations
RUN g++ -O2 -o benchmark \
main.cpp \
matrix_operations.cpp \
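The unchanged `g++ -O2 -o benchmark ...` build line keeps working on both architectures because every translation unit picks its SIMD path at compile time from predefined compiler macros. A condensed, illustrative probe of the guard pattern this PR adds to each module (the macro names come from the diff; the probe file itself is hypothetical):

```cpp
// arch_probe.cpp -- illustrative only; mirrors the compile-time guards added in this PR.
#include <iostream>

#if defined(__x86_64__) || defined(_M_X64)
  #define USE_X86_SIMD 1
#elif defined(__aarch64__) || defined(_M_ARM64)
  #define USE_ARM_NEON 1
#else
  #define USE_X86_SIMD 0
  #define USE_ARM_NEON 0
#endif

int main() {
#if USE_X86_SIMD
    std::cout << "x86-64 build: SSE2 path selected" << std::endl;
#elif USE_ARM_NEON
    std::cout << "ARM64 build: NEON path selected" << std::endl;
#else
    std::cout << "generic build: scalar fallback" << std::endl;
#endif
    return 0;
}
```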
14 changes: 8 additions & 6 deletions README.md
@@ -1,6 +1,6 @@
# Compute Benchmark Suite

A high-performance compute benchmark application optimized for x86-64 architecture with SSE2 SIMD instructions.
A high-performance compute benchmark application optimized for both x86-64 and ARM64 architectures with SIMD instructions.

## Overview

@@ -11,7 +11,7 @@ This benchmark suite tests various compute-intensive operations including:
- Memory operations (50MB copy operations)
- Polynomial evaluation (10M iterations)

The code is optimized using x86 SSE2 SIMD intrinsics for maximum performance on Intel and AMD processors.
The code includes optimizations using both x86 SSE2 and ARM NEON SIMD intrinsics for maximum performance on Intel, AMD, and ARM processors.

## Building with Docker

@@ -33,9 +33,11 @@ This will execute all benchmark tests and display timing results for each operat

## Architecture Notes

- **Optimized for**: x86-64 architecture with SSE2 support
- **SIMD Instructions**: Uses SSE2 intrinsics (`__m128d`, `__m128i`) for vectorized operations
- **Fallback**: Includes scalar fallback implementation for non-x86 platforms
- **Optimized for**: x86-64 architecture with SSE2 support and ARM64 architecture with NEON support
- **SIMD Instructions**:
- Uses SSE2 intrinsics (`__m128d`, `__m128i`) for vectorized operations on x86-64
- Uses NEON intrinsics (`float64x2_t`, `uint8x16_t`) for vectorized operations on ARM64
- **Fallback**: Includes scalar fallback implementation for other platforms

## Output Example

@@ -69,4 +71,4 @@ The benchmark suite is organized into separate modules:
- `memory_operations.{h,cpp}` - Fast memory copy operations
- `polynomial_eval.{h,cpp}` - Vectorized polynomial evaluation

Each module uses C++11 standard library and x86 SSE2 intrinsics where applicable.
Each module uses C++11 standard library and includes both x86 SSE2 intrinsics and ARM NEON intrinsics where applicable.
Binary file added benchmark
Binary file not shown.
31 changes: 22 additions & 9 deletions hash_operations.cpp
@@ -4,11 +4,15 @@
#include <chrono>
#include <iomanip>

#ifdef __x86_64__
#if defined(__x86_64__) || defined(_M_X64)
#include <immintrin.h>
#define USE_X86_SIMD 1
#elif defined(__aarch64__) || defined(_M_ARM64)
#include <arm_neon.h>
#define USE_ARM_NEON 1
#else
#define USE_X86_SIMD 0
#define USE_ARM_NEON 0
#endif

unsigned long long compute_hash(const char* data, size_t len) {
@@ -20,20 +24,29 @@ unsigned long long compute_hash(const char* data, size_t len) {
for (; i + 16 <= len; i += 16) {
__m128i chunk = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data + i));

// Extract bytes and update hash
// Extract bytes and update hash - use direct byte access
const char* chunk_bytes = reinterpret_cast<const char*>(&chunk);
for (int j = 0; j < 16; j++) {
unsigned char byte = _mm_extract_epi16(chunk, j / 2);
if (j % 2 == 0) {
byte = byte & 0xFF;
} else {
byte = (byte >> 8) & 0xFF;
}
unsigned char byte = chunk_bytes[j];
hash = ((hash << 5) + hash) + byte;
}
}
#elif USE_ARM_NEON
// ARM64 optimized path using NEON
for (; i + 16 <= len; i += 16) {
uint8x16_t chunk = vld1q_u8(reinterpret_cast<const uint8_t*>(data + i));

// Store vector to array and process bytes
uint8_t chunk_bytes[16];
vst1q_u8(chunk_bytes, chunk);
for (int j = 0; j < 16; j++) {
unsigned char byte = chunk_bytes[j];
hash = ((hash << 5) + hash) + byte;
}
}
#endif

// Process remaining bytes (or all bytes on non-x86)
// Process remaining bytes (or all bytes on scalar path)
for (; i < len; i++) {
hash = ((hash << 5) + hash) + data[i];
}
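Note that both the SSE2 and NEON paths above only vectorize the 16-byte load; the hash update itself stays serial because each step depends on the previous hash value. A scalar sketch of the recurrence all three paths compute (the initial hash value is elided by the hunk, so it is passed in here; the `unsigned char` cast matches the SIMD paths, while the scalar tail in the diff adds the raw `char`, which differs only for bytes >= 0x80):

```cpp
#include <cstddef>

// Illustrative scalar equivalent of the per-byte update used on every path:
// hash = hash * 33 + byte  (the classic djb2-style recurrence, since (hash << 5) + hash == 33 * hash).
unsigned long long hash_update(unsigned long long hash, const char* data, std::size_t len) {
    for (std::size_t i = 0; i < len; ++i) {
        hash = ((hash << 5) + hash) + static_cast<unsigned char>(data[i]);
    }
    return hash;
}
```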
20 changes: 20 additions & 0 deletions invocation_reasons.yaml
@@ -0,0 +1,20 @@
---
id: 3eac6e13-0397-4179-9ce2-3a023cec332e
timestamp: '2025-10-23T15:29:14.681432+00:00'
tool: check_image
args:
image: ubuntu:22.04
reason: Checking if the base Ubuntu 22.04 image supports ARM64 architecture before
migrating the Dockerfile
---
id: 7cb2bd99-a163-4bd9-99eb-37a3fa8cc29e
timestamp: '2025-10-23T15:29:15.052171+00:00'
tool: migrate_ease_scan
args:
scanner: cpp
arch: armv8-a
git_repo: null
output_format: json
extra_args: null
reason: Scanning the C++ codebase to identify x86-specific code that needs to be migrated
to ARM architecture
9 changes: 7 additions & 2 deletions main.cpp
@@ -10,20 +10,25 @@
#include "memory_operations.h"
#include "polynomial_eval.h"

#ifdef __x86_64__
#if defined(__x86_64__) || defined(_M_X64)
#define USE_X86_SIMD 1
#elif defined(__aarch64__) || defined(_M_ARM64)
#define USE_ARM_NEON 1
#else
#define USE_X86_SIMD 0
#define USE_ARM_NEON 0
#endif

int main() {
std::cout << "========================================" << std::endl;
std::cout << " Compute Benchmark Suite" << std::endl;
#if USE_X86_SIMD
std::cout << " x86-64 with SSE2 Optimizations" << std::endl;
#elif USE_ARM_NEON
std::cout << " ARM64 with NEON Optimizations" << std::endl;
#else
std::cout << " Generic Build (No SIMD)" << std::endl;
std::cout << " NOTE: This code is optimized for x86-64" << std::endl;
std::cout << " NOTE: This code supports x86-64 and ARM64" << std::endl;
#endif
std::cout << "========================================" << std::endl;

32 changes: 31 additions & 1 deletion matrix_operations.cpp
@@ -4,11 +4,15 @@
#include <chrono>
#include <stdexcept>

#ifdef __x86_64__
#if defined(__x86_64__) || defined(_M_X64)
#include <immintrin.h>
#define USE_X86_SIMD 1
#elif defined(__aarch64__) || defined(_M_ARM64)
#include <arm_neon.h>
#define USE_ARM_NEON 1
#else
#define USE_X86_SIMD 0
#define USE_ARM_NEON 0
#endif

Matrix::Matrix(size_t r, size_t c) : rows(r), cols(c) {
@@ -58,6 +62,32 @@ Matrix Matrix::multiply(const Matrix& other) const {
sum += data[i][k] * other.data[k][j];
}

result.data[i][j] = sum;
}
}
#elif USE_ARM_NEON
// ARM64 optimized path using NEON
for (size_t i = 0; i < rows; i++) {
for (size_t j = 0; j < other.cols; j++) {
float64x2_t sum_vec = vdupq_n_f64(0.0);
size_t k = 0;

// Process 2 elements at a time with NEON
for (; k + 1 < cols; k += 2) {
float64x2_t a_vec = vld1q_f64(&data[i][k]);
double b_vals[2] = {other.data[k][j], other.data[k+1][j]};
float64x2_t b_vec = vld1q_f64(b_vals);
sum_vec = vfmaq_f64(sum_vec, a_vec, b_vec);
}

// Horizontal add
double sum = vgetq_lane_f64(sum_vec, 0) + vgetq_lane_f64(sum_vec, 1);

// Handle remaining element
if (k < cols) {
sum += data[i][k] * other.data[k][j];
}

result.data[i][j] = sum;
}
}
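In the NEON inner loop, `other.data[k][j]` and `other.data[k+1][j]` have to be copied into a small stack array before `vld1q_f64` because a matrix column is not contiguous in memory, so it cannot be loaded directly the way a row can. The vectorized loop is just a two-wide version of this scalar inner product (a reference sketch, assuming the row-major `std::vector<std::vector<double>>` layout implied by the indexing in the diff):

```cpp
#include <cstddef>
#include <vector>

// Reference inner product that both the SSE2 and NEON paths reproduce for element (i, j).
double dot_row_col(const std::vector<std::vector<double>>& a,
                   const std::vector<std::vector<double>>& b,
                   std::size_t i, std::size_t j, std::size_t cols) {
    double sum = 0.0;
    for (std::size_t k = 0; k < cols; ++k) {
        sum += a[i][k] * b[k][j];  // row of a (contiguous) times column of b (strided)
    }
    return sum;
}
```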
14 changes: 12 additions & 2 deletions memory_operations.cpp
@@ -3,11 +3,15 @@
#include <vector>
#include <chrono>

#ifdef __x86_64__
#if defined(__x86_64__) || defined(_M_X64)
#include <immintrin.h>
#define USE_X86_SIMD 1
#elif defined(__aarch64__) || defined(_M_ARM64)
#include <arm_neon.h>
#define USE_ARM_NEON 1
#else
#define USE_X86_SIMD 0
#define USE_ARM_NEON 0
#endif

void fast_memcpy(void* dest, const void* src, size_t n) {
@@ -21,9 +25,15 @@ void fast_memcpy(void* dest, const void* src, size_t n) {
__m128i chunk = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s + i));
_mm_storeu_si128(reinterpret_cast<__m128i*>(d + i), chunk);
}
#elif USE_ARM_NEON
// ARM64 optimized path using NEON
for (; i + 16 <= n; i += 16) {
uint8x16_t chunk = vld1q_u8(reinterpret_cast<const uint8_t*>(s + i));
vst1q_u8(reinterpret_cast<uint8_t*>(d + i), chunk);
}
#endif

// Copy remaining bytes (or all on non-x86)
// Copy remaining bytes (or all on scalar path)
for (; i < n; i++) {
d[i] = s[i];
}
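Both SIMD paths in `fast_memcpy` move 16 bytes per iteration with unaligned loads and stores and leave the tail to the scalar loop, so the result must match `std::memcpy` byte for byte. A small self-check sketch (the buffer size and fill pattern are arbitrary; the declaration is repeated here rather than pulled from `memory_operations.h`):

```cpp
#include <cassert>
#include <cstddef>
#include <cstring>
#include <vector>

void fast_memcpy(void* dest, const void* src, std::size_t n);  // signature as shown in the diff

int main() {
    std::vector<unsigned char> src(1 << 20), dst(1 << 20), ref(1 << 20);
    for (std::size_t i = 0; i < src.size(); ++i) {
        src[i] = static_cast<unsigned char>(i);  // arbitrary fill pattern
    }
    fast_memcpy(dst.data(), src.data(), src.size());
    std::memcpy(ref.data(), src.data(), ref.size());
    assert(dst == ref);  // SSE2, NEON, and scalar paths must all agree with memcpy
    return 0;
}
```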
34 changes: 33 additions & 1 deletion polynomial_eval.cpp
@@ -2,11 +2,15 @@
#include <iostream>
#include <chrono>

#ifdef __x86_64__
#if defined(__x86_64__) || defined(_M_X64)
#include <immintrin.h>
#define USE_X86_SIMD 1
#elif defined(__aarch64__) || defined(_M_ARM64)
#include <arm_neon.h>
#define USE_ARM_NEON 1
#else
#define USE_X86_SIMD 0
#define USE_ARM_NEON 0
#endif

double polynomial_eval_sse(double x, const std::vector<double>& coeffs) {
@@ -39,6 +43,34 @@ double polynomial_eval_sse(double x, const std::vector<double>& coeffs) {
result += coeffs[i] * power_arr[0];
}

return result;
#elif USE_ARM_NEON
// ARM64 optimized path using NEON
float64x2_t result_vec = vdupq_n_f64(0.0);
double powers[2] = {1.0, x};
float64x2_t power_vec = vld1q_f64(powers);
float64x2_t power_mult = vdupq_n_f64(x * x);

size_t i = 0;

// Process 2 coefficients at a time
for (; i + 1 < coeffs.size(); i += 2) {
double coeffs_arr[2] = {coeffs[i], coeffs[i + 1]};
float64x2_t coeff_vec = vld1q_f64(coeffs_arr);
float64x2_t term = vmulq_f64(coeff_vec, power_vec);
result_vec = vaddq_f64(result_vec, term);
power_vec = vmulq_f64(power_vec, power_mult);
}

// Horizontal add
double result = vgetq_lane_f64(result_vec, 0) + vgetq_lane_f64(result_vec, 1);

// Handle remaining coefficient
if (i < coeffs.size()) {
double power = vgetq_lane_f64(power_vec, 0);
result += coeffs[i] * power;
}

return result;
#else
// Fallback scalar implementation
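Both vector paths evaluate the plain power form `coeffs[0] + coeffs[1]*x + coeffs[2]*x^2 + ...`, two terms per iteration, by keeping a vector of consecutive powers `{x^i, x^(i+1)}` and multiplying it by `x*x` each step; this is not Horner's rule, so the accumulation order differs, though the mathematical result is the same. A scalar reference sketch of the sum being computed:

```cpp
#include <cstddef>
#include <vector>

// Scalar reference for the sum polynomial_eval_sse computes on every path:
// result = coeffs[0] + coeffs[1]*x + coeffs[2]*x^2 + ...
double polynomial_eval_reference(double x, const std::vector<double>& coeffs) {
    double result = 0.0;
    double power = 1.0;  // x^0
    for (std::size_t i = 0; i < coeffs.size(); ++i) {
        result += coeffs[i] * power;
        power *= x;
    }
    return result;
}
```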
30 changes: 29 additions & 1 deletion string_search.cpp
@@ -2,11 +2,15 @@
#include <iostream>
#include <chrono>

#ifdef __x86_64__
#if defined(__x86_64__) || defined(_M_X64)
#include <immintrin.h>
#define USE_X86_SIMD 1
#elif defined(__aarch64__) || defined(_M_ARM64)
#include <arm_neon.h>
#define USE_ARM_NEON 1
#else
#define USE_X86_SIMD 0
#define USE_ARM_NEON 0
#endif

int simd_string_search(const std::string& text, const std::string& pattern) {
@@ -44,6 +48,30 @@ int simd_string_search(const std::string& text, const std::string& pattern) {
}
}
}
#elif USE_ARM_NEON
// ARM64 optimized path using NEON
uint8x16_t first_char_vec = vdupq_n_u8(first_char);

for (; i + 16 <= text_len - pattern_len + 1; i += 16) {
uint8x16_t text_chunk = vld1q_u8(reinterpret_cast<const uint8_t*>(text.data() + i));
uint8x16_t cmp = vceqq_u8(text_chunk, first_char_vec);

// Store comparison result to array and check each potential match
uint8_t cmp_result[16];
vst1q_u8(cmp_result, cmp);
for (int bit = 0; bit < 16 && i + bit <= text_len - pattern_len; bit++) {
if (cmp_result[bit] != 0) {
bool match = true;
for (size_t j = 1; j < pattern_len; j++) {
if (text[i + bit + j] != pattern[j]) {
match = false;
break;
}
}
if (match) count++;
}
}
}
#endif

// Handle remaining characters (or all on non-x86)
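The NEON path mirrors the SSE2 one: broadcast the first pattern byte, compare 16 text positions at once, and only run the byte-by-byte verification where that first byte matched. A minimal usage sketch (the inputs are arbitrary; the declaration is assumed to live in `string_search.h` per the module layout, and the function is expected to return the number of matching positions, including overlapping ones):

```cpp
#include <iostream>
#include <string>

int simd_string_search(const std::string& text, const std::string& pattern);  // assumed to be declared in string_search.h

int main() {
    std::string text = "abracadabra abracadabra";
    int count = simd_string_search(text, "abra");
    std::cout << "matches: " << count << std::endl;  // expected: 4
    return 0;
}
```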