diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..145c4b2
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,14 @@
+# Build artifacts
+benchmark
+*.o
+*.out
+
+# Temporary files
+*.swp
+*.swo
+*~
+
+# IDE files
+.vscode/
+.idea/
+*.iml
diff --git a/Dockerfile b/Dockerfile
index 24fec07..c3b981e 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -15,8 +15,8 @@ COPY *.h ./
 # Copy all C++ source files
 COPY *.cpp ./
 
-# Build the application with optimizations
-# SSE2 intrinsics are used in the code for x86-64 platforms
+# Build the application with optimizations for the target architecture
+# Supports both x86-64 (with SSE2) and ARM64 (with NEON) architectures
 RUN g++ -O2 -o benchmark \
     main.cpp \
     matrix_operations.cpp \
@@ -24,7 +24,8 @@ RUN g++ -O2 -o benchmark \
     string_search.cpp \
     memory_operations.cpp \
     polynomial_eval.cpp \
-    -std=c++11
+    -std=c++11 \
+    -march=native
 
 # Create a startup script
 COPY start.sh .
diff --git a/README.md b/README.md
index 6bbe4e4..eae1113 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 # Compute Benchmark Suite
 
-A high-performance compute benchmark application optimized for x86-64 architecture with SSE2 SIMD instructions.
+A high-performance compute benchmark application optimized for both x86-64 and ARM64 architectures with SIMD instructions.
 
 ## Overview
 
@@ -11,7 +11,7 @@ This benchmark suite tests various compute-intensive operations including:
 - Memory operations (50MB copy operations)
 - Polynomial evaluation (10M iterations)
 
-The code is optimized using x86 SSE2 SIMD intrinsics for maximum performance on Intel and AMD processors.
+The code is optimized using architecture-specific SIMD intrinsics for maximum performance on Intel/AMD (x86-64) and ARM processors.
 
 ## Building with Docker
 
@@ -21,6 +21,12 @@ Build the Docker image:
 docker build -t benchmark-suite .
 ```
 
+For multi-architecture builds:
+
+```bash
+docker buildx build --platform linux/amd64,linux/arm64 -t benchmark-suite .
+```
+
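The build above relies on the same compile-time architecture detection that the source files in this change introduce. As a quick post-build sanity check, a standalone program can report which SIMD path a given binary was compiled for; this is a minimal sketch, not part of the patch (`arch_check.cpp` is a hypothetical name, and the program merely reuses the `__x86_64__` / `__aarch64__` / `__ARM_NEON` checks added to each module).

```cpp
// arch_check.cpp -- hypothetical helper, not part of this patch.
// Mirrors the preprocessor detection added to each module in this change.
#include <iostream>

#if defined(__x86_64__)
#define SIMD_PATH "x86-64 with SSE2"
#elif defined(__aarch64__) || defined(__ARM_NEON)
#define SIMD_PATH "ARM64 with NEON"
#else
#define SIMD_PATH "Generic (scalar fallback)"
#endif

int main() {
    std::cout << "Compiled SIMD path: " << SIMD_PATH << std::endl;
    return 0;
}
```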
 ## Running the Benchmark
 
 Run the benchmark suite:
 
 ```bash
 docker run --rm benchmark-suite
 ```
 
@@ -33,16 +39,17 @@ This will execute all benchmark tests and display timing results for each operat
 ## Architecture Notes
 
-- **Optimized for**: x86-64 architecture with SSE2 support
-- **SIMD Instructions**: Uses SSE2 intrinsics (`__m128d`, `__m128i`) for vectorized operations
-- **Fallback**: Includes scalar fallback implementation for non-x86 platforms
+- **Optimized for**: x86-64 with SSE2 support and ARM64 with NEON support
+- **x86-64 SIMD**: Uses SSE2 intrinsics (`__m128d`, `__m128i`) for vectorized operations
+- **ARM64 SIMD**: Uses NEON intrinsics (`float64x2_t`, `uint8x16_t`) for vectorized operations
+- **Fallback**: Includes scalar fallback implementation for platforms without SIMD support
 
 ## Output Example
 
 ```
 ========================================
  Compute Benchmark Suite
- x86-64 with SSE2 Optimizations
+ ARM64 with NEON Optimizations
 ========================================
 
 === Matrix Multiplication Benchmark ===
@@ -63,10 +70,16 @@ Hash: 0xbfd8e92e2fb01505
 The benchmark suite is organized into separate modules:
 
 - `main.cpp` - Main entry point and benchmark orchestration
-- `matrix_operations.{h,cpp}` - Matrix multiplication with SSE2 optimizations
+- `matrix_operations.{h,cpp}` - Matrix multiplication with SSE2/NEON optimizations
 - `hash_operations.{h,cpp}` - Cryptographic hashing with SIMD acceleration
-- `string_search.{h,cpp}` - String pattern matching using SSE2
+- `string_search.{h,cpp}` - String pattern matching using SSE2/NEON
 - `memory_operations.{h,cpp}` - Fast memory copy operations
 - `polynomial_eval.{h,cpp}` - Vectorized polynomial evaluation
 
-Each module uses C++11 standard library and x86 SSE2 intrinsics where applicable.
\ No newline at end of file
+Each module uses C++11 standard library and architecture-specific SIMD intrinsics where applicable.
+
+## Supported Architectures
+
+- **x86-64**: Intel and AMD processors with SSE2 support (standard on all 64-bit x86 CPUs)
+- **ARM64**: ARM processors with NEON support (ARMv8-A and later)
+- **Generic**: Scalar fallback for other architectures
\ No newline at end of file
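The architecture notes above pair each SSE2 intrinsic family with a NEON counterpart. Below is a minimal sketch of that mapping as the modules in this patch use it for two-lane double arithmetic; the `add2` helper and file name are illustrative only, and the header choices (`<emmintrin.h>` on x86-64, `<arm_neon.h>` on ARM64) are assumptions consistent with the intrinsics that appear in the diff.

```cpp
// simd_map_sketch.cpp -- illustrative only, not part of the patch.
#include <iostream>

#if defined(__x86_64__)
#include <emmintrin.h>
// SSE2: __m128d holds 2 doubles; _mm_loadu_pd / _mm_add_pd / _mm_storeu_pd.
static void add2(const double* a, const double* b, double* out) {
    _mm_storeu_pd(out, _mm_add_pd(_mm_loadu_pd(a), _mm_loadu_pd(b)));
}
#elif defined(__aarch64__) || defined(__ARM_NEON)
#include <arm_neon.h>
// NEON: float64x2_t holds 2 doubles; vld1q_f64 / vaddq_f64 / vst1q_f64.
static void add2(const double* a, const double* b, double* out) {
    vst1q_f64(out, vaddq_f64(vld1q_f64(a), vld1q_f64(b)));
}
#else
// Scalar fallback, mirroring the non-SIMD paths in the modules.
static void add2(const double* a, const double* b, double* out) {
    out[0] = a[0] + b[0];
    out[1] = a[1] + b[1];
}
#endif

int main() {
    double a[2] = {1.0, 2.0}, b[2] = {3.0, 4.0}, out[2];
    add2(a, b, out);
    std::cout << out[0] << " " << out[1] << std::endl;  // prints: 4 6
    return 0;
}
```

The same three-way `#if` ladder appears in every translation unit touched below.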
diff --git a/hash_operations.cpp b/hash_operations.cpp
index 0d1d1ca..cf47d6f 100644
--- a/hash_operations.cpp
+++ b/hash_operations.cpp
@@ -7,8 +7,14 @@
 #ifdef __x86_64__
 #include <emmintrin.h>
 #define USE_X86_SIMD 1
+#define USE_ARM_NEON 0
+#elif defined(__aarch64__) || defined(__ARM_NEON)
+#include <arm_neon.h>
+#define USE_X86_SIMD 0
+#define USE_ARM_NEON 1
 #else
 #define USE_X86_SIMD 0
+#define USE_ARM_NEON 0
 #endif
 
 unsigned long long compute_hash(const char* data, size_t len) {
@@ -20,20 +26,32 @@ unsigned long long compute_hash(const char* data, size_t len) {
     for (; i + 16 <= len; i += 16) {
         __m128i chunk = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data + i));
 
-        // Extract bytes and update hash
+        // Store to array and extract bytes
+        alignas(16) unsigned char bytes[16];
+        _mm_storeu_si128(reinterpret_cast<__m128i*>(bytes), chunk);
+
+        // Update hash for each byte
+        for (int j = 0; j < 16; j++) {
+            hash = ((hash << 5) + hash) + bytes[j];
+        }
+    }
+#elif USE_ARM_NEON
+    // ARM64 optimized path using NEON
+    for (; i + 16 <= len; i += 16) {
+        uint8x16_t chunk = vld1q_u8(reinterpret_cast<const uint8_t*>(data + i));
+
+        // Store to array and extract bytes
+        alignas(16) uint8_t bytes[16];
+        vst1q_u8(bytes, chunk);
+
+        // Update hash for each byte
         for (int j = 0; j < 16; j++) {
-            unsigned char byte = _mm_extract_epi16(chunk, j / 2);
-            if (j % 2 == 0) {
-                byte = byte & 0xFF;
-            } else {
-                byte = (byte >> 8) & 0xFF;
-            }
-            hash = ((hash << 5) + hash) + byte;
+            hash = ((hash << 5) + hash) + bytes[j];
         }
     }
 #endif
 
-    // Process remaining bytes (or all bytes on non-x86)
+    // Process remaining bytes (or all bytes on non-SIMD)
     for (; i < len; i++) {
         hash = ((hash << 5) + hash) + data[i];
     }
diff --git a/invocation_reasons.yaml b/invocation_reasons.yaml
new file mode 100644
index 0000000..077d440
--- /dev/null
+++ b/invocation_reasons.yaml
@@ -0,0 +1,38 @@
+---
+id: eccf8992-d5d0-498b-ad8c-c5465015f9ff
+timestamp: '2025-10-28T20:29:17.393715+00:00'
+tool: check_image
+args:
+  image: ubuntu:22.04
+reason: Checking if the ubuntu:22.04 base image in the Dockerfile supports ARM architecture
+---
+id: 52295671-de6f-4a9e-a0ac-04ef83bd8029
+timestamp: '2025-10-28T20:29:36.859509+00:00'
+tool: migrate_ease_scan
+args:
+  scanner: cpp
+  arch: armv8-a
+  git_repo: null
+  output_format: json
+  extra_args: null
+reason: Scanning the C++ codebase in /workspace for x86-specific code that needs to
+  be migrated to ARM, including SSE2 intrinsics and architecture-specific optimizations
+---
+id: 2aa34fa2-957e-4aaf-9a50-be7a03830cf6
+timestamp: '2025-10-28T20:29:53.389796+00:00'
+tool: knowledge_base_search
+args:
+  query: How to convert SSE2 intrinsics to ARM NEON intrinsics for matrix multiplication,
+    hashing, string search, memory copy operations
+reason: Looking for documentation on converting x86 SSE2 intrinsics (_mm_loadu_pd,
+  _mm_mul_pd, _mm_add_pd, _mm_loadu_si128, _mm_storeu_si128, _mm_cmpeq_epi8, etc.)
+  to ARM NEON equivalents
+---
+id: 1e41f86a-59c8-453c-9f8b-12967a3ef6fc
+timestamp: '2025-10-28T20:30:07.381002+00:00'
+tool: knowledge_base_search
+args:
+  query: SSE2 NEON conversion _mm_loadu_pd _mm_storeu_si128 _mm_cmpeq_epi8 _mm_movemask_epi8
+    arm_neon.h intrinsics
+reason: Looking for specific SSE2 to NEON conversion patterns for double precision
+  operations, integer operations, and memory copy operations
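The hash_operations.cpp hunks earlier in this diff replace the old `_mm_extract_epi16` byte juggling with a store-to-array loop, and the new NEON path mirrors it, so every path must reduce to the same scalar update `hash = hash * 33 + byte`. A small standalone check makes it easy to confirm the SSE2, NEON, and scalar builds agree; this is a sketch only, and it assumes `compute_hash` is declared in `hash_operations.h` (the README's module list names that header, but the declaration is not shown in this diff).

```cpp
// hash_check.cpp -- hypothetical cross-architecture consistency check.
#include <iostream>
#include <string>
#include "hash_operations.h"  // assumed to declare compute_hash(const char*, size_t)

int main() {
    std::string sample;
    for (int i = 0; i < 1024; i++) {
        sample.push_back(static_cast<char>('a' + i % 26));  // deterministic input
    }
    // The printed value should be identical on the SSE2, NEON, and scalar paths.
    std::cout << "Hash: 0x" << std::hex
              << compute_hash(sample.data(), sample.size()) << std::endl;
    return 0;
}
```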
diff --git a/main.cpp b/main.cpp
index 1c6e1a7..0ddb8bb 100644
--- a/main.cpp
+++ b/main.cpp
@@ -1,6 +1,6 @@
 /*
  * High-Performance Compute Benchmark Suite
- * Optimized for x86-64 architecture with SSE/AVX SIMD instructions
+ * Optimized for x86-64 and ARM64 architectures with SIMD instructions
  */
 
 #include <iostream>
@@ -12,8 +12,13 @@
 #ifdef __x86_64__
 #define USE_X86_SIMD 1
+#define USE_ARM_NEON 0
+#elif defined(__aarch64__) || defined(__ARM_NEON)
+#define USE_X86_SIMD 0
+#define USE_ARM_NEON 1
 #else
 #define USE_X86_SIMD 0
+#define USE_ARM_NEON 0
 #endif
 
 int main() {
@@ -21,9 +26,10 @@ int main() {
     std::cout << " Compute Benchmark Suite" << std::endl;
 #if USE_X86_SIMD
     std::cout << " x86-64 with SSE2 Optimizations" << std::endl;
+#elif USE_ARM_NEON
+    std::cout << " ARM64 with NEON Optimizations" << std::endl;
 #else
     std::cout << " Generic Build (No SIMD)" << std::endl;
-    std::cout << " NOTE: This code is optimized for x86-64" << std::endl;
 #endif
     std::cout << "========================================" << std::endl;
diff --git a/matrix_operations.cpp b/matrix_operations.cpp
index f85a899..c235dfd 100644
--- a/matrix_operations.cpp
+++ b/matrix_operations.cpp
@@ -7,8 +7,14 @@
 #ifdef __x86_64__
 #include <emmintrin.h>
 #define USE_X86_SIMD 1
+#define USE_ARM_NEON 0
+#elif defined(__aarch64__) || defined(__ARM_NEON)
+#include <arm_neon.h>
+#define USE_X86_SIMD 0
+#define USE_ARM_NEON 1
 #else
 #define USE_X86_SIMD 0
+#define USE_ARM_NEON 0
 #endif
 
 Matrix::Matrix(size_t r, size_t c) : rows(r), cols(c) {
@@ -58,6 +64,33 @@ Matrix Matrix::multiply(const Matrix& other) const {
                 sum += data[i][k] * other.data[k][j];
             }
+            result.data[i][j] = sum;
+        }
+    }
+#elif USE_ARM_NEON
+    // ARM64 optimized path using NEON
+    for (size_t i = 0; i < rows; i++) {
+        for (size_t j = 0; j < other.cols; j++) {
+            float64x2_t sum_vec = vdupq_n_f64(0.0);
+            size_t k = 0;
+
+            // Process 2 elements at a time with NEON
+            for (; k + 1 < cols; k += 2) {
+                float64x2_t a_vec = vld1q_f64(&data[i][k]);
+                float64x2_t b_vec;
+                double b_arr[2] = {other.data[k][j], other.data[k+1][j]};
+                b_vec = vld1q_f64(b_arr);
+                sum_vec = vmlaq_f64(sum_vec, a_vec, b_vec);
+            }
+
+            // Horizontal add
+            double sum = vgetq_lane_f64(sum_vec, 0) + vgetq_lane_f64(sum_vec, 1);
+
+            // Handle remaining element
+            if (k < cols) {
+                sum += data[i][k] * other.data[k][j];
+            }
+
             result.data[i][j] = sum;
         }
     }
diff --git a/memory_operations.cpp b/memory_operations.cpp
index 0e5b970..f5ef8bf 100644
--- a/memory_operations.cpp
+++ b/memory_operations.cpp
@@ -6,8 +6,14 @@
 #ifdef __x86_64__
 #include <emmintrin.h>
 #define USE_X86_SIMD 1
+#define USE_ARM_NEON 0
+#elif defined(__aarch64__) || defined(__ARM_NEON)
+#include <arm_neon.h>
+#define USE_X86_SIMD 0
+#define USE_ARM_NEON 1
 #else
 #define USE_X86_SIMD 0
+#define USE_ARM_NEON 0
 #endif
 
 void fast_memcpy(void* dest, const void* src, size_t n) {
@@ -21,9 +27,15 @@ void fast_memcpy(void* dest, const void* src, size_t n) {
         __m128i chunk = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s + i));
         _mm_storeu_si128(reinterpret_cast<__m128i*>(d + i), chunk);
     }
+#elif USE_ARM_NEON
+    // ARM64 optimized path using NEON
+    for (; i + 16 <= n; i += 16) {
+        uint8x16_t chunk = vld1q_u8(reinterpret_cast<const uint8_t*>(s + i));
+        vst1q_u8(reinterpret_cast<uint8_t*>(d + i), chunk);
+    }
 #endif
 
-    // Copy remaining bytes (or all on non-x86)
+    // Copy remaining bytes (or all on non-SIMD)
     for (; i < n; i++) {
         d[i] = s[i];
     }
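fast_memcpy now has three code paths (SSE2, NEON, and the scalar tail loop), so a byte-for-byte comparison of destination and source is a cheap way to validate the port. A minimal sketch follows; it assumes `fast_memcpy` is declared in `memory_operations.h`, matching the README's module list, and the odd buffer size is chosen deliberately to exercise the scalar tail.

```cpp
// memcpy_check.cpp -- hypothetical correctness check for fast_memcpy.
#include <cstring>
#include <iostream>
#include <vector>
#include "memory_operations.h"  // assumed to declare fast_memcpy(void*, const void*, size_t)

int main() {
    const size_t n = 50 * 1024 * 1024 + 7;  // +7 forces the non-SIMD tail loop to run
    std::vector<unsigned char> src(n), dst(n, 0);
    for (size_t i = 0; i < n; i++) {
        src[i] = static_cast<unsigned char>(i * 31 + 7);
    }
    fast_memcpy(dst.data(), src.data(), n);
    std::cout << (std::memcmp(dst.data(), src.data(), n) == 0 ? "OK" : "MISMATCH")
              << std::endl;
    return 0;
}
```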
diff --git a/polynomial_eval.cpp b/polynomial_eval.cpp
index db2247a..f741ec0 100644
--- a/polynomial_eval.cpp
+++ b/polynomial_eval.cpp
@@ -5,8 +5,14 @@
 #ifdef __x86_64__
 #include <emmintrin.h>
 #define USE_X86_SIMD 1
+#define USE_ARM_NEON 0
+#elif defined(__aarch64__) || defined(__ARM_NEON)
+#include <arm_neon.h>
+#define USE_X86_SIMD 0
+#define USE_ARM_NEON 1
 #else
 #define USE_X86_SIMD 0
+#define USE_ARM_NEON 0
 #endif
 
 double polynomial_eval_sse(double x, const std::vector<double>& coeffs) {
@@ -39,6 +45,35 @@ double polynomial_eval_sse(double x, const std::vector<double>& coeffs) {
         result += coeffs[i] * power_arr[0];
     }
 
+    return result;
+#elif USE_ARM_NEON
+    // ARM64 optimized path using NEON
+    float64x2_t result_vec = vdupq_n_f64(0.0);
+    float64x2_t power_vec;
+    double power_arr[2] = {1.0, x};
+    power_vec = vld1q_f64(power_arr);
+    float64x2_t power_mult = vdupq_n_f64(x * x);
+
+    size_t i = 0;
+
+    // Process 2 coefficients at a time
+    for (; i + 1 < coeffs.size(); i += 2) {
+        float64x2_t coeff_vec;
+        double coeff_arr[2] = {coeffs[i], coeffs[i + 1]};
+        coeff_vec = vld1q_f64(coeff_arr);
+        float64x2_t term = vmulq_f64(coeff_vec, power_vec);
+        result_vec = vaddq_f64(result_vec, term);
+        power_vec = vmulq_f64(power_vec, power_mult);
+    }
+
+    // Horizontal add
+    double result = vgetq_lane_f64(result_vec, 0) + vgetq_lane_f64(result_vec, 1);
+
+    // Handle remaining coefficient
+    if (i < coeffs.size()) {
+        result += coeffs[i] * vgetq_lane_f64(power_vec, 0);
+    }
+
     return result;
 #else
     // Fallback scalar implementation
diff --git a/string_search.cpp b/string_search.cpp
index 7c5c340..7e87388 100644
--- a/string_search.cpp
+++ b/string_search.cpp
@@ -5,8 +5,14 @@
 #ifdef __x86_64__
 #include <emmintrin.h>
 #define USE_X86_SIMD 1
+#define USE_ARM_NEON 0
+#elif defined(__aarch64__) || defined(__ARM_NEON)
+#include <arm_neon.h>
+#define USE_X86_SIMD 0
+#define USE_ARM_NEON 1
 #else
 #define USE_X86_SIMD 0
+#define USE_ARM_NEON 0
 #endif
 
 int simd_string_search(const std::string& text, const std::string& pattern) {
@@ -44,9 +50,35 @@ int simd_string_search(const std::string& text, const std::string& pattern) {
             }
         }
     }
+#elif USE_ARM_NEON
+    // ARM64 optimized path using NEON
+    uint8x16_t first_char_vec = vdupq_n_u8(first_char);
+
+    for (; i + 16 <= text_len - pattern_len + 1; i += 16) {
+        uint8x16_t text_chunk = vld1q_u8(reinterpret_cast<const uint8_t*>(text.data() + i));
+        uint8x16_t cmp = vceqq_u8(text_chunk, first_char_vec);
+
+        // Store comparison results to array
+        alignas(16) uint8_t cmp_result[16];
+        vst1q_u8(cmp_result, cmp);
+
+        // Check each potential match
+        for (int bit = 0; bit < 16 && i + bit <= text_len - pattern_len; bit++) {
+            if (cmp_result[bit]) {
+                bool match = true;
+                for (size_t j = 1; j < pattern_len; j++) {
+                    if (text[i + bit + j] != pattern[j]) {
+                        match = false;
+                        break;
+                    }
+                }
+                if (match) count++;
+            }
+        }
+    }
 #endif
 
-    // Handle remaining characters (or all on non-x86)
+    // Handle remaining characters (or all on non-SIMD)
     for (; i <= text_len - pattern_len; i++) {
         bool match = true;
         for (size_t j = 0; j < pattern_len; j++) {
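One NEON-specific wrinkle in the string_search.cpp hunk above: NEON has no direct counterpart to `_mm_movemask_epi8` (one of the conversions the knowledge-base queries in invocation_reasons.yaml asked about), which is why the comparison result is stored to a byte array and scanned. A possible refinement, sketched here as an assumption rather than a required change, is to skip 16-byte blocks with no candidate match using `vmaxvq_u8` before falling back to that per-byte scan:

```cpp
// neon_candidate_sketch.cpp -- illustrative only; requires AArch64 NEON.
#include <arm_neon.h>
#include <cstdint>

// vceqq_u8 sets matching lanes to 0xFF, so the maximum over all lanes is
// non-zero exactly when at least one byte in the chunk equals first_char.
inline bool block_has_candidate(uint8x16_t text_chunk, uint8_t first_char) {
    uint8x16_t cmp = vceqq_u8(text_chunk, vdupq_n_u8(first_char));
    return vmaxvq_u8(cmp) != 0;
}
```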