From 02b6d228ec821709236a56d2e058a30311a27917 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 23 Oct 2025 15:20:22 +0000 Subject: [PATCH 1/4] Initial plan From 31b614631e07d1d284c7025376579095a1ae28f5 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 23 Oct 2025 15:29:41 +0000 Subject: [PATCH 2/4] Initial analysis and migration plan for ARM architecture Co-authored-by: JoeStech <4088382+JoeStech@users.noreply.github.com> --- invocation_reasons.yaml | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 invocation_reasons.yaml diff --git a/invocation_reasons.yaml b/invocation_reasons.yaml new file mode 100644 index 0000000..b1c4d08 --- /dev/null +++ b/invocation_reasons.yaml @@ -0,0 +1,20 @@ +--- +id: 3eac6e13-0397-4179-9ce2-3a023cec332e +timestamp: '2025-10-23T15:29:14.681432+00:00' +tool: check_image +args: + image: ubuntu:22.04 +reason: Checking if the base Ubuntu 22.04 image supports ARM64 architecture before + migrating the Dockerfile +--- +id: 7cb2bd99-a163-4bd9-99eb-37a3fa8cc29e +timestamp: '2025-10-23T15:29:15.052171+00:00' +tool: migrate_ease_scan +args: + scanner: cpp + arch: armv8-a + git_repo: null + output_format: json + extra_args: null +reason: Scanning the C++ codebase to identify x86-specific code that needs to be migrated + to ARM architecture From ab2801c36f2f377e1862144818d7f203c67da3ea Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 23 Oct 2025 15:39:50 +0000 Subject: [PATCH 3/4] Complete ARM64 migration with NEON optimizations Co-authored-by: JoeStech <4088382+JoeStech@users.noreply.github.com> --- Dockerfile | 2 +- README.md | 14 ++++++++------ benchmark | Bin 0 -> 41328 bytes hash_operations.cpp | 29 ++++++++++++++++++++--------- main.cpp | 9 +++++++-- matrix_operations.cpp | 32 +++++++++++++++++++++++++++++++- memory_operations.cpp | 14 ++++++++++++-- polynomial_eval.cpp | 34 +++++++++++++++++++++++++++++++++- string_search.cpp | 28 +++++++++++++++++++++++++++- 9 files changed, 139 insertions(+), 23 deletions(-) create mode 100755 benchmark diff --git a/Dockerfile b/Dockerfile index 24fec07..9cf0b28 100644 --- a/Dockerfile +++ b/Dockerfile @@ -16,7 +16,7 @@ COPY *.h ./ COPY *.cpp ./ # Build the application with optimizations -# SSE2 intrinsics are used in the code for x86-64 platforms +# The code now supports both x86-64 (SSE2) and ARM64 (NEON) optimizations RUN g++ -O2 -o benchmark \ main.cpp \ matrix_operations.cpp \ diff --git a/README.md b/README.md index 6bbe4e4..432b586 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # Compute Benchmark Suite -A high-performance compute benchmark application optimized for x86-64 architecture with SSE2 SIMD instructions. +A high-performance compute benchmark application optimized for both x86-64 and ARM64 architectures with SIMD instructions. ## Overview @@ -11,7 +11,7 @@ This benchmark suite tests various compute-intensive operations including: - Memory operations (50MB copy operations) - Polynomial evaluation (10M iterations) -The code is optimized using x86 SSE2 SIMD intrinsics for maximum performance on Intel and AMD processors. +The code includes optimizations using both x86 SSE2 and ARM NEON SIMD intrinsics for maximum performance on Intel, AMD, and ARM processors. ## Building with Docker @@ -33,9 +33,11 @@ This will execute all benchmark tests and display timing results for each operat ## Architecture Notes -- **Optimized for**: x86-64 architecture with SSE2 support -- **SIMD Instructions**: Uses SSE2 intrinsics (`__m128d`, `__m128i`) for vectorized operations -- **Fallback**: Includes scalar fallback implementation for non-x86 platforms +- **Optimized for**: x86-64 architecture with SSE2 support and ARM64 architecture with NEON support +- **SIMD Instructions**: + - Uses SSE2 intrinsics (`__m128d`, `__m128i`) for vectorized operations on x86-64 + - Uses NEON intrinsics (`float64x2_t`, `uint8x16_t`) for vectorized operations on ARM64 +- **Fallback**: Includes scalar fallback implementation for other platforms ## Output Example @@ -69,4 +71,4 @@ The benchmark suite is organized into separate modules: - `memory_operations.{h,cpp}` - Fast memory copy operations - `polynomial_eval.{h,cpp}` - Vectorized polynomial evaluation -Each module uses C++11 standard library and x86 SSE2 intrinsics where applicable. \ No newline at end of file +Each module uses C++11 standard library and includes both x86 SSE2 intrinsics and ARM NEON intrinsics where applicable. \ No newline at end of file diff --git a/benchmark b/benchmark new file mode 100755 index 0000000000000000000000000000000000000000..5a74ae030063a89221c6f2bc2772903631bf3f16 GIT binary patch literal 41328 zcmeHwd3==B)&DbDAZ&Sp1)AbEV9=yW%w)mDH6a;!1|}FuP~18UlL?tRnTfMNuvVi3 z*6BEw*Y<5`Yt^^4yKU)aky=dHu`Y2fN)_Cn0hEXps9NUtJ$IRTW|H}I`MrPrez|-y z=iYPAJ?GqW?>%>&;Wn%N@;sACks)8XRH4++*&Nd-GOE5UA^?p_y)p%#vlO#34(S4h zBigMYoMA%b8X-{H9 zfAxZIHmpJ(1qrjub&GP@u!R>>BqSLW+TRX!{wMOYoM5Nq_4BmkM|smkKGFaA^jl7B>bmWmFU|PUD@7Om<F@i#C(Wt6S9Q%z?iX3XMkZ$bo+#2mXv4`dpqvZVW~nO>SWhKBwf6I}3&$jnCX1{J)q3Ka@l6 z(>dhs%7OoBj`~J&==sMScux*}uFRqT9XaT!BaUXDXL8`*%Tccna@1=u2cHXa=yPrk z{ws3S>+~G>jvV~Y&!NxFIpn^Zqh2(g&G^d>^K<9}_M_6Dmjizb_{`2`j`Xj=XqHl{ z+$H9bQbE%K{A_rc#&V7rFYuoMpQ)H_8~Zt;gcPCRaIEa{`y9cLGZ1n(6o;*&)zRq= zxL10DA$Opowb|?UxjURIyl$R1dd{`&q3SA!Bh(%6uXcFdzLlYFhdU7P2ig|76vwrT z+d~&P9In1Ti^WpC!Ws0qATZ$Zt+ct?Ll%px+Zk|#0!~jTXmeTHo%paeI9eUf-d?w_ z(+VCvjESF+9$(NM2-&<=>ymbV2Q((`^IVYZ^Vc}8slmuMCRH;gp6TKYLH$0L zyEo+V`-GQLr=k$6)YaFYpPy)@ArFM(O zpp+jy!tLyl?K?`xK!bw=?>>Us*5T-=a#-8vIocMr*Eqafv3g?_oq>RJjl<`b{Zdr7 zrZ*ho4Pg(!2 zZR=25>RZ0m7#FOG0&doIxRw816)aV#bf+iS>kqm`gVehG;gH~sAt?JElhqY)XLN<7 zzSSO(EO7_JJ#Z4_N?bQOT-`T1x||-b(&O&w@vou`^b#8PK#lyhxd#Ohk-Q^luVklNs$q_bP~d*|U*E(svq<@~NQY;7CeX_BrBwL|%Dp0w_{~4@ zEeM{XydiK-mY%G9Aky6|J&`S(NvAa|JwZ{ikfwC>whELVubd&$&;RO1#0r)3MEWL1 zU!Ytd(jJ!1S1uRnP{qC2+TPbKhKwVnI5{0<7v$; z*V=^HYru=GAR|0rz)xow{-GQ2a}4-l174`Ya!(lWJxbO~Ueq4S2qf;YCgu z@T5~J2s;o?whiSK8SvX znE}7sfIr)SAB&DzV9Wwz78tX@m<7fxFlK=<3yfJ{%mQN;_X+xiQE}f>0d}3=l zO&j>cZ_;VnxF^1!PSXKV;@)(cHt>l%(rMbbCj#j-ZP*hl(rMbLCzhnsv_VhIPp4^P zo~TZzX+xfvolc)l=~L2a+L$Nu(&;lPee@rx`kN{JMk<}0bLS0;(xsl+qOksvoa4Jb zGu@Sy{z6vzs;uI-kFu&l9hfWEB(u?^pCRA_h+TQnU%gLD}85H`qr%Us;smxmDZyB zbsghFiyqPY3C3E=N;|c43+B*iX1K^6eOKR&k*0r$?zaAM8d-WNqFTJJ;$kYcON(yP z-$7r~V$;tDTVxDfO&R<3UX)}0E4qV{O;ikG)&RX;i``OIsxKx~Y;9TnIf{}NdM;77 zgNoqKMyWQsg%5i)sc(UP_ULi_5eARwzXT-y8=!WYe-ZJe`sim6b8vcz z^at`a#^;DWkHO=5CFR2+-C2bn0bYNd3dL_JGwa`ituz?wYXm`FpGE!^BL9t$@kM5< zY@SDS@)vp>nO&9YnQNgiIdnE&mum63pElr4j}|}uylY9+`%so<1^p7nbeI0$aPhGG z0~b;5e*J8LzLleopfI6TfnG^y#zg=0Q>ytcy#pxy88~kJ33xAkDO`-+j{`dWMygZ1 z?ghc(NkLEVe}LA6z~2k>lr-n>3)IJ`2+dcPf$MJA#^xi-d$#DaU(lj&Xpy&u+d8b4 z0m~jOe#ulcXL9Y4pys%-*-{&mWX z*S$iZ;-nTG)&}3dOdC9rubFmg&z=lT1B34g21UsO+%L-V)L;8Ac@eUd@VQI1$R!Iw z31ht-nykexS%rlD(~r?mozQh>;V(dEx&dWV@~M1cH8O=hOWXPEJzk47>RRmK@x#P! z-Fu<-{x8>q?XvXbwT=^f$xzp!J#I+ke?kp&ME@?qm{r+AlcpA*P7jE{t4eDa z3hm*@=#k`*x@Gv#X8j%zr)?+K1JU7as3h-Fq@O)@MQQ80Jt3DpcD31Z%(ACTJue45g9Y@jG~%29P0+H#;sYuFn4ESpaRrUbC@1kmC4 zwaB1Ji|j15H|z;N3c*JPwyjU=MDm-iB5c#rrjDgr^dDp(bEg)Y%sh@3wU?EmJ4Bz= zb}hi@MP&yQ7i-bgrCRiAvpRLWx~>|1Or6?NR-~tVF-{@5qexU^Nw1w9P;N84oaTDdof2CA5kUw86jUYwE9_(4tRh z4TI_(QIgQGJ6Ng33mU)RMe9#^ZLx$0l zs_VW*Y$Dsu@Tk|_&~$y%4NYH2d;x`c|I>CAlz{*}Vp!YNQpRhJ2u383>yCwHTP<%{ zE$?a3H?@i(I}P3=TJ&Wtx>f%+mPf7Y-U{E7DKT6@(xj+uS(%eDP^#vkH(z%Gy?F-o zpq7E55(f?OcSyf<;|YJ%bam5Jto5kr>b2PXQmx@~v%2mzP;1c-wdjjlY*nciTVl3F zKh-BPAKG5ZT5AdUhghrGYI)8wpv5nKidS!sx^^LQ?6E~=TWoTf7VRiQi*%Ni+u}E2 z1l!BNY1kJ1NQ=C45`8Pr_N;D;yp0?*YO867ZSd^^(QjcTTjcL1ySkvWtVD~Uq!xu0 z7soGtn?x;&*LE(B)lRiXFD|pk3qB&8enM((IuGI`=r^_t05L{s%nWAOJH7x2eDhtG@GGsp;>loo&k4syf==ficWrBi)=M%w}0G=ElzkJjhD!mOBIq`g(U`IUoQMb zwLr`vxm*kD0sHYtf;mEwq24 z4s%qU+5tCo`^SD@)UT{3E#qw_tT?uF8}Wi#bSI{uNzr4PWthd^8Grjb{a2m$y;NGg z{T*24h)lnK3S#PA!(7asqni5p-RKy|q+!9jXfwL#r?$v0ll`JEo7Fqs!YH>zb|BSLs^0NSq@X|^q_bx1`WrQ9 z_ByXoyM4O`T@3ve_T3pdU?M5nv+rvYc2eD+hQK|EPr$QlJsPf1fgXu3L0A0?;7fiB zYtY2cK9cbriEW7Ue!}LPt2bj2gQlsYI;VVB{Pg{8)UEcT=l=xx8NGc25S9Ub3RArB z5hCOhcs{iXCUAA_@&ZMnNnBk^kGaJxZjXD*O6>7aS)=}6M^R}ktr|W|CO7X!sV($j z$DDo~KG%pjuD=Zo<=lWp6D?qu0_w}bz>dMyYmXj<$HxL!i|){Grsao~c*pBLMSZ9O zoA=c|@|QbaJGEJ>(6#8GcKjeFF|2t$K1Hj*JcdrXnfB27rBH=wK{F-k$46GYZW6`> zn=E(eGt)Fj^d44+9a@Z4-yVCYkgAW_MqO(`rJMS1DI)_sh5p~9ZV8oPA#+6E@By!Q zBMG3h0%*FNYpOp+3}cvUNdrF-fDI8`&IquZ*r9J=G}e(K|A)i~k>wd8=LnHc0dA(_ z18_W1MRnS*+rZ9_&UY8nd_Y}$C5qZ(cbBavI_T0%W7)E2p_q=CpQrT^)<5I4__Xnj z$95HkW}p*_wa-xK)YN)PVCDK}$hiI)3_nWpE2uhAOu!?%5WRxtUFaNgLt-JgY?#|m zp-RQPWqn%g`m$cF;nmP-*f)vw##xk!^#;)4gPdU9z7R8OktnATx{=D2rSwgef0Kk5 z%QtGW{BFvETukBDV-g*`{139rm!!(y#mZ;xy3}=_khyQ)PD9Nw;;F;@ffY!vaTj3t ziZ!mf_Qz}=!I!v+@9>1aPL#lccqWMXI+(AD)wP>w#UU2O3EDr>Cbk_VY_Wnel+ZD# zGfO;*jS-00GCJ`vPw2DKB_0`3Vw$nUcqFI~C8m(%5&O+T{n!zV*3GN&hME(&>#yQ_Fb1A)bWpe7(|eGIXO@YJb#@{RZETEH1LgDz(^UTCBOm7Jag{ z;;>e6!rpK|z2gUH9($}AYvHY0#mDxBHv+Y`$frf>Jx3`c`UV`4zLIH{Z#Ny#qRmBE zeH8Hq-PW~U?xVEG$0+j}EAzfS4+V%R^}bjuWcPp-@wdf8)3HnYfY#cPzoW!zo9HSdft97nOqoenF9xb(}rd& z9Wv{Y+JY~P7o4N6n+`#=%Clk~oH{-+4oRVhx~?HJ@AE8A;pJXMf5412juv!E;u%Uw zE=s}=%Juq1&?LRWX8U98i`uoAAJ#gP2H)9OMUNc6XQ5nb{MurtmC^8<3V%aQktjqi zYkx{@%i>S%jy=od#QwvnddPO;^JQv|j2y;B>K58+9aGmnNcN0XV;ydbzJLXw?Qz)# zhdSl{($=tFz2nCqB+vIN@W*u2rK!F_&GhFk)jOa?ZzxfxT9wvVZJ7=3rO00Kp%(e1 z=+-VR`V97L_UJLah^mHJ%=EGr*#Y^j!?wtKrcec=r*Wq3vX(#uc7av}<-=dK>^bxu zKHrnyq8^E2y1~n&>=*tiwZJ$uqv_E7wC-ECZ_OnAkwbF5pie@SkEmVxnTTr*U>G`; zTp@F=f21KE-HUZKL|}<@=rz3_)f9tJzn)cjzy1woT7DkS*W0$}i>zTIhnE>v8ZBiS z7S*(o-N=@n)anb+9iOm8Pil{*cBt4@VP>Z@4E0xo_6A*z{2b(VG-On_SDcVJ+M`Q( zDAX!ImQe4AV3A_4IBAR93(+LlE>Yn-x-jW#TV(5L@yRR?yNy9LvXJtyYC~R3Qg!@V98UCC=edWB^ieUmz znACqySlWWMM$c}I`ig9(?XA%+NJjgm4*RepBilu`FEjK!>91n*i?nm|XfR{( zB5|@25A9&*o=vKMrLM)kjE(*gYq9&*#v!>&Zl5|!$;w5 zHq)!LZZ5Z>$IRsY#2!6KtAO{==WXx~tqpI74{H(ha;#qIv^TpYFK45XMnOh%CLVs5 zTxKJ-lxPFen>LAv*HfG#MHETg0+zgAZAy*EhCxkTup8}W(*_UZvo=k99qSoJ$MhA) zIu$!%Jv~6jfjGb@(M*F{!=aFuEQI;Nf36m5$N0);y>wTz7+(W0P&e$~hVk`kc&{zG zxQO%=_5KKZY^t}N>Yd2P@Dlx!ZFI%xQrF!NbrQ6iq2h_ZQYF^?8cCYNSoeqDR1yyW zlWDK;Og;Dx?Z;r9%$!h<{wQ)Xk>Nu)eX5v)4auu-OV_#U&;;Q_56#wp1T;0*r6Y@O z)b{{MW70y$O&J3){B@djJ_jWPtHc*k)AZ~Ie@_{<=m{-$ffj$fbUeAPt*kw12d?zP zPoal?fPsdCxH%|%n>Jw(8&d7~UhUb0R`b>6*j&unzp?1sbn+0L4`Y8E#v!Nwl6F)F zuw^6P_Eo-sw0Vv35KR99WQsc<@WV2{$|Yr|474j&@= z(TDR@e~UV7k3OeX6qKb+PV+b&$qc?;sF`-Fk1R|A*cyhy@7Us3VsG(MUi> zdl++D!w12$7sjWln&~)emyGh7x`i>&8vYV~n@vb?zSGzdRk#o%wS1et=q=bYEuZSs zboFMsamAMK=xnGj&n~h5ZzONKpO(b>Sty5#fhP%-z0*Gkx6Zd|jflx@+OJ_|T3BB5 zW-|G>qCBYRk0XZZ>j4}o?j(7ACfd$U02U25{id@>n%>o-`cDrdPLb)iPz2rQAVP3aJk!s`%0za}K^QwB=9n$#Db$A|w(`q; zxQ&-(+<|@zg1KLOpm(5gwkYt>?|hK(hFJtOd~Zz`r(mJ)Aa;gZt)(cI+VMYN-0|Ow1R3M&Oa$A%myeLK1pNm@T!xh; zj_6KS0?iwrehh;*)!R^n9QYe?uzK(4pahejXG_>i;6%P_X|bL3B$2fq{XczSV)v$ z2^c7cw_@k2v@gP0E)j7V2ZNuzt|fXvTw86kH5?2afmRh=>n#I^rZ7LvnTK|%kDNBH z{N!o6XP_o%*uV(B-Z|CrRBZ^<~UI-8hMj zyhc5w@H`Zw@oJ0RC$3WWscSE#lh`lOaGjD*{yB)@llDmv36+#ChtSUxcL1 zbD=+9wApYhGVs@AGIGcietuJzA9%>Zbzx^&8<;QYqwB23F7R(7)*4DHz_%;<%_t=n zet1c7ADkYDChIPsVVWpRgUT3Bc(Iom<%G9{`@QR_MmW=iu1Wn@;K9&fWCtVKLO@16 zqroi~yOB;GXitVxe46_PI+2{_p4bPDuvLJV@k{8sNn9)aeNqzpLXAW=lE}Xj6?H*~ ze4ksHMDAdOBSl(M2cWt^tXktELp=8w`w3 zzQXJBBVsHZ_)e@1h}mqWBJGfoc##OGjZlq;K_D!5k`$x+!7R)D1Twhg{)95pa_fb( zdRFQd7>M0&c|_7oi&vcA?!t}NQid}=3Q^bH3#M?neQAHYja=tf87XsXXTnEiy0t~z zzj8--DNF3vm$4H0oH9N=v7aizUJ$5jpZ}CBvY!{&$r4BObJ8Ln197H^*%(dGgO{L# zF;eOyswz#T@#$Y3QQ{7xQemFZx1~!g8&Tr;KMXPsQUazjJi@sd`-f{kM`J96=QkT( z9zk%89%1}5O10>|k;wD)Yephx>MbLYh5GqG>IWfBP7HftC>h_5IDc=6vrl!^rEHZ< z`?Z$hVJ*5FS8(buT^VZ)PXrTK^#rGp-&7x&JnpW^36K`3$?xYxw3E3?^`Q{yL{_szmp-!DPmG)1# z@FXR|2lYCljn~}^`;bxo$$i+1{5Gr#>TJQq7ZuSX;2=l*HH-~gv`q%hsEal)tk=ZH zyoQ0ATC{!lC2q-P;wM~;YX+C`IBCh>U4;zd7O`w5?t4IwD(-ob!NuKf6nCBw_X422 z__|++YXdX7FV#^UG%6m|?5FG0r*OAB9T9GG1McJOf;_i?yCpk?+?K*-kR z@#&XfP-VKG4e%cz;_fGgggEy_xn3q9eT}3rHzpk245Le#fJC}ny(kCg^%O>Uro7XQ z?ydw0d7gAcSJHa^_lR;1qn`Bkmt0u7>fsjsbuwBe@_hZtk;s|)A4Vbz_3s0z-+&M4 zfNn;U$#^H?Y~8{T^1T1Z3#oOi^fQ!?J~AEGFz^*B4xe%9KWzxt{R3JBPB`##;bcBA zH~=_NwCYOB06G#J6Z%fcG8MhJ6qR8UuNF`-1a|Yj9Sv>}9RtXaH|t%5gd} zeIKq#w`mQ-!3khIZ4jxgK+}6UF)P4Oa==JQKSbQ#irU5S?@#-mZ zxHrS&(dy8RnqhTVOqa;au9F(v0=tb0`6i_uc2INy`g^ zc=Y`^o#KJbllxSTMJgy|O9ZTizFOzWTJ*fw{N2Scs=xO2s42n#)w zD$%0tRa(5UUY+_Gb?S9Ew+osH&pt}^FGCBMa_&;F#5+jxm$cC%^UkpWM~Ce=$t;1f z))JoC3pW^_A85ji0b}lfEw{7xs)(R4d+Jftbo+_^Z*EY+&+4a9HRE;n!;oaVlw-`b zd8WH_{j;dn7Ggk#+KdEt%i{XFHTFBPPsWAN>&z+LN(=8v+$pVM`{Zvu%WZoZgi9aj zn94NJ{|Fr0Cu6))wpe5>VA;$EejDgfec*je9NQ;XadCW~TrR{70ZJeE2_`Pm-VSCk zCM-j#`;o#tGTL(ndH^f8zz@O*atnM!-!sH(a}$ZdculWE)OD|b68@_fu2nA65>e36 z8;;ijZS;a*`f50eOt111^A5+k7u!SL0aMUMCLpy=YhrFmxGx)XUt*nhF%q;VzKuy> zd*W#$L^Q!yrS`5kCSQxXwm<7fxFlK=<3yfJ{%mQN;7_-2b z1;#A!|Dy%???m=raml|Jq>apG{)_15)t*qdxxL+5ZN36O{o3QX*-1YD8SMRc=?vA255JftBunIn?d+nR}dljv#&$v^@1wj5Z&B zo!8Te9QJiT5Iej=BCofd)gGPiQ#){I>9e7Dcl-Qx`0 zXa}C9-XeE+ikun>~Y;_UV)!F_Cb1|`HSJ6-k=%3aZ1{|QIVV1 zm^=L|E6piA+&%ul8W_+W;MN<-o9Wx;_pb5zdpu6B*^26h8F3b^+1%{!!Ee{P4fyu3 zC*)Sl<|eP#yh7G4Xm)|n>khd)XDQsq7hcGI3)t%wKeDR`1CgmOG%Fo2DuB5ssL+qO zdP8RXqAL(gH_aLB#t$|dq>}YooFS)K6gMxLuMpLR=Bhr$_#@93UdVsQS?O^1fhQMc zrXPKFy6_v*L8Z+Z0`y&Io~>Nra)ko{)WJA}fyxN|IJ=++3JfB88|6xfQyUf}q^DVgkdWu5e>;=iX8?XGWd7s1!QJGaF$4yaoG4Zk+unV;PJW5ryY7@-b}&&#AT*a^DmHsE2vD**fOldY{$-vY8E^uPK`iEGvIKVE1Ftg%BbT z-w^(m6Q0V8Ks0ybuO7A(s?n<^g#koq0{grIGxWVsp>L;!+@5ii@mx z2}OE6iN8&cCzH<*t(9msqP?=%ocC~XX>)N&Q*n_OS%1Ud&{N6e52=k76qm&F<`V%wL)FF&G1+rfsaGyS%OfDtF<;5j;Q@!pa9U=u^;x&WRg_D3qB@qP~ z@uGfJhW>&}VHT*3D41?;v$GZ)|J@y>igK!KM?3P!HYF%yN7;h$=rEA?HLCMH`SXj- z@q+oq<#!i06<6IkuBo^_GQOp_E$AL~vhK-4my46`cNc)) zorTaeGLF7bY3(mZyB(lGh&kmv-DU{yvb}Y-qdI>?!O{` zW!|XvpD*p8foBJFS^_x*eWX+W6-K4nfTyeA4Lg#_Phf(A>!tj~x%$zS#Rp9JH|CEj zH)a0;@T`Pfx{)fVCAkMoM!D$A%0rOX23zbvT)`Nm-dO|=)He9Za-{$#{5I0*Q!ryT z$*I%g;>P@PQfXw{SXtZ9CE82yZ8X*jo&~?+Mx#2)#lY91-ygtz`6bjgkarJ_CD9+z z`|hN%6d6aIu^fIX)e~8r$&SB+j05oHuaVASsq<2@qjl8rxY3k9>7UrK2)rtxH|xjO zklq_J>`3)F48672;0F+=KscK4x8r@N^e0q58V0?fyNL9g&h?Au!KT7TVk|`p7Z>;D zZ$qcb^pf;oO8ExLJ@aTXc|PLwd-Bx3h1+B@Ly*}EnRg+df`-4{D03LNqtu=+z_84& zkZBy}7KKb3xnWowwHLc4t z{Wh;~Q=YOb52dIzm0MUEK|HJg~L`9!~`27>{@!Z71;}eym z6y?Q~WYRQsdEt-pmFMybzmu=5&!;$$1!sYVpoPPt&lXlIb73jqCf{o+e63LVrK#|_ zLPldQXZZ317M~GBF=gYe9F0>P7D`3NG?5-JAiie713OgWKYvJw z1L8tR>TMQ=n;^)g{4+#a(tVQT^#eIW;gv3{?*U;*Tne$Y)N@#*?-2Qa&&n6<|JQ*0 z|Io7;R+%}bQNT6uE3fLy# zashh<>=$sofDa0|QNRHKhXgzz;IM%B&wE)Y5fE3REK~`|Q~&;FQGaF`{P&p`n#-52 zz<*2|Hd|^cYbvYe&I_|-^{v&ll~uJB0-aedJ7of*06#RBji-KSLeAZyBbfyOeH(K$ z`U>Fbguan3aS+%Y5@0L-2*13Hu!=I517!su;qMW6oY1jwyTI=ic;Ob5?+N_p=WseY z7pL$T{zlVh2zcVZLePuhryR~fe^Sr~1ijSrbQowf{x!f8{|5v;o#|4bAGjTjzDv+Q zE$9VM*5#nTN6-)8?2rPx=K~zdLBCJXD}tUC!pExuPrvzT48sDS?cc}cQM;d}wB63B zLB_%<4F6e0`dxum8h|G|$oWFz>5IK4Wk%}%_muct`gbz=X^Ph?DbsK?dj>$SD4$W< zI@5F%TgUi(PRaH|bZKQ$W~ThMEcFoZr^ti>KF07R%1}8+(mfl6CxM@h`nHLeC-NR@ zCC14qz&DDUcDi?=a4Yac-#d?|>8_E&V~n21jh}SwMSO-Ab1a?vQxQog`KLHD#(d<7P z_)+U6=|%e@$wIHdw~6s0+cBDh&mVH&pU8o4N57?UG;}tXCxG%n4*EJY&}jNE$bt9f zz<-7Cd6Q|-WP~{IWVe9}IGWxLP&f=c^`GoA5xs9@k{t4&_{)^l$Ix zpq~W0O~t&>*w5)pg#Vc(?73WwySsR{(h5BJB}LE+v~n%OpQbFkU1B(NFg}?$N_{N{ z{wKhb-Lm`1px`g{lpLSR!RPM`5C4oG(xI?}1Em-nfzj+)mjjPy8Kcs7=D@Gbfqyp# z{$vjPw2@tk)u0Oa(ez)y@K_Bk_%w=z61|mRq1_b-St|Yb&*YseJdTiarNWykz77DuHg7;xhMk_XK^{v@)w+S=1o>GFF!3AspM>%}|MHB`)z$`-i| z0jIAMuQzXYTUW`v>Z;UxO#Gks^mG+^1x0o@9!ri?({e$&2vOkCrJxV8=c$e$d!8yP zWfUOt(*kd(I^~U9-qhI3TUiUa)wn!>S3S*0Mx#v|O{esnyo3 zI2PEinBQb~Tygp3?N-#SX};Zx66kvV8a!6Dx*8YQZS$L#EpyaV)@FA%Mi;8C>A{m& zx6kKxgjVy~yL~G?KDVvM-cwU;wfD@cuEl3f!v*wd!AEu7ybEgUYvx(%kgyP@wz{FV zVcrGR4TK@O>UlNu8mjB()m9M+VQZ_Zt1qajx}c6&TI$({rMkAJrn;`iYPDGKQh24? z=b%1qT?N%f>y|*HN$Z)#!#-xWE)QN8`)q9^##&E%wF4$;uca1WvZ%wt?Gl>j!dqUS zzs7M@wWT_^2F=ad+I3@%&%fHb%4o1kO9LbtJFlT9lJ!O(r;5YfiD$}0kk+c*A)NqR z0bGF9+G1h#1kD8whpVs8VzE>kUonr?rBN3Bx&gN0FPyDyi(EY!D&toN{+ZlrE_W0! zqBv|7>SFs)C91z$9ryvI}q>(nk}T+ zD2hpo)!?N!sey_+9V;;Gd=8g02R#<>vLcZ2>MB{&>F`o=HT1?)Lb54&>aZb){s;3cQ`UEL!HcG z>2d}`4#;x#u4!A2^E%nkb7s~z0V8U3fSn(DuJQG2M)6W`yi%G(B?A?bsaX=yb30xZD@8uN(M0nRckO)Pqf@w=F|1<7_mq zHNnNjHQ;@BlYzQ>*5CVQeT*P8w_ioKhT3|4d3O!HyQi^I8lqEy#}tShpkc| zzoWHT&O+o0<~bZM~s25PhRXSK{UN~J|OxPRAaTB^Xc z(-Z9V2eDo<+PRYZOt})XU?m32^6I8Lu?yofqZp&9b@{`>H>4(6OGA2^U`rZD5Y7}P z&F;rAFijMEfMM3~vBe(*XZC9HA1mDr?NLdTcyw%18POsx_#lGAP=M#)3vl$6 zyo`hcc>%ZANd-i#*BeqQnZK+=v~r~%Ns0vBE~PSruZSs?Y`Uuq_}RLx(%miA=iQy) zCQ}gOa*4$R_I=d2j4NNda~?8Id`JzjWGXv*Jdnk?WB4i9uJzyxh4clC|I0x4jdZjj zjyV;1uP33rZc<-M(h@}3qXEM(jNCSd6fRF;?bo)Y45HVZh$WI@tNNY}b_rz+F(UR6S- zG-ybF(o^b>C<}3XQWaC>#XYaQ?==gBrF>al*8f|g{Cvg@AM)Z#K-mweJl%z7m;WE& zNQC6Cn314hW}kE-ml3l2&wa=>mN%O@c?!yk2`*XcDL>9C-#D8SN+{_?@$Bj;8?(x{ ziSiN}zZ*w9BtHos%_=YN9VNV@l1in6EGIGaMN4D-<@vXSQzW5)+4iIJLn@y#({e#5 zU|Uv!Z23=t+*n@TZ%Ihs-86xUJOvpM1}eEl;K;%m;tfP+s0QN+{{LR_XpP@hZxbic-Ejhm=r`v+VLReHsdr ziDY?sk0_z+e>DCnNV&2de-4~czI-PlVH;%{f-EoL9ON6zEB%64z<;LyxhPD$r2dVv zoP=5O8$oLfRYHE15sA3Ghd4J2FWDJks_z?UwSJ!1fb(_=XX0-b{$#yny>R`I(jC!M jIPg^-osl&FrCdok6%ko?i6eN794i9-2cO1}l~n!@1EZ-^ literal 0 HcmV?d00001 diff --git a/hash_operations.cpp b/hash_operations.cpp index 0d1d1ca..5217417 100644 --- a/hash_operations.cpp +++ b/hash_operations.cpp @@ -4,11 +4,15 @@ #include #include -#ifdef __x86_64__ +#if defined(__x86_64__) || defined(_M_X64) #include #define USE_X86_SIMD 1 +#elif defined(__aarch64__) || defined(_M_ARM64) +#include +#define USE_ARM_NEON 1 #else #define USE_X86_SIMD 0 +#define USE_ARM_NEON 0 #endif unsigned long long compute_hash(const char* data, size_t len) { @@ -20,20 +24,27 @@ unsigned long long compute_hash(const char* data, size_t len) { for (; i + 16 <= len; i += 16) { __m128i chunk = _mm_loadu_si128(reinterpret_cast(data + i)); - // Extract bytes and update hash + // Extract bytes and update hash - use direct byte access + const char* chunk_bytes = reinterpret_cast(&chunk); for (int j = 0; j < 16; j++) { - unsigned char byte = _mm_extract_epi16(chunk, j / 2); - if (j % 2 == 0) { - byte = byte & 0xFF; - } else { - byte = (byte >> 8) & 0xFF; - } + unsigned char byte = chunk_bytes[j]; + hash = ((hash << 5) + hash) + byte; + } + } +#elif USE_ARM_NEON + // ARM64 optimized path using NEON + for (; i + 16 <= len; i += 16) { + uint8x16_t chunk = vld1q_u8(reinterpret_cast(data + i)); + + // Process each byte from the NEON vector + for (int j = 0; j < 16; j++) { + unsigned char byte = vgetq_lane_u8(chunk, j); hash = ((hash << 5) + hash) + byte; } } #endif - // Process remaining bytes (or all bytes on non-x86) + // Process remaining bytes (or all bytes on scalar path) for (; i < len; i++) { hash = ((hash << 5) + hash) + data[i]; } diff --git a/main.cpp b/main.cpp index 1c6e1a7..7105620 100644 --- a/main.cpp +++ b/main.cpp @@ -10,10 +10,13 @@ #include "memory_operations.h" #include "polynomial_eval.h" -#ifdef __x86_64__ +#if defined(__x86_64__) || defined(_M_X64) #define USE_X86_SIMD 1 +#elif defined(__aarch64__) || defined(_M_ARM64) +#define USE_ARM_NEON 1 #else #define USE_X86_SIMD 0 +#define USE_ARM_NEON 0 #endif int main() { @@ -21,9 +24,11 @@ int main() { std::cout << " Compute Benchmark Suite" << std::endl; #if USE_X86_SIMD std::cout << " x86-64 with SSE2 Optimizations" << std::endl; +#elif USE_ARM_NEON + std::cout << " ARM64 with NEON Optimizations" << std::endl; #else std::cout << " Generic Build (No SIMD)" << std::endl; - std::cout << " NOTE: This code is optimized for x86-64" << std::endl; + std::cout << " NOTE: This code supports x86-64 and ARM64" << std::endl; #endif std::cout << "========================================" << std::endl; diff --git a/matrix_operations.cpp b/matrix_operations.cpp index f85a899..1e2ae8f 100644 --- a/matrix_operations.cpp +++ b/matrix_operations.cpp @@ -4,11 +4,15 @@ #include #include -#ifdef __x86_64__ +#if defined(__x86_64__) || defined(_M_X64) #include #define USE_X86_SIMD 1 +#elif defined(__aarch64__) || defined(_M_ARM64) +#include +#define USE_ARM_NEON 1 #else #define USE_X86_SIMD 0 +#define USE_ARM_NEON 0 #endif Matrix::Matrix(size_t r, size_t c) : rows(r), cols(c) { @@ -58,6 +62,32 @@ Matrix Matrix::multiply(const Matrix& other) const { sum += data[i][k] * other.data[k][j]; } + result.data[i][j] = sum; + } + } +#elif USE_ARM_NEON + // ARM64 optimized path using NEON + for (size_t i = 0; i < rows; i++) { + for (size_t j = 0; j < other.cols; j++) { + float64x2_t sum_vec = vdupq_n_f64(0.0); + size_t k = 0; + + // Process 2 elements at a time with NEON + for (; k + 1 < cols; k += 2) { + float64x2_t a_vec = vld1q_f64(&data[i][k]); + double b_vals[2] = {other.data[k][j], other.data[k+1][j]}; + float64x2_t b_vec = vld1q_f64(b_vals); + sum_vec = vfmaq_f64(sum_vec, a_vec, b_vec); + } + + // Horizontal add + double sum = vgetq_lane_f64(sum_vec, 0) + vgetq_lane_f64(sum_vec, 1); + + // Handle remaining element + if (k < cols) { + sum += data[i][k] * other.data[k][j]; + } + result.data[i][j] = sum; } } diff --git a/memory_operations.cpp b/memory_operations.cpp index 0e5b970..644214b 100644 --- a/memory_operations.cpp +++ b/memory_operations.cpp @@ -3,11 +3,15 @@ #include #include -#ifdef __x86_64__ +#if defined(__x86_64__) || defined(_M_X64) #include #define USE_X86_SIMD 1 +#elif defined(__aarch64__) || defined(_M_ARM64) +#include +#define USE_ARM_NEON 1 #else #define USE_X86_SIMD 0 +#define USE_ARM_NEON 0 #endif void fast_memcpy(void* dest, const void* src, size_t n) { @@ -21,9 +25,15 @@ void fast_memcpy(void* dest, const void* src, size_t n) { __m128i chunk = _mm_loadu_si128(reinterpret_cast(s + i)); _mm_storeu_si128(reinterpret_cast<__m128i*>(d + i), chunk); } +#elif USE_ARM_NEON + // ARM64 optimized path using NEON + for (; i + 16 <= n; i += 16) { + uint8x16_t chunk = vld1q_u8(reinterpret_cast(s + i)); + vst1q_u8(reinterpret_cast(d + i), chunk); + } #endif - // Copy remaining bytes (or all on non-x86) + // Copy remaining bytes (or all on scalar path) for (; i < n; i++) { d[i] = s[i]; } diff --git a/polynomial_eval.cpp b/polynomial_eval.cpp index db2247a..87c779e 100644 --- a/polynomial_eval.cpp +++ b/polynomial_eval.cpp @@ -2,11 +2,15 @@ #include #include -#ifdef __x86_64__ +#if defined(__x86_64__) || defined(_M_X64) #include #define USE_X86_SIMD 1 +#elif defined(__aarch64__) || defined(_M_ARM64) +#include +#define USE_ARM_NEON 1 #else #define USE_X86_SIMD 0 +#define USE_ARM_NEON 0 #endif double polynomial_eval_sse(double x, const std::vector& coeffs) { @@ -39,6 +43,34 @@ double polynomial_eval_sse(double x, const std::vector& coeffs) { result += coeffs[i] * power_arr[0]; } + return result; +#elif USE_ARM_NEON + // ARM64 optimized path using NEON + float64x2_t result_vec = vdupq_n_f64(0.0); + double powers[2] = {1.0, x}; + float64x2_t power_vec = vld1q_f64(powers); + float64x2_t power_mult = vdupq_n_f64(x * x); + + size_t i = 0; + + // Process 2 coefficients at a time + for (; i + 1 < coeffs.size(); i += 2) { + double coeffs_arr[2] = {coeffs[i], coeffs[i + 1]}; + float64x2_t coeff_vec = vld1q_f64(coeffs_arr); + float64x2_t term = vmulq_f64(coeff_vec, power_vec); + result_vec = vaddq_f64(result_vec, term); + power_vec = vmulq_f64(power_vec, power_mult); + } + + // Horizontal add + double result = vgetq_lane_f64(result_vec, 0) + vgetq_lane_f64(result_vec, 1); + + // Handle remaining coefficient + if (i < coeffs.size()) { + double power = vgetq_lane_f64(power_vec, 0); + result += coeffs[i] * power; + } + return result; #else // Fallback scalar implementation diff --git a/string_search.cpp b/string_search.cpp index 7c5c340..2884c44 100644 --- a/string_search.cpp +++ b/string_search.cpp @@ -2,11 +2,15 @@ #include #include -#ifdef __x86_64__ +#if defined(__x86_64__) || defined(_M_X64) #include #define USE_X86_SIMD 1 +#elif defined(__aarch64__) || defined(_M_ARM64) +#include +#define USE_ARM_NEON 1 #else #define USE_X86_SIMD 0 +#define USE_ARM_NEON 0 #endif int simd_string_search(const std::string& text, const std::string& pattern) { @@ -44,6 +48,28 @@ int simd_string_search(const std::string& text, const std::string& pattern) { } } } +#elif USE_ARM_NEON + // ARM64 optimized path using NEON + uint8x16_t first_char_vec = vdupq_n_u8(first_char); + + for (; i + 16 <= text_len - pattern_len + 1; i += 16) { + uint8x16_t text_chunk = vld1q_u8(reinterpret_cast(text.data() + i)); + uint8x16_t cmp = vceqq_u8(text_chunk, first_char_vec); + + // Check each potential match by examining comparison result + for (int bit = 0; bit < 16 && i + bit <= text_len - pattern_len; bit++) { + if (vgetq_lane_u8(cmp, bit) != 0) { + bool match = true; + for (size_t j = 1; j < pattern_len; j++) { + if (text[i + bit + j] != pattern[j]) { + match = false; + break; + } + } + if (match) count++; + } + } + } #endif // Handle remaining characters (or all on non-x86) From 537b1164960867520511a3d3edac6cb7c999d6e0 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 23 Oct 2025 15:52:22 +0000 Subject: [PATCH 4/4] Fix ARM NEON compilation errors with vgetq_lane_u8 constant requirements Co-authored-by: JoeStech <4088382+JoeStech@users.noreply.github.com> --- hash_operations.cpp | 6 ++++-- string_search.cpp | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/hash_operations.cpp b/hash_operations.cpp index 5217417..0bdf700 100644 --- a/hash_operations.cpp +++ b/hash_operations.cpp @@ -36,9 +36,11 @@ unsigned long long compute_hash(const char* data, size_t len) { for (; i + 16 <= len; i += 16) { uint8x16_t chunk = vld1q_u8(reinterpret_cast(data + i)); - // Process each byte from the NEON vector + // Store vector to array and process bytes + uint8_t chunk_bytes[16]; + vst1q_u8(chunk_bytes, chunk); for (int j = 0; j < 16; j++) { - unsigned char byte = vgetq_lane_u8(chunk, j); + unsigned char byte = chunk_bytes[j]; hash = ((hash << 5) + hash) + byte; } } diff --git a/string_search.cpp b/string_search.cpp index 2884c44..49fc985 100644 --- a/string_search.cpp +++ b/string_search.cpp @@ -56,9 +56,11 @@ int simd_string_search(const std::string& text, const std::string& pattern) { uint8x16_t text_chunk = vld1q_u8(reinterpret_cast(text.data() + i)); uint8x16_t cmp = vceqq_u8(text_chunk, first_char_vec); - // Check each potential match by examining comparison result + // Store comparison result to array and check each potential match + uint8_t cmp_result[16]; + vst1q_u8(cmp_result, cmp); for (int bit = 0; bit < 16 && i + bit <= text_len - pattern_len; bit++) { - if (vgetq_lane_u8(cmp, bit) != 0) { + if (cmp_result[bit] != 0) { bool match = true; for (size_t j = 1; j < pattern_len; j++) { if (text[i + bit + j] != pattern[j]) {