From 47f0c3d1cb0d77a4724f232b9f1d0ec764b98460 Mon Sep 17 00:00:00 2001 From: jiayaozhang Date: Fri, 11 Feb 2022 21:23:38 +0800 Subject: [PATCH] finish hw07 yeah! --- .../index/alignalloc.h.61C002AA54843252.idx | Bin 0 -> 4410 bytes .../index/main.cpp.D4B0CBCBF15E99F4.idx | Bin 0 -> 3858 bytes .../index/ndarray.h.D2E81DB3AE0CDD83.idx | Bin 0 -> 3810 bytes .../index/ticktock.h.DA79E8F882CB1FB0.idx | Bin 0 -> 242 bytes .../index/wangsrng.h.FA1F5931EA52ADFF.idx | Bin 0 -> 2656 bytes ANSWER.md | 136 ++++++++++++++++-- CMakeLists.txt | 4 +- main.cpp | 122 +++++++++++++--- 8 files changed, 227 insertions(+), 35 deletions(-) create mode 100644 .cache/clangd/index/alignalloc.h.61C002AA54843252.idx create mode 100644 .cache/clangd/index/main.cpp.D4B0CBCBF15E99F4.idx create mode 100644 .cache/clangd/index/ndarray.h.D2E81DB3AE0CDD83.idx create mode 100644 .cache/clangd/index/ticktock.h.DA79E8F882CB1FB0.idx create mode 100644 .cache/clangd/index/wangsrng.h.FA1F5931EA52ADFF.idx diff --git a/.cache/clangd/index/alignalloc.h.61C002AA54843252.idx b/.cache/clangd/index/alignalloc.h.61C002AA54843252.idx new file mode 100644 index 0000000000000000000000000000000000000000..65a0292ead23edb573ccc1d51d73c2f3a0f478c3 GIT binary patch literal 4410 zcmZ8j30M?I7Va7vgy~`!aHhx1z%Ve)4I`JR@!%8@C8$ANK~yB_D5&8`gii@%MU6)i z5KX+}#tp7UqpKLNRq=>d@Q6_`W{p|%X`+xH@kH4qtudaIi>b>{B z(Zez`<9#_U{gq)=v!+%S@f^o#;lHA?e8xG#ac=G$SG9gaUeTPdE!x{#Vz;hqKlj3n zqa$|zJA1<~JuhE;tM#L4$LqL5=eKn~3HWFFhe?f_hn<>xWZu8#9dv7YIBw3qy_HF! zzuAA<ofJ;}Oxy{IYV|)MIjdvPs z8PD>jMYg`XVzB$Y-!j5m^9x=JzPkNFe6DfNu7w7&H^g(x2li1@oXfn8H- zTdt{3SD4-=C%!hcABnErUHH|8w*qGc-7lVeJ^Rb3$&c^f=so}1gB532ul(!zkl4;+ zA+eh~Dq=H^eat6+{{Gj;Ue><&S%3QnEnmzS{^F14x)ugsNI$dX*5@^455LHX(Ud2T z3|w!xSe{e40z~}LK8U_k)a?83CUp%3+hf3f$$?9Ph8E{^_R}a{pNW|KHx8-(H zUk`yx^y4Idvwt`PWrvuZAGWv$WYr0r&6|_r`o;}ppzv{%*SekfxiDU<(!p2M5x$NH z&WEz5-++>B(;J#riYgyiQ^{BA7$|(|@x&Y5M%7;g&IC4?%%&&?3J<4W``h`iyul0b zem&vq33Wb{H90TGOjvbxbca?I1#9N>^C<&`&;3l3T)Okmg92v-zEms?VW99M3x56M zQepl@4^#8AVTU4qb7CCU;YPDJmH|-c!$DH~lc)>p+;GV(0Y2q{s1Lghd{F|L= z>mxe_E*ALNrrD_s6n_34?bCI+ecl59!ubRFfs}#5zsa`*Z@2v?TaGkOso0}fWT5b? zmebDrt92tN7XzkkBHSk8>3o2VJmCy4Ks3I$&=R@KT4SJGG^}bSTr;e4Jf2(i{ucut zazT4n2wXha5+OzeGEmN1L$WFc-&%AW?gIEQF)Vr$EfR7fVLl`Lh zmOhI=m_D|&9b&#GCz-R&xeOG3f5ayXTj$yz2%JCoYNj|dgn_~zIOhE+w`WbMz$L)` zQDRge1BE}Ni?6=*$*_yyYv64rTPg#EKdOB+YIbySG35e-IZ5NC31$Eu@Nnb$7{O1+ zy6m*C+s)gux7Cut!yqzHb~ygz^!HX{lmkJ1mG0ta8#WtzDg&_pzz=H+S2{q@Py3}C19V?Yz$$b@V7rUjNOv; zMjd24oWF+fYoM|@9?F`#?^*Yay1n!oM2aui*tcvT1BLJ0{OHx%ob@{(QozPYF*1aK z!vFVX`&#ckzXUn5I+0J*Nirz>W9t*oAv<0zqMQhkAgF~P1`rAEDlVxPL_(LPZ|uF( z)~!%iz)HJlw=z(6D4#m5;&o4#+7Pul$EsSMy7sISoc8qtXTuDpi@PqVV=*p|IAI1k?};MaORw30CS6jQeaNLtNu#;@3=ql}ZHVr%y8(f@9Q6 z@QW;#n27j6FfD5&stKwiOgp3zy`B(or0c0)Y6}RNkGGF6W_;qHlj>{L`8u9TLa-37 zT)!m53Q4X}6pix3B15GTTH=QZjZ#ZgOk(jkR9LV>mRGbIodVMiRmJtFra;J2iqj~f zFfA)6PUF|Yv@D!>fhd_UE$bytqf)x7nL^?wR9Zmz1w=rbsI;6|mXk2oxPkaI5D`lg zg^jn;*QpQ6))xZ<>>KkURpx@ zmJq%2wo{hjZCR7?ZB%2JmZca?s^#C5I@{;G1SUn(Xg%s2l}c%;9xo5~a8v(Z63LAj zl8{7DD{+L$0UmHDH|9NN1rSb^b}}f@Dm9a4G>}RuG)0eJLno zJc~*L_yOK{p;Q{A8Dw~FWF8de8fAHBiLP(Y)?^!+(%jO@-r3(orR>=3{;pA;G0%#A z!A}RXp=6URLj@;V$7FMEiIB`q@ujI;$`_qs0@z35c~n1{ejy2x7dp8>Ri;m>C=+7h|2a_CyB~$VAJI*-z?m`?B0`;)`#}r_2rhob7UZM{+*Vr zW*go-;eb4a8y&8UP1^25QjIb0FKPF;AL$G(uZnFC5&6lF9ksc-EB>CZL(aO)C1bBw zFRJ?e%{+nLde!h6$KcxHZ&S|Q3T>2|zHYegF(T_v>pRhEs_w|YFgVH$d)>j~*53Oa zeXXYwUp_Gi>P}|H9Pr+KvDDc9zRe#Ejp=hGm)_hc`|-BRmAu<0{;>4NoZ~$gn}b@m zt@y|I(FX|&Ip2MJV&ne0M_F|-EzOPB_T8C#!1kR2r@r01eZs)IHrIMR`=&yF(uuj>Gy_H@WJrJw`Fom$*|W>(b7;{=M;y(RTX< zyMiNrdixmf>VzV&GP0*l9-q7@`cE-E5BGfh__>#f2#U~GXtM~K6;illsHpya9wcN0 zfjppxY;qw8hCce`*mat-_Ck6&B&2j)0TnKji#hIAd3Nt-?FEk^;Y`QHP%N8V%<*{( zifhiE^sI)2gpS9-IH$?Q91n8pl^Dl#%OT-J$4fyeIk}kQDF+JP@0LoNc-A61F5-yb z>noD0{^1Nhgefj8JHpfqC}7=#-~8lw-;~JJQ%bB$lz=6 z!&{=RDuL^@u4@3G0OtS~ycIxEZc*;+eWU!N=CNp%XcfZu#h=6uEL=HyC#D;w_8|y}@<5X+dLP+&%0wQoH- z2cDea1LOc4$Rk7c_gmgO*lo##C-5ZQZ1TA26Z6X~c?kg_mp1 z_wJp{CQey$mJZU%H8)cczj=?VC4)I2hrAl8)0)=3B(mgF5A>w&?VaDnamtftvg^O? zgU`PCx3QIGS8b;j;lkn?((E!KV9QbjiUhbUm`*F|sR*Y2Ex6Yp_g&sJm)=tDEf0RB zF7A$Wncrze_!(-TB123;6&A#wsv)GM~Pw4TOB0Ff8)qDEi5(YEGF zO*};2z?(`{#cuyXt}{WT1WKx|YH&`d=4=>5p&*nxhu)~zc&~I8L@J=7l1fP1=FREbtvt{_wbQhCleB8y#pq?>x(@3{1oaB&06lx5e?^dB`+i*ZX?_W7@OBLgysaj(%v zGRxr3qUlbh(Yz}xRnE_(8&4zd4+<+eX4f;!1-PHs{-n&eNT}HkytLCgDnFh54p!2Y+f{)>k-5vg-kIU zrw) z)X$lOC?DigPM01oNlM}GhNut}QaH6u1*w65{j}Ri7b(LbLxM$8Wjs`N+;Gr`5v@e- zhDD3_r+t8VSm9oVNGsCHv7~DBXe>hR@)o0Pv7>aFvc*o)Y09?ul1x*!y|-kVvc*d2 zG-ZoJrPGuxR!Lc9hruwjOVKIFGTd|z^I{k30R!?+VKDC&`@<4rsB7BsMbU4 zh4Z&`TSI^$=;u51;Hlh@H0zQ6XYkl=^S!Zbg(=aWe%@*MS|Psck(C4U8+Qm0_m_qH zYX=9q5x81Ay!p&RKxD8ph{Q~BVJ+n-`%-^AZ{NT?BLTr9(g_M1qj-|t@)xgo-K}=+8bBgKTCWM_p SU~L6#I9yvu37_MWJmNnezx+-B literal 0 HcmV?d00001 diff --git a/.cache/clangd/index/ndarray.h.D2E81DB3AE0CDD83.idx b/.cache/clangd/index/ndarray.h.D2E81DB3AE0CDD83.idx new file mode 100644 index 0000000000000000000000000000000000000000..95d2300cfc80f98efb51e8fc0ce7e29396cecb4e GIT binary patch literal 3810 zcmYjT3s4kS7M<4&(68t9bkFpBm>FSUoIwOkAzL6Ki+q%5sR%J522oKVpok*sibP5c z8YLLS-v(sSsG%Yu8e`NXn^Y?CQ;WO0x*rvdg+^msbq&e7L}I0OUw3=Ep^Dq{?z!FX zzWX`%y)-!|=epTo$X+yg&C=49B}Rk6AkzPul`AUZk-^Z$7z}IL-_0+cayr3wVfw#L z8#mp)IpAwZUH<5+&>zo!v9)2!+E))&ytHHG@wWzVOgnYtVMil7nDR~gvzD5|)a%ne z6a&)9T;6khFGR_dv~_xoOQi2TI$Y+?l?*ByMMMh zPehOUY1G{tJ>mNcehArHb!za-H(SXcPJcG*@K5GlEuWod@BXvW;+#5S`F_6TK-!bQ zVUm3&@AvsxZ^S;0da!wsZFfZ}TYPX~)VJ|JJ_+4$LujwMI6Y=tS&nDi!_+_ilGD3m zPSDY1scZV)xjN?F{cC6Mw4JGKNPc@)+hggdL1TZDRRYn<6yaRaG)4;oD~$oQ#L!KBF&K5gHzsdk&S3 zxphn3quyEX#j?=w_%o+!*LP;#H8TQzU-XK8MQC`^Ij-^R=3g4r_iHV+E-VWT&$u+A zw`EJhbQ2Rq-{(1=SA>RVXYUfLlM_oq8H>SyaR}xWVZcU)R-OT2G$HA?*}bjTtJD*Q z8^cXl7TOo4wN`&p+%ofu$k^!)QGAqL5gNYWxG$@x%+xJ1PKu}FbWsr+zAB+Bh0VP2 zp89>S(QCr8(C{6;9h<_FTsuX^LEqnnoV#ci)Q4uXr^fnte7L7s&18YQz>8&};cfRG zX1rhCyHRA^^cy)i$Ds%f?=)N*e}BBGP-H?WzS6nUqX-Q@T5{xt@dexaMaE3=B6rbn zMQHeOzVeLp#n@^SW2Cs5GYg8)@Q<%u%$XqwxoS0bA-)RcSSO0z5 z+^)uYG(XygWuf7JkB{{CP5hxq#r?eBhGn7Q|LCyydb++|rRH)fo+@HlX!zBNfrLEw zps2=inPr(B%R<9@Hy)fHc6r}bkqM*zm5F5$iqP; z3cC61*H_ZsSL>}AS(<4c)Q4u%KPvF&wqGY~Q~l^bdIoALHR6_xoXFv@ML)9|9fZ7u@>0aW(Igb#*>{50esDW>&wahc1 z2AbhGgH`o7f;S`gW)u$nM{pBzH=%II1EI+u){MMR7le|JZx;#xFA-XRTr1+CHVG{Ru>(m^ zBLsIMYbUb9L?XBwS-X)PI05_@IF)&o(L{%n!mBozdY%#NusXt49f&y5wCET| za0;IyU206SPVO3+M{ow8Ayqz>B5qZ*_6GIh2XjTTHYGx0}Ce{Be=v@5~IIYhRX!Kw92jJVMYJ_ zex$2^_fm)-J+V9q;lwZJT1VV(NeiaUC}L57Zi5D)3?R=%u!iK(kI0h{)W5r!w{@K_ z@f<@iAIZl+8xUOKtr!9BtT}!A?povJ3$l@&ck4!Gr+FAey8iPeZ(LZ?_bM{t#g4@; zco!;~?yJsM*cswS{glTlsG*0cS{|vO-fy%0{85Y*(Zib3O_MW1|1dgNG-ym|+>Aa@HIu6O7j7Bi92%li+-Fz67if z?N7s@UIf4`;31j?b@~Hv_O&@|-{GKna$KWLDVloA;N|2+Z#|{zN#;o&ND7<_bIKI~ ztO<_hqAmKt;<ez39ZSV2EZCYvefN0}kaaDej!7Yc_>j!X`oZOw|wV-*T9@^qFyurfH+Y=zF&YEDkzv!t`(rBlT-{0DX2IxZb(6; zcv1pG=**K506Kf%e)ms7b;VxV&#C5A2{t7-%bX=aa-i_3>*_@Xc7?N1AJy9l00S2s z)2_VUR!)m%v1hTy9xF;KOXT@evtnV*i}DZLc5z@~^zsd+V2(Et)JuU43 literal 0 HcmV?d00001 diff --git a/.cache/clangd/index/ticktock.h.DA79E8F882CB1FB0.idx b/.cache/clangd/index/ticktock.h.DA79E8F882CB1FB0.idx new file mode 100644 index 0000000000000000000000000000000000000000..8fbe77e6cf2d76338206589ecde6ca67b9a5bd84 GIT binary patch literal 242 zcmWIYbaQ*fz`)>~;#rZKT9U{DWD5XsaY<2T4v;Pa;)*$@XRP@SIqlmz%lix!E` zFKK^wnD=Z&=E2jEm+P-)H>Q63C-UKC)&I8B4A+D#FR|M*sh=sX%uNDX$551-R;&l4 z%?vKRvs(P*A|n?gGY2ydND2hD8=FgPQjb{!6lURIfeRO<<|Kloi;I$r#X$@v5CH*< QZ#G;%W9H}6$H>G004xqqYybcN literal 0 HcmV?d00001 diff --git a/.cache/clangd/index/wangsrng.h.FA1F5931EA52ADFF.idx b/.cache/clangd/index/wangsrng.h.FA1F5931EA52ADFF.idx new file mode 100644 index 0000000000000000000000000000000000000000..ea75a4a442e7a1941141c7f58bf7761afc3c91b1 GIT binary patch literal 2656 zcmY*b3se+U7#=PQ;&Pc8_c60P_TetLC@71N5R!|5z={H*kgShH0VN+Pkf4|y)bN#> z@A8pHO-`DopebUQhN0%GkVMT#fN-!B4Ngkf;dE#4pQSyAf9Jd3{P*$S@Bfw*TWst^ zIYmWh+6w1p+3A|>7uhg@Ur;)^5bi{I#c7h zjn0aT<8~B!9X~jJS7v2(ebc;|SMGhPXdbon#?q5J_iS5v%Jb|UpM`t33@oU*+VLG9 z^KeF4(e~Jz7ehaLxb@!n*>_IO>Nub+y}PEn_I9qeXv;mtsqXW7Iy=m;|MoJ^NN&r< za6b1~d&5ulw^uw_-6tNWl|&e~Pic-%a`Q7}t{XKcfjyyn>rvyGdA4?^?7w#}uXLU1 zxwzSV_vM&B!Y92FP_dT1@8%+me4JFS`$~ybs8k|~IQI#$Cw~u~xKPBh!>Mw$q zCD`hIueIgvTNUA>tX);|^z-Da^^c~Gnb}l2u_O0dP&vPI{m~)oGlpfHeU&-Y$DMg~ z?7*RPT(!$GjMw+|saM zIrGwc$-fonZ+P!mbGYI`>mTcmJ@Eh2np5@O%@a?om(RSsR+&E~_W0F`McQiD%5VO0 zpVZarLgg)*JAE0cKZ-ikmv7pZz#=(Ep=d7JAO^}!xGeIKbH7zAYQeBMVRo3e4BOlBtEOQxVzI{zmG-kBtJ|YrW1q27mYd5 zC(b;lo<%I-L%0xw7$kmq(~%EOFZ|lSXMToeI3xs#-w|BhV2GaIu0TqXuhyx}VvzWQ z)aJ|Qc)!^!awUAaI^7@!i9Z?~uh&g*t@C=tm;1wtZ5DcTe>T!V9N z(eO8`Tdd9)nQ5~c)Pa#d>(4t-QgBi@q=15Da#6XCd1Ev&CXhcy3A%)VAO?)=jNJ=L zBR~B7TI7!I7L1X*%pF5TF!JC%^q_x?7@pC?Z4hK;f*eS}EA*fQj1*=?P%jjwusV=h zt=7VwVWjp~TO3G7>o}-4Mkd6#@wQpX^C!epno~OooM?hvB!ohAnxt`L~f| z7Xfqt2dBm;kO|}gK^aM|;0KTkdLqMu4Ztv5L0`Gp_%KWqsqr^KX~+VaKocy0Q79j3 zgwDn&l8N+!+LE(+M)|_&2nvm|I*?6g>m|}OCd~`ZpMEp{MO*D8Cnt>3%xS?r=1ozP z$DF@XacdCkXvv)dG&|Jp*1p$66r(^YKKvhHp$O{l2yq*4wRxwkq}geh)OU$n3pWhhxdm? zLCa)u!Svvzm7NzJPZ;-d1=)BA9ioP*NZFZC4iL1J3r4Ir>kCt1r1Vz$LPZD~tQ-bN zE?Ot7f{~ulYoH&&EXY;_6NQ(+EL`NGZNf0|jzA9!^eLIiO^;zFf;{zF2Qn}QB_Q}l zNOW14PKJCLUmlbQqptEEt9`8nvL^X;29h{buZocx{LdPC%Yf9Nmr>OHyv*=yVjx2< zHja+tp;w3jY2ybtkWFRNI#7ZtK|-DDx75UbwZ~3`Y9C;?fKYR@=41%ZdAT$4Ov1zK Z|38FBROMBUC+xj8AIY34XJMr0^B-4>Tjl@& literal 0 HcmV?d00001 diff --git a/ANSWER.md b/ANSWER.md index 83349d8..454cb86 100644 --- a/ANSWER.md +++ b/ANSWER.md @@ -1,23 +1,101 @@ # 改进前 ``` -这里贴改进前的运行结果。 -matrix_randomize: 100s +t=0: n=1120 +matrix_randomize: 0.00323458s +matrix_randomize: 0.00244834s +matrix_transpose: 0.00449375s +matrix_multiply: 0.507901s +matrix_multiply: 0.482881s +matrix_RtAR: 0.995444s +matrix_trace: 2.1269e-05s +1.75932e+08 +test_func: 1.00941s +t=1: n=928 +matrix_randomize: 0.000608694s +matrix_randomize: 0.00140255s +matrix_transpose: 0.00288064s +matrix_multiply: 0.264023s +matrix_multiply: 0.246176s +matrix_RtAR: 0.513248s +matrix_trace: 0.000323195s +1.00156e+08 +test_func: 0.519699s +t=2: n=1024 +matrix_randomize: 0.00063783s +matrix_randomize: 0.000629075s +matrix_transpose: 0.00313033s +matrix_multiply: 0.358542s +matrix_multiply: 0.359807s +matrix_RtAR: 0.721592s +matrix_trace: 5.6674e-05s +1.34324e+08 +test_func: 0.728255s +t=3: n=1056 +matrix_randomize: 0.000902491s +matrix_randomize: 0.000780195s +matrix_transpose: 0.00387635s +matrix_multiply: 0.3977s +matrix_multiply: 0.406229s +matrix_RtAR: 0.808082s +matrix_trace: 8.5799e-05s +1.47405e+08 +test_func: 0.816765s +overall: 3.07871s ``` # 改进后 ``` -这里贴改进后的运行结果。 -matrix_randomize: 0.01s +t=0: n=1120 +matrix_randomize: 0.00347719s +matrix_randomize: 0.00243836s +matrix_transpose: 0.00556866s +matrix_multiply: 0.106349s +matrix_multiply: 0.0895811s +matrix_RtAR: 0.208163s +matrix_trace: 0.000257071s +1.76466e+08 +test_func: 0.223656s +t=1: n=928 +matrix_randomize: 0.00487382s +matrix_randomize: 0.00245537s +matrix_transpose: 0.00080515s +matrix_multiply: 0.0597233s +matrix_multiply: 0.0543553s +matrix_RtAR: 0.115015s +matrix_trace: 0.000258041s +1.00585e+08 +test_func: 0.127455s +t=2: n=1024 +matrix_randomize: 0.000620257s +matrix_randomize: 0.00175319s +matrix_transpose: 0.00213852s +matrix_multiply: 0.0661278s +matrix_multiply: 0.065029s +matrix_RtAR: 0.133401s +matrix_trace: 0.000278096s +1.34691e+08 +test_func: 0.143559s +t=3: n=1056 +matrix_randomize: 0.00101717s +matrix_randomize: 0.00216396s +matrix_transpose: 0.00102147s +matrix_multiply: 0.0686898s +matrix_multiply: 0.0643553s +matrix_RtAR: 0.134125s +matrix_trace: 0.000949931s +1.47779e+08 +test_func: 0.146622s +overall: 0.643736s ``` # 加速比 -matrix_randomize: 10000x -matrix_transpose: 10000x -matrix_multiply: 10000x -matrix_RtAR: 10000x +matrix_randomize: 10x +matrix_transpose: 3x +matrix_multiply: 7x +matrix_RtAR: 8x > 如果记录了多种优化方法,可以做表格比较 @@ -27,20 +105,54 @@ matrix_RtAR: 10000x > matrix_randomize -请回答。 - + answer: + 这是因为 YX 序的数组,X 方向在内存空间中的排布是连续的 + YX 序的循环,其 X 是内层循环体,因此在先后执行的时间上是连续的。 + 对于 YX 序(列主序,C/C++)的数组,请用 YX 序遍历(x变量做内层循环体) + + answer: + _mm_stream_ps 可以一次性写入 16 字节到挂起队列,更加高效了 + 他的第二参数是一个 __m128 类型,可以配合其他手写的 SIMD 指令使用 + 不过,_mm_stream_ps 写入的地址必须对齐到 16 字节,否则会产生段错误等异常 + 需要注意,stream 系列指令写入的地址,必须是连续的,中间不能有跨步,否则无法合并写入,会产生有中间数据读的带宽 + > matrix_transpose -请回答。 + answer: + 循环是 XY 序的,虽然 out 也是 XY 序的没问题,但是 in 相当于一个 YX 序的二维数组, + 从而在内存看来访存是跳跃的,违背了空间局域性。因为每次跳跃了 ny,所以只要缓存容量小于 ny 就无法命中。 + 解决方法当然还是循环分块。 + 这样只需要块的大小 blockSize^2 小于缓存容量,即可保证全部命中。 + + answer: + tbb::simple_partitioner 自带莫顿序遍历功能 + 保证对齐到16字节 > matrix_multiply -请回答。 + answer: + out(x, y) 始终在一个地址不动(一般)。 + lhs(x, t) 每次跳跃 n 间隔的访问(坏)。 + rhs(t, y) 连续的顺序访问(好)。 + 因为存在不连续的 lhs 和一直不动的 out,导致矢量化失败,一次只能处理一个标量,CPU也无法启动指令级并行(ILP) + + answer: + #pragma omp unroll + + answer: + out(i, j) 连续 32 次顺序访问(好)。 + lhs(i, t) 连续 32 次顺序访问(好)。 + rhs(t, j) 32 次在一个地址不动(一般)。 + 这样就消除不连续的访问了,从而内部的 i 循环可以顺利矢量化,且多个循环体之间没有依赖关系,CPU得以启动指令级并行,缓存预取也能正常工作,快好多! + > matrix_RtAR +static 预先分配好空间 + 请回答。 # 我的创新点 如果有,请说明。 + diff --git a/CMakeLists.txt b/CMakeLists.txt index 5d76276..3cd661c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -11,8 +11,8 @@ add_executable(main main.cpp) find_package(OpenMP REQUIRED) target_link_libraries(main PUBLIC OpenMP::OpenMP_CXX) -#find_package(TBB REQUIRED) -#target_link_libraries(main PUBLIC TBB::tbb) +find_package(TBB REQUIRED) +target_link_libraries(main PUBLIC TBB::tbb) if (MSVC) target_compile_options(main PUBLIC /fp:fast /arch:AVX) diff --git a/main.cpp b/main.cpp index d5af053..1d464b9 100644 --- a/main.cpp +++ b/main.cpp @@ -7,16 +7,28 @@ // 作业中有很多个问句,请通过注释回答问题,并改进其代码,以使其更快 // 并行可以用 OpenMP 也可以用 TBB +#include #include -//#include // _mm 系列指令都来自这个头文件 -//#include // 如果上面那个不行,试试这个 +#include // _mm 系列指令都来自这个头文件 +#include +#include +#include +#include +// #include // 如果上面那个不行,试试这个 +#include +#include #include "ndarray.h" #include "wangsrng.h" #include "ticktock.h" // Matrix 是 YX 序的二维浮点数组:mat(x, y) = mat.data()[y * mat.shape(0) + x] -using Matrix = ndarray<2, float>; +// using Matrix = ndarray<2, float>; +using Matrix = ndarray<2, float, 0, 0, AlignedAllocator >; // 注意:默认对齐到 64 字节,如需 4096 字节,请用 ndarray<2, float, AlignedAllocator<4096, float>> +// 4096 那么大,即64个缓存行。 +// 这样一次随机访问之后会伴随着64次顺序访问,能被CPU检测到,从而启动缓存行预取,避免了等待数据抵达前空转浪费时间 + + static void matrix_randomize(Matrix &out) { TICK(matrix_randomize); @@ -24,11 +36,30 @@ static void matrix_randomize(Matrix &out) { size_t ny = out.shape(1); // 这个循环为什么不够高效?如何优化? 10 分 -#pragma omp parallel for collapse(2) - for (int x = 0; x < nx; x++) { - for (int y = 0; y < ny; y++) { - float val = wangsrng(x, y).next_float(); - out(x, y) = val; + // answer: 这是因为 YX 序的数组,X 方向在内存空间中的排布是连续的 + // YX 序的循环,其 X 是内层循环体,因此在先后执行的时间上是连续的。 + // 对于 YX 序(列主序,C/C++)的数组,请用 YX 序遍历(x变量做内层循环体) + + // #pragma omp parallel for collapse(2) + // for (int x = 0; x < nx; x++) { + // for (int y = 0; y < ny; y++) { + // float val = wangsrng(x, y).next_float(); + // out(x, y) = val; + // } + // } + + +// _mm_stream_ps 可以一次性写入 16 字节到挂起队列,更加高效了 +// 他的第二参数是一个 __m128 类型,可以配合其他手写的 SIMD 指令使用 +// 不过,_mm_stream_ps 写入的地址必须对齐到 16 字节,否则会产生段错误等异常 +// 需要注意,stream 系列指令写入的地址,必须是连续的,中间不能有跨步,否则无法合并写入,会产生有中间数据读的带宽 + #pragma omp parallel for collapse(2) + for (int y = 0; y < ny; y++) { + for (int x = 0; x < nx; x+=4) { + + __m128 tmp = {wangsrng(x, y).next_float(), wangsrng(x+1, y).next_float(), wangsrng(x+2, y).next_float(), wangsrng(x+3, y).next_float()}; + _mm_stream_ps(&out(x,y), tmp); + } } TOCK(matrix_randomize); @@ -41,12 +72,32 @@ static void matrix_transpose(Matrix &out, Matrix const &in) { out.reshape(ny, nx); // 这个循环为什么不够高效?如何优化? 15 分 -#pragma omp parallel for collapse(2) - for (int x = 0; x < nx; x++) { - for (int y = 0; y < ny; y++) { - out(y, x) = in(x, y); - } - } + // answer: 循环是 XY 序的,虽然 out 也是 XY 序的没问题,但是 in 相当于一个 YX 序的二维数组, + // 从而在内存看来访存是跳跃的,违背了空间局域性。因为每次跳跃了 ny,所以只要缓存容量小于 ny 就无法命中。 + + // #pragma omp parallel for collapse(2) + // for (int x = 0; x < nx; x++) { + // for (int y = 0; y < ny; y++) { + // out(y, x) = in(x, y); + // } + // } + + // 解决方法当然还是循环分块。 + // 这样只需要块的大小 blockSize^2 小于缓存容量,即可保证全部命中。 + constexpr int block_size = 64; + tbb::parallel_for(tbb::blocked_range2d(0,ny, block_size, 0, nx, block_size), + [&](const tbb::blocked_range2d &r){ + for(size_t y=r.cols().begin(); y{1024, 1024}), RtA(std::array{1024, 1024}); + matrix_transpose(Rt, R); matrix_multiply(RtA, Rt, A); matrix_multiply(RtAR, RtA, R); @@ -106,6 +184,7 @@ static void test_func(size_t n) { Matrix RtAR; matrix_RtAR(RtAR, R, A); + std::cout << matrix_trace(RtAR) << std::endl; TOCK(test_func); @@ -116,6 +195,7 @@ int main() { TICK(overall); for (int t = 0; t < 4; t++) { size_t n = 32 * (rng.next_uint64() % 16 + 24); + // size_t n = 1<<13; std::cout << "t=" << t << ": n=" << n << std::endl; test_func(n); }