From aa8b9f5c56821c7f5ba8c82d3b51634b8fbd83fd Mon Sep 17 00:00:00 2001 From: HaoHaoXueXiHxy <20211005@bjtu.edu.cn> Date: Wed, 3 Dec 2025 12:11:55 +0800 Subject: [PATCH 1/2] =?UTF-8?q?2025=E5=B9=B4=E6=98=87=E8=85=BEAI=E5=88=9B?= =?UTF-8?q?=E6=96=B0=E5=A4=A7=E8=B5=9B-=E6=98=87=E6=80=9D=E6=A8=A1?= =?UTF-8?q?=E5=9E=8B=E5=BC=80=E5=8F=91=E6=8C=91=E6=88=98=E8=B5=9B=EF=BC=88?= =?UTF-8?q?S1=E8=B5=9B=E5=AD=A3)--MoE=E8=B5=9B=E9=A2=98--=E6=88=91?= =?UTF-8?q?=E6=83=B3=E5=A5=BD=E5=A5=BD=E7=9D=A1=E4=B8=80=E8=A7=89=E9=98=9F?= =?UTF-8?q?=E6=8F=90=E4=BA=A4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../README.md" | 262 ++++++++++++++++++ .../patches.zip" | Bin 0 -> 30103 bytes 2 files changed, 262 insertions(+) create mode 100644 "2025-Ascend-Innovation-Contest/S1/MoE/\346\210\221\346\203\263\345\245\275\345\245\275\347\235\241\344\270\200\350\247\211/README.md" create mode 100644 "2025-Ascend-Innovation-Contest/S1/MoE/\346\210\221\346\203\263\345\245\275\345\245\275\347\235\241\344\270\200\350\247\211/patches.zip" diff --git "a/2025-Ascend-Innovation-Contest/S1/MoE/\346\210\221\346\203\263\345\245\275\345\245\275\347\235\241\344\270\200\350\247\211/README.md" "b/2025-Ascend-Innovation-Contest/S1/MoE/\346\210\221\346\203\263\345\245\275\345\245\275\347\235\241\344\270\200\350\247\211/README.md" new file mode 100644 index 00000000..0b0ad724 --- /dev/null +++ "b/2025-Ascend-Innovation-Contest/S1/MoE/\346\210\221\346\203\263\345\245\275\345\245\275\347\235\241\344\270\200\350\247\211/README.md" @@ -0,0 +1,262 @@ +#### 1. MoE模块前向优化 +在moe模块可以拆分 prefill(长序列预填充)和 decode(单 token 解码)阶段逻辑,能够解决解决 “两阶段结果不一致” 和 “推理效率低” 问题。 + +DeepseekMoE模块 +显式拆分prefill/decode阶段 +```python +def forward(self, hidden_states): + if orig_shape[1] == 1: # decode阶段(单token) + y = self.moe_infer_decode(...).view(*orig_shape) + else: # prefill阶段(长序列) + y = self.moe_infer_prefill(...).view(*orig_shape) +``` +通过`sequence_length == 1`显式区分两阶段,分别调用专属推理函数,避免共享逻辑导致的结果偏差。 + +decode阶段(单token推理):极简逐专家计算 +```python +@no_grad() +def moe_infer_decode(self, x, flat_expert_indices, flat_expert_weights): + expert_cache = ops.zeros_like(x) + for i in range(self.num_experts_per_tok): + expert_id = flat_expert_indices[i].item() # 取第i个选中的专家ID + weight = flat_expert_weights[i].item() # 取对应权重 + expert = self.experts[expert_id] # 选中专家网络 + expert_out = expert(x) # 专家处理单token输入 + expert_cache += expert_out * weight # 加权累加输出 + return expert_cache +``` +单token场景下,直接遍历门控网络选中的`num_experts_per_tok`个专家,逐个计算并加权累加; +无冗余操作:单token无需批量索引/掩码,直接遍历激活专家,计算效率最大化; + `@no_grad()`:关闭梯度计算,避免推理阶段显存占用; +权重直接乘:跳过复杂的掩码/索引,保证计算语义简单可追溯。 + +prefill阶段(长序列推理):批量高效计算 +```python +@no_grad() +def moe_infer_prefill(self, x, flat_expert_indices, flat_expert_weights): + expert_cache = ops.zeros_like(x) + idxs = flat_expert_indices.argsort() # 按专家ID排序,便于批量处理 + tokens_per_expert = flat_expert_indices.bincount().cumsum(0) # 每个专家的token数量 + token_idxs = idxs // self.num_experts_per_tok # 还原token在原序列中的索引 + + for i, end_idx in enumerate(tokens_per_expert): + start_idx = 0 if i == 0 else tokens_per_expert[i-1] + if start_idx == end_idx: continue # 无token的专家跳过 + + expert = self.experts[i] + length = (end_idx - start_idx).item() + # 1. 切片获取当前专家处理的token索引(MindSpore原生操作) + exp_token_idx = ops.narrow(token_idxs, 0, start_idx, length) + # 2. 批量提取token的输入特征 + expert_tokens = F.embedding(exp_token_idx, x) + # 3. 专家计算 + 权重加权 + expert_out = expert(expert_tokens) + expert_out = expert_out.mul(F.embedding(ops.narrow(idxs, 0, start_idx, length), flat_expert_weights)) + # 4. 批量累加结果到缓存(替代循环赋值) + expert_cache = mindspore.mint.scatter_add( + expert_cache, 0, + exp_token_idx.view(-1, 1).tile((1, x.shape[-1])), # 扩展索引到特征维度 + expert_out + ) + return expert_cache +``` +长序列场景下,先按专家ID排序token,再批量处理每个专家的所有token,避免逐token循环; + +排序+批量处理:将同专家的token归拢,一次调用专家网络处理所有token,计算效率提升数倍; +`ops.narrow`替代Python切片:适配MindSpore张量操作,避免维度不匹配; +`scatter_add`批量累加:替代逐token赋值,减少显存交互开销; +跳过空专家:无token的专家直接跳过,减少无效循环。 + +--- + +Qwen2MoeSparseMoeBlock模块 +(deepseek模块的实现参考了培训给出的一些示例,基于这个经验,在qwen2_moe模块也进行了拆分获取了收益) + +与DeepseekMoE的“完全拆分函数”不同,Qwen2-MoE在同一个`forward`函数内通过分支区分阶段(代码写的没那么漂亮,主打一个效率): +```python +def forward(self, hidden_states): + if sequence_length == 1: # decode阶段 + # 仅循环活跃专家的精简逻辑 + ... + else: # prefill阶段 + # 遍历所有专家的标准逻辑 + ... +``` + +decode阶段(单token):活跃专家筛选+复用prefill核心逻辑 +```python +if sequence_length == 1: + # 1. 生成专家掩码(与prefill格式完全一致) + expert_mask = nn.functional.one_hot(selected_experts, num_classes=self.num_experts).permute(2, 1, 0) + # 2. 筛选活跃专家(仅处理被选中的专家,减少循环次数) + expert_usage = ops.sum(expert_mask, dim=(1, 2)) # 统计每个专家被选中的次数 + active_experts = ops.nonzero(expert_usage > 0, as_tuple=False).squeeze(-1) + # 3. 遍历活跃专家(复用prefill的index_add逻辑) + for expert_idx_tensor in active_experts: + expert_idx = int(expert_idx_tensor.asnumpy().item()) + expert_layer = self.experts[expert_idx] + idx, top_x = ops.nonzero(expert_mask[expert_idx], as_tuple=True) + if 0 not in idx.shape: + current_state = hidden_states[None, top_x].reshape(-1, hidden_dim) + current_hidden_states = expert_layer(current_state) * routing_weights[top_x, idx, None] + # 关键:与prefill使用相同的index_add累加,保证结果一致 + final_hidden_states = final_hidden_states.index_add( + 0, top_x.int(), current_hidden_states.to(hidden_states.dtype) + ) +``` +- **核心优化**: + - 掩码格式对齐:decode阶段生成与prefill完全一致的`expert_mask`,消除掩码维度/排列差异导致的结果偏差; + - 活跃专家筛选:仅循环被选中的专家(而非所有专家),单token推理循环次数从`num_experts`降至`top_k`(通常2),效率提升; + - 累加逻辑一致:复用`index_add`而非直接赋值,保证两阶段输出的数学语义完全一致。 + +prefill阶段(长序列):标准掩码+全专家遍历 +```python +else: + # 1. 生成专家掩码:(num_experts, top_k, batch*seq_len) + expert_mask = nn.functional.one_hot(selected_experts, num_classes=self.num_experts).permute(2, 1, 0) + # 2. 遍历所有专家(保证长序列处理兼容性) + for expert_idx in range(self.num_experts): + expert_layer = self.experts[expert_idx] + idx, top_x = ops.nonzero(expert_mask[expert_idx], as_tuple=True) + if 0 not in idx.shape: + current_state = hidden_states[None, top_x].reshape(-1, hidden_dim) + current_hidden_states = expert_layer(current_state) * routing_weights[top_x, idx, None] + # 核心:index_add累加,保证批量token结果正确 + final_hidden_states = final_hidden_states.index_add( + 0, top_x.int(), current_hidden_states.to(hidden_states.dtype) + ) +``` +- **核心保留**: + - 全专家遍历:长序列场景下,确保所有可能被选中的专家都被处理,避免遗漏; + - 掩码索引:通过`nonzero`提取每个专家处理的token索引,批量计算后用`index_add`累加,保证结果与原始逻辑一致。 + +共享专家(Shared Expert):两阶段统一逻辑 +```python +shared_expert_output = self.shared_expert(hidden_states) +shared_expert_output = F.sigmoid(self.shared_expert_gate(hidden_states)) * shared_expert_output +final_hidden_states = final_hidden_states + shared_expert_output +``` +- 共享专家是所有输入都会经过的“基础专家”,用于兜底计算; +- 门控权重:通过`sigmoid`激活的门控网络(`shared_expert_gate`)控制共享专家的输出强度,两阶段逻辑完全一致,避免额外偏差。 + +这样把prefill和decode分开来写带来了在decode阶段有达到秒级的收益。这样做完总分提升到了160+。 + + +#### 2. 培训中提到的使用ops算子替换索引操作 + +```python +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + # x1 = x[..., : x.shape[-1] // 2] + # x2 = x[..., x.shape[-1] // 2 :] + x1, x2 = ops.split(x, x.shape[-1] // 2, dim = -1) + return ops.cat((-x2, x1), dim=-1) + + +... + + # cos = cos[position_ids].unsqueeze(unsqueeze_dim) + # sin = sin[position_ids].unsqueeze(unsqueeze_dim) + cos = F.embedding(position_ids, cos).unsqueeze(unsqueeze_dim) + sin = F.embedding(position_ids, sin).unsqueeze(unsqueeze_dim) + +... + + hidden_states_expand = ops.unsqueeze(hidden_states, 2) + hidden_states = hidden_states_expand.broadcast_to((batch, num_key_value_heads, n_rep, slen, head_dim)) + +... + # self.cos_cached[:seq_len].to(dtype=x.dtype), + # self.sin_cached[:seq_len].to(dtype=x.dtype), + ops.narrow(self.cos_cached, 0, 0, seq_len).to(dtype=x.dtype), + ops.narrow(self.sin_cached, 0, 0, seq_len).to(dtype=x.dtype), + +... + + attention_mask_expanded = ops.unsqueeze(ops.unsqueeze(attention_mask, dim=1), dim=2) + padding_mask = ops.narrow(causal_mask, -1, 0, mask_length) + attention_mask_expanded + +``` +使用ops替换掉tensor索引的一些操作能够带来一些细微的收益(几百ms),有的替换甚至没有收益(也可能是那块代码没执行)。 + +#### 3. 使用StaticCache,并开启JIT优化 + +使用StaticCache需要先在cache_utils.py文件中修改一下StaticCache类的update函数中的某一部分(被注释掉的是原来的): +```python + else: + # use index_add for mindspore since tensor slice is too slow and no implementation of index_copy + # k_out = ops.index_add(k_out, 2, cache_position.int(), key_states) + # v_out = ops.index_add(v_out, 2, cache_position.int(), value_states) + k_out.index_add_(2, cache_position.int(), key_states) + v_out.index_add_(2, cache_position.int(), value_states) +``` + +由于默认是DynamicCache,因此需要在utils.py中generate接口处修改一些地方,支持StaticCache的创建使用 +```python + else: + model_type = getattr(self.config, 'model_type', '') + supports_cache_position = model_type in ['qwen2_moe'] + # print('StaticCache') + if ( + hasattr(self, '_supports_static_cache') + and self._supports_static_cache + and not requires_cross_attention_cache + and supports_cache_position + ): + if hasattr(self.config, "_pre_quantization_dtype"): + cache_dtype = self.config._pre_quantization_dtype + else: + cache_dtype = self.dtype + # print('StaticCache') + model_kwargs[cache_name] = self._get_cache( + cache_implementation="static", + max_batch_size=batch_size, + max_cache_len=max_cache_length, + model_kwargs=model_kwargs, + ) + else: + num_hidden_layers = self.config.get_text_config().num_hidden_layers + model_kwargs[cache_name] = ( + DynamicCache(num_hidden_layers) + if not requires_cross_attention_cache + else EncoderDecoderCache(DynamicCache(num_hidden_layers), DynamicCache(num_hidden_layers)) + ) +``` +这里StaticCache只限制了在qwen2_moe中使用,因为我尝试在deepseek中使用最后好像出现了mismatch,可能精度上有一些损失。 + +最后JIT优化使用在了RMSNorm部分,还在Qwen2MoeModel的_update_causal_mask部分加了JIT,都带来了收益 +```python + @mindspore.jit + def forward(self, hidden_states): + # if use_pyboost(): + # return F.rms_norm(hidden_states, self.weight, self.variance_epsilon) + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(mindspore.float32) + variance = ops.mean(hidden_states.pow(2), -1, keepdim=True) + hidden_states = hidden_states * ops.rsqrt(variance + self.variance_epsilon) + return self.weight * hidden_states.to(input_dtype) + + ... + + @mindspore.jit + def _update_causal_mask( + self, + attention_mask: mindspore.Tensor, + input_tensor: mindspore.Tensor, + cache_position: mindspore.Tensor, + past_key_values: Cache, + output_attentions: bool, + ): +``` + +最后JIT优化牺牲了一些prefill的时延,但是对decode的时延带来了很大的提升,所以最后总分也提升到了280+。 + +--- +#### 最终评测结果 + +| 评测指标 | 平均得分 | +| :-------: | :----------: | +| 峰值显存 | 100 | +| Prefill时延 | 62.3694 | +| Decode时延 | 696.3557 | +| **总分** | **286.2417** | diff --git "a/2025-Ascend-Innovation-Contest/S1/MoE/\346\210\221\346\203\263\345\245\275\345\245\275\347\235\241\344\270\200\350\247\211/patches.zip" "b/2025-Ascend-Innovation-Contest/S1/MoE/\346\210\221\346\203\263\345\245\275\345\245\275\347\235\241\344\270\200\350\247\211/patches.zip" new file mode 100644 index 0000000000000000000000000000000000000000..cd92eb435e67c529bdaa1ce68df8587df05a4326 GIT binary patch literal 30103 zcmagFQ?M{huq1eF+qP}nwr$(CZQHibcWm3X?fvJ*#_rC(Om{RMI`XlytFlr-8W;o$ z;D3#bS_|#}x%}T71ON_zgQ1JDg{d>WstP0ku!1DKrO|)O)dLy;5ab*f006|~zqbni zA)x$sLi`UxB6Fz6F**RioE-oF!T$$=fq{XEmeJPI+4g@y(z;uC()|zk|AYL$(EoqP zH#)Y?TVhGSb^46vz8p+MhDk}LlBlIk*~6*5&CZ?9Rd&@XR9HmeNNNNF00nwZ-m0^9 zHWQI1U#oK09CC;6Nj2`t5}ps6akaz#Vg#k~ztC&E%m8Kp2}LGck@f$aGEq2|cK(|` zF2)1Kak}KGL2@sQy7;v*G3iw+haR%{1h+q(o@c|J6XPdWgGV_TDDfGSDk_AuJLjS zYo}S9Oj3G{=m7#35R|E8QAfzc#3mICY7{UzHnZP>YCVulFPHXu-Up^iTs%y&QLaO^ zS9+#&{T%H)3<+~*;1W9xRcRX>*PLA^t2X6wxNi=k_0tPZOUyFr)-=iIFXBRB;S=); z<&n+HMRB*7ZLS)*u*ilp9P&#oL$2LYvTqKli(%3iJbzmaW>bTxrh1HgcQ2w=sgthR zV#q_%MyL3amNfuDRP=58h)T>dmr0X27HMkEB`z*87-`bDC|ffyrh@fDAm>Vcg!#A_ z6*iW@TY#b5>yx;TnQ ziDyr9$=?!|H0mJShhGwb1Eo=w-E^Lg8;e(BC!jfrhJWVdU>UeoEn;T#u9J2{EIl#O zGg@)WS!J4N>9Ons)UZ=ih-^e%38WuqJ~G5-`5GlYA~j1ShwKMO&h22+8z<|+`U zUX7p#1!#MyY6Y^w>m^uFT7KHUZfd1})z!D~x9xKOY|ai$|AOPe!j4R;&6K{|>dTCUO6k+BebX+Gca@botNxpUPib+rRPYKF(G#apz%T zR4+1?vtCXGA_Q`HCh^WtJGh%GA}|YG%gLNL%!L%j4L5UcMFcT4=M!Xhfm2y1=VfVj z9Ir=YC=}uxl7jp#!0PaF(6c|lBI+5s?zYIMl7 z8*;07$9l3i%#LO5kH=*fxnmI8CJh)fOh|DhJ;|?B1PjAOV7x-Ou@Or^WR8v$hH+t*{z-$E6aI-Wwk^Wk*` z3CrGRgF-3RD~35u7snw?)dsxoG=*L<8D6M51Q};p&ZX&gpRT-=!r%|8CecIXPM_1Q z1~vr0<=|su5rc@mS+v3+!@&0;{}-)8Fd^y8kYWN5T&WST3vhCMW-NoiGVTK8Q+qiv zq9b()`9U>V$NJ3&@plOx7r;t>iX7qx$Fl2W#1=Fi0z!-zp=bnksn>u>{BW2t_n z;c||3fwSE@PzLHjoZnu^O6})mohm1QJT}~2(>NKtg)8AZVO!_#)akF%2J=G8i#~>gd zc{ESD=F=F)j~-Pu-3~=$)1?}lnoX3w9dp1wkgQ`V@M8*usAIZacSa7)=z$hOcw-|` z48;Mtneog~48Zm5jnT$*HL}N1mI2wKL6QZO8)DD-b4hUBBD$+rGgnfhDb3$ zUV>m8PT-&~{3ekFv=?oVJxU-BwNaleG!rwK5fGV)=Ij9iko~!GR6w$TZZHTPRb7_E z+*Sf2iHo}#QW8H*Q+n@)sU9$m_I;x#Ke>Cg<~MBiY#t{V?t!$@8<&tMDqe6ZRCL;( zSuq3MF@H<|C5AI?DiSP@1L)4yE5N4?`Wk+ZRp8T7C#3 z1OJ=KWirBr=$I8OwdfDc->FRSo)43f15Hw7Hno%@WZNlhPm9d5CJe7J^JvM7WQ{&- zlH{$BkXeuJj8K^Ol~Pg?;VoPEgsC;EOi=7N9#>+MjWykEJ@q$1aoSL+9Y1=2uia%~ zW`)Q$b+k>yYIe3ra8Z>nuO~>Y>A{i|*KGl2Afl9F^F0x~BCBHBPcH97rQy$Q(yKK) zPK+roQnno|h{mk3iPD^5mdTnJI3tY>&93xuQ%INX())zCc&%L#6}R>6o=Ja$BL~mxFk~!$=Ro`xqQNwEGlY|ApZzL2WP9m zaPb`^l)3QF1MiW3j?1l?(A!pKJ3=8!P5gz5WQD8gij(@*^{>Z!cf+`CJcf8h z$hE8jf1%|;s8(bwF6Jc<^X;&~-EKP?==R{CxKoh{$7?)hEanzjQR5iTs^jx7(MSlNn!9f-i%)tun6b>d-Ul^_1qQD0R{OX7PY%+wwtnSLq^+xPW_<*RlJ&$kYQ%f*NpW2CQQ<}{h(co zj49J)tbG89FVlfVozA0TP97$8EdHoF!f_a(pU9*7s{1B(K^f)xt3-_NIfNaU<4joL z24bG5;)KZIA4Z3GCYZQ5&k^v5JO;cXBmDJ2n)mQ*IG#m-;Slm(gQdbgP<;Y1+~iAW zl$$6m`qL1ZTN2}p5n{a*hl>aR%VsuxR(T=LS%pxuh%^ifZ`}1$35Yxdt!PUdu@-#p znTT5HtnA~mX^?o8l*!&S<-sSB4j!8J=n3ukNuJ+Yr^;z#St4A@U;rsI<?)x$R-;`Y{4L(MH*}B_CU$yy2 z&)Sa71^baV@aOH84f$Dj(DpuW`}oGJK_A�~WNuAvoYMhHI_gP6F2ZM>8D3zxjKQ zL}Pze@8|iVcSvn0!a2c4=kf0Y(0{4p+tz-970;SPKHuo;VdGP+LPLQA37(%>XoFYf zmYNb_;M`L@o-u@fAOT*ZzQReGHXCbGHu8bS0yRT(`+eW{3m&%;i7>GOGul{mq`ZG7 z!tOdy_z0B5MiF>T(=mnJ$zv`{qF#f7I(Zm`Ndvc4QO=*OONvpEXq1C8 zuzbpy;x54n(Y%?1sEfMgjnoYt8SzY5mamnVX?AHxzoF2c!CYj{D^fS|RufZvUpHR?V|!X#+XmE4$N zPXpVU=?kEGkihci+t8!*IX@rXZ@{okXid zMz|_Ke-(Ij+K!N0kgMc)IQwUu$VlXEAsHhMa^HRCB92^qv4lk1@dvzOTS9kJ5?X0z zXA!q)y=FrtlVVZHxFr3?uP4;*I1FMnAc8+CC2g7afSw3e#F83*64CODB8^%mfS2Q8%8E|D7 zdpK|Cy9hFdtP*-{=#;k!%HIv;dbMeaz6L<;8wT#sJX?TkXW;L;es-AQMPja(J~4QISqm9$*0RNLH4`OGDO^sbk#?$=HR^Pkl*~y##Qagv5=W${C zD-$Lv)VhAxJ^kLLKA&g^0~wC9eTJjr+SZ-wy|%utjo(-Czu&Xp9a#S>`Cs_G_w-f9 zn(69xr%#X>?uBb$26##LpcAPcUmpcJ&PTljp1nxxy8s?~-jHr|B=y^>7x3_K5>!b% zWIt(_Pl&#gtxb76FX~MXr3`J)-!AdtT(@Og`JP%CFPoNJ^@|sHv$644NRzROS9qgp zEm`pONerJCXuTurKf~}+)ppL^lI@a&uRyL**#KKOV@ zr4sWz)D@O-ymjOc?pKjdf(>5Z#QaQx-*|uf{JoH6upTXId%Hbuc`Ue*GZu-jVDI0N z*8eh3H#WRmy^Zys<39htASuZ9H6I&w#;y$vX-oM)*r_KUdCEi(e3<xz@Vv1Ag=QuEwTrCqT8vr~98BRGq0=vYL~Ye#cr@*JKoZnZ;;DXgm0W zkr*0EZ!#K2?3QYzd77|aGD$+ylge+Sr($kp(35a@6IEq0rDvBGsUl!wsyGbe8Divl z7jz&4V@~3~vcT_4q=^t%a?S@2oFQawi&13CDCEf_Q;exbU6-)>dk~K(x%Ic5Hr_4S zo(@zZ#)6I11cg*VQ!3rtpfbY)1mgG}i zK)Iy1UxIipe#n3fY^kH6A!`^nK41@LE==RMB^DV?2YXxLkCptpMzuY*rNJ!Ko%TehQ9h5C=Gt_j9%E&C##eKT3(1JD0ovYPis#zG^!RZ8 zziIZZxBh)ele01D-^ri6ZW3@E+aBx1d{XBs+_heR@QdYs8Mln}`X`5k859=)^KExy z=Hmr#?PQ)o?0ett^j+(IhMSbtL9Q^%w+An4th$HCsJq;qr8BL&@7N!cC0D(d{X3?c zKepZ;Zogj1{`l9=v{JZz^c8?)Wvnr2R z`n{(V%Rq}Q_*K)+hEsnBGw-Xa{#U#`upaaQ(rD$a=EeBEsVz?bwf=iN6|N_z+@(I> zr9Cfn7D9BM%jKVT$mNf1_5bKiG&_cAM%?}1y`xso@fP9 znk4hHh#%?hEEj87zXhz{=C#<`zvWjvSMUD5F^lmu3DB&OMF4w`;X8 zu_!2S9uZJh`G34SpYP0i;?9;rc})d93PV|kld%3TKe^p|fgSJ7zY1Kn^)0))w)y%O z0sa05^Uwo;Pc(SbAPr{bGz_byZ+ax<+z=Q)qJ_O0Undu0zS~T_D>7EQTCkgQ82XFQ zW%|+SeTLzp+eBpEcFvdm%`Q*T)wJs{uFfiZv|GRn+z!+r+LIwbUJPQDT{lBXR zjI=JM&Mu6Mw5t#?SyGT@O6@4}M!nL~`d&$jviPop@{b6>8<(SM&~H|Vu|HjvYe#tA9(IA?SiC)v+` z^p{wz-~P4-h=KWWGjhLw@9g>Q_v(xFLv4L}2Yn92OqVzqr-&wugmF6ZotUx7V4F}fKYKTZ$X+22EwDa)*EgL{4B;>C;G z@ejp$NJVxima|RXsGAd*aJ2@18KBYq%0r0quH4lkj z#xh7nuC=#LZh9gCfM7EJxr>`SuU{iU?{0H0qPZV7{A?>`Gm!8(uOseEvxT1}W^h>- zW;;}L+RvW_r%1rIeHYHbc#5_EH|v?~R3kru_uKFllE)F4Tz0|}tmqW)z$5l&EB7Yh z^K3QlF$O1zLJpQc26(KLZ7UphYkpmupi1t&$uZ?qLJT9zQeab?e7rLEpAgGU)j(is zi)pVpi998xIza<%8MP%2X-bLTxst*lL+F_0{t!ApsV2%``gt5jq4>Xu_iBy;YAiFE zREZXZ`)vq_#p-&G;_HU>jTvI@?!qf4eb|KdT7rDBnqD$fKKv!L0(}tmFlEr$RX*76 zt=hq5klLi`5Hn9CBta4|1U>$`ecLjR;L%nX)T+J)8ZHbvnjwbr)nQDHXj+wC*lpm? zT{fK5T^uT#UxO{tdNv11J~0g0dnX)FrXP0O$jsxc}(|m$5IUudWZxfabc9GG+A%~tOG?C z3i)iqW~ zh0R6$|5(CZHq&ny)CGyquoz!Y470`z@|Z6%k7)|zCLzWtL2X2VV?<_diHbojWbdT? zh~Wh?(EKENqsB@;I!-kLLirLvZ2;e)YB_btHQ`tex6GJqpud%I&W{DnysZm@?;ats z-bc`eIuT`~4?1y=?UM1D7Ijo)I&G*79y})unMN_7A%boK^tf-eXdWp>NG1!Z_-e#- zoZvr&V@O~3(IKd6Y)#r#PikbT!0mFfaZmRqa$o^I7ENZTRv`U~L4=u*3lJ|vrTZ^H{)D(p#8CvMWj+pKBBU`V(_oR@?B zHVM{n3bl2v79@OnMwdRU-`nj4Zi;VuoKeRZ5) z|2E*x_;1&4t--dXGP`S--=^j8qRf6Gu6XKD;SJpEGlS5Rl#h+k-#i_`@8xcY)5q!I zZ;9LM=j7w(Z;{i_&hF;;FMDkmRP6%HbwDu&BiLqsdiwqWK|>funzL!)V7R zlSs9^8oqmKz;8P6Y@c9U>!reFhLd3`2LU*drcBKOTYp++1rO<(Y39tnz7Z&lJNB z*Nke6M|tWdN9_NC5Inj zks%a!6^0Sn_RJd}=C)rWSO$fNAR;O_e(p;aQR5mr3WK`8bf_Cz20N}hTmJwNvRNYP_C|y z2E0sAuNk6)^tjiEXoURX-04K0o;hg@16=sHq#01uV7ladlHySJS?n0w8K>qB%)a3DrJsP_Mb#S^m+aXAXr}ktVX=nyJVK?2U|6y8BMcX zGew+Y#m?}Y#Ub~KSFG(-ltXHp6v(ujvCmS#`SHgSSgp4EoVcab1aPX-#Df*{)Vd@+dg94Hrg6?05)6v40t3v;ItkPqg1~^3Owwtj)Nwl9;F(7P-;S3- zJ|mDdq$U0%OLl($2+A1nS25rO2Xi>i%+Y3B_}6imVcirEyyk}cQ);8xj$OI1?O8dv zu&s@aG@>T_AhFH9@N2XF^0o?e;IJHS9|$Z=G`bXKE|@&6RwE$o*a4!XVGP63&MZD8 zc;?FX^yNX~Xz&Cxrl27kHsY1Zbgxpy2ypTZi%BVoWJ!PSBtr2O4~SX|1i@#3lqee) z@*v&Z(`-0obcjY%q8U7fb6bY_mh6T=CXt#-QA2kg?t%l?HXG%sBnqP#xC{T(BO{xE zeCoER=sMx$Z$Fqo+;-+vc{Y$aGVpC)*|PmSv8^t0(wrR)_tdBg*FJ!zZ99jQ2|Zci z7?+FT=YKsl&SeZ4K5QC>6E3Sr>EbpGa#m?_>BY5RQL?AJF;OhvQ$)m(EA)s6sRMX0 zVJQqZjRF-&!1tt8I$joE>IUbg~T}nQHQ|z!6z5ZD?QBnZ6L)>%JQ)(OaGVM118 zEm;35s*%E}hdDsSvqcf29yS*X_%V#WJK4|S$>m{0ejEU$U|VGFUFe?Fmk0b?>~UzafMpa(C(gP2DW*yW6|<_IQ85&I+=w*-Jc93caCyll5-6o2 zz#GD3+2Yi7@M96h4(Pzht3`~X!?HqOzmisr4nJF{^7q#U~xHNbLB;~LOz3l9eJM#2(Vn~2yS7s*~UThYqr zp`50I1j*anWFex|d6r3J5YS2k4YG3!CBpUt;s*xB5a(dN5HRBw^rgx^S4yn=(Si^$ zjjh9tL|qec+72Q|QJY9`$i?0~~T$zkKI9zAGvs6#LsqN8=)obPnVdb6OUinbwg@$Diu zD=uw?(uUccqW;SkUJUtg;bD#r46mQUQ(HxrM%3@{E{Zm+_^d7*Vm~4Nn(MQZ&Oil( zz$|K%cZ2`TNNRwD-wR>t`iJ35BsE%B+&!j8N24j2&=chYIC<`j$Ov?W=xQqmp3^c($>OJLJR`_! zH`(5jS4EkTxDYCKEm=Wa7R~=46?VvjtL`_zR3O95TPP^fHpB%RWt4A~D33)*h|2N% zc-=K^i|IJ9L7+y?F*&1*@V9}IT~z}!#IOYSYjBEH#D7^M87dmmxpv0xJMEZ#zW z+_-Yz5aapoL7#efC@I%@nAl^1`<3mmhcmMS2TO?)vsB+JY2D$CZfEOG;G^o` zHP9(qGoYRVKwb?yHnbs+a@uq?Us%=oE>Fb53Bq`ACD7)g%y?{nr1C9~qE7Z)^9qa9 zq3Sjod(>XvvhX!ffuLm~TBG9=S`C{LrY)P8{zRPFLd486%8VGSSF5qh;u*GMU zT^!$JUbbQ3jYZ$CL(bqg;B*p~<%^8%8C)*idsHB{#pb%S&Dd1Sb$F-g8fA^`TXlU1ef$x__jf zDIV%?O>0PEx;C~K6$Aq#Nl*|3l&udIhYZ6$uM%Ky4^cL=>N@w%43IB0FnW7yv6ZzgwVa4}0z)2!(0voU2eEwn z3t;`;UNJP)BabZ&Yp{^OY94}}=qUu$2f5Sww39@8j87y^_Il4j@F)F#=s=a%Dng^LQ()Q-@z!FQ*!dRojD?sIa}k@LvxC{R3gq*RC^}sxk+yNj%R%p z22=|XHckwui*p4cjzzLxnA0` zlBJI{VrXt__S;J@IWY{hecdhRpCOUdQ5}NxmKhm9o zbXROmN-v<$rN$dAcPed`cH%Q!*$Y_!E^u@t6-*aX3{ciQiB#8_%W6q)bSr z6qr**JCL81--{p3`v&n(i2ss=mEA8ys%~&3$;zdJmr0>>zO7PsbQfhl15TbIW)e2BnXG_i|F=ca94UwXBmFMFEK)mF zD=|aY{JdN^H{MRb)$^93D+e7K)6|_ZJ|gx*aRKxBWeW{Tx&Ub z%@z?0rtpmme45v0S(rzoy3=%Ar_|+lP93K~GRxWsI<8GuWzXH_QNC92$as?2UrNLnEn6{&0Dv_H+gpg)Wvx1t+H4Pgs+v z`0WZklJ`#ma@CFI$X^mb2ilc6(-C8W%B>m99Du|RJX`vv!Jrp~?aNVliGTYB*<+@3wphZXx_R{bMQ2B*RB_UofJA}o6 zbyx&<(6kh=Zwrr6{S9v&pFcq#RUQA5u|Gz(7gPzbw-SC&mcnhD!Z_554#+JsA|0e8 zOpXxDIGJ@%d+CCn1dLVbo`s^fFmFam^)gZ^7dhNRVP}y}`RSeZQBhm%chaO-6lQBi z9Qya)d38nhC1Qn5XSQ`~XXIvnS%9uyWS`7!Xnh9kzFkijigS2+oZ#4l6HJ49=mJpF zhD_=fUQMc0SpiTNr3sZUHRo2Bpv6*tnpek**$~YF%5g_t3(F%Fz=ovdf-%o9KX+i+kXq{YD4n(T z43Wr*67ITEFY4Jt(L+PSqsBC9JySzBi?!z<&U%r#YSKj2B?H854-KMr{e?epac|^VqrFG2!RF0-I2T&6V zuSsOS^`bFN8rPZz7+1A+eYv_;%Z`~_-eN+__KSZsRJ{8qsJG7r%!$(WO zCBoiny0QOHoWk6kNGhCY9YmHtWa1W>IS+XC-WAY*DQ7Wj^z{nz znfdx+tS_(a;XnBFdw$4gQjq%@0qw*Z=~mI=|7d`Zax)NUe(glprCrnN4w`lZv?Dd@ zN1CkBj8`M|r~|w~sb9!v3t|WVY%uw?Ol0e2?`HEgshnro-TbrV|X~gMx;ruw5i|sE5}aKH&u&3J_k~T z%AoSA<+EtX*KV&cP}+Y62pt6E=V+q})-(`(~ZOC(|*-g6T)#*k^=Y95fy8gi5 zEVHmM+~efgoJYglF7jR6tiSO=Rwahp&u7y-Ob~yduA<}aGp9p;N8(%H6O_&RvU^dn z7xT21ez_GrmSfsI`u0!%BG_k;8v_nvjM~YvQD#+NyREoGYe1ML+Hsn$r*JQ+niE&q zO`f|-wGr;$dDOwe_7#|Q?sKEHuW@;3+Il2!CivD3QL-Wft#HzD@HN#gjg-RER zZk`j^`wJThoaka@Ke3?fg>*DHub#_5+-@O1in;i*pZpa~a^sjOC>suXEEecr_l!Gc zo`h{h9@Lk>DlbT1W27YGawdbNCAM64Nf}k;21NBX9&m~stBNToL`l`v_h*qL%}%~d zpO8l&@$(!wakJa=)$Buip=Y#blsbieX#`~`;6Z

L4K0QGeqAsXUd6*9%0ZUaX~r zjZPKsY-hf7yO!raInzt9tCi$r4?RpC2&vy7h$(| z#(SP{I6r0CyY_9i3yHpWFPM9RA$O<&a9rhafz8*P&)FHCZ*jo=?B}pUe)bLIvT#SG zoJG+uSzysKXqG&$uO(o?YwJ!!5N+XNXty?Bh?^ZIv4$M@K(5Xd8pMDGJpgn86MsPxQv*VC05T~NTjm8{X}0bxTrmD>p(_PA1_2ZT8=`4`fYkwrqur6e%19vjzv{kI1*;A2fq5EutzPL zEb?jYtC|8<_13gR)@WVCOqm>e%^px;qm*V=5jP?5in?^{L- zGo}(FMESa5WOFA{DQR6-^ECrR_daFL%C+XOi?fz)vzAfW-L%cEz$(vNbH8kf<8nB( zK&^fa1jE{H>&V9VqpFptrBEsozn+c7|#J-k-()M06F*!@7%CNW9V46_6$y?pvZ53- z)<~rjlLrN&@EJddGq;=&K!8|nqkvEAAIL#6nQB@l6%n=jC2Q`=l&WvdEaJfWBU{3d4xdX9o~v;_9!N?w^Y{3F@6Jr`opZ&xP7 zJr;;8)el0cs?}D0LciY(r|QLJ4J`#3(8n0`&JrM9W{VQ`MvNZP7^?r;=IUwhr43xvr4;S1$o)73CS2t#FB5IUEf__1;1=W%rkgc{>;H7sse$z z-1H1DxznzIlx}I$k6ZHUmYb$XWEo}HWs5&-w#;M=7FpCS8j5e%7dzw09O%ttZD`$* zESN75{)UqZb>7d?X;Pd5c0-2@Pjj>EQ(`)btTVDB>q( zpOJ-lxy&ADmy~Q1;ATQ3v9ga3KeA~poZcmn`v}F0T6rH<{ggXf#Vg zJPaiw?ZE7kfC`!|WN2;10MCN2)-@G9E6(&*lJvL9RF8fk)KM#cyM#jOhlr ztwLI5uxmtIdW#`B_3R`9)f1q9Zkeyu>J4WKF&{SrOnG&X z4(8r|5Q)w|dli$dOFg*9WK{Ks=}VRVttJzbN0m*Pf((s0DYuxg8cv+Pe;KKz+Ej9dy4l@*OXNyS%Cpr|Cgj6WWc z|C&2$o-EZV*4{#vP|P!%+_C^rTHTQHGq4a`!>B^=r zy};b;dn^)wv)Y*t5J-D%k6MUl$$80|5$Op-U7z!MJU11~+u?iv=A8K^uxoDr=TG`b zpNPt#GfLA8l6iYI>l)E#;fBa2F}n%zVZsw#4TidKL2Wj6Wtw}Gh=PDX`HSSKVY?Cn zTp=OW7ifEZ2n|QT;LU?>P(yKAjHYDlHc~$9%*!HBd8;7;PkLoBP0|V9^DG%0TW(vR z4Gl2*kUDv)YIv}UdJ5;)(KQE7QTsFm6jE>M!n1r#Wy#Np5tC~`2*}&pbbss{Y}uGq zzYu}7Y>GV)Kn6s@eqcWs3baznyrtzFt~Fw+iSQ5`C9!ozockuWQe=lje54){;B@=d zjL61sI)bdHL}?VI41&5d3W&l#-i?F76nXpCV?n8f6X9f~CkfWN4$K~rOz^x@P>$J0 z#UgLWQ;c}Twl_wsS@)lLNz5^F`o@0KNm^sS4dgYV1 z5E|W%0KMc2y`7qnR2v;dnPUo`PxBNWea8-czR+Y^-mOmDK|xJ|E(w1op4!rM#HM0p zfP^&EsxV1QR)kk&5jj=zCPeu&1EKV}1zeIU96;&Pk)U*j_w0`2x2Jk61c-i$Yi>3q zK`-%$sw=IrwbIwAZL{7|1SX9cPLLg+HjR|DaBT58$p2IN)`a{y_ySRadta?vV{6q2 zy4%gVX4md!VfA-qh7gpb9@VK|SulvrhD`7_Jz=1*uN(}NgLEhPh=)arGKqZ{^UeN|-pC;Hwk7I+ONi#1^gtuC1+Mi4u@7xJFIW8IugzdI2fr z;#E0Kpo93MzbZl|O3QM05}a_ZNC!PN(i;FSNpdOWN%UrSEv@F?VL?3>1_GpnG%gbC z;_4A>R2$<^gM=x(M43Fs#IlG6OHA4F02N`_BC;(RR8~gvp3(odu5_J_|K4VEuB4z8@37j6M*Tf%h%_!ACavd*AI z=nEBt&nawkKI5qGZ;WQr%8$cNphS77KUN;TQ~71I{i(1_)DD)J+eP&2A;Q$&_X97! z3=l`{4hE2WJ$}?vJ>a3weNY!m*J`2XHmR%3Xz-h}}Gpc)47Ut8J#Yi$3|md;Ge z$jHF)KeqJ$b7$ENueJS_SliwU_30{p9m-Uc33r@rP4I0SL|2Eb{he)KcYq;93rkaH znuKaeQpY;{tQ`(_Eq>=rv*jkoM#)TD2&5(-q68Cl%CLqjG4m3aX7>5S(tpx@t7FXQ0Kz*`8|c4xf~i(G4aw8aq$xI zc}!hlyaT4GT#<`c&Tlx_(uK@5O8}<$9VOTITR7Ek@aJx*=W{mnQ?sgQwx@TO7Mn7M zzU)bha-%yca(r<-3Uj2`0{AB<0E0(R0CyrLK91ne#exghtD@^?JcoH)@dUj8*8}%m zVd2q`GDWgvd>RTfJ9acI49u_sRVtU>;!}qO&YZCh8$+IPzp5>n^`2zv8Dj4*$RB*6 zpL)Cetg$VF_M&5RzH_wuH!>DpsNEqegFraG2xw~rY)W=0=b;Z0Y%YdKmDzq&-`NvqLEho&M+ae=y2hfzrOqRX4F@jRlw zfF%zT8!sD2l9X)xfexSYc%cjzFB6YU#1^#O|JB$#1!)q6TcTy#HoI)wwrzLWUAAqz z%Xa-`+xpAsvb(J5GZQ!NIdkX4oy?cqFW<`88L@Z9xAt0#g+xW^1@Os$R;$KiK=+$%C0f+_roa6iFcr`z zrZ@7YL)@Xw1?|7mzj6Q3QipGeFmlS>5lnRns>ZfH-IQ}@uc;=>odbz*iXnN1Avu;@ zVh-0?Y>G46@a_Aj1UZnxSwSvu;bDbLF}67>ajJhr^gLHp1(PA7d0vDecuRrgC1y{! zj-QdF@XWg`IU(hb414vT2kj|5s>lq``8Nk^KWPRM1uk4|ZA1<5;iTf@k%^IQgK>=k z%huwerApa}aa#zzH5?hJTYgO|%b8MwGjUhdD$Tm=jUFDOQD#i+QVaml{@mx(&i>qa z5EhCaSrIe3%M=L_?r@#v;;ch^@Dio7JFj1zMM?2cykO3E+SgVQ~)sPuT8@F1s zTh4_&+vsM3l8}$1P?kD4KEYzqfnvxFjGH&)2a1qv39w6c_(r<6GYBgBW+eJ+4-ljG z4!yD+c~9G1r&}2zJcF$r@#^02`frAw%^M z(tt(sNo_87ueUUemn5}O%)F@KgiA-2&F7CaV@m0$jGvarmtoQ84=!|SWLn@Nu_E7P zPCK~d>Luu_NUQ1W0VQ5)Phko^({@xOhR2B`oWSMy&}t1IDUcY5M%)6iN{p&0Zjn`Z z{iVa54nJg%t{FEz) zRj~5*m|RXy-f?gSqaXYrlh#ZVl;^f%bU^8_>F4~dx#n; zikC>xIk?h{%hg|9^9NM8kiXWltE(YWE2b1aB};tv;vayy6$v7%nx7LJ{6x z=pW$oee@+`vB0*=WUf~NV zRKmpd6*PoYpU`v`? z5-9kny04&Wi5ei4Sv_iSpTv@Q!|%)riL*Shv9^#)#4E*>=`jy2==8j?qf)m+$6g9^ zCC#Bik@fu+&-B5OBU!NHq3P5j&<~C%Mg79zvk@O0<2sNvGF0G}7G9U;tZZ(@b{;Am zWtoqxJx4#`n8{txkZCRIC=bJpz&OvaunydkkdmrWcicn3N6Rw!ojOEaHoHJ9;3b&% zP1IqFHjZxM0R`eWib2pnGS^^n_Vz|J58swTX5Qmw^!lzD*4l`PcKi)@Ke}5xO&J91 z9GD2`1>W6n;B0`G=VXlxn{0t1TL~pog?hextM?D1k7JSD5|MGS2ms^sRQSH@T7sRg z&acCe(YgBCJjYt>oYkFzS}mnoyXsJ#lPww;dSvNbMQ0LcY#$l%9%>R;_kAjgcpVgZ z+rPr7dp)&>9in8J!xnMyM=Boih&&{9gxXr=%mXqr$zSZHC@(Uc6^AqH!(4dy0~bVX zBnc--&Z?g=BOm&p@%p*2n&OoB8^yfT25wF3aql^;pY^p`$an<;RN3=^pjw zke}-HjQrGgwR!b*BXkHZ1H3!L|F|?jy6QA=-Yh#h7EO2idl6aNYpl<7MB13LG7N+d zYchh+##L3rrlwEp^X6Cd;h@LBGbH1wrOH>y!>(+TfI*e#8S|S0BtU3KK)y@(iIpgh z2=%y8e<4!iIMu_0Vw6mINs4kS^26~-{0*SXPKtSZ*2SE%7A;`g4U9_ADp9k^E)8)` z&BSwrjO?Z`Oo-u?b6&Fsb-Q@@0Ur)>{L}8Dtko58cLW=$;PC#w`NU!Oz!RzcdC^C} zSQ{=93l4c>H5hZ>*dLT~3T`k2ijI8aI{#s)IdO70aFO z6Fbr!#d;3=19j+AtbT*&G9@pR;&iIfSa>W+u=-CR&S3p6SrnN*$`G8N+Mb`%V6=~u z7E?wH#665>1GmZn1dO`d;Id?5_CyoXur!(I>cP_HEV4W*VNb6m!&T^$*#K@mw`tPx z9o7l!TJx&E0|<_j^h#E8T~p)`A~XIGLyAVqfZa**#Pt)j@cjtLdwXrG@3e|5`YEEM ztAZAX8~qsh+qEgr4`9lUl#J0~r8A4`yj~_Xh%NDHQaZHUu3mvaTRQQo;}?v7f%_wH z^J#X#Gr+-K-~4h~q>H5c!`x_9?9#8J!T0v<8{i-+>53t;_GX!U@Fgi4oP*dKBJEJ9 z@p~1|4FWXh7o5mKPDPe|Ti%$H!^9e6a&ub6FUCA6x1hv9pgU~QR-OD-p9?zJt+?#M zpTlbj7m>s6OLM%^G7x=1-cABOh618af|{dBpo~c_tbWGDHM1Nk27q3l)Pybg^lw}pW#Nr#Qc0?#u=qoNS;K3%(5j6F8tUj zfSnEf&~~;ca;K2no@esYDk5Hn3?>(u!6X%giK<8OyuccA;B2$t@vGfX61*CBH zI{am1I|Yp147}xVPrt%L`0B?}dr|BH5>NzxJ1%TYY_vw6h&>|bsF1UxtF{d^Np}{; zAvyPre>lc%+dVp7H#smQATPfDtYcpI*@LtLI3YAZ#zW;{98e~A(iIPf zyHyXRYRYoIE^DDslk)8zyqD^UC(39@a@!i%3fV+-m=+mPQe@iuR%_LCTR&-DjjFhQP*Dj-0 zYpsT^dKoGRug>9HdXa0bJ)kaz%4z)47+%ybbgIMWKRd(K=q zPr&cn(%;q7aPo)S3;_Q&Mc-Dmf#FJ^tHl+1qH$+ctIVfN)3Ul#IL? zNFj_Yn6ThR;p+C7xDHFWF%ASGaEq^Wr2MRM?{|rXiKKAqqKb_qJeQDom?Q|4i7h)= zm!wJ=D#)hB9qznjTW!pd!o=~;R+`KCn}%|bIkn4O(&J*@I(+b!JEAo6Z=@>WO>oGm zw+$#7$Hay=ri6`>-dX%KnS)YD>77w5UnUphuFil$71 z7@Oj$E*I9*={f5AlfL{YU39m1HRjJuurTO1epx9;#+!|%^=95M->7l8n8wi z8Iw=;A7UAJr3?hc4GD235U8ajRldjPm0g_7JAho!!*s!mq_9V!?D$Jv{=X}&{U$3P z2FXDyb{6xlKqd5kQN*sfz9!it4#QDM39)52nKQgQDWdDd(;6R0&NmT}F`nGO*wNBic`vOpsnqw&~nw)Tx z{3~^utNC=PIA}P^dr)3PAuKsYPlU%=!6331PM?E=GR=?S@RxLZx7Gni4k%RZ{gxul z@SlS3{}km{9J*<^`pAkql0^sWjArY5TM%*!YaWcm_5!9u!pZ|X%Wgk>J^)v5)J)B{ znOeML=y_B|QU1t5)Uc_a@&j74kkw~*xp3NizJpB28LL?x|8`~_;A^ync(BiSbw%df zfoT7j-s~nf6$J^S8=TS6=haK77|K5`M>Y#NgT&eZQVX?4@57k84O=S#l{`JrGTw}nckK27y6^)>6_(`3BxO4g)zP$- zK@Zev;=I-ijt*q2>#at>OX-+tNn2oXTrUtxkWhU0(?H~+IG!eUZJ%xd)97yBW{c}9 z8hN;EIz^zT(ebKAZ>rQ~)J|5h2mvD4{P20Q%KaPK&N<1#TUt3LyMc@wGk6U?lpi~| zHNIGR!eHg$Z}{VV#^zJ}MGdaUQ36b1+xzb)A0q^4aqjIdJA^1s4z+}HeBad}jr5K7 z-uErV>-yz3tqUfP+Br@n;$IE|Jvx1Dy1wpfDF-(PATgF>Gzoc$qH#5Ko8^RMcqJsj z&%4M#seN0s(Yn@z}SsQxK0@7D?$$DXE7re)&2DtEA>lP+D{5 zW zuT%K$=H;*L$#H(W7x~|yR%ai|8z<0`O{#{M79XvKw|*QC#2A4XH0+l8Te?^Jn{_s} zVJJ_yr*`*>Sa33&2MUiVJDXp_KAYK;Q8o_s*s3iuT?%%T+C@;y^=!P zWqFb3jG&Fz<&e@TY~`Oe8T2cqHuX%Ga;aPd8GTJqp6*@}2=H~wvsc^K8R>wO+3Uq( zmKW4y%<0_WJ|yVptE@SKd%RK=8Yna3p=UIfytnl{t-hn#B+x4w4}3Zw#ZuX0g16F~ zIv=T%Egf=zD>qV>T=7M;DbH5|M`j5X#m{2oxEfyUYiK^()v!a)$HAcb1zb4$U3I^GiGFVU?AC67MXPe92@i}+ zWqgc57Vf)Mt9xxXm~%pT(*su1OIoWGtVP2fx_`zNczHv%;;bLj4tYe z)$9j5vNxJ^X+@y8H<C}Kms=H^! z*RiI1}7(s<hYa!ySpRiT}aKt^-^ z!U-=|7MUU&vXz=?d#qUnUaJDaqGpV;f+K$}7TpYK=FA!jU2a+1xRZooQBNBSU$2{q zi?w#b~m_oaC_Rx;pkYuhWOSqO&aylCuowO)dL7x!Twi zs}o)%IsYe#RI|XP#K*JSKHP(JP~)r#w21-TYza5csx4AUaa{&#*&cfX#lER8o)2XL zEO-!1NJACWt*|I>ZB@}Uw;5P9-R=Dg)XIYb{P~Ps(mq>7(AVRD*5LeDCnp%0**bCL(3m(5P6I;%m>5AmxdV=(b1yG}=;bs?5Uh~i*;R`no zEbmRI^J2aKZ+yr<5_QJ{9|VLq6a<9mzw#mM3?}C047N7^wD!#0|No50rT&#Ga5U?? zuFq8J&6v5=Shd((M$LPX_wM-ml3lBNY1Wz@Mw*dQoZ_EHuertBz!_!Z_S;o<>D_Uy zBBRYBMOqH$$$GTW7j96p(6Ld+$%a8-45V;y@}-KR94mz>hVb~-a(&9l`fsE z9%$qAi6`>VU;f)=eQo)C?K1u@KV4hxHtCesx(JOb;_3|B6A}c>-rRz6Sw7LS$mjSj zgh6POG_a~x!YXN1>F3g>%&fXvGpx6bNaUD4{$56%NIzP;XJy8X9?7K}Ri>Vdj#5!m zk4XcY@I9(smpv7LcDz4xx^@;YZPnKPGK>jsKGvpJcyhva`-_cVBG8&j8LTC=1ki_h&Q+Ir_>m<+#gNS4bi*{UQg zYXu3lsF00WMp^hSkJ$L?HY$^mfP%Tv_Oa>U;M-}>P-W%nH~umungRUPy)HMKkIyGh zo9xla%dtK0(y1r5njaG%t7DbeFZw=^rWIaCqhA>9+MY zAF^$Y?7Y@SRAsxc8*ageC_k=GrrH#Zsmpv+=vm_QJE+g@LS{)_q?bsz3ND9q3yAM>JC|*4{MYmGe9pr)NQLLbk3_&N+6^iCI)V|HQI9E zEpzJrp<<4wM@wkS_StUCS&i=fXB${?@+%}@70#zvDwjk}!^~mGO{<5J(;a+WG9#!C z%P~qTwgPrC%pAH}#(s>N(XZ34bD8bD?$A5@?0s2s0@r@noXZ@+d$n%pXv>Pmci#*t zOW0-|A&nMmG+M^;C5I4h3R-iJ-$23CF^b@fiwCSAo(7`lX9aln;hj4!;aIa%maqbu z!-n~p^bu}6{g{k6oD)F zSj`!@MMLCp4mMWuw#mjl9(hqLK4AtPlbHEktyZQKGv4^w9XpxVT@k+MeAB17vTX8x zc}|88R1$yFs9j)^OJ~tW^E<8WLP7G%b&EB=_^@Uh_iPxyq3q2P=NvA_QhhyM{v({h zWgp>ue!^1mlj-GBbYJ$0jP6G92Wc5#FcSfmAUuLV0tA=P zJ`2JD&DlG`CC~*V)+_u52L~rB3v2wyS=BW+9iwP$Y|PuOAcrmkL|Ws|)Jryb8+Z(c zRvN2)kazaoQHt}ZkyC$@Fy%R{PQ`-ozjY5%g(3?@O_xhg6TH8Mv>(9%=6LPxbQC5p z-NLi0*My^IUU|iQ;AJZbjjw#2rm($9M*Ga#7n!M_0S8$Em6X4{a8bQS>RY^6cq@ zY)Z~y+TduW6mMEKS~CW;dozfnuUOM?ag+c*z&+B z?_FH6U62LuM0vXO8x)~Z#P3{mRg99VvGM(i;(8w(N(j@xX@vHQJ*=`ZiKk832O|?v zt9s0G{zkLuGT3!=79#ma z4+Cz4+-Dtp2euR|B zZsCBbuHm+bz3E|>Zo?@Q&43J{Y3V+X0x^QlAp>_aHJxZC$I_4hsx87zJ-Y@>(7evH znMfywKSXQ8ImRa_y3p1#8aOEF`3h}Iix$>gm%)#ZI|$pf{d&<04L6#55V@|pj2Blc zM0Q4TQ1j%_AgWbdDXVTOsG6Wv6Rf+l`!ZwWaL$-2PCGeV@`-3<4=w>r%ciISKt^|J zuW0KRc+vH{0lAsan7b{}0VIp8(X{iIiwHxS*>TFJ&Kz82-nvt|Lu%C8l{{>UMz5dMt|y?nNtWNA zQnr)sKATZXZGzcTzdAr>=hK1}E%Xcqr^Q^}G4_^pLSt$@FCVp7LiiGE*pztBVR!k7 zorx@~NV&Vqp#4g>@i;#z%NpoPjq)vb{*7)cBUEPt%M>^kjF?TJG3kc~=#WbxRz%)~ z;&u(}_`tADT|JI8+N!~-70T~I7q_eV>Sz>;adx>BoDK;yhB`bkr$jOEGaasz7{Uz(F_#oJ5-yKYpvl5+*aanLB^->0#z-6s7UIn$b!Bn03pcJc}{#%4o~%(-s;-(7*eC zo(_0=Ckzs~M#T%~m(en$HzW`X8(T=M87_KDMkdl!-zA%I%b0kffNlm>Vro<%vw@$m zs$oeok0)$sv`)sKrv$Q%LJS7WC|FUa)%KoJmkO8je~fG=1^@ZT(dL!)@LITjFB?INbq)Yq_lWE0+G){Pv;y zie5O7ZbNDs5WJT{zm8mXE(>F8I5ly{X|V(Xb_V8c+GLOH+n0j5zvXfUW*ZxHQcK6K z`={N8hj_hiEZ6E>$%g-qPc+JKW^Vnsv?i!;D7Qr%jyo1F=}CPzl!gw&Fq3|+BeUE!iRLCAfKDh z{oZ?Yz-Ej8$KdxyWXL?n0XN?gs)z4DnUw)LMz=Fwft=V#HfXs{P;X<_o}~41A$*pi zRr4S2d}7W6hAz_~wq)1!L!sMIn}~el8Hk4nm3Y;RNdi4{i=T>h8e#1JR7cYfh2CeR z**wLFwzV~;|GcRUnGKrb-UkutMI)cs8;hoI)kMy^es*D9U*Q_xYR2Tz1bUj(65ExaVGv!Y2WaDq|Xo6+qMZH>C^sr!ya`L^Jtc zBhLs%iKD!{x@;0dd+)Cs0+dA03}T4?Ft-qQ>;w7WYY};_pBcfLh!~gVjAn{dkV|JM zG{qi@-$9@}R{+u>fP@kZ4wmhDxJ&E)-7ROqpNBm8=maM7a4jFz+U<6X-ZCB`A zVh#7Nt6zNO7)Vh%Pk7r$dXA$%IPj;-dZ+o({6qBt1|+@c9_9jX90fH*wACqZ)KamJ z;p6CLO!Zw6ZnqHdjOYqoXpJhY@!Gq$-k0SfbM8(lkvCIMo-V2LtQyi+wTep0OLlc; zoaf~y_{ON4g~v^=)zysRC1tf@V*u5?L6EMaN`yhPk)hRXQySnw$|~@{Y-RJM3#q1Y z1jA3`V$j)vfk;>@#gDk49E&(6|IgnQ1lNl@bvdiA?~0;G?}Mpp1tMfI@LiYuEJlU7 zgZzhSdZxXR!?CRRGarndkDBAR?Q%K%an*COgLH7^Z+i@)sX|0}F&w@_8YJCe zUi>J_ADjtXMgxA1Q2&k;ForKtx*3rf8*ohh#Zt-+B*V!?2>SSUhSLp32Q)S%hMtnY z8iWz#dGmd2fcp_N9-w4h_;!AQwLRa^vQ4d%=&OX{QM!#OankSt0+*4Q%8+?B0yEqd z0UeOI?d%Hw{@XwW-}}cwW8TAUi%6{f(FeEn`9tf5i#&Zda^WN^f$lT80gMv)i=vf+ zoZZ&2)Tr)vV2C(B#8jrH$-d7_|De8ngAEMQQ=<(s# zmjsN{0uIMIDc@}30KdjH$y01~se;~?`N{z|rO;ckOq{9ecEA5}D27)G zF(fH|H5N_?iLr^jg%OQ&L&!;suIBb&CzpBG;Blm`zr81HI|f2#clkNR^V@W-s$)_$ z&K3-!VnhZfPAKO{k2+5tO9erNSg!jyT8f|NXp638tps_wc2Cb(F-Y~xfI}?$2Yuxd zRMDd4EeL_-iQ>-XNt1+?i`Q7ldFBMY$Wm_34TBFs z7)@o|O}sQ%7L3lcS?yJx>BPv=t^9WyVEn_i%BDI*eB%wC`!Rc-1<=?+XPDTu*j*NoovUDN4t~3y!Ukzo%HIj5$F3hk#}jY|7yw^q^!l~ay-(_VtJ*MpJMD!Bkf)5C5!VWrJEGmE z#Bl0AbYhKx1s#!8FT*rl9$W@Hl+;%~<|q z%5-W1J7xnU0>ddFwq2~_x%BW#cz<;0&%{_~!JuUT4vYUX19CZ7MDtZr5GHe8#Kuy( zX8*i~AdDnV<_DY3JpsRa)*ZjQlS@H$pNrYNe}9uY?qr)pIVAM>_=-NiU9MkvS7zhy z;V0}H?g?U-d0*B_9z!L1fdJOB;(Z-2OB31v`LSy`A>VWLDE{VLvUgY<>!eFRU%Thp zdY*GJH|G=dc&^~usey}vn=PI`&wczr+;tD)k@8Xvkj85?)vp$5YzL2uMa_Obfi339^Hl8wb?;y}La0FQj zbv@v@gaM>};4{H9z%z$Vvs@GIaIfn3`A%Y7xztHLTSA{?FtpL7Apr6X!}L#e=@1LY zhTn~%(b|-{GjmZaUy9b{1x{drG|FpZF!1uC&j&8p12@xA~Bs;nU z)4xNx8Zvir7P5z}`QM(9fFf05h0%>)fehE~G~jVo<{Sr*T)`9dJbeN@H^@Bu-l5oPyW5`y zy9j>WZ(cugl{VjaP1KCfx^@x{BlTFh3eL@z$cKUY{<-qJHrfF6jhswy4THixm?t?z zopIh*N}kw6SI^qr44+zjIVc9kerHWf-LP0k_#Ol7=e4vcVl^!X|{ zBtkDgXgFyL!IcWp*G+8xyYEx*YNl-vmlDjNLeSc|2LcGlVpeplBJ;f~$t^d~5g6zL_|lUd~@k&3Sj{QV-m#>Aj-QF^SUXUU+q zeIJ58Yq|ZAfx!b)Lo(+BBQ8x{$K7+OoQ%hZ)HarNeg51y<_(iJj)N`&#p&2_%e_I@_7)2GHppz^8A`xHKebTf40BH7>dVE%+xxv|q*ggSru!G?pDc14;<@`=Gn2iV%6Dd779MoEwiK>kXh-W-%*Qwl|dp+Ls zqq_N-JfeU2GKH%S$_a5cm8d?3i?@T=4)pgK{Sp3kSNr)rX^O;VU7uBmW~N=AB^-!h zFN5G~O_lB%3qrzsQpq=@9EA|t>LIZktt-Qm4@<$KNQi#+xKX~GPVGIez52t+*}k_R z)n>z2y*~m6wgWD=Hsby4623XZ^^>+=ub1+mkuAjiVcd6?uTTutj#mtN9f&^mSwj!= z_yQnDLnT*lSkUGN6?I#VCo{5K!1%}rq_97*BLkcybg9NJ>LJ&@#yxS(v1k{2zoBb- z48BUH4d;jUHW~IBTxf0zN_OaeaV+J&8$sUqMRq6E2W39*j z-MC4@w0~yp@I+_#4Gq5gUem(%+%sFv@zptaUq;|*H~U55-u+V5cLaPMWt|@5cFG?9ElXiroztHRPR~!*TopCtKO?~t56u0d#tn@|I>x+BsiM}r+^<>PVB^5c>?;jH$LZ9icw3}6EX`Dlo3cjlLfvr z{dotqQN%f47_*PTLyFv+}p^zK`H!W}i35r+;h1NDh) z`5395gyB(CoUr;Uz)#NVvD>Y*5Qe}rexf@HZRHD!Xj9`xUtes6MzAevR%{HOOturjWK4>6 zOns)pKPsf{=b!?F-6NIMecs4@J)QktqgRlh$R{qnBxS1^_Y_g^3fC54oQQjGOe_yW`(} zGQfHoPB1)2Plh`AEHNF-Nq9Li7kT#7W2B!SQTWB!0;Ee3GCS_Xcr?7|vF=0lQURNm z?N$Gx+nncctuIU~+$&j`p$45Y;>dm%2Hr94FI^tvhjv{~>AXD*gV2oc*rhCxtfwWh z0O%$t*!cYzi%q@FD4yM_D94ZWO8%XWRF=ac+ca!WR&_ZZcl|lL3uj-5iC4Ane`vGew6zy!XWGH5QYyRz9k}+|gDZ-Pu6ulR+QdZQ4kn zI#_u@PtQPesCK3sKUKPu6mX5Z*3=tO!)B~IMC%A`{MDZp7I_qcY~nnKnY9x>K@lw? zkA_soAD4M`=g)o61aqmw1tl=0Z_Da@Eat=N)Zz{VZf&}_V}I3uP}9zxL5WzG?Z!e zNf;LKPg7epyj2j{hd$!fm2c7AK`*YeFDM$@ws6KBSGRlh>WLS(t1Jf!Mg;!f<#hhj zHT};E1>{uZKOX+y)J{l{|C!qPzvXKFuWJ8YX6OH^2Kui~VUXy5s`lR}c>dp^{8w-9 z|BMp&Z}9)T^FNgTuhaMc9l(Fpc>QO94Z;5c@PANz{of({SJw1DBb;gd7li*Khx*@P VKtlb;M}z!#1OLr>x$A$7{s*Do^HTr- literal 0 HcmV?d00001 From a2a957ef5b29aa5c0a1e22a097f095fa2edc5b3b Mon Sep 17 00:00:00 2001 From: HaoHaoXueXiHxy <20211005@bjtu.edu.cn> Date: Thu, 4 Dec 2025 13:56:13 +0800 Subject: [PATCH 2/2] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E4=BA=86=E4=B8=80?= =?UTF-8?q?=E7=82=B9readme?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../README.md" | 239 ++++++++---------- 1 file changed, 103 insertions(+), 136 deletions(-) diff --git "a/2025-Ascend-Innovation-Contest/S1/MoE/\346\210\221\346\203\263\345\245\275\345\245\275\347\235\241\344\270\200\350\247\211/README.md" "b/2025-Ascend-Innovation-Contest/S1/MoE/\346\210\221\346\203\263\345\245\275\345\245\275\347\235\241\344\270\200\350\247\211/README.md" index 0b0ad724..bd83306d 100644 --- "a/2025-Ascend-Innovation-Contest/S1/MoE/\346\210\221\346\203\263\345\245\275\345\245\275\347\235\241\344\270\200\350\247\211/README.md" +++ "b/2025-Ascend-Innovation-Contest/S1/MoE/\346\210\221\346\203\263\345\245\275\345\245\275\347\235\241\344\270\200\350\247\211/README.md" @@ -1,78 +1,61 @@ -#### 1. MoE模块前向优化 -在moe模块可以拆分 prefill(长序列预填充)和 decode(单 token 解码)阶段逻辑,能够解决解决 “两阶段结果不一致” 和 “推理效率低” 问题。 +#### 1. MoE模块拆分 prefill 和 decode 阶段逻辑 DeepseekMoE模块 -显式拆分prefill/decode阶段 ```python def forward(self, hidden_states): - if orig_shape[1] == 1: # decode阶段(单token) + if orig_shape[1] == 1: y = self.moe_infer_decode(...).view(*orig_shape) - else: # prefill阶段(长序列) + ... + else: + # 复用原代码 y = self.moe_infer_prefill(...).view(*orig_shape) ``` -通过`sequence_length == 1`显式区分两阶段,分别调用专属推理函数,避免共享逻辑导致的结果偏差。 -decode阶段(单token推理):极简逐专家计算 +decode阶段(单token推理) ```python @no_grad() def moe_infer_decode(self, x, flat_expert_indices, flat_expert_weights): - expert_cache = ops.zeros_like(x) - for i in range(self.num_experts_per_tok): - expert_id = flat_expert_indices[i].item() # 取第i个选中的专家ID - weight = flat_expert_weights[i].item() # 取对应权重 - expert = self.experts[expert_id] # 选中专家网络 - expert_out = expert(x) # 专家处理单token输入 - expert_cache += expert_out * weight # 加权累加输出 - return expert_cache + # Decode时单token直接遍历激活专家 + expert_cache = ops.zeros_like(x) + for i in range(self.num_experts_per_tok): + expert_id = flat_expert_indices[i].item() + weight = flat_expert_weights[i].item() + expert = self.experts[expert_id] + expert_out = expert(x) + expert_cache += expert_out * weight + return expert_cache ``` -单token场景下,直接遍历门控网络选中的`num_experts_per_tok`个专家,逐个计算并加权累加; -无冗余操作:单token无需批量索引/掩码,直接遍历激活专家,计算效率最大化; - `@no_grad()`:关闭梯度计算,避免推理阶段显存占用; -权重直接乘:跳过复杂的掩码/索引,保证计算语义简单可追溯。 +直接遍历门控网络选中的`num_experts_per_tok`个专家,逐个计算并加权累加,计算效率最大化。 -prefill阶段(长序列推理):批量高效计算 +prefill阶段(长序列推理) ```python @no_grad() def moe_infer_prefill(self, x, flat_expert_indices, flat_expert_weights): - expert_cache = ops.zeros_like(x) - idxs = flat_expert_indices.argsort() # 按专家ID排序,便于批量处理 - tokens_per_expert = flat_expert_indices.bincount().cumsum(0) # 每个专家的token数量 - token_idxs = idxs // self.num_experts_per_tok # 还原token在原序列中的索引 - - for i, end_idx in enumerate(tokens_per_expert): - start_idx = 0 if i == 0 else tokens_per_expert[i-1] - if start_idx == end_idx: continue # 无token的专家跳过 - - expert = self.experts[i] - length = (end_idx - start_idx).item() - # 1. 切片获取当前专家处理的token索引(MindSpore原生操作) - exp_token_idx = ops.narrow(token_idxs, 0, start_idx, length) - # 2. 批量提取token的输入特征 - expert_tokens = F.embedding(exp_token_idx, x) - # 3. 专家计算 + 权重加权 - expert_out = expert(expert_tokens) - expert_out = expert_out.mul(F.embedding(ops.narrow(idxs, 0, start_idx, length), flat_expert_weights)) - # 4. 批量累加结果到缓存(替代循环赋值) - expert_cache = mindspore.mint.scatter_add( - expert_cache, 0, - exp_token_idx.view(-1, 1).tile((1, x.shape[-1])), # 扩展索引到特征维度 - expert_out - ) - return expert_cache + expert_cache = ops.zeros_like(x) + idxs = flat_expert_indices.argsort() + tokens_per_expert = flat_expert_indices.bincount().cumsum(0) + token_idxs = idxs // self.num_experts_per_tok + for i, end_idx in enumerate(tokens_per_expert): + start_idx = 0 if i == 0 else tokens_per_expert[i-1] + if start_idx == end_idx: + continue + expert = self.experts[i] + length = (end_idx - start_idx).item() + exp_token_idx = ops.narrow(token_idxs, 0, start_idx, length) + expert_tokens = F.embedding(exp_token_idx, x) + expert_out = expert(expert_tokens) + expert_out = expert_out.mul(F.embedding(ops.narrow(idxs, 0, start_idx, length), flat_expert_weights)) + expert_cache = mindspore.mint.scatter_add(expert_cache, 0, exp_token_idx.view(-1, 1).tile((1, x.shape[-1])), expert_out) + return expert_cache ``` -长序列场景下,先按专家ID排序token,再批量处理每个专家的所有token,避免逐token循环; - -排序+批量处理:将同专家的token归拢,一次调用专家网络处理所有token,计算效率提升数倍; -`ops.narrow`替代Python切片:适配MindSpore张量操作,避免维度不匹配; -`scatter_add`批量累加:替代逐token赋值,减少显存交互开销; -跳过空专家:无token的专家直接跳过,减少无效循环。 +这里直接复用原代码中逻辑。 --- Qwen2MoeSparseMoeBlock模块 (deepseek模块的实现参考了培训给出的一些示例,基于这个经验,在qwen2_moe模块也进行了拆分获取了收益) -与DeepseekMoE的“完全拆分函数”不同,Qwen2-MoE在同一个`forward`函数内通过分支区分阶段(代码写的没那么漂亮,主打一个效率): +Qwen2-MoE在同一个`forward`函数内通过分支区分阶段(代码写的没那么漂亮,主打一个效率): ```python def forward(self, hidden_states): if sequence_length == 1: # decode阶段 @@ -86,12 +69,11 @@ def forward(self, hidden_states): decode阶段(单token):活跃专家筛选+复用prefill核心逻辑 ```python if sequence_length == 1: - # 1. 生成专家掩码(与prefill格式完全一致) expert_mask = nn.functional.one_hot(selected_experts, num_classes=self.num_experts).permute(2, 1, 0) - # 2. 筛选活跃专家(仅处理被选中的专家,减少循环次数) + # 筛选活跃专家(仅处理被选中的专家,减少循环次数) expert_usage = ops.sum(expert_mask, dim=(1, 2)) # 统计每个专家被选中的次数 active_experts = ops.nonzero(expert_usage > 0, as_tuple=False).squeeze(-1) - # 3. 遍历活跃专家(复用prefill的index_add逻辑) + # 遍历活跃专家(复用prefill的index_add逻辑) for expert_idx_tensor in active_experts: expert_idx = int(expert_idx_tensor.asnumpy().item()) expert_layer = self.experts[expert_idx] @@ -99,50 +81,35 @@ if sequence_length == 1: if 0 not in idx.shape: current_state = hidden_states[None, top_x].reshape(-1, hidden_dim) current_hidden_states = expert_layer(current_state) * routing_weights[top_x, idx, None] - # 关键:与prefill使用相同的index_add累加,保证结果一致 + final_hidden_states = final_hidden_states.index_add( 0, top_x.int(), current_hidden_states.to(hidden_states.dtype) ) ``` -- **核心优化**: - - 掩码格式对齐:decode阶段生成与prefill完全一致的`expert_mask`,消除掩码维度/排列差异导致的结果偏差; - - 活跃专家筛选:仅循环被选中的专家(而非所有专家),单token推理循环次数从`num_experts`降至`top_k`(通常2),效率提升; - - 累加逻辑一致:复用`index_add`而非直接赋值,保证两阶段输出的数学语义完全一致。 + + decode阶段生成与prefill完全一致的`expert_mask`,消除掩码维度/排列差异导致的结果偏差; + 仅循环被选中的专家,单token推理循环次数从`num_experts`降至`top_k`。 prefill阶段(长序列):标准掩码+全专家遍历 ```python else: - # 1. 生成专家掩码:(num_experts, top_k, batch*seq_len) expert_mask = nn.functional.one_hot(selected_experts, num_classes=self.num_experts).permute(2, 1, 0) - # 2. 遍历所有专家(保证长序列处理兼容性) for expert_idx in range(self.num_experts): expert_layer = self.experts[expert_idx] idx, top_x = ops.nonzero(expert_mask[expert_idx], as_tuple=True) if 0 not in idx.shape: current_state = hidden_states[None, top_x].reshape(-1, hidden_dim) current_hidden_states = expert_layer(current_state) * routing_weights[top_x, idx, None] - # 核心:index_add累加,保证批量token结果正确 final_hidden_states = final_hidden_states.index_add( 0, top_x.int(), current_hidden_states.to(hidden_states.dtype) ) ``` -- **核心保留**: - - 全专家遍历:长序列场景下,确保所有可能被选中的专家都被处理,避免遗漏; - - 掩码索引:通过`nonzero`提取每个专家处理的token索引,批量计算后用`index_add`累加,保证结果与原始逻辑一致。 - -共享专家(Shared Expert):两阶段统一逻辑 -```python -shared_expert_output = self.shared_expert(hidden_states) -shared_expert_output = F.sigmoid(self.shared_expert_gate(hidden_states)) * shared_expert_output -final_hidden_states = final_hidden_states + shared_expert_output -``` -- 共享专家是所有输入都会经过的“基础专家”,用于兜底计算; -- 门控权重:通过`sigmoid`激活的门控网络(`shared_expert_gate`)控制共享专家的输出强度,两阶段逻辑完全一致,避免额外偏差。 +这部分复用原本的MoE部分代码,遍历全部专家。 这样把prefill和decode分开来写带来了在decode阶段有达到秒级的收益。这样做完总分提升到了160+。 -#### 2. 培训中提到的使用ops算子替换索引操作 +#### 2. 使用ops算子替换索引操作 ```python def rotate_half(x): @@ -166,15 +133,15 @@ def rotate_half(x): hidden_states = hidden_states_expand.broadcast_to((batch, num_key_value_heads, n_rep, slen, head_dim)) ... - # self.cos_cached[:seq_len].to(dtype=x.dtype), - # self.sin_cached[:seq_len].to(dtype=x.dtype), - ops.narrow(self.cos_cached, 0, 0, seq_len).to(dtype=x.dtype), - ops.narrow(self.sin_cached, 0, 0, seq_len).to(dtype=x.dtype), + # self.cos_cached[:seq_len].to(dtype=x.dtype), + # self.sin_cached[:seq_len].to(dtype=x.dtype), + ops.narrow(self.cos_cached, 0, 0, seq_len).to(dtype=x.dtype), + ops.narrow(self.sin_cached, 0, 0, seq_len).to(dtype=x.dtype), ... - attention_mask_expanded = ops.unsqueeze(ops.unsqueeze(attention_mask, dim=1), dim=2) - padding_mask = ops.narrow(causal_mask, -1, 0, mask_length) + attention_mask_expanded + attention_mask_expanded = ops.unsqueeze(ops.unsqueeze(attention_mask, dim=1), dim=2) + padding_mask = ops.narrow(causal_mask, -1, 0, mask_length) + attention_mask_expanded ``` 使用ops替换掉tensor索引的一些操作能够带来一些细微的收益(几百ms),有的替换甚至没有收益(也可能是那块代码没执行)。 @@ -183,70 +150,70 @@ def rotate_half(x): 使用StaticCache需要先在cache_utils.py文件中修改一下StaticCache类的update函数中的某一部分(被注释掉的是原来的): ```python - else: - # use index_add for mindspore since tensor slice is too slow and no implementation of index_copy - # k_out = ops.index_add(k_out, 2, cache_position.int(), key_states) - # v_out = ops.index_add(v_out, 2, cache_position.int(), value_states) - k_out.index_add_(2, cache_position.int(), key_states) - v_out.index_add_(2, cache_position.int(), value_states) + else: + # use index_add for mindspore since tensor slice is too slow and no implementation of index_copy + # k_out = ops.index_add(k_out, 2, cache_position.int(), key_states) + # v_out = ops.index_add(v_out, 2, cache_position.int(), value_states) + k_out.index_add_(2, cache_position.int(), key_states) + v_out.index_add_(2, cache_position.int(), value_states) ``` 由于默认是DynamicCache,因此需要在utils.py中generate接口处修改一些地方,支持StaticCache的创建使用 ```python +else: + model_type = getattr(self.config, 'model_type', '') + supports_cache_position = model_type in ['qwen2_moe'] + # print('StaticCache') + if ( + hasattr(self, '_supports_static_cache') + and self._supports_static_cache + and not requires_cross_attention_cache + and supports_cache_position + ): + if hasattr(self.config, "_pre_quantization_dtype"): + cache_dtype = self.config._pre_quantization_dtype else: - model_type = getattr(self.config, 'model_type', '') - supports_cache_position = model_type in ['qwen2_moe'] - # print('StaticCache') - if ( - hasattr(self, '_supports_static_cache') - and self._supports_static_cache - and not requires_cross_attention_cache - and supports_cache_position - ): - if hasattr(self.config, "_pre_quantization_dtype"): - cache_dtype = self.config._pre_quantization_dtype - else: - cache_dtype = self.dtype - # print('StaticCache') - model_kwargs[cache_name] = self._get_cache( - cache_implementation="static", - max_batch_size=batch_size, - max_cache_len=max_cache_length, - model_kwargs=model_kwargs, - ) - else: - num_hidden_layers = self.config.get_text_config().num_hidden_layers - model_kwargs[cache_name] = ( - DynamicCache(num_hidden_layers) - if not requires_cross_attention_cache - else EncoderDecoderCache(DynamicCache(num_hidden_layers), DynamicCache(num_hidden_layers)) - ) + cache_dtype = self.dtype + # print('StaticCache') + model_kwargs[cache_name] = self._get_cache( + cache_implementation="static", + max_batch_size=batch_size, + max_cache_len=max_cache_length, + model_kwargs=model_kwargs, + ) + else: + num_hidden_layers = self.config.get_text_config().num_hidden_layers + model_kwargs[cache_name] = ( + DynamicCache(num_hidden_layers) + if not requires_cross_attention_cache + else EncoderDecoderCache(DynamicCache(num_hidden_layers), DynamicCache(num_hidden_layers)) + ) ``` 这里StaticCache只限制了在qwen2_moe中使用,因为我尝试在deepseek中使用最后好像出现了mismatch,可能精度上有一些损失。 -最后JIT优化使用在了RMSNorm部分,还在Qwen2MoeModel的_update_causal_mask部分加了JIT,都带来了收益 +最后JIT优化使用在了RMSNorm部分,还在Qwen2MoeModel的_update_causal_mask部分加了JIT,都带来了收益。但是不知道为什么使用rms_norm的融合算子会导致mismatch,可能是精度有误差,所以放弃了融合算子,同样flash attention也会有这个问题。 ```python - @mindspore.jit - def forward(self, hidden_states): - # if use_pyboost(): - # return F.rms_norm(hidden_states, self.weight, self.variance_epsilon) - input_dtype = hidden_states.dtype - hidden_states = hidden_states.to(mindspore.float32) - variance = ops.mean(hidden_states.pow(2), -1, keepdim=True) - hidden_states = hidden_states * ops.rsqrt(variance + self.variance_epsilon) - return self.weight * hidden_states.to(input_dtype) - - ... +@mindspore.jit +def forward(self, hidden_states): + # if use_pyboost(): + # return F.rms_norm(hidden_states, self.weight, self.variance_epsilon) + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(mindspore.float32) + variance = ops.mean(hidden_states.pow(2), -1, keepdim=True) + hidden_states = hidden_states * ops.rsqrt(variance + self.variance_epsilon) + return self.weight * hidden_states.to(input_dtype) - @mindspore.jit - def _update_causal_mask( - self, - attention_mask: mindspore.Tensor, - input_tensor: mindspore.Tensor, - cache_position: mindspore.Tensor, - past_key_values: Cache, - output_attentions: bool, - ): +... + +@mindspore.jit +def _update_causal_mask( + self, + attention_mask: mindspore.Tensor, + input_tensor: mindspore.Tensor, + cache_position: mindspore.Tensor, + past_key_values: Cache, + output_attentions: bool, +): ``` 最后JIT优化牺牲了一些prefill的时延,但是对decode的时延带来了很大的提升,所以最后总分也提升到了280+。