diff --git a/.github/workflows/bangc_ci.yaml b/.github/workflows/bangc_ci.yaml index 89ce5f7fd..cc2b2ed8d 100644 --- a/.github/workflows/bangc_ci.yaml +++ b/.github/workflows/bangc_ci.yaml @@ -47,35 +47,12 @@ jobs: runner: [mlu370-m8] mlu_ops_version : [v0.7.1] cntoolkit_version : [cntoolkit3.5.0] - runs-on: ${{matrix.runner}} + runs-on: [self-hosted] steps: - uses: actions/checkout@v3 with: submodules: 'true' - - name: bangc_lint_check + - name: run_bangc_ops_ci run: > - docker run --rm -v $(pwd):/work -w /work docker-user.extrotec.com:30080/mlu-ops/mluops_ci:v0.2-x86_64-ubuntu16.04-BANGPy - ./tools/pre-commit origin/master - - - name: build_bangc_ops - run: > - docker run --rm -v $(pwd):/work -w /work docker-user.extrotec.com:30080/mlu-ops/mluops_ci:${{matrix.mlu_ops_version}}-devel-x86_64-ubuntu18.04-${{matrix.cntoolkit_version}} - ./build.sh --sub_module=bangc - - - name: bangc_ops_release_temp_cases - run: > - docker run --rm --device /dev/cambricon_ctl --device /dev/cambricon_dev0 --device /dev/commu0 - -v /testdata:/testdata -v $(pwd):/work -w /work docker-user.extrotec.com:30080/mlu-ops/mluops_ci:${{matrix.mlu_ops_version}}-devel-x86_64-ubuntu18.04-${{matrix.cntoolkit_version}} - ./test.sh --sub_module=bangc --cases_dir=/testdata/release_temp/default_platform - - - name: test_bangc_ops_release_temp_370_cases - if: matrix.runner == 'mlu370-m8' - run: > - docker run --rm --device /dev/cambricon_ctl --device /dev/cambricon_dev0 --device /dev/commu0 - -v /testdata:/testdata -v $(pwd):/work -w /work docker-user.extrotec.com:30080/mlu-ops/mluops_ci:${{matrix.mlu_ops_version}}-devel-x86_64-ubuntu18.04-${{matrix.cntoolkit_version}} - ./test.sh --sub_module=bangc --cases_dir=/testdata/release_temp/370 - - - name: clean - run: | - rm -rf bangc-ops/build + bash ci.sh diff --git a/bangc-ops/kernels/generate_proposals_v2/generate_proposals_v2_union1.mlu b/bangc-ops/kernels/generate_proposals_v2/generate_proposals_v2_union1.mlu index ef7c29080..2a4c9dcf3 100644 --- a/bangc-ops/kernels/generate_proposals_v2/generate_proposals_v2_union1.mlu +++ b/bangc-ops/kernels/generate_proposals_v2/generate_proposals_v2_union1.mlu @@ -748,14 +748,14 @@ __mlu_func__ void removeSmallBox(T *proposal_scores, T *proposal_boxes, return; } // collect and store box and scores - __bang_collect(proposal_boxes, proposal_boxes, mask_tmp2, align_count); - __bang_collect(proposal_boxes + 1 * input_stride, + __bang_filter(proposal_boxes, proposal_boxes, mask_tmp2, align_count); + __bang_filter(proposal_boxes + 1 * input_stride, proposal_boxes + 1 * input_stride, mask_tmp2, align_count); - __bang_collect(proposal_boxes + 2 * input_stride, + __bang_filter(proposal_boxes + 2 * input_stride, proposal_boxes + 2 * input_stride, mask_tmp2, align_count); - __bang_collect(proposal_boxes + 3 * input_stride, + __bang_filter(proposal_boxes + 3 * input_stride, proposal_boxes + 3 * input_stride, mask_tmp2, align_count); - __bang_collect(proposal_scores, proposal_scores, mask_tmp2, align_count); + __bang_filter(proposal_scores, proposal_scores, mask_tmp2, align_count); } template @@ -867,38 +867,38 @@ __mlu_func__ void createAndRemoveBox( __bang_ge_scalar(ge_mask, scores, k_score, actual_num_align); count = __bang_count(ge_mask, actual_num_align); if (count != 0 && count != actual_num && actual_num != 1) { - __bang_collect(scores, scores, ge_mask, actual_num_align); + __bang_filter(scores, scores, ge_mask, actual_num_align); - __bang_collect(bbox_deltals, bbox_deltals, ge_mask, actual_num_align); - __bang_collect(bbox_deltals + 1 * actual_num_align, + __bang_filter(bbox_deltals, bbox_deltals, ge_mask, actual_num_align); + __bang_filter(bbox_deltals + 1 * actual_num_align, bbox_deltals + 1 * actual_num_align, ge_mask, actual_num_align); - __bang_collect(bbox_deltals + 2 * actual_num_align, + __bang_filter(bbox_deltals + 2 * actual_num_align, bbox_deltals + 2 * actual_num_align, ge_mask, actual_num_align); - __bang_collect(bbox_deltals + 3 * actual_num_align, + __bang_filter(bbox_deltals + 3 * actual_num_align, bbox_deltals + 3 * actual_num_align, ge_mask, actual_num_align); - __bang_collect(anchors, anchors, ge_mask, actual_num_align); - __bang_collect(anchors + 1 * actual_num_align, + __bang_filter(anchors, anchors, ge_mask, actual_num_align); + __bang_filter(anchors + 1 * actual_num_align, anchors + 1 * actual_num_align, ge_mask, actual_num_align); - __bang_collect(anchors + 2 * actual_num_align, + __bang_filter(anchors + 2 * actual_num_align, anchors + 2 * actual_num_align, ge_mask, actual_num_align); - __bang_collect(anchors + 3 * actual_num_align, + __bang_filter(anchors + 3 * actual_num_align, anchors + 3 * actual_num_align, ge_mask, actual_num_align); - __bang_collect(variances, variances, ge_mask, actual_num_align); - __bang_collect(variances + 1 * actual_num_align, + __bang_filter(variances, variances, ge_mask, actual_num_align); + __bang_filter(variances + 1 * actual_num_align, variances + 1 * actual_num_align, ge_mask, actual_num_align); - __bang_collect(variances + 2 * actual_num_align, + __bang_filter(variances + 2 * actual_num_align, variances + 2 * actual_num_align, ge_mask, actual_num_align); - __bang_collect(variances + 3 * actual_num_align, + __bang_filter(variances + 3 * actual_num_align, variances + 3 * actual_num_align, ge_mask, actual_num_align); } diff --git a/bangc-ops/kernels/get_indice_pairs/get_indice_pairs_block.mlu b/bangc-ops/kernels/get_indice_pairs/get_indice_pairs_block.mlu index 7fb1ad749..88e39540a 100644 --- a/bangc-ops/kernels/get_indice_pairs/get_indice_pairs_block.mlu +++ b/bangc-ops/kernels/get_indice_pairs/get_indice_pairs_block.mlu @@ -284,7 +284,7 @@ __mlu_global__ void MLUBlockDefaultGetIndicePairKernel3( __bang_int322float_rn((float *)nram_aux, (int32_t *)nram_mask, load_l_num, 0); valid_l_num_now = __bang_count((float *)nram_aux, load_l_num); - __bang_collect((float *)nram_output, (float *)nram_input, + __bang_filter((float *)nram_output, (float *)nram_input, (float *)nram_aux, load_l_num); int32_t *store_valid_ptr = (int32_t *)indice_pair + store_offset * len_l + core_offset_l_valid; diff --git a/bangc-ops/kernels/ms_deform_attn_backward/ms_deform_attn_backward_fast_union1.mlu b/bangc-ops/kernels/ms_deform_attn_backward/ms_deform_attn_backward_fast_union1.mlu index 41c1a94dd..1e2d13eab 100644 --- a/bangc-ops/kernels/ms_deform_attn_backward/ms_deform_attn_backward_fast_union1.mlu +++ b/bangc-ops/kernels/ms_deform_attn_backward/ms_deform_attn_backward_fast_union1.mlu @@ -328,13 +328,13 @@ __mlu_func__ void backwardStageTwoLoop( int32_t all_valid_count = __bang_sum(cond_all_valid, nq_nl_np); int32_t* dst_offset = (int32_t*)offset_zero_nram_stg2; for (int i = 0; i < 4; i++) { - __bang_collect((T*)dst_offset + i * nq_nl_np, + __bang_filter((T*)dst_offset + i * nq_nl_np, (T*)offset_nram + i * nq_nl_np, cond_all_valid, nq_nl_np); } int32_t* src_offset = (int32_t*)inter_grad; int32_t* stride_4_2 = dst_offset + 3 * nq_nl_np; int32_t* stride_1_2 = dst_offset; - __bang_collect((T*)src_offset, (T*)seq_nram, cond_all_valid, nq_nl_np); + __bang_filter((T*)src_offset, (T*)seq_nram, cond_all_valid, nq_nl_np); __bang_mul_scalar(src_offset, src_offset, channels * sizeof(T), nq_nl_np); __bang_sub(stride_4_2, stride_4_2, dst_offset + nq_nl_np, nq_nl_np); __bang_sub(stride_1_2, stride_1_2, dst_offset + nq_nl_np, nq_nl_np); @@ -364,9 +364,9 @@ __mlu_func__ void backwardStageTwoLoop( int32_t* tmp_src_offset = (int32_t*)inter_grad; int32_t valid_count = __bang_sum(tmp_cond, nq_nl_np); if (valid_count > 0) { - __bang_collect((T*)tmp_dst_offset, (T*)tmp_dst_offset, tmp_cond, + __bang_filter((T*)tmp_dst_offset, (T*)tmp_dst_offset, tmp_cond, nq_nl_np); - __bang_collect((T*)tmp_src_offset, (T*)seq_nram, tmp_cond, nq_nl_np); + __bang_filter((T*)tmp_src_offset, (T*)seq_nram, tmp_cond, nq_nl_np); __bang_mul_scalar(tmp_src_offset, tmp_src_offset, channels * sizeof(T), valid_count); for (int p = 0; p < valid_count; p++) { diff --git a/bangc-ops/kernels/ms_deform_attn_forward/ms_deform_attn_utils.h b/bangc-ops/kernels/ms_deform_attn_forward/ms_deform_attn_utils.h index e4d1f5157..c906cbd36 100644 --- a/bangc-ops/kernels/ms_deform_attn_forward/ms_deform_attn_utils.h +++ b/bangc-ops/kernels/ms_deform_attn_forward/ms_deform_attn_utils.h @@ -164,8 +164,8 @@ __mlu_func__ void computePolationWeightOffsetCond( T* buf_y_ceil = buf_nram + 5 * total_points; //================================================================================================ int32_t total_coord_pad = PAD_UP(total_points * 2, BIT_COLLECT_PAD); - __bang_collect_bitindex(buf_x_nram, loc_nram, mask_x_nram, total_coord_pad); - __bang_collect_bitindex(buf_y_nram, loc_nram, mask_y_nram, total_coord_pad); + __bang_filter_bitindex(buf_x_nram, loc_nram, mask_x_nram, total_coord_pad); + __bang_filter_bitindex(buf_y_nram, loc_nram, mask_y_nram, total_coord_pad); // x = loc_x * spatial_w - 0.5; y = loc_y * spatial_h - 0.5; __bang_fusion(FUSION_FMS, buf_x_nram, buf_x_nram, spatial_w_bd_nram, (T)0.5, total_points, block_points); diff --git a/bangc-ops/kernels/ms_deform_attn_forward/msda_forward_fast_union1.mlu b/bangc-ops/kernels/ms_deform_attn_forward/msda_forward_fast_union1.mlu index 8397bb276..c5f2ddcc7 100644 --- a/bangc-ops/kernels/ms_deform_attn_forward/msda_forward_fast_union1.mlu +++ b/bangc-ops/kernels/ms_deform_attn_forward/msda_forward_fast_union1.mlu @@ -166,8 +166,8 @@ __mlu_func__ void getConditionCoordWeight( w_contain_inf = buf_nram[2 * total_points] > 0; //================================================================================================ int32_t total_coord_pad = PAD_UP(total_points * 2, BIT_COLLECT_PAD); - __bang_collect_bitindex(buf_x_nram, loc_nram, mask_x_nram, total_coord_pad); - __bang_collect_bitindex(buf_y_nram, loc_nram, mask_y_nram, total_coord_pad); + __bang_filter_bitindex(buf_x_nram, loc_nram, mask_x_nram, total_coord_pad); + __bang_filter_bitindex(buf_y_nram, loc_nram, mask_y_nram, total_coord_pad); // x = loc_x * spatial_w - 0.5; y = loc_y * spatial_h - 0.5; __bang_fusion(FUSION_FMS, buf_x_nram, buf_x_nram, spatial_w_bd_nram, (T)0.5, total_points, block_points); @@ -291,7 +291,7 @@ __mlu_func__ void getConditionCoordWeight( weight_attn_nram, 4 * total_points, total_points); } __bang_mul_scalar(buf_nram, weight_attn_nram, (T)1, total_points); - __bang_collect((float*)weight_attn_nram, (float*)buf_nram, + __bang_filter((float*)weight_attn_nram, (float*)buf_nram, cond_point_valid_nram, total_points); __bang_float2int32((int32_t*)cond_point_polation_nram, cond_point_polation_nram, total_points * 4, 0); @@ -301,15 +301,15 @@ __mlu_func__ void getConditionCoordWeight( __bang_band((char*)weight_polation_nram_tmp, (char*)weight_polation_nram, (char*)cond_point_polation_nram, total_points * 4 * sizeof(float)); - __bang_collect((float*)weight_polation_nram, (float*)weight_polation_nram_tmp, + __bang_filter((float*)weight_polation_nram, (float*)weight_polation_nram_tmp, cond_point_valid_nram, total_points); - __bang_collect((float*)weight_polation_nram + total_points, + __bang_filter((float*)weight_polation_nram + total_points, (float*)weight_polation_nram_tmp + total_points, cond_point_valid_nram, total_points); - __bang_collect((float*)weight_polation_nram + 2 * total_points, + __bang_filter((float*)weight_polation_nram + 2 * total_points, (float*)weight_polation_nram_tmp + 2 * total_points, cond_point_valid_nram, total_points); - __bang_collect((float*)weight_polation_nram + 3 * total_points, + __bang_filter((float*)weight_polation_nram + 3 * total_points, (float*)weight_polation_nram_tmp + 3 * total_points, cond_point_valid_nram, total_points); //================================================================================================ @@ -319,16 +319,16 @@ __mlu_func__ void getConditionCoordWeight( __bang_mul_scalar((int32_t*)cond_point_polation_nram_tmp, (int32_t*)cond_point_polation_nram, (int32_t)1, total_points * 4); - __bang_collect((float*)cond_point_polation_nram, + __bang_filter((float*)cond_point_polation_nram, (float*)cond_point_polation_nram_tmp, cond_point_valid_nram, total_points); - __bang_collect((float*)cond_point_polation_nram + total_points, + __bang_filter((float*)cond_point_polation_nram + total_points, (float*)cond_point_polation_nram_tmp + total_points, cond_point_valid_nram, total_points); - __bang_collect((float*)cond_point_polation_nram + 2 * total_points, + __bang_filter((float*)cond_point_polation_nram + 2 * total_points, (float*)cond_point_polation_nram_tmp + 2 * total_points, cond_point_valid_nram, total_points); - __bang_collect((float*)cond_point_polation_nram + 3 * total_points, + __bang_filter((float*)cond_point_polation_nram + 3 * total_points, (float*)cond_point_polation_nram_tmp + 3 * total_points, cond_point_valid_nram, total_points); } @@ -349,11 +349,11 @@ __mlu_func__ void getConditionCoordWeight( __bang_sub((int32_t*)data_offset_nram_tr_tmp, (int32_t*)data_offset_nram_tr_tmp, (int32_t*)data_offset_nram_tl_tmp, total_points); - __bang_collect((float*)data_offset_nram_tl, (float*)data_offset_nram_tl_tmp, + __bang_filter((float*)data_offset_nram_tl, (float*)data_offset_nram_tl_tmp, cond_point_valid_nram, total_points); - __bang_collect((float*)data_offset_nram_bl, (float*)data_offset_nram_bl_tmp, + __bang_filter((float*)data_offset_nram_bl, (float*)data_offset_nram_bl_tmp, cond_point_valid_nram, total_points); - __bang_collect((float*)data_offset_nram_tr, (float*)data_offset_nram_tr_tmp, + __bang_filter((float*)data_offset_nram_tr, (float*)data_offset_nram_tr_tmp, cond_point_valid_nram, total_points); } diff --git a/bangc-ops/kernels/ms_deform_attn_forward/msda_forward_small_channel_union1.mlu b/bangc-ops/kernels/ms_deform_attn_forward/msda_forward_small_channel_union1.mlu index 4c1e09857..954d44da6 100644 --- a/bangc-ops/kernels/ms_deform_attn_forward/msda_forward_small_channel_union1.mlu +++ b/bangc-ops/kernels/ms_deform_attn_forward/msda_forward_small_channel_union1.mlu @@ -168,15 +168,15 @@ __mlu_global__ void MLUKernelMsDeformAttnForwardSmallChannel( __sync(); // generate x and y coordinate vector // generate spatial_x and spatial_y spatial vector - __bang_collect((float *)coord_y, (float *)grid_ram, (float *)mask_ram, + __bang_filter((float *)coord_y, (float *)grid_ram, (float *)mask_ram, deal_num * 2); // y - __bang_collect((float *)spatial_x_temp, (float *)data_spatial_shapes_nram, + __bang_filter((float *)spatial_x_temp, (float *)data_spatial_shapes_nram, (float *)mask_ram, num_levels * 2); // spatial_x __bang_not((float *)mask_ram, (float *)mask_ram, deal_num * 2); - __bang_collect((float *)coord_x, (float *)grid_ram, (float *)mask_ram, + __bang_filter((float *)coord_x, (float *)grid_ram, (float *)mask_ram, deal_num * 2); // x - __bang_collect((float *)spatial_y_temp, (float *)data_spatial_shapes_nram, + __bang_filter((float *)spatial_y_temp, (float *)data_spatial_shapes_nram, (float *)mask_ram, num_levels * 2); // spatial_y for (int32_t i = 0; i < num_levels; i++) { diff --git a/bangc-ops/kernels/roiaware_pool3d/roiaware_pool3d_union1.mlu b/bangc-ops/kernels/roiaware_pool3d/roiaware_pool3d_union1.mlu index d42f779ea..68c432a39 100644 --- a/bangc-ops/kernels/roiaware_pool3d/roiaware_pool3d_union1.mlu +++ b/bangc-ops/kernels/roiaware_pool3d/roiaware_pool3d_union1.mlu @@ -281,7 +281,7 @@ __mlu_entry__ void MLUMultiKernelPtsIdxOfVoxels( __memset_nram((float *)fp_nram_pts_in_flag + load_pts_num, compute_pts_num - load_pts_num, (float)0.0); } - __bang_collect((float *)temp_buffer4, (float *)nram_pts_idx_seq, + __bang_filter((float *)temp_buffer4, (float *)nram_pts_idx_seq, (float *)fp_nram_pts_in_flag, compute_pts_num); int pts_num_in_cur_roi = (int)__bang_count((float *)fp_nram_pts_in_flag, compute_pts_num); diff --git a/bangc-ops/kernels/roipoint_pool3d/roipoint_pool3d_union1.mlu b/bangc-ops/kernels/roipoint_pool3d/roipoint_pool3d_union1.mlu index b80f40cc8..ab496b066 100644 --- a/bangc-ops/kernels/roipoint_pool3d/roipoint_pool3d_union1.mlu +++ b/bangc-ops/kernels/roipoint_pool3d/roipoint_pool3d_union1.mlu @@ -157,7 +157,7 @@ __mlu_func__ void computeStoreRoipointPool3d( NRAM2GDRAM, (3 + feature_in_len) * sizeof(T), sizeof(T), segnum); // copy y to pooled_features_gdram - __bang_collect((T *)auxiliary_d, (T *)points_y, (T *)pts_assign, + __bang_filter((T *)auxiliary_d, (T *)points_y, (T *)pts_assign, span_num_deal); __memcpy(pooled_features_gdram + (box_idx * sampled_pts_num + cnt[box_idx]) * @@ -167,7 +167,7 @@ __mlu_func__ void computeStoreRoipointPool3d( (3 + feature_in_len) * sizeof(T), sizeof(T), segnum); // copy z to pooled_features_gdram - __bang_collect((T *)auxiliary_e, (T *)points_z, (T *)pts_assign, + __bang_filter((T *)auxiliary_e, (T *)points_z, (T *)pts_assign, span_num_deal); __memcpy(pooled_features_gdram + (box_idx * sampled_pts_num + cnt[box_idx]) * @@ -180,7 +180,7 @@ __mlu_func__ void computeStoreRoipointPool3d( for (int c_idx = 0; c_idx < feature_in_len; c_idx++) { __memcpy(auxiliary_d, point_features + c_idx * pts_num * sizeof(T), span_num_deal * sizeof(T), GDRAM2NRAM); - __bang_collect((T *)auxiliary_e, (T *)auxiliary_d, (T *)pts_assign, + __bang_filter((T *)auxiliary_e, (T *)auxiliary_d, (T *)pts_assign, span_num_deal); __memcpy(pooled_features_gdram + (box_idx * sampled_pts_num + cnt[box_idx]) * @@ -264,7 +264,7 @@ __mlu_func__ void computeStoreLastBlockRoipointPool3d( (3 + feature_in_len) * sizeof(T), sizeof(T), segnum); // copy y to pooled_features_gdram - __bang_collect((T *)auxiliary_d, (T *)points_y, (T *)pts_assign, + __bang_filter((T *)auxiliary_d, (T *)points_y, (T *)pts_assign, span_num_deal); __memcpy(pooled_features_gdram + (box_idx * sampled_pts_num + cnt[box_idx]) * @@ -274,7 +274,7 @@ __mlu_func__ void computeStoreLastBlockRoipointPool3d( (3 + feature_in_len) * sizeof(T), sizeof(T), segnum); // copy z to pooled_features_gdram - __bang_collect((T *)auxiliary_e, (T *)points_z, (T *)pts_assign, + __bang_filter((T *)auxiliary_e, (T *)points_z, (T *)pts_assign, span_num_deal); __memcpy(pooled_features_gdram + (box_idx * sampled_pts_num + cnt[box_idx]) * @@ -287,7 +287,7 @@ __mlu_func__ void computeStoreLastBlockRoipointPool3d( for (int c_idx = 0; c_idx < feature_in_len; c_idx++) { __memcpy(auxiliary_d, point_features + c_idx * pts_num * sizeof(T), span_num_deal * sizeof(T), GDRAM2NRAM); - __bang_collect((T *)auxiliary_e, (T *)auxiliary_d, (T *)pts_assign, + __bang_filter((T *)auxiliary_e, (T *)auxiliary_d, (T *)pts_assign, span_num_deal); __memcpy(pooled_features_gdram + (box_idx * sampled_pts_num + cnt[box_idx]) * diff --git a/bangc-ops/kernels/roipoint_pool3d/roipoint_pool3d_union1_large_boxes_num.mlu b/bangc-ops/kernels/roipoint_pool3d/roipoint_pool3d_union1_large_boxes_num.mlu index 614ca29c4..74cc7788c 100644 --- a/bangc-ops/kernels/roipoint_pool3d/roipoint_pool3d_union1_large_boxes_num.mlu +++ b/bangc-ops/kernels/roipoint_pool3d/roipoint_pool3d_union1_large_boxes_num.mlu @@ -154,7 +154,7 @@ __mlu_func__ void computeStoreRoipointPool3d( NRAM2GDRAM, (3 + feature_in_len) * sizeof(T), sizeof(T), segnum); // copy y to pooled_features_gdram - __bang_collect((T *)auxiliary_d, (T *)points_y, (T *)pts_assign, + __bang_filter((T *)auxiliary_d, (T *)points_y, (T *)pts_assign, span_num_deal); __memcpy(pooled_features_gdram + (box_idx * sampled_pts_num + *cnt) * (3 + feature_in_len) * @@ -164,7 +164,7 @@ __mlu_func__ void computeStoreRoipointPool3d( (3 + feature_in_len) * sizeof(T), sizeof(T), segnum); // copy z to pooled_features_gdram - __bang_collect((T *)auxiliary_e, (T *)points_z, (T *)pts_assign, + __bang_filter((T *)auxiliary_e, (T *)points_z, (T *)pts_assign, span_num_deal); __memcpy(pooled_features_gdram + (box_idx * sampled_pts_num + *cnt) * (3 + feature_in_len) * @@ -177,7 +177,7 @@ __mlu_func__ void computeStoreRoipointPool3d( for (int c_idx = 0; c_idx < feature_in_len; c_idx++) { __memcpy(auxiliary_d, point_features + c_idx * pts_num * sizeof(T), span_num_deal * sizeof(T), GDRAM2NRAM); - __bang_collect((T *)auxiliary_e, (T *)auxiliary_d, (T *)pts_assign, + __bang_filter((T *)auxiliary_e, (T *)auxiliary_d, (T *)pts_assign, span_num_deal); __memcpy(pooled_features_gdram + (box_idx * sampled_pts_num + *cnt) * (3 + feature_in_len) * @@ -264,7 +264,7 @@ __mlu_func__ void computeStoreLastBlockRoipointPool3d( NRAM2GDRAM, (3 + feature_in_len) * sizeof(T), sizeof(T), segnum); // copy y to pooled_features_gdram - __bang_collect((T *)auxiliary_d, (T *)points_y, (T *)pts_assign, + __bang_filter((T *)auxiliary_d, (T *)points_y, (T *)pts_assign, span_num_deal); __memcpy(pooled_features_gdram + (box_idx * sampled_pts_num + *cnt) * (3 + feature_in_len) * @@ -274,7 +274,7 @@ __mlu_func__ void computeStoreLastBlockRoipointPool3d( (3 + feature_in_len) * sizeof(T), sizeof(T), segnum); // copy z to pooled_features_gdram - __bang_collect((T *)auxiliary_e, (T *)points_z, (T *)pts_assign, + __bang_filter((T *)auxiliary_e, (T *)points_z, (T *)pts_assign, span_num_deal); __memcpy(pooled_features_gdram + (box_idx * sampled_pts_num + *cnt) * (3 + feature_in_len) * @@ -287,7 +287,7 @@ __mlu_func__ void computeStoreLastBlockRoipointPool3d( for (int c_idx = 0; c_idx < feature_in_len; c_idx++) { __memcpy(auxiliary_d, point_features + c_idx * pts_num * sizeof(T), span_num_deal * sizeof(T), GDRAM2NRAM); - __bang_collect((T *)auxiliary_e, (T *)auxiliary_d, (T *)pts_assign, + __bang_filter((T *)auxiliary_e, (T *)auxiliary_d, (T *)pts_assign, span_num_deal); __memcpy(pooled_features_gdram + (box_idx * sampled_pts_num + *cnt) * (3 + feature_in_len) * diff --git a/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/nms/nms.cpp b/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/nms/nms.cpp index f5071ac98..e5b75bd51 100644 --- a/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/nms/nms.cpp +++ b/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/nms/nms.cpp @@ -427,7 +427,6 @@ void NmsExecutor::nms_detection_cpu( void NmsExecutor::cpuCompute() { assert(parser_->getInputNum() == 2); // assert(parser_->getOutputNum() == 1); - int max_output_boxes = parser_->getProtoNode()->nms_param().max_output_boxes(); float iou_thresh = parser_->getProtoNode()->nms_param().iou_threshold(); diff --git a/ci.sh b/ci.sh new file mode 100644 index 000000000..42c09116c --- /dev/null +++ b/ci.sh @@ -0,0 +1,92 @@ +# /bin/bash +# get PR id +set -e +PR_string=$(echo $GITHUB_REF | grep -Eo "/[0-9]*/") +pr_id=(${PR_string//// }) + +# generate time stamp +current=`date "+%Y-%m-%d %H:%M:%S"` +timeStamp=`date -d "$current" +%s` +currentTimeStamp=$((timeStamp*1000+10#`date "+%N"`/1000000)) + +# temporally set to mlu370 +card_type="MLU370-S4" + +# default repo name +repo_name="mlu-ops-dev" + +# repo ci root path +repo_root="/home/cambricon/${repo_name}_ci/" +if [ ! -d $repo_root ];then + mkdir $repo_root +fi + +# repo ci requests path +requests_path="$repo_root/requests" +if [ ! -d $requests_path ];then + mkdir $requests_path +fi + +# gen name of this ci +request_name="${repo_name}_${pr_id}_${currentTimeStamp}_${card_type}" + +# gen file and dir for this request +request_root="$repo_root/$request_name/" +sub_logs_path="$request_root/sub_logs/" + + +# echo "${repo_root}" +# echo "${requests_path}" +# echo "${request_root}" + +if [ ! -d $request_root ];then + mkdir $request_root +fi + +if [ ! -d $sub_logs_path ];then + mkdir $sub_logs_path +fi + +echo "working" > "$request_root/status" +chmod o+w "$request_root/status" + +if [ ! -f "$request_root/log" ];then + touch "$request_root/log" +fi + +chmod o+w "$request_root/log" + +if [ ! -f "$request_root/log_list" ];then + touch "$request_root/log_list" +fi + +chmod o+w "$request_root/log_list" + +# gen request file. +echo "${repo_name},${pr_id},${currentTimeStamp},${card_type}" > "$requests_path/${request_name}" + +# change dir group for server and client, or when server/client try to delete request, ftp may raise error. +chgrp -R cambricon $request_root +chgrp -R cambricon $requests_path + +# start script +python3 file_guard.py "$request_root/status" "$request_root/log" & +python3 combine_log.py "$request_root/log" "$request_root/log_list" "$request_root/sub_logs" "$request_root/status" & + +wait + +# status=$(cat ${request_root}/status) + +status=$( head -n +1 ${request_root}/status ) + +set +e + +if [ "$status" != "success" ];then + return_info=$( sed -n 2p ${request_root}/status ) + echo "${return_info}" + exit -1 +else + return_info=$( sed -n 2p ${request_root}/status ) + echo "${return_info}" + exit 0 +fi diff --git a/combine_log.py b/combine_log.py new file mode 100644 index 000000000..78a94319b --- /dev/null +++ b/combine_log.py @@ -0,0 +1,44 @@ +import time +import sys +import os +# Get info. +# output_path: the target file that you want to combine sub log with. +# list_path: the list of sub log name. When it is updated, the correspondding file will be add to output tail. +# list_dir_path: the dir path where sub logs stored. +# status_path: the path of status file. When status file is written to "success" or "fail", exit script. + +output_path = sys.argv[1] +list_path = sys.argv[2] +list_dir_path = sys.argv[3] +status_path = sys.argv[4] + +if __name__ == '__main__': + # list_pos stores the last position that pointer of list file pointed to. + list_pos = 0 + while True: + list_file = open(list_path, 'r') + list_file.seek(list_pos) + # read all lines starting from list_pos. + items = list_file.readlines() + # update list_pos + list_pos = list_file.tell() + # if read any line + if items is not None: + items.sort() + for item in items: + sub_path = item.strip() + if sub_path is not "": + file_name = list_dir_path + '/' + sub_path + # while True: + if os.path.exists(file_name): + os.system('cat ' + file_name + ' >> ' + output_path) + # break + # check status_file, when read "success" or "fail" exit cycle, or else, sleep some seconds and start from beginning. + status_file = open(status_path) + status = status_file.readline().strip() + status_file.close() + if "fail" in status or "success" in status: + break + else: + time.sleep(2) + diff --git a/docs/bangc-docs/design_docs/generate_proposals_v2/generate_proposals_v2.md b/docs/bangc-docs/design_docs/generate_proposals_v2/generate_proposals_v2.md index 6bddb754c..5686c0a2f 100644 --- a/docs/bangc-docs/design_docs/generate_proposals_v2/generate_proposals_v2.md +++ b/docs/bangc-docs/design_docs/generate_proposals_v2/generate_proposals_v2.md @@ -423,11 +423,11 @@ int rem_num = per_core_num % seg_pad_1; 2. 单次循环load完数据后,使用bang_ge 获取 nram 上 scores 大于等于 k_score 的mask; -3. 使用 bang_collect,根据 第2步的mask, 把 mask 等于1位置的`scores`、`anchors`、`bbox_deltas`、`variances`值collect到一起, `scores` 需要collect一次, `anchors`、`bbox_deltas`、`variances`需要对四个值分别进行collect, 每次循环 collect 数量为seg_pad_1; +3. 使用 bang_filter,根据 第2步的mask, 把 mask 等于1位置的`scores`、`anchors`、`bbox_deltas`、`variances`值collect到一起, `scores` 需要collect一次, `anchors`、`bbox_deltas`、`variances`需要对四个值分别进行collect, 每次循环 collect 数量为seg_pad_1; 4. 用 collect 后的数据,根据 createbox 计算过程创建 proposals; -5. 根据 removeSmallBox 的计算方法,生成新的 mask2, 用 bang_collect 操作移除proposal中宽和高小于 min_size 的 proposal,把有效的 proposals 集中到一起,此时,单次循环内的计算过程结束; +5. 根据 removeSmallBox 的计算方法,生成新的 mask2, 用 bang_filter 操作移除proposal中宽和高小于 min_size 的 proposal,把有效的 proposals 集中到一起,此时,单次循环内的计算过程结束; 6. 把单次循环时创建好 proposal 数据,保存到 workspace 空间内, 若单 core 内数据未处理完,回到第 2 步;
@@ -483,7 +483,7 @@ proposals[3] = Max(Min(oymax, im_shape[0] - offset), 0.); 3. 用bang_and 计算 mask_w 和 mask_h 的与的结果 mask_res; -4. 根据mask_res,用bang_collect,把proposals中对应位置的值取出集中到一起; +4. 根据mask_res,用bang_filter,把proposals中对应位置的值取出集中到一起; 5. 把 collect 后的 proposal 数据存放到 workspace 上, 先在 workspace 上开辟 coreNum 大小的空间,每个 core 在对应 taskId 位置存放自己当前的 collect 数量,sync_all 同步后,每个 core 上计算自己存放在 workspace 上的数据偏移,按照这个偏移往 workspace 上存放 collect 后的数值(由于3.1.3 nms筛选中会对乱序数据进行排序操作,本节中存放在 workspace 中的数据相对顺序与 input tensors 可能会不同,但不影响最终算子结果)。 @@ -764,11 +764,11 @@ __mlu_func__ void removeSmallBox(T * boxes, T *scores, const T *im_size, *count = __bang_count(mask_result, deal_size); // collect and store box and scores - __bang_collect(box + 0 * deal_size, box + 0 * deal_size, mask_result, deal_size); - __bang_collect(box + 1 * deal_size, box + 1 * deal_size, mask_result, deal_size); - __bang_collect(box + 2 * deal_size, box + 2 * deal_size, mask_result, deal_size); - __bang_collect(box + 3 * deal_size, box + 3 * deal_size, mask_result, deal_size); - __bang_collect(scores, scores, mask_result, deal_size); + __bang_filter(box + 0 * deal_size, box + 0 * deal_size, mask_result, deal_size); + __bang_filter(box + 1 * deal_size, box + 1 * deal_size, mask_result, deal_size); + __bang_filter(box + 2 * deal_size, box + 2 * deal_size, mask_result, deal_size); + __bang_filter(box + 3 * deal_size, box + 3 * deal_size, mask_result, deal_size); + __bang_filter(scores, scores, mask_result, deal_size); } ``` diff --git a/docs/bangc-docs/design_docs/roiaware_pool3d_forward/roiaware_pool3d_forward.md b/docs/bangc-docs/design_docs/roiaware_pool3d_forward/roiaware_pool3d_forward.md index c689e808e..e1467f051 100644 --- a/docs/bangc-docs/design_docs/roiaware_pool3d_forward/roiaware_pool3d_forward.md +++ b/docs/bangc-docs/design_docs/roiaware_pool3d_forward/roiaware_pool3d_forward.md @@ -360,7 +360,7 @@ __device__ roiaware_maxpool3d(int max_pts_each_voxel, int out_x, int out_y, int nram pooled_feature_cur_voxels[channels]; pooled_features_cur_voxels = pooled_features + voxels_index * channels; pts_idx_of_voxels_cur_voxels = pts_idx_of_voxels + voxels_index * max_pts_each_voxel; - __bang_collect(pooled_feature_cur_voxels, pts_idx_of_voxels_cur_voxels); + __bang_filter(pooled_feature_cur_voxels, pts_idx_of_voxels_cur_voxels); nram nram_pts_feature_cur_voxels[max_pts_each_voxel]; nram nram_pooled_feature_cur_voxels[max_pts_each_voxel]; @@ -386,7 +386,7 @@ __device__ roiaware_avgpool3d(int max_pts_each_voxel, int out_x, int out_y, int nram pooled_feature_cur_voxels[channels]; pooled_features_cur_voxels = pooled_features + voxels_index * channels; pts_idx_of_voxels_cur_voxels = pts_idx_of_voxels + voxels_index * max_pts_each_voxel; - __bang_collect(pooled_feature_cur_voxels, pts_idx_of_voxels_cur_voxels); + __bang_filter(pooled_feature_cur_voxels, pts_idx_of_voxels_cur_voxels); nram nram_pts_feature_cur_voxels[max_pts_each_voxel]; nram nram_pooled_feature_cur_voxels[max_pts_each_voxel]; diff --git a/file_guard.py b/file_guard.py new file mode 100644 index 000000000..bbbc6a664 --- /dev/null +++ b/file_guard.py @@ -0,0 +1,30 @@ +import time +import sys +import os +guard_status_file = sys.argv[1] +guard_log_file = sys.argv[2] + +if __name__ == '__main__': + # where stores the last position that pointer pointed to. + where= 0 + while True: + file = open(guard_log_file, "r") + file.seek(where) + # if read any lines, call system echo to print each line. + for line in file.readlines(): + new_line = line.strip().replace("\'", "_").replace("\"", "_") + os.system('echo ' + "'%s'" % new_line) + # update where + where = file.tell() + file.close() + # check status, end process when read "success" or "fail" + status_file = open(guard_status_file, "r") + line = status_file.readline().strip() + status_file.close() + if "success" in line: + break + elif "fail" in line: + exit(-1) + # sleep for a while + time.sleep(2) +