Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 3 additions & 26 deletions .github/workflows/bangc_ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -47,35 +47,12 @@ jobs:
runner: [mlu370-m8]
mlu_ops_version : [v0.7.1]
cntoolkit_version : [cntoolkit3.5.0]
runs-on: ${{matrix.runner}}
runs-on: [self-hosted]
steps:
- uses: actions/checkout@v3
with:
submodules: 'true'

- name: bangc_lint_check
- name: run_bangc_ops_ci
run: >
docker run --rm -v $(pwd):/work -w /work docker-user.extrotec.com:30080/mlu-ops/mluops_ci:v0.2-x86_64-ubuntu16.04-BANGPy
./tools/pre-commit origin/master

- name: build_bangc_ops
run: >
docker run --rm -v $(pwd):/work -w /work docker-user.extrotec.com:30080/mlu-ops/mluops_ci:${{matrix.mlu_ops_version}}-devel-x86_64-ubuntu18.04-${{matrix.cntoolkit_version}}
./build.sh --sub_module=bangc

- name: bangc_ops_release_temp_cases
run: >
docker run --rm --device /dev/cambricon_ctl --device /dev/cambricon_dev0 --device /dev/commu0
-v /testdata:/testdata -v $(pwd):/work -w /work docker-user.extrotec.com:30080/mlu-ops/mluops_ci:${{matrix.mlu_ops_version}}-devel-x86_64-ubuntu18.04-${{matrix.cntoolkit_version}}
./test.sh --sub_module=bangc --cases_dir=/testdata/release_temp/default_platform

- name: test_bangc_ops_release_temp_370_cases
if: matrix.runner == 'mlu370-m8'
run: >
docker run --rm --device /dev/cambricon_ctl --device /dev/cambricon_dev0 --device /dev/commu0
-v /testdata:/testdata -v $(pwd):/work -w /work docker-user.extrotec.com:30080/mlu-ops/mluops_ci:${{matrix.mlu_ops_version}}-devel-x86_64-ubuntu18.04-${{matrix.cntoolkit_version}}
./test.sh --sub_module=bangc --cases_dir=/testdata/release_temp/370

- name: clean
run: |
rm -rf bangc-ops/build
bash ci.sh
Original file line number Diff line number Diff line change
Expand Up @@ -748,14 +748,14 @@ __mlu_func__ void removeSmallBox(T *proposal_scores, T *proposal_boxes,
return;
}
// collect and store box and scores
__bang_collect(proposal_boxes, proposal_boxes, mask_tmp2, align_count);
__bang_collect(proposal_boxes + 1 * input_stride,
__bang_filter(proposal_boxes, proposal_boxes, mask_tmp2, align_count);
__bang_filter(proposal_boxes + 1 * input_stride,
proposal_boxes + 1 * input_stride, mask_tmp2, align_count);
__bang_collect(proposal_boxes + 2 * input_stride,
__bang_filter(proposal_boxes + 2 * input_stride,
proposal_boxes + 2 * input_stride, mask_tmp2, align_count);
__bang_collect(proposal_boxes + 3 * input_stride,
__bang_filter(proposal_boxes + 3 * input_stride,
proposal_boxes + 3 * input_stride, mask_tmp2, align_count);
__bang_collect(proposal_scores, proposal_scores, mask_tmp2, align_count);
__bang_filter(proposal_scores, proposal_scores, mask_tmp2, align_count);
}

template <typename T>
Expand Down Expand Up @@ -867,38 +867,38 @@ __mlu_func__ void createAndRemoveBox(
__bang_ge_scalar(ge_mask, scores, k_score, actual_num_align);
count = __bang_count(ge_mask, actual_num_align);
if (count != 0 && count != actual_num && actual_num != 1) {
__bang_collect(scores, scores, ge_mask, actual_num_align);
__bang_filter(scores, scores, ge_mask, actual_num_align);

__bang_collect(bbox_deltals, bbox_deltals, ge_mask, actual_num_align);
__bang_collect(bbox_deltals + 1 * actual_num_align,
__bang_filter(bbox_deltals, bbox_deltals, ge_mask, actual_num_align);
__bang_filter(bbox_deltals + 1 * actual_num_align,
bbox_deltals + 1 * actual_num_align, ge_mask,
actual_num_align);
__bang_collect(bbox_deltals + 2 * actual_num_align,
__bang_filter(bbox_deltals + 2 * actual_num_align,
bbox_deltals + 2 * actual_num_align, ge_mask,
actual_num_align);
__bang_collect(bbox_deltals + 3 * actual_num_align,
__bang_filter(bbox_deltals + 3 * actual_num_align,
bbox_deltals + 3 * actual_num_align, ge_mask,
actual_num_align);

__bang_collect(anchors, anchors, ge_mask, actual_num_align);
__bang_collect(anchors + 1 * actual_num_align,
__bang_filter(anchors, anchors, ge_mask, actual_num_align);
__bang_filter(anchors + 1 * actual_num_align,
anchors + 1 * actual_num_align, ge_mask,
actual_num_align);
__bang_collect(anchors + 2 * actual_num_align,
__bang_filter(anchors + 2 * actual_num_align,
anchors + 2 * actual_num_align, ge_mask,
actual_num_align);
__bang_collect(anchors + 3 * actual_num_align,
__bang_filter(anchors + 3 * actual_num_align,
anchors + 3 * actual_num_align, ge_mask,
actual_num_align);

__bang_collect(variances, variances, ge_mask, actual_num_align);
__bang_collect(variances + 1 * actual_num_align,
__bang_filter(variances, variances, ge_mask, actual_num_align);
__bang_filter(variances + 1 * actual_num_align,
variances + 1 * actual_num_align, ge_mask,
actual_num_align);
__bang_collect(variances + 2 * actual_num_align,
__bang_filter(variances + 2 * actual_num_align,
variances + 2 * actual_num_align, ge_mask,
actual_num_align);
__bang_collect(variances + 3 * actual_num_align,
__bang_filter(variances + 3 * actual_num_align,
variances + 3 * actual_num_align, ge_mask,
actual_num_align);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -284,7 +284,7 @@ __mlu_global__ void MLUBlockDefaultGetIndicePairKernel3(
__bang_int322float_rn((float *)nram_aux, (int32_t *)nram_mask, load_l_num,
0);
valid_l_num_now = __bang_count((float *)nram_aux, load_l_num);
__bang_collect((float *)nram_output, (float *)nram_input,
__bang_filter((float *)nram_output, (float *)nram_input,
(float *)nram_aux, load_l_num);
int32_t *store_valid_ptr =
(int32_t *)indice_pair + store_offset * len_l + core_offset_l_valid;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -328,13 +328,13 @@ __mlu_func__ void backwardStageTwoLoop(
int32_t all_valid_count = __bang_sum(cond_all_valid, nq_nl_np);
int32_t* dst_offset = (int32_t*)offset_zero_nram_stg2;
for (int i = 0; i < 4; i++) {
__bang_collect((T*)dst_offset + i * nq_nl_np,
__bang_filter((T*)dst_offset + i * nq_nl_np,
(T*)offset_nram + i * nq_nl_np, cond_all_valid, nq_nl_np);
}
int32_t* src_offset = (int32_t*)inter_grad;
int32_t* stride_4_2 = dst_offset + 3 * nq_nl_np;
int32_t* stride_1_2 = dst_offset;
__bang_collect((T*)src_offset, (T*)seq_nram, cond_all_valid, nq_nl_np);
__bang_filter((T*)src_offset, (T*)seq_nram, cond_all_valid, nq_nl_np);
__bang_mul_scalar(src_offset, src_offset, channels * sizeof(T), nq_nl_np);
__bang_sub(stride_4_2, stride_4_2, dst_offset + nq_nl_np, nq_nl_np);
__bang_sub(stride_1_2, stride_1_2, dst_offset + nq_nl_np, nq_nl_np);
Expand Down Expand Up @@ -364,9 +364,9 @@ __mlu_func__ void backwardStageTwoLoop(
int32_t* tmp_src_offset = (int32_t*)inter_grad;
int32_t valid_count = __bang_sum(tmp_cond, nq_nl_np);
if (valid_count > 0) {
__bang_collect((T*)tmp_dst_offset, (T*)tmp_dst_offset, tmp_cond,
__bang_filter((T*)tmp_dst_offset, (T*)tmp_dst_offset, tmp_cond,
nq_nl_np);
__bang_collect((T*)tmp_src_offset, (T*)seq_nram, tmp_cond, nq_nl_np);
__bang_filter((T*)tmp_src_offset, (T*)seq_nram, tmp_cond, nq_nl_np);
__bang_mul_scalar(tmp_src_offset, tmp_src_offset, channels * sizeof(T),
valid_count);
for (int p = 0; p < valid_count; p++) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -164,8 +164,8 @@ __mlu_func__ void computePolationWeightOffsetCond(
T* buf_y_ceil = buf_nram + 5 * total_points;
//================================================================================================
int32_t total_coord_pad = PAD_UP(total_points * 2, BIT_COLLECT_PAD);
__bang_collect_bitindex(buf_x_nram, loc_nram, mask_x_nram, total_coord_pad);
__bang_collect_bitindex(buf_y_nram, loc_nram, mask_y_nram, total_coord_pad);
__bang_filter_bitindex(buf_x_nram, loc_nram, mask_x_nram, total_coord_pad);
__bang_filter_bitindex(buf_y_nram, loc_nram, mask_y_nram, total_coord_pad);
// x = loc_x * spatial_w - 0.5; y = loc_y * spatial_h - 0.5;
__bang_fusion(FUSION_FMS, buf_x_nram, buf_x_nram, spatial_w_bd_nram, (T)0.5,
total_points, block_points);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -166,8 +166,8 @@ __mlu_func__ void getConditionCoordWeight(
w_contain_inf = buf_nram[2 * total_points] > 0;
//================================================================================================
int32_t total_coord_pad = PAD_UP(total_points * 2, BIT_COLLECT_PAD);
__bang_collect_bitindex(buf_x_nram, loc_nram, mask_x_nram, total_coord_pad);
__bang_collect_bitindex(buf_y_nram, loc_nram, mask_y_nram, total_coord_pad);
__bang_filter_bitindex(buf_x_nram, loc_nram, mask_x_nram, total_coord_pad);
__bang_filter_bitindex(buf_y_nram, loc_nram, mask_y_nram, total_coord_pad);
// x = loc_x * spatial_w - 0.5; y = loc_y * spatial_h - 0.5;
__bang_fusion(FUSION_FMS, buf_x_nram, buf_x_nram, spatial_w_bd_nram, (T)0.5,
total_points, block_points);
Expand Down Expand Up @@ -291,7 +291,7 @@ __mlu_func__ void getConditionCoordWeight(
weight_attn_nram, 4 * total_points, total_points);
}
__bang_mul_scalar(buf_nram, weight_attn_nram, (T)1, total_points);
__bang_collect((float*)weight_attn_nram, (float*)buf_nram,
__bang_filter((float*)weight_attn_nram, (float*)buf_nram,
cond_point_valid_nram, total_points);
__bang_float2int32((int32_t*)cond_point_polation_nram,
cond_point_polation_nram, total_points * 4, 0);
Expand All @@ -301,15 +301,15 @@ __mlu_func__ void getConditionCoordWeight(
__bang_band((char*)weight_polation_nram_tmp, (char*)weight_polation_nram,
(char*)cond_point_polation_nram,
total_points * 4 * sizeof(float));
__bang_collect((float*)weight_polation_nram, (float*)weight_polation_nram_tmp,
__bang_filter((float*)weight_polation_nram, (float*)weight_polation_nram_tmp,
cond_point_valid_nram, total_points);
__bang_collect((float*)weight_polation_nram + total_points,
__bang_filter((float*)weight_polation_nram + total_points,
(float*)weight_polation_nram_tmp + total_points,
cond_point_valid_nram, total_points);
__bang_collect((float*)weight_polation_nram + 2 * total_points,
__bang_filter((float*)weight_polation_nram + 2 * total_points,
(float*)weight_polation_nram_tmp + 2 * total_points,
cond_point_valid_nram, total_points);
__bang_collect((float*)weight_polation_nram + 3 * total_points,
__bang_filter((float*)weight_polation_nram + 3 * total_points,
(float*)weight_polation_nram_tmp + 3 * total_points,
cond_point_valid_nram, total_points);
//================================================================================================
Expand All @@ -319,16 +319,16 @@ __mlu_func__ void getConditionCoordWeight(
__bang_mul_scalar((int32_t*)cond_point_polation_nram_tmp,
(int32_t*)cond_point_polation_nram, (int32_t)1,
total_points * 4);
__bang_collect((float*)cond_point_polation_nram,
__bang_filter((float*)cond_point_polation_nram,
(float*)cond_point_polation_nram_tmp, cond_point_valid_nram,
total_points);
__bang_collect((float*)cond_point_polation_nram + total_points,
__bang_filter((float*)cond_point_polation_nram + total_points,
(float*)cond_point_polation_nram_tmp + total_points,
cond_point_valid_nram, total_points);
__bang_collect((float*)cond_point_polation_nram + 2 * total_points,
__bang_filter((float*)cond_point_polation_nram + 2 * total_points,
(float*)cond_point_polation_nram_tmp + 2 * total_points,
cond_point_valid_nram, total_points);
__bang_collect((float*)cond_point_polation_nram + 3 * total_points,
__bang_filter((float*)cond_point_polation_nram + 3 * total_points,
(float*)cond_point_polation_nram_tmp + 3 * total_points,
cond_point_valid_nram, total_points);
}
Expand All @@ -349,11 +349,11 @@ __mlu_func__ void getConditionCoordWeight(
__bang_sub((int32_t*)data_offset_nram_tr_tmp,
(int32_t*)data_offset_nram_tr_tmp,
(int32_t*)data_offset_nram_tl_tmp, total_points);
__bang_collect((float*)data_offset_nram_tl, (float*)data_offset_nram_tl_tmp,
__bang_filter((float*)data_offset_nram_tl, (float*)data_offset_nram_tl_tmp,
cond_point_valid_nram, total_points);
__bang_collect((float*)data_offset_nram_bl, (float*)data_offset_nram_bl_tmp,
__bang_filter((float*)data_offset_nram_bl, (float*)data_offset_nram_bl_tmp,
cond_point_valid_nram, total_points);
__bang_collect((float*)data_offset_nram_tr, (float*)data_offset_nram_tr_tmp,
__bang_filter((float*)data_offset_nram_tr, (float*)data_offset_nram_tr_tmp,
cond_point_valid_nram, total_points);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -168,15 +168,15 @@ __mlu_global__ void MLUKernelMsDeformAttnForwardSmallChannel(
__sync();
// generate x and y coordinate vector
// generate spatial_x and spatial_y spatial vector
__bang_collect((float *)coord_y, (float *)grid_ram, (float *)mask_ram,
__bang_filter((float *)coord_y, (float *)grid_ram, (float *)mask_ram,
deal_num * 2); // y
__bang_collect((float *)spatial_x_temp, (float *)data_spatial_shapes_nram,
__bang_filter((float *)spatial_x_temp, (float *)data_spatial_shapes_nram,
(float *)mask_ram,
num_levels * 2); // spatial_x
__bang_not((float *)mask_ram, (float *)mask_ram, deal_num * 2);
__bang_collect((float *)coord_x, (float *)grid_ram, (float *)mask_ram,
__bang_filter((float *)coord_x, (float *)grid_ram, (float *)mask_ram,
deal_num * 2); // x
__bang_collect((float *)spatial_y_temp, (float *)data_spatial_shapes_nram,
__bang_filter((float *)spatial_y_temp, (float *)data_spatial_shapes_nram,
(float *)mask_ram,
num_levels * 2); // spatial_y
for (int32_t i = 0; i < num_levels; i++) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -281,7 +281,7 @@ __mlu_entry__ void MLUMultiKernelPtsIdxOfVoxels(
__memset_nram((float *)fp_nram_pts_in_flag + load_pts_num,
compute_pts_num - load_pts_num, (float)0.0);
}
__bang_collect((float *)temp_buffer4, (float *)nram_pts_idx_seq,
__bang_filter((float *)temp_buffer4, (float *)nram_pts_idx_seq,
(float *)fp_nram_pts_in_flag, compute_pts_num);
int pts_num_in_cur_roi =
(int)__bang_count((float *)fp_nram_pts_in_flag, compute_pts_num);
Expand Down
12 changes: 6 additions & 6 deletions bangc-ops/kernels/roipoint_pool3d/roipoint_pool3d_union1.mlu
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,7 @@ __mlu_func__ void computeStoreRoipointPool3d(
NRAM2GDRAM, (3 + feature_in_len) * sizeof(T), sizeof(T), segnum);

// copy y to pooled_features_gdram
__bang_collect((T *)auxiliary_d, (T *)points_y, (T *)pts_assign,
__bang_filter((T *)auxiliary_d, (T *)points_y, (T *)pts_assign,
span_num_deal);
__memcpy(pooled_features_gdram +
(box_idx * sampled_pts_num + cnt[box_idx]) *
Expand All @@ -167,7 +167,7 @@ __mlu_func__ void computeStoreRoipointPool3d(
(3 + feature_in_len) * sizeof(T), sizeof(T), segnum);

// copy z to pooled_features_gdram
__bang_collect((T *)auxiliary_e, (T *)points_z, (T *)pts_assign,
__bang_filter((T *)auxiliary_e, (T *)points_z, (T *)pts_assign,
span_num_deal);
__memcpy(pooled_features_gdram +
(box_idx * sampled_pts_num + cnt[box_idx]) *
Expand All @@ -180,7 +180,7 @@ __mlu_func__ void computeStoreRoipointPool3d(
for (int c_idx = 0; c_idx < feature_in_len; c_idx++) {
__memcpy(auxiliary_d, point_features + c_idx * pts_num * sizeof(T),
span_num_deal * sizeof(T), GDRAM2NRAM);
__bang_collect((T *)auxiliary_e, (T *)auxiliary_d, (T *)pts_assign,
__bang_filter((T *)auxiliary_e, (T *)auxiliary_d, (T *)pts_assign,
span_num_deal);
__memcpy(pooled_features_gdram +
(box_idx * sampled_pts_num + cnt[box_idx]) *
Expand Down Expand Up @@ -264,7 +264,7 @@ __mlu_func__ void computeStoreLastBlockRoipointPool3d(
(3 + feature_in_len) * sizeof(T), sizeof(T), segnum);

// copy y to pooled_features_gdram
__bang_collect((T *)auxiliary_d, (T *)points_y, (T *)pts_assign,
__bang_filter((T *)auxiliary_d, (T *)points_y, (T *)pts_assign,
span_num_deal);
__memcpy(pooled_features_gdram +
(box_idx * sampled_pts_num + cnt[box_idx]) *
Expand All @@ -274,7 +274,7 @@ __mlu_func__ void computeStoreLastBlockRoipointPool3d(
(3 + feature_in_len) * sizeof(T), sizeof(T), segnum);

// copy z to pooled_features_gdram
__bang_collect((T *)auxiliary_e, (T *)points_z, (T *)pts_assign,
__bang_filter((T *)auxiliary_e, (T *)points_z, (T *)pts_assign,
span_num_deal);
__memcpy(pooled_features_gdram +
(box_idx * sampled_pts_num + cnt[box_idx]) *
Expand All @@ -287,7 +287,7 @@ __mlu_func__ void computeStoreLastBlockRoipointPool3d(
for (int c_idx = 0; c_idx < feature_in_len; c_idx++) {
__memcpy(auxiliary_d, point_features + c_idx * pts_num * sizeof(T),
span_num_deal * sizeof(T), GDRAM2NRAM);
__bang_collect((T *)auxiliary_e, (T *)auxiliary_d, (T *)pts_assign,
__bang_filter((T *)auxiliary_e, (T *)auxiliary_d, (T *)pts_assign,
span_num_deal);
__memcpy(pooled_features_gdram +
(box_idx * sampled_pts_num + cnt[box_idx]) *
Expand Down
Loading