From e7fc1cb41cc3d193454c0dd71c8e802e305bf0e3 Mon Sep 17 00:00:00 2001 From: Anmin Liu Date: Wed, 27 May 2026 23:21:25 +0800 Subject: [PATCH 1/2] feat(sparsity): add VecAttention sparse prefill for VLM Integrate VecAttention into AngelSlim as a sparse attention method for Vision-Language Models (Qwen2.5-VL). - Add vecattention subpackage under compressor/sparsity/ - Add vllm-flash-attention as git submodule for sparse_attn_func kernel - Add Triton kernels for MinP threshold selection and query pooling - Add run_vecattention.py tool for image/video inference --- .gitmodules | 3 + README.md | 6 + README_cn.md | 6 + angelslim/compressor/sparsity/__init__.py | 3 +- .../sparsity/vecattention/__init__.py | 17 + .../sparsity/vecattention/modules/__init__.py | 5 + .../sparsity/vecattention/modules/forward.py | 366 ++++++++ .../sparsity/vecattention/ops/__init__.py | 1 + ..._vecattention_kernels_best_eff_configs.pkl | Bin 0 -> 138 bytes ..._vecattention_kernels_best_eff_configs.pkl | Bin 0 -> 138 bytes .../vecattention/ops/vecattention_kernel.py | 783 ++++++++++++++++++ .../vecattention/ops/vllm-flash-attention | 1 + .../compressor/sparsity/vecattention/patch.py | 63 ++ .../sparsity/vecattention/vecattention.py | 53 ++ .../vecattention_configuration.py | 83 ++ tools/run_vecattention.py | 324 ++++++++ 16 files changed, 1713 insertions(+), 1 deletion(-) create mode 100644 .gitmodules create mode 100644 angelslim/compressor/sparsity/vecattention/__init__.py create mode 100644 angelslim/compressor/sparsity/vecattention/modules/__init__.py create mode 100644 angelslim/compressor/sparsity/vecattention/modules/forward.py create mode 100644 angelslim/compressor/sparsity/vecattention/ops/__init__.py create mode 100644 angelslim/compressor/sparsity/vecattention/ops/cache/dit_vecattention_kernels_best_eff_configs.pkl create mode 100644 angelslim/compressor/sparsity/vecattention/ops/cache/vlm_vecattention_kernels_best_eff_configs.pkl create mode 100644 angelslim/compressor/sparsity/vecattention/ops/vecattention_kernel.py create mode 160000 angelslim/compressor/sparsity/vecattention/ops/vllm-flash-attention create mode 100644 angelslim/compressor/sparsity/vecattention/patch.py create mode 100644 angelslim/compressor/sparsity/vecattention/vecattention.py create mode 100644 angelslim/compressor/sparsity/vecattention/vecattention_configuration.py create mode 100644 tools/run_vecattention.py diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 00000000..adb2692e --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "angelslim/compressor/sparsity/vecattention/ops/vllm-flash-attention"] + path = angelslim/compressor/sparsity/vecattention/ops/vllm-flash-attention + url = git@github.com:anminliu/vllm-flash-attention.git diff --git a/README.md b/README.md index 7d6d7a43..de3c3f1e 100644 --- a/README.md +++ b/README.md @@ -135,6 +135,12 @@ A more accessible, comprehensive, and efficient toolkit for large model compress