2026-04-27-the-evolution-of-flashattention #131

@@ -0,0 +1,360 @@
@article{dao2022flashattention,
  title={FlashAttention: Fast and Memory-Efficient Exact Attention with IO-Awareness},
  author={Dao, Tri and Fu, Daniel Y. and Ermon, Stefano and Rudra, Atri and R{\'e}, Christopher},
  journal={arXiv preprint arXiv:2205.14135},
  year={2022}
}

@misc{shazeer2019fasttransformerdecodingwritehead,
  title={Fast Transformer Decoding: One Write-Head is All You Need},
  author={Noam Shazeer},
  year={2019},
  eprint={1911.02150},
  archivePrefix={arXiv},
  primaryClass={cs.NE},
  url={https://arxiv.org/abs/1911.02150},
}

@misc{bahdanau2016neuralmachinetranslationjointly,
  title={Neural Machine Translation by Jointly Learning to Align and Translate},
  author={Dzmitry Bahdanau and Kyunghyun Cho and Yoshua Bengio},
  year={2016},
  eprint={1409.0473},
  archivePrefix={arXiv},
  primaryClass={cs.CL},
  url={https://arxiv.org/abs/1409.0473},
}

@article{sun2025efficient,
  title={Efficient attention mechanisms for large language models: A survey},
  author={Sun, Yutao and Li, Zhenyu and Zhang, Yike and Pan, Tengyu and Dong, Bowen and Guo, Yuyi and Wang, Jianyong},
  journal={arXiv preprint arXiv:2507.19595},
  year={2025}
}

@misc{keles2022computationalcomplexityselfattention,
  title={On The Computational Complexity of Self-Attention},
  author={Feyza Duman Keles and Pruthuvi Mahesakya Wijewardena and Chinmay Hegde},
  year={2022},
  eprint={2209.04881},
  archivePrefix={arXiv},
  primaryClass={cs.LG},
  url={https://arxiv.org/abs/2209.04881},
}

@article{gholami2024ai,
  title={AI and memory wall},
  author={Gholami, Amir and Yao, Zhewei and Kim, Sehoon and Hooper, Coleman and Mahoney, Michael W and Keutzer, Kurt},
  journal={IEEE Micro},
  volume={44},
  number={3},
  pages={33--39},
  year={2024},
  publisher={IEEE}
}

@article{vaswani2023attentionneed,
  title={Attention Is All You Need},
  author={Ashish Vaswani and Noam Shazeer and Niki Parmar and Jakob Uszkoreit and Llion Jones and Aidan N. Gomez and Lukasz Kaiser and Illia Polosukhin},
  journal={arXiv preprint arXiv:1706.03762},
  year={2017},
  url={https://arxiv.org/abs/1706.03762}
}

@techreport{nvidia2022h100,
  title={NVIDIA H100 Tensor Core GPU Architecture},
  author={{NVIDIA Corporation}},
  year={2022},
  institution={NVIDIA},
  url={https://resources.nvidia.com/en-us-tensor-core/nvidia-tensor-core-gpu-architecture-whitepaper},
  note={Whitepaper}
}

@techreport{nvidia2020a100,
  title={NVIDIA A100 Tensor Core GPU Architecture},
  author={NVIDIA},
  year={2020},
  institution={NVIDIA},
  url={https://www.nvidia.com/content/dam/en-zz/Solutions/Data-Center/nvidia-ampere-architecture-whitepaper.pdf},
  note={Whitepaper}
}

@misc{hazyresearch2024brrr,
  title={GPUs Go Brrr},
  author={{Hazy Research}},
  year={2024},
  howpublished={Stanford Hazy Research Blog},
  url={https://hazyresearch.stanford.edu/blog/2024-05-12-tk},
  note={Accessed: 2025-11-29}
}

@misc{wang2020linformerselfattentionlinearcomplexity,
  title={Linformer: Self-Attention with Linear Complexity},
  author={Sinong Wang and Belinda Z. Li and Madian Khabsa and Han Fang and Hao Ma},
  year={2020},
  eprint={2006.04768},
  archivePrefix={arXiv},
  primaryClass={cs.LG},
  url={https://arxiv.org/abs/2006.04768},
}

@misc{choromanski2022rethinkingattentionperformers,
  title={Rethinking Attention with Performers},
  author={Krzysztof Choromanski and Valerii Likhosherstov and David Dohan and Xingyou Song and Andreea Gane and Tamas Sarlos and Peter Hawkins and Jared Davis and Afroz Mohiuddin and Lukasz Kaiser and David Belanger and Lucy Colwell and Adrian Weller},
  year={2022},
  eprint={2009.14794},
  archivePrefix={arXiv},
  primaryClass={cs.LG},
  url={https://arxiv.org/abs/2009.14794},
}

@article{flashattention3,
  title={FlashAttention-3: Fast and Accurate Attention with Asynchrony and Low-precision},
  author={Shah, Jay and Bikshandi, Ganesh and Zhang, Ying and Thakkar, Vijay and Ramani, Pradeep and Dao, Tri},
  journal={arXiv preprint arXiv:2407.08608},
  year={2024}
}

@article{kwon2023efficientmemorymanagementlarge,
  title={Efficient Memory Management for Large Language Model Serving with PagedAttention},
  author={Kwon, Woosuk and Li, Zhuohan and Zhuang, Siyuan and Sheng, Ying and Zheng, Lianmin and Yu, Cody Hao and Gonzalez, Joseph E. and Zhang, Hao and Stoica, Ion},
  journal={arXiv preprint arXiv:2309.06180},
  year={2023}
}

@article{juravsky2024hydragenhighthroughputllminference,
  title={Hydragen: High-Throughput LLM Inference with Shared Prefixes},
  author={Juravsky, Jordan and others},
  journal={arXiv preprint arXiv:2402.05099},
  year={2024}
}

@misc{kitaev2020reformerefficienttransformer,
  title={Reformer: The Efficient Transformer},
  author={Nikita Kitaev and Łukasz Kaiser and Anselm Levskaya},
  year={2020},
  eprint={2001.04451},
  archivePrefix={arXiv},
  primaryClass={cs.LG},
  url={https://arxiv.org/abs/2001.04451},
}

@article{milakov2018online,
  title={Online normalizer calculation for softmax},
  author={Milakov, Maxim and Gimelshein, Natalia},
  journal={arXiv preprint arXiv:1805.02867},
  year={2018},
  url={https://arxiv.org/abs/1805.02867}
}

@misc{child2019generatinglongsequencessparse,
  title={Generating Long Sequences with Sparse Transformers},
  author={Rewon Child and Scott Gray and Alec Radford and Ilya Sutskever},
  year={2019},
  eprint={1904.10509},
  archivePrefix={arXiv},
  primaryClass={cs.LG},
  url={https://arxiv.org/abs/1904.10509},
}

@article{dao2023flashattention2,
  title={FlashAttention-2: Faster Attention with Better Parallelism and Work Partitioning},
  author={Dao, Tri},
  journal={arXiv preprint arXiv:2307.08691},
  year={2023}
}

@article{shah2024flashattention3,
  title={FlashAttention-3: Fast and Accurate Attention with Asynchrony and Low-precision},
  author={Shah, Jay and Bikshandi, Ganesh and Zhang, Ying and Thakkar, Vijay and Ramani, Pradeep and Dao, Tri},
  journal={arXiv preprint arXiv:2407.08608},
  year={2024}
}

@misc{kwon2023efficientmemorymanagementlarge,
  title={Efficient Memory Management for Large Language Model Serving with PagedAttention},
  author={Woosuk Kwon and Zhuohan Li and Siyuan Zhuang and Ying Sheng and Lianmin Zheng and Cody Hao Yu and Joseph E. Gonzalez and Hao Zhang and Ion Stoica},
  year={2023},
  eprint={2309.06180},
  archivePrefix={arXiv},
  primaryClass={cs.LG},
  url={https://arxiv.org/abs/2309.06180},
}

@misc{juravsky2024hydragenhighthroughputllminference,
  title={Hydragen: High-Throughput LLM Inference with Shared Prefixes},
  author={Jordan Juravsky and Bradley Brown and Ryan Ehrlich and Daniel Y. Fu and Christopher R{\'e} and Azalia Mirhoseini},
  year={2024},
  eprint={2402.05099},
  archivePrefix={arXiv},
  primaryClass={cs.LG},
  url={https://arxiv.org/abs/2402.05099},
}

Comment on lines +198 to +206 (see the review comment below the diff).

@inproceedings{dukhan2020two,
  title={Two-pass softmax algorithm},
  author={Dukhan, Marat and Ablavatski, Artsiom},
  booktitle={2020 IEEE International Parallel and Distributed Processing Symposium Workshops (IPDPSW)},
  pages={386--395},
  year={2020},
  organization={IEEE}
}

@inproceedings{demaine2018red,
  title={Red-blue pebble game: Complexity of computing the trade-off between cache size and memory transfers},
  author={Demaine, Erik D and Liu, Quanquan C},
  booktitle={Proceedings of the 30th on Symposium on Parallelism in Algorithms and Architectures},
  pages={195--204},
  year={2018}
}

@article{aggarwal1988io,
  title={The input/output complexity of sorting and related problems},
  author={Aggarwal, Alok and Vitter, Jeffrey Scott},
  journal={Communications of the ACM},
  volume={31},
  number={9},
  pages={1116--1127},
  year={1988},
  publisher={ACM}
}

@article{williams2009roofline,
  title={Roofline: An insightful visual performance model for multicore architectures},
  author={Williams, Samuel and Waterman, Andrew and Patterson, David},
  journal={Communications of the ACM},
  volume={52},
  number={4},
  pages={65--76},
  year={2009},
  publisher={ACM}
}

@online{cutlass-tma,
  title={CUTLASS Tutorial: Mastering the NVIDIA® Tensor Memory Accelerator (TMA)},
  author={Colfax Research},
  year={2024},
  url={https://research.colfax-intl.com/tutorial-hopper-tma/},
  note={Accessed 2025-12-07}
}

@online{modal-tma,
  title={Tensor Memory Accelerator (TMA) - GPU Glossary},
  author={Modal Labs},
  year={2024},
  url={https://modal.com/gpu-glossary/device-hardware/tensor-memory-accelerator},
  note={Accessed 2025-12-07}
}

@online{nvidia-hopper-depth,
  title={NVIDIA Hopper Architecture In-Depth},
  author={NVIDIA},
  year={2022},
  url={https://developer.nvidia.com/blog/nvidia-hopper-architecture-in-depth/},
  note={Accessed 2025-12-07}
}

@online{nvidia-blackwell-brief,
  title={NVIDIA Blackwell Architecture Technical Brief},
  author={NVIDIA},
  year={2024},
  url={https://resources.nvidia.com/en-us-blackwell-architecture?ncid=no-ncid},
  note={Accessed 2025-12-07}
}

@inproceedings{NEURIPS2024_7ede97c3,
  author={Shah, Jay and Bikshandi, Ganesh and Zhang, Ying and Thakkar, Vijay and Ramani, Pradeep and Dao, Tri},
  booktitle={Advances in Neural Information Processing Systems},
  doi={10.52202/079017-2193},
  editor={A. Globerson and L. Mackey and D. Belgrave and A. Fan and U. Paquet and J. Tomczak and C. Zhang},
  pages={68658--68685},
  publisher={Curran Associates, Inc.},
  title={FlashAttention-3: Fast and Accurate Attention with Asynchrony and Low-precision},
  url={https://proceedings.neurips.cc/paper_files/paper/2024/file/7ede97c3e082c6df10a8d6103a2eebd2-Paper-Conference.pdf},
  volume={37},
  year={2024}
}

@misc{spector2024thunderkittenssimplefastadorable,
  title={ThunderKittens: Simple, Fast, and Adorable AI Kernels},
  author={Benjamin F. Spector and Simran Arora and Aaryan Singhal and Daniel Y. Fu and Christopher Ré},
  year={2024},
  eprint={2410.20399},
  archivePrefix={arXiv},
  primaryClass={cs.LG},
  url={https://arxiv.org/abs/2410.20399},
}

@online{amd-iris,
  title={Iris: AMD ROCm Attention Kernels},
  author={{AMD}},
  year={2025},
  url={https://github.com/ROCm/iris},
  note={GitHub repository, accessed 2025-12-07}
}

@article{zadouri2026flashattention4,
  title={FlashAttention-4: Algorithm and Kernel Pipelining Co-Design for Asymmetric Hardware Scaling},
  author={Zadouri, Ted and Hoehnerbach, Markus and Shah, Jay and Liu, Timmy and Thakkar, Vijay and Dao, Tri},
  journal={arXiv preprint arXiv:2603.05451},
  year={2026}
}

@article{liu2023ringattention,
  title={Ring Attention with Blockwise Transformers for Near-Infinite Context},
  author={Liu, Hao and Zaharia, Matei and Abbeel, Pieter},
  journal={arXiv preprint arXiv:2310.01889},
  year={2023}
}

@article{brandon2023striped,
  title={Striped Attention: Faster Ring Attention for Causal Transformers},
  author={Brandon, William and Nrusimha, Aniruddha and Qian, Kevin and Ankner, Zachary and Jin, Tian and Song, Zhiye and Ragan-Kelley, Jonathan},
  journal={arXiv preprint arXiv:2311.09431},
  year={2023}
}

@article{he2024flexattention,
  title={Flex Attention: A Programming Model for Generating Optimized Attention Kernels},
  author={Dong, Juechu and Feng, Boyuan and Guessous, Driss and Liang, Yanbo and He, Horace},
  journal={arXiv preprint arXiv:2412.05496},
  year={2024}
}

@inproceedings{kwon2023pagedattention,
  title={Efficient Memory Management for Large Language Model Serving with PagedAttention},
  author={Kwon, Woosuk and Li, Zhuohan and Zhuang, Siyuan and Sheng, Ying and Zheng, Lianmin and Yu, Cody Hao and Gonzalez, Joseph E. and Zhang, Hao and Stoica, Ion},
  booktitle={Proceedings of the 29th Symposium on Operating Systems Principles (SOSP '23)},
  year={2023}
}

@inproceedings{tillet2019triton,
  title={Triton: an intermediate language and compiler for tiled neural network computations},
  author={Tillet, Philippe and Kung, H. T. and Cox, David},
  booktitle={Proceedings of the 3rd ACM SIGPLAN International Workshop on Machine Learning and Programming Languages},
  pages={10--19},
  year={2019}
}

Duplicate BibTeX key
`kwon2023efficientmemorymanagementlarge` is defined earlier in this file (as an `@article` at lines 168–173) and then redefined here as an `@misc`. Duplicate keys typically cause BibTeX/Jekyll-Scholar parsing errors or nondeterministic citation resolution; keep a single entry per key (or rename one of them).
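
As an illustration only (not necessarily the fix the PR author applied), one way to resolve the collision is to keep a single entry under the shared key, for example the fuller arXiv-style `@misc` form, and delete the earlier `@article` block, so every `\cite{kwon2023efficientmemorymanagementlarge}` resolves to exactly one entry:

% Hypothetical consolidation sketch: the one surviving entry for PagedAttention.
% The earlier @article{kwon2023efficientmemorymanagementlarge, ...} block would be
% removed so the key is defined exactly once in the .bib file.
@misc{kwon2023efficientmemorymanagementlarge,
  title={Efficient Memory Management for Large Language Model Serving with PagedAttention},
  author={Woosuk Kwon and Zhuohan Li and Siyuan Zhuang and Ying Sheng and Lianmin Zheng and Cody Hao Yu and Joseph E. Gonzalez and Hao Zhang and Ion Stoica},
  year={2023},
  eprint={2309.06180},
  archivePrefix={arXiv},
  primaryClass={cs.LG},
  url={https://arxiv.org/abs/2309.06180},
}

Alternatively, as the reviewer notes, renaming one of the two entries (e.g. to a hypothetical `kwon2023pagedattention_arxiv`) also removes the collision, at the cost of keeping two near-identical entries in sync.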