From d39f930d5262ecac88c8d7be9154d9a2ca627861 Mon Sep 17 00:00:00 2001 From: shw Date: Wed, 5 Feb 2025 22:27:13 +0800 Subject: [PATCH 1/4] add partial compile --- benchmarks/transformer.py | 2 ++ torchacc/accelerate.py | 27 +++++++++++++++++++++++++++ 2 files changed, 29 insertions(+) diff --git a/benchmarks/transformer.py b/benchmarks/transformer.py index 670a2db..9097646 100644 --- a/benchmarks/transformer.py +++ b/benchmarks/transformer.py @@ -53,6 +53,7 @@ def _parse_args(): parser.add_argument('--acc', action='store_true', default=False) parser.add_argument('--backend', type=str, default='lazy') parser.add_argument('--hybrid_trace', action='store_true', default=False) + parser.add_argument('--partial_compile', action='store_true', default=False) parser.add_argument('--fp16', action='store_true', default=False) parser.add_argument('--bf16', action='store_true', default=False) parser.add_argument('--gc', action='store_true', default=False) @@ -85,6 +86,7 @@ def _get_config(args): config = ta.Config() config.backend.mode = args.backend config.backend.hybrid_trace = args.hybrid_trace + config.backend.partial_compile = args.partial_compile config.compute.fp16 = args.fp16 config.compute.bf16 = args.bf16 diff --git a/torchacc/accelerate.py b/torchacc/accelerate.py index e9f28bc..c6c350f 100644 --- a/torchacc/accelerate.py +++ b/torchacc/accelerate.py @@ -186,8 +186,35 @@ def accelerate( model = torch.compile(model, backend="hybridtrace") if config.backend.partial_compile: + try: + import torch_xla._dynamo.config as config + import torch_xla._dynamo.dynamo_bridge as dynamo_bridge + except ImportError as e: + raise ImportError( + "Please follow the instruction in https://torchacc.readthedocs.io/en/stable/install.html to install torch_xla" + ) from e + # TODO: maybe we should move the config to dynamo_bridge? 
+ config.use_call_computation = False + config.skip_input_data_check = False + config.outside_on_cuda = True + config.mark_step_after_layer_if_early_sync = False + config.no_xla_graph_sync = True + torch.utils.deterministic.fill_uninitialized_memory = False # disable initialization for torch.empty() + torch._dynamo.disallow_in_graph( torch.nn.functional.scaled_dot_product_attention) model = torch.compile(model, backend="openxla") + # TODO: currently we can't set xla stream as cuda stream because xla + # can't receive an external cuda stream(int) and transfer it to + # se::stream. + # set cuda stream as xla stream; + cuda_device = dist.get_rank() % torch.cuda.device_count( + ) if dist.is_initialized() else 0 + import torch_xla + stream = torch_xla._XLAC._get_stream_for_cuda_device(cuda_device) + stream = 1 if stream == 0 else stream + assert stream is None or type(stream) is int + external_stream = torch.cuda.ExternalStream(stream) + torch.cuda.set_stream(external_stream) return (model, dataloader) if dataloader else model From 8c642797534a3e51ea986197723710c61eb8ac9d Mon Sep 17 00:00:00 2001 From: shw Date: Wed, 12 Feb 2025 15:17:23 +0800 Subject: [PATCH 2/4] add partial_compile option to benchmark --- benchmarks/benchmark.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/benchmarks/benchmark.sh b/benchmarks/benchmark.sh index d945598..1acde56 100644 --- a/benchmarks/benchmark.sh +++ b/benchmarks/benchmark.sh @@ -29,6 +29,7 @@ FSDP_SIZE=4 declare -A BACKAND_PARAMS=( ["torchacc"]="--backend lazy" ["hybridtrace"]="--backend lazy --hybrid_trace" + ["partialcompile"]="--backend eager --partial_compile" ["cuda"]="--backend eager" ) @@ -40,6 +41,9 @@ function run_benchmark() { if [ "$backend" == "hybridtrace" ]; then export USE_TORCH_XLA=1 export TORCHACC_PATCH_FA=0 + elif [ "$backend" == "partialcompile" ]; then + export USE_TORCH_XLA=1 + export TORCHACC_PATCH_FA=0 elif [ "$backend" == "torchacc" ]; then export USE_TORCH_XLA=1 export TORCHACC_PATCH_FA=1 @@ -75,6 
+79,7 @@ function run_benchmark() { for MODEL in "${MODELS[@]}"; do run_benchmark "$MODEL" "hybridtrace" ${FSDP_SIZE} + run_benchmark "$MODEL" "partialcompile" ${FSDP_SIZE} run_benchmark "$MODEL" "torchacc" ${FSDP_SIZE} run_benchmark "$MODEL" "cuda" ${FSDP_SIZE} done From e5576d7e3273db1ac4117347c223dddfdd6c63d0 Mon Sep 17 00:00:00 2001 From: shw Date: Thu, 13 Feb 2025 10:29:31 +0800 Subject: [PATCH 3/4] add docs for hybridtracing --- docs/source/hybridtracing/hybrid_trace.md | 24 +++++++++++++++++++ docs/source/hybridtracing/partial_compile.md | 25 ++++++++++++++++++++ docs/source/index.rst | 4 +++- 3 files changed, 52 insertions(+), 1 deletion(-) create mode 100644 docs/source/hybridtracing/hybrid_trace.md create mode 100644 docs/source/hybridtracing/partial_compile.md diff --git a/docs/source/hybridtracing/hybrid_trace.md b/docs/source/hybridtracing/hybrid_trace.md new file mode 100644 index 0000000..36a89c7 --- /dev/null +++ b/docs/source/hybridtracing/hybrid_trace.md @@ -0,0 +1,24 @@ +# Hybrid Trace +## Introduction +The Hybrid Trace approach addresses performance degradation issues that arise when XLA encounters tensor evaluation. In this solution, we combine the graph capture capabilities of Dynamo and Lazy Tensor Core (LTC). The model runs entirely on the XLA device, following LTC's execution logic, while locally employing Dynamo to reduce tracing overhead. This strategy retains the potential for full-graph optimization with XLA. + +Note: Hybrid Trace runs on torchacc lazy backend(xla device). +## How to use +It can be specified within torchacc.accelerate by setting the config. +```Python +import torchacc as ta + +config = ta.Config() +config.backend.mode = 'lazy' +config.backend.hybrid_trace = True + +... + +ta.accelerate(model, config=config) +``` + +## Sceneries +Below are the sceneries we suggest to use hybrid trace: +- language model with tensor evaluations like qwen and llama. 
+ +## Performance diff --git a/docs/source/hybridtracing/partial_compile.md b/docs/source/hybridtracing/partial_compile.md new file mode 100644 index 0000000..166f697 --- /dev/null +++ b/docs/source/hybridtracing/partial_compile.md @@ -0,0 +1,25 @@ +# Partial Compile +## Introduction +Partial_compile approach in TorchAcc can be employed to achieve performance acceleration over native CUDA execution In scenarios involving complex user code (e.g., extensive tensor evaluations, custom operations, etc.) which is hard for xla to capture whole graph. Specifically, we utilize Dynamo + XLA backend for partial compilation, with enhancements and optimizations in both functionality and performance. + +Note: Partial compile runs on TorchAcc eager backend(cuda device). +## How to use +It can be specified within torchacc.accelerate by setting the config. +```Python +import torchacc as ta + +config = ta.Config() +config.backend.mode = 'eager' +config.backend.partial_compile = True + +... + +ta.accelerate(model, config=config) +``` + +## Sceneries +Below are the sceneries we suggest to use partial compile: +- model with custom ops which xla do not support. +- model with extensive tensor evaluations. + +## Performance diff --git a/docs/source/index.rst b/docs/source/index.rst index 6a65b10..c427b51 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -26,7 +26,9 @@ Welcome to TorchAcc's documentation! dist/dp dist/fsdp bucketing - + hybridtracing/hybrid_trace + hybridtracing/partial_compile + .. 
toctree:: :maxdepth: 2 :caption: Tutorials From 030397ed0faa6a63fa4d88b232c33135dca092a5 Mon Sep 17 00:00:00 2001 From: shw Date: Thu, 13 Feb 2025 11:31:32 +0800 Subject: [PATCH 4/4] modify docs --- docs/source/hybridtracing/hybrid_trace.md | 6 ++---- docs/source/hybridtracing/partial_compile.md | 6 ++---- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/docs/source/hybridtracing/hybrid_trace.md b/docs/source/hybridtracing/hybrid_trace.md index 36a89c7..09cee09 100644 --- a/docs/source/hybridtracing/hybrid_trace.md +++ b/docs/source/hybridtracing/hybrid_trace.md @@ -1,6 +1,6 @@ # Hybrid Trace ## Introduction -The Hybrid Trace approach addresses performance degradation issues that arise when XLA encounters tensor evaluation. In this solution, we combine the graph capture capabilities of Dynamo and Lazy Tensor Core (LTC). The model runs entirely on the XLA device, following LTC's execution logic, while locally employing Dynamo to reduce tracing overhead. This strategy retains the potential for full-graph optimization with XLA. +Hybrid Trace mainly addresses performance degradation issues that arise when XLA encounters tensor evaluation. In this solution, we combine the graph capture capabilities of Dynamo and Lazy Tensor Core (LTC). The model runs on the XLA device, following LTC's execution logic, while locally employing Dynamo to reduce tracing overhead. This strategy retains the potential for full-graph optimization with XLA. Note: Hybrid Trace runs on torchacc lazy backend(xla device). ## How to use @@ -19,6 +19,4 @@ ta.accelerate(model, config=config) ## Sceneries Below are the sceneries we suggest to use hybrid trace: -- language model with tensor evaluations like qwen and llama. - -## Performance +- language model with tensor evaluations like qwen and llama. 
\ No newline at end of file diff --git a/docs/source/hybridtracing/partial_compile.md b/docs/source/hybridtracing/partial_compile.md index 166f697..0e9aae2 100644 --- a/docs/source/hybridtracing/partial_compile.md +++ b/docs/source/hybridtracing/partial_compile.md @@ -1,6 +1,6 @@ # Partial Compile ## Introduction -Partial_compile approach in TorchAcc can be employed to achieve performance acceleration over native CUDA execution In scenarios involving complex user code (e.g., extensive tensor evaluations, custom operations, etc.) which is hard for xla to capture whole graph. Specifically, we utilize Dynamo + XLA backend for partial compilation, with enhancements and optimizations in both functionality and performance. +Partial compile in TorchAcc can be employed to achieve performance acceleration over native PyTorch CUDA execution in scenarios involving complex user code (e.g., extensive tensor evaluations, custom operations, etc.) where it is hard for XLA to capture the whole graph. Specifically, we use the Dynamo + XLA backend for partial compilation, with enhancements and optimizations in both functionality and performance. Note: Partial compile runs on TorchAcc eager backend(cuda device). ## How to use @@ -20,6 +20,4 @@ ta.accelerate(model, config=config) ## Sceneries Below are the sceneries we suggest to use partial compile: - model with custom ops which xla do not support. -- model with extensive tensor evaluations. - -## Performance +- model with extensive tensor evaluations. \ No newline at end of file