From e4df4da6c98646da33eb7828f03901c9c437a3a5 Mon Sep 17 00:00:00 2001 From: Matthew Grossman Date: Wed, 10 Jun 2026 09:53:39 -0700 Subject: [PATCH 1/4] fix(tests): Mock HF in integration tests Signed-off-by: Matthew Grossman --- .../test_parallelism_hf_model_config.py | 3 +-- .../parallelism/test_recent_models.py | 3 +-- .../tests/integration/test_models_with_auth.py | 18 +++++++++++++++++- .../tests/parallelism/nemo_validation_data.py | 3 +-- 4 files changed, 20 insertions(+), 7 deletions(-) diff --git a/services/core/models/tests/integration/parallelism/test_parallelism_hf_model_config.py b/services/core/models/tests/integration/parallelism/test_parallelism_hf_model_config.py index 304974dd35..967e9f49d4 100644 --- a/services/core/models/tests/integration/parallelism/test_parallelism_hf_model_config.py +++ b/services/core/models/tests/integration/parallelism/test_parallelism_hf_model_config.py @@ -9,9 +9,8 @@ from nmp.core.models.parallelism.api import infer_model_cfg_from_hf -# TODO: Mock HuggingFace API calls instead of accessing real gated models REQUIRES_HF_TOKEN = pytest.mark.skip( - reason="Gated HuggingFace models require mocking (not yet implemented)", + reason="Gated HuggingFace models require authentication (no fixture available)", ) GATED_MODEL_IDS = frozenset({"meta-llama/Llama-3.1-8B"}) diff --git a/services/core/models/tests/integration/parallelism/test_recent_models.py b/services/core/models/tests/integration/parallelism/test_recent_models.py index 5b32c86d19..c405d11a3e 100644 --- a/services/core/models/tests/integration/parallelism/test_recent_models.py +++ b/services/core/models/tests/integration/parallelism/test_recent_models.py @@ -14,9 +14,8 @@ from nmp.core.models.parallelism.api import estimate_parallelization, find_minimum_gpus -# TODO: Mock HuggingFace API calls instead of accessing real gated models REQUIRES_HF_TOKEN = pytest.mark.skip( - reason="Gated HuggingFace models require mocking (not yet implemented)", + reason="Gated 
@pytest.fixture(autouse=True)
def _mock_hf_storage(self):
    """Replace the HuggingFace client for every test in this class.

    The tests here check authorization behaviour rather than HF
    connectivity, so ``HfApi`` is swapped for a stub during fileset
    creation. This keeps CI from failing on HuggingFace rate limits.
    """
    # An empty ``siblings`` list skips the file metadata check.
    fake_repo_info = Mock(sha="abc123mocked", siblings=[])
    fake_api = Mock()
    fake_api.repo_info.return_value = fake_repo_info

    with patch("nmp.core.files.app.backends.huggingface.HfApi") as hf_api_cls:
        # Every ``HfApi(...)`` construction now hands back the stub client.
        hf_api_cls.return_value = fake_api
        yield
.../deepseek-llm-7b-base/config.json | 25 + .../parallelism/fixtures/gpt2/config.json | 31 + .../parallelism/fixtures/manifest.json | 69 + .../fixtures/microsoft/phi-2/config.json | 30 + .../fixtures/microsoft/phi-4/config.json | 32 + .../mistralai/Devstral-Small-2505/config.json | 26 + .../mistralai/Mistral-7B-v0.1/config.json | 24 + .../mistralai/Mixtral-8x7B-v0.1/config.json | 29 + .../config.json | 28 + .../NVIDIA-Nemotron-Nano-9B-v2/config.json | 56 + .../configuration_nemotron_h.py | 245 +++ .../modeling_nemotron_h.py | 1643 +++++++++++++++ .../model_config.yaml | 261 +++ .../fixtures/openai/gpt-oss-120b/config.json | 88 + .../fixtures/openai/gpt-oss-20b/config.json | 76 + 26 files changed, 5009 insertions(+) create mode 100644 services/core/models/tests/integration/parallelism/fixtures/EleutherAI/gpt-j-6b/config.json create mode 100644 services/core/models/tests/integration/parallelism/fixtures/EleutherAI/gpt-neox-20b/config.json create mode 100644 services/core/models/tests/integration/parallelism/fixtures/Qwen/Qwen2.5-72B-Instruct/config.json create mode 100644 services/core/models/tests/integration/parallelism/fixtures/Qwen/Qwen2.5-72B/config.json create mode 100644 services/core/models/tests/integration/parallelism/fixtures/Qwen/Qwen2.5-7B/config.json create mode 100644 services/core/models/tests/integration/parallelism/fixtures/Qwen/Qwen3-4B-SafeRL/config.json create mode 100644 services/core/models/tests/integration/parallelism/fixtures/Qwen/Qwen3-8B/config.json create mode 100644 services/core/models/tests/integration/parallelism/fixtures/deepseek-ai/DeepSeek-V3-Base/config.json create mode 100644 services/core/models/tests/integration/parallelism/fixtures/deepseek-ai/DeepSeek-V3-Base/configuration_deepseek.py create mode 100644 services/core/models/tests/integration/parallelism/fixtures/deepseek-ai/DeepSeek-V3-Base/modeling_deepseek.py create mode 100644 
services/core/models/tests/integration/parallelism/fixtures/deepseek-ai/deepseek-llm-67b-base/config.json create mode 100644 services/core/models/tests/integration/parallelism/fixtures/deepseek-ai/deepseek-llm-7b-base/config.json create mode 100644 services/core/models/tests/integration/parallelism/fixtures/gpt2/config.json create mode 100644 services/core/models/tests/integration/parallelism/fixtures/manifest.json create mode 100644 services/core/models/tests/integration/parallelism/fixtures/microsoft/phi-2/config.json create mode 100644 services/core/models/tests/integration/parallelism/fixtures/microsoft/phi-4/config.json create mode 100644 services/core/models/tests/integration/parallelism/fixtures/mistralai/Devstral-Small-2505/config.json create mode 100644 services/core/models/tests/integration/parallelism/fixtures/mistralai/Mistral-7B-v0.1/config.json create mode 100644 services/core/models/tests/integration/parallelism/fixtures/mistralai/Mixtral-8x7B-v0.1/config.json create mode 100644 services/core/models/tests/integration/parallelism/fixtures/nvidia/Mistral-NeMo-Minitron-8B-Instruct/config.json create mode 100644 services/core/models/tests/integration/parallelism/fixtures/nvidia/NVIDIA-Nemotron-Nano-9B-v2/config.json create mode 100644 services/core/models/tests/integration/parallelism/fixtures/nvidia/NVIDIA-Nemotron-Nano-9B-v2/configuration_nemotron_h.py create mode 100644 services/core/models/tests/integration/parallelism/fixtures/nvidia/NVIDIA-Nemotron-Nano-9B-v2/modeling_nemotron_h.py create mode 100644 services/core/models/tests/integration/parallelism/fixtures/nvidia/nemotron-4-340b-instruct/model_config.yaml create mode 100644 services/core/models/tests/integration/parallelism/fixtures/openai/gpt-oss-120b/config.json create mode 100644 services/core/models/tests/integration/parallelism/fixtures/openai/gpt-oss-20b/config.json diff --git a/services/core/models/tests/integration/parallelism/fixtures/EleutherAI/gpt-j-6b/config.json 
b/services/core/models/tests/integration/parallelism/fixtures/EleutherAI/gpt-j-6b/config.json new file mode 100644 index 0000000000..614ae4f4e0 --- /dev/null +++ b/services/core/models/tests/integration/parallelism/fixtures/EleutherAI/gpt-j-6b/config.json @@ -0,0 +1,40 @@ +{ + "activation_function": "gelu_new", + "architectures": [ + "GPTJForCausalLM" + ], + "attn_pdrop": 0.0, + "bos_token_id": 50256, + "embd_pdrop": 0.0, + "eos_token_id": 50256, + "gradient_checkpointing": false, + "initializer_range": 0.02, + "layer_norm_epsilon": 1e-05, + "model_type": "gptj", + "n_embd": 4096, + "n_head": 16, + "n_inner": null, + "n_layer": 28, + "n_positions": 2048, + "resid_pdrop": 0.0, + "rotary": true, + "rotary_dim": 64, + "scale_attn_weights": true, + "summary_activation": null, + "summary_first_dropout": 0.1, + "summary_proj_to_labels": true, + "summary_type": "cls_index", + "summary_use_proj": true, + "task_specific_params": { + "text-generation": { + "do_sample": true, + "max_length": 50, + "temperature": 1.0 + } + }, + "tie_word_embeddings": false, + "tokenizer_class": "GPT2Tokenizer", + "transformers_version": "4.18.0.dev0", + "use_cache": true, + "vocab_size": 50400 +} diff --git a/services/core/models/tests/integration/parallelism/fixtures/EleutherAI/gpt-neox-20b/config.json b/services/core/models/tests/integration/parallelism/fixtures/EleutherAI/gpt-neox-20b/config.json new file mode 100644 index 0000000000..54d3633ef9 --- /dev/null +++ b/services/core/models/tests/integration/parallelism/fixtures/EleutherAI/gpt-neox-20b/config.json @@ -0,0 +1,25 @@ +{ + "architectures": [ + "GPTNeoXForCausalLM" + ], + "attention_probs_dropout_prob": 0, + "bos_token_id": 0, + "eos_token_id": 0, + "hidden_act": "gelu_fast", + "hidden_dropout_prob": 0, + "hidden_size": 6144, + "initializer_range": 0.02, + "intermediate_size": 24576, + "layer_norm_eps": 1e-05, + "max_position_embeddings": 2048, + "model_type": "gpt_neox", + "num_attention_heads": 64, + "num_hidden_layers": 44, + 
"rotary_emb_base": 10000, + "rotary_pct": 0.25, + "tie_word_embeddings": false, + "torch_dtype": "float16", + "transformers_version": "4.19.0.dev0", + "use_cache": true, + "vocab_size": 50432 +} diff --git a/services/core/models/tests/integration/parallelism/fixtures/Qwen/Qwen2.5-72B-Instruct/config.json b/services/core/models/tests/integration/parallelism/fixtures/Qwen/Qwen2.5-72B-Instruct/config.json new file mode 100644 index 0000000000..ec6ea340e5 --- /dev/null +++ b/services/core/models/tests/integration/parallelism/fixtures/Qwen/Qwen2.5-72B-Instruct/config.json @@ -0,0 +1,27 @@ +{ + "architectures": [ + "Qwen2ForCausalLM" + ], + "attention_dropout": 0.0, + "bos_token_id": 151643, + "eos_token_id": 151645, + "hidden_act": "silu", + "hidden_size": 8192, + "initializer_range": 0.02, + "intermediate_size": 29568, + "max_position_embeddings": 32768, + "max_window_layers": 70, + "model_type": "qwen2", + "num_attention_heads": 64, + "num_hidden_layers": 80, + "num_key_value_heads": 8, + "rms_norm_eps": 1e-06, + "rope_theta": 1000000.0, + "sliding_window": 131072, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.43.1", + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 152064 +} diff --git a/services/core/models/tests/integration/parallelism/fixtures/Qwen/Qwen2.5-72B/config.json b/services/core/models/tests/integration/parallelism/fixtures/Qwen/Qwen2.5-72B/config.json new file mode 100644 index 0000000000..67663e297b --- /dev/null +++ b/services/core/models/tests/integration/parallelism/fixtures/Qwen/Qwen2.5-72B/config.json @@ -0,0 +1,27 @@ +{ + "architectures": [ + "Qwen2ForCausalLM" + ], + "attention_dropout": 0.0, + "bos_token_id": 151643, + "eos_token_id": 151643, + "hidden_act": "silu", + "hidden_size": 8192, + "initializer_range": 0.02, + "intermediate_size": 29568, + "max_position_embeddings": 131072, + "max_window_layers": 80, + "model_type": "qwen2", + "num_attention_heads": 64, + "num_hidden_layers": 
80, + "num_key_value_heads": 8, + "rms_norm_eps": 1e-05, + "rope_theta": 1000000.0, + "sliding_window": 131072, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.43.1", + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 152064 +} diff --git a/services/core/models/tests/integration/parallelism/fixtures/Qwen/Qwen2.5-7B/config.json b/services/core/models/tests/integration/parallelism/fixtures/Qwen/Qwen2.5-7B/config.json new file mode 100644 index 0000000000..1a90713f0e --- /dev/null +++ b/services/core/models/tests/integration/parallelism/fixtures/Qwen/Qwen2.5-7B/config.json @@ -0,0 +1,28 @@ +{ + "architectures": [ + "Qwen2ForCausalLM" + ], + "attention_dropout": 0.0, + "bos_token_id": 151643, + "eos_token_id": 151643, + "hidden_act": "silu", + "hidden_size": 3584, + "initializer_range": 0.02, + "intermediate_size": 18944, + "max_position_embeddings": 131072, + "max_window_layers": 28, + "model_type": "qwen2", + "num_attention_heads": 28, + "num_hidden_layers": 28, + "num_key_value_heads": 4, + "rms_norm_eps": 1e-06, + "rope_theta": 1000000.0, + "sliding_window": 131072, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.40.1", + "use_cache": true, + "use_mrope": false, + "use_sliding_window": false, + "vocab_size": 152064 +} diff --git a/services/core/models/tests/integration/parallelism/fixtures/Qwen/Qwen3-4B-SafeRL/config.json b/services/core/models/tests/integration/parallelism/fixtures/Qwen/Qwen3-4B-SafeRL/config.json new file mode 100644 index 0000000000..e49eccdc32 --- /dev/null +++ b/services/core/models/tests/integration/parallelism/fixtures/Qwen/Qwen3-4B-SafeRL/config.json @@ -0,0 +1,30 @@ +{ + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 151643, + "eos_token_id": 151645, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 2560, + "initializer_range": 0.02, + "intermediate_size": 
9728, + "max_position_embeddings": 40960, + "max_window_layers": 36, + "model_type": "qwen3", + "num_attention_heads": 32, + "num_hidden_layers": 36, + "num_key_value_heads": 8, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000, + "sliding_window": null, + "tie_word_embeddings": true, + "torch_dtype": "bfloat16", + "transformers_version": "4.51.0", + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151936 +} \ No newline at end of file diff --git a/services/core/models/tests/integration/parallelism/fixtures/Qwen/Qwen3-8B/config.json b/services/core/models/tests/integration/parallelism/fixtures/Qwen/Qwen3-8B/config.json new file mode 100644 index 0000000000..d46195ac87 --- /dev/null +++ b/services/core/models/tests/integration/parallelism/fixtures/Qwen/Qwen3-8B/config.json @@ -0,0 +1,30 @@ +{ + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 151643, + "eos_token_id": 151645, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 12288, + "max_position_embeddings": 40960, + "max_window_layers": 36, + "model_type": "qwen3", + "num_attention_heads": 32, + "num_hidden_layers": 36, + "num_key_value_heads": 8, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000, + "sliding_window": null, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.51.0", + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151936 +} \ No newline at end of file diff --git a/services/core/models/tests/integration/parallelism/fixtures/deepseek-ai/DeepSeek-V3-Base/config.json b/services/core/models/tests/integration/parallelism/fixtures/deepseek-ai/DeepSeek-V3-Base/config.json new file mode 100644 index 0000000000..aec35a75d9 --- /dev/null +++ b/services/core/models/tests/integration/parallelism/fixtures/deepseek-ai/DeepSeek-V3-Base/config.json @@ -0,0 +1,67 
@@ +{ + "architectures": [ + "DeepseekV3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_deepseek.DeepseekV3Config", + "AutoModel": "modeling_deepseek.DeepseekV3Model", + "AutoModelForCausalLM": "modeling_deepseek.DeepseekV3ForCausalLM" + }, + "bos_token_id": 0, + "eos_token_id": 1, + "ep_size": 1, + "first_k_dense_replace": 3, + "hidden_act": "silu", + "hidden_size": 7168, + "initializer_range": 0.02, + "intermediate_size": 18432, + "kv_lora_rank": 512, + "max_position_embeddings": 163840, + "model_type": "deepseek_v3", + "moe_intermediate_size": 2048, + "moe_layer_freq": 1, + "n_group": 8, + "n_routed_experts": 256, + "n_shared_experts": 1, + "norm_topk_prob": true, + "num_attention_heads": 128, + "num_experts_per_tok": 8, + "num_hidden_layers": 61, + "num_key_value_heads": 128, + "num_nextn_predict_layers": 1, + "q_lora_rank": 1536, + "qk_nope_head_dim": 128, + "qk_rope_head_dim": 64, + "quantization_config": { + "activation_scheme": "dynamic", + "fmt": "e4m3", + "quant_method": "fp8", + "weight_block_size": [ + 128, + 128 + ] + }, + "rms_norm_eps": 1e-06, + "rope_scaling": { + "beta_fast": 32, + "beta_slow": 1, + "factor": 40, + "mscale": 1.0, + "mscale_all_dim": 1.0, + "original_max_position_embeddings": 4096, + "type": "yarn" + }, + "rope_theta": 10000, + "routed_scaling_factor": 2.5, + "scoring_func": "sigmoid", + "tie_word_embeddings": false, + "topk_group": 4, + "topk_method": "noaux_tc", + "torch_dtype": "bfloat16", + "transformers_version": "4.33.1", + "use_cache": true, + "v_head_dim": 128, + "vocab_size": 129280 +} diff --git a/services/core/models/tests/integration/parallelism/fixtures/deepseek-ai/DeepSeek-V3-Base/configuration_deepseek.py b/services/core/models/tests/integration/parallelism/fixtures/deepseek-ai/DeepSeek-V3-Base/configuration_deepseek.py new file mode 100644 index 0000000000..f549f2b17d --- /dev/null +++ 
class DeepseekV3Config(PretrainedConfig):
    r"""
    Configuration class for a [`DeepseekV3Model`].

    It is used to instantiate a DeepSeek model according to the specified arguments, defining the
    model architecture. Instantiating a configuration with the defaults will yield a similar
    configuration to that of DeepSeek-V3.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
    outputs. Read the documentation from [`PretrainedConfig`] for more information.

    Every named argument below is stored unchanged as an attribute of the same name. The token
    ids, `tie_word_embeddings` and any extra `**kwargs` are forwarded to
    `PretrainedConfig.__init__`. This class performs no validation of the values.

    Args:
        vocab_size (`int`, *optional*, defaults to 129280):
            Vocabulary size of the model. Defines the number of different tokens that can be
            represented by the `inputs_ids` passed when calling [`DeepseekV3Model`].
        hidden_size (`int`, *optional*, defaults to 7168):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 18432):
            Dimension of the MLP representations.
        moe_intermediate_size (`int`, *optional*, defaults to 2048):
            Dimension of the MoE representations.
        num_hidden_layers (`int`, *optional*, defaults to 61):
            Number of hidden layers in the Transformer decoder.
        num_nextn_predict_layers (`int`, *optional*, defaults to 1):
            Number of nextn predict layers in the DeepSeekV3 model.
        num_attention_heads (`int`, *optional*, defaults to 128):
            Number of attention heads for each attention layer in the Transformer decoder.
        num_key_value_heads (`int`, *optional*, defaults to 128):
            Number of key/value heads used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads` the model uses Multi Head Attention (MHA),
            if `num_key_value_heads=1` it uses Multi Query Attention (MQA), otherwise GQA is
            used. When converting a multi-head checkpoint to a GQA checkpoint, each group key and
            value head should be constructed by meanpooling all the original heads within that
            group. For more details see [this paper](https://arxiv.org/pdf/2305.13245.pdf).
            Passing `None` falls back to `num_attention_heads`.
        n_shared_experts (`int`, *optional*, defaults to 1):
            Number of shared experts; `None` means dense model.
        n_routed_experts (`int`, *optional*, defaults to 256):
            Number of routed experts; `None` means dense model.
        ep_size (`int`, *optional*, defaults to 1):
            Stored as-is. Presumably the expert-parallel group size; confirm against the
            modeling code.
        routed_scaling_factor (`float`, *optional*, defaults to 2.5):
            Scaling factor for routed experts.
        kv_lora_rank (`int`, *optional*, defaults to 512):
            Stored as-is. Presumably the rank of the low-rank key/value projection; confirm
            against the modeling code.
        q_lora_rank (`int`, *optional*, defaults to 1536):
            Stored as-is. Presumably the rank of the low-rank query projection; confirm against
            the modeling code.
        qk_rope_head_dim (`int`, *optional*, defaults to 64):
            Stored as-is. Presumably the per-head query/key dimension that receives rotary
            embeddings; confirm against the modeling code.
        v_head_dim (`int`, *optional*, defaults to 128):
            Stored as-is. Presumably the per-head value dimension; confirm against the modeling
            code.
        qk_nope_head_dim (`int`, *optional*, defaults to 128):
            Stored as-is. Presumably the per-head query/key dimension without rotary embeddings;
            confirm against the modeling code.
        topk_method (`str`, *optional*, defaults to `"noaux_tc"`):
            Topk method used in the routed gate.
        n_group (`int`, *optional*, defaults to 8):
            Number of groups for routed experts.
        topk_group (`int`, *optional*, defaults to 4):
            Number of selected groups for each token (for each token, the selected experts are
            only within `topk_group` groups).
        num_experts_per_tok (`int`, *optional*, defaults to 8):
            Number of selected experts; `None` means dense model.
        moe_layer_freq (`int`, *optional*, defaults to 1):
            The frequency of the MoE layer: one expert layer for every `moe_layer_freq - 1`
            dense layers.
        first_k_dense_replace (`int`, *optional*, defaults to 3):
            Number of dense layers in shallow layers
            (embed->dense->dense->...->dense->moe->moe...->lm_head), i.e. the first k layers
            are dense.
        norm_topk_prob (`bool`, *optional*, defaults to `True`):
            Whether to normalize the weights of the routed experts.
        scoring_func (`str`, *optional*, defaults to `"sigmoid"`):
            Method of computing expert weights.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 4096):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all
            weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by
            all models). Only relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*):
            Padding token id.
        bos_token_id (`int`, *optional*, defaults to 0):
            Beginning of stream token id.
        eos_token_id (`int`, *optional*, defaults to 1):
            End of stream token id.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether to tie weight embeddings.
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
        rope_scaling (`Dict`, *optional*):
            Dictionary containing the scaling configuration for the RoPE embeddings. The
            expected format is `{"type": strategy name, "factor": scaling factor}`. When using
            this flag, don't update `max_position_embeddings` to the expected new maximum.
        attention_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during
            self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        kwargs:
            Forwarded to `PretrainedConfig.__init__`. Earlier versions of this docstring listed
            `aux_loss_alpha` and `seq_aux`; they are not named parameters of this constructor
            and can only be supplied through `**kwargs`.

    ```python
    >>> # Initializing a Deepseek-V3 style configuration
    >>> configuration = DeepseekV3Config()

    >>> # Accessing a configuration value
    >>> configuration.hidden_size
    7168
    ```"""

    model_type = "deepseek_v3"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=129280,
        hidden_size=7168,
        intermediate_size=18432,
        moe_intermediate_size=2048,
        num_hidden_layers=61,
        num_nextn_predict_layers=1,
        num_attention_heads=128,
        num_key_value_heads=128,
        n_shared_experts=1,
        n_routed_experts=256,
        ep_size=1,
        routed_scaling_factor=2.5,
        kv_lora_rank=512,
        q_lora_rank=1536,
        qk_rope_head_dim=64,
        v_head_dim=128,
        qk_nope_head_dim=128,
        topk_method='noaux_tc',
        n_group=8,
        topk_group=4,
        num_experts_per_tok=8,
        moe_layer_freq=1,
        first_k_dense_replace=3,
        norm_topk_prob=True,
        scoring_func='sigmoid',
        hidden_act="silu",
        max_position_embeddings=4096,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        pad_token_id=None,
        bos_token_id=0,
        eos_token_id=1,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        rope_scaling=None,
        attention_bias=False,
        attention_dropout=0.0,
        **kwargs,
    ):
        # Plain attribute storage; the values are consumed by the modeling code.
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.moe_intermediate_size = moe_intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_nextn_predict_layers = num_nextn_predict_layers
        self.num_attention_heads = num_attention_heads
        self.n_shared_experts = n_shared_experts
        self.n_routed_experts = n_routed_experts
        self.ep_size = ep_size
        self.routed_scaling_factor = routed_scaling_factor
        self.kv_lora_rank = kv_lora_rank
        self.q_lora_rank = q_lora_rank
        self.qk_rope_head_dim = qk_rope_head_dim
        self.v_head_dim = v_head_dim
        self.qk_nope_head_dim = qk_nope_head_dim
        self.topk_method = topk_method
        self.n_group = n_group
        self.topk_group = topk_group
        self.num_experts_per_tok = num_experts_per_tok
        self.moe_layer_freq = moe_layer_freq
        self.first_k_dense_replace = first_k_dense_replace
        self.norm_topk_prob = norm_topk_prob
        self.scoring_func = scoring_func
        # for backward compatibility: configs without an explicit KV head
        # count fall back to plain multi-head attention.
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout

        # The base-class initializer runs last, after all attributes are set.
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch DeepSeek model.""" +import math +import warnings +from typing import List, Optional, Tuple, Union + +import torch +import torch.nn.functional as F +import torch.utils.checkpoint +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from transformers.activations import ACT2FN +from transformers.cache_utils import Cache, DynamicCache +from transformers.modeling_attn_mask_utils import ( + AttentionMaskConverter, + _prepare_4d_attention_mask, + _prepare_4d_causal_attention_mask, +) +from transformers.modeling_outputs import ( + BaseModelOutputWithPast, + CausalLMOutputWithPast, + SequenceClassifierOutputWithPast, +) +from transformers.modeling_utils import PreTrainedModel +from transformers.pytorch_utils import ( + ALL_LAYERNORM_LAYERS, + is_torch_greater_or_equal_than_1_13, +) +from transformers.utils import ( + add_start_docstrings, + add_start_docstrings_to_model_forward, + is_flash_attn_2_available, + is_flash_attn_greater_or_equal_2_10, + logging, + replace_return_docstrings, +) +from transformers.utils.import_utils import is_torch_fx_available +from .configuration_deepseek import DeepseekV3Config +import torch.distributed as dist +import numpy as np + +if is_flash_attn_2_available(): + from flash_attn import flash_attn_func, flash_attn_varlen_func + from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa + + +# This makes `_prepare_4d_causal_attention_mask` a leaf function in the FX graph. 
+# It means that the function will not be traced through and simply appear as a node in the graph. +if is_torch_fx_available(): + if not is_torch_greater_or_equal_than_1_13: + import torch.fx + + _prepare_4d_causal_attention_mask = torch.fx.wrap(_prepare_4d_causal_attention_mask) + + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "DeepseekV3Config" + + +def _get_unpad_data(attention_mask): + seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) + indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() + max_seqlen_in_batch = seqlens_in_batch.max().item() + cu_seqlens = F.pad( + torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.torch.int32), (1, 0) + ) + return ( + indices, + cu_seqlens, + max_seqlen_in_batch, + ) + + +class DeepseekV3RMSNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + """ + DeepseekV3RMSNorm is equivalent to T5LayerNorm + """ + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + return self.weight * hidden_states.to(input_dtype) + + +ALL_LAYERNORM_LAYERS.append(DeepseekV3RMSNorm) + + +class DeepseekV3RotaryEmbedding(nn.Module): + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): + super().__init__() + + self.dim = dim + self.max_position_embeddings = max_position_embeddings + self.base = base + inv_freq = 1.0 / ( + self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim) + ) + self.register_buffer("inv_freq", inv_freq, persistent=False) + + # Build here to make `torch.jit.trace` work. 
# Adapted from transformers.models.llama.modeling_llama.LlamaLinearScalingRotaryEmbedding with Llama->DeepseekV3
class DeepseekV3LinearScalingRotaryEmbedding(DeepseekV3RotaryEmbedding):
    """DeepseekV3RotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev

    Position indices are divided by ``scaling_factor`` before the rotary
    angles are computed; everything else is inherited from the base class.
    """

    def __init__(
        self,
        dim,
        max_position_embeddings=2048,
        base=10000,
        device=None,
        scaling_factor=1.0,
    ):
        # Must be assigned before super().__init__(): the base constructor
        # calls _set_cos_sin_cache(), which reads self.scaling_factor.
        self.scaling_factor = scaling_factor
        super().__init__(dim, max_position_embeddings, base, device)

    def _set_cos_sin_cache(self, seq_len, device, dtype):
        """Rebuild the ``cos_cached`` / ``sin_cached`` buffers for ``seq_len`` positions.

        Args:
            seq_len: number of positions to cache; also stored in
                ``self.max_seq_len_cached``.
            device: device on which the position index tensor is created.
            dtype: dtype the cached cos/sin tables are cast to.
        """
        self.max_seq_len_cached = seq_len
        t = torch.arange(
            self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype
        )
        t = t / self.scaling_factor

        # Move inv_freq next to `t` first, as the base class does, so that the
        # outer product cannot fail on a device mismatch.
        freqs = torch.outer(t, self.inv_freq.to(t.device))
        # Different from paper, but it uses a different permutation in order to obtain the same calculation
        emb = torch.cat((freqs, freqs), dim=-1)
        self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
        self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
Credits to the Reddit users /u/bloc97 and /u/emozilla""" + + def __init__( + self, + dim, + max_position_embeddings=2048, + base=10000, + device=None, + scaling_factor=1.0, + ): + self.scaling_factor = scaling_factor + super().__init__(dim, max_position_embeddings, base, device) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + + if seq_len > self.max_position_embeddings: + base = self.base * ( + (self.scaling_factor * seq_len / self.max_position_embeddings) + - (self.scaling_factor - 1) + ) ** (self.dim / (self.dim - 2)) + inv_freq = 1.0 / ( + base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim) + ) + self.register_buffer("inv_freq", inv_freq, persistent=False) + + t = torch.arange( + self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype + ) + + freqs = torch.outer(t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) + self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) + + +# Inverse dim formula to find dim based on number of rotations +def yarn_find_correction_dim( + num_rotations, dim, base=10000, max_position_embeddings=2048 +): + return (dim * math.log(max_position_embeddings / (num_rotations * 2 * math.pi))) / ( + 2 * math.log(base) + ) + + +# Find dim range bounds based on rotations +def yarn_find_correction_range( + low_rot, high_rot, dim, base=10000, max_position_embeddings=2048 +): + low = math.floor( + yarn_find_correction_dim(low_rot, dim, base, max_position_embeddings) + ) + high = math.ceil( + yarn_find_correction_dim(high_rot, dim, base, max_position_embeddings) + ) + return max(low, 0), min(high, dim - 1) # Clamp values just in case + + +def yarn_get_mscale(scale=1, mscale=1): + if scale <= 1: + return 1.0 + return 0.1 * mscale * math.log(scale) + 1.0 + + +def 
class DeepseekV3YarnRotaryEmbedding(DeepseekV3RotaryEmbedding):
    """Rotary embedding whose inverse frequencies are a per-dimension blend.

    Two inverse-frequency sets are computed (plain, and divided by
    ``scaling_factor``) and mixed with a linear ramp between the bounds given
    by ``yarn_find_correction_range``. The cached cos/sin tables are
    additionally multiplied by a ratio of ``yarn_get_mscale`` values.
    """

    def __init__(
        self,
        dim,
        max_position_embeddings=2048,
        base=10000,
        device=None,
        scaling_factor=1.0,
        original_max_position_embeddings=4096,
        beta_fast=32,
        beta_slow=1,
        mscale=1,
        mscale_all_dim=0,
    ):
        # All YaRN settings must exist before super().__init__(), because the
        # base constructor immediately calls _set_cos_sin_cache().
        self.scaling_factor = scaling_factor
        self.original_max_position_embeddings = original_max_position_embeddings
        self.beta_fast = beta_fast
        self.beta_slow = beta_slow
        self.mscale = mscale
        self.mscale_all_dim = mscale_all_dim
        super().__init__(dim, max_position_embeddings, base, device)

    def _set_cos_sin_cache(self, seq_len, device, dtype):
        """Recompute ``inv_freq`` and the cos/sin tables for ``seq_len`` positions.

        Args:
            seq_len: number of positions to cache; stored in
                ``self.max_seq_len_cached``.
            device: device on which the intermediate tensors are created.
            dtype: dtype the cached cos/sin tables are cast to.
        """
        self.max_seq_len_cached = seq_len
        dim = self.dim

        # base ** (2i / dim) for every even index i, shared by both variants.
        exponents = torch.arange(0, dim, 2, dtype=torch.float32, device=device) / dim
        base_powers = self.base**exponents
        freq_extra = 1.0 / base_powers
        freq_inter = 1.0 / (self.scaling_factor * base_powers)

        low, high = yarn_find_correction_range(
            self.beta_fast,
            self.beta_slow,
            dim,
            self.base,
            self.original_max_position_embeddings,
        )
        ramp = yarn_linear_ramp_mask(low, high, dim // 2).to(
            device=device, dtype=torch.float32
        )
        # Weight 1 keeps the unscaled frequency, weight 0 the scaled one.
        inv_freq_mask = 1.0 - ramp
        inv_freq = freq_inter * (1 - inv_freq_mask) + freq_extra * inv_freq_mask
        self.register_buffer("inv_freq", inv_freq, persistent=False)

        positions = torch.arange(seq_len, device=device, dtype=torch.float32)
        freqs = torch.outer(positions, inv_freq)

        magnitude = float(
            yarn_get_mscale(self.scaling_factor, self.mscale)
            / yarn_get_mscale(self.scaling_factor, self.mscale_all_dim)
        )

        emb = torch.cat((freqs, freqs), dim=-1)
        cos_table = (emb.cos() * magnitude).to(dtype)
        sin_table = (emb.sin() * magnitude).to(dtype)
        self.register_buffer("cos_cached", cos_table, persistent=False)
        self.register_buffer("sin_cached", sin_table, persistent=False)
class DeepseekV3MLP(nn.Module):
    """Gated feed-forward block: ``down_proj(act_fn(gate_proj(x)) * up_proj(x))``.

    All three projections are bias-free linear layers. ``hidden_size`` and
    ``intermediate_size`` fall back to the values on ``config`` when not given.
    """

    def __init__(self, config, hidden_size=None, intermediate_size=None):
        super().__init__()
        self.config = config

        if hidden_size is None:
            hidden_size = config.hidden_size
        if intermediate_size is None:
            intermediate_size = config.intermediate_size
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size

        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
        # Activation is looked up by the name stored in config.hidden_act.
        self.act_fn = ACT2FN[config.hidden_act]

    def forward(self, x):
        """Apply the gated projection to ``x`` and return the result."""
        gate = self.act_fn(self.gate_proj(x))
        return self.down_proj(gate * self.up_proj(x))
class DeepseekV3MoE(nn.Module):
    """
    A mixed expert module containing shared experts.

    Tokens are routed by ``MoEGate`` to ``config.num_experts_per_tok`` routed
    experts. When ``config.n_shared_experts`` is set, one additional MLP is
    applied to every token and added to the routed output. Only the inference
    path is implemented.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.num_experts_per_tok = config.num_experts_per_tok

        if hasattr(config, "ep_size") and config.ep_size > 1:
            # Expert parallelism: each rank owns one contiguous slice of experts.
            assert config.ep_size == dist.get_world_size()
            self.ep_size = config.ep_size
            self.experts_per_rank = config.n_routed_experts // config.ep_size
            self.ep_rank = dist.get_rank()
            # Slots for experts owned by other ranks are None, so list indices
            # stay global expert ids.
            self.experts = nn.ModuleList(
                [
                    (
                        DeepseekV3MLP(
                            config, intermediate_size=config.moe_intermediate_size
                        )
                        if i >= self.ep_rank * self.experts_per_rank
                        and i < (self.ep_rank + 1) * self.experts_per_rank
                        else None
                    )
                    for i in range(config.n_routed_experts)
                ]
            )
        else:
            self.ep_size = 1
            self.experts_per_rank = config.n_routed_experts
            self.ep_rank = 0
            self.experts = nn.ModuleList(
                [
                    DeepseekV3MLP(
                        config, intermediate_size=config.moe_intermediate_size
                    )
                    for i in range(config.n_routed_experts)
                ]
            )
        self.gate = MoEGate(config)
        if config.n_shared_experts is not None:
            intermediate_size = config.moe_intermediate_size * config.n_shared_experts
            self.shared_experts = DeepseekV3MLP(
                config=config, intermediate_size=intermediate_size
            )

    def forward(self, hidden_states):
        """Route ``hidden_states`` through the experts and return the mixed output.

        Args:
            hidden_states: input tensor; it is flattened to
                ``(-1, hidden_states.shape[-1])`` for routing and the result
                is reshaped back to the original shape.

        Returns:
            Tensor with the same shape as ``hidden_states``.

        Raises:
            NotImplementedError: if the module is in training mode. There is
                no training path; previously this surfaced as an unbound
                local ``y``.
        """
        identity = hidden_states
        orig_shape = hidden_states.shape
        topk_idx, topk_weight = self.gate(hidden_states)
        if self.training:
            raise NotImplementedError(
                f"{self.__class__.__name__} only implements the inference path; "
                "call `.eval()` before running the forward pass."
            )
        hidden_states = hidden_states.view(-1, hidden_states.shape[-1])
        y = self.moe_infer(hidden_states, topk_idx, topk_weight).view(*orig_shape)
        if self.config.n_shared_experts is not None:
            y = y + self.shared_experts(identity)
        return y

    @torch.no_grad()
    def moe_infer(self, x, topk_ids, topk_weight):
        """Run the routed experts on ``x`` and combine their outputs by ``topk_weight``.

        Args:
            x: flattened tokens, indexed on dim 0 by token and on dim 1 by
                hidden feature.
            topk_ids: per-token expert ids; dim 1 is the number of experts
                chosen per token.
            topk_weight: mixing weights with the same shape as ``topk_ids``.

        Returns:
            Tensor with one row per token: the weighted sum of the outputs of
            that token's selected experts, cast back to the expert output dtype.
        """
        # Count how many (token, slot) pairs were assigned to every expert.
        cnts = topk_ids.new_zeros((topk_ids.shape[0], len(self.experts)))
        cnts.scatter_(1, topk_ids, 1)
        tokens_per_expert = cnts.sum(dim=0)
        # Sort the flattened assignments by expert id so each expert sees one
        # contiguous slice; `idxs // top_k` maps a slot back to its token row.
        idxs = topk_ids.view(-1).argsort()
        sorted_tokens = x[idxs // topk_ids.shape[1]]
        sorted_tokens_shape = sorted_tokens.shape
        if self.ep_size > 1:
            # Exchange per-expert counts, then the tokens themselves, so every
            # rank receives the tokens destined for its local experts.
            tokens_per_ep_rank = tokens_per_expert.view(self.ep_size, -1).sum(dim=1)
            tokens_per_expert_group = tokens_per_expert.new_empty(
                tokens_per_expert.shape[0]
            )
            dist.all_to_all_single(tokens_per_expert_group, tokens_per_expert)
            output_splits = (
                tokens_per_expert_group.view(self.ep_size, -1)
                .sum(1)
                .cpu()
                .numpy()
                .tolist()
            )
            gathered_tokens = sorted_tokens.new_empty(
                tokens_per_expert_group.sum(dim=0).cpu().item(), sorted_tokens.shape[1]
            )
            input_split_sizes = tokens_per_ep_rank.cpu().numpy().tolist()
            dist.all_to_all(
                list(gathered_tokens.split(output_splits)),
                list(sorted_tokens.split(input_split_sizes)),
            )
            tokens_per_expert_post_gather = tokens_per_expert_group.view(
                self.ep_size, self.experts_per_rank
            ).sum(dim=0)
            # Received tokens are grouped by source rank; regroup them by
            # local expert id.
            gatherd_idxs = np.zeros(shape=(gathered_tokens.shape[0],), dtype=np.int32)
            s = 0
            for i, k in enumerate(tokens_per_expert_group.cpu().numpy()):
                gatherd_idxs[s : s + k] = i % self.experts_per_rank
                s += k
            gatherd_idxs = gatherd_idxs.argsort()
            sorted_tokens = gathered_tokens[gatherd_idxs]
            tokens_per_expert = tokens_per_expert_post_gather
        tokens_per_expert = tokens_per_expert.cpu().numpy()

        outputs = []
        start_idx = 0
        for i, num_tokens in enumerate(tokens_per_expert):
            end_idx = start_idx + num_tokens
            if num_tokens == 0:
                continue
            # Offset by the rank's first expert: self.experts uses global ids.
            expert = self.experts[i + self.ep_rank * self.experts_per_rank]
            tokens_for_this_expert = sorted_tokens[start_idx:end_idx]
            expert_out = expert(tokens_for_this_expert)
            outputs.append(expert_out)
            start_idx = end_idx

        outs = torch.cat(outputs, dim=0) if outputs else sorted_tokens.new_empty(0)
        if self.ep_size > 1:
            # Undo the local regrouping and send results back to their owners.
            new_x = torch.empty_like(outs)
            new_x[gatherd_idxs] = outs
            gathered_tokens = new_x.new_empty(*sorted_tokens_shape)
            dist.all_to_all(
                list(gathered_tokens.split(input_split_sizes)),
                list(new_x.split(output_splits)),
            )
            outs = gathered_tokens

        # Scatter expert outputs back into the original (token, slot) order,
        # then take the weighted sum over each token's slots.
        new_x = torch.empty_like(outs)
        new_x[idxs] = outs
        final_out = (
            new_x.view(*topk_ids.shape, -1)
            .type(topk_weight.dtype)
            .mul_(topk_weight.unsqueeze(dim=-1))
            .sum(dim=1)
            .type(new_x.dtype)
        )
        return final_out
+ ) + + self.attention_dropout = config.attention_dropout + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + + self.max_position_embeddings = config.max_position_embeddings + self.rope_theta = config.rope_theta + self.q_lora_rank = config.q_lora_rank + self.qk_rope_head_dim = config.qk_rope_head_dim + self.kv_lora_rank = config.kv_lora_rank + self.v_head_dim = config.v_head_dim + self.qk_nope_head_dim = config.qk_nope_head_dim + self.q_head_dim = config.qk_nope_head_dim + config.qk_rope_head_dim + + self.is_causal = True + + if self.q_lora_rank is None: + self.q_proj = nn.Linear( + self.hidden_size, self.num_heads * self.q_head_dim, bias=False + ) + else: + self.q_a_proj = nn.Linear( + self.hidden_size, config.q_lora_rank, bias=config.attention_bias + ) + self.q_a_layernorm = DeepseekV3RMSNorm(config.q_lora_rank) + self.q_b_proj = nn.Linear( + config.q_lora_rank, self.num_heads * self.q_head_dim, bias=False + ) + + self.kv_a_proj_with_mqa = nn.Linear( + self.hidden_size, + config.kv_lora_rank + config.qk_rope_head_dim, + bias=config.attention_bias, + ) + self.kv_a_layernorm = DeepseekV3RMSNorm(config.kv_lora_rank) + self.kv_b_proj = nn.Linear( + config.kv_lora_rank, + self.num_heads + * (self.q_head_dim - self.qk_rope_head_dim + self.v_head_dim), + bias=False, + ) + + self.o_proj = nn.Linear( + self.num_heads * self.v_head_dim, + self.hidden_size, + bias=config.attention_bias, + ) + self._init_rope() + + self.softmax_scale = self.q_head_dim ** (-0.5) + if self.config.rope_scaling is not None: + mscale_all_dim = self.config.rope_scaling.get("mscale_all_dim", 0) + scaling_factor = self.config.rope_scaling["factor"] + if mscale_all_dim: + mscale = yarn_get_mscale(scaling_factor, mscale_all_dim) + self.softmax_scale = self.softmax_scale * mscale * mscale + + def _init_rope(self): + if self.config.rope_scaling is None: + self.rotary_emb = DeepseekV3RotaryEmbedding( + self.qk_rope_head_dim, + 
max_position_embeddings=self.max_position_embeddings, + base=self.rope_theta, + ) + else: + scaling_type = self.config.rope_scaling["type"] + scaling_factor = self.config.rope_scaling["factor"] + if scaling_type == "linear": + self.rotary_emb = DeepseekV3LinearScalingRotaryEmbedding( + self.qk_rope_head_dim, + max_position_embeddings=self.max_position_embeddings, + scaling_factor=scaling_factor, + base=self.rope_theta, + ) + elif scaling_type == "dynamic": + self.rotary_emb = DeepseekV3DynamicNTKScalingRotaryEmbedding( + self.qk_rope_head_dim, + max_position_embeddings=self.max_position_embeddings, + scaling_factor=scaling_factor, + base=self.rope_theta, + ) + elif scaling_type == "yarn": + kwargs = { + key: self.config.rope_scaling[key] + for key in [ + "original_max_position_embeddings", + "beta_fast", + "beta_slow", + "mscale", + "mscale_all_dim", + ] + if key in self.config.rope_scaling + } + self.rotary_emb = DeepseekV3YarnRotaryEmbedding( + self.qk_rope_head_dim, + max_position_embeddings=self.max_position_embeddings, + scaling_factor=scaling_factor, + base=self.rope_theta, + **kwargs, + ) + else: + raise ValueError(f"Unknown RoPE scaling type {scaling_type}") + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return ( + tensor.view(bsz, seq_len, self.num_heads, self.v_head_dim) + .transpose(1, 2) + .contiguous() + ) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + **kwargs, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. 
Please make sure use `attention_mask` instead.`" + ) + bsz, q_len, _ = hidden_states.size() + + if self.q_lora_rank is None: + q = self.q_proj(hidden_states) + else: + q = self.q_b_proj(self.q_a_layernorm(self.q_a_proj(hidden_states))) + q = q.view(bsz, q_len, self.num_heads, self.q_head_dim).transpose(1, 2) + q_nope, q_pe = torch.split( + q, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1 + ) + + compressed_kv = self.kv_a_proj_with_mqa(hidden_states) + compressed_kv, k_pe = torch.split( + compressed_kv, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1 + ) + k_pe = k_pe.view(bsz, q_len, 1, self.qk_rope_head_dim).transpose(1, 2) + kv = ( + self.kv_b_proj(self.kv_a_layernorm(compressed_kv)) + .view(bsz, q_len, self.num_heads, self.qk_nope_head_dim + self.v_head_dim) + .transpose(1, 2) + ) + + k_nope, value_states = torch.split( + kv, [self.qk_nope_head_dim, self.v_head_dim], dim=-1 + ) + kv_seq_len = value_states.shape[-2] + if past_key_value is not None: + if self.layer_idx is None: + raise ValueError( + f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " + "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " + "with a layer index." 
+ ) + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + + q_pe, k_pe = apply_rotary_pos_emb(q_pe, k_pe, cos, sin, position_ids) + + query_states = k_pe.new_empty(bsz, self.num_heads, q_len, self.q_head_dim) + query_states[:, :, :, : self.qk_nope_head_dim] = q_nope + query_states[:, :, :, self.qk_nope_head_dim :] = q_pe + + key_states = k_pe.new_empty(bsz, self.num_heads, q_len, self.q_head_dim) + key_states[:, :, :, : self.qk_nope_head_dim] = k_nope + key_states[:, :, :, self.qk_nope_head_dim :] = k_pe + if past_key_value is not None: + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update( + key_states, value_states, self.layer_idx, cache_kwargs + ) + + attn_weights = ( + torch.matmul(query_states, key_states.transpose(2, 3)) * self.softmax_scale + ) + + if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): + raise ValueError( + f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is" + f" {attn_weights.size()}" + ) + assert attention_mask is not None + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + ) + attn_weights = attn_weights + attention_mask + + # upcast attention to fp32 + attn_weights = nn.functional.softmax( + attn_weights, dim=-1, dtype=torch.float32 + ).to(query_states.dtype) + attn_weights = nn.functional.dropout( + attn_weights, p=self.attention_dropout, training=self.training + ) + attn_output = torch.matmul(attn_weights, value_states) + + if attn_output.size() != (bsz, self.num_heads, q_len, self.v_head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.v_head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = 
attn_output.transpose(1, 2).contiguous() + + attn_output = attn_output.reshape(bsz, q_len, self.num_heads * self.v_head_dim) + + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + +# Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2 with Llama->DeepseekV3 +class DeepseekV3FlashAttention2(DeepseekV3Attention): + """ + DeepseekV3 flash attention module. This module inherits from `DeepseekV3Attention` as the weights of the module stays + untouched. The only required change would be on the forward pass where it needs to correctly call the public API of + flash attention and deal with padding tokens in case the input contains any of them. + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. + # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. + # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left). + self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10() + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + **kwargs, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + # DeepseekV3FlashAttention2 attention does not support output_attentions + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. 
Please make sure use `attention_mask` instead.`" + ) + + # overwrite attention_mask with padding_mask + attention_mask = kwargs.pop("padding_mask") + + output_attentions = False + + bsz, q_len, _ = hidden_states.size() + + if self.q_lora_rank is None: + q = self.q_proj(hidden_states) + else: + q = self.q_b_proj(self.q_a_layernorm(self.q_a_proj(hidden_states))) + q = q.view(bsz, q_len, self.num_heads, self.q_head_dim).transpose(1, 2) + q_nope, q_pe = torch.split( + q, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1 + ) + + # Flash attention requires the input to have the shape + # batch_size x seq_length x head_dim x hidden_dim + # therefore we just need to keep the original shape + compressed_kv = self.kv_a_proj_with_mqa(hidden_states) + compressed_kv, k_pe = torch.split( + compressed_kv, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1 + ) + k_pe = k_pe.view(bsz, q_len, 1, self.qk_rope_head_dim).transpose(1, 2) + kv = ( + self.kv_b_proj(self.kv_a_layernorm(compressed_kv)) + .view(bsz, q_len, self.num_heads, self.qk_nope_head_dim + self.v_head_dim) + .transpose(1, 2) + ) + + k_nope, value_states = torch.split( + kv, [self.qk_nope_head_dim, self.v_head_dim], dim=-1 + ) + kv_seq_len = value_states.shape[-2] + + kv_seq_len = value_states.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + q_pe, k_pe = apply_rotary_pos_emb(q_pe, k_pe, cos, sin, position_ids) + + query_states = k_pe.new_empty(bsz, self.num_heads, q_len, self.q_head_dim) + query_states[:, :, :, : self.qk_nope_head_dim] = q_nope + query_states[:, :, :, self.qk_nope_head_dim :] = q_pe + + key_states = k_pe.new_empty(bsz, self.num_heads, q_len, self.q_head_dim) + key_states[:, :, :, : self.qk_nope_head_dim] = k_nope + key_states[:, :, :, self.qk_nope_head_dim :] = k_pe + + if self.q_head_dim != self.v_head_dim: + value_states = F.pad(value_states, [0, 
self.q_head_dim - self.v_head_dim]) + + if past_key_value is not None: + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update( + key_states, value_states, self.layer_idx, cache_kwargs + ) + + # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache + # to be able to avoid many of these transpose/reshape/view. + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + dropout_rate = self.attention_dropout if self.training else 0.0 + + # In PEFT, usually we cast the layer norms in float32 for training stability reasons + # therefore the input hidden states gets silently casted in float32. Hence, we need + # cast them back in the correct dtype just to be sure everything works as expected. + # This might slowdown training & inference so it is recommended to not cast the LayerNorms + # in fp32. (DeepseekV3RMSNorm handles it correctly) + + input_dtype = query_states.dtype + if input_dtype == torch.float32: + # Handle the case where the model is quantized + if hasattr(self.config, "_pre_quantization_dtype"): + target_dtype = self.config._pre_quantization_dtype + elif torch.is_autocast_enabled(): + target_dtype = torch.get_autocast_gpu_dtype() + else: + target_dtype = ( + self.q_proj.weight.dtype + if self.q_lora_rank is None + else self.q_a_proj.weight.dtype + ) + + logger.warning_once( + f"The input hidden states seems to be silently casted in float32, this might be related to" + f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" + f" {target_dtype}." 
+ ) + + query_states = query_states.to(target_dtype) + key_states = key_states.to(target_dtype) + value_states = value_states.to(target_dtype) + + attn_output = self._flash_attention_forward( + query_states, + key_states, + value_states, + attention_mask, + q_len, + dropout=dropout_rate, + softmax_scale=self.softmax_scale, + ) + if self.q_head_dim != self.v_head_dim: + attn_output = attn_output[:, :, :, : self.v_head_dim] + + attn_output = attn_output.reshape( + bsz, q_len, self.num_heads * self.v_head_dim + ).contiguous() + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + def _flash_attention_forward( + self, + query_states, + key_states, + value_states, + attention_mask, + query_length, + dropout=0.0, + softmax_scale=None, + ): + """ + Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token + first unpad the input, then computes the attention scores and pad the final attention scores. + + Args: + query_states (`torch.Tensor`): + Input query states to be passed to Flash Attention API + key_states (`torch.Tensor`): + Input key states to be passed to Flash Attention API + value_states (`torch.Tensor`): + Input value states to be passed to Flash Attention API + attention_mask (`torch.Tensor`): + The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the + position of padding tokens and 1 for the position of non-padding tokens. + dropout (`int`, *optional*): + Attention dropout + softmax_scale (`float`, *optional*): + The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim) + """ + if not self._flash_attn_uses_top_left_mask: + causal = self.is_causal + else: + # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in DeepseekV3FlashAttention2 __init__. 
+ causal = self.is_causal and query_length != 1 + + # Contains at least one padding token in the sequence + if attention_mask is not None: + batch_size = query_states.shape[0] + ( + query_states, + key_states, + value_states, + indices_q, + cu_seq_lens, + max_seq_lens, + ) = self._upad_input( + query_states, key_states, value_states, attention_mask, query_length + ) + + cu_seqlens_q, cu_seqlens_k = cu_seq_lens + max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens + + attn_output_unpad = flash_attn_varlen_func( + query_states, + key_states, + value_states, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_in_batch_q, + max_seqlen_k=max_seqlen_in_batch_k, + dropout_p=dropout, + softmax_scale=softmax_scale, + causal=causal, + ) + + attn_output = pad_input( + attn_output_unpad, indices_q, batch_size, query_length + ) + else: + attn_output = flash_attn_func( + query_states, + key_states, + value_states, + dropout, + softmax_scale=softmax_scale, + causal=causal, + ) + + return attn_output + + def _upad_input( + self, query_layer, key_layer, value_layer, attention_mask, query_length + ): + indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) + batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape + + key_layer = index_first_axis( + key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), + indices_k, + ) + value_layer = index_first_axis( + value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), + indices_k, + ) + if query_length == kv_seq_len: + query_layer = index_first_axis( + query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim), + indices_k, + ) + cu_seqlens_q = cu_seqlens_k + max_seqlen_in_batch_q = max_seqlen_in_batch_k + indices_q = indices_k + elif query_length == 1: + max_seqlen_in_batch_q = 1 + cu_seqlens_q = torch.arange( + batch_size + 1, dtype=torch.int32, device=query_layer.device + ) # There is a memcpy here, that 
is very bad. + indices_q = cu_seqlens_q[:-1] + query_layer = query_layer.squeeze(1) + else: + # The -q_len: slice assumes left padding. + attention_mask = attention_mask[:, -query_length:] + query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input( + query_layer, attention_mask + ) + + return ( + query_layer, + key_layer, + value_layer, + indices_q, + (cu_seqlens_q, cu_seqlens_k), + (max_seqlen_in_batch_q, max_seqlen_in_batch_k), + ) + + +ATTENTION_CLASSES = { + "eager": DeepseekV3Attention, + "flash_attention_2": DeepseekV3FlashAttention2, +} + + +class DeepseekV3DecoderLayer(nn.Module): + def __init__(self, config: DeepseekV3Config, layer_idx: int): + super().__init__() + self.hidden_size = config.hidden_size + + self.self_attn = ATTENTION_CLASSES[config._attn_implementation]( + config=config, layer_idx=layer_idx + ) + + self.mlp = ( + DeepseekV3MoE(config) + if ( + config.n_routed_experts is not None + and layer_idx >= config.first_k_dense_replace + and layer_idx % config.moe_layer_freq == 0 + ) + else DeepseekV3MLP(config) + ) + self.input_layernorm = DeepseekV3RMSNorm( + config.hidden_size, eps=config.rms_norm_eps + ) + self.post_attention_layernorm = DeepseekV3RMSNorm( + config.hidden_size, eps=config.rms_norm_eps + ) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + **kwargs, + ) -> Tuple[ + torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]] + ]: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`, *optional*): + attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1, + query_sequence_length, key_sequence_length)` if default attention is 
used. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). + past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states + """ + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" + ) + residual = hidden_states + + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + **kwargs, + ) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + +DeepseekV3_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. 
+ + Parameters: + config ([`DeepseekV3Config`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + + +@add_start_docstrings( + "The bare DeepseekV3 Model outputting raw hidden-states without any specific head on top.", + DeepseekV3_START_DOCSTRING, +) +class DeepseekV3PreTrainedModel(PreTrainedModel): + config_class = DeepseekV3Config + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["DeepseekV3DecoderLayer"] + _skip_keys_device_placement = "past_key_values" + _supports_flash_attn_2 = True + _supports_cache_class = True + + def _init_weights(self, module): + std = self.config.initializer_range + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + +DeepseekV3_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + + Indices can be obtained using [`AutoTokenizer`]. 
See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + If `past_key_values` is used, optionally only the last `input_ids` have to be input (see + `past_key_values`). + + If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] + and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more + information on the default strategy. + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.n_positions - 1]`. + + [What are position IDs?](../glossary#position-ids) + past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*): + Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values` + returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. + + Two formats are allowed: + - a [`~cache_utils.Cache`] instance; + - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of + shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy + cache format. + + The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the + legacy cache format will be returned. + + If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't + have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` + of shape `(batch_size, sequence_length)`. 
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare DeepseekV3 Model outputting raw hidden-states without any specific head on top.", + DeepseekV3_START_DOCSTRING, +) +class DeepseekV3Model(DeepseekV3PreTrainedModel): + """ + Transformer decoder consisting of *config.num_hidden_layers* layers. 
Each layer is a [`DeepseekV3DecoderLayer`] + + Args: + config: DeepseekV3Config + """ + + def __init__(self, config: DeepseekV3Config): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embed_tokens = nn.Embedding( + config.vocab_size, config.hidden_size, self.padding_idx + ) + self.layers = nn.ModuleList( + [ + DeepseekV3DecoderLayer(config, layer_idx) + for layer_idx in range(config.num_hidden_layers) + ] + ) + self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2" + self.norm = DeepseekV3RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, value): + self.embed_tokens = value + + @add_start_docstrings_to_model_forward(DeepseekV3_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPast]: + output_attentions = ( + output_attentions + if output_attentions is not None + else self.config.output_attentions + ) + output_hidden_states = ( + output_hidden_states + if output_hidden_states is not None + else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError( + "You 
cannot specify both input_ids and inputs_embeds at the same time" + ) + elif input_ids is not None: + batch_size, seq_length = input_ids.shape[:2] + elif inputs_embeds is not None: + batch_size, seq_length = inputs_embeds.shape[:2] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + past_key_values_length = 0 + if use_cache: + use_legacy_cache = not isinstance(past_key_values, Cache) + if use_legacy_cache: + past_key_values = DynamicCache.from_legacy_cache(past_key_values) + past_key_values_length = past_key_values.get_usable_length(seq_length) + + if position_ids is None: + device = input_ids.device if input_ids is not None else inputs_embeds.device + position_ids = torch.arange( + past_key_values_length, + seq_length + past_key_values_length, + dtype=torch.long, + device=device, + ) + position_ids = position_ids.unsqueeze(0) + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + if self._use_flash_attention_2: + # 2d mask is passed through the layers + attention_mask = ( + attention_mask + if (attention_mask is not None and 0 in attention_mask) + else None + ) + else: + # 4d mask is passed through the layers + attention_mask = _prepare_4d_causal_attention_mask( + attention_mask, + (batch_size, seq_length), + inputs_embeds, + past_key_values_length, + ) + + # embed positions + hidden_states = inputs_embeds + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + next_decoder_cache = None + + for decoder_layer in self.layers: + if output_hidden_states: + all_hidden_states += (hidden_states,) + + layer_outputs = decoder_layer( + hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_values, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache = layer_outputs[2 if output_attentions else 1] + + 
if output_attentions: + all_self_attns += (layer_outputs[1],) + + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = None + if use_cache: + next_cache = ( + next_decoder_cache.to_legacy_cache() + if use_legacy_cache + else next_decoder_cache + ) + if not return_dict: + return tuple( + v + for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] + if v is not None + ) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + +class DeepseekV3ForCausalLM(DeepseekV3PreTrainedModel): + _tied_weights_keys = ["lm_head.weight"] + + def __init__(self, config): + super().__init__(config) + self.model = DeepseekV3Model(config) + self.vocab_size = config.vocab_size + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def set_decoder(self, decoder): + self.model = decoder + + def get_decoder(self): + return self.model + + @add_start_docstrings_to_model_forward(DeepseekV3_INPUTS_DOCSTRING) + @replace_return_docstrings( + output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC + ) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + 
output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, CausalLMOutputWithPast]: + r""" + Args: + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, transformers., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, transformers., config.vocab_size]`. + + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, DeepseekV3ForCausalLM + + >>> model = DeepseekV3ForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS) + >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER) + + >>> prompt = "Hey, are you conscious? Can you talk to me?" + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." 
+ ```""" + output_attentions = ( + output_attentions + if output_attentions is not None + else self.config.output_attentions + ) + output_hidden_states = ( + output_hidden_states + if output_hidden_states is not None + else self.config.output_hidden_states + ) + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + logits = self.lm_head(hidden_states) + logits = logits.float() + + loss = None + if labels is not None: + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = CrossEntropyLoss() + shift_logits = shift_logits.view(-1, self.config.vocab_size) + shift_labels = shift_labels.view(-1) + # Enable model parallelism + shift_labels = shift_labels.to(shift_logits.device) + loss = loss_fct(shift_logits, shift_labels) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation( + self, + input_ids, + past_key_values=None, + attention_mask=None, + inputs_embeds=None, + **kwargs, + ): + if past_key_values is not None: + if isinstance(past_key_values, Cache): + cache_length = past_key_values.get_seq_length() + past_length = past_key_values.seen_tokens + max_cache_length = past_key_values.get_max_length() + else: + 
cache_length = past_length = past_key_values[0][0].shape[2] + max_cache_length = None + + # Keep only the unprocessed tokens: + # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where + # some of the inputs are exclusivelly passed as part of the cache (e.g. when passing input_embeds as + # input) + if ( + attention_mask is not None + and attention_mask.shape[1] > input_ids.shape[1] + ): + input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] + # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard + # input_ids based on the past_length. + elif past_length < input_ids.shape[1]: + input_ids = input_ids[:, past_length:] + # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. + + # If we are about to go beyond the maximum cache length, we need to crop the input attention mask. + if ( + max_cache_length is not None + and attention_mask is not None + and cache_length + input_ids.shape[1] > max_cache_length + ): + attention_mask = attention_mask[:, -max_cache_length:] + + position_ids = kwargs.get("position_ids", None) + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + position_ids = position_ids[:, -input_ids.shape[1] :] + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids} + + model_inputs.update( + { + "position_ids": position_ids, + "past_key_values": past_key_values, + "use_cache": kwargs.get("use_cache"), + "attention_mask": attention_mask, + } + ) + return model_inputs + + @staticmethod + def 
_reorder_cache(past_key_values, beam_idx): + reordered_past = () + for layer_past in past_key_values: + reordered_past += ( + tuple( + past_state.index_select(0, beam_idx.to(past_state.device)) + for past_state in layer_past + ), + ) + return reordered_past + + +@add_start_docstrings( + """ + The DeepseekV3 Model transformer with a sequence classification head on top (linear layer). + + [`DeepseekV3ForSequenceClassification`] uses the last token in order to do the classification, as other causal models + (e.g. GPT-2) do. + + Since it does classification on the last token, it requires to know the position of the last token. If a + `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If + no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the + padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in + each row of the batch). 
+ """, + DeepseekV3_START_DOCSTRING, +) +class DeepseekV3ForSequenceClassification(DeepseekV3PreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.model = DeepseekV3Model(config) + self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + @add_start_docstrings_to_model_forward(DeepseekV3_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, SequenceClassifierOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, transformers., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
+ """ + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + + transformer_outputs = self.model( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = transformer_outputs[0] + logits = self.score(hidden_states) + + if input_ids is not None: + batch_size = input_ids.shape[0] + else: + batch_size = inputs_embeds.shape[0] + + if self.config.pad_token_id is None and batch_size != 1: + raise ValueError( + "Cannot handle batch sizes > 1 if no padding token is defined." + ) + if self.config.pad_token_id is None: + sequence_lengths = -1 + else: + if input_ids is not None: + sequence_lengths = ( + torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1 + ).to(logits.device) + else: + sequence_lengths = -1 + + pooled_logits = logits[ + torch.arange(batch_size, device=logits.device), sequence_lengths + ] + + loss = None + if labels is not None: + labels = labels.to(logits.device) + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and ( + labels.dtype == torch.long or labels.dtype == torch.int + ): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(pooled_logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct( + pooled_logits.view(-1, self.num_labels), labels.view(-1) + ) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + 
loss = loss_fct(pooled_logits, labels) + if not return_dict: + output = (pooled_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutputWithPast( + loss=loss, + logits=pooled_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) diff --git a/services/core/models/tests/integration/parallelism/fixtures/deepseek-ai/deepseek-llm-67b-base/config.json b/services/core/models/tests/integration/parallelism/fixtures/deepseek-ai/deepseek-llm-67b-base/config.json new file mode 100644 index 0000000000..031a7856cc --- /dev/null +++ b/services/core/models/tests/integration/parallelism/fixtures/deepseek-ai/deepseek-llm-67b-base/config.json @@ -0,0 +1,25 @@ +{ + "architectures": [ + "LlamaForCausalLM" + ], + "bos_token_id": 1, + "eos_token_id": 2, + "hidden_act": "silu", + "hidden_size": 8192, + "initializer_range": 0.02, + "intermediate_size": 22016, + "max_position_embeddings": 4096, + "model_type": "llama", + "num_attention_heads": 64, + "num_hidden_layers": 95, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.33.1", + "use_cache": true, + "vocab_size": 102400 +} diff --git a/services/core/models/tests/integration/parallelism/fixtures/deepseek-ai/deepseek-llm-7b-base/config.json b/services/core/models/tests/integration/parallelism/fixtures/deepseek-ai/deepseek-llm-7b-base/config.json new file mode 100644 index 0000000000..208956063b --- /dev/null +++ b/services/core/models/tests/integration/parallelism/fixtures/deepseek-ai/deepseek-llm-7b-base/config.json @@ -0,0 +1,25 @@ +{ + "architectures": [ + "LlamaForCausalLM" + ], + "bos_token_id": 1, + "eos_token_id": 2, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, 
+ "intermediate_size": 11008, + "max_position_embeddings": 4096, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 30, + "num_key_value_heads": 32, + "pretraining_tp": 1, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.33.1", + "use_cache": true, + "vocab_size": 102400 +} diff --git a/services/core/models/tests/integration/parallelism/fixtures/gpt2/config.json b/services/core/models/tests/integration/parallelism/fixtures/gpt2/config.json new file mode 100644 index 0000000000..10c66461e4 --- /dev/null +++ b/services/core/models/tests/integration/parallelism/fixtures/gpt2/config.json @@ -0,0 +1,31 @@ +{ + "activation_function": "gelu_new", + "architectures": [ + "GPT2LMHeadModel" + ], + "attn_pdrop": 0.1, + "bos_token_id": 50256, + "embd_pdrop": 0.1, + "eos_token_id": 50256, + "initializer_range": 0.02, + "layer_norm_epsilon": 1e-05, + "model_type": "gpt2", + "n_ctx": 1024, + "n_embd": 768, + "n_head": 12, + "n_layer": 12, + "n_positions": 1024, + "resid_pdrop": 0.1, + "summary_activation": null, + "summary_first_dropout": 0.1, + "summary_proj_to_labels": true, + "summary_type": "cls_index", + "summary_use_proj": true, + "task_specific_params": { + "text-generation": { + "do_sample": true, + "max_length": 50 + } + }, + "vocab_size": 50257 +} \ No newline at end of file diff --git a/services/core/models/tests/integration/parallelism/fixtures/manifest.json b/services/core/models/tests/integration/parallelism/fixtures/manifest.json new file mode 100644 index 0000000000..baddd3d114 --- /dev/null +++ b/services/core/models/tests/integration/parallelism/fixtures/manifest.json @@ -0,0 +1,69 @@ +{ + "EleutherAI/gpt-j-6b": [ + "config.json" + ], + "EleutherAI/gpt-neox-20b": [ + "config.json" + ], + "Qwen/Qwen2.5-72B": [ + "config.json" + ], + "Qwen/Qwen2.5-72B-Instruct": [ + "config.json" + ], + "Qwen/Qwen2.5-7B": [ + "config.json" + 
], + "Qwen/Qwen3-4B-SafeRL": [ + "config.json" + ], + "Qwen/Qwen3-8B": [ + "config.json" + ], + "deepseek-ai/DeepSeek-V3-Base": [ + "config.json", + "configuration_deepseek.py", + "modeling_deepseek.py" + ], + "deepseek-ai/deepseek-llm-67b-base": [ + "config.json" + ], + "deepseek-ai/deepseek-llm-7b-base": [ + "config.json" + ], + "gpt2": [ + "config.json" + ], + "microsoft/phi-2": [ + "config.json" + ], + "microsoft/phi-4": [ + "config.json" + ], + "mistralai/Devstral-Small-2505": [ + "config.json" + ], + "mistralai/Mistral-7B-v0.1": [ + "config.json" + ], + "mistralai/Mixtral-8x7B-v0.1": [ + "config.json" + ], + "nvidia/Mistral-NeMo-Minitron-8B-Instruct": [ + "config.json" + ], + "nvidia/NVIDIA-Nemotron-Nano-9B-v2": [ + "config.json", + "configuration_nemotron_h.py", + "modeling_nemotron_h.py" + ], + "nvidia/nemotron-4-340b-instruct": [ + "model_config.yaml" + ], + "openai/gpt-oss-120b": [ + "config.json" + ], + "openai/gpt-oss-20b": [ + "config.json" + ] +} diff --git a/services/core/models/tests/integration/parallelism/fixtures/microsoft/phi-2/config.json b/services/core/models/tests/integration/parallelism/fixtures/microsoft/phi-2/config.json new file mode 100644 index 0000000000..011968cc02 --- /dev/null +++ b/services/core/models/tests/integration/parallelism/fixtures/microsoft/phi-2/config.json @@ -0,0 +1,30 @@ +{ + "_name_or_path": "microsoft/phi-2", + "architectures": [ + "PhiForCausalLM" + ], + "attention_dropout": 0.0, + "bos_token_id": 50256, + "embd_pdrop": 0.0, + "eos_token_id": 50256, + "hidden_act": "gelu_new", + "hidden_size": 2560, + "initializer_range": 0.02, + "intermediate_size": 10240, + "layer_norm_eps": 1e-05, + "max_position_embeddings": 2048, + "model_type": "phi", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 32, + "partial_rotary_factor": 0.4, + "qk_layernorm": false, + "resid_pdrop": 0.1, + "rope_scaling": null, + "rope_theta": 10000.0, + "tie_word_embeddings": false, + "torch_dtype": "float16", + 
"transformers_version": "4.37.0", + "use_cache": true, + "vocab_size": 51200 +} diff --git a/services/core/models/tests/integration/parallelism/fixtures/microsoft/phi-4/config.json b/services/core/models/tests/integration/parallelism/fixtures/microsoft/phi-4/config.json new file mode 100644 index 0000000000..ab17e0b583 --- /dev/null +++ b/services/core/models/tests/integration/parallelism/fixtures/microsoft/phi-4/config.json @@ -0,0 +1,32 @@ +{ + "_name_or_path": "microsoft/phi-4", + "architectures": [ + "Phi3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 100257, + "embd_pdrop": 0.0, + "eos_token_id": 100265, + "hidden_act": "silu", + "hidden_size": 5120, + "initializer_range": 0.02, + "intermediate_size": 17920, + "max_position_embeddings": 16384, + "model_type": "phi3", + "num_attention_heads": 40, + "num_hidden_layers": 40, + "num_key_value_heads": 10, + "original_max_position_embeddings": 16384, + "pad_token_id": 100349, + "resid_pdrop": 0.0, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 250000, + "sliding_window": null, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.47.0", + "use_cache": true, + "vocab_size": 100352 +} diff --git a/services/core/models/tests/integration/parallelism/fixtures/mistralai/Devstral-Small-2505/config.json b/services/core/models/tests/integration/parallelism/fixtures/mistralai/Devstral-Small-2505/config.json new file mode 100644 index 0000000000..dae01ddab0 --- /dev/null +++ b/services/core/models/tests/integration/parallelism/fixtures/mistralai/Devstral-Small-2505/config.json @@ -0,0 +1,26 @@ +{ + "architectures": [ + "MistralForCausalLM" + ], + "attention_dropout": 0.0, + "bos_token_id": 1, + "eos_token_id": 2, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 5120, + "initializer_range": 0.02, + "intermediate_size": 32768, + "max_position_embeddings": 131072, + "model_type": "mistral", + "num_attention_heads": 32, + 
"num_hidden_layers": 40, + "num_key_value_heads": 8, + "rms_norm_eps": 1e-05, + "rope_theta": 1000000000.0, + "sliding_window": null, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.51.3", + "use_cache": true, + "vocab_size": 131072 +} diff --git a/services/core/models/tests/integration/parallelism/fixtures/mistralai/Mistral-7B-v0.1/config.json b/services/core/models/tests/integration/parallelism/fixtures/mistralai/Mistral-7B-v0.1/config.json new file mode 100644 index 0000000000..f4989f072a --- /dev/null +++ b/services/core/models/tests/integration/parallelism/fixtures/mistralai/Mistral-7B-v0.1/config.json @@ -0,0 +1,24 @@ +{ + "architectures": [ + "MistralForCausalLM" + ], + "bos_token_id": 1, + "eos_token_id": 2, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 32768, + "model_type": "mistral", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "rms_norm_eps": 1e-05, + "rope_theta": 10000.0, + "sliding_window": 4096, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.34.0.dev0", + "use_cache": true, + "vocab_size": 32000 +} diff --git a/services/core/models/tests/integration/parallelism/fixtures/mistralai/Mixtral-8x7B-v0.1/config.json b/services/core/models/tests/integration/parallelism/fixtures/mistralai/Mixtral-8x7B-v0.1/config.json new file mode 100644 index 0000000000..de132a80b2 --- /dev/null +++ b/services/core/models/tests/integration/parallelism/fixtures/mistralai/Mixtral-8x7B-v0.1/config.json @@ -0,0 +1,29 @@ +{ + "architectures": [ + "MixtralForCausalLM" + ], + "attention_dropout": 0.0, + "bos_token_id": 1, + "eos_token_id": 2, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 32768, + "model_type": "mixtral", + "num_attention_heads": 32, + "num_experts_per_tok": 2, + 
"num_hidden_layers": 32, + "num_key_value_heads": 8, + "num_local_experts": 8, + "output_router_logits": false, + "rms_norm_eps": 1e-05, + "rope_theta": 1000000.0, + "router_aux_loss_coef": 0.02, + "sliding_window": null, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.36.0.dev0", + "use_cache": true, + "vocab_size": 32000 +} diff --git a/services/core/models/tests/integration/parallelism/fixtures/nvidia/Mistral-NeMo-Minitron-8B-Instruct/config.json b/services/core/models/tests/integration/parallelism/fixtures/nvidia/Mistral-NeMo-Minitron-8B-Instruct/config.json new file mode 100644 index 0000000000..55d6cbbe0b --- /dev/null +++ b/services/core/models/tests/integration/parallelism/fixtures/nvidia/Mistral-NeMo-Minitron-8B-Instruct/config.json @@ -0,0 +1,28 @@ +{ + "_name_or_path": "nvidia/Mistral-NeMo-Minitron-8B-Instruct", + "activation": "silu", + "architectures": [ + "MistralForCausalLM" + ], + "attention_dropout": 0.0, + "bos_token_id": 1, + "eos_token_id": 2, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 11520, + "max_position_embeddings": 8192, + "model_type": "mistral", + "num_attention_heads": 32, + "num_hidden_layers": 40, + "num_key_value_heads": 8, + "rms_norm_eps": 1e-05, + "rope_theta": 1000000.0, + "sliding_window": null, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.44.0", + "use_cache": true, + "vocab_size": 131072 +} diff --git a/services/core/models/tests/integration/parallelism/fixtures/nvidia/NVIDIA-Nemotron-Nano-9B-v2/config.json b/services/core/models/tests/integration/parallelism/fixtures/nvidia/NVIDIA-Nemotron-Nano-9B-v2/config.json new file mode 100644 index 0000000000..4f56c18a20 --- /dev/null +++ b/services/core/models/tests/integration/parallelism/fixtures/nvidia/NVIDIA-Nemotron-Nano-9B-v2/config.json @@ -0,0 +1,56 @@ +{ + "architectures": [ + "NemotronHForCausalLM" + ], + 
"attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_nemotron_h.NemotronHConfig", + "AutoModelForCausalLM": "modeling_nemotron_h.NemotronHForCausalLM" + }, + "bos_token_id": 1, + "chunk_size": 128, + "conv_kernel": 4, + "eos_token_id": 12, + "head_dim": 128, + "hidden_dropout": 0.0, + "hidden_size": 4480, + "hybrid_override_pattern": "M-M-M-MM-M-M-M*-M-M-M*-M-M-M-M*-M-M-M-M*-M-MM-M-M-M-M-M-", + "initializer_range": 0.02, + "intermediate_size": 15680, + "layer_norm_epsilon": 1e-05, + "mamba_head_dim": 80, + "mamba_hidden_act": "silu", + "mamba_num_groups": 8, + "mamba_num_heads": 128, + "mamba_proj_bias": false, + "mamba_state_dim": 128, + "max_position_embeddings": 131072, + "mlp_bias": false, + "mlp_hidden_act": "relu2", + "model_type": "nemotron_h", + "n_groups": 8, + "num_attention_heads": 40, + "num_hidden_layers": 56, + "num_key_value_heads": 8, + "num_logits_to_keep": 1, + "num_query_groups": 8, + "pad_token_id": 0, + "rescale_prenorm_residual": true, + "residual_in_fp32": false, + "rms_norm_eps": 1e-05, + "sliding_window": null, + "ssm_state_size": 128, + "tie_word_embeddings": false, + "time_step_floor": 0.0001, + "time_step_max": 0.1, + "time_step_min": 0.001, + "time_step_rank": 256, + "torch_dtype": "bfloat16", + "transformers_version": "4.51.3", + "use_bias": false, + "use_cache": true, + "use_conv_bias": true, + "use_mamba_kernels": true, + "vocab_size": 131072 +} diff --git a/services/core/models/tests/integration/parallelism/fixtures/nvidia/NVIDIA-Nemotron-Nano-9B-v2/configuration_nemotron_h.py b/services/core/models/tests/integration/parallelism/fixtures/nvidia/NVIDIA-Nemotron-Nano-9B-v2/configuration_nemotron_h.py new file mode 100644 index 0000000000..2b5c451b4a --- /dev/null +++ b/services/core/models/tests/integration/parallelism/fixtures/nvidia/NVIDIA-Nemotron-Nano-9B-v2/configuration_nemotron_h.py @@ -0,0 +1,245 @@ +# coding=utf-8 +# Copyright 2024 AI21 Labs Ltd. and the HuggingFace Inc. team. 
class NemotronHConfig(PretrainedConfig):
    r"""
    Configuration class for a [`NemotronHModel`]: a hybrid stack of Mamba2, attention and MLP layers.

    The layer layout is described by `hybrid_override_pattern`, one character per layer: `M` (Mamba2),
    `*` (attention) or `-` (MLP). The pattern is validated when the configuration is built: it must hold exactly
    `num_hidden_layers` characters and nothing but those three symbols.

    Several Mamba arguments are stored under an attribute name that differs from the argument name; the stored
    name is given with each argument below.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 131072):
            Vocabulary size of the NemotronH model. Defines the number of different tokens that can be represented
            by the `inputs_ids` passed when calling [`NemotronHModel`].
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether the model's input and output word embeddings should be tied. Only relevant if the model has
            an output word embedding layer.
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 21504):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 52):
            Number of hidden layers. Must equal `len(hybrid_override_pattern)`.
        hybrid_override_pattern (`str`, *optional*, defaults to `"M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M-"`):
            Per-layer block type, one character per layer: `M` = Mamba2, `*` = attention, `-` = MLP.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer.
        head_dim (`int`, *optional*, defaults to 128):
            Dimension of each attention head.
        num_key_value_heads (`int`, *optional*, defaults to 8):
            Number of key/value heads used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads` the model uses Multi Head Attention (MHA), if
            `num_key_value_heads=1` it uses Multi Query Attention (MQA), otherwise GQA is used. `None` falls back
            to `num_attention_heads`.
        mlp_hidden_act (`str`, *optional*, defaults to `"relu2"`):
            The non-linear activation function in the MLP layers.
        attention_bias (`bool`, *optional*, defaults to `False`):
            Whether to use bias in attention layers.
        mlp_bias (`bool`, *optional*, defaults to `False`):
            Whether to use bias in MLP layers.
        use_bias (`bool`, *optional*, defaults to `False`):
            Whether to use bias in the model.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
            The epsilon used by the layer normalization layers.
        residual_in_fp32 (`bool`, *optional*, defaults to `False`):
            Whether residuals should be kept in `float32`. If `False`, residuals keep the same `dtype` as the rest
            of the model.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        num_logits_to_keep (`int` or `None`, *optional*, defaults to 1):
            Number of prompt logits to calculate during generation. If `None`, all logits are calculated. If an
            integer value, only the last `num_logits_to_keep` logits are calculated.
        pad_token_id (`int`, *optional*, defaults to 0):
            The id of the padding token.
        bos_token_id (`int`, *optional*, defaults to 1):
            The id of the "beginning-of-sequence" token.
        eos_token_id (`int`, *optional*, defaults to 2):
            The id of the "end-of-sequence" token.
        sliding_window (`int`, *optional*, defaults to `None`):
            Sliding window attention window size.
        max_position_embeddings (`int`, *optional*, defaults to 4096):
            The maximum sequence length that this model might ever be used with.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        hidden_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the hidden states.
        use_mamba_kernels (`bool`, *optional*, defaults to `True`):
            Flag indicating whether to use the fast mamba kernels. These are available only if `mamba-ssm` and
            `causal-conv1d` are installed, and the mamba modules are running on a CUDA device.
        ssm_state_size (`int`, *optional*, defaults to 128):
            The dimension of the mamba state space latents.
        mamba_num_heads (`int`, *optional*, defaults to 128):
            Number of heads in Mamba layers.
        mamba_n_groups (`int`, *optional*, defaults to 8):
            Number of groups in Mamba layers. Stored as `n_groups`.
        mamba_head_dim (`int`, *optional*, defaults to 64):
            Dimension of each Mamba head.
        mamba_d_conv (`int`, *optional*, defaults to 4):
            The size of the mamba convolution kernel. Stored as `conv_kernel`.
        mamba_expand (`int`, *optional*, defaults to 2):
            Expanding factor used to determine the mamba intermediate size. Stored as `expand`.
        mamba_hidden_act (`str`, *optional*, defaults to `"silu"`):
            The non-linear activation function in the Mamba layers.
        mamba_dt_min (`float`, *optional*, defaults to 0.001):
            Minimum value for the time step in Mamba. Stored as `time_step_min`.
        mamba_dt_max (`float`, *optional*, defaults to 0.1):
            Maximum value for the time step in Mamba. Stored as `time_step_max`.
        mamba_dt_limit (`tuple`, *optional*, defaults to `(0.0, float("inf"))`):
            Limits for the time step in Mamba. Stored as `time_step_limit`.
        mamba_dt_init_floor (`float`, *optional*, defaults to 1e-4):
            Floor value for time step initialization in Mamba. Stored as `time_step_floor`.
        mamba_conv_bias (`bool`, *optional*, defaults to `True`):
            Whether to use bias in the convolution layer of the mamba mixer block. Stored as `use_conv_bias`.
        mamba_proj_bias (`bool`, *optional*, defaults to `False`):
            Whether to use bias in the input and output projections of the mamba mixer block.
        mamba_chunk_size (`int`, *optional*, defaults to 256):
            Size of chunks for Mamba processing. Stored as `chunk_size`.
        rescale_prenorm_residual (`bool`, *optional*, defaults to `True`):
            Whether to rescale the pre-normalization residual connections.

    Raises:
        AssertionError: If `hybrid_override_pattern` does not have exactly `num_hidden_layers` characters, or
            contains a character other than `M`, `*` or `-`.
    """

    model_type = "nemotron_h"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=131072,
        tie_word_embeddings=False,
        hidden_size=4096,
        intermediate_size=21504,
        num_hidden_layers=52,
        hybrid_override_pattern="M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M-",
        num_attention_heads=32,
        head_dim=128,
        num_key_value_heads=8,  # nemo: num_query_groups
        mlp_hidden_act="relu2",
        attention_bias=False,
        mlp_bias=False,
        use_bias=False,
        initializer_range=0.02,  # nemo: init_method_std
        layer_norm_epsilon=1e-5,  # nemo: layernorm_epsilon
        residual_in_fp32=False,  # Megatron Core default value
        use_cache=True,
        num_logits_to_keep=1,
        pad_token_id=0,
        bos_token_id=1,
        eos_token_id=2,
        sliding_window=None,
        max_position_embeddings=4096,
        attention_dropout=0.0,
        hidden_dropout=0.0,
        use_mamba_kernels=True,
        ssm_state_size=128,  # mamba_state_size
        mamba_num_heads=128,
        mamba_n_groups=8,  # nemo: mamba_ssm_ngroups = num_heads
        mamba_head_dim=64,
        mamba_d_conv=4,
        mamba_expand=2,
        mamba_hidden_act="silu",
        mamba_dt_min=0.001,
        mamba_dt_max=0.1,
        mamba_dt_limit=(0.0, float("inf")),
        mamba_dt_init_floor=1e-4,
        mamba_conv_bias=True,
        mamba_proj_bias=False,
        mamba_chunk_size=256,
        rescale_prenorm_residual=True,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.tie_word_embeddings = tie_word_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.hybrid_override_pattern = hybrid_override_pattern
        self.num_attention_heads = num_attention_heads
        self.head_dim = head_dim
        self.sliding_window = sliding_window
        self.max_position_embeddings = max_position_embeddings
        self.attention_dropout = attention_dropout
        self.hidden_dropout = hidden_dropout

        # Validate hybrid_override_pattern (M: Mamba2, *: Attention, -: MLP).
        # NOTE(review): `assert` is kept so callers still see AssertionError, but it is skipped under `python -O`.
        assert len(self.hybrid_override_pattern) == self.num_hidden_layers, "hybrid_override_pattern must have the same length as num_hidden_layers"
        # The hyphen is placed last in the class so it is a literal '-'. Written as `[*-M]` it would form
        # the range '*'..'M' and silently accept digits, 'A'-'L' and assorted punctuation.
        assert re.fullmatch(r"[M*-]+", self.hybrid_override_pattern), "hybrid_override_pattern must only contain characters 'M', '*', or '-'"

        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.mlp_hidden_act = mlp_hidden_act
        self.attention_bias = attention_bias
        self.mlp_bias = mlp_bias
        self.use_bias = use_bias
        self.initializer_range = initializer_range
        self.layer_norm_epsilon = layer_norm_epsilon
        self.residual_in_fp32 = residual_in_fp32

        self.use_cache = use_cache
        self.num_logits_to_keep = num_logits_to_keep

        # Mamba settings; several are stored under a name that differs from the constructor argument.
        self.use_mamba_kernels = use_mamba_kernels
        self.n_groups = mamba_n_groups
        self.mamba_head_dim = mamba_head_dim
        self.ssm_state_size = ssm_state_size
        self.mamba_num_heads = mamba_num_heads
        self.conv_kernel = mamba_d_conv
        self.expand = mamba_expand
        self.mamba_hidden_act = mamba_hidden_act
        self.time_step_min = mamba_dt_min
        self.time_step_max = mamba_dt_max
        self.time_step_limit = mamba_dt_limit
        self.time_step_floor = mamba_dt_init_floor
        self.use_conv_bias = mamba_conv_bias
        self.mamba_proj_bias = mamba_proj_bias
        self.chunk_size = mamba_chunk_size
        self.rescale_prenorm_residual = rescale_prenorm_residual

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

    @property
    def layers_block_type(self):
        """Return the block type of every layer as a list of `"mamba"`, `"attention"` or `"mlp"` strings.

        Layer `i` is `"mamba"` for `M` and `"attention"` for `*` in `hybrid_override_pattern`; any other
        character maps to `"mlp"`.
        """
        block_names = {"M": "mamba", "*": "attention"}
        return [
            block_names.get(self.hybrid_override_pattern[i], "mlp")
            for i in range(self.num_hidden_layers)
        ]
b/services/core/models/tests/integration/parallelism/fixtures/nvidia/NVIDIA-Nemotron-Nano-9B-v2/modeling_nemotron_h.py new file mode 100644 index 0000000000..7ac44be3f8 --- /dev/null +++ b/services/core/models/tests/integration/parallelism/fixtures/nvidia/NVIDIA-Nemotron-Nano-9B-v2/modeling_nemotron_h.py @@ -0,0 +1,1643 @@ +# coding=utf-8 +# Copyright 2024 HuggingFace Inc. team. +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch NemotronH model.""" + +import math +from dataclasses import dataclass +from typing import Any, Dict, Optional, Tuple, Union + +import torch +import torch.utils.checkpoint +from torch import nn +from torch.nn import CrossEntropyLoss + +from transformers.activations import ACT2FN +from transformers.cache_utils import DynamicCache # we need __iter__ and __len__ of pkv +from transformers.generation import GenerationMixin +from transformers.modeling_attn_mask_utils import ( + AttentionMaskConverter, +) +from transformers.modeling_utils import PreTrainedModel +from transformers.utils import ( + ModelOutput, + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, +) +from transformers.utils.import_utils import ( + is_causal_conv1d_available, + is_flash_attn_2_available, + is_flash_attn_greater_or_equal_2_10, + is_mamba_2_ssm_available, +) +from .configuration_nemotron_h import NemotronHConfig + + +logger = 
# Helper methods for segment sum computation


def pad_tensor_by_size(input_tensor: torch.Tensor, pad_size: int):
    """
    Zero-pad `input_tensor` with `pad_size` extra positions at the end of the seq_len dim (dim=1).

    Only 3-D and 4-D tensors are supported; any rank other than 4 is handled as 3-D.
    """
    # F.pad takes (left, right) pairs starting from the LAST dimension, so the pair
    # carrying `pad_size` is the one that lines up with dim=1.
    if input_tensor.dim() == 4:
        pad_widths = (0, 0, 0, 0, 0, pad_size, 0, 0)
    else:
        pad_widths = (0, 0, 0, pad_size, 0, 0)

    return torch.nn.functional.pad(input_tensor, pad_widths, mode="constant", value=0)


def reshape_into_chunks(input_tensor, pad_size, chunk_size):
    """
    Pad `input_tensor` by `pad_size` along the seq_len dim (dim=1), then split that dim into
    chunks of `chunk_size`.

    Only 3-D and 4-D tensors are supported; any rank other than 3 is handled as 4-D.
    """
    # [bsz, seq_len, ...] -> [bsz, seq_len multiple of chunk_size, ...]
    padded = pad_tensor_by_size(input_tensor, pad_size)
    batch = padded.shape[0]

    if padded.dim() == 3:
        # [bsz, padded_len, num_heads] -> [bsz, -1, chunk_size, num_heads]
        return padded.reshape(batch, -1, chunk_size, padded.shape[2])

    # [bsz, padded_len, num_heads, head_dim or state_size]
    #   -> [bsz, -1, chunk_size, num_heads, head_dim or state_size]
    return padded.reshape(batch, -1, chunk_size, padded.shape[2], padded.shape[3])


def segment_sum(input_tensor):
    """
    More stable segment sum calculation. Uses cumulative sums and masking instead of direct subtractions.

    For a last dimension of size `T` the result gains a trailing dimension of size `T`: entry `[..., i, j]`
    holds the sum of `input_tensor[..., j + 1 : i + 1]` when `j <= i` and `-inf` when `j > i`.
    """
    chunk_size = input_tensor.size(-1)
    all_true = torch.ones(chunk_size, chunk_size, device=input_tensor.device, dtype=torch.bool)

    # 1. repeat every value along a new trailing axis: [..., chunk_size] -> [..., chunk_size, chunk_size]
    repeated = input_tensor[..., None].expand(*input_tensor.size(), chunk_size)
    # 2. zero everything on or above the diagonal so only strictly-lower entries contribute
    strictly_lower = torch.tril(all_true, diagonal=-1)
    repeated = repeated.masked_fill(~strictly_lower, 0)
    # 3. running sum down the rows
    tensor_segsum = torch.cumsum(repeated, dim=-2)
    # 4. keep the lower triangle (diagonal included this time); the rest becomes -inf
    lower_with_diag = torch.tril(all_true, diagonal=0)
    return tensor_segsum.masked_fill(~lower_with_diag, -torch.inf)


def apply_mask_to_padding_states(hidden_states, attention_mask):
    """
    Zero out the hidden states of padding tokens, see https://github.com/state-spaces/mamba/issues/66

    `hidden_states` is returned untouched when there is no mask or when the mask has a single row or a
    single column; otherwise it is multiplied by the mask and cast back to its original dtype.
    """
    if attention_mask is None:
        return hidden_states
    if attention_mask.shape[1] <= 1 or attention_mask.shape[0] <= 1:
        return hidden_states

    original_dtype = hidden_states.dtype
    # the mask is broadcast over the trailing feature axis
    return (hidden_states * attention_mask[:, :, None]).to(original_dtype)
# Copied from https://github.com/huggingface/transformers/blob/main/src/transformers/models/jamba/modeling_jamba.py
class HybridMambaAttentionDynamicCache(DynamicCache):
    """
    A dynamic cache that can handle both the attention cache (which has a seq_len dimension) and the mamba cache
    (which has a constant shape regardless of seq_len).

    The cache holds four per-layer lists, each with `config.num_hidden_layers` entries: `key_cache` and
    `value_cache` for attention, `conv_states` and `ssm_states` for mamba.

    - Mamba layers (`M` in `config.hybrid_override_pattern`): `conv_states` has shape
      `(batch_size, mamba_num_heads * mamba_head_dim, conv_kernel)` and `ssm_states` has shape
      `(batch_size, mamba_num_heads * mamba_head_dim, ssm_state_size)`.
    - Every other layer (attention or MLP): `conv_states` and `ssm_states` are empty `(batch_size, 0)`
      placeholders, and the layer index is recorded in `transformer_layers`.
    - `key_cache` and `value_cache` start as empty `(batch_size, 0)` placeholders for every layer and are
      replaced by the first `update` call for that layer.
    """

    def __init__(self, config, batch_size, dtype=torch.float16, device=None):
        """Allocate zeroed mamba states and empty attention placeholders for every layer of `config`."""
        super().__init__()
        self.dtype = dtype
        self.hybrid_override_pattern = config.hybrid_override_pattern
        self.has_previous_state = False  # only used by mamba
        intermediate_size = config.mamba_num_heads * config.mamba_head_dim
        ssm_state_size = config.ssm_state_size
        conv_kernel_size = config.conv_kernel
        self.conv_states = []
        self.ssm_states = []
        self.transformer_layers = []
        for i in range(config.num_hidden_layers):
            if self.hybrid_override_pattern[i] == "M":
                # Mamba layer
                self.conv_states += [
                    torch.zeros(batch_size, intermediate_size, conv_kernel_size, device=device, dtype=dtype)
                ]
                self.ssm_states += [
                    torch.zeros(batch_size, intermediate_size, ssm_state_size, device=device, dtype=dtype)
                ]
            else:
                # Attention or MLP layer
                self.conv_states += [torch.tensor([[]] * batch_size, device=device)]
                self.ssm_states += [torch.tensor([[]] * batch_size, device=device)]
                self.transformer_layers.append(i)

        self.key_cache = [torch.tensor([[]] * batch_size, device=device) for _ in range(config.num_hidden_layers)]
        self.value_cache = [torch.tensor([[]] * batch_size, device=device) for _ in range(config.num_hidden_layers)]

    def update(
        self,
        key_states: torch.Tensor,
        value_states: torch.Tensor,
        layer_idx: int,
        cache_kwargs: Optional[Dict[str, Any]] = None,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Store or append key/value states for `layer_idx` and return the full cached pair.

        The first call for a layer replaces the empty placeholder; later calls concatenate along dim=2.
        `cache_kwargs` is accepted for interface compatibility and is not used.
        """
        if self.key_cache[layer_idx].shape[-1] == 0:
            # still the empty placeholder: nothing to concatenate with
            self.key_cache[layer_idx] = key_states
            self.value_cache[layer_idx] = value_states
        else:
            self.key_cache[layer_idx] = torch.cat([self.key_cache[layer_idx], key_states], dim=2)
            self.value_cache[layer_idx] = torch.cat([self.value_cache[layer_idx], value_states], dim=2)

        return self.key_cache[layer_idx], self.value_cache[layer_idx]

    def reorder_cache(self, beam_idx: torch.LongTensor):
        """Reorders the cache for beam search, given the selected beam indices."""
        for per_layer in (self.key_cache, self.value_cache, self.conv_states, self.ssm_states):
            for layer_idx in range(len(self.key_cache)):
                tensor = per_layer[layer_idx]
                # each tensor may live on its own device, so move the index there first
                per_layer[layer_idx] = tensor.index_select(0, beam_idx.to(tensor.device))

    def get_seq_length(self, layer_idx: Optional[int] = 0) -> int:
        """Returns the sequence length of the cached states. A layer index can be optionally passed.

        If `layer_idx` is one of `transformer_layers`, that layer is inspected. Otherwise the first layer in
        `transformer_layers` that already holds key states is used. Returns 0 when the inspected layer is
        still an empty placeholder, or when no layer holds key states yet.
        """
        if layer_idx in self.transformer_layers:
            candidates = [layer_idx]
        else:
            # `transformer_layers` also lists MLP layers, so skip entries that never received key states
            # instead of blindly taking the first one.
            candidates = self.transformer_layers
        for idx in candidates:
            # An untouched layer is a (batch_size, 0) placeholder; reading shape[-2] from it would report
            # the batch size as the sequence length.
            if idx < len(self.key_cache) and self.key_cache[idx].shape[-1] != 0:
                return self.key_cache[idx].shape[-2]
        return 0

    def to_legacy_cache(self) -> Tuple[Tuple[torch.Tensor], Tuple[torch.Tensor]]:
        """Not supported: always raises `NotImplementedError`."""
        raise NotImplementedError("HybridMambaAttentionDynamicCache does not have a legacy cache equivalent.")

    @classmethod
    def from_legacy_cache(cls, past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None) -> "DynamicCache":
        """Not supported: always raises `NotImplementedError`."""
        raise NotImplementedError("HybridMambaAttentionDynamicCache does not have a legacy cache equivalent.")

    # Copied from modeling_mamba2.py
    def update_conv_state(
        self, layer_idx: int, new_conv_state: torch.Tensor, cache_init: bool = False
    ) -> torch.Tensor:
        """Write a new convolution state for `layer_idx` and return the stored tensor.

        With `cache_init=True` the stored state is replaced by `new_conv_state`. Otherwise the stored state is
        rolled one step along its last dim and `new_conv_state[:, 0, :]` is written into the last slot.
        """
        # `conv_states` is a Python list, so the device must be read from the layer's tensor.
        device = self.conv_states[layer_idx].device
        if cache_init:
            self.conv_states[layer_idx] = new_conv_state.to(device)
        else:
            self.conv_states[layer_idx] = self.conv_states[layer_idx].roll(shifts=-1, dims=-1)
            self.conv_states[layer_idx][:, :, -1] = new_conv_state[:, 0, :].to(device)
        return self.conv_states[layer_idx]

    def update_ssm_state(self, layer_idx: int, new_ssm_state: torch.Tensor):
        """Replace the SSM state of `layer_idx` with `new_ssm_state` (moved to the stored state's device)."""
        # `ssm_states` is a Python list, so the device must be read from the layer's tensor.
        self.ssm_states[layer_idx] = new_ssm_state.to(self.ssm_states[layer_idx].device)
        return self.ssm_states[layer_idx]

    def reset(self):
        """Zero every convolution and SSM state in place; key/value caches are left untouched."""
        # The states are lists of tensors, so each tensor is zeroed individually.
        for state in self.conv_states:
            state.zero_()
        for state in self.ssm_states:
            state.zero_()


class MambaRMSNormGated(torch.nn.Module):
    """Thin module wrapper around `rmsnorm_fn` from `mamba_ssm` with a learnable per-channel weight.

    Args:
        hidden_size: Length of the `weight` parameter (initialised to ones).
        group_size: Forwarded to `rmsnorm_fn` as `group_size`.
        eps: Forwarded to `rmsnorm_fn` as `eps`.
    """

    def __init__(self, hidden_size, group_size, eps=1e-5):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps
        self.group_size = group_size

    def forward(self, hidden_states, gate=None):
        """Call `rmsnorm_fn` on `hidden_states`, passing `gate` as `z`, with no bias.

        NOTE(review): `norm_before_gate=False` presumably means the gate is applied before the
        normalisation; confirm against the mamba_ssm `layernorm_gated` implementation.
        """
        return rmsnorm_fn(
            x=hidden_states,
            weight=self.weight,
            bias=None,  # No bias
            z=gate,
            eps=self.variance_epsilon,
            group_size=self.group_size,
            norm_before_gate=False,
        )
class NemotronHMamba2Mixer(nn.Module):
    """
    Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`.
    A, D are input independent (see Mamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective)
    ∆, B, C are input-dependent (this is a key difference between Mamba and the linear time invariant S4,
    and is why Mamba is called **selective** state spaces)

    Two implementations are provided: `cuda_kernels_forward` (fused kernels) and `torch_forward`
    (plain PyTorch). `forward` picks between them.
    """

    def __init__(self, config: NemotronHConfig, layer_idx: int):
        """Build the projections, depthwise convolution and SSM parameters.

        Args:
            config: Model configuration; the `mamba_*`, `ssm_state_size`, `conv_kernel`, `n_groups`,
                `chunk_size`, `time_step_*`, `use_bias`, `use_conv_bias` and `layer_norm_epsilon`
                fields are read here.
            layer_idx: Index of this layer, used to address the per-layer cache entries.
        """
        super().__init__()
        self.num_heads = config.mamba_num_heads
        self.hidden_size = config.hidden_size
        self.ssm_state_size = config.ssm_state_size
        self.conv_kernel_size = config.conv_kernel
        self.intermediate_size = config.mamba_num_heads * config.mamba_head_dim
        self.layer_idx = layer_idx
        self.use_conv_bias = config.use_conv_bias
        self.activation = config.mamba_hidden_act
        self.act = ACT2FN[config.mamba_hidden_act]

        self.layer_norm_epsilon = config.layer_norm_epsilon

        self.n_groups = config.n_groups
        self.head_dim = config.mamba_head_dim
        self.chunk_size = config.chunk_size

        self.time_step_limit = config.time_step_limit
        self.time_step_min = config.time_step_min
        self.time_step_max = config.time_step_max

        # The convolution runs over the concatenation [hidden_states, B, C].
        self.conv_dim = self.intermediate_size + 2 * self.n_groups * self.ssm_state_size
        self.conv1d = nn.Conv1d(
            in_channels=self.conv_dim,
            out_channels=self.conv_dim,
            bias=config.use_conv_bias,
            kernel_size=config.conv_kernel,
            groups=self.conv_dim,
            padding=config.conv_kernel - 1,
        )

        # projection of the input hidden states
        projection_size = self.intermediate_size + self.conv_dim + self.num_heads
        self.in_proj = nn.Linear(
            self.hidden_size,
            projection_size,
            bias=config.use_bias,
        )
        # selective projection used to make dt, B and C input dependent

        # time step projection (discretization)
        # instantiate once and copy inv_dt in init_weights of PretrainedModel
        self.dt_bias = nn.Parameter(torch.ones(self.num_heads))

        # S4D real initialization. These are not discretized!
        # The core is to load them, compute the discrete states, then write the updated state. Keeps the memory bounded
        A = torch.arange(1, self.num_heads + 1)
        self.A_log = nn.Parameter(torch.log(A))
        self.A_log._no_weight_decay = True
        self.norm = MambaRMSNormGated(self.intermediate_size, eps=self.layer_norm_epsilon, group_size=self.intermediate_size // self.n_groups)
        self.D = nn.Parameter(torch.ones(self.num_heads))
        self.D._no_weight_decay = True

        self.out_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.use_bias)
        self.use_bias = config.use_bias

        if not is_fast_path_available:
            logger.warning_once(
                "The fast path is not available because on of `(selective_state_update, causal_conv1d_fn, causal_conv1d_update)`"
                " is None. Falling back to the naive implementation. To install follow https://github.com/state-spaces/mamba/#installation and"
                " https://github.com/Dao-AILab/causal-conv1d"
            )

    def cuda_kernels_forward(
        self,
        hidden_states: torch.Tensor,
        cache_params: Optional[HybridMambaAttentionDynamicCache] = None,
        cache_position: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
    ):
        """Run the mixer with the fused kernels.

        Three paths are taken depending on the arguments:
        * a cache is given and `cache_position[0] > 0`: single-step update through
          `causal_conv1d_update` and `selective_state_update`;
        * training without a cache: one fused `mamba_split_conv1d_scan_combined` call;
        * otherwise: convolution followed by `mamba_chunk_scan_combined`, writing the conv and
          SSM states into the cache when one is given.

        Returns:
            The output of `out_proj`.
        """
        # 1. Gated MLP's linear projection
        hidden_states = apply_mask_to_padding_states(hidden_states, attention_mask)
        projected_states = self.in_proj(hidden_states)

        # Set up dimensions for reshapes later
        batch_size, seq_len, _ = hidden_states.shape
        groups_time_state_size = self.n_groups * self.ssm_state_size
        d_mlp = (
            projected_states.shape[-1]
            - 2 * self.intermediate_size
            - 2 * self.n_groups * self.ssm_state_size
            - self.num_heads
        ) // 2

        # Single step calculations via cache
        if cache_params is not None and cache_position is not None and cache_position[0] > 0:
            _, _, gate, hidden_states_B_C, dt = projected_states.squeeze(1).split(
                [d_mlp, d_mlp, self.intermediate_size, self.conv_dim, self.num_heads], dim=-1
            )

            # 2. Convolution sequence transformation
            hidden_states_B_C = causal_conv1d_update(
                hidden_states_B_C,
                cache_params.conv_states[self.layer_idx],
                self.conv1d.weight.squeeze(1),
                self.conv1d.bias,
                self.activation,
            )

            hidden_states, B, C = torch.split(
                hidden_states_B_C,
                [self.intermediate_size, groups_time_state_size, groups_time_state_size],
                dim=-1,
            )

            # 3. SSM transformation
            A = -torch.exp(self.A_log.float())  # (nheads,)
            A = A[:, None, ...][:, :, None].expand(-1, self.head_dim, self.ssm_state_size).to(dtype=torch.float32)
            dt = dt[:, :, None].expand(-1, -1, self.head_dim)
            dt_bias = self.dt_bias[:, None, ...].expand(-1, self.head_dim)
            D = self.D[:, None, ...].expand(-1, self.head_dim)
            B = B.view(batch_size, self.n_groups, B.shape[1] // self.n_groups)
            C = C.view(batch_size, self.n_groups, C.shape[1] // self.n_groups)
            hidden_states_reshaped = hidden_states.view(batch_size, self.num_heads, self.head_dim)
            hidden_states = selective_state_update(
                cache_params.ssm_states[self.layer_idx],
                hidden_states_reshaped,
                dt,
                A,
                B,
                C,
                D,
                z=None,
                dt_bias=dt_bias,
                dt_softplus=True,
            )
            hidden_states = hidden_states.view(batch_size, self.num_heads * self.head_dim)
            hidden_states = self.norm(hidden_states, gate)

            # 4. Final linear projection
            out = self.out_proj(hidden_states)[:, None, ...]

        # Fused calculations or step by step if no initialized cache is found
        else:
            A = -torch.exp(self.A_log.float())  # (num_heads) or (intermediate_size, state_size)
            dt_limit_kwargs = {} if self.time_step_limit == (0.0, float("inf")) else {"dt_limit": self.time_step_limit}

            # 2-4. Fused kernel for conv1d, SSM, and the final projection
            if self.training and cache_params is None:
                out = mamba_split_conv1d_scan_combined(
                    projected_states,
                    self.conv1d.weight.squeeze(1),
                    self.conv1d.bias,
                    self.dt_bias,
                    A,
                    D=self.D,
                    chunk_size=self.chunk_size,
                    seq_idx=None,  # was seq_idx
                    activation=self.activation,
                    rmsnorm_weight=self.norm.weight,
                    rmsnorm_eps=self.norm.variance_epsilon,
                    outproj_weight=self.out_proj.weight,
                    outproj_bias=self.out_proj.bias,
                    headdim=self.head_dim,
                    ngroups=self.n_groups,
                    norm_before_gate=False,
                    return_final_states=False,
                    **dt_limit_kwargs,
                )

            else:
                _, _, gate, hidden_states_B_C, dt = projected_states.split(
                    [d_mlp, d_mlp, self.intermediate_size, self.conv_dim, self.num_heads], dim=-1
                )

                # 2. Convolution sequence transformation
                # Init cache
                if cache_params is not None:
                    hidden_states_B_C_transposed = hidden_states_B_C.transpose(1, 2)
                    conv_states = nn.functional.pad(
                        hidden_states_B_C_transposed,
                        (cache_params.conv_kernel_size - hidden_states_B_C_transposed.shape[-1], 0),
                    )
                    cache_params.update_conv_state(
                        layer_idx=self.layer_idx, new_conv_state=conv_states, cache_init=True
                    )

                if self.activation not in ["silu", "swish"]:
                    hidden_states_B_C = self.act(
                        self.conv1d(hidden_states_B_C.transpose(1, 2))[..., :seq_len].transpose(1, 2)
                    )
                else:
                    hidden_states_B_C = causal_conv1d_fn(
                        x=hidden_states_B_C.transpose(1, 2),
                        weight=self.conv1d.weight.squeeze(1),
                        bias=self.conv1d.bias,
                        activation=self.activation,
                    ).transpose(1, 2)
                hidden_states_B_C = apply_mask_to_padding_states(hidden_states_B_C, attention_mask)
                hidden_states, B, C = torch.split(
                    hidden_states_B_C,
                    [self.intermediate_size, groups_time_state_size, groups_time_state_size],
                    dim=-1,
                )

                # 3. SSM transformation
                scan_output, ssm_state = mamba_chunk_scan_combined(
                    hidden_states.view(batch_size, seq_len, -1, self.head_dim),
                    dt,
                    A,
                    B.view(batch_size, seq_len, self.n_groups, -1),
                    C.view(batch_size, seq_len, self.n_groups, -1),
                    chunk_size=self.chunk_size,
                    D=self.D,
                    z=None,
                    seq_idx=None,
                    return_final_states=True,
                    dt_bias=self.dt_bias,
                    dt_softplus=True,
                    **dt_limit_kwargs,
                )

                # Init cache
                if ssm_state is not None and cache_params is not None:
                    cache_params.update_ssm_state(layer_idx=self.layer_idx, new_ssm_state=ssm_state)

                scan_output = scan_output.view(batch_size, seq_len, -1)

                # Multiply "gate" branch and apply extra normalization layer
                scan_output = self.norm(scan_output, gate)

                # 4. Final linear projection
                out = self.out_proj(scan_output)
        return out

    # fmt: off
    def torch_forward(self, input_states, cache_params: Optional[HybridMambaAttentionDynamicCache]=None, cache_position:Optional[torch.LongTensor]=None, attention_mask: Optional[torch.Tensor]=None):
        """Run the mixer in plain PyTorch (no fused kernels).

        When a cache is given and `cache_position[0] > 0`, a single-token recurrent update is
        performed against the cached conv/SSM states; otherwise the chunked "ssd naive" scan is
        used and, if a cache is given, the resulting states are stored in it.

        Returns:
            The output of `out_proj`, computed from the result cast back to the input dtype.
        """
        batch_size, seq_len, _ = input_states.shape
        dtype = input_states.dtype

        # 1. Gated MLP's linear projection
        input_states = apply_mask_to_padding_states(input_states, attention_mask)
        projected_states = self.in_proj(input_states)
        d_mlp = (projected_states.shape[-1] - 2 * self.intermediate_size - 2 * self.n_groups * self.ssm_state_size-self.num_heads) // 2
        _, _, gate, hidden_states_B_C, dt = projected_states.split(
                [d_mlp, d_mlp, self.intermediate_size,  self.conv_dim, self.num_heads], dim=-1
        )

        # 2. Convolution sequence transformation
        if cache_params is not None and cache_position is not None and cache_position[0] > 0:
            cache_params.update_conv_state(layer_idx=self.layer_idx, new_conv_state=hidden_states_B_C, cache_init=False)

            # We need to guarantee that anything regarding the cache is on the same device
            conv_states = cache_params.conv_states[self.layer_idx].to(device=self.conv1d.weight.device)

            hidden_states_B_C = torch.sum(
                conv_states * self.conv1d.weight.squeeze(1), dim=-1
            )
            if self.use_conv_bias:
                hidden_states_B_C = hidden_states_B_C + self.conv1d.bias
            hidden_states_B_C = self.act(hidden_states_B_C)
        else:
            # Init cache
            if cache_params is not None:
                hidden_states_B_C_transposed = hidden_states_B_C.transpose(1, 2)
                conv_states = nn.functional.pad(
                    hidden_states_B_C_transposed, (cache_params.conv_kernel_size - hidden_states_B_C_transposed.shape[-1], 0)
                )
                cache_params.update_conv_state(layer_idx=self.layer_idx, new_conv_state=conv_states, cache_init=True)

            hidden_states_B_C = self.act(self.conv1d(hidden_states_B_C.transpose(1, 2))[..., :seq_len].transpose(1, 2))

        hidden_states_B_C = apply_mask_to_padding_states(hidden_states_B_C, attention_mask)
        hidden_states, B, C = torch.split(
            hidden_states_B_C,
            [self.intermediate_size, self.n_groups * self.ssm_state_size, self.n_groups * self.ssm_state_size],
            dim=-1
        )

        # 3. SSM transformation
        A = -torch.exp(self.A_log.float())                            # [num_heads]
        if cache_params is not None and cache_position is not None and cache_position[0] > 0:
            # We need to guarantee that anything regarding the cache is on the same device.
            # The device is read from this layer's own state so the lookup works whether
            # `ssm_states` is one stacked tensor or a list of per-layer tensors.
            cache_device = cache_params.ssm_states[self.layer_idx].device

            # Note: there is no need to pad parameter matrices here, as there is just one new token
            # for batched generation
            dt = dt[:, 0, :][:, None, ...]
            dt = dt.transpose(1, 2).expand(batch_size, dt.shape[-1], self.head_dim)
            # [num_heads] -> [num_heads, head_dim]
            dt_bias = self.dt_bias[..., None].expand(self.dt_bias.shape[0], self.head_dim)

            dt = torch.nn.functional.softplus(dt + dt_bias.to(dt.dtype))
            dt = torch.clamp(dt, self.time_step_limit[0], self.time_step_limit[1])
            A = A[..., None, None].expand(self.num_heads, self.head_dim, self.ssm_state_size).to(dtype=torch.float32)
            # [bsz, num_heads, head_dim, state_size]
            dA = (torch.exp(dt[..., None] * A)).to(device=cache_device)

            # Discretize B
            # [bsz, n_groups * state_size] -> [bsz, n_groups, 1, state_size] ->
            # -> [bsz, n_groups, group to head repetition factor, state_size] -> [bsz, num_heads, state_size]
            B = B.reshape(batch_size, self.n_groups, -1)[..., None, :]
            B = B.expand(batch_size, self.n_groups, self.num_heads // self.n_groups, B.shape[-1]).contiguous()
            B = B.reshape(batch_size, -1, B.shape[-1])
            # [bsz, num_heads, head_dim, state_size]
            dB = dt[..., None] * B[..., None, :]

            # Discretize x into dB
            # [bsz, intermediate_size] -> [bsz, num_heads, head_dim]
            hidden_states = hidden_states.reshape(batch_size, -1, self.head_dim)
            dBx = (dB * hidden_states[..., None]).to(device=cache_device)

            # State calculation
            cache_params.update_ssm_state(
                layer_idx=self.layer_idx,
                new_ssm_state=cache_params.ssm_states[self.layer_idx] * dA + dBx
            )

            # Subsequent output
            # [bsz, n_groups * state_size] -> [bsz, num_heads, state_size]
            C = C.reshape(batch_size, self.n_groups, -1)[..., None, :]
            C = C.expand(batch_size, self.n_groups, self.num_heads // self.n_groups, C.shape[-1]).contiguous()
            C = C.reshape(batch_size, -1, C.shape[-1])
            # [bsz, num_heads, head_dim]

            ssm_states = cache_params.ssm_states[self.layer_idx].to(device=C.device, dtype=C.dtype)  # Shape: [b, h, d, n]
            # Reshape ssm_states to merge the first two dimensions
            ssm_states_reshaped = ssm_states.view(batch_size * self.num_heads, self.head_dim, self.ssm_state_size)  # Shape: [b*h, d, n]
            C_reshaped = C.view(batch_size * self.num_heads, self.ssm_state_size, 1)  # Shape: [b*h, n, 1]
            y = torch.bmm(ssm_states_reshaped, C_reshaped)
            y = y.view(batch_size, self.num_heads, self.head_dim)

            # D skip connection
            # [num_heads] -> [num_heads, head_dim]
            D = self.D[..., None].expand(self.D.shape[0], self.head_dim)
            y = (y + hidden_states * D).to(y.dtype)

            # [bsz, num_heads, head_dim] -> [bsz, 1, intermediate_size]
            y = y.reshape(batch_size, -1)[:, None, ...]
        else:
            # begin ssd naive implementation without einsums
            dt = nn.functional.softplus(dt + self.dt_bias)
            dt = torch.clamp(dt, self.time_step_limit[0], self.time_step_limit[1])
            hidden_states = hidden_states.reshape(batch_size, seq_len, -1, self.head_dim).float()
            B = B.reshape(batch_size, seq_len, -1, self.ssm_state_size).float()
            C = C.reshape(batch_size, seq_len, -1, self.ssm_state_size).float()
            B = B.repeat(1, 1, self.num_heads // self.n_groups, 1)
            C = C.repeat(1, 1, self.num_heads // self.n_groups, 1)
            pad_size = (self.chunk_size - seq_len % self.chunk_size) % self.chunk_size

            D_residual = self.D[..., None] * pad_tensor_by_size(hidden_states, pad_size)

            # Discretize x and A
            hidden_states = hidden_states * dt[..., None]
            A = A.to(hidden_states.dtype) * dt

            # Rearrange into blocks/chunks
            hidden_states, A, B, C = [reshape_into_chunks(t, pad_size, self.chunk_size) for t in (hidden_states, A, B, C)]

            # [bsz, -1, chunk_size, num_heads] -> [bsz, num_heads, -1, chunk_size]
            A = A.permute(0, 3, 1, 2)
            A_cumsum = torch.cumsum(A, dim=-1)

            # 1. Compute the output for each intra-chunk (diagonal blocks)
            # This is the analog of a causal mask
            L = torch.exp(segment_sum(A))

            # Contraction of C and B to get G (attention-weights like)
            G_intermediate = C[:, :, :, None, :, :] * B[:, :, None, :, :, :]  # shape: (b, c, l, s, h, n)
            G = G_intermediate.sum(dim=-1)  # shape: (b, c, l, s, h)

            # Compute M, equivalent to applying attention mask to weights
            M_intermediate = G[..., None] * L.permute(0, 2, 3, 4, 1)[..., None]
            M = M_intermediate.sum(dim=-1)

            # Compute Y_diag (apply to values)
            Y_diag = (M[..., None] * hidden_states[:, :, None]).sum(dim=3)

            # 2. Compute the state for each intra-chunk
            # (right term of low-rank factorization of off-diagonal blocks; B terms)
            decay_states = torch.exp((A_cumsum[:, :, :, -1:] - A_cumsum))
            B_decay = B * decay_states.permute(0, -2, -1, 1)[..., None]
            states = (B_decay[..., None, :] * hidden_states[..., None]).sum(dim=2)

            # 3. Compute the inter-chunk SSM recurrence; produces correct SSM states at chunk boundaries
            # (middle term of factorization of off-diag blocks; A terms)
            if cache_params is not None and cache_position is not None and cache_position[0] > 0:
                previous_states = cache_params.ssm_states[self.layer_idx][:, None, ...].to(device=states.device)
            else:
                previous_states = torch.zeros_like(states[:, :1])
            states = torch.cat([previous_states, states], dim=1)
            decay_chunk = torch.exp(segment_sum(nn.functional.pad(A_cumsum[:, :, :, -1], (1, 0))))
            decay_chunk = decay_chunk.transpose(1, 3)
            new_states = (decay_chunk[..., None, None] * states[:, :, None, ...]).sum(dim=1)
            states, ssm_state = new_states[:, :-1], new_states[:, -1]

            # 4. Compute state -> output conversion per chunk
            # (left term of low-rank factorization of off-diagonal blocks; C terms)
            state_decay_out = torch.exp(A_cumsum)
            C_times_states = (C[..., None, :] * states[:, :, None, ...])
            state_decay_out_permuted = state_decay_out.permute(0, 2, 3, 1)
            Y_off = (C_times_states.sum(-1) * state_decay_out_permuted[..., None])

            # Add output of intra-chunk and inter-chunk terms (diagonal and off-diagonal blocks)
            y = Y_diag + Y_off
            # [bsz, -1, self.chunk_size, num_heads, head_dim] -> [bsz, (padded) seq_len, num_heads, head_dim]
            y = y.reshape(batch_size, -1, self.num_heads, self.head_dim)

            y = y + D_residual
            # Cutting off padded chunks
            if pad_size > 0:
                y = y[:, :seq_len, :, :]
            y = y.reshape(batch_size, seq_len, -1)

            # Init cache
            if ssm_state is not None and cache_params is not None:
                cache_params.update_ssm_state(layer_idx=self.layer_idx, new_ssm_state=ssm_state)

        scan_output = self.norm(y, gate)

        # end ssd naive

        # 4. Final linear projection
        contextualized_states = self.out_proj(scan_output.to(dtype))  # [batch, seq_len, hidden_size]
        return contextualized_states
    # fmt: on

    def forward(
        self,
        hidden_states,
        cache_params: Optional[HybridMambaAttentionDynamicCache] = None,
        cache_position: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
    ):
        """Dispatch to the fused-kernel path or to the PyTorch fallback.

        The fused path is used when `is_fast_path_available` is truthy and the `in_proj` weight
        lives on a CUDA device; otherwise `torch_forward` is used after zeroing the hidden states
        of padded positions (only when the mask has more than one row and more than one column).
        """
        if is_fast_path_available and "cuda" in self.in_proj.weight.device.type:
            return self.cuda_kernels_forward(hidden_states, cache_params, cache_position, attention_mask)
        dtype = hidden_states.dtype
        if attention_mask is not None and attention_mask.shape[1] > 1 and attention_mask.shape[0] > 1:
            # tune out hidden states for pad tokens, see https://github.com/state-spaces/mamba/issues/66
            hidden_states = (hidden_states * attention_mask[:, :, None]).to(dtype)

        return self.torch_forward(hidden_states, cache_params, cache_position, attention_mask)
class NemotronHRMSNorm(nn.Module):
    """Root-mean-square layer normalisation with a learnable per-feature scale."""

    def __init__(self, hidden_size, eps=1e-6):
        """
        NemotronHRMSNorm is equivalent to T5LayerNorm and LlamaRMSNorm
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        """Normalise over the last dimension in float32 and return the input dtype."""
        original_dtype = hidden_states.dtype
        states_fp32 = hidden_states.to(torch.float32)
        mean_square = states_fp32.pow(2).mean(-1, keepdim=True)
        normalized = states_fp32 * torch.rsqrt(mean_square + self.variance_epsilon)
        # The scale is applied in float32 before casting back.
        scaled = self.weight.to(torch.float32) * normalized
        return scaled.to(original_dtype)
class NemotronHBlock(nn.Module):
    """One pre-norm residual layer wrapping a Mamba2, attention or MLP mixer."""

    def __init__(self, config, layer_idx):
        """Create the norm and the mixer selected by ``config.layers_block_type[layer_idx]``.

        Raises:
            ValueError: if the block type is not ``"mamba"``, ``"attention"`` or ``"mlp"``.
        """
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        self.residual_in_fp32 = config.residual_in_fp32
        self.norm = NemotronHRMSNorm(config.hidden_size, eps=config.layer_norm_epsilon)

        # M: Mamba2, *: Attention, -: MLP
        self.block_type = config.layers_block_type[layer_idx]
        if self.block_type == "mamba":
            self.mixer = NemotronHMamba2Mixer(config, layer_idx=layer_idx)
        elif self.block_type == "attention":
            self.mixer = NEMOTRONH_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx=layer_idx)
        elif self.block_type == "mlp":
            self.mixer = NemotronHMLP(config, layer_idx=layer_idx)
        else:
            raise ValueError(f"Invalid layer pattern {config.hybrid_override_pattern[layer_idx]}")

    def _apply_mixer_with_residual(self, hidden_states, cache_params, cache_position):
        """Normalise, run the mixer for this block type and add the residual."""
        residual = hidden_states
        hidden_states = self.norm(hidden_states.to(dtype=self.norm.weight.dtype))
        if self.residual_in_fp32:
            residual = residual.to(torch.float32)

        if self.block_type == "mamba":
            hidden_states = self.mixer(
                hidden_states, cache_params=cache_params, cache_position=cache_position
            )
        elif self.block_type == "attention":
            hidden_states = self.mixer(
                hidden_states, cache_position=cache_position
            )
            # The attention mixers return a tuple; the output is its first item.
            hidden_states = hidden_states[0]
        elif self.block_type == "mlp":
            hidden_states = self.mixer(
                hidden_states
            )
        else:
            raise ValueError(f"Invalid block_type: {self.block_type}")

        hidden_states = residual + hidden_states
        return hidden_states

    def forward(
        self,
        hidden_states,
        cache_params: Optional["HybridMambaAttentionDynamicCache"] = None,
        cache_position: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
    ):
        """Apply ``residual + mixer(norm(hidden_states))``.

        Args:
            hidden_states: Input activations.
            cache_params: Cache forwarded to a Mamba mixer.
            cache_position: Forwarded to Mamba and attention mixers.
            attention_mask: Accepted for interface compatibility; not forwarded to the mixer.

        Raises:
            ValueError: if ``self.block_type`` is not a known block type.
        """
        if hidden_states.device.type != "cuda":
            # `torch.cuda.default_stream` rejects non-CUDA devices, so the stream
            # context is only entered for CUDA inputs.
            return self._apply_mixer_with_residual(hidden_states, cache_params, cache_position)

        with torch.cuda.stream(torch.cuda.default_stream(hidden_states.device)):
            # * Use torch.cuda.stream() to avoid NaN issues when using multiple GPUs
            return self._apply_mixer_with_residual(hidden_states, cache_params, cache_position)


# Copied from transformers.models.nemotron.modeling_nemotron Nemotron->NemotronH
class NemotronHMLP(nn.Module):
    """Two-layer feed-forward network: ``down_proj(act_fn(up_proj(x)))``."""

    def __init__(self, config, layer_idx: Optional[int] = None):
        """Build the projections from ``hidden_size``, ``intermediate_size``, ``mlp_bias`` and ``mlp_hidden_act``.

        A warning is logged when ``layer_idx`` is ``None``.
        """
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        if layer_idx is None:
            logger.warning_once(
                f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
                "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
                "when creating this class."
            )
        self.hidden_size = config.hidden_size
        self.intermediate_size = config.intermediate_size
        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.mlp_bias)
        self.act_fn = ACT2FN[config.mlp_hidden_act]

    def forward(self, x):
        """Return ``down_proj(act_fn(up_proj(x)))``."""
        return self.down_proj(self.act_fn(self.up_proj(x)))
# Copied from transformers.models.llama.modeling_llama.repeat_kv
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    Repeat every key/value head `n_rep` times along the head dimension, i.e. the equivalent of
    torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim).
    The input tensor itself is returned when `n_rep == 1`.
    """
    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    # Insert a repeat axis right after the head axis, broadcast it, then fold it into the heads.
    expanded = hidden_states.unsqueeze(2).expand(batch, num_key_value_heads, n_rep, slen, head_dim)
    return expanded.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
class NemotronHAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config: NemotronHConfig, layer_idx: Optional[int] = None):
        """Read the attention hyper-parameters from `config` and build the q/k/v/o projections.

        `head_dim` comes from `config.head_dim` when it is set, otherwise from
        `hidden_size // num_attention_heads`. A warning is logged when `layer_idx` is `None`.
        """
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        if layer_idx is None:
            logger.warning_once(
                f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
                "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
                "when creating this class."
            )

        self.attention_dropout = config.attention_dropout
        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = (
            config.head_dim if config.head_dim is not None else config.hidden_size // config.num_attention_heads
        )
        self.num_key_value_heads = config.num_key_value_heads
        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
        self.max_position_embeddings = config.max_position_embeddings
        self.is_causal = True

        use_bias = config.attention_bias
        query_width = self.num_heads * self.head_dim
        key_value_width = self.num_key_value_heads * self.head_dim
        self.q_proj = nn.Linear(self.hidden_size, query_width, bias=use_bias)
        self.k_proj = nn.Linear(self.hidden_size, key_value_width, bias=use_bias)
        self.v_proj = nn.Linear(self.hidden_size, key_value_width, bias=use_bias)
        self.o_proj = nn.Linear(self.head_dim * self.num_heads, self.hidden_size, bias=use_bias)

    def forward(
        self,
        hidden_states: torch.Tensor,
        # position_embeddings: Tuple[torch.Tensor, torch.Tensor], #TODO
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[HybridMambaAttentionDynamicCache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """Compute attention with `scaled_dot_product_attention`.

        `position_ids`, `output_attentions`, `use_cache` and `cache_position` are accepted but
        not used here.

        Returns:
            `(attn_output, None, past_key_value)`; attention weights are never returned.
        """
        bsz, q_len, _ = hidden_states.size()

        def split_heads(projected, head_count):
            # (bsz, q_len, head_count * head_dim) -> (bsz, head_count, q_len, head_dim)
            return projected.view(bsz, q_len, head_count, self.head_dim).transpose(1, 2)

        query_states = split_heads(self.q_proj(hidden_states), self.num_heads)
        key_states = split_heads(self.k_proj(hidden_states), self.num_key_value_heads)
        value_states = split_heads(self.v_proj(hidden_states), self.num_key_value_heads)

        if past_key_value is not None:
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx)

        key_states = repeat_kv(key_states, self.num_key_value_groups)
        value_states = repeat_kv(value_states, self.num_key_value_groups)

        has_mask = attention_mask is not None
        # no matter the length, we just slice it
        causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] if has_mask else None

        if has_mask and query_states.device.type == "cuda":
            query_states = query_states.contiguous()
            key_states = key_states.contiguous()
            value_states = value_states.contiguous()

        attn_output = torch.nn.functional.scaled_dot_product_attention(
            query_states,
            key_states,
            value_states,
            attn_mask=causal_mask,
            dropout_p=self.attention_dropout if self.training else 0.0,
            # Implicit causal masking only when no explicit mask is given and q_len > 1.
            is_causal=causal_mask is None and q_len > 1,
        )
        merged = attn_output.transpose(1, 2).contiguous().view(bsz, q_len, self.num_heads * self.head_dim)

        return self.o_proj(merged), None, past_key_value
# Adapted from transformers.models.mistral.modeling_mistral.MistralFlashAttention2 with Mistral->Jamba
class NemotronHFlashAttention2(NemotronHAttention):
    """
    NemotronH flash attention module. This module inherits from `NemotronHAttention` as the weights of the module
    stay untouched. The only required change is in the forward pass, which calls `_flash_attention_forward`
    instead of `scaled_dot_product_attention`.
    """
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
        # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignment, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
        # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
        self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[HybridMambaAttentionDynamicCache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs,
    ):
        """Compute attention through `_flash_attention_forward`.

        Returns:
            `(attn_output, None, past_key_value)`. Attention weights are never produced by this
            path, so the second item is `None` regardless of `output_attentions`.
        """
        bsz, q_len, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        # The queries keep the (bsz, q_len, num_heads, head_dim) layout used for the flash attention call;
        # keys/values are transposed for the cache update and `repeat_kv`, then transposed back below.
        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim)
        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

        if past_key_value is not None:
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx)

        # repeat k/v heads if n_kv_heads < n_heads
        key_states = repeat_kv(key_states, self.num_key_value_groups)
        value_states = repeat_kv(value_states, self.num_key_value_groups)
        dropout_rate = 0.0 if not self.training else self.attention_dropout

        # In PEFT, usually we cast the layer norms in float32 for training stability reasons
        # therefore the input hidden states gets silently casted in float32. Hence, we need
        # cast them back in float16 just to be sure everything works as expected.
        input_dtype = query_states.dtype
        if input_dtype == torch.float32:
            if torch.is_autocast_enabled():
                target_dtype = torch.get_autocast_gpu_dtype()
            # Handle the case where the model is quantized
            elif hasattr(self.config, "_pre_quantization_dtype"):
                target_dtype = self.config._pre_quantization_dtype
            else:
                target_dtype = self.q_proj.weight.dtype

            logger.warning_once(
                f"The input hidden states seems to be silently casted in float32, this might be related to"
                f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
                f" {target_dtype}."
            )

            query_states = query_states.to(target_dtype)
            key_states = key_states.to(target_dtype)
            value_states = value_states.to(target_dtype)

        # Reshape to the expected shape for Flash Attention
        key_states = key_states.transpose(1, 2)
        value_states = value_states.transpose(1, 2)

        attn_output = _flash_attention_forward(
            query_states,
            key_states,
            value_states,
            attention_mask,
            q_len,
            dropout=dropout_rate,
            sliding_window=getattr(self.config, "sliding_window", None),
            is_causal=self.is_causal,
            use_top_left_mask=self._flash_attn_uses_top_left_mask,
        )

        attn_output = attn_output.reshape(bsz, q_len, self.num_heads * self.head_dim).contiguous()
        attn_output = self.o_proj(attn_output)

        # No attention weights are computed on this path. Bind the name unconditionally so that
        # `output_attentions=True` returns None instead of raising UnboundLocalError.
        attn_weights = None

        return attn_output, attn_weights, past_key_value
# Adapted from transformers.models.mistral.modeling_mistral.MistralSdpaAttention with Mistral->Jamba
class NemotronHSdpaAttention(NemotronHAttention):
    """
    NemotronH attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
    `NemotronHAttention` as the weights of the module stay untouched. The only changes are on the forward pass to
    adapt to the SDPA API.
    """

    # Adapted from NemotronHAttention.forward
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[HybridMambaAttentionDynamicCache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """Compute attention with `scaled_dot_product_attention`.

        When `output_attentions` is true a warning is logged and the call is delegated to the
        parent implementation.

        Returns:
            `(attn_output, None, past_key_value)` on the SDPA path.
        """
        if output_attentions:
            # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
            logger.warning_once(
                "NemotronHModel is using NemotronHSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
                'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
            )
            return super().forward(
                hidden_states=hidden_states,
                attention_mask=attention_mask,
                position_ids=position_ids,
                past_key_value=past_key_value,
                output_attentions=output_attentions,
                use_cache=use_cache,
                # Forward every argument of this signature to the parent.
                cache_position=cache_position,
            )

        bsz, q_len, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

        if past_key_value is not None:
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx)

        key_states = repeat_kv(key_states, self.num_key_value_groups)
        value_states = repeat_kv(value_states, self.num_key_value_groups)

        causal_mask = attention_mask
        if attention_mask is not None:
            causal_mask = causal_mask[:, :, :, : key_states.shape[-2]]

        # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
        # Reference: https://github.com/pytorch/pytorch/issues/112577.
        if query_states.device.type == "cuda" and attention_mask is not None:
            query_states = query_states.contiguous()
            key_states = key_states.contiguous()
            value_states = value_states.contiguous()

        # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
        # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
        # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1.
        is_causal = True if self.is_causal and causal_mask is None and q_len > 1 else False

        attn_output = torch.nn.functional.scaled_dot_product_attention(
            query_states,
            key_states,
            value_states,
            attn_mask=causal_mask,
            dropout_p=self.attention_dropout if self.training else 0.0,
            is_causal=is_causal,
        )

        attn_output = attn_output.transpose(1, 2).contiguous()
        # The merged width is num_heads * head_dim (the input width of `o_proj`), which differs
        # from hidden_size whenever `config.head_dim` is set independently.
        attn_output = attn_output.view(bsz, q_len, self.num_heads * self.head_dim)

        attn_output = self.o_proj(attn_output)

        return attn_output, None, past_key_value
+ is_causal = True if self.is_causal and causal_mask is None and q_len > 1 else False + + attn_output = torch.nn.functional.scaled_dot_product_attention( + query_states, + key_states, + value_states, + attn_mask=causal_mask, + dropout_p=self.attention_dropout if self.training else 0.0, + is_causal=is_causal, + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.view(bsz, q_len, self.hidden_size) + + attn_output = self.o_proj(attn_output) + + return attn_output, None, past_key_value + + +NEMOTRONH_ATTENTION_CLASSES = { + "eager": NemotronHAttention, + "flash_attention_2": NemotronHFlashAttention2, + "sdpa": NemotronHSdpaAttention, +} + +# Copied from transformers.models.mamba.modeling_mamba2.Mamba2PreTrainedModel +class NemotronHPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = NemotronHConfig + base_model_prefix = "backbone" + _no_split_modules = ["NemotronHBlock"] + supports_gradient_checkpointing = True + _is_stateful = True + _supports_flash_attn_2 = True + + def _init_weights(self, module): + """Initialize the weights.""" + if isinstance(module, NemotronHMamba2Mixer): + if getattr(module.dt_bias, "_is_hf_initialized", False): + return + module.A_log._no_weight_decay = True + module.D._no_weight_decay = True + + dt = torch.exp( + torch.rand(self.config.mamba_num_heads) + * (math.log(self.config.time_step_max) - math.log(self.config.time_step_min)) + + math.log(self.config.time_step_min) + ).clamp(min=self.config.time_step_floor) + + # # Inverse of softplus: https://github.com/pytorch/pytorch/issues/72759 + inv_dt = dt + torch.log(-torch.expm1(-dt)) + with torch.no_grad(): + module.dt_bias.copy_(inv_dt) + module.dt_bias._no_reinit = True + + if isinstance(module, nn.Linear): + if module.bias is not None: + if not getattr(module.bias, "_no_reinit", False): + nn.init.zeros_(module.bias) + 
elif isinstance(module, nn.Embedding): + nn.init.normal_(module.weight, std=self.config.initializer_range) + + # TODO: Check + if self.config.rescale_prenorm_residual: + # Reinitialize selected weights subject to the OpenAI GPT-2 Paper Scheme: + # > A modified initialization which accounts for the accumulation on the residual path with model depth. Scale + # > the weights of residual layers at initialization by a factor of 1/√N where N is the # of residual layers. + # > -- GPT-2 :: https://openai.com/blog/better-language-models/ + # + # Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py + for name, p in module.named_parameters(): + if getattr(p, "_is_hf_initialized", False): + continue + if name in ["out_proj.weight"]: + # Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block + # Following Pytorch init, except scale by 1/sqrt(2 * n_layer) + # We need to reinit p since this code could be called multiple times + # Having just p *= scale would repeatedly scale it down + nn.init.kaiming_uniform_(p, a=math.sqrt(5)) + with torch.no_grad(): + p /= math.sqrt(self.config.num_hidden_layers) + + +@dataclass +# Copied from transformers.models.mamba.modeling_mamba2.Mamba2Output with MAMBA2->NemotronH,Mamba2->NemotronH +class NemotronHOutput(ModelOutput): + """ + Class for the NemotronH model outputs. + + Args: + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + cache_params (`HybridMambaAttentionDynamicCache`): + The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to + avoid providing the old `input_ids`. 
+ + Includes both the State space model state matrices after the selective scan, and the Convolutional states + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + """ + + last_hidden_state: Optional[torch.FloatTensor] = None + cache_params: Optional[HybridMambaAttentionDynamicCache] = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + + +@dataclass +# Copied from transformers.models.mamba2.modeling_mamba2.MambaCausalLMOutput with Mamba2->NemotronH +class NemotronHCausalLMOutput(ModelOutput): + """ + Base class for causal language model (or autoregressive) outputs. + + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Language modeling loss (for next-token prediction). + logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + cache_params (`HybridMambaAttentionDynamicCache`): + The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to + avoid providing the old `input_ids`. 
+ + Includes both the State space model state matrices after the selective scan, and the Convolutional states + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + """ + + loss: Optional[torch.FloatTensor] = None + logits: Optional[torch.FloatTensor] = None + cache_params: Optional[HybridMambaAttentionDynamicCache] = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + + +NEMOTRONH_START_DOCSTRING = r""" + + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`NemotronHConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +NEMOTRONH_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`, *optional*): + Indices of input sequence tokens in the vocabulary. 
+ + If `cache_params.seqlen_offset>0`, only `input_ids` that do not have their past calculated should be passed as + `input_ids`. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + position_ids (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. + cache_params (`HybridMambaAttentionDynamicCache`, *optional*): + If passed along, the model uses the previous state in all the blocks (which will give the output for the + `input_ids` provided as if the model add `state_input_ids + input_ids` as context). + use_cache (`bool`, *optional*): + If set to `True`, the `cache_params` is returned and can be used to quickly generate the next logits. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + cache_position (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + The position of the current input in the cache. This is used to ensure that the cache is correctly updated. + If `cache_params` is passed, `cache_position` should also be passed. 
+ attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) +""" + + +@add_start_docstrings( + "The bare NemotronH Model transformer outputting raw hidden-states without any specific head on top.", + NEMOTRONH_START_DOCSTRING, +) +class NemotronHModel(NemotronHPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.embeddings = nn.Embedding(config.vocab_size, config.hidden_size) + self.layers = nn.ModuleList([NemotronHBlock(config, layer_idx=idx) for idx in range(config.num_hidden_layers)]) + + self.gradient_checkpointing = False + self.norm_f = NemotronHRMSNorm(config.hidden_size, eps=config.layer_norm_epsilon) + # Initialize weights and apply final processing + self._register_load_state_dict_pre_hook(self.load_hook) + self.post_init() + + def load_hook(self, state_dict, prefix, *args): + for k in state_dict: + if "embedding." 
in k: + state_dict[k.replace("embedding.", "embeddings.")] = state_dict.pop(k) + break + + def get_input_embeddings(self): + return self.embeddings + + def set_input_embeddings(self, new_embeddings): + self.embeddings = new_embeddings + + @add_start_docstrings_to_model_forward(NEMOTRONH_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=NemotronHOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + cache_params: Optional[HybridMambaAttentionDynamicCache] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + **kwargs, + ) -> Union[Tuple, NemotronHOutput]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + # use_cache = use_cache if use_cache is not None else self.config.use_cache + use_cache = use_cache if use_cache is not None else (self.config.use_cache if not self.training else False) + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if (input_ids is None) ^ (inputs_embeds is not None): # ^ is python for xor + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") + + if inputs_embeds is None: + inputs_embeds = self.embeddings(input_ids) + + if self.gradient_checkpointing and self.training and use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`." 
+ ) + use_cache = False + + # From zamba_modeling.py + if use_cache and cache_params is None: + logger.warning_once( + "NemotronH requires an initialized `NemotronHHybridDynamicCache` to return a cache. None was " + "provided, so no cache will be returned." + ) + + hidden_states = inputs_embeds + + if cache_position is None: + cache_position = torch.arange(hidden_states.shape[1], device=hidden_states.device) + if position_ids is None: + position_ids = cache_position.unsqueeze(0) + + causal_mask = self._update_causal_mask(attention_mask, inputs_embeds, cache_position) + mamba_mask = self._update_mamba_mask(attention_mask, cache_position) + + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + # Until HERE + + for layer_idx, mixer_block in enumerate(self.layers): + # Depending on the layer type we opt for 2D base attention mask (Mamba) or 4D causal mask (Attention) + if mixer_block.block_type == "mamba": + layer_mask = mamba_mask + elif mixer_block.block_type == "attention": + layer_mask = causal_mask + elif mixer_block.block_type == "mlp": + layer_mask = None + else: + raise ValueError(f"Invalid block_type: {self.block_type}") + + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if self.gradient_checkpointing and self.training: + hidden_states = self._gradient_checkpointing_func( + mixer_block.__call__, hidden_states, cache_params, cache_position, layer_mask + ) + else: + hidden_states = mixer_block( + hidden_states, + cache_params=cache_params, + cache_position=cache_position, + attention_mask=layer_mask, + ) + + # TODO: Store attentions + # if output_attentions: + # if layer_outputs[1] is not None: + # # append attentions only of attention layers. Mamba layers return `None` as the attention weights + # all_self_attns += (layer_outputs[1],) + + # TODO (Check): should it happen before the forward pass? 
+ # if output_hidden_states: + # all_hidden_states = all_hidden_states + (hidden_states,) + + hidden_states = self.norm_f(hidden_states) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, cache_params, all_hidden_states] if v is not None) + + return NemotronHOutput( + last_hidden_state=hidden_states, + cache_params=cache_params if use_cache else None, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + # Copied from transformers.models.jamba.modeling_jamba.JambaModel._update_causal_mask + def _update_causal_mask(self, attention_mask, input_tensor, cache_position): + if self.config._attn_implementation == "flash_attention_2": + if attention_mask is not None and 0.0 in attention_mask: + return attention_mask + return None + + dtype, device = input_tensor.dtype, input_tensor.device + min_dtype = torch.finfo(dtype).min + sequence_length = input_tensor.shape[1] + target_length = cache_position[-1] + 1 + + causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device) + if sequence_length != 1: + causal_mask = torch.triu(causal_mask, diagonal=1) + causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1) + causal_mask = causal_mask[None, None, :, :].expand(input_tensor.shape[0], 1, -1, -1) + if attention_mask is not None: + causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit + if attention_mask.dim() == 2: + mask_length = attention_mask.shape[-1] + padding_mask = causal_mask[..., :mask_length].eq(0.0) * attention_mask[:, None, None, :].eq(0.0) + causal_mask[..., :mask_length] = causal_mask[..., :mask_length].masked_fill(padding_mask, min_dtype) + + if ( + self.config._attn_implementation == "sdpa" + and attention_mask is not None + and attention_mask.device.type == "cuda" + ): + # Attend to all tokens in fully masked rows in the causal_mask, for 
example the relevant first rows when + # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path. + # Details: https://github.com/pytorch/pytorch/issues/110213 + causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype) + + return causal_mask + + def _update_mamba_mask(self, attention_mask, cache_position): + """ + No need for zeroing states when + 1. Cached forward + 2. Attending to all inputs + """ + mamba_mask = attention_mask + if cache_position[0] > 0 or (attention_mask is not None and torch.all(attention_mask == 1)): + mamba_mask = None + return mamba_mask + + +@add_start_docstrings( + """ + The NEMOTRONH Model transformer with a language modeling head on top (linear layer with weights not tied to the input + embeddings). + """, + NEMOTRONH_START_DOCSTRING, +) +class NemotronHForCausalLM(NemotronHPreTrainedModel, GenerationMixin): + _tied_weights_keys = ["lm_head.weight"] + + def __init__(self, config): + super().__init__(config) + self.backbone = NemotronHModel(config) + self.vocab_size = config.vocab_size + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.backbone.get_input_embeddings() + + def set_input_embeddings(self, new_embeddings): + return self.backbone.set_input_embeddings(new_embeddings) + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def get_decoder(self): + return self.model + + def set_decoder(self, decoder): + self.model = decoder + + def prepare_inputs_for_generation( + self, + input_ids, + past_key_values=None, + attention_mask=None, + inputs_embeds=None, + cache_position=None, + position_ids=None, + use_cache=True, + **kwargs, + ): + # Copy from 
https://github.com/huggingface/transformers/blob/main/src/transformers/models/jamba/modeling_jamba.py + # Overwitten -- uses `cache_params` as opposed to `past_key_values` + empty_past_kv = past_key_values is None + + # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens + # Exception 1: when passing input_embeds, input_ids may be missing entries + # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here + # Exception 3: with synced GPUs cache_position may go out of bounds, but we only want dummy token in that case. + # (we can't check exception 3 while compiling) + if not empty_past_kv: + if ( + inputs_embeds is not None # Exception 1 + or cache_position[-1] >= input_ids.shape[1] # Exception 3 + ): + input_ids = input_ids[:, -cache_position.shape[0] :] + elif input_ids.shape[1] != cache_position.shape[0]: # Default case (the "else", a no op, is Exception 2) + input_ids = input_ids[:, cache_position] + else: + past_key_values = HybridMambaAttentionDynamicCache( + self.config, input_ids.shape[0], self.dtype, device=self.device + ) + + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if not empty_past_kv: + position_ids = position_ids[:, -input_ids.shape[1] :] + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and empty_past_kv: + # TODO(pjin): workaround fix for properly extending inputs_embeds; + # longer term, may be better handled elsewhere in .generate(). 
+ if input_ids is not None and inputs_embeds.shape[1] < input_ids.shape[1]: + new_token_embeds = self.get_input_embeddings()(input_ids[:,inputs_embeds.shape[1]:]) + inputs_embeds = torch.cat([inputs_embeds, new_token_embeds], dim=1) + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids.contiguous()} # `contiguous()` needed for compilation use cases + + model_inputs.update( + { + "position_ids": position_ids, + "past_key_values": past_key_values, + "use_cache": use_cache, + "attention_mask": attention_mask, + "logits_to_keep": self.config.num_logits_to_keep, + "cache_position": cache_position, + } + ) + return model_inputs + + @add_start_docstrings_to_model_forward(NEMOTRONH_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=NemotronHCausalLMOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + cache_params: Optional[HybridMambaAttentionDynamicCache] = None, + labels: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + use_cache: Optional[bool] = None, + cache_position: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + **kwargs, # for now we need this for generation + ) -> Union[Tuple, NemotronHCausalLMOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. 
you can set + `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100` + are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]` + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + nemotron_h_outputs = self.backbone( + input_ids, + cache_params=cache_params, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + use_cache=use_cache, + cache_position=cache_position, + attention_mask=attention_mask, + ) + hidden_states = nemotron_h_outputs[0] + + # TODO: Check zamba_modeling.py: https://github.com/huggingface/transformers/blob/d7188ba600e36d3fd191b12e19f1b3bb81a8404f/src/transformers/models/zamba/modeling_zamba.py#L1284C1-L1286C2 + #logits = self.lm_head(hidden_states.to(self.lm_head.weight.dtype)).float() + logits = self.lm_head(hidden_states.to(self.lm_head.weight.dtype)).float() + + loss = None + if labels is not None: + # move labels to correct device to enable model parallelism + labels = labels.to(logits.device) + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = CrossEntropyLoss() + loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) + + if not return_dict: + output = (logits,) + nemotron_h_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return NemotronHCausalLMOutput( + loss=loss, + logits=logits, + cache_params=nemotron_h_outputs.cache_params, + hidden_states=nemotron_h_outputs.hidden_states, + 
attentions=nemotron_h_outputs.attentions, + ) diff --git a/services/core/models/tests/integration/parallelism/fixtures/nvidia/nemotron-4-340b-instruct/model_config.yaml b/services/core/models/tests/integration/parallelism/fixtures/nvidia/nemotron-4-340b-instruct/model_config.yaml new file mode 100644 index 0000000000..dec06eac9c --- /dev/null +++ b/services/core/models/tests/integration/parallelism/fixtures/nvidia/nemotron-4-340b-instruct/model_config.yaml @@ -0,0 +1,261 @@ +mcore_gpt: true +micro_batch_size: 1 +global_batch_size: 256 +tensor_model_parallel_size: 8 +pipeline_model_parallel_size: 4 +virtual_pipeline_model_parallel_size: null +encoder_seq_length: 4096 +max_position_embeddings: 4096 +num_layers: 96 +hidden_size: 18432 +ffn_hidden_size: 73728 +num_attention_heads: 96 +init_method_std: 0.0063 +use_scaled_init_method: true +hidden_dropout: 0.0 +attention_dropout: 0.0 +ffn_dropout: 0.0 +kv_channels: null +apply_query_key_layer_scaling: true +normalization: layernorm1p +layernorm_epsilon: 1.0e-05 +do_layer_norm_weight_decay: false +make_vocab_size_divisible_by: 128 +pre_process: true +post_process: true +persist_layer_norm: true +bias: false +activation: squared-relu +headscale: false +transformer_block_type: pre_ln +openai_gelu: false +normalize_attention_scores: true +position_embedding_type: rope +rotary_percentage: 0.5 +attention_type: multihead +share_embeddings_and_output_weights: false +num_query_groups: 8 +tokenizer: + library: sentencepiece + type: null + model: nemo:8223bf8eaa194eb8920af568bb52e2d0_megatron_2.model + vocab_file: null + merge_file: null + tokenizer_model: nemo:eb5528fdec5c4083affa2c97958eeef7_megatron_2.model + sentencepiece_legacy: false +native_amp_init_scale: 4294967296 +native_amp_growth_interval: 1000 +hysteresis: 2 +fp32_residual_connection: false +fp16_lm_cross_entropy: false +megatron_amp_O2: true +grad_allreduce_chunk_size_mb: 125 +grad_div_ar_fusion: true +gradient_accumulation_fusion: false +bias_activation_fusion: 
false +bias_dropout_add_fusion: false +masked_softmax_fusion: true +seed: 1234 +resume_from_checkpoint: null +use_cpu_initialization: false +onnx_safe: false +apex_transformer_log_level: 30 +gradient_as_bucket_view: false +sync_batch_comm: false +activations_checkpoint_granularity: null +activations_checkpoint_method: null +activations_checkpoint_num_layers: 1 +num_micro_batches_with_partial_activation_checkpoints: null +activations_checkpoint_layers_per_pipeline: null +sequence_parallel: false +transformer_engine: false +fp8: false +fp8_e4m3: false +fp8_hybrid: false +fp8_margin: 0 +fp8_interval: 1 +fp8_amax_history_len: 1 +fp8_amax_compute_algo: most_recent +reduce_amax: true +use_emha: false +optim: + name: distributed_fused_adam + lr: 3.001e-07 + weight_decay: 0.1 + betas: + - 0.9 + - 0.98 + sched: + name: CosineAnnealing + warmup_steps: 10 + constant_steps: 400 + min_lr: 3.0e-07 + bucket_cap_mb: 200 + overlap_grad_sync: false + contiguous_grad_buffer: true +precision: bf16-mixed +data: + chat: true + chat_prompt_tokens: + system_turn_start: + turn_start: + label_start: + end_of_turn: ' + + ' + end_of_name: ' + + ' + sample: true + num_workers: 2 + dataloader_type: single + train_ds: + file_path: /dataset/train.jsonl + global_batch_size: 128 + micro_batch_size: 1 + shuffle: true + memmap_workers: null + max_seq_length: 4096 + min_seq_length: 1 + drop_last: true + concat_sampling_probabilities: null + label_key: output + add_eos: false + add_sep: false + add_bos: false + truncation_field: input + index_mapping_dir: /indexmap_dir + prompt_template: 'System + + {system message} + + User + + {turn 1 user message} + + Assistant + + {turn 1 assistant label} + + {turn 1 assistant message} + + User + + {turn 2 user message} + + Assistant + + {turn 2 assistant label} + + {turn 2 assistant message} + + ' + hf_dataset: true + truncation_method: right + validation_ds: + file_path: /dataset/val.jsonl + names: null + global_batch_size: 128 + micro_batch_size: 1 + shuffle: 
false + memmap_workers: null + max_seq_length: 4096 + min_seq_length: 1 + drop_last: false + label_key: output + add_eos: false + add_sep: false + add_bos: false + write_predictions_to_file: false + output_file_path_prefix: null + truncation_field: input + index_mapping_dir: /indexmap_dir + prompt_template: 'System + + {system message} + + User + + {turn 1 user message} + + Assistant + + {turn 1 assistant label} + + {turn 1 assistant message} + + User + + {turn 2 user message} + + Assistant + + {turn 2 assistant label} + + {turn 2 assistant message} + + ' + tokens_to_generate: 32 + hf_dataset: true + truncation_method: right + metric: + name: loss + average: null + num_classes: null + test_ds: + prompt_template: 'System + + {system message} + + User + + {turn 1 user message} + + Assistant + + {turn 1 assistant label} + + {turn 1 assistant message} + + User + + {turn 2 user message} + + Assistant + + {turn 2 assistant label} + + {turn 2 assistant message} + + ' + data_impl: jsonl + splits_string: null + seq_length: 4096 + skip_warmup: true + reset_position_ids: false + reset_attention_mask: false + eod_mask_loss: false + index_mapping_dir: /indexmap_dir + data_prefix: + train: + - /datasets/train.jsonl + validation: + - /datasets/val.jsonl + test: + - /datasets/val.jsonl +answer_only_loss: true +restore_from_path: /models/340B_base +save_nemo_on_validation_end: true +use_flash_attention: null +pipeline_model_parallel_split_rank: 0 +dpo: + log_prob_forward_micro_batch_size: 2 + ref_policy_kl_penalty: 0.3 + average_log_probs: false + sft_loss_coeff: 1.0e-05 + optimize_ref_policy_kl_penalty: false + preference_loss: reward_rev_dpo + gt_reward_scale: 1.0 +apply_rope_fusion: false +target: nemo.collections.nlp.models.language_modeling.megatron_gpt_model.MegatronGPTModel +nemo_version: 1.22.0 diff --git a/services/core/models/tests/integration/parallelism/fixtures/openai/gpt-oss-120b/config.json 
b/services/core/models/tests/integration/parallelism/fixtures/openai/gpt-oss-120b/config.json new file mode 100644 index 0000000000..42300b8993 --- /dev/null +++ b/services/core/models/tests/integration/parallelism/fixtures/openai/gpt-oss-120b/config.json @@ -0,0 +1,88 @@ +{ + "architectures": [ + "GptOssForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "eos_token_id": 200002, + "experts_per_token": 4, + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2880, + "initial_context_length": 4096, + "initializer_range": 0.02, + "intermediate_size": 2880, + "layer_types": [ + "sliding_attention", + "full_attention", + "sliding_attention", + "full_attention", + "sliding_attention", + "full_attention", + "sliding_attention", + "full_attention", + "sliding_attention", + "full_attention", + "sliding_attention", + "full_attention", + "sliding_attention", + "full_attention", + "sliding_attention", + "full_attention", + "sliding_attention", + "full_attention", + "sliding_attention", + "full_attention", + "sliding_attention", + "full_attention", + "sliding_attention", + "full_attention", + "sliding_attention", + "full_attention", + "sliding_attention", + "full_attention", + "sliding_attention", + "full_attention", + "sliding_attention", + "full_attention", + "sliding_attention", + "full_attention", + "sliding_attention", + "full_attention" + ], + "max_position_embeddings": 131072, + "model_type": "gpt_oss", + "num_attention_heads": 64, + "num_experts_per_tok": 4, + "num_hidden_layers": 36, + "num_key_value_heads": 8, + "num_local_experts": 128, + "output_router_logits": false, + "pad_token_id": 199999, + "quantization_config": { + "modules_to_not_convert": [ + "model.layers.*.self_attn", + "model.layers.*.mlp.router", + "model.embed_tokens", + "lm_head" + ], + "quant_method": "mxfp4" + }, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "beta_fast": 32.0, + "beta_slow": 1.0, + "factor": 32.0, + "original_max_position_embeddings": 4096, + "rope_type": 
"yarn", + "truncate": false + }, + "rope_theta": 150000, + "router_aux_loss_coef": 0.9, + "sliding_window": 128, + "swiglu_limit": 7.0, + "tie_word_embeddings": false, + "transformers_version": "4.55.0.dev0", + "use_cache": true, + "vocab_size": 201088 +} diff --git a/services/core/models/tests/integration/parallelism/fixtures/openai/gpt-oss-20b/config.json b/services/core/models/tests/integration/parallelism/fixtures/openai/gpt-oss-20b/config.json new file mode 100644 index 0000000000..8fb5a4a033 --- /dev/null +++ b/services/core/models/tests/integration/parallelism/fixtures/openai/gpt-oss-20b/config.json @@ -0,0 +1,76 @@ +{ + "architectures": [ + "GptOssForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "eos_token_id": 200002, + "experts_per_token": 4, + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2880, + "initial_context_length": 4096, + "initializer_range": 0.02, + "intermediate_size": 2880, + "layer_types": [ + "sliding_attention", + "full_attention", + "sliding_attention", + "full_attention", + "sliding_attention", + "full_attention", + "sliding_attention", + "full_attention", + "sliding_attention", + "full_attention", + "sliding_attention", + "full_attention", + "sliding_attention", + "full_attention", + "sliding_attention", + "full_attention", + "sliding_attention", + "full_attention", + "sliding_attention", + "full_attention", + "sliding_attention", + "full_attention", + "sliding_attention", + "full_attention" + ], + "max_position_embeddings": 131072, + "model_type": "gpt_oss", + "num_attention_heads": 64, + "num_experts_per_tok": 4, + "num_hidden_layers": 24, + "num_key_value_heads": 8, + "num_local_experts": 32, + "output_router_logits": false, + "pad_token_id": 199999, + "quantization_config": { + "modules_to_not_convert": [ + "model.layers.*.self_attn", + "model.layers.*.mlp.router", + "model.embed_tokens", + "lm_head" + ], + "quant_method": "mxfp4" + }, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "beta_fast": 
32.0, + "beta_slow": 1.0, + "factor": 32.0, + "original_max_position_embeddings": 4096, + "rope_type": "yarn", + "truncate": false + }, + "rope_theta": 150000, + "router_aux_loss_coef": 0.9, + "sliding_window": 128, + "swiglu_limit": 7.0, + "tie_word_embeddings": false, + "transformers_version": "4.55.0.dev0", + "use_cache": true, + "vocab_size": 201088 +} From 62a425239493438a2d0ed10bff652c3997561392 Mon Sep 17 00:00:00 2001 From: Matthew Grossman Date: Wed, 10 Jun 2026 09:54:49 -0700 Subject: [PATCH 3/4] fix(tests): add parallelism conftest and fixture download script The conftest.py patches hf_hub_download and AutoConfig.from_pretrained to serve from local fixtures, preventing HuggingFace API calls. The download_fixtures.py script regenerates fixtures when new models are added to tests. Signed-off-by: Matthew Grossman --- .../tests/integration/parallelism/conftest.py | 78 +++++++++++++ .../parallelism/download_fixtures.py | 107 ++++++++++++++++++ 2 files changed, 185 insertions(+) create mode 100644 services/core/models/tests/integration/parallelism/conftest.py create mode 100644 services/core/models/tests/integration/parallelism/download_fixtures.py diff --git a/services/core/models/tests/integration/parallelism/conftest.py b/services/core/models/tests/integration/parallelism/conftest.py new file mode 100644 index 0000000000..2acbbcdede --- /dev/null +++ b/services/core/models/tests/integration/parallelism/conftest.py @@ -0,0 +1,78 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""Fixtures that redirect HuggingFace Hub calls to local config fixtures. + +This prevents parallelism integration tests from making real network calls to +huggingface.co, avoiding rate-limit failures in CI. 
+ +To regenerate fixtures after adding new models to tests, run: + uv run python services/core/models/tests/integration/parallelism/download_fixtures.py +""" + +from pathlib import Path +from unittest.mock import patch + +import pytest + +FIXTURES_DIR = Path(__file__).parent / "fixtures" + + +def _fixture_path(model_id: str) -> Path: + """Return the local fixture directory for a model ID.""" + return FIXTURES_DIR / model_id + + +def _has_fixture(model_id: str) -> bool: + """Check if we have a local fixture for this model.""" + d = _fixture_path(model_id) + return d.is_dir() and ((d / "config.json").exists() or (d / "model_config.yaml").exists()) + + +def _mock_hf_hub_download(repo_id: str, filename: str, **kwargs): + """Return path to fixture file instead of downloading from HF Hub.""" + fixture_file = _fixture_path(repo_id) / filename + if fixture_file.exists(): + return str(fixture_file) + raise FileNotFoundError(f"Fixture not found: {fixture_file}. Run download_fixtures.py to regenerate.") + + +_real_auto_config_from_pretrained = None + + +def _mock_auto_config_from_pretrained(pretrained_model_name_or_path, **kwargs): + """Redirect remote model IDs to local fixture directories.""" + path_str = str(pretrained_model_name_or_path) + # Only intercept remote model IDs (not local paths) + if not Path(path_str).exists() and _has_fixture(path_str): + return _real_auto_config_from_pretrained(str(_fixture_path(path_str)), **kwargs) + return _real_auto_config_from_pretrained(pretrained_model_name_or_path, **kwargs) + + +@pytest.fixture(autouse=True) +def _offline_hf(monkeypatch): + """Patch HF Hub calls to use local fixtures for all parallelism tests.""" + global _real_auto_config_from_pretrained + + from transformers import AutoConfig + + _real_auto_config_from_pretrained = AutoConfig.from_pretrained + + # Clear the model spec cache so stale entries from prior tests don't bypass our mocks + from nmp.core.models.parallelism.api import _model_spec_cache + + 
_model_spec_cache.clear() + + with ( + patch( + "nmp.core.models.parallelism.models.hf_hub_download", + side_effect=_mock_hf_hub_download, + ), + patch( + "transformers.AutoConfig.from_pretrained", + side_effect=_mock_auto_config_from_pretrained, + ), + ): + yield + + _model_spec_cache.clear() diff --git a/services/core/models/tests/integration/parallelism/download_fixtures.py b/services/core/models/tests/integration/parallelism/download_fixtures.py new file mode 100644 index 0000000000..c38affd8b2 --- /dev/null +++ b/services/core/models/tests/integration/parallelism/download_fixtures.py @@ -0,0 +1,107 @@ +#!/usr/bin/env python3 +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +""" +One-time script to download HuggingFace model config files for offline test fixtures. + +Run this script when adding new models to parallelism tests: + uv run python services/core/models/tests/integration/parallelism/download_fixtures.py + +It downloads config.json (and model_config.yaml if present) for each model +into the fixtures/ directory, enabling tests to run without network access. +""" + +import json +import shutil +from pathlib import Path + +from huggingface_hub import hf_hub_download + +FIXTURES_DIR = Path(__file__).parent / "fixtures" + +# All non-gated model IDs used across parallelism integration tests. +# Gated models (meta-llama/*, google/gemma-*) are already skipped via REQUIRES_HF_TOKEN. 
+MODEL_IDS = [ + "gpt2", + "microsoft/phi-2", + "microsoft/phi-4", + "mistralai/Mixtral-8x7B-v0.1", + "mistralai/Mistral-7B-v0.1", + "mistralai/Devstral-Small-2505", + "nvidia/Mistral-NeMo-Minitron-8B-Instruct", + "nvidia/NVIDIA-Nemotron-Nano-9B-v2", + "nvidia/nemotron-4-340b-instruct", + "openai/gpt-oss-20b", + "openai/gpt-oss-120b", + "EleutherAI/gpt-j-6b", + "EleutherAI/gpt-neox-20b", + "Qwen/Qwen3-8B", + "Qwen/Qwen3-4B-SafeRL", + "Qwen/Qwen2.5-7B", + "Qwen/Qwen2.5-72B", + "Qwen/Qwen2.5-72B-Instruct", + "deepseek-ai/deepseek-llm-7b-base", + "deepseek-ai/deepseek-llm-67b-base", + "deepseek-ai/DeepSeek-V3-Base", +] + + +def _model_dir(model_id: str) -> Path: + """Return the fixture directory for a model, e.g. fixtures/gpt2 or fixtures/microsoft/phi-4.""" + return FIXTURES_DIR / model_id + + +def download_model_configs(model_id: str) -> None: + dest = _model_dir(model_id) + dest.mkdir(parents=True, exist_ok=True) + + got_config_json = False + for filename in ("config.json", "model_config.yaml"): + try: + cached_path = hf_hub_download(model_id, filename) + shutil.copy2(cached_path, dest / filename) + print(f" [OK] {model_id}/{filename}") + if filename == "config.json": + got_config_json = True + except Exception: + # model_config.yaml is optional; config.json is required unless + # model_config.yaml exists (NeMo YAML-only models like Nemotron-4-340B) + pass + + if not got_config_json and not (dest / "model_config.yaml").exists(): + raise RuntimeError(f"Neither config.json nor model_config.yaml found for {model_id}") + + # Download custom config/model Python files referenced by auto_map. + # These are needed for AutoConfig.from_pretrained(trust_remote_code=True). 
+ if got_config_json: + config = json.loads((dest / "config.json").read_text()) + auto_map = config.get("auto_map", {}) + for key, value in auto_map.items(): + # value is like "configuration_nemotron_h.NemotronHConfig" + module_name = value.split(".")[0] + py_file = f"{module_name}.py" + try: + cached_path = hf_hub_download(model_id, py_file) + shutil.copy2(cached_path, dest / py_file) + print(f" [OK] {model_id}/{py_file}") + except Exception: + print(f" [WARN] {model_id}/{py_file} not found (auto_map: {key}={value})") + + +def main() -> None: + print(f"Downloading config fixtures to {FIXTURES_DIR}/\n") + for model_id in MODEL_IDS: + print(f"Downloading {model_id}...") + download_model_configs(model_id) + print("\nDone! Fixtures are ready for offline tests.") + + # Write a manifest so the conftest can validate completeness + manifest = {mid: sorted(str(p.name) for p in _model_dir(mid).iterdir()) for mid in MODEL_IDS} + manifest_path = FIXTURES_DIR / "manifest.json" + manifest_path.write_text(json.dumps(manifest, indent=2, sort_keys=True) + "\n") + print(f"Wrote manifest to {manifest_path}") + + +if __name__ == "__main__": + main() From 2c2f64171a8ab5da5de6e29972bfbd10bfe8f063 Mon Sep 17 00:00:00 2001 From: Matthew Grossman Date: Wed, 10 Jun 2026 09:56:20 -0700 Subject: [PATCH 4/4] fix(tests): remove parallelism fixtures, keep only auth test mock MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The 5K lines of HF config fixtures were unnecessary — the parallelism tests are always skipped in CI (torch not installed). Stripped down to only the change that fixes the actual CI failures: mocking HfApi in TestTrustRemoteCodePermission. 
Signed-off-by: Matthew Grossman --- .../tests/integration/parallelism/conftest.py | 78 - .../parallelism/download_fixtures.py | 107 - .../fixtures/EleutherAI/gpt-j-6b/config.json | 40 - .../EleutherAI/gpt-neox-20b/config.json | 25 - .../Qwen/Qwen2.5-72B-Instruct/config.json | 27 - .../fixtures/Qwen/Qwen2.5-72B/config.json | 27 - .../fixtures/Qwen/Qwen2.5-7B/config.json | 28 - .../fixtures/Qwen/Qwen3-4B-SafeRL/config.json | 30 - .../fixtures/Qwen/Qwen3-8B/config.json | 30 - .../deepseek-ai/DeepSeek-V3-Base/config.json | 67 - .../configuration_deepseek.py | 199 -- .../DeepSeek-V3-Base/modeling_deepseek.py | 1848 ----------------- .../deepseek-llm-67b-base/config.json | 25 - .../deepseek-llm-7b-base/config.json | 25 - .../parallelism/fixtures/gpt2/config.json | 31 - .../parallelism/fixtures/manifest.json | 69 - .../fixtures/microsoft/phi-2/config.json | 30 - .../fixtures/microsoft/phi-4/config.json | 32 - .../mistralai/Devstral-Small-2505/config.json | 26 - .../mistralai/Mistral-7B-v0.1/config.json | 24 - .../mistralai/Mixtral-8x7B-v0.1/config.json | 29 - .../config.json | 28 - .../NVIDIA-Nemotron-Nano-9B-v2/config.json | 56 - .../configuration_nemotron_h.py | 245 --- .../modeling_nemotron_h.py | 1643 --------------- .../model_config.yaml | 261 --- .../fixtures/openai/gpt-oss-120b/config.json | 88 - .../fixtures/openai/gpt-oss-20b/config.json | 76 - .../test_parallelism_hf_model_config.py | 3 +- .../parallelism/test_recent_models.py | 3 +- .../tests/parallelism/nemo_validation_data.py | 3 +- 31 files changed, 6 insertions(+), 5197 deletions(-) delete mode 100644 services/core/models/tests/integration/parallelism/conftest.py delete mode 100644 services/core/models/tests/integration/parallelism/download_fixtures.py delete mode 100644 services/core/models/tests/integration/parallelism/fixtures/EleutherAI/gpt-j-6b/config.json delete mode 100644 services/core/models/tests/integration/parallelism/fixtures/EleutherAI/gpt-neox-20b/config.json delete mode 100644 
services/core/models/tests/integration/parallelism/fixtures/Qwen/Qwen2.5-72B-Instruct/config.json delete mode 100644 services/core/models/tests/integration/parallelism/fixtures/Qwen/Qwen2.5-72B/config.json delete mode 100644 services/core/models/tests/integration/parallelism/fixtures/Qwen/Qwen2.5-7B/config.json delete mode 100644 services/core/models/tests/integration/parallelism/fixtures/Qwen/Qwen3-4B-SafeRL/config.json delete mode 100644 services/core/models/tests/integration/parallelism/fixtures/Qwen/Qwen3-8B/config.json delete mode 100644 services/core/models/tests/integration/parallelism/fixtures/deepseek-ai/DeepSeek-V3-Base/config.json delete mode 100644 services/core/models/tests/integration/parallelism/fixtures/deepseek-ai/DeepSeek-V3-Base/configuration_deepseek.py delete mode 100644 services/core/models/tests/integration/parallelism/fixtures/deepseek-ai/DeepSeek-V3-Base/modeling_deepseek.py delete mode 100644 services/core/models/tests/integration/parallelism/fixtures/deepseek-ai/deepseek-llm-67b-base/config.json delete mode 100644 services/core/models/tests/integration/parallelism/fixtures/deepseek-ai/deepseek-llm-7b-base/config.json delete mode 100644 services/core/models/tests/integration/parallelism/fixtures/gpt2/config.json delete mode 100644 services/core/models/tests/integration/parallelism/fixtures/manifest.json delete mode 100644 services/core/models/tests/integration/parallelism/fixtures/microsoft/phi-2/config.json delete mode 100644 services/core/models/tests/integration/parallelism/fixtures/microsoft/phi-4/config.json delete mode 100644 services/core/models/tests/integration/parallelism/fixtures/mistralai/Devstral-Small-2505/config.json delete mode 100644 services/core/models/tests/integration/parallelism/fixtures/mistralai/Mistral-7B-v0.1/config.json delete mode 100644 services/core/models/tests/integration/parallelism/fixtures/mistralai/Mixtral-8x7B-v0.1/config.json delete mode 100644 
services/core/models/tests/integration/parallelism/fixtures/nvidia/Mistral-NeMo-Minitron-8B-Instruct/config.json delete mode 100644 services/core/models/tests/integration/parallelism/fixtures/nvidia/NVIDIA-Nemotron-Nano-9B-v2/config.json delete mode 100644 services/core/models/tests/integration/parallelism/fixtures/nvidia/NVIDIA-Nemotron-Nano-9B-v2/configuration_nemotron_h.py delete mode 100644 services/core/models/tests/integration/parallelism/fixtures/nvidia/NVIDIA-Nemotron-Nano-9B-v2/modeling_nemotron_h.py delete mode 100644 services/core/models/tests/integration/parallelism/fixtures/nvidia/nemotron-4-340b-instruct/model_config.yaml delete mode 100644 services/core/models/tests/integration/parallelism/fixtures/openai/gpt-oss-120b/config.json delete mode 100644 services/core/models/tests/integration/parallelism/fixtures/openai/gpt-oss-20b/config.json diff --git a/services/core/models/tests/integration/parallelism/conftest.py b/services/core/models/tests/integration/parallelism/conftest.py deleted file mode 100644 index 2acbbcdede..0000000000 --- a/services/core/models/tests/integration/parallelism/conftest.py +++ /dev/null @@ -1,78 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 - -"""Fixtures that redirect HuggingFace Hub calls to local config fixtures. - -This prevents parallelism integration tests from making real network calls to -huggingface.co, avoiding rate-limit failures in CI. 
- -To regenerate fixtures after adding new models to tests, run: - uv run python services/core/models/tests/integration/parallelism/download_fixtures.py -""" - -from pathlib import Path -from unittest.mock import patch - -import pytest - -FIXTURES_DIR = Path(__file__).parent / "fixtures" - - -def _fixture_path(model_id: str) -> Path: - """Return the local fixture directory for a model ID.""" - return FIXTURES_DIR / model_id - - -def _has_fixture(model_id: str) -> bool: - """Check if we have a local fixture for this model.""" - d = _fixture_path(model_id) - return d.is_dir() and ((d / "config.json").exists() or (d / "model_config.yaml").exists()) - - -def _mock_hf_hub_download(repo_id: str, filename: str, **kwargs): - """Return path to fixture file instead of downloading from HF Hub.""" - fixture_file = _fixture_path(repo_id) / filename - if fixture_file.exists(): - return str(fixture_file) - raise FileNotFoundError(f"Fixture not found: {fixture_file}. Run download_fixtures.py to regenerate.") - - -_real_auto_config_from_pretrained = None - - -def _mock_auto_config_from_pretrained(pretrained_model_name_or_path, **kwargs): - """Redirect remote model IDs to local fixture directories.""" - path_str = str(pretrained_model_name_or_path) - # Only intercept remote model IDs (not local paths) - if not Path(path_str).exists() and _has_fixture(path_str): - return _real_auto_config_from_pretrained(str(_fixture_path(path_str)), **kwargs) - return _real_auto_config_from_pretrained(pretrained_model_name_or_path, **kwargs) - - -@pytest.fixture(autouse=True) -def _offline_hf(monkeypatch): - """Patch HF Hub calls to use local fixtures for all parallelism tests.""" - global _real_auto_config_from_pretrained - - from transformers import AutoConfig - - _real_auto_config_from_pretrained = AutoConfig.from_pretrained - - # Clear the model spec cache so stale entries from prior tests don't bypass our mocks - from nmp.core.models.parallelism.api import _model_spec_cache - - 
_model_spec_cache.clear() - - with ( - patch( - "nmp.core.models.parallelism.models.hf_hub_download", - side_effect=_mock_hf_hub_download, - ), - patch( - "transformers.AutoConfig.from_pretrained", - side_effect=_mock_auto_config_from_pretrained, - ), - ): - yield - - _model_spec_cache.clear() diff --git a/services/core/models/tests/integration/parallelism/download_fixtures.py b/services/core/models/tests/integration/parallelism/download_fixtures.py deleted file mode 100644 index c38affd8b2..0000000000 --- a/services/core/models/tests/integration/parallelism/download_fixtures.py +++ /dev/null @@ -1,107 +0,0 @@ -#!/usr/bin/env python3 -# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 - -""" -One-time script to download HuggingFace model config files for offline test fixtures. - -Run this script when adding new models to parallelism tests: - uv run python services/core/models/tests/integration/parallelism/download_fixtures.py - -It downloads config.json (and model_config.yaml if present) for each model -into the fixtures/ directory, enabling tests to run without network access. -""" - -import json -import shutil -from pathlib import Path - -from huggingface_hub import hf_hub_download - -FIXTURES_DIR = Path(__file__).parent / "fixtures" - -# All non-gated model IDs used across parallelism integration tests. -# Gated models (meta-llama/*, google/gemma-*) are already skipped via REQUIRES_HF_TOKEN. 
-MODEL_IDS = [ - "gpt2", - "microsoft/phi-2", - "microsoft/phi-4", - "mistralai/Mixtral-8x7B-v0.1", - "mistralai/Mistral-7B-v0.1", - "mistralai/Devstral-Small-2505", - "nvidia/Mistral-NeMo-Minitron-8B-Instruct", - "nvidia/NVIDIA-Nemotron-Nano-9B-v2", - "nvidia/nemotron-4-340b-instruct", - "openai/gpt-oss-20b", - "openai/gpt-oss-120b", - "EleutherAI/gpt-j-6b", - "EleutherAI/gpt-neox-20b", - "Qwen/Qwen3-8B", - "Qwen/Qwen3-4B-SafeRL", - "Qwen/Qwen2.5-7B", - "Qwen/Qwen2.5-72B", - "Qwen/Qwen2.5-72B-Instruct", - "deepseek-ai/deepseek-llm-7b-base", - "deepseek-ai/deepseek-llm-67b-base", - "deepseek-ai/DeepSeek-V3-Base", -] - - -def _model_dir(model_id: str) -> Path: - """Return the fixture directory for a model, e.g. fixtures/gpt2 or fixtures/microsoft/phi-4.""" - return FIXTURES_DIR / model_id - - -def download_model_configs(model_id: str) -> None: - dest = _model_dir(model_id) - dest.mkdir(parents=True, exist_ok=True) - - got_config_json = False - for filename in ("config.json", "model_config.yaml"): - try: - cached_path = hf_hub_download(model_id, filename) - shutil.copy2(cached_path, dest / filename) - print(f" [OK] {model_id}/{filename}") - if filename == "config.json": - got_config_json = True - except Exception: - # model_config.yaml is optional; config.json is required unless - # model_config.yaml exists (NeMo YAML-only models like Nemotron-4-340B) - pass - - if not got_config_json and not (dest / "model_config.yaml").exists(): - raise RuntimeError(f"Neither config.json nor model_config.yaml found for {model_id}") - - # Download custom config/model Python files referenced by auto_map. - # These are needed for AutoConfig.from_pretrained(trust_remote_code=True). 
- if got_config_json: - config = json.loads((dest / "config.json").read_text()) - auto_map = config.get("auto_map", {}) - for key, value in auto_map.items(): - # value is like "configuration_nemotron_h.NemotronHConfig" - module_name = value.split(".")[0] - py_file = f"{module_name}.py" - try: - cached_path = hf_hub_download(model_id, py_file) - shutil.copy2(cached_path, dest / py_file) - print(f" [OK] {model_id}/{py_file}") - except Exception: - print(f" [WARN] {model_id}/{py_file} not found (auto_map: {key}={value})") - - -def main() -> None: - print(f"Downloading config fixtures to {FIXTURES_DIR}/\n") - for model_id in MODEL_IDS: - print(f"Downloading {model_id}...") - download_model_configs(model_id) - print("\nDone! Fixtures are ready for offline tests.") - - # Write a manifest so the conftest can validate completeness - manifest = {mid: sorted(str(p.name) for p in _model_dir(mid).iterdir()) for mid in MODEL_IDS} - manifest_path = FIXTURES_DIR / "manifest.json" - manifest_path.write_text(json.dumps(manifest, indent=2, sort_keys=True) + "\n") - print(f"Wrote manifest to {manifest_path}") - - -if __name__ == "__main__": - main() diff --git a/services/core/models/tests/integration/parallelism/fixtures/EleutherAI/gpt-j-6b/config.json b/services/core/models/tests/integration/parallelism/fixtures/EleutherAI/gpt-j-6b/config.json deleted file mode 100644 index 614ae4f4e0..0000000000 --- a/services/core/models/tests/integration/parallelism/fixtures/EleutherAI/gpt-j-6b/config.json +++ /dev/null @@ -1,40 +0,0 @@ -{ - "activation_function": "gelu_new", - "architectures": [ - "GPTJForCausalLM" - ], - "attn_pdrop": 0.0, - "bos_token_id": 50256, - "embd_pdrop": 0.0, - "eos_token_id": 50256, - "gradient_checkpointing": false, - "initializer_range": 0.02, - "layer_norm_epsilon": 1e-05, - "model_type": "gptj", - "n_embd": 4096, - "n_head": 16, - "n_inner": null, - "n_layer": 28, - "n_positions": 2048, - "resid_pdrop": 0.0, - "rotary": true, - "rotary_dim": 64, - 
"scale_attn_weights": true, - "summary_activation": null, - "summary_first_dropout": 0.1, - "summary_proj_to_labels": true, - "summary_type": "cls_index", - "summary_use_proj": true, - "task_specific_params": { - "text-generation": { - "do_sample": true, - "max_length": 50, - "temperature": 1.0 - } - }, - "tie_word_embeddings": false, - "tokenizer_class": "GPT2Tokenizer", - "transformers_version": "4.18.0.dev0", - "use_cache": true, - "vocab_size": 50400 -} diff --git a/services/core/models/tests/integration/parallelism/fixtures/EleutherAI/gpt-neox-20b/config.json b/services/core/models/tests/integration/parallelism/fixtures/EleutherAI/gpt-neox-20b/config.json deleted file mode 100644 index 54d3633ef9..0000000000 --- a/services/core/models/tests/integration/parallelism/fixtures/EleutherAI/gpt-neox-20b/config.json +++ /dev/null @@ -1,25 +0,0 @@ -{ - "architectures": [ - "GPTNeoXForCausalLM" - ], - "attention_probs_dropout_prob": 0, - "bos_token_id": 0, - "eos_token_id": 0, - "hidden_act": "gelu_fast", - "hidden_dropout_prob": 0, - "hidden_size": 6144, - "initializer_range": 0.02, - "intermediate_size": 24576, - "layer_norm_eps": 1e-05, - "max_position_embeddings": 2048, - "model_type": "gpt_neox", - "num_attention_heads": 64, - "num_hidden_layers": 44, - "rotary_emb_base": 10000, - "rotary_pct": 0.25, - "tie_word_embeddings": false, - "torch_dtype": "float16", - "transformers_version": "4.19.0.dev0", - "use_cache": true, - "vocab_size": 50432 -} diff --git a/services/core/models/tests/integration/parallelism/fixtures/Qwen/Qwen2.5-72B-Instruct/config.json b/services/core/models/tests/integration/parallelism/fixtures/Qwen/Qwen2.5-72B-Instruct/config.json deleted file mode 100644 index ec6ea340e5..0000000000 --- a/services/core/models/tests/integration/parallelism/fixtures/Qwen/Qwen2.5-72B-Instruct/config.json +++ /dev/null @@ -1,27 +0,0 @@ -{ - "architectures": [ - "Qwen2ForCausalLM" - ], - "attention_dropout": 0.0, - "bos_token_id": 151643, - "eos_token_id": 151645, 
- "hidden_act": "silu", - "hidden_size": 8192, - "initializer_range": 0.02, - "intermediate_size": 29568, - "max_position_embeddings": 32768, - "max_window_layers": 70, - "model_type": "qwen2", - "num_attention_heads": 64, - "num_hidden_layers": 80, - "num_key_value_heads": 8, - "rms_norm_eps": 1e-06, - "rope_theta": 1000000.0, - "sliding_window": 131072, - "tie_word_embeddings": false, - "torch_dtype": "bfloat16", - "transformers_version": "4.43.1", - "use_cache": true, - "use_sliding_window": false, - "vocab_size": 152064 -} diff --git a/services/core/models/tests/integration/parallelism/fixtures/Qwen/Qwen2.5-72B/config.json b/services/core/models/tests/integration/parallelism/fixtures/Qwen/Qwen2.5-72B/config.json deleted file mode 100644 index 67663e297b..0000000000 --- a/services/core/models/tests/integration/parallelism/fixtures/Qwen/Qwen2.5-72B/config.json +++ /dev/null @@ -1,27 +0,0 @@ -{ - "architectures": [ - "Qwen2ForCausalLM" - ], - "attention_dropout": 0.0, - "bos_token_id": 151643, - "eos_token_id": 151643, - "hidden_act": "silu", - "hidden_size": 8192, - "initializer_range": 0.02, - "intermediate_size": 29568, - "max_position_embeddings": 131072, - "max_window_layers": 80, - "model_type": "qwen2", - "num_attention_heads": 64, - "num_hidden_layers": 80, - "num_key_value_heads": 8, - "rms_norm_eps": 1e-05, - "rope_theta": 1000000.0, - "sliding_window": 131072, - "tie_word_embeddings": false, - "torch_dtype": "bfloat16", - "transformers_version": "4.43.1", - "use_cache": true, - "use_sliding_window": false, - "vocab_size": 152064 -} diff --git a/services/core/models/tests/integration/parallelism/fixtures/Qwen/Qwen2.5-7B/config.json b/services/core/models/tests/integration/parallelism/fixtures/Qwen/Qwen2.5-7B/config.json deleted file mode 100644 index 1a90713f0e..0000000000 --- a/services/core/models/tests/integration/parallelism/fixtures/Qwen/Qwen2.5-7B/config.json +++ /dev/null @@ -1,28 +0,0 @@ -{ - "architectures": [ - "Qwen2ForCausalLM" - ], - 
"attention_dropout": 0.0, - "bos_token_id": 151643, - "eos_token_id": 151643, - "hidden_act": "silu", - "hidden_size": 3584, - "initializer_range": 0.02, - "intermediate_size": 18944, - "max_position_embeddings": 131072, - "max_window_layers": 28, - "model_type": "qwen2", - "num_attention_heads": 28, - "num_hidden_layers": 28, - "num_key_value_heads": 4, - "rms_norm_eps": 1e-06, - "rope_theta": 1000000.0, - "sliding_window": 131072, - "tie_word_embeddings": false, - "torch_dtype": "bfloat16", - "transformers_version": "4.40.1", - "use_cache": true, - "use_mrope": false, - "use_sliding_window": false, - "vocab_size": 152064 -} diff --git a/services/core/models/tests/integration/parallelism/fixtures/Qwen/Qwen3-4B-SafeRL/config.json b/services/core/models/tests/integration/parallelism/fixtures/Qwen/Qwen3-4B-SafeRL/config.json deleted file mode 100644 index e49eccdc32..0000000000 --- a/services/core/models/tests/integration/parallelism/fixtures/Qwen/Qwen3-4B-SafeRL/config.json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "architectures": [ - "Qwen3ForCausalLM" - ], - "attention_bias": false, - "attention_dropout": 0.0, - "bos_token_id": 151643, - "eos_token_id": 151645, - "head_dim": 128, - "hidden_act": "silu", - "hidden_size": 2560, - "initializer_range": 0.02, - "intermediate_size": 9728, - "max_position_embeddings": 40960, - "max_window_layers": 36, - "model_type": "qwen3", - "num_attention_heads": 32, - "num_hidden_layers": 36, - "num_key_value_heads": 8, - "rms_norm_eps": 1e-06, - "rope_scaling": null, - "rope_theta": 1000000, - "sliding_window": null, - "tie_word_embeddings": true, - "torch_dtype": "bfloat16", - "transformers_version": "4.51.0", - "use_cache": true, - "use_sliding_window": false, - "vocab_size": 151936 -} \ No newline at end of file diff --git a/services/core/models/tests/integration/parallelism/fixtures/Qwen/Qwen3-8B/config.json b/services/core/models/tests/integration/parallelism/fixtures/Qwen/Qwen3-8B/config.json deleted file mode 100644 index 
d46195ac87..0000000000 --- a/services/core/models/tests/integration/parallelism/fixtures/Qwen/Qwen3-8B/config.json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "architectures": [ - "Qwen3ForCausalLM" - ], - "attention_bias": false, - "attention_dropout": 0.0, - "bos_token_id": 151643, - "eos_token_id": 151645, - "head_dim": 128, - "hidden_act": "silu", - "hidden_size": 4096, - "initializer_range": 0.02, - "intermediate_size": 12288, - "max_position_embeddings": 40960, - "max_window_layers": 36, - "model_type": "qwen3", - "num_attention_heads": 32, - "num_hidden_layers": 36, - "num_key_value_heads": 8, - "rms_norm_eps": 1e-06, - "rope_scaling": null, - "rope_theta": 1000000, - "sliding_window": null, - "tie_word_embeddings": false, - "torch_dtype": "bfloat16", - "transformers_version": "4.51.0", - "use_cache": true, - "use_sliding_window": false, - "vocab_size": 151936 -} \ No newline at end of file diff --git a/services/core/models/tests/integration/parallelism/fixtures/deepseek-ai/DeepSeek-V3-Base/config.json b/services/core/models/tests/integration/parallelism/fixtures/deepseek-ai/DeepSeek-V3-Base/config.json deleted file mode 100644 index aec35a75d9..0000000000 --- a/services/core/models/tests/integration/parallelism/fixtures/deepseek-ai/DeepSeek-V3-Base/config.json +++ /dev/null @@ -1,67 +0,0 @@ -{ - "architectures": [ - "DeepseekV3ForCausalLM" - ], - "attention_bias": false, - "attention_dropout": 0.0, - "auto_map": { - "AutoConfig": "configuration_deepseek.DeepseekV3Config", - "AutoModel": "modeling_deepseek.DeepseekV3Model", - "AutoModelForCausalLM": "modeling_deepseek.DeepseekV3ForCausalLM" - }, - "bos_token_id": 0, - "eos_token_id": 1, - "ep_size": 1, - "first_k_dense_replace": 3, - "hidden_act": "silu", - "hidden_size": 7168, - "initializer_range": 0.02, - "intermediate_size": 18432, - "kv_lora_rank": 512, - "max_position_embeddings": 163840, - "model_type": "deepseek_v3", - "moe_intermediate_size": 2048, - "moe_layer_freq": 1, - "n_group": 8, - 
"n_routed_experts": 256, - "n_shared_experts": 1, - "norm_topk_prob": true, - "num_attention_heads": 128, - "num_experts_per_tok": 8, - "num_hidden_layers": 61, - "num_key_value_heads": 128, - "num_nextn_predict_layers": 1, - "q_lora_rank": 1536, - "qk_nope_head_dim": 128, - "qk_rope_head_dim": 64, - "quantization_config": { - "activation_scheme": "dynamic", - "fmt": "e4m3", - "quant_method": "fp8", - "weight_block_size": [ - 128, - 128 - ] - }, - "rms_norm_eps": 1e-06, - "rope_scaling": { - "beta_fast": 32, - "beta_slow": 1, - "factor": 40, - "mscale": 1.0, - "mscale_all_dim": 1.0, - "original_max_position_embeddings": 4096, - "type": "yarn" - }, - "rope_theta": 10000, - "routed_scaling_factor": 2.5, - "scoring_func": "sigmoid", - "tie_word_embeddings": false, - "topk_group": 4, - "topk_method": "noaux_tc", - "torch_dtype": "bfloat16", - "transformers_version": "4.33.1", - "use_cache": true, - "v_head_dim": 128, - "vocab_size": 129280 -} diff --git a/services/core/models/tests/integration/parallelism/fixtures/deepseek-ai/DeepSeek-V3-Base/configuration_deepseek.py b/services/core/models/tests/integration/parallelism/fixtures/deepseek-ai/DeepSeek-V3-Base/configuration_deepseek.py deleted file mode 100644 index f549f2b17d..0000000000 --- a/services/core/models/tests/integration/parallelism/fixtures/deepseek-ai/DeepSeek-V3-Base/configuration_deepseek.py +++ /dev/null @@ -1,199 +0,0 @@ -from transformers.configuration_utils import PretrainedConfig -from transformers.utils import logging - -logger = logging.get_logger(__name__) - -DEEPSEEK_PRETRAINED_CONFIG_ARCHIVE_MAP = {} -class DeepseekV3Config(PretrainedConfig): - r""" - This is the configuration class to store the configuration of a [`DeepseekV3Model`]. It is used to instantiate an DeepSeek - model according to the specified arguments, defining the model architecture. Instantiating a configuration with the - defaults will yield a similar configuration to that of the DeepSeek-V3. 
- - Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the - documentation from [`PretrainedConfig`] for more information. - - - Args: - vocab_size (`int`, *optional*, defaults to 129280): - Vocabulary size of the Deep model. Defines the number of different tokens that can be represented by the - `inputs_ids` passed when calling [`DeepseekV3Model`] - hidden_size (`int`, *optional*, defaults to 4096): - Dimension of the hidden representations. - intermediate_size (`int`, *optional*, defaults to 11008): - Dimension of the MLP representations. - moe_intermediate_size (`int`, *optional*, defaults to 1407): - Dimension of the MoE representations. - num_hidden_layers (`int`, *optional*, defaults to 32): - Number of hidden layers in the Transformer decoder. - num_nextn_predict_layers (`int`, *optional*, defaults to 1): - Number of nextn predict layers in the DeepSeekV3 Model. - num_attention_heads (`int`, *optional*, defaults to 32): - Number of attention heads for each attention layer in the Transformer decoder. - n_shared_experts (`int`, *optional*, defaults to None): - Number of shared experts, None means dense model. - n_routed_experts (`int`, *optional*, defaults to None): - Number of routed experts, None means dense model. - routed_scaling_factor (`float`, *optional*, defaults to 1.0): - Scaling factor or routed experts. - topk_method (`str`, *optional*, defaults to `gready`): - Topk method used in routed gate. - n_group (`int`, *optional*, defaults to None): - Number of groups for routed experts. - topk_group (`int`, *optional*, defaults to None): - Number of selected groups for each token(for each token, ensuring the selected experts is only within `topk_group` groups). - num_experts_per_tok (`int`, *optional*, defaults to None): - Number of selected experts, None means dense model. 
- moe_layer_freq (`int`, *optional*, defaults to 1): - The frequency of the MoE layer: one expert layer for every `moe_layer_freq - 1` dense layers. - first_k_dense_replace (`int`, *optional*, defaults to 0): - Number of dense layers in shallow layers(embed->dense->dense->...->dense->moe->moe...->lm_head). - \--k dense layers--/ - norm_topk_prob (`bool`, *optional*, defaults to False): - Whether to normalize the weights of the routed experts. - scoring_func (`str`, *optional*, defaults to 'softmax'): - Method of computing expert weights. - aux_loss_alpha (`float`, *optional*, defaults to 0.001): - Auxiliary loss weight coefficient. - seq_aux = (`bool`, *optional*, defaults to True): - Whether to compute the auxiliary loss for each individual sample. - num_key_value_heads (`int`, *optional*): - This is the number of key_value heads that should be used to implement Grouped Query Attention. If - `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if - `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When - converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed - by meanpooling all the original heads within that group. For more details checkout [this - paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to - `num_attention_heads`. - hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): - The non-linear activation function (function or string) in the decoder. - max_position_embeddings (`int`, *optional*, defaults to 2048): - The maximum sequence length that this model might ever be used with. - initializer_range (`float`, *optional*, defaults to 0.02): - The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - rms_norm_eps (`float`, *optional*, defaults to 1e-06): - The epsilon used by the rms normalization layers. 
- use_cache (`bool`, *optional*, defaults to `True`): - Whether or not the model should return the last key/values attentions (not used by all models). Only - relevant if `config.is_decoder=True`. - pad_token_id (`int`, *optional*): - Padding token id. - bos_token_id (`int`, *optional*, defaults to 1): - Beginning of stream token id. - eos_token_id (`int`, *optional*, defaults to 2): - End of stream token id. - tie_word_embeddings (`bool`, *optional*, defaults to `False`): - Whether to tie weight embeddings - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. - rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling - strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is - `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update - `max_position_embeddings` to the expected new maximum. - attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`): - Whether to use a bias in the query, key, value and output projection layers during self-attention. - attention_dropout (`float`, *optional*, defaults to 0.0): - The dropout ratio for the attention probabilities. 
- - ```python - >>> from transformers import DeepseekV3Model, DeepseekV3Config - - >>> # Initializing a Deepseek-V3 style configuration - >>> configuration = DeepseekV3Config() - - >>> # Accessing the model configuration - >>> configuration = model.config - ```""" - - model_type = "deepseek_v3" - keys_to_ignore_at_inference = ["past_key_values"] - - def __init__( - self, - vocab_size=129280, - hidden_size=7168, - intermediate_size=18432, - moe_intermediate_size = 2048, - num_hidden_layers=61, - num_nextn_predict_layers=1, - num_attention_heads=128, - num_key_value_heads=128, - n_shared_experts = 1, - n_routed_experts = 256, - ep_size = 1, - routed_scaling_factor = 2.5, - kv_lora_rank = 512, - q_lora_rank = 1536, - qk_rope_head_dim = 64, - v_head_dim = 128, - qk_nope_head_dim = 128, - topk_method = 'noaux_tc', - n_group = 8, - topk_group = 4, - num_experts_per_tok = 8, - moe_layer_freq = 1, - first_k_dense_replace = 3, - norm_topk_prob = True, - scoring_func = 'sigmoid', - hidden_act="silu", - max_position_embeddings=4096, - initializer_range=0.02, - rms_norm_eps=1e-6, - use_cache=True, - pad_token_id=None, - bos_token_id=0, - eos_token_id=1, - tie_word_embeddings=False, - rope_theta=10000.0, - rope_scaling=None, - attention_bias=False, - attention_dropout=0.0, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.moe_intermediate_size = moe_intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_nextn_predict_layers = num_nextn_predict_layers - self.num_attention_heads = num_attention_heads - self.n_shared_experts = n_shared_experts - self.n_routed_experts = n_routed_experts - self.ep_size = ep_size - self.routed_scaling_factor = routed_scaling_factor - self.kv_lora_rank = kv_lora_rank - self.q_lora_rank = q_lora_rank - self.qk_rope_head_dim = qk_rope_head_dim - self.v_head_dim = v_head_dim - self.qk_nope_head_dim 
= qk_nope_head_dim - self.topk_method = topk_method - self.n_group = n_group - self.topk_group = topk_group - self.num_experts_per_tok = num_experts_per_tok - self.moe_layer_freq = moe_layer_freq - self.first_k_dense_replace = first_k_dense_replace - self.norm_topk_prob = norm_topk_prob - self.scoring_func = scoring_func - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) \ No newline at end of file diff --git a/services/core/models/tests/integration/parallelism/fixtures/deepseek-ai/DeepSeek-V3-Base/modeling_deepseek.py b/services/core/models/tests/integration/parallelism/fixtures/deepseek-ai/DeepSeek-V3-Base/modeling_deepseek.py deleted file mode 100644 index 28d9ea27aa..0000000000 --- a/services/core/models/tests/integration/parallelism/fixtures/deepseek-ai/DeepSeek-V3-Base/modeling_deepseek.py +++ /dev/null @@ -1,1848 +0,0 @@ -# coding=utf-8 -# Copyright 2023 DeepSeek-AI and The HuggingFace Inc. team. All rights reserved. -# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" PyTorch DeepSeek model.""" -import math -import warnings -from typing import List, Optional, Tuple, Union - -import torch -import torch.nn.functional as F -import torch.utils.checkpoint -from torch import nn -from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss - -from transformers.activations import ACT2FN -from transformers.cache_utils import Cache, DynamicCache -from transformers.modeling_attn_mask_utils import ( - AttentionMaskConverter, - _prepare_4d_attention_mask, - _prepare_4d_causal_attention_mask, -) -from transformers.modeling_outputs import ( - BaseModelOutputWithPast, - CausalLMOutputWithPast, - SequenceClassifierOutputWithPast, -) -from transformers.modeling_utils import PreTrainedModel -from transformers.pytorch_utils import ( - ALL_LAYERNORM_LAYERS, - is_torch_greater_or_equal_than_1_13, -) -from transformers.utils import ( - add_start_docstrings, - add_start_docstrings_to_model_forward, - is_flash_attn_2_available, - is_flash_attn_greater_or_equal_2_10, - logging, - replace_return_docstrings, -) -from transformers.utils.import_utils import is_torch_fx_available -from .configuration_deepseek import DeepseekV3Config -import torch.distributed as dist -import numpy as np - -if is_flash_attn_2_available(): - from flash_attn import flash_attn_func, flash_attn_varlen_func - from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa - - -# This makes `_prepare_4d_causal_attention_mask` a leaf function in the FX graph. 
-# It means that the function will not be traced through and simply appear as a node in the graph. -if is_torch_fx_available(): - if not is_torch_greater_or_equal_than_1_13: - import torch.fx - - _prepare_4d_causal_attention_mask = torch.fx.wrap(_prepare_4d_causal_attention_mask) - - -logger = logging.get_logger(__name__) - -_CONFIG_FOR_DOC = "DeepseekV3Config" - - -def _get_unpad_data(attention_mask): - seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) - indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() - max_seqlen_in_batch = seqlens_in_batch.max().item() - cu_seqlens = F.pad( - torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.torch.int32), (1, 0) - ) - return ( - indices, - cu_seqlens, - max_seqlen_in_batch, - ) - - -class DeepseekV3RMSNorm(nn.Module): - def __init__(self, hidden_size, eps=1e-6): - """ - DeepseekV3RMSNorm is equivalent to T5LayerNorm - """ - super().__init__() - self.weight = nn.Parameter(torch.ones(hidden_size)) - self.variance_epsilon = eps - - def forward(self, hidden_states): - input_dtype = hidden_states.dtype - hidden_states = hidden_states.to(torch.float32) - variance = hidden_states.pow(2).mean(-1, keepdim=True) - hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) - return self.weight * hidden_states.to(input_dtype) - - -ALL_LAYERNORM_LAYERS.append(DeepseekV3RMSNorm) - - -class DeepseekV3RotaryEmbedding(nn.Module): - def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): - super().__init__() - - self.dim = dim - self.max_position_embeddings = max_position_embeddings - self.base = base - inv_freq = 1.0 / ( - self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim) - ) - self.register_buffer("inv_freq", inv_freq, persistent=False) - - # Build here to make `torch.jit.trace` work. 
- self._set_cos_sin_cache( - seq_len=max_position_embeddings, - device=self.inv_freq.device, - dtype=torch.get_default_dtype(), - ) - self.max_seq_len_cached = None - - def _set_cos_sin_cache(self, seq_len, device, dtype): - self.max_seq_len_cached = seq_len - t = torch.arange( - self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype - ) - - freqs = torch.outer(t, self.inv_freq.to(t.device)) - # Different from paper, but it uses a different permutation in order to obtain the same calculation - emb = torch.cat((freqs, freqs), dim=-1) - self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) - self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) - - def forward(self, x, seq_len=None): - # x: [bs, num_attention_heads, seq_len, head_size] - if self.max_seq_len_cached is None or seq_len > self.max_seq_len_cached: - self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) - - return ( - self.cos_cached[:seq_len].to(dtype=x.dtype), - self.sin_cached[:seq_len].to(dtype=x.dtype), - ) - - -# Copied from transformers.models.llama.modeling_llama.LlamaLinearScalingRotaryEmbedding with Llama->DeepseekV3 -class DeepseekV3LinearScalingRotaryEmbedding(DeepseekV3RotaryEmbedding): - """DeepseekV3RotaryEmbedding extended with linear scaling. 
Credits to the Reddit user /u/kaiokendev""" - - def __init__( - self, - dim, - max_position_embeddings=2048, - base=10000, - device=None, - scaling_factor=1.0, - ): - self.scaling_factor = scaling_factor - super().__init__(dim, max_position_embeddings, base, device) - - def _set_cos_sin_cache(self, seq_len, device, dtype): - self.max_seq_len_cached = seq_len - t = torch.arange( - self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype - ) - t = t / self.scaling_factor - - freqs = torch.outer(t, self.inv_freq) - # Different from paper, but it uses a different permutation in order to obtain the same calculation - emb = torch.cat((freqs, freqs), dim=-1) - self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) - self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) - - -# Copied from transformers.models.llama.modeling_llama.LlamaDynamicNTKScalingRotaryEmbedding with Llama->DeepseekV3 -class DeepseekV3DynamicNTKScalingRotaryEmbedding(DeepseekV3RotaryEmbedding): - """DeepseekV3RotaryEmbedding extended with Dynamic NTK scaling. 
Credits to the Reddit users /u/bloc97 and /u/emozilla""" - - def __init__( - self, - dim, - max_position_embeddings=2048, - base=10000, - device=None, - scaling_factor=1.0, - ): - self.scaling_factor = scaling_factor - super().__init__(dim, max_position_embeddings, base, device) - - def _set_cos_sin_cache(self, seq_len, device, dtype): - self.max_seq_len_cached = seq_len - - if seq_len > self.max_position_embeddings: - base = self.base * ( - (self.scaling_factor * seq_len / self.max_position_embeddings) - - (self.scaling_factor - 1) - ) ** (self.dim / (self.dim - 2)) - inv_freq = 1.0 / ( - base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim) - ) - self.register_buffer("inv_freq", inv_freq, persistent=False) - - t = torch.arange( - self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype - ) - - freqs = torch.outer(t, self.inv_freq) - # Different from paper, but it uses a different permutation in order to obtain the same calculation - emb = torch.cat((freqs, freqs), dim=-1) - self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) - self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) - - -# Inverse dim formula to find dim based on number of rotations -def yarn_find_correction_dim( - num_rotations, dim, base=10000, max_position_embeddings=2048 -): - return (dim * math.log(max_position_embeddings / (num_rotations * 2 * math.pi))) / ( - 2 * math.log(base) - ) - - -# Find dim range bounds based on rotations -def yarn_find_correction_range( - low_rot, high_rot, dim, base=10000, max_position_embeddings=2048 -): - low = math.floor( - yarn_find_correction_dim(low_rot, dim, base, max_position_embeddings) - ) - high = math.ceil( - yarn_find_correction_dim(high_rot, dim, base, max_position_embeddings) - ) - return max(low, 0), min(high, dim - 1) # Clamp values just in case - - -def yarn_get_mscale(scale=1, mscale=1): - if scale <= 1: - return 1.0 - return 0.1 * mscale * math.log(scale) + 1.0 - - -def 
yarn_linear_ramp_mask(min, max, dim): - if min == max: - max += 0.001 # Prevent singularity - - linear_func = (torch.arange(dim, dtype=torch.float32) - min) / (max - min) - ramp_func = torch.clamp(linear_func, 0, 1) - return ramp_func - - -class DeepseekV3YarnRotaryEmbedding(DeepseekV3RotaryEmbedding): - - def __init__( - self, - dim, - max_position_embeddings=2048, - base=10000, - device=None, - scaling_factor=1.0, - original_max_position_embeddings=4096, - beta_fast=32, - beta_slow=1, - mscale=1, - mscale_all_dim=0, - ): - self.scaling_factor = scaling_factor - self.original_max_position_embeddings = original_max_position_embeddings - self.beta_fast = beta_fast - self.beta_slow = beta_slow - self.mscale = mscale - self.mscale_all_dim = mscale_all_dim - super().__init__(dim, max_position_embeddings, base, device) - - def _set_cos_sin_cache(self, seq_len, device, dtype): - self.max_seq_len_cached = seq_len - dim = self.dim - - freq_extra = 1.0 / ( - self.base - ** (torch.arange(0, dim, 2, dtype=torch.float32, device=device) / dim) - ) - freq_inter = 1.0 / ( - self.scaling_factor - * self.base - ** (torch.arange(0, dim, 2, dtype=torch.float32, device=device) / dim) - ) - - low, high = yarn_find_correction_range( - self.beta_fast, - self.beta_slow, - dim, - self.base, - self.original_max_position_embeddings, - ) - inv_freq_mask = 1.0 - yarn_linear_ramp_mask(low, high, dim // 2).to( - device=device, dtype=torch.float32 - ) - inv_freq = freq_inter * (1 - inv_freq_mask) + freq_extra * inv_freq_mask - self.register_buffer("inv_freq", inv_freq, persistent=False) - - t = torch.arange(seq_len, device=device, dtype=torch.float32) - - freqs = torch.outer(t, inv_freq) - - _mscale = float( - yarn_get_mscale(self.scaling_factor, self.mscale) - / yarn_get_mscale(self.scaling_factor, self.mscale_all_dim) - ) - - emb = torch.cat((freqs, freqs), dim=-1) - self.register_buffer( - "cos_cached", (emb.cos() * _mscale).to(dtype), persistent=False - ) - self.register_buffer( - 
"sin_cached", (emb.sin() * _mscale).to(dtype), persistent=False - ) - - -# Copied from transformers.models.llama.modeling_llama.rotate_half -def rotate_half(x): - """Rotates half the hidden dims of the input.""" - x1 = x[..., : x.shape[-1] // 2] - x2 = x[..., x.shape[-1] // 2 :] - return torch.cat((-x2, x1), dim=-1) - - -# Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb -def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): - """Applies Rotary Position Embedding to the query and key tensors. - - Args: - q (`torch.Tensor`): The query tensor. - k (`torch.Tensor`): The key tensor. - cos (`torch.Tensor`): The cosine part of the rotary embedding. - sin (`torch.Tensor`): The sine part of the rotary embedding. - position_ids (`torch.Tensor`): - The position indices of the tokens corresponding to the query and key tensors. For example, this can be - used to pass offsetted position ids when working with a KV-cache. - unsqueeze_dim (`int`, *optional*, defaults to 1): - The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and - sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note - that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and - k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes - cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have - the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. - Returns: - `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. 
- """ - cos = cos[position_ids].unsqueeze(unsqueeze_dim) - sin = sin[position_ids].unsqueeze(unsqueeze_dim) - - b, h, s, d = q.shape - q = q.view(b, h, s, d // 2, 2).transpose(4, 3).reshape(b, h, s, d) - - b, h, s, d = k.shape - k = k.view(b, h, s, d // 2, 2).transpose(4, 3).reshape(b, h, s, d) - - q_embed = (q * cos) + (rotate_half(q) * sin) - k_embed = (k * cos) + (rotate_half(k) * sin) - return q_embed, k_embed - - -class DeepseekV3MLP(nn.Module): - def __init__(self, config, hidden_size=None, intermediate_size=None): - super().__init__() - self.config = config - self.hidden_size = config.hidden_size if hidden_size is None else hidden_size - self.intermediate_size = ( - config.intermediate_size if intermediate_size is None else intermediate_size - ) - - self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) - self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) - self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) - self.act_fn = ACT2FN[config.hidden_act] - - def forward(self, x): - down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) - return down_proj - - -class MoEGate(nn.Module): - def __init__(self, config): - super().__init__() - self.config = config - self.top_k = config.num_experts_per_tok - self.n_routed_experts = config.n_routed_experts - self.routed_scaling_factor = config.routed_scaling_factor - self.scoring_func = config.scoring_func - self.topk_method = config.topk_method - self.n_group = config.n_group - self.topk_group = config.topk_group - - # topk selection algorithm - self.norm_topk_prob = config.norm_topk_prob - self.gating_dim = config.hidden_size - self.weight = nn.Parameter( - torch.empty((self.n_routed_experts, self.gating_dim)) - ) - if self.topk_method == "noaux_tc": - self.e_score_correction_bias = nn.Parameter( - torch.empty((self.n_routed_experts)) - ) - self.reset_parameters() - - def reset_parameters(self) -> None: - import 
torch.nn.init as init - - init.kaiming_uniform_(self.weight, a=math.sqrt(5)) - - def forward(self, hidden_states): - bsz, seq_len, h = hidden_states.shape - ### compute gating score - hidden_states = hidden_states.view(-1, h) - logits = F.linear( - hidden_states.type(torch.float32), self.weight.type(torch.float32), None - ) - if self.scoring_func == "sigmoid": - scores = logits.sigmoid() - else: - raise NotImplementedError( - f"insupportable scoring function for MoE gating: {self.scoring_func}" - ) - - ### select top-k experts - if self.topk_method == "noaux_tc": - assert not self.training - scores_for_choice = scores.view(bsz * seq_len, -1) + self.e_score_correction_bias.unsqueeze(0) - group_scores = ( - scores_for_choice.view(bsz * seq_len, self.n_group, -1).topk(2, dim=-1)[0].sum(dim = -1) - ) # [n, n_group] - group_idx = torch.topk( - group_scores, k=self.topk_group, dim=-1, sorted=False - )[ - 1 - ] # [n, top_k_group] - group_mask = torch.zeros_like(group_scores) # [n, n_group] - group_mask.scatter_(1, group_idx, 1) # [n, n_group] - score_mask = ( - group_mask.unsqueeze(-1) - .expand( - bsz * seq_len, self.n_group, self.n_routed_experts // self.n_group - ) - .reshape(bsz * seq_len, -1) - ) # [n, e] - tmp_scores = scores_for_choice.masked_fill(~score_mask.bool(), float("-inf")) # [n, e] - _, topk_idx = torch.topk( - tmp_scores, k=self.top_k, dim=-1, sorted=False - ) - topk_weight = scores.gather(1, topk_idx) - else: - raise NotImplementedError( - f"insupportable TopK function for MoE gating: {self.topk_method}" - ) - - ### norm gate to sum 1 - if self.top_k > 1 and self.norm_topk_prob: - denominator = topk_weight.sum(dim=-1, keepdim=True) + 1e-20 - topk_weight = topk_weight / denominator - topk_weight = topk_weight * self.routed_scaling_factor # must multiply the scaling factor - - return topk_idx, topk_weight - -class DeepseekV3MoE(nn.Module): - """ - A mixed expert module containing shared experts. 
- """ - - def __init__(self, config): - super().__init__() - self.config = config - self.num_experts_per_tok = config.num_experts_per_tok - - if hasattr(config, "ep_size") and config.ep_size > 1: - assert config.ep_size == dist.get_world_size() - self.ep_size = config.ep_size - self.experts_per_rank = config.n_routed_experts // config.ep_size - self.ep_rank = dist.get_rank() - self.experts = nn.ModuleList( - [ - ( - DeepseekV3MLP( - config, intermediate_size=config.moe_intermediate_size - ) - if i >= self.ep_rank * self.experts_per_rank - and i < (self.ep_rank + 1) * self.experts_per_rank - else None - ) - for i in range(config.n_routed_experts) - ] - ) - else: - self.ep_size = 1 - self.experts_per_rank = config.n_routed_experts - self.ep_rank = 0 - self.experts = nn.ModuleList( - [ - DeepseekV3MLP( - config, intermediate_size=config.moe_intermediate_size - ) - for i in range(config.n_routed_experts) - ] - ) - self.gate = MoEGate(config) - if config.n_shared_experts is not None: - intermediate_size = config.moe_intermediate_size * config.n_shared_experts - self.shared_experts = DeepseekV3MLP( - config=config, intermediate_size=intermediate_size - ) - - def forward(self, hidden_states): - identity = hidden_states - orig_shape = hidden_states.shape - topk_idx, topk_weight = self.gate(hidden_states) - hidden_states = hidden_states.view(-1, hidden_states.shape[-1]) - flat_topk_idx = topk_idx.view(-1) - if not self.training: - y = self.moe_infer(hidden_states, topk_idx, topk_weight).view(*orig_shape) - if self.config.n_shared_experts is not None: - y = y + self.shared_experts(identity) - return y - - @torch.no_grad() - def moe_infer(self, x, topk_ids, topk_weight): - cnts = topk_ids.new_zeros((topk_ids.shape[0], len(self.experts))) - cnts.scatter_(1, topk_ids, 1) - tokens_per_expert = cnts.sum(dim=0) - idxs = topk_ids.view(-1).argsort() - sorted_tokens = x[idxs // topk_ids.shape[1]] - sorted_tokens_shape = sorted_tokens.shape - if self.ep_size > 1: - tokens_per_ep_rank 
= tokens_per_expert.view(self.ep_size, -1).sum(dim=1) - tokens_per_expert_group = tokens_per_expert.new_empty( - tokens_per_expert.shape[0] - ) - dist.all_to_all_single(tokens_per_expert_group, tokens_per_expert) - output_splits = ( - tokens_per_expert_group.view(self.ep_size, -1) - .sum(1) - .cpu() - .numpy() - .tolist() - ) - gathered_tokens = sorted_tokens.new_empty( - tokens_per_expert_group.sum(dim=0).cpu().item(), sorted_tokens.shape[1] - ) - input_split_sizes = tokens_per_ep_rank.cpu().numpy().tolist() - dist.all_to_all( - list(gathered_tokens.split(output_splits)), - list(sorted_tokens.split(input_split_sizes)), - ) - tokens_per_expert_post_gather = tokens_per_expert_group.view( - self.ep_size, self.experts_per_rank - ).sum(dim=0) - gatherd_idxs = np.zeros(shape=(gathered_tokens.shape[0],), dtype=np.int32) - s = 0 - for i, k in enumerate(tokens_per_expert_group.cpu().numpy()): - gatherd_idxs[s : s + k] = i % self.experts_per_rank - s += k - gatherd_idxs = gatherd_idxs.argsort() - sorted_tokens = gathered_tokens[gatherd_idxs] - tokens_per_expert = tokens_per_expert_post_gather - tokens_per_expert = tokens_per_expert.cpu().numpy() - - outputs = [] - start_idx = 0 - for i, num_tokens in enumerate(tokens_per_expert): - end_idx = start_idx + num_tokens - if num_tokens == 0: - continue - expert = self.experts[i + self.ep_rank * self.experts_per_rank] - tokens_for_this_expert = sorted_tokens[start_idx:end_idx] - expert_out = expert(tokens_for_this_expert) - outputs.append(expert_out) - start_idx = end_idx - - outs = torch.cat(outputs, dim=0) if len(outputs) else sorted_tokens.new_empty(0) - if self.ep_size > 1: - new_x = torch.empty_like(outs) - new_x[gatherd_idxs] = outs - gathered_tokens = new_x.new_empty(*sorted_tokens_shape) - dist.all_to_all( - list(gathered_tokens.split(input_split_sizes)), - list(new_x.split(output_splits)), - ) - outs = gathered_tokens - - new_x = torch.empty_like(outs) - new_x[idxs] = outs - final_out = ( - new_x.view(*topk_ids.shape, -1) 
- .type(topk_weight.dtype) - .mul_(topk_weight.unsqueeze(dim=-1)) - .sum(dim=1) - .type(new_x.dtype) - ) - return final_out - - -# Copied from transformers.models.llama.modeling_llama.repeat_kv -def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: - """ - This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, - num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) - """ - batch, num_key_value_heads, slen, head_dim = hidden_states.shape - if n_rep == 1: - return hidden_states - hidden_states = hidden_states[:, :, None, :, :].expand( - batch, num_key_value_heads, n_rep, slen, head_dim - ) - return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) - - -# Copied from transformers.models.llama.modeling_llama.LlamaAttention with Llama->DeepseekV3 -class DeepseekV3Attention(nn.Module): - """Multi-headed attention from 'Attention Is All You Need' paper""" - - def __init__(self, config: DeepseekV3Config, layer_idx: Optional[int] = None): - super().__init__() - self.config = config - self.layer_idx = layer_idx - if layer_idx is None: - logger.warning_once( - f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will " - "to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` " - "when creating this class." 
- ) - - self.attention_dropout = config.attention_dropout - self.hidden_size = config.hidden_size - self.num_heads = config.num_attention_heads - - self.max_position_embeddings = config.max_position_embeddings - self.rope_theta = config.rope_theta - self.q_lora_rank = config.q_lora_rank - self.qk_rope_head_dim = config.qk_rope_head_dim - self.kv_lora_rank = config.kv_lora_rank - self.v_head_dim = config.v_head_dim - self.qk_nope_head_dim = config.qk_nope_head_dim - self.q_head_dim = config.qk_nope_head_dim + config.qk_rope_head_dim - - self.is_causal = True - - if self.q_lora_rank is None: - self.q_proj = nn.Linear( - self.hidden_size, self.num_heads * self.q_head_dim, bias=False - ) - else: - self.q_a_proj = nn.Linear( - self.hidden_size, config.q_lora_rank, bias=config.attention_bias - ) - self.q_a_layernorm = DeepseekV3RMSNorm(config.q_lora_rank) - self.q_b_proj = nn.Linear( - config.q_lora_rank, self.num_heads * self.q_head_dim, bias=False - ) - - self.kv_a_proj_with_mqa = nn.Linear( - self.hidden_size, - config.kv_lora_rank + config.qk_rope_head_dim, - bias=config.attention_bias, - ) - self.kv_a_layernorm = DeepseekV3RMSNorm(config.kv_lora_rank) - self.kv_b_proj = nn.Linear( - config.kv_lora_rank, - self.num_heads - * (self.q_head_dim - self.qk_rope_head_dim + self.v_head_dim), - bias=False, - ) - - self.o_proj = nn.Linear( - self.num_heads * self.v_head_dim, - self.hidden_size, - bias=config.attention_bias, - ) - self._init_rope() - - self.softmax_scale = self.q_head_dim ** (-0.5) - if self.config.rope_scaling is not None: - mscale_all_dim = self.config.rope_scaling.get("mscale_all_dim", 0) - scaling_factor = self.config.rope_scaling["factor"] - if mscale_all_dim: - mscale = yarn_get_mscale(scaling_factor, mscale_all_dim) - self.softmax_scale = self.softmax_scale * mscale * mscale - - def _init_rope(self): - if self.config.rope_scaling is None: - self.rotary_emb = DeepseekV3RotaryEmbedding( - self.qk_rope_head_dim, - 
max_position_embeddings=self.max_position_embeddings, - base=self.rope_theta, - ) - else: - scaling_type = self.config.rope_scaling["type"] - scaling_factor = self.config.rope_scaling["factor"] - if scaling_type == "linear": - self.rotary_emb = DeepseekV3LinearScalingRotaryEmbedding( - self.qk_rope_head_dim, - max_position_embeddings=self.max_position_embeddings, - scaling_factor=scaling_factor, - base=self.rope_theta, - ) - elif scaling_type == "dynamic": - self.rotary_emb = DeepseekV3DynamicNTKScalingRotaryEmbedding( - self.qk_rope_head_dim, - max_position_embeddings=self.max_position_embeddings, - scaling_factor=scaling_factor, - base=self.rope_theta, - ) - elif scaling_type == "yarn": - kwargs = { - key: self.config.rope_scaling[key] - for key in [ - "original_max_position_embeddings", - "beta_fast", - "beta_slow", - "mscale", - "mscale_all_dim", - ] - if key in self.config.rope_scaling - } - self.rotary_emb = DeepseekV3YarnRotaryEmbedding( - self.qk_rope_head_dim, - max_position_embeddings=self.max_position_embeddings, - scaling_factor=scaling_factor, - base=self.rope_theta, - **kwargs, - ) - else: - raise ValueError(f"Unknown RoPE scaling type {scaling_type}") - - def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): - return ( - tensor.view(bsz, seq_len, self.num_heads, self.v_head_dim) - .transpose(1, 2) - .contiguous() - ) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Cache] = None, - output_attentions: bool = False, - use_cache: bool = False, - **kwargs, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. 
Please make sure use `attention_mask` instead.`" - ) - bsz, q_len, _ = hidden_states.size() - - if self.q_lora_rank is None: - q = self.q_proj(hidden_states) - else: - q = self.q_b_proj(self.q_a_layernorm(self.q_a_proj(hidden_states))) - q = q.view(bsz, q_len, self.num_heads, self.q_head_dim).transpose(1, 2) - q_nope, q_pe = torch.split( - q, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1 - ) - - compressed_kv = self.kv_a_proj_with_mqa(hidden_states) - compressed_kv, k_pe = torch.split( - compressed_kv, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1 - ) - k_pe = k_pe.view(bsz, q_len, 1, self.qk_rope_head_dim).transpose(1, 2) - kv = ( - self.kv_b_proj(self.kv_a_layernorm(compressed_kv)) - .view(bsz, q_len, self.num_heads, self.qk_nope_head_dim + self.v_head_dim) - .transpose(1, 2) - ) - - k_nope, value_states = torch.split( - kv, [self.qk_nope_head_dim, self.v_head_dim], dim=-1 - ) - kv_seq_len = value_states.shape[-2] - if past_key_value is not None: - if self.layer_idx is None: - raise ValueError( - f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " - "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " - "with a layer index." 
- ) - kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - - q_pe, k_pe = apply_rotary_pos_emb(q_pe, k_pe, cos, sin, position_ids) - - query_states = k_pe.new_empty(bsz, self.num_heads, q_len, self.q_head_dim) - query_states[:, :, :, : self.qk_nope_head_dim] = q_nope - query_states[:, :, :, self.qk_nope_head_dim :] = q_pe - - key_states = k_pe.new_empty(bsz, self.num_heads, q_len, self.q_head_dim) - key_states[:, :, :, : self.qk_nope_head_dim] = k_nope - key_states[:, :, :, self.qk_nope_head_dim :] = k_pe - if past_key_value is not None: - cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models - key_states, value_states = past_key_value.update( - key_states, value_states, self.layer_idx, cache_kwargs - ) - - attn_weights = ( - torch.matmul(query_states, key_states.transpose(2, 3)) * self.softmax_scale - ) - - if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): - raise ValueError( - f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is" - f" {attn_weights.size()}" - ) - assert attention_mask is not None - if attention_mask is not None: - if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): - raise ValueError( - f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" - ) - attn_weights = attn_weights + attention_mask - - # upcast attention to fp32 - attn_weights = nn.functional.softmax( - attn_weights, dim=-1, dtype=torch.float32 - ).to(query_states.dtype) - attn_weights = nn.functional.dropout( - attn_weights, p=self.attention_dropout, training=self.training - ) - attn_output = torch.matmul(attn_weights, value_states) - - if attn_output.size() != (bsz, self.num_heads, q_len, self.v_head_dim): - raise ValueError( - f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.v_head_dim)}, but is" - f" {attn_output.size()}" - ) - - attn_output = 
attn_output.transpose(1, 2).contiguous() - - attn_output = attn_output.reshape(bsz, q_len, self.num_heads * self.v_head_dim) - - attn_output = self.o_proj(attn_output) - - if not output_attentions: - attn_weights = None - - return attn_output, attn_weights, past_key_value - - -# Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2 with Llama->DeepseekV3 -class DeepseekV3FlashAttention2(DeepseekV3Attention): - """ - DeepseekV3 flash attention module. This module inherits from `DeepseekV3Attention` as the weights of the module stays - untouched. The only required change would be on the forward pass where it needs to correctly call the public API of - flash attention and deal with padding tokens in case the input contains any of them. - """ - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. - # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. - # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left). - self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10() - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.LongTensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Cache] = None, - output_attentions: bool = False, - use_cache: bool = False, - **kwargs, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - # DeepseekV3FlashAttention2 attention does not support output_attentions - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. 
Please make sure use `attention_mask` instead.`" - ) - - # overwrite attention_mask with padding_mask - attention_mask = kwargs.pop("padding_mask") - - output_attentions = False - - bsz, q_len, _ = hidden_states.size() - - if self.q_lora_rank is None: - q = self.q_proj(hidden_states) - else: - q = self.q_b_proj(self.q_a_layernorm(self.q_a_proj(hidden_states))) - q = q.view(bsz, q_len, self.num_heads, self.q_head_dim).transpose(1, 2) - q_nope, q_pe = torch.split( - q, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1 - ) - - # Flash attention requires the input to have the shape - # batch_size x seq_length x head_dim x hidden_dim - # therefore we just need to keep the original shape - compressed_kv = self.kv_a_proj_with_mqa(hidden_states) - compressed_kv, k_pe = torch.split( - compressed_kv, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1 - ) - k_pe = k_pe.view(bsz, q_len, 1, self.qk_rope_head_dim).transpose(1, 2) - kv = ( - self.kv_b_proj(self.kv_a_layernorm(compressed_kv)) - .view(bsz, q_len, self.num_heads, self.qk_nope_head_dim + self.v_head_dim) - .transpose(1, 2) - ) - - k_nope, value_states = torch.split( - kv, [self.qk_nope_head_dim, self.v_head_dim], dim=-1 - ) - kv_seq_len = value_states.shape[-2] - - kv_seq_len = value_states.shape[-2] - if past_key_value is not None: - kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - q_pe, k_pe = apply_rotary_pos_emb(q_pe, k_pe, cos, sin, position_ids) - - query_states = k_pe.new_empty(bsz, self.num_heads, q_len, self.q_head_dim) - query_states[:, :, :, : self.qk_nope_head_dim] = q_nope - query_states[:, :, :, self.qk_nope_head_dim :] = q_pe - - key_states = k_pe.new_empty(bsz, self.num_heads, q_len, self.q_head_dim) - key_states[:, :, :, : self.qk_nope_head_dim] = k_nope - key_states[:, :, :, self.qk_nope_head_dim :] = k_pe - - if self.q_head_dim != self.v_head_dim: - value_states = F.pad(value_states, [0, 
self.q_head_dim - self.v_head_dim]) - - if past_key_value is not None: - cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models - key_states, value_states = past_key_value.update( - key_states, value_states, self.layer_idx, cache_kwargs - ) - - # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache - # to be able to avoid many of these transpose/reshape/view. - query_states = query_states.transpose(1, 2) - key_states = key_states.transpose(1, 2) - value_states = value_states.transpose(1, 2) - - dropout_rate = self.attention_dropout if self.training else 0.0 - - # In PEFT, usually we cast the layer norms in float32 for training stability reasons - # therefore the input hidden states gets silently casted in float32. Hence, we need - # cast them back in the correct dtype just to be sure everything works as expected. - # This might slowdown training & inference so it is recommended to not cast the LayerNorms - # in fp32. (DeepseekV3RMSNorm handles it correctly) - - input_dtype = query_states.dtype - if input_dtype == torch.float32: - # Handle the case where the model is quantized - if hasattr(self.config, "_pre_quantization_dtype"): - target_dtype = self.config._pre_quantization_dtype - elif torch.is_autocast_enabled(): - target_dtype = torch.get_autocast_gpu_dtype() - else: - target_dtype = ( - self.q_proj.weight.dtype - if self.q_lora_rank is None - else self.q_a_proj.weight.dtype - ) - - logger.warning_once( - f"The input hidden states seems to be silently casted in float32, this might be related to" - f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" - f" {target_dtype}." 
- ) - - query_states = query_states.to(target_dtype) - key_states = key_states.to(target_dtype) - value_states = value_states.to(target_dtype) - - attn_output = self._flash_attention_forward( - query_states, - key_states, - value_states, - attention_mask, - q_len, - dropout=dropout_rate, - softmax_scale=self.softmax_scale, - ) - if self.q_head_dim != self.v_head_dim: - attn_output = attn_output[:, :, :, : self.v_head_dim] - - attn_output = attn_output.reshape( - bsz, q_len, self.num_heads * self.v_head_dim - ).contiguous() - attn_output = self.o_proj(attn_output) - - if not output_attentions: - attn_weights = None - - return attn_output, attn_weights, past_key_value - - def _flash_attention_forward( - self, - query_states, - key_states, - value_states, - attention_mask, - query_length, - dropout=0.0, - softmax_scale=None, - ): - """ - Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token - first unpad the input, then computes the attention scores and pad the final attention scores. - - Args: - query_states (`torch.Tensor`): - Input query states to be passed to Flash Attention API - key_states (`torch.Tensor`): - Input key states to be passed to Flash Attention API - value_states (`torch.Tensor`): - Input value states to be passed to Flash Attention API - attention_mask (`torch.Tensor`): - The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the - position of padding tokens and 1 for the position of non-padding tokens. - dropout (`int`, *optional*): - Attention dropout - softmax_scale (`float`, *optional*): - The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim) - """ - if not self._flash_attn_uses_top_left_mask: - causal = self.is_causal - else: - # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in DeepseekV3FlashAttention2 __init__. 
- causal = self.is_causal and query_length != 1 - - # Contains at least one padding token in the sequence - if attention_mask is not None: - batch_size = query_states.shape[0] - ( - query_states, - key_states, - value_states, - indices_q, - cu_seq_lens, - max_seq_lens, - ) = self._upad_input( - query_states, key_states, value_states, attention_mask, query_length - ) - - cu_seqlens_q, cu_seqlens_k = cu_seq_lens - max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens - - attn_output_unpad = flash_attn_varlen_func( - query_states, - key_states, - value_states, - cu_seqlens_q=cu_seqlens_q, - cu_seqlens_k=cu_seqlens_k, - max_seqlen_q=max_seqlen_in_batch_q, - max_seqlen_k=max_seqlen_in_batch_k, - dropout_p=dropout, - softmax_scale=softmax_scale, - causal=causal, - ) - - attn_output = pad_input( - attn_output_unpad, indices_q, batch_size, query_length - ) - else: - attn_output = flash_attn_func( - query_states, - key_states, - value_states, - dropout, - softmax_scale=softmax_scale, - causal=causal, - ) - - return attn_output - - def _upad_input( - self, query_layer, key_layer, value_layer, attention_mask, query_length - ): - indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) - batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape - - key_layer = index_first_axis( - key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), - indices_k, - ) - value_layer = index_first_axis( - value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), - indices_k, - ) - if query_length == kv_seq_len: - query_layer = index_first_axis( - query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim), - indices_k, - ) - cu_seqlens_q = cu_seqlens_k - max_seqlen_in_batch_q = max_seqlen_in_batch_k - indices_q = indices_k - elif query_length == 1: - max_seqlen_in_batch_q = 1 - cu_seqlens_q = torch.arange( - batch_size + 1, dtype=torch.int32, device=query_layer.device - ) # There is a memcpy here, that 
is very bad. - indices_q = cu_seqlens_q[:-1] - query_layer = query_layer.squeeze(1) - else: - # The -q_len: slice assumes left padding. - attention_mask = attention_mask[:, -query_length:] - query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input( - query_layer, attention_mask - ) - - return ( - query_layer, - key_layer, - value_layer, - indices_q, - (cu_seqlens_q, cu_seqlens_k), - (max_seqlen_in_batch_q, max_seqlen_in_batch_k), - ) - - -ATTENTION_CLASSES = { - "eager": DeepseekV3Attention, - "flash_attention_2": DeepseekV3FlashAttention2, -} - - -class DeepseekV3DecoderLayer(nn.Module): - def __init__(self, config: DeepseekV3Config, layer_idx: int): - super().__init__() - self.hidden_size = config.hidden_size - - self.self_attn = ATTENTION_CLASSES[config._attn_implementation]( - config=config, layer_idx=layer_idx - ) - - self.mlp = ( - DeepseekV3MoE(config) - if ( - config.n_routed_experts is not None - and layer_idx >= config.first_k_dense_replace - and layer_idx % config.moe_layer_freq == 0 - ) - else DeepseekV3MLP(config) - ) - self.input_layernorm = DeepseekV3RMSNorm( - config.hidden_size, eps=config.rms_norm_eps - ) - self.post_attention_layernorm = DeepseekV3RMSNorm( - config.hidden_size, eps=config.rms_norm_eps - ) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - output_attentions: Optional[bool] = False, - use_cache: Optional[bool] = False, - **kwargs, - ) -> Tuple[ - torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]] - ]: - """ - Args: - hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` - attention_mask (`torch.FloatTensor`, *optional*): - attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1, - query_sequence_length, key_sequence_length)` if default attention is 
used. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding - (see `past_key_values`). - past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states - """ - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" - ) - residual = hidden_states - - hidden_states = self.input_layernorm(hidden_states) - - # Self Attention - hidden_states, self_attn_weights, present_key_value = self.self_attn( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - **kwargs, - ) - hidden_states = residual + hidden_states - - # Fully Connected - residual = hidden_states - hidden_states = self.post_attention_layernorm(hidden_states) - hidden_states = self.mlp(hidden_states) - hidden_states = residual + hidden_states - - outputs = (hidden_states,) - - if output_attentions: - outputs += (self_attn_weights,) - - if use_cache: - outputs += (present_key_value,) - - return outputs - - -DeepseekV3_START_DOCSTRING = r""" - This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the - library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads - etc.) - - This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. - Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage - and behavior. 
- - Parameters: - config ([`DeepseekV3Config`]): - Model configuration class with all the parameters of the model. Initializing with a config file does not - load the weights associated with the model, only the configuration. Check out the - [`~PreTrainedModel.from_pretrained`] method to load the model weights. -""" - - -@add_start_docstrings( - "The bare DeepseekV3 Model outputting raw hidden-states without any specific head on top.", - DeepseekV3_START_DOCSTRING, -) -class DeepseekV3PreTrainedModel(PreTrainedModel): - config_class = DeepseekV3Config - base_model_prefix = "model" - supports_gradient_checkpointing = True - _no_split_modules = ["DeepseekV3DecoderLayer"] - _skip_keys_device_placement = "past_key_values" - _supports_flash_attn_2 = True - _supports_cache_class = True - - def _init_weights(self, module): - std = self.config.initializer_range - if isinstance(module, nn.Linear): - module.weight.data.normal_(mean=0.0, std=std) - if module.bias is not None: - module.bias.data.zero_() - elif isinstance(module, nn.Embedding): - module.weight.data.normal_(mean=0.0, std=std) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() - - -DeepseekV3_INPUTS_DOCSTRING = r""" - Args: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide - it. - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - [What are input IDs?](../glossary#input-ids) - attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - [What are attention masks?](../glossary#attention-mask) - - Indices can be obtained using [`AutoTokenizer`]. 
See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - If `past_key_values` is used, optionally only the last `input_ids` have to be input (see - `past_key_values`). - - If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] - and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more - information on the default strategy. - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, - config.n_positions - 1]`. - - [What are position IDs?](../glossary#position-ids) - past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*): - Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention - blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values` - returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. - - Two formats are allowed: - - a [`~cache_utils.Cache`] instance; - - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of - shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy - cache format. - - The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the - legacy cache format will be returned. - - If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't - have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` - of shape `(batch_size, sequence_length)`. 
- inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): - Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This - is useful if you want more control over how to convert `input_ids` indices into associated vectors than the - model's internal embedding lookup matrix. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see - `past_key_values`). - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned - tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. -""" - - -@add_start_docstrings( - "The bare DeepseekV3 Model outputting raw hidden-states without any specific head on top.", - DeepseekV3_START_DOCSTRING, -) -class DeepseekV3Model(DeepseekV3PreTrainedModel): - """ - Transformer decoder consisting of *config.num_hidden_layers* layers. 
Each layer is a [`DeepseekV3DecoderLayer`] - - Args: - config: DeepseekV3Config - """ - - def __init__(self, config: DeepseekV3Config): - super().__init__(config) - self.padding_idx = config.pad_token_id - self.vocab_size = config.vocab_size - - self.embed_tokens = nn.Embedding( - config.vocab_size, config.hidden_size, self.padding_idx - ) - self.layers = nn.ModuleList( - [ - DeepseekV3DecoderLayer(config, layer_idx) - for layer_idx in range(config.num_hidden_layers) - ] - ) - self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2" - self.norm = DeepseekV3RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - self.gradient_checkpointing = False - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - - @add_start_docstrings_to_model_forward(DeepseekV3_INPUTS_DOCSTRING) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPast]: - output_attentions = ( - output_attentions - if output_attentions is not None - else self.config.output_attentions - ) - output_hidden_states = ( - output_hidden_states - if output_hidden_states is not None - else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - - return_dict = ( - return_dict if return_dict is not None else self.config.use_return_dict - ) - - # retrieve input_ids and inputs_embeds - if input_ids is not None and inputs_embeds is not None: - raise ValueError( - "You 
cannot specify both input_ids and inputs_embeds at the same time" - ) - elif input_ids is not None: - batch_size, seq_length = input_ids.shape[:2] - elif inputs_embeds is not None: - batch_size, seq_length = inputs_embeds.shape[:2] - else: - raise ValueError("You have to specify either input_ids or inputs_embeds") - - past_key_values_length = 0 - if use_cache: - use_legacy_cache = not isinstance(past_key_values, Cache) - if use_legacy_cache: - past_key_values = DynamicCache.from_legacy_cache(past_key_values) - past_key_values_length = past_key_values.get_usable_length(seq_length) - - if position_ids is None: - device = input_ids.device if input_ids is not None else inputs_embeds.device - position_ids = torch.arange( - past_key_values_length, - seq_length + past_key_values_length, - dtype=torch.long, - device=device, - ) - position_ids = position_ids.unsqueeze(0) - - if inputs_embeds is None: - inputs_embeds = self.embed_tokens(input_ids) - - if self._use_flash_attention_2: - # 2d mask is passed through the layers - attention_mask = ( - attention_mask - if (attention_mask is not None and 0 in attention_mask) - else None - ) - else: - # 4d mask is passed through the layers - attention_mask = _prepare_4d_causal_attention_mask( - attention_mask, - (batch_size, seq_length), - inputs_embeds, - past_key_values_length, - ) - - # embed positions - hidden_states = inputs_embeds - - # decoder layers - all_hidden_states = () if output_hidden_states else None - all_self_attns = () if output_attentions else None - next_decoder_cache = None - - for decoder_layer in self.layers: - if output_hidden_states: - all_hidden_states += (hidden_states,) - - layer_outputs = decoder_layer( - hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_values, - output_attentions=output_attentions, - use_cache=use_cache, - ) - - hidden_states = layer_outputs[0] - - if use_cache: - next_decoder_cache = layer_outputs[2 if output_attentions else 1] - - 
if output_attentions: - all_self_attns += (layer_outputs[1],) - - hidden_states = self.norm(hidden_states) - - # add hidden states from the last decoder layer - if output_hidden_states: - all_hidden_states += (hidden_states,) - - next_cache = None - if use_cache: - next_cache = ( - next_decoder_cache.to_legacy_cache() - if use_legacy_cache - else next_decoder_cache - ) - if not return_dict: - return tuple( - v - for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] - if v is not None - ) - return BaseModelOutputWithPast( - last_hidden_state=hidden_states, - past_key_values=next_cache, - hidden_states=all_hidden_states, - attentions=all_self_attns, - ) - - -class DeepseekV3ForCausalLM(DeepseekV3PreTrainedModel): - _tied_weights_keys = ["lm_head.weight"] - - def __init__(self, config): - super().__init__(config) - self.model = DeepseekV3Model(config) - self.vocab_size = config.vocab_size - self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - - def set_decoder(self, decoder): - self.model = decoder - - def get_decoder(self): - return self.model - - @add_start_docstrings_to_model_forward(DeepseekV3_INPUTS_DOCSTRING) - @replace_return_docstrings( - output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC - ) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - 
output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, CausalLMOutputWithPast]: - r""" - Args: - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the masked language modeling loss. Indices should either be in `[0, transformers., - config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored - (masked), the loss is only computed for the tokens with labels in `[0, transformers., config.vocab_size]`. - - Returns: - - Example: - - ```python - >>> from transformers import AutoTokenizer, DeepseekV3ForCausalLM - - >>> model = DeepseekV3ForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS) - >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER) - - >>> prompt = "Hey, are you conscious? Can you talk to me?" - >>> inputs = tokenizer(prompt, return_tensors="pt") - - >>> # Generate - >>> generate_ids = model.generate(inputs.input_ids, max_length=30) - >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] - "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." 
- ```""" - output_attentions = ( - output_attentions - if output_attentions is not None - else self.config.output_attentions - ) - output_hidden_states = ( - output_hidden_states - if output_hidden_states is not None - else self.config.output_hidden_states - ) - return_dict = ( - return_dict if return_dict is not None else self.config.use_return_dict - ) - - # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) - outputs = self.model( - input_ids=input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - hidden_states = outputs[0] - logits = self.lm_head(hidden_states) - logits = logits.float() - - loss = None - if labels is not None: - # Shift so that tokens < n predict n - shift_logits = logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - loss_fct = CrossEntropyLoss() - shift_logits = shift_logits.view(-1, self.config.vocab_size) - shift_labels = shift_labels.view(-1) - # Enable model parallelism - shift_labels = shift_labels.to(shift_logits.device) - loss = loss_fct(shift_logits, shift_labels) - - if not return_dict: - output = (logits,) + outputs[1:] - return (loss,) + output if loss is not None else output - - return CausalLMOutputWithPast( - loss=loss, - logits=logits, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - def prepare_inputs_for_generation( - self, - input_ids, - past_key_values=None, - attention_mask=None, - inputs_embeds=None, - **kwargs, - ): - if past_key_values is not None: - if isinstance(past_key_values, Cache): - cache_length = past_key_values.get_seq_length() - past_length = past_key_values.seen_tokens - max_cache_length = past_key_values.get_max_length() - else: - 
cache_length = past_length = past_key_values[0][0].shape[2] - max_cache_length = None - - # Keep only the unprocessed tokens: - # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where - # some of the inputs are exclusivelly passed as part of the cache (e.g. when passing input_embeds as - # input) - if ( - attention_mask is not None - and attention_mask.shape[1] > input_ids.shape[1] - ): - input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] - # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard - # input_ids based on the past_length. - elif past_length < input_ids.shape[1]: - input_ids = input_ids[:, past_length:] - # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. - - # If we are about to go beyond the maximum cache length, we need to crop the input attention mask. - if ( - max_cache_length is not None - and attention_mask is not None - and cache_length + input_ids.shape[1] > max_cache_length - ): - attention_mask = attention_mask[:, -max_cache_length:] - - position_ids = kwargs.get("position_ids", None) - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - position_ids = position_ids[:, -input_ids.shape[1] :] - - # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and past_key_values is None: - model_inputs = {"inputs_embeds": inputs_embeds} - else: - model_inputs = {"input_ids": input_ids} - - model_inputs.update( - { - "position_ids": position_ids, - "past_key_values": past_key_values, - "use_cache": kwargs.get("use_cache"), - "attention_mask": attention_mask, - } - ) - return model_inputs - - @staticmethod - def 
_reorder_cache(past_key_values, beam_idx): - reordered_past = () - for layer_past in past_key_values: - reordered_past += ( - tuple( - past_state.index_select(0, beam_idx.to(past_state.device)) - for past_state in layer_past - ), - ) - return reordered_past - - -@add_start_docstrings( - """ - The DeepseekV3 Model transformer with a sequence classification head on top (linear layer). - - [`DeepseekV3ForSequenceClassification`] uses the last token in order to do the classification, as other causal models - (e.g. GPT-2) do. - - Since it does classification on the last token, it requires to know the position of the last token. If a - `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If - no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the - padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in - each row of the batch). 
- """, - DeepseekV3_START_DOCSTRING, -) -class DeepseekV3ForSequenceClassification(DeepseekV3PreTrainedModel): - def __init__(self, config): - super().__init__(config) - self.num_labels = config.num_labels - self.model = DeepseekV3Model(config) - self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - @add_start_docstrings_to_model_forward(DeepseekV3_INPUTS_DOCSTRING) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, SequenceClassifierOutputWithPast]: - r""" - labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for computing the sequence classification/regression loss. Indices should be in `[0, transformers., - config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If - `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
- """ - return_dict = ( - return_dict if return_dict is not None else self.config.use_return_dict - ) - - transformer_outputs = self.model( - input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - hidden_states = transformer_outputs[0] - logits = self.score(hidden_states) - - if input_ids is not None: - batch_size = input_ids.shape[0] - else: - batch_size = inputs_embeds.shape[0] - - if self.config.pad_token_id is None and batch_size != 1: - raise ValueError( - "Cannot handle batch sizes > 1 if no padding token is defined." - ) - if self.config.pad_token_id is None: - sequence_lengths = -1 - else: - if input_ids is not None: - sequence_lengths = ( - torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1 - ).to(logits.device) - else: - sequence_lengths = -1 - - pooled_logits = logits[ - torch.arange(batch_size, device=logits.device), sequence_lengths - ] - - loss = None - if labels is not None: - labels = labels.to(logits.device) - if self.config.problem_type is None: - if self.num_labels == 1: - self.config.problem_type = "regression" - elif self.num_labels > 1 and ( - labels.dtype == torch.long or labels.dtype == torch.int - ): - self.config.problem_type = "single_label_classification" - else: - self.config.problem_type = "multi_label_classification" - - if self.config.problem_type == "regression": - loss_fct = MSELoss() - if self.num_labels == 1: - loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) - else: - loss = loss_fct(pooled_logits, labels) - elif self.config.problem_type == "single_label_classification": - loss_fct = CrossEntropyLoss() - loss = loss_fct( - pooled_logits.view(-1, self.num_labels), labels.view(-1) - ) - elif self.config.problem_type == "multi_label_classification": - loss_fct = BCEWithLogitsLoss() - 
loss = loss_fct(pooled_logits, labels) - if not return_dict: - output = (pooled_logits,) + transformer_outputs[1:] - return ((loss,) + output) if loss is not None else output - - return SequenceClassifierOutputWithPast( - loss=loss, - logits=pooled_logits, - past_key_values=transformer_outputs.past_key_values, - hidden_states=transformer_outputs.hidden_states, - attentions=transformer_outputs.attentions, - ) diff --git a/services/core/models/tests/integration/parallelism/fixtures/deepseek-ai/deepseek-llm-67b-base/config.json b/services/core/models/tests/integration/parallelism/fixtures/deepseek-ai/deepseek-llm-67b-base/config.json deleted file mode 100644 index 031a7856cc..0000000000 --- a/services/core/models/tests/integration/parallelism/fixtures/deepseek-ai/deepseek-llm-67b-base/config.json +++ /dev/null @@ -1,25 +0,0 @@ -{ - "architectures": [ - "LlamaForCausalLM" - ], - "bos_token_id": 1, - "eos_token_id": 2, - "hidden_act": "silu", - "hidden_size": 8192, - "initializer_range": 0.02, - "intermediate_size": 22016, - "max_position_embeddings": 4096, - "model_type": "llama", - "num_attention_heads": 64, - "num_hidden_layers": 95, - "num_key_value_heads": 8, - "pretraining_tp": 1, - "rms_norm_eps": 1e-06, - "rope_scaling": null, - "rope_theta": 10000.0, - "tie_word_embeddings": false, - "torch_dtype": "bfloat16", - "transformers_version": "4.33.1", - "use_cache": true, - "vocab_size": 102400 -} diff --git a/services/core/models/tests/integration/parallelism/fixtures/deepseek-ai/deepseek-llm-7b-base/config.json b/services/core/models/tests/integration/parallelism/fixtures/deepseek-ai/deepseek-llm-7b-base/config.json deleted file mode 100644 index 208956063b..0000000000 --- a/services/core/models/tests/integration/parallelism/fixtures/deepseek-ai/deepseek-llm-7b-base/config.json +++ /dev/null @@ -1,25 +0,0 @@ -{ - "architectures": [ - "LlamaForCausalLM" - ], - "bos_token_id": 1, - "eos_token_id": 2, - "hidden_act": "silu", - "hidden_size": 4096, - 
"initializer_range": 0.02, - "intermediate_size": 11008, - "max_position_embeddings": 4096, - "model_type": "llama", - "num_attention_heads": 32, - "num_hidden_layers": 30, - "num_key_value_heads": 32, - "pretraining_tp": 1, - "rms_norm_eps": 1e-06, - "rope_scaling": null, - "rope_theta": 10000.0, - "tie_word_embeddings": false, - "torch_dtype": "bfloat16", - "transformers_version": "4.33.1", - "use_cache": true, - "vocab_size": 102400 -} diff --git a/services/core/models/tests/integration/parallelism/fixtures/gpt2/config.json b/services/core/models/tests/integration/parallelism/fixtures/gpt2/config.json deleted file mode 100644 index 10c66461e4..0000000000 --- a/services/core/models/tests/integration/parallelism/fixtures/gpt2/config.json +++ /dev/null @@ -1,31 +0,0 @@ -{ - "activation_function": "gelu_new", - "architectures": [ - "GPT2LMHeadModel" - ], - "attn_pdrop": 0.1, - "bos_token_id": 50256, - "embd_pdrop": 0.1, - "eos_token_id": 50256, - "initializer_range": 0.02, - "layer_norm_epsilon": 1e-05, - "model_type": "gpt2", - "n_ctx": 1024, - "n_embd": 768, - "n_head": 12, - "n_layer": 12, - "n_positions": 1024, - "resid_pdrop": 0.1, - "summary_activation": null, - "summary_first_dropout": 0.1, - "summary_proj_to_labels": true, - "summary_type": "cls_index", - "summary_use_proj": true, - "task_specific_params": { - "text-generation": { - "do_sample": true, - "max_length": 50 - } - }, - "vocab_size": 50257 -} \ No newline at end of file diff --git a/services/core/models/tests/integration/parallelism/fixtures/manifest.json b/services/core/models/tests/integration/parallelism/fixtures/manifest.json deleted file mode 100644 index baddd3d114..0000000000 --- a/services/core/models/tests/integration/parallelism/fixtures/manifest.json +++ /dev/null @@ -1,69 +0,0 @@ -{ - "EleutherAI/gpt-j-6b": [ - "config.json" - ], - "EleutherAI/gpt-neox-20b": [ - "config.json" - ], - "Qwen/Qwen2.5-72B": [ - "config.json" - ], - "Qwen/Qwen2.5-72B-Instruct": [ - "config.json" - ], - 
"Qwen/Qwen2.5-7B": [ - "config.json" - ], - "Qwen/Qwen3-4B-SafeRL": [ - "config.json" - ], - "Qwen/Qwen3-8B": [ - "config.json" - ], - "deepseek-ai/DeepSeek-V3-Base": [ - "config.json", - "configuration_deepseek.py", - "modeling_deepseek.py" - ], - "deepseek-ai/deepseek-llm-67b-base": [ - "config.json" - ], - "deepseek-ai/deepseek-llm-7b-base": [ - "config.json" - ], - "gpt2": [ - "config.json" - ], - "microsoft/phi-2": [ - "config.json" - ], - "microsoft/phi-4": [ - "config.json" - ], - "mistralai/Devstral-Small-2505": [ - "config.json" - ], - "mistralai/Mistral-7B-v0.1": [ - "config.json" - ], - "mistralai/Mixtral-8x7B-v0.1": [ - "config.json" - ], - "nvidia/Mistral-NeMo-Minitron-8B-Instruct": [ - "config.json" - ], - "nvidia/NVIDIA-Nemotron-Nano-9B-v2": [ - "config.json", - "configuration_nemotron_h.py", - "modeling_nemotron_h.py" - ], - "nvidia/nemotron-4-340b-instruct": [ - "model_config.yaml" - ], - "openai/gpt-oss-120b": [ - "config.json" - ], - "openai/gpt-oss-20b": [ - "config.json" - ] -} diff --git a/services/core/models/tests/integration/parallelism/fixtures/microsoft/phi-2/config.json b/services/core/models/tests/integration/parallelism/fixtures/microsoft/phi-2/config.json deleted file mode 100644 index 011968cc02..0000000000 --- a/services/core/models/tests/integration/parallelism/fixtures/microsoft/phi-2/config.json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "_name_or_path": "microsoft/phi-2", - "architectures": [ - "PhiForCausalLM" - ], - "attention_dropout": 0.0, - "bos_token_id": 50256, - "embd_pdrop": 0.0, - "eos_token_id": 50256, - "hidden_act": "gelu_new", - "hidden_size": 2560, - "initializer_range": 0.02, - "intermediate_size": 10240, - "layer_norm_eps": 1e-05, - "max_position_embeddings": 2048, - "model_type": "phi", - "num_attention_heads": 32, - "num_hidden_layers": 32, - "num_key_value_heads": 32, - "partial_rotary_factor": 0.4, - "qk_layernorm": false, - "resid_pdrop": 0.1, - "rope_scaling": null, - "rope_theta": 10000.0, - 
"tie_word_embeddings": false, - "torch_dtype": "float16", - "transformers_version": "4.37.0", - "use_cache": true, - "vocab_size": 51200 -} diff --git a/services/core/models/tests/integration/parallelism/fixtures/microsoft/phi-4/config.json b/services/core/models/tests/integration/parallelism/fixtures/microsoft/phi-4/config.json deleted file mode 100644 index ab17e0b583..0000000000 --- a/services/core/models/tests/integration/parallelism/fixtures/microsoft/phi-4/config.json +++ /dev/null @@ -1,32 +0,0 @@ -{ - "_name_or_path": "microsoft/phi-4", - "architectures": [ - "Phi3ForCausalLM" - ], - "attention_bias": false, - "attention_dropout": 0.0, - "bos_token_id": 100257, - "embd_pdrop": 0.0, - "eos_token_id": 100265, - "hidden_act": "silu", - "hidden_size": 5120, - "initializer_range": 0.02, - "intermediate_size": 17920, - "max_position_embeddings": 16384, - "model_type": "phi3", - "num_attention_heads": 40, - "num_hidden_layers": 40, - "num_key_value_heads": 10, - "original_max_position_embeddings": 16384, - "pad_token_id": 100349, - "resid_pdrop": 0.0, - "rms_norm_eps": 1e-05, - "rope_scaling": null, - "rope_theta": 250000, - "sliding_window": null, - "tie_word_embeddings": false, - "torch_dtype": "bfloat16", - "transformers_version": "4.47.0", - "use_cache": true, - "vocab_size": 100352 -} diff --git a/services/core/models/tests/integration/parallelism/fixtures/mistralai/Devstral-Small-2505/config.json b/services/core/models/tests/integration/parallelism/fixtures/mistralai/Devstral-Small-2505/config.json deleted file mode 100644 index dae01ddab0..0000000000 --- a/services/core/models/tests/integration/parallelism/fixtures/mistralai/Devstral-Small-2505/config.json +++ /dev/null @@ -1,26 +0,0 @@ -{ - "architectures": [ - "MistralForCausalLM" - ], - "attention_dropout": 0.0, - "bos_token_id": 1, - "eos_token_id": 2, - "head_dim": 128, - "hidden_act": "silu", - "hidden_size": 5120, - "initializer_range": 0.02, - "intermediate_size": 32768, - "max_position_embeddings": 
131072, - "model_type": "mistral", - "num_attention_heads": 32, - "num_hidden_layers": 40, - "num_key_value_heads": 8, - "rms_norm_eps": 1e-05, - "rope_theta": 1000000000.0, - "sliding_window": null, - "tie_word_embeddings": false, - "torch_dtype": "bfloat16", - "transformers_version": "4.51.3", - "use_cache": true, - "vocab_size": 131072 -} diff --git a/services/core/models/tests/integration/parallelism/fixtures/mistralai/Mistral-7B-v0.1/config.json b/services/core/models/tests/integration/parallelism/fixtures/mistralai/Mistral-7B-v0.1/config.json deleted file mode 100644 index f4989f072a..0000000000 --- a/services/core/models/tests/integration/parallelism/fixtures/mistralai/Mistral-7B-v0.1/config.json +++ /dev/null @@ -1,24 +0,0 @@ -{ - "architectures": [ - "MistralForCausalLM" - ], - "bos_token_id": 1, - "eos_token_id": 2, - "hidden_act": "silu", - "hidden_size": 4096, - "initializer_range": 0.02, - "intermediate_size": 14336, - "max_position_embeddings": 32768, - "model_type": "mistral", - "num_attention_heads": 32, - "num_hidden_layers": 32, - "num_key_value_heads": 8, - "rms_norm_eps": 1e-05, - "rope_theta": 10000.0, - "sliding_window": 4096, - "tie_word_embeddings": false, - "torch_dtype": "bfloat16", - "transformers_version": "4.34.0.dev0", - "use_cache": true, - "vocab_size": 32000 -} diff --git a/services/core/models/tests/integration/parallelism/fixtures/mistralai/Mixtral-8x7B-v0.1/config.json b/services/core/models/tests/integration/parallelism/fixtures/mistralai/Mixtral-8x7B-v0.1/config.json deleted file mode 100644 index de132a80b2..0000000000 --- a/services/core/models/tests/integration/parallelism/fixtures/mistralai/Mixtral-8x7B-v0.1/config.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "architectures": [ - "MixtralForCausalLM" - ], - "attention_dropout": 0.0, - "bos_token_id": 1, - "eos_token_id": 2, - "hidden_act": "silu", - "hidden_size": 4096, - "initializer_range": 0.02, - "intermediate_size": 14336, - "max_position_embeddings": 32768, - 
"model_type": "mixtral", - "num_attention_heads": 32, - "num_experts_per_tok": 2, - "num_hidden_layers": 32, - "num_key_value_heads": 8, - "num_local_experts": 8, - "output_router_logits": false, - "rms_norm_eps": 1e-05, - "rope_theta": 1000000.0, - "router_aux_loss_coef": 0.02, - "sliding_window": null, - "tie_word_embeddings": false, - "torch_dtype": "bfloat16", - "transformers_version": "4.36.0.dev0", - "use_cache": true, - "vocab_size": 32000 -} diff --git a/services/core/models/tests/integration/parallelism/fixtures/nvidia/Mistral-NeMo-Minitron-8B-Instruct/config.json b/services/core/models/tests/integration/parallelism/fixtures/nvidia/Mistral-NeMo-Minitron-8B-Instruct/config.json deleted file mode 100644 index 55d6cbbe0b..0000000000 --- a/services/core/models/tests/integration/parallelism/fixtures/nvidia/Mistral-NeMo-Minitron-8B-Instruct/config.json +++ /dev/null @@ -1,28 +0,0 @@ -{ - "_name_or_path": "nvidia/Mistral-NeMo-Minitron-8B-Instruct", - "activation": "silu", - "architectures": [ - "MistralForCausalLM" - ], - "attention_dropout": 0.0, - "bos_token_id": 1, - "eos_token_id": 2, - "head_dim": 128, - "hidden_act": "silu", - "hidden_size": 4096, - "initializer_range": 0.02, - "intermediate_size": 11520, - "max_position_embeddings": 8192, - "model_type": "mistral", - "num_attention_heads": 32, - "num_hidden_layers": 40, - "num_key_value_heads": 8, - "rms_norm_eps": 1e-05, - "rope_theta": 1000000.0, - "sliding_window": null, - "tie_word_embeddings": false, - "torch_dtype": "bfloat16", - "transformers_version": "4.44.0", - "use_cache": true, - "vocab_size": 131072 -} diff --git a/services/core/models/tests/integration/parallelism/fixtures/nvidia/NVIDIA-Nemotron-Nano-9B-v2/config.json b/services/core/models/tests/integration/parallelism/fixtures/nvidia/NVIDIA-Nemotron-Nano-9B-v2/config.json deleted file mode 100644 index 4f56c18a20..0000000000 --- a/services/core/models/tests/integration/parallelism/fixtures/nvidia/NVIDIA-Nemotron-Nano-9B-v2/config.json +++ 
/dev/null @@ -1,56 +0,0 @@ -{ - "architectures": [ - "NemotronHForCausalLM" - ], - "attention_bias": false, - "attention_dropout": 0.0, - "auto_map": { - "AutoConfig": "configuration_nemotron_h.NemotronHConfig", - "AutoModelForCausalLM": "modeling_nemotron_h.NemotronHForCausalLM" - }, - "bos_token_id": 1, - "chunk_size": 128, - "conv_kernel": 4, - "eos_token_id": 12, - "head_dim": 128, - "hidden_dropout": 0.0, - "hidden_size": 4480, - "hybrid_override_pattern": "M-M-M-MM-M-M-M*-M-M-M*-M-M-M-M*-M-M-M-M*-M-MM-M-M-M-M-M-", - "initializer_range": 0.02, - "intermediate_size": 15680, - "layer_norm_epsilon": 1e-05, - "mamba_head_dim": 80, - "mamba_hidden_act": "silu", - "mamba_num_groups": 8, - "mamba_num_heads": 128, - "mamba_proj_bias": false, - "mamba_state_dim": 128, - "max_position_embeddings": 131072, - "mlp_bias": false, - "mlp_hidden_act": "relu2", - "model_type": "nemotron_h", - "n_groups": 8, - "num_attention_heads": 40, - "num_hidden_layers": 56, - "num_key_value_heads": 8, - "num_logits_to_keep": 1, - "num_query_groups": 8, - "pad_token_id": 0, - "rescale_prenorm_residual": true, - "residual_in_fp32": false, - "rms_norm_eps": 1e-05, - "sliding_window": null, - "ssm_state_size": 128, - "tie_word_embeddings": false, - "time_step_floor": 0.0001, - "time_step_max": 0.1, - "time_step_min": 0.001, - "time_step_rank": 256, - "torch_dtype": "bfloat16", - "transformers_version": "4.51.3", - "use_bias": false, - "use_cache": true, - "use_conv_bias": true, - "use_mamba_kernels": true, - "vocab_size": 131072 -} diff --git a/services/core/models/tests/integration/parallelism/fixtures/nvidia/NVIDIA-Nemotron-Nano-9B-v2/configuration_nemotron_h.py b/services/core/models/tests/integration/parallelism/fixtures/nvidia/NVIDIA-Nemotron-Nano-9B-v2/configuration_nemotron_h.py deleted file mode 100644 index 2b5c451b4a..0000000000 --- a/services/core/models/tests/integration/parallelism/fixtures/nvidia/NVIDIA-Nemotron-Nano-9B-v2/configuration_nemotron_h.py +++ /dev/null @@ -1,245 +0,0 
@@ -# coding=utf-8 -# Copyright 2024 AI21 Labs Ltd. and the HuggingFace Inc. team. All rights reserved. -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""NemotronH model configuration""" - -import re - -from transformers.configuration_utils import PretrainedConfig -from transformers.utils import logging - - -logger = logging.get_logger(__name__) - - -class NemotronHConfig(PretrainedConfig): - r""" - This is the configuration class to store the configuration of a [`NemotronHModel`]. It is used to instantiate a - NemotronH model according to the specified arguments, defining the model architecture. Instantiating a configuration - with the defaults will yield a similar configuration to that of the NemotronH-v0.1 model. - - [todo](todo) - - Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the - documentation from [`PretrainedConfig`] for more information. - - - Args: - vocab_size (`int`, *optional*, defaults to 131072): - Vocabulary size of the NemotronH model. Defines the number of different tokens that can be represented by the - `inputs_ids` passed when calling [`NemotronHModel`] - tie_word_embeddings (`bool`, *optional*, defaults to `False`): - Whether the model's input and output word embeddings should be tied. Note that this is only relevant if the - model has a output word embedding layer. 
- hidden_size (`int`, *optional*, defaults to 4096): - Dimension of the hidden representations. - intermediate_size (`int`, *optional*, defaults to 21504): - Dimension of the MLP representations. - num_hidden_layers (`int`, *optional*, defaults to 52): - Number of hidden layers in the Transformer encoder. - hybrid_override_pattern (`str`, *optional*, defaults to `"M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M-"`): - The pattern of the hybrid model. The pattern is a string of characters where each character represents M: Mamba2, *: Attention, -: MLP - num_attention_heads (`int`, *optional*, defaults to 32): - Number of attention heads for each attention layer in the Transformer encoder. - attention_head_dim (`int`, *optional*, defaults to 128): - Dimension of each attention head. - num_key_value_heads (`int`, *optional*, defaults to 8): - This is the number of key_value heads that should be used to implement Grouped Query Attention. If - `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if - `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. - mlp_hidden_act (`str`, *optional*, defaults to "relu2"): - The non-linear activation function in the MLP layers. - attention_bias (`bool`, *optional*, defaults to `False`): - Whether to use bias in attention layers. - mlp_bias (`bool`, *optional*, defaults to `False`): - Whether to use bias in MLP layers. - use_bias (`bool`, *optional*, defaults to `False`): - Whether to use bias in the model. - initializer_range (`float`, *optional*, defaults to 0.02): - The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - layer_norm_epsilon (`float`, *optional*, defaults to 1e-5): - The epsilon used by the layer normalization layers. - residual_in_fp32 (`bool`, *optional*, defaults to `False`): - Whether or not residuals should be in `float32`. 
If set to `False` residuals will keep the same `dtype` as the rest of the model. - use_cache (`bool`, *optional*, defaults to `True`): - Whether or not the model should return the last key/values attentions (not used by all models). Only - relevant if `config.is_decoder=True`. - num_logits_to_keep (`int` or `None`, *optional*, defaults to 1): - Number of prompt logits to calculate during generation. If `None`, all logits will be calculated. If an - integer value, only last `num_logits_to_keep` logits will be calculated. - pad_token_id (`int`, *optional*, defaults to 0): - The id of the padding token. - bos_token_id (`int`, *optional*, defaults to 1): - The id of the "beginning-of-sequence" token. - eos_token_id (`int`, *optional*, defaults to 2): - The id of the "end-of-sequence" token. - sliding_window (`int`, *optional*, defaults to None): - Sliding window attention window size. - max_position_embeddings (`int`, *optional*, defaults to 4096): - The maximum sequence length that this model might ever be used with. - attention_dropout (`float`, *optional*, defaults to 0.0): - The dropout ratio for the attention probabilities. - hidden_dropout (`float`, *optional*, defaults to 0.0): - The dropout ratio for the hidden states. - use_mamba_kernels (`bool`, *optional*, defaults to `True`): - Flag indicating whether or not to use the fast mamba kernels. These are available only if `mamba-ssm` and - `causal-conv1d` are installed, and the mamba modules are running on a CUDA device. - ssm_state_size (`int`, *optional*, defaults to 128): - The dimension of the mamba state space latents. - mamba_num_heads (`int`, *optional*, defaults to 128): - Number of heads in Mamba layers. - mamba_n_groups (`int`, *optional*, defaults to 8): - Number of groups in Mamba layers. - mamba_head_dim (`int`, *optional*, defaults to 64): - Dimension of each Mamba head. - mamba_d_conv (`int`, *optional*, defaults to 4): - The size of the mamba convolution kernel. 
- mamba_expand (`int`, *optional*, defaults to 2): - Expanding factor used to determine the mamba intermediate size. - mamba_hidden_act (`str`, *optional*, defaults to "silu"): - The non-linear activation function in the Mamba layers. - mamba_dt_min (`float`, *optional*, defaults to 0.001): - Minimum value for the time step in Mamba. - mamba_dt_max (`float`, *optional*, defaults to 0.1): - Maximum value for the time step in Mamba. - mamba_dt_limit (`tuple`, *optional*, defaults to (0.0, float("inf"))): - Limits for the time step in Mamba. - mamba_dt_init_floor (`float`, *optional*, defaults to 1e-4): - Floor value for time step initialization in Mamba. - mamba_conv_bias (`bool`, *optional*, defaults to `True`): - Whether to use bias in the convolution layer of the mamba mixer block. - mamba_proj_bias (`bool`, *optional*, defaults to `False`): - Whether to use bias in the input and output projections of the mamba mixer block. - mamba_chunk_size (`int`, *optional*, defaults to 256): - Size of chunks for Mamba processing. - rescale_prenorm_residual (`bool`, *optional*, defaults to `True`): - Whether to rescale the pre-normalization residual connections. 
- """ - - model_type = "nemotron_h" - keys_to_ignore_at_inference = ["past_key_values"] - - def __init__( - self, - vocab_size=131072, - tie_word_embeddings=False, - hidden_size=4096, - intermediate_size=21504, - num_hidden_layers=52, - hybrid_override_pattern="M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M-", - num_attention_heads=32, - #attention_head_dim=128, - head_dim=128, - num_key_value_heads=8, # nemo: num_query_groups - mlp_hidden_act="relu2", - attention_bias=False, - mlp_bias=False, - use_bias=False, - initializer_range=0.02, # nemo: init_method_std - layer_norm_epsilon=1e-5, # nemo: layernorm_epsilon - residual_in_fp32=False, # Megatron Core default value - use_cache=True, - num_logits_to_keep=1, - pad_token_id=0, - bos_token_id=1, - eos_token_id=2, - sliding_window=None, - max_position_embeddings=4096, - attention_dropout=0.0, - hidden_dropout=0.0, # * ADDED - use_mamba_kernels=True, - ssm_state_size=128, # mamba_state_size - mamba_num_heads=128, - mamba_n_groups=8, # nemo: mamba_ssm_ngroups = num_heads - mamba_head_dim=64, - mamba_d_conv=4, - mamba_expand=2, - mamba_hidden_act="silu", - mamba_dt_min=0.001, - mamba_dt_max=0.1, - mamba_dt_limit=(0.0, float("inf")), - mamba_dt_init_floor=1e-4, - mamba_conv_bias=True, - mamba_proj_bias=False, - mamba_chunk_size=256, - rescale_prenorm_residual=True, - **kwargs, - ): - self.vocab_size = vocab_size - self.tie_word_embeddings = tie_word_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.hybrid_override_pattern = hybrid_override_pattern - self.num_attention_heads = num_attention_heads - #self.attention_head_dim = attention_head_dim - self.head_dim = head_dim - self.sliding_window = sliding_window - self.max_position_embeddings = max_position_embeddings - self.attention_dropout = attention_dropout - self.hidden_dropout = hidden_dropout - - # Validate hybrid_override_pattern - # M: Mamba2, *: Attention, -: MLP - assert 
len(self.hybrid_override_pattern) == self.num_hidden_layers, "hybrid_override_pattern must have the same length as num_hidden_layers" - assert re.match(r"^[*-M]+$", self.hybrid_override_pattern), "hybrid_override_pattern must only contain characters 'M', '*', or '-'" - - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.mlp_hidden_act = mlp_hidden_act - self.attention_bias = attention_bias - self.mlp_bias = mlp_bias - self.use_bias = use_bias - self.initializer_range = initializer_range - self.layer_norm_epsilon = layer_norm_epsilon - self.residual_in_fp32 = residual_in_fp32 - - self.use_cache = use_cache - self.num_logits_to_keep = num_logits_to_keep - - self.use_mamba_kernels = use_mamba_kernels - self.n_groups = mamba_n_groups - self.mamba_head_dim = mamba_head_dim - self.ssm_state_size = ssm_state_size - self.mamba_num_heads = mamba_num_heads - self.conv_kernel = mamba_d_conv - self.expand = mamba_expand - self.mamba_hidden_act = mamba_hidden_act - self.time_step_min = mamba_dt_min - self.time_step_max = mamba_dt_max - self.time_step_limit = mamba_dt_limit - self.time_step_floor = mamba_dt_init_floor - self.use_conv_bias = mamba_conv_bias - self.mamba_proj_bias = mamba_proj_bias - self.chunk_size = mamba_chunk_size - self.rescale_prenorm_residual = rescale_prenorm_residual - - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) - - @property - def layers_block_type(self): - return [ - "mamba" if self.hybrid_override_pattern[i] == "M" else - "attention" if self.hybrid_override_pattern[i] == "*" else "mlp" - for i in range(self.num_hidden_layers)] \ No newline at end of file diff --git a/services/core/models/tests/integration/parallelism/fixtures/nvidia/NVIDIA-Nemotron-Nano-9B-v2/modeling_nemotron_h.py 
b/services/core/models/tests/integration/parallelism/fixtures/nvidia/NVIDIA-Nemotron-Nano-9B-v2/modeling_nemotron_h.py deleted file mode 100644 index 7ac44be3f8..0000000000 --- a/services/core/models/tests/integration/parallelism/fixtures/nvidia/NVIDIA-Nemotron-Nano-9B-v2/modeling_nemotron_h.py +++ /dev/null @@ -1,1643 +0,0 @@ -# coding=utf-8 -# Copyright 2024 HuggingFace Inc. team. -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""PyTorch NemotronH model.""" - -import math -from dataclasses import dataclass -from typing import Any, Dict, Optional, Tuple, Union - -import torch -import torch.utils.checkpoint -from torch import nn -from torch.nn import CrossEntropyLoss - -from transformers.activations import ACT2FN -from transformers.cache_utils import DynamicCache # we need __iter__ and __len__ of pkv -from transformers.generation import GenerationMixin -from transformers.modeling_attn_mask_utils import ( - AttentionMaskConverter, -) -from transformers.modeling_utils import PreTrainedModel -from transformers.utils import ( - ModelOutput, - add_code_sample_docstrings, - add_start_docstrings, - add_start_docstrings_to_model_forward, - logging, -) -from transformers.utils.import_utils import ( - is_causal_conv1d_available, - is_flash_attn_2_available, - is_flash_attn_greater_or_equal_2_10, - is_mamba_2_ssm_available, -) -from .configuration_nemotron_h import NemotronHConfig - - -logger = logging.get_logger(__name__) - - -# Copied from transformers.models.mamba.modeling_mamba2.modeling_mamba2.py with MAMBA2->NEMOTRONH,Mamba2->NemotronH -# For Mamba2 components Mamba2->NemotronHMamba2 -if is_mamba_2_ssm_available(): - from mamba_ssm.ops.triton.selective_state_update import selective_state_update - from mamba_ssm.ops.triton.ssd_combined import mamba_chunk_scan_combined, mamba_split_conv1d_scan_combined -else: - mamba_chunk_scan_combined, mamba_split_conv1d_scan_combined, selective_state_update = None, None, None - -try: - #from mamba_ssm.ops.triton.layernorm_gated import RMSNorm as RMSNormGated - from mamba_ssm.ops.triton.layernorm_gated import rmsnorm_fn -except ImportError: - raise ImportError("mamba-ssm is required by the Mamba model but cannot be imported") - -if is_causal_conv1d_available(): - from causal_conv1d import causal_conv1d_fn, causal_conv1d_update -else: - causal_conv1d_update, causal_conv1d_fn = None, None - -if is_flash_attn_2_available(): - from 
transformers.modeling_flash_attention_utils import _flash_attention_forward - -is_fast_path_available = all( - ( - selective_state_update, - mamba_chunk_scan_combined, - mamba_split_conv1d_scan_combined, - causal_conv1d_fn, - causal_conv1d_update, - ) -) - - -_CHECKPOINT_FOR_DOC = "nvidia/Nemotron-H-56B-Base-8K" -_CONFIG_FOR_DOC = "NemotronHConfig" - - -# Helper methods for segment sum computation - - -def pad_tensor_by_size(input_tensor: torch.Tensor, pad_size: int): - """ - Padding x tensor with `pad_size` on the seq_len dim (dim=1) - - Assumes that we only have tensors of either size 4 or 3 - """ - pad_shape = (0, 0, 0, 0, 0, pad_size, 0, 0) if len(input_tensor.shape) == 4 else (0, 0, 0, pad_size, 0, 0) - - return torch.nn.functional.pad(input_tensor, pad_shape, mode="constant", value=0) - - -def reshape_into_chunks(input_tensor, pad_size, chunk_size): - """ - Padding input_tensor with `pad_size` on the seq_len dim (dim=1) and - simultaneously splitting it into chunk sequences. - - Assumes that we only have tensors of either size 4 or 3 - """ - # [bsz, seq_len, ...] -> [bsz, seq_len multiple of chunk_size, ...] - input_tensor = pad_tensor_by_size(input_tensor, pad_size) - - if len(input_tensor.shape) == 3: - # [bsz, seq_len multiple of chunk_size, num_heads] -> [bsz, -1, chunk_size, num_heads] - return input_tensor.reshape(input_tensor.shape[0], -1, chunk_size, input_tensor.shape[2]) - else: - # [bsz, seq_len multiple of chunk_size, num_heads, head_dim or state_size] -> [bsz, -1, chunk_size, num_heads, head_dim or state_size] - return input_tensor.reshape( - input_tensor.shape[0], -1, chunk_size, input_tensor.shape[2], input_tensor.shape[3] - ) - - -def segment_sum(input_tensor): - """ - More stable segment sum calculation. Uses cumulative sums and masking instead of direct subtractions. - """ - chunk_size = input_tensor.size(-1) - # 1. 
expand input tensor to have an additional dimension and repeat along that dimension - # [..., chunk_size] -> [..., chunk_size, chunk_size] - input_tensor = input_tensor[..., None].expand(*input_tensor.size(), chunk_size) - # 2. create a lower triangular mask with the diagonal set to 0 to 0 out elements above diag - mask = torch.tril(torch.ones(chunk_size, chunk_size, device=input_tensor.device, dtype=torch.bool), diagonal=-1) - input_tensor = input_tensor.masked_fill(~mask, 0) - # 3. compute actual cumsum - tensor_segsum = torch.cumsum(input_tensor, dim=-2) - - # 4. apply mask to keep only the lower triangular part of the cumulative sum result (incl diagonal this time) - mask = torch.tril(torch.ones(chunk_size, chunk_size, device=input_tensor.device, dtype=torch.bool), diagonal=0) - tensor_segsum = tensor_segsum.masked_fill(~mask, -torch.inf) - return tensor_segsum - - -def apply_mask_to_padding_states(hidden_states, attention_mask): - """ - Tunes out the hidden states for padding tokens, see https://github.com/state-spaces/mamba/issues/66 - """ - if attention_mask is not None and attention_mask.shape[1] > 1 and attention_mask.shape[0] > 1: - dtype = hidden_states.dtype - hidden_states = (hidden_states * attention_mask[:, :, None]).to(dtype) - - return hidden_states - -# Copied from https://github.com/huggingface/transformers/blob/main/src/transformers/models/jamba/modeling_jamba.py -class HybridMambaAttentionDynamicCache(DynamicCache): - """ - A dynamic cache that can handle both the attention cache (which has a seq_len dimension) and the mamba cache - (which has a constant shape regardless of seq_len). - - This cache has two sets of lists of tensors: `key_cache` and `value_cache` for attention cache and `conv_states` - and `ssm_states` for mamba cache. Each of these lists has `num_layers` tensors. 
The expected shape for each tensor - For attention layers, `key_cache` and `value_cache` have a shape of `(batch_size, num_heads, seq_len, head_dim)`, - while `conv_states` and `ssm_states` have a shape of `(batch_size, 0)` (empty tensors). - For mamba layers, `key_cache` and `value_cache` have a shape of `(batch_size, 0)` (empty tensors), - while `conv_states` represents the convolution state and has a shape of `(batch_size, d_inner, d_conv)`, - and `ssm_states` represents the ssm state and has a shape of `(batch_size, d_inner, d_state)`. - """ - - def __init__(self, config, batch_size, dtype=torch.float16, device=None): - super().__init__() - self.dtype = dtype - self.hybrid_override_pattern = config.hybrid_override_pattern - self.has_previous_state = False # only used by mamba - #intermediate_size = config.expand * config.hidden_size - intermediate_size = config.mamba_num_heads * config.mamba_head_dim - ssm_state_size = config.ssm_state_size - conv_kernel_size = config.conv_kernel - self.conv_states = [] - self.ssm_states = [] - self.transformer_layers = [] - for i in range(config.num_hidden_layers): - if self.hybrid_override_pattern[i] == "M": - # Mamba layer - self.conv_states += [ - torch.zeros(batch_size, intermediate_size, conv_kernel_size, device=device, dtype=dtype) - ] - self.ssm_states += [ - torch.zeros(batch_size, intermediate_size, ssm_state_size, device=device, dtype=dtype) - ] - else: - # Attention or MLP layer - self.conv_states += [torch.tensor([[]] * batch_size, device=device)] - self.ssm_states += [torch.tensor([[]] * batch_size, device=device)] - self.transformer_layers.append(i) - - self.key_cache = [torch.tensor([[]] * batch_size, device=device) for _ in range(config.num_hidden_layers)] - self.value_cache = [torch.tensor([[]] * batch_size, device=device) for _ in range(config.num_hidden_layers)] - - def update( - self, - key_states: torch.Tensor, - value_states: torch.Tensor, - layer_idx: int, - cache_kwargs: Optional[Dict[str, Any]] = None, 
- ) -> Tuple[torch.Tensor, torch.Tensor]: - # Update the cache - if self.key_cache[layer_idx].shape[-1] == 0: - self.key_cache[layer_idx] = key_states - self.value_cache[layer_idx] = value_states - else: - self.key_cache[layer_idx] = torch.cat([self.key_cache[layer_idx], key_states], dim=2) - self.value_cache[layer_idx] = torch.cat([self.value_cache[layer_idx], value_states], dim=2) - - return self.key_cache[layer_idx], self.value_cache[layer_idx] - - def reorder_cache(self, beam_idx: torch.LongTensor): - """Reorders the cache for beam search, given the selected beam indices.""" - for layer_idx in range(len(self.key_cache)): - device = self.key_cache[layer_idx].device - self.key_cache[layer_idx] = self.key_cache[layer_idx].index_select(0, beam_idx.to(device)) - device = self.value_cache[layer_idx].device - self.value_cache[layer_idx] = self.value_cache[layer_idx].index_select(0, beam_idx.to(device)) - - device = self.conv_states[layer_idx].device - self.conv_states[layer_idx] = self.conv_states[layer_idx].index_select(0, beam_idx.to(device)) - device = self.ssm_states[layer_idx].device - self.ssm_states[layer_idx] = self.ssm_states[layer_idx].index_select(0, beam_idx.to(device)) - - def get_seq_length(self, layer_idx: Optional[int] = 0) -> int: - """Returns the sequence length of the cached states. 
A layer index can be optionally passed.""" - # take any layer that contains cache and not empty tensor - layer_idx = self.transformer_layers[0] if layer_idx not in self.transformer_layers else layer_idx - if len(self.key_cache) <= layer_idx: - return 0 - return self.key_cache[layer_idx].shape[-2] - - def to_legacy_cache(self) -> Tuple[Tuple[torch.Tensor], Tuple[torch.Tensor]]: - raise NotImplementedError("HybridMambaAttentionDynamicCache does not have a legacy cache equivalent.") - - @classmethod - def from_legacy_cache(cls, past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None) -> "DynamicCache": - raise NotImplementedError("HybridMambaAttentionDynamicCache does not have a legacy cache equivalent.") - - # Copied from modeling_mamba2.py - def update_conv_state( - self, layer_idx: int, new_conv_state: torch.Tensor, cache_init: bool = False - ) -> torch.Tensor: - if cache_init: - self.conv_states[layer_idx] = new_conv_state.to(self.conv_states.device) - else: - self.conv_states[layer_idx] = self.conv_states[layer_idx].roll(shifts=-1, dims=-1) - self.conv_states[layer_idx][:, :, -1] = new_conv_state[:, 0, :].to(self.conv_states.device) - return self.conv_states[layer_idx] - - def update_ssm_state(self, layer_idx: int, new_ssm_state: torch.Tensor): - self.ssm_states[layer_idx] = new_ssm_state.to(self.ssm_states.device) - return self.ssm_states[layer_idx] - - def reset(self): - self.conv_states.zero_() - self.ssm_states.zero_() - -class MambaRMSNormGated(torch.nn.Module): - def __init__(self, hidden_size, group_size, eps=1e-5): - super().__init__() - self.weight = nn.Parameter(torch.ones(hidden_size)) - self.variance_epsilon = eps - self.group_size = group_size - - # jan28b version - def forward(self, hidden_states, gate=None): - return rmsnorm_fn(x=hidden_states, - weight=self.weight, - bias=None, # No bias - z=gate, - eps=self.variance_epsilon, - group_size=self.group_size, - norm_before_gate=False - ) - -class NemotronHMamba2Mixer(nn.Module): - """ - 
Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`. - A, D are input independent (see Mamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective) - ∆, B, C are input-dependent (this is a key difference between Mamba and the linear time invariant S4, - and is why Mamba is called **selective** state spaces) - """ - - def __init__(self, config: NemotronHConfig, layer_idx: int): - super().__init__() - self.num_heads = config.mamba_num_heads - self.hidden_size = config.hidden_size - self.ssm_state_size = config.ssm_state_size - self.conv_kernel_size = config.conv_kernel - self.intermediate_size = config.mamba_num_heads * config.mamba_head_dim - self.layer_idx = layer_idx - self.use_conv_bias = config.use_conv_bias - self.activation = config.mamba_hidden_act - self.act = ACT2FN[config.mamba_hidden_act] - - self.layer_norm_epsilon = config.layer_norm_epsilon - - self.n_groups = config.n_groups - self.head_dim = config.mamba_head_dim - self.chunk_size = config.chunk_size - - self.time_step_limit = config.time_step_limit - self.time_step_min = config.time_step_min - self.time_step_max = config.time_step_max - - self.conv_dim = self.intermediate_size + 2 * self.n_groups * self.ssm_state_size - self.conv1d = nn.Conv1d( - in_channels=self.conv_dim, - out_channels=self.conv_dim, - bias=config.use_conv_bias, - kernel_size=config.conv_kernel, - groups=self.conv_dim, - padding=config.conv_kernel - 1, - ) - - # projection of the input hidden states - projection_size = self.intermediate_size + self.conv_dim + self.num_heads - self.in_proj = nn.Linear( - self.hidden_size, - projection_size, - bias=config.use_bias, - ) - # selective projection used to make dt, B and C input dependant - - # time step projection (discretization) - # instantiate once and copy inv_dt in init_weights of PretrainedModel - self.dt_bias = nn.Parameter(torch.ones(self.num_heads)) - - # S4D real initialization. These are not discretized! 
- # The core is to load them, compute the discrete states, then write the updated state. Keeps the memory bounded - A = torch.arange(1, self.num_heads + 1) - self.A_log = nn.Parameter(torch.log(A)) - self.A_log._no_weight_decay = True - self.norm = MambaRMSNormGated(self.intermediate_size, eps=self.layer_norm_epsilon, group_size=self.intermediate_size // self.n_groups) - self.D = nn.Parameter(torch.ones(self.num_heads)) - self.D._no_weight_decay = True - - self.out_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.use_bias) - self.use_bias = config.use_bias - - if not is_fast_path_available: - logger.warning_once( - "The fast path is not available because on of `(selective_state_update, causal_conv1d_fn, causal_conv1d_update)`" - " is None. Falling back to the naive implementation. To install follow https://github.com/state-spaces/mamba/#installation and" - " https://github.com/Dao-AILab/causal-conv1d" - ) - - def cuda_kernels_forward( - self, - hidden_states: torch.Tensor, - cache_params: Optional[HybridMambaAttentionDynamicCache] = None, - cache_position: Optional[torch.LongTensor] = None, - attention_mask: Optional[torch.Tensor] = None, - ): - # 1. Gated MLP's linear projection - hidden_states = apply_mask_to_padding_states(hidden_states, attention_mask) - projected_states = self.in_proj(hidden_states) - - # Set up dimensions for reshapes later - batch_size, seq_len, _ = hidden_states.shape - groups_time_state_size = self.n_groups * self.ssm_state_size - d_mlp = ( - projected_states.shape[-1] - - 2 * self.intermediate_size - - 2 * self.n_groups * self.ssm_state_size - - self.num_heads - ) // 2 - - # Single step calculations via cache - if cache_params is not None and cache_position is not None and cache_position[0] > 0: - _, _, gate, hidden_states_B_C, dt = projected_states.squeeze(1).split( - [d_mlp, d_mlp, self.intermediate_size, self.conv_dim, self.num_heads], dim=-1 - ) - - # 2. 
Convolution sequence transformation - hidden_states_B_C = causal_conv1d_update( - hidden_states_B_C, - cache_params.conv_states[self.layer_idx], - self.conv1d.weight.squeeze(1), - self.conv1d.bias, - self.activation, - ) - - hidden_states, B, C = torch.split( - hidden_states_B_C, - [self.intermediate_size, groups_time_state_size, groups_time_state_size], - dim=-1, - ) - - # 3. SSM transformation - A = -torch.exp(self.A_log.float()) # (nheads,) - A = A[:, None, ...][:, :, None].expand(-1, self.head_dim, self.ssm_state_size).to(dtype=torch.float32) - dt = dt[:, :, None].expand(-1, -1, self.head_dim) - dt_bias = self.dt_bias[:, None, ...].expand(-1, self.head_dim) - D = self.D[:, None, ...].expand(-1, self.head_dim) - B = B.view(batch_size, self.n_groups, B.shape[1] // self.n_groups) - C = C.view(batch_size, self.n_groups, C.shape[1] // self.n_groups) - hidden_states_reshaped = hidden_states.view(batch_size, self.num_heads, self.head_dim) - hidden_states = selective_state_update( - cache_params.ssm_states[self.layer_idx], - hidden_states_reshaped, - dt, - A, - B, - C, - D, - z=None, - dt_bias=dt_bias, - dt_softplus=True, - ) - hidden_states = hidden_states.view(batch_size, self.num_heads * self.head_dim) - hidden_states = self.norm(hidden_states, gate) - - # 4. Final linear projection - out = self.out_proj(hidden_states)[:, None, ...] - - # Fused calculations or step by step if no initialized cache is found - else: - A = -torch.exp(self.A_log.float()) # (num_heads) or (intermediate_size, state_size) - dt_limit_kwargs = {} if self.time_step_limit == (0.0, float("inf")) else {"dt_limit": self.time_step_limit} - - # 2-4. 
Fused kernel for conv1d, SSM, and the final projection - if self.training and cache_params is None: - out = mamba_split_conv1d_scan_combined( - projected_states, - self.conv1d.weight.squeeze(1), - self.conv1d.bias, - self.dt_bias, - A, - D=self.D, - chunk_size=self.chunk_size, - seq_idx=None, # was seq_idx - activation=self.activation, - rmsnorm_weight=self.norm.weight, - rmsnorm_eps=self.norm.variance_epsilon, - outproj_weight=self.out_proj.weight, - outproj_bias=self.out_proj.bias, - headdim=self.head_dim, - ngroups=self.n_groups, - norm_before_gate=False, - return_final_states=False, - **dt_limit_kwargs, - ) - - else: - _, _, gate, hidden_states_B_C, dt = projected_states.split( - [d_mlp, d_mlp, self.intermediate_size, self.conv_dim, self.num_heads], dim=-1 - ) - - # 2. Convolution sequence transformation - # Init cache - if cache_params is not None: - hidden_states_B_C_transposed = hidden_states_B_C.transpose(1, 2) - conv_states = nn.functional.pad( - hidden_states_B_C_transposed, - (cache_params.conv_kernel_size - hidden_states_B_C_transposed.shape[-1], 0), - ) - cache_params.update_conv_state( - layer_idx=self.layer_idx, new_conv_state=conv_states, cache_init=True - ) - - if self.activation not in ["silu", "swish"]: - hidden_states_B_C = self.act( - self.conv1d(hidden_states_B_C.transpose(1, 2))[..., :seq_len].transpose(1, 2) - ) - else: - hidden_states_B_C = causal_conv1d_fn( - x=hidden_states_B_C.transpose(1, 2), - weight=self.conv1d.weight.squeeze(1), - bias=self.conv1d.bias, - activation=self.activation, - ).transpose(1, 2) - hidden_states_B_C = apply_mask_to_padding_states(hidden_states_B_C, attention_mask) - hidden_states, B, C = torch.split( - hidden_states_B_C, - [self.intermediate_size, groups_time_state_size, groups_time_state_size], - dim=-1, - ) - - # 3. 
SSM transformation - scan_output, ssm_state = mamba_chunk_scan_combined( - hidden_states.view(batch_size, seq_len, -1, self.head_dim), - dt, - A, - B.view(batch_size, seq_len, self.n_groups, -1), - C.view(batch_size, seq_len, self.n_groups, -1), - chunk_size=self.chunk_size, - D=self.D, - z=None, - seq_idx=None, - return_final_states=True, - dt_bias=self.dt_bias, - dt_softplus=True, - **dt_limit_kwargs, - ) - - # Init cache - if ssm_state is not None and cache_params is not None: - cache_params.update_ssm_state(layer_idx=self.layer_idx, new_ssm_state=ssm_state) - - scan_output = scan_output.view(batch_size, seq_len, -1) - - # Multiply "gate" branch and apply extra normalization layer - scan_output = self.norm(scan_output, gate) - - # 4. Final linear projection - out = self.out_proj(scan_output) - return out - - # fmt: off - def torch_forward(self, input_states, cache_params: Optional[HybridMambaAttentionDynamicCache]=None, cache_position:Optional[torch.LongTensor]=None, attention_mask: Optional[torch.Tensor]=None): - batch_size, seq_len, _ = input_states.shape - dtype = input_states.dtype - - # 1. Gated MLP's linear projection - input_states = apply_mask_to_padding_states(input_states, attention_mask) - projected_states = self.in_proj(input_states) - d_mlp = (projected_states.shape[-1] - 2 * self.intermediate_size - 2 * self.n_groups * self.ssm_state_size-self.num_heads) // 2 - _, _, gate, hidden_states_B_C, dt = projected_states.split( - [d_mlp, d_mlp, self.intermediate_size, self.conv_dim, self.num_heads], dim=-1 - ) - - # 2. 
Convolution sequence transformation - if cache_params is not None and cache_position is not None and cache_position[0] > 0: - cache_params.update_conv_state(layer_idx=self.layer_idx, new_conv_state=hidden_states_B_C, cache_init=False) - - # We need to guarantee that anything regarding the cache is on the same device - conv_states = cache_params.conv_states[self.layer_idx].to(device=self.conv1d.weight.device) - - hidden_states_B_C = torch.sum( - conv_states * self.conv1d.weight.squeeze(1), dim=-1 - ) - if self.use_conv_bias: - hidden_states_B_C = hidden_states_B_C + self.conv1d.bias - hidden_states_B_C = self.act(hidden_states_B_C) - else: - # Init cache - if cache_params is not None: - hidden_states_B_C_transposed = hidden_states_B_C.transpose(1, 2) - conv_states = nn.functional.pad( - hidden_states_B_C_transposed, (cache_params.conv_kernel_size - hidden_states_B_C_transposed.shape[-1], 0) - ) - cache_params.update_conv_state(layer_idx=self.layer_idx, new_conv_state=conv_states, cache_init=True) - - hidden_states_B_C = self.act(self.conv1d(hidden_states_B_C.transpose(1, 2))[..., :seq_len].transpose(1, 2)) - - hidden_states_B_C = apply_mask_to_padding_states(hidden_states_B_C, attention_mask) - hidden_states, B, C = torch.split( - hidden_states_B_C, - [self.intermediate_size, self.n_groups * self.ssm_state_size, self.n_groups * self.ssm_state_size], - dim=-1 - ) - - # 3. SSM transformation - A = -torch.exp(self.A_log.float()) # [num_heads] - if cache_params is not None and cache_position is not None and cache_position[0] > 0: - # We need to guarantee that anything regarding the cache is on the same device - cache_device = cache_params.ssm_states.device - - # Note: there is no need to pad parameter matrices here, as there is just one new token - # for batched generation - dt = dt[:, 0, :][:, None, ...] 
- dt = dt.transpose(1, 2).expand(batch_size, dt.shape[-1], self.head_dim) - # [num_heads] -> [num_heads, head_dim] - dt_bias = self.dt_bias[..., None].expand(self.dt_bias.shape[0], self.head_dim) - - dt = torch.nn.functional.softplus(dt + dt_bias.to(dt.dtype)) - dt = torch.clamp(dt, self.time_step_limit[0], self.time_step_limit[1]) - A = A[..., None, None].expand(self.num_heads, self.head_dim, self.ssm_state_size).to(dtype=torch.float32) - # [bsz, num_heads, head_dim, state_size] - dA = (torch.exp(dt[..., None] * A)).to(device=cache_device) - - # Discretize B - # [bsz, n_groups * state_size] -> [bsz, n_groups, 1, state_size] -> - # -> [bsz, n_groups, group to head repetition factor, state_size] -> [bsz, num_heads, state_size] - B = B.reshape(batch_size, self.n_groups, -1)[..., None, :] - B = B.expand(batch_size, self.n_groups, self.num_heads // self.n_groups, B.shape[-1]).contiguous() - B = B.reshape(batch_size, -1, B.shape[-1]) - # [bsz, num_heads, head_dim, state_size] - dB = dt[..., None] * B[..., None, :] - - # Discretize x into dB - # [bsz, intermediate_size] -> [bsz, num_heads, head_dim] - hidden_states = hidden_states.reshape(batch_size, -1, self.head_dim) - dBx = (dB * hidden_states[..., None]).to(device=cache_device) - - # State calculation - cache_params.update_ssm_state( - layer_idx=self.layer_idx, - new_ssm_state=cache_params.ssm_states[self.layer_idx] * dA + dBx - ) - - # Subsequent output - # [bsz, n_groups * state_size] -> [bsz, num_heads, state_size] - C = C.reshape(batch_size, self.n_groups, -1)[..., None, :] - C = C.expand(batch_size, self.n_groups, self.num_heads // self.n_groups, C.shape[-1]).contiguous() - C = C.reshape(batch_size, -1, C.shape[-1]) - # [bsz, num_heads, head_dim] - - ssm_states = cache_params.ssm_states[self.layer_idx].to(device=C.device, dtype=C.dtype) # Shape: [b, h, d, n] - # Reshape ssm_states to merge the first two dimensions - ssm_states_reshaped = ssm_states.view(batch_size * self.num_heads, self.head_dim, 
self.ssm_state_size) # Shape: [b*h, d, n] - C_reshaped = C.view(batch_size * self.num_heads, self.ssm_state_size, 1) # Shape: [b*h, n, 1] - y = torch.bmm(ssm_states_reshaped, C_reshaped) - y = y.view(batch_size, self.num_heads, self.head_dim) - - # D skip connection - # [num_heads] -> [num_heads, head_dim] - D = self.D[..., None].expand(self.D.shape[0], self.head_dim) - y = (y + hidden_states * D).to(y.dtype) - - # [bsz, num_heads, head_dim] -> [bsz, 1, intermediate_size] - y = y.reshape(batch_size, -1)[:, None, ...] - else: - # begin ssd naive implementation without einsums - dt = nn.functional.softplus(dt + self.dt_bias) - dt = torch.clamp(dt, self.time_step_limit[0], self.time_step_limit[1]) - hidden_states = hidden_states.reshape(batch_size, seq_len, -1, self.head_dim).float() - B = B.reshape(batch_size, seq_len, -1, self.ssm_state_size).float() - C = C.reshape(batch_size, seq_len, -1, self.ssm_state_size).float() - B = B.repeat(1, 1, self.num_heads // self.n_groups, 1) - C = C.repeat(1, 1, self.num_heads // self.n_groups, 1) - pad_size = (self.chunk_size - seq_len % self.chunk_size) % self.chunk_size - - D_residual = self.D[..., None] * pad_tensor_by_size(hidden_states, pad_size) - - # Discretize x and A - hidden_states = hidden_states * dt[..., None] - A = A.to(hidden_states.dtype) * dt - - # Rearrange into blocks/chunks - hidden_states, A, B, C = [reshape_into_chunks(t, pad_size, self.chunk_size) for t in (hidden_states, A, B, C)] - - # [bsz, -1, chunk_size, num_heads] -> [bsz, num_heads, -1, chunk_size] - A = A.permute(0, 3, 1, 2) - A_cumsum = torch.cumsum(A, dim=-1) - - # 1. 
Compute the output for each intra-chunk (diagonal blocks) - # This is the analog of a causal mask - L = torch.exp(segment_sum(A)) - - # Contraction of C and B to get G (attention-weights like) - G_intermediate = C[:, :, :, None, :, :] * B[:, :, None, :, :, :] # shape: (b, c, l, s, h, n) - G = G_intermediate.sum(dim=-1) # shape: (b, c, l, s, h) - - # Compute M, equivalent to applying attention mask to weights - M_intermediate = G[..., None] * L.permute(0, 2, 3, 4, 1)[..., None] - M = M_intermediate.sum(dim=-1) - - # Compute Y_diag (apply to values) - Y_diag = (M[..., None] * hidden_states[:, :, None]).sum(dim=3) - - # 2. Compute the state for each intra-chunk - # (right term of low-rank factorization of off-diagonal blocks; B terms) - decay_states = torch.exp((A_cumsum[:, :, :, -1:] - A_cumsum)) - B_decay = B * decay_states.permute(0, -2, -1, 1)[..., None] - states = (B_decay[..., None, :] * hidden_states[..., None]).sum(dim=2) - - # 3. Compute the inter-chunk SSM recurrence; produces correct SSM states at chunk boundaries - # (middle term of factorization of off-diag blocks; A terms) - if cache_params is not None and cache_position is not None and cache_position[0] > 0: - previous_states = cache_params.ssm_states[self.layer_idx][:, None, ...].to(device=states.device) - else: - previous_states = torch.zeros_like(states[:, :1]) - states = torch.cat([previous_states, states], dim=1) - decay_chunk = torch.exp(segment_sum(nn.functional.pad(A_cumsum[:, :, :, -1], (1, 0)))) - decay_chunk = decay_chunk.transpose(1, 3) - new_states = (decay_chunk[..., None, None] * states[:, :, None, ...]).sum(dim=1) - states, ssm_state = new_states[:, :-1], new_states[:, -1] - - # 4. 
Compute state -> output conversion per chunk - # (left term of low-rank factorization of off-diagonal blocks; C terms) - state_decay_out = torch.exp(A_cumsum) - C_times_states = (C[..., None, :] * states[:, :, None, ...]) - state_decay_out_permuted = state_decay_out.permute(0, 2, 3, 1) - Y_off = (C_times_states.sum(-1) * state_decay_out_permuted[..., None]) - - # Add output of intra-chunk and inter-chunk terms (diagonal and off-diagonal blocks) - y = Y_diag + Y_off - # [bsz, -1, self.chunk_size, num_heads, head_dim] -> [bsz, (padded) seq_len, num_heads, head_dim] - y = y.reshape(batch_size, -1, self.num_heads, self.head_dim) - - y = y + D_residual - # Cutting off padded chunks - if pad_size > 0: - y = y[:, :seq_len, :, :] - y = y.reshape(batch_size, seq_len, -1) - - # Init cache - if ssm_state is not None and cache_params is not None: - cache_params.update_ssm_state(layer_idx=self.layer_idx, new_ssm_state=ssm_state) - - scan_output = self.norm(y, gate) - - # end ssd naive - - # 4. Final linear projection - contextualized_states = self.out_proj(scan_output.to(dtype)) # [batch, seq_len, hidden_size] - return contextualized_states - # fmt: on - - def forward( - self, - hidden_states, - cache_params: Optional[HybridMambaAttentionDynamicCache] = None, - cache_position: Optional[torch.LongTensor] = None, - attention_mask: Optional[torch.Tensor] = None, - ): - if is_fast_path_available and "cuda" in self.in_proj.weight.device.type: - return self.cuda_kernels_forward(hidden_states, cache_params, cache_position, attention_mask) - dtype = hidden_states.dtype - if attention_mask is not None and attention_mask.shape[1] > 1 and attention_mask.shape[0] > 1: - # tune out hidden states for pad tokens, see https://github.com/state-spaces/mamba/issues/66 - hidden_states = (hidden_states * attention_mask[:, :, None]).to(dtype) - - return self.torch_forward(hidden_states, cache_params, cache_position, attention_mask) - - -class NemotronHRMSNorm(nn.Module): - def __init__(self, 
hidden_size, eps=1e-6): - """ - NemotronHRMSNorm is equivalent to T5LayerNorm and LlamaRMSNorm - """ - super().__init__() - self.weight = nn.Parameter(torch.ones(hidden_size)) - self.variance_epsilon = eps - - def forward(self, hidden_states): - input_dtype = hidden_states.dtype - hidden_states = hidden_states.to(torch.float32) - variance = hidden_states.pow(2).mean(-1, keepdim=True) - hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) - # Weights are in float32 - return (self.weight.to(torch.float32) * hidden_states).to(input_dtype) - -class NemotronHBlock(nn.Module): - def __init__(self, config, layer_idx): - super().__init__() - self.config = config - self.layer_idx = layer_idx - self.residual_in_fp32 = config.residual_in_fp32 - self.norm = NemotronHRMSNorm(config.hidden_size, eps=config.layer_norm_epsilon) - - # M: Mamba2, *: Attention, -: MLP - self.block_type = config.layers_block_type[layer_idx] - if self.block_type == "mamba": - self.mixer = NemotronHMamba2Mixer(config, layer_idx=layer_idx) - elif self.block_type == "attention": - self.mixer = NEMOTRONH_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx=layer_idx) - elif self.block_type == "mlp": - self.mixer = NemotronHMLP(config, layer_idx=layer_idx) - else: - raise ValueError(f"Invalid layer pattern {config.hybrid_override_pattern[layer_idx]}") - - def forward( - self, - hidden_states, - cache_params: Optional[HybridMambaAttentionDynamicCache] = None, - cache_position: Optional[torch.LongTensor] = None, - attention_mask: Optional[torch.Tensor] = None, - ): - with torch.cuda.stream(torch.cuda.default_stream(hidden_states.device)): - # * Use torch.cuda.stream() to avoid NaN issues when using multiple GPUs - residual = hidden_states - hidden_states = self.norm(hidden_states.to(dtype=self.norm.weight.dtype)) - if self.residual_in_fp32: - residual = residual.to(torch.float32) - - if self.block_type == "mamba": - hidden_states = self.mixer( - hidden_states, 
cache_params=cache_params, cache_position=cache_position - ) - elif self.block_type == "attention": - hidden_states = self.mixer( - hidden_states, cache_position=cache_position - ) - hidden_states = hidden_states[0] - elif self.block_type == "mlp": - hidden_states = self.mixer( - hidden_states - ) - else: - raise ValueError(f"Invalid block_type: {self.block_type}") - - hidden_states = residual + hidden_states - return hidden_states - - -# Copied from transformers.models.nemotron.modeling_nemotron Nemotron->NemotronH -class NemotronHMLP(nn.Module): - def __init__(self, config, layer_idx: Optional[int] = None): - super().__init__() - self.config = config - self.layer_idx = layer_idx - if layer_idx is None: - logger.warning_once( - f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will " - "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` " - "when creating this class." - ) - self.hidden_size = config.hidden_size - #intermediate_size = config.expand * config.hidden_size - self.intermediate_size = config.intermediate_size - self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias) - self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.mlp_bias) - self.act_fn = ACT2FN[config.mlp_hidden_act] - - def forward(self, x): - return self.down_proj(self.act_fn(self.up_proj(x))) - - -# Copied from transformers.models.llama.modeling_llama.repeat_kv -def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: - """ - This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). 
The hidden states go from (batch, - num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) - """ - batch, num_key_value_heads, slen, head_dim = hidden_states.shape - if n_rep == 1: - return hidden_states - hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) - return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) - - -class NemotronHAttention(nn.Module): - """Multi-headed attention from 'Attention Is All You Need' paper""" - - def __init__(self, config: NemotronHConfig, layer_idx: Optional[int] = None): - super().__init__() - self.config = config - self.layer_idx = layer_idx - if layer_idx is None: - logger.warning_once( - f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will " - "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` " - "when creating this class." - ) - - self.attention_dropout = config.attention_dropout - self.hidden_size = config.hidden_size - self.num_heads = config.num_attention_heads - if config.head_dim is not None: - self.head_dim = config.head_dim - else: - self.head_dim = config.hidden_size // config.num_attention_heads - self.num_key_value_heads = config.num_key_value_heads - self.num_key_value_groups = self.num_heads // self.num_key_value_heads - self.max_position_embeddings = config.max_position_embeddings - self.is_causal = True - - self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias) - self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias) - self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias) - self.o_proj = nn.Linear(self.head_dim * self.num_heads, self.hidden_size, bias=config.attention_bias) - - def forward( - self, - hidden_states: torch.Tensor, - # 
position_embeddings: Tuple[torch.Tensor, torch.Tensor], #TODO - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[HybridMambaAttentionDynamicCache] = None, - output_attentions: bool = False, - use_cache: bool = False, - cache_position: Optional[torch.LongTensor] = None, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - if past_key_value is not None: - key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx) - - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - - causal_mask = attention_mask - if attention_mask is not None: # no matter the length, we just slice it - causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] - - if query_states.device.type == "cuda" and attention_mask is not None: - query_states = query_states.contiguous() - key_states = key_states.contiguous() - value_states = value_states.contiguous() - - is_causal = True if causal_mask is None and q_len > 1 else False - - attn_output = torch.nn.functional.scaled_dot_product_attention( - query_states, - key_states, - value_states, - attn_mask=causal_mask, - dropout_p=self.attention_dropout if self.training else 0.0, - is_causal=is_causal, - ) - attn_output = attn_output.transpose(1, 2).contiguous() - #attn_output = attn_output.view(bsz, q_len, self.hidden_size) - attn_output = attn_output.view(bsz, q_len, 
self.num_heads * self.head_dim) - - attn_output = self.o_proj(attn_output) - - return attn_output, None, past_key_value - - -# Adapted from transformers.models.mistral.modeling_mistral.MistralFlashAttention2 with Mistral->Jamba -#class JambaFlashAttention2(JambaAttention): -class NemotronHFlashAttention2(NemotronHAttention): - """ - Jamba flash attention module. This module inherits from `JambaAttention` as the weights of the module stays - untouched. The only required change would be on the forward pass where it needs to correctly call the public API of - flash attention and deal with padding tokens in case the input contains any of them. - """ - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. - # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. - # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left). 
- self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10() - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[HybridMambaAttentionDynamicCache] = None, - output_attentions: bool = False, - use_cache: bool = False, - cache_position: Optional[torch.LongTensor] = None, - **kwargs, - ): - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - # Flash attention requires the input to have the shape - # batch_size x seq_length x head_dim x hidden_dim - # therefore we just need to keep the original shape - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - if past_key_value is not None: - key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx) - - # repeat k/v heads if n_kv_heads < n_heads - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - dropout_rate = 0.0 if not self.training else self.attention_dropout - - # In PEFT, usually we cast the layer norms in float32 for training stability reasons - # therefore the input hidden states gets silently casted in float32. Hence, we need - # cast them back in float16 just to be sure everything works as expected. 
- input_dtype = query_states.dtype - if input_dtype == torch.float32: - if torch.is_autocast_enabled(): - target_dtype = torch.get_autocast_gpu_dtype() - # Handle the case where the model is quantized - elif hasattr(self.config, "_pre_quantization_dtype"): - target_dtype = self.config._pre_quantization_dtype - else: - target_dtype = self.q_proj.weight.dtype - - logger.warning_once( - f"The input hidden states seems to be silently casted in float32, this might be related to" - f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" - f" {target_dtype}." - ) - - query_states = query_states.to(target_dtype) - key_states = key_states.to(target_dtype) - value_states = value_states.to(target_dtype) - - # Reashape to the expected shape for Flash Attention - key_states = key_states.transpose(1, 2) - value_states = value_states.transpose(1, 2) - - attn_output = _flash_attention_forward( - query_states, - key_states, - value_states, - attention_mask, - q_len, - dropout=dropout_rate, - sliding_window=getattr(self.config, "sliding_window", None), - is_causal=self.is_causal, - use_top_left_mask=self._flash_attn_uses_top_left_mask, - ) - - #attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous() - attn_output = attn_output.reshape(bsz, q_len, self.num_heads * self.head_dim).contiguous() - attn_output = self.o_proj(attn_output) - - if not output_attentions: - attn_weights = None - - return attn_output, attn_weights, past_key_value - - -# Adapted from transformers.models.mistral.modeling_mistral.MistralSdpaAttention with Mistral->Jamba -#class JambaSdpaAttention(JambaAttention): -class NemotronHSdpaAttention(NemotronHAttention): - """ - Jamba attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from - `JambaAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to - SDPA API. 
- """ - - # Adapted from NemotronHAttention.forward - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[HybridMambaAttentionDynamicCache] = None, - output_attentions: bool = False, - use_cache: bool = False, - cache_position: Optional[torch.LongTensor] = None, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - if output_attentions: - # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. - logger.warning_once( - "NemotronHModel is using NemotronHSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " - 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' 
- ) - return super().forward( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - ) - - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - if past_key_value is not None: - key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx) - - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - - causal_mask = attention_mask - if attention_mask is not None: - causal_mask = causal_mask[:, :, :, : key_states.shape[-2]] - - # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask, - # Reference: https://github.com/pytorch/pytorch/issues/112577. - if query_states.device.type == "cuda" and attention_mask is not None: - query_states = query_states.contiguous() - key_states = key_states.contiguous() - value_states = value_states.contiguous() - - # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment - # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling. - # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1. 
- is_causal = True if self.is_causal and causal_mask is None and q_len > 1 else False - - attn_output = torch.nn.functional.scaled_dot_product_attention( - query_states, - key_states, - value_states, - attn_mask=causal_mask, - dropout_p=self.attention_dropout if self.training else 0.0, - is_causal=is_causal, - ) - - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.view(bsz, q_len, self.hidden_size) - - attn_output = self.o_proj(attn_output) - - return attn_output, None, past_key_value - - -NEMOTRONH_ATTENTION_CLASSES = { - "eager": NemotronHAttention, - "flash_attention_2": NemotronHFlashAttention2, - "sdpa": NemotronHSdpaAttention, -} - -# Copied from transformers.models.mamba.modeling_mamba2.Mamba2PreTrainedModel -class NemotronHPreTrainedModel(PreTrainedModel): - """ - An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained - models. - """ - - config_class = NemotronHConfig - base_model_prefix = "backbone" - _no_split_modules = ["NemotronHBlock"] - supports_gradient_checkpointing = True - _is_stateful = True - _supports_flash_attn_2 = True - - def _init_weights(self, module): - """Initialize the weights.""" - if isinstance(module, NemotronHMamba2Mixer): - if getattr(module.dt_bias, "_is_hf_initialized", False): - return - module.A_log._no_weight_decay = True - module.D._no_weight_decay = True - - dt = torch.exp( - torch.rand(self.config.mamba_num_heads) - * (math.log(self.config.time_step_max) - math.log(self.config.time_step_min)) - + math.log(self.config.time_step_min) - ).clamp(min=self.config.time_step_floor) - - # # Inverse of softplus: https://github.com/pytorch/pytorch/issues/72759 - inv_dt = dt + torch.log(-torch.expm1(-dt)) - with torch.no_grad(): - module.dt_bias.copy_(inv_dt) - module.dt_bias._no_reinit = True - - if isinstance(module, nn.Linear): - if module.bias is not None: - if not getattr(module.bias, "_no_reinit", False): - nn.init.zeros_(module.bias) - 
elif isinstance(module, nn.Embedding): - nn.init.normal_(module.weight, std=self.config.initializer_range) - - # TODO: Check - if self.config.rescale_prenorm_residual: - # Reinitialize selected weights subject to the OpenAI GPT-2 Paper Scheme: - # > A modified initialization which accounts for the accumulation on the residual path with model depth. Scale - # > the weights of residual layers at initialization by a factor of 1/√N where N is the # of residual layers. - # > -- GPT-2 :: https://openai.com/blog/better-language-models/ - # - # Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py - for name, p in module.named_parameters(): - if getattr(p, "_is_hf_initialized", False): - continue - if name in ["out_proj.weight"]: - # Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block - # Following Pytorch init, except scale by 1/sqrt(2 * n_layer) - # We need to reinit p since this code could be called multiple times - # Having just p *= scale would repeatedly scale it down - nn.init.kaiming_uniform_(p, a=math.sqrt(5)) - with torch.no_grad(): - p /= math.sqrt(self.config.num_hidden_layers) - - -@dataclass -# Copied from transformers.models.mamba.modeling_mamba2.Mamba2Output with MAMBA2->NemotronH,Mamba2->NemotronH -class NemotronHOutput(ModelOutput): - """ - Class for the NemotronH model outputs. - - Args: - last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): - Sequence of hidden-states at the output of the last layer of the model. - cache_params (`HybridMambaAttentionDynamicCache`): - The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to - avoid providing the old `input_ids`. 
- - Includes both the State space model state matrices after the selective scan, and the Convolutional states - hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + - one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. - """ - - last_hidden_state: Optional[torch.FloatTensor] = None - cache_params: Optional[HybridMambaAttentionDynamicCache] = None - hidden_states: Optional[Tuple[torch.FloatTensor]] = None - attentions: Optional[Tuple[torch.FloatTensor]] = None - - -@dataclass -# Copied from transformers.models.mamba2.modeling_mamba2.MambaCausalLMOutput with Mamba2->NemotronH -class NemotronHCausalLMOutput(ModelOutput): - """ - Base class for causal language model (or autoregressive) outputs. - - Args: - loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): - Language modeling loss (for next-token prediction). - logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): - Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). - cache_params (`HybridMambaAttentionDynamicCache`): - The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to - avoid providing the old `input_ids`. 
- - Includes both the State space model state matrices after the selective scan, and the Convolutional states - hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + - one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. - """ - - loss: Optional[torch.FloatTensor] = None - logits: Optional[torch.FloatTensor] = None - cache_params: Optional[HybridMambaAttentionDynamicCache] = None - hidden_states: Optional[Tuple[torch.FloatTensor]] = None - attentions: Optional[Tuple[torch.FloatTensor]] = None - - -NEMOTRONH_START_DOCSTRING = r""" - - This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the - library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads - etc.) - - This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. - Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage - and behavior. - - Parameters: - config ([`NemotronHConfig`]): Model configuration class with all the parameters of the model. - Initializing with a config file does not load the weights associated with the model, only the - configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. -""" - -NEMOTRONH_INPUTS_DOCSTRING = r""" - Args: - input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`, *optional*): - Indices of input sequence tokens in the vocabulary. 
- - If `cache_params.seqlen_offset>0`, only `input_ids` that do not have their past calculated should be passed as - `input_ids`. - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - [What are input IDs?](../glossary#input-ids) - inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): - Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This - is useful if you want more control over how to convert `input_ids` indices into associated vectors than the - model's internal embedding lookup matrix. - position_ids (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Indices of positions of each input sequence tokens in the position embeddings. - cache_params (`HybridMambaAttentionDynamicCache`, *optional*): - If passed along, the model uses the previous state in all the blocks (which will give the output for the - `input_ids` provided as if the model add `state_input_ids + input_ids` as context). - use_cache (`bool`, *optional*): - If set to `True`, the `cache_params` is returned and can be used to quickly generate the next logits. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. - cache_position (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - The position of the current input in the cache. This is used to ensure that the cache is correctly updated. - If `cache_params` is passed, `cache_position` should also be passed. 
- attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - [What are attention masks?](../glossary#attention-mask) -""" - - -@add_start_docstrings( - "The bare NemotronH Model transformer outputting raw hidden-states without any specific head on top.", - NEMOTRONH_START_DOCSTRING, -) -class NemotronHModel(NemotronHPreTrainedModel): - def __init__(self, config): - super().__init__(config) - - self.embeddings = nn.Embedding(config.vocab_size, config.hidden_size) - self.layers = nn.ModuleList([NemotronHBlock(config, layer_idx=idx) for idx in range(config.num_hidden_layers)]) - - self.gradient_checkpointing = False - self.norm_f = NemotronHRMSNorm(config.hidden_size, eps=config.layer_norm_epsilon) - # Initialize weights and apply final processing - self._register_load_state_dict_pre_hook(self.load_hook) - self.post_init() - - def load_hook(self, state_dict, prefix, *args): - for k in state_dict: - if "embedding." 
in k: - state_dict[k.replace("embedding.", "embeddings.")] = state_dict.pop(k) - break - - def get_input_embeddings(self): - return self.embeddings - - def set_input_embeddings(self, new_embeddings): - self.embeddings = new_embeddings - - @add_start_docstrings_to_model_forward(NEMOTRONH_INPUTS_DOCSTRING) - @add_code_sample_docstrings( - checkpoint=_CHECKPOINT_FOR_DOC, - output_type=NemotronHOutput, - config_class=_CONFIG_FOR_DOC, - ) - def forward( - self, - input_ids: Optional[torch.LongTensor] = None, - inputs_embeds: Optional[torch.LongTensor] = None, - position_ids: Optional[torch.LongTensor] = None, - cache_params: Optional[HybridMambaAttentionDynamicCache] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - cache_position: Optional[torch.LongTensor] = None, - attention_mask: Optional[torch.Tensor] = None, - **kwargs, - ) -> Union[Tuple, NemotronHOutput]: - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - # use_cache = use_cache if use_cache is not None else self.config.use_cache - use_cache = use_cache if use_cache is not None else (self.config.use_cache if not self.training else False) - - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - if (input_ids is None) ^ (inputs_embeds is not None): # ^ is python for xor - raise ValueError("You must specify exactly one of input_ids or inputs_embeds") - - if inputs_embeds is None: - inputs_embeds = self.embeddings(input_ids) - - if self.gradient_checkpointing and self.training and use_cache: - logger.warning_once( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`." 
- ) - use_cache = False - - # From zamba_modeling.py - if use_cache and cache_params is None: - logger.warning_once( - "NemotronH requires an initialized `NemotronHHybridDynamicCache` to return a cache. None was " - "provided, so no cache will be returned." - ) - - hidden_states = inputs_embeds - - if cache_position is None: - cache_position = torch.arange(hidden_states.shape[1], device=hidden_states.device) - if position_ids is None: - position_ids = cache_position.unsqueeze(0) - - causal_mask = self._update_causal_mask(attention_mask, inputs_embeds, cache_position) - mamba_mask = self._update_mamba_mask(attention_mask, cache_position) - - all_hidden_states = () if output_hidden_states else None - all_self_attns = () if output_attentions else None - # Until HERE - - for layer_idx, mixer_block in enumerate(self.layers): - # Depending on the layer type we opt for 2D base attention mask (Mamba) or 4D causal mask (Attention) - if mixer_block.block_type == "mamba": - layer_mask = mamba_mask - elif mixer_block.block_type == "attention": - layer_mask = causal_mask - elif mixer_block.block_type == "mlp": - layer_mask = None - else: - raise ValueError(f"Invalid block_type: {self.block_type}") - - if output_hidden_states: - all_hidden_states += (hidden_states,) - - if self.gradient_checkpointing and self.training: - hidden_states = self._gradient_checkpointing_func( - mixer_block.__call__, hidden_states, cache_params, cache_position, layer_mask - ) - else: - hidden_states = mixer_block( - hidden_states, - cache_params=cache_params, - cache_position=cache_position, - attention_mask=layer_mask, - ) - - # TODO: Store attentions - # if output_attentions: - # if layer_outputs[1] is not None: - # # append attentions only of attention layers. Mamba layers return `None` as the attention weights - # all_self_attns += (layer_outputs[1],) - - # TODO (Check): should it happen before the forward pass? 
- # if output_hidden_states: - # all_hidden_states = all_hidden_states + (hidden_states,) - - hidden_states = self.norm_f(hidden_states) - - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - if not return_dict: - return tuple(v for v in [hidden_states, cache_params, all_hidden_states] if v is not None) - - return NemotronHOutput( - last_hidden_state=hidden_states, - cache_params=cache_params if use_cache else None, - hidden_states=all_hidden_states, - attentions=all_self_attns, - ) - - # Copied from transformers.models.jamba.modeling_jamba.JambaModel._update_causal_mask - def _update_causal_mask(self, attention_mask, input_tensor, cache_position): - if self.config._attn_implementation == "flash_attention_2": - if attention_mask is not None and 0.0 in attention_mask: - return attention_mask - return None - - dtype, device = input_tensor.dtype, input_tensor.device - min_dtype = torch.finfo(dtype).min - sequence_length = input_tensor.shape[1] - target_length = cache_position[-1] + 1 - - causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device) - if sequence_length != 1: - causal_mask = torch.triu(causal_mask, diagonal=1) - causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1) - causal_mask = causal_mask[None, None, :, :].expand(input_tensor.shape[0], 1, -1, -1) - if attention_mask is not None: - causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit - if attention_mask.dim() == 2: - mask_length = attention_mask.shape[-1] - padding_mask = causal_mask[..., :mask_length].eq(0.0) * attention_mask[:, None, None, :].eq(0.0) - causal_mask[..., :mask_length] = causal_mask[..., :mask_length].masked_fill(padding_mask, min_dtype) - - if ( - self.config._attn_implementation == "sdpa" - and attention_mask is not None - and attention_mask.device.type == "cuda" - ): - # Attend to all tokens in fully masked rows in the causal_mask, for 
example the relevant first rows when - # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path. - # Details: https://github.com/pytorch/pytorch/issues/110213 - causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype) - - return causal_mask - - def _update_mamba_mask(self, attention_mask, cache_position): - """ - No need for zeroing states when - 1. Cached forward - 2. Attending to all inputs - """ - mamba_mask = attention_mask - if cache_position[0] > 0 or (attention_mask is not None and torch.all(attention_mask == 1)): - mamba_mask = None - return mamba_mask - - -@add_start_docstrings( - """ - The NEMOTRONH Model transformer with a language modeling head on top (linear layer with weights not tied to the input - embeddings). - """, - NEMOTRONH_START_DOCSTRING, -) -class NemotronHForCausalLM(NemotronHPreTrainedModel, GenerationMixin): - _tied_weights_keys = ["lm_head.weight"] - - def __init__(self, config): - super().__init__(config) - self.backbone = NemotronHModel(config) - self.vocab_size = config.vocab_size - self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.backbone.get_input_embeddings() - - def set_input_embeddings(self, new_embeddings): - return self.backbone.set_input_embeddings(new_embeddings) - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - - def get_decoder(self): - return self.model - - def set_decoder(self, decoder): - self.model = decoder - - def prepare_inputs_for_generation( - self, - input_ids, - past_key_values=None, - attention_mask=None, - inputs_embeds=None, - cache_position=None, - position_ids=None, - use_cache=True, - **kwargs, - ): - # Copy from 
https://github.com/huggingface/transformers/blob/main/src/transformers/models/jamba/modeling_jamba.py - # Overwitten -- uses `cache_params` as opposed to `past_key_values` - empty_past_kv = past_key_values is None - - # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens - # Exception 1: when passing input_embeds, input_ids may be missing entries - # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here - # Exception 3: with synced GPUs cache_position may go out of bounds, but we only want dummy token in that case. - # (we can't check exception 3 while compiling) - if not empty_past_kv: - if ( - inputs_embeds is not None # Exception 1 - or cache_position[-1] >= input_ids.shape[1] # Exception 3 - ): - input_ids = input_ids[:, -cache_position.shape[0] :] - elif input_ids.shape[1] != cache_position.shape[0]: # Default case (the "else", a no op, is Exception 2) - input_ids = input_ids[:, cache_position] - else: - past_key_values = HybridMambaAttentionDynamicCache( - self.config, input_ids.shape[0], self.dtype, device=self.device - ) - - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if not empty_past_kv: - position_ids = position_ids[:, -input_ids.shape[1] :] - - # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and empty_past_kv: - # TODO(pjin): workaround fix for properly extending inputs_embeds; - # longer term, may be better handled elsewhere in .generate(). 
- if input_ids is not None and inputs_embeds.shape[1] < input_ids.shape[1]: - new_token_embeds = self.get_input_embeddings()(input_ids[:,inputs_embeds.shape[1]:]) - inputs_embeds = torch.cat([inputs_embeds, new_token_embeds], dim=1) - model_inputs = {"inputs_embeds": inputs_embeds} - else: - model_inputs = {"input_ids": input_ids.contiguous()} # `contiguous()` needed for compilation use cases - - model_inputs.update( - { - "position_ids": position_ids, - "past_key_values": past_key_values, - "use_cache": use_cache, - "attention_mask": attention_mask, - "logits_to_keep": self.config.num_logits_to_keep, - "cache_position": cache_position, - } - ) - return model_inputs - - @add_start_docstrings_to_model_forward(NEMOTRONH_INPUTS_DOCSTRING) - @add_code_sample_docstrings( - checkpoint=_CHECKPOINT_FOR_DOC, - output_type=NemotronHCausalLMOutput, - config_class=_CONFIG_FOR_DOC, - ) - def forward( - self, - input_ids: Optional[torch.LongTensor] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - position_ids: Optional[torch.LongTensor] = None, - cache_params: Optional[HybridMambaAttentionDynamicCache] = None, - labels: Optional[torch.LongTensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - use_cache: Optional[bool] = None, - cache_position: Optional[torch.Tensor] = None, - attention_mask: Optional[torch.Tensor] = None, - **kwargs, # for now we need this for generation - ) -> Union[Tuple, NemotronHCausalLMOutput]: - r""" - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. 
you can set - `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100` - are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]` - """ - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - nemotron_h_outputs = self.backbone( - input_ids, - cache_params=cache_params, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - use_cache=use_cache, - cache_position=cache_position, - attention_mask=attention_mask, - ) - hidden_states = nemotron_h_outputs[0] - - # TODO: Check zamba_modeling.py: https://github.com/huggingface/transformers/blob/d7188ba600e36d3fd191b12e19f1b3bb81a8404f/src/transformers/models/zamba/modeling_zamba.py#L1284C1-L1286C2 - #logits = self.lm_head(hidden_states.to(self.lm_head.weight.dtype)).float() - logits = self.lm_head(hidden_states.to(self.lm_head.weight.dtype)).float() - - loss = None - if labels is not None: - # move labels to correct device to enable model parallelism - labels = labels.to(logits.device) - # Shift so that tokens < n predict n - shift_logits = logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - loss_fct = CrossEntropyLoss() - loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) - - if not return_dict: - output = (logits,) + nemotron_h_outputs[1:] - return ((loss,) + output) if loss is not None else output - - return NemotronHCausalLMOutput( - loss=loss, - logits=logits, - cache_params=nemotron_h_outputs.cache_params, - hidden_states=nemotron_h_outputs.hidden_states, - 
attentions=nemotron_h_outputs.attentions, - ) diff --git a/services/core/models/tests/integration/parallelism/fixtures/nvidia/nemotron-4-340b-instruct/model_config.yaml b/services/core/models/tests/integration/parallelism/fixtures/nvidia/nemotron-4-340b-instruct/model_config.yaml deleted file mode 100644 index dec06eac9c..0000000000 --- a/services/core/models/tests/integration/parallelism/fixtures/nvidia/nemotron-4-340b-instruct/model_config.yaml +++ /dev/null @@ -1,261 +0,0 @@ -mcore_gpt: true -micro_batch_size: 1 -global_batch_size: 256 -tensor_model_parallel_size: 8 -pipeline_model_parallel_size: 4 -virtual_pipeline_model_parallel_size: null -encoder_seq_length: 4096 -max_position_embeddings: 4096 -num_layers: 96 -hidden_size: 18432 -ffn_hidden_size: 73728 -num_attention_heads: 96 -init_method_std: 0.0063 -use_scaled_init_method: true -hidden_dropout: 0.0 -attention_dropout: 0.0 -ffn_dropout: 0.0 -kv_channels: null -apply_query_key_layer_scaling: true -normalization: layernorm1p -layernorm_epsilon: 1.0e-05 -do_layer_norm_weight_decay: false -make_vocab_size_divisible_by: 128 -pre_process: true -post_process: true -persist_layer_norm: true -bias: false -activation: squared-relu -headscale: false -transformer_block_type: pre_ln -openai_gelu: false -normalize_attention_scores: true -position_embedding_type: rope -rotary_percentage: 0.5 -attention_type: multihead -share_embeddings_and_output_weights: false -num_query_groups: 8 -tokenizer: - library: sentencepiece - type: null - model: nemo:8223bf8eaa194eb8920af568bb52e2d0_megatron_2.model - vocab_file: null - merge_file: null - tokenizer_model: nemo:eb5528fdec5c4083affa2c97958eeef7_megatron_2.model - sentencepiece_legacy: false -native_amp_init_scale: 4294967296 -native_amp_growth_interval: 1000 -hysteresis: 2 -fp32_residual_connection: false -fp16_lm_cross_entropy: false -megatron_amp_O2: true -grad_allreduce_chunk_size_mb: 125 -grad_div_ar_fusion: true -gradient_accumulation_fusion: false -bias_activation_fusion: 
false -bias_dropout_add_fusion: false -masked_softmax_fusion: true -seed: 1234 -resume_from_checkpoint: null -use_cpu_initialization: false -onnx_safe: false -apex_transformer_log_level: 30 -gradient_as_bucket_view: false -sync_batch_comm: false -activations_checkpoint_granularity: null -activations_checkpoint_method: null -activations_checkpoint_num_layers: 1 -num_micro_batches_with_partial_activation_checkpoints: null -activations_checkpoint_layers_per_pipeline: null -sequence_parallel: false -transformer_engine: false -fp8: false -fp8_e4m3: false -fp8_hybrid: false -fp8_margin: 0 -fp8_interval: 1 -fp8_amax_history_len: 1 -fp8_amax_compute_algo: most_recent -reduce_amax: true -use_emha: false -optim: - name: distributed_fused_adam - lr: 3.001e-07 - weight_decay: 0.1 - betas: - - 0.9 - - 0.98 - sched: - name: CosineAnnealing - warmup_steps: 10 - constant_steps: 400 - min_lr: 3.0e-07 - bucket_cap_mb: 200 - overlap_grad_sync: false - contiguous_grad_buffer: true -precision: bf16-mixed -data: - chat: true - chat_prompt_tokens: - system_turn_start: - turn_start: - label_start: - end_of_turn: ' - - ' - end_of_name: ' - - ' - sample: true - num_workers: 2 - dataloader_type: single - train_ds: - file_path: /dataset/train.jsonl - global_batch_size: 128 - micro_batch_size: 1 - shuffle: true - memmap_workers: null - max_seq_length: 4096 - min_seq_length: 1 - drop_last: true - concat_sampling_probabilities: null - label_key: output - add_eos: false - add_sep: false - add_bos: false - truncation_field: input - index_mapping_dir: /indexmap_dir - prompt_template: 'System - - {system message} - - User - - {turn 1 user message} - - Assistant - - {turn 1 assistant label} - - {turn 1 assistant message} - - User - - {turn 2 user message} - - Assistant - - {turn 2 assistant label} - - {turn 2 assistant message} - - ' - hf_dataset: true - truncation_method: right - validation_ds: - file_path: /dataset/val.jsonl - names: null - global_batch_size: 128 - micro_batch_size: 1 - shuffle: 
false - memmap_workers: null - max_seq_length: 4096 - min_seq_length: 1 - drop_last: false - label_key: output - add_eos: false - add_sep: false - add_bos: false - write_predictions_to_file: false - output_file_path_prefix: null - truncation_field: input - index_mapping_dir: /indexmap_dir - prompt_template: 'System - - {system message} - - User - - {turn 1 user message} - - Assistant - - {turn 1 assistant label} - - {turn 1 assistant message} - - User - - {turn 2 user message} - - Assistant - - {turn 2 assistant label} - - {turn 2 assistant message} - - ' - tokens_to_generate: 32 - hf_dataset: true - truncation_method: right - metric: - name: loss - average: null - num_classes: null - test_ds: - prompt_template: 'System - - {system message} - - User - - {turn 1 user message} - - Assistant - - {turn 1 assistant label} - - {turn 1 assistant message} - - User - - {turn 2 user message} - - Assistant - - {turn 2 assistant label} - - {turn 2 assistant message} - - ' - data_impl: jsonl - splits_string: null - seq_length: 4096 - skip_warmup: true - reset_position_ids: false - reset_attention_mask: false - eod_mask_loss: false - index_mapping_dir: /indexmap_dir - data_prefix: - train: - - /datasets/train.jsonl - validation: - - /datasets/val.jsonl - test: - - /datasets/val.jsonl -answer_only_loss: true -restore_from_path: /models/340B_base -save_nemo_on_validation_end: true -use_flash_attention: null -pipeline_model_parallel_split_rank: 0 -dpo: - log_prob_forward_micro_batch_size: 2 - ref_policy_kl_penalty: 0.3 - average_log_probs: false - sft_loss_coeff: 1.0e-05 - optimize_ref_policy_kl_penalty: false - preference_loss: reward_rev_dpo - gt_reward_scale: 1.0 -apply_rope_fusion: false -target: nemo.collections.nlp.models.language_modeling.megatron_gpt_model.MegatronGPTModel -nemo_version: 1.22.0 diff --git a/services/core/models/tests/integration/parallelism/fixtures/openai/gpt-oss-120b/config.json 
b/services/core/models/tests/integration/parallelism/fixtures/openai/gpt-oss-120b/config.json deleted file mode 100644 index 42300b8993..0000000000 --- a/services/core/models/tests/integration/parallelism/fixtures/openai/gpt-oss-120b/config.json +++ /dev/null @@ -1,88 +0,0 @@ -{ - "architectures": [ - "GptOssForCausalLM" - ], - "attention_bias": true, - "attention_dropout": 0.0, - "eos_token_id": 200002, - "experts_per_token": 4, - "head_dim": 64, - "hidden_act": "silu", - "hidden_size": 2880, - "initial_context_length": 4096, - "initializer_range": 0.02, - "intermediate_size": 2880, - "layer_types": [ - "sliding_attention", - "full_attention", - "sliding_attention", - "full_attention", - "sliding_attention", - "full_attention", - "sliding_attention", - "full_attention", - "sliding_attention", - "full_attention", - "sliding_attention", - "full_attention", - "sliding_attention", - "full_attention", - "sliding_attention", - "full_attention", - "sliding_attention", - "full_attention", - "sliding_attention", - "full_attention", - "sliding_attention", - "full_attention", - "sliding_attention", - "full_attention", - "sliding_attention", - "full_attention", - "sliding_attention", - "full_attention", - "sliding_attention", - "full_attention", - "sliding_attention", - "full_attention", - "sliding_attention", - "full_attention", - "sliding_attention", - "full_attention" - ], - "max_position_embeddings": 131072, - "model_type": "gpt_oss", - "num_attention_heads": 64, - "num_experts_per_tok": 4, - "num_hidden_layers": 36, - "num_key_value_heads": 8, - "num_local_experts": 128, - "output_router_logits": false, - "pad_token_id": 199999, - "quantization_config": { - "modules_to_not_convert": [ - "model.layers.*.self_attn", - "model.layers.*.mlp.router", - "model.embed_tokens", - "lm_head" - ], - "quant_method": "mxfp4" - }, - "rms_norm_eps": 1e-05, - "rope_scaling": { - "beta_fast": 32.0, - "beta_slow": 1.0, - "factor": 32.0, - "original_max_position_embeddings": 4096, - 
"rope_type": "yarn", - "truncate": false - }, - "rope_theta": 150000, - "router_aux_loss_coef": 0.9, - "sliding_window": 128, - "swiglu_limit": 7.0, - "tie_word_embeddings": false, - "transformers_version": "4.55.0.dev0", - "use_cache": true, - "vocab_size": 201088 -} diff --git a/services/core/models/tests/integration/parallelism/fixtures/openai/gpt-oss-20b/config.json b/services/core/models/tests/integration/parallelism/fixtures/openai/gpt-oss-20b/config.json deleted file mode 100644 index 8fb5a4a033..0000000000 --- a/services/core/models/tests/integration/parallelism/fixtures/openai/gpt-oss-20b/config.json +++ /dev/null @@ -1,76 +0,0 @@ -{ - "architectures": [ - "GptOssForCausalLM" - ], - "attention_bias": true, - "attention_dropout": 0.0, - "eos_token_id": 200002, - "experts_per_token": 4, - "head_dim": 64, - "hidden_act": "silu", - "hidden_size": 2880, - "initial_context_length": 4096, - "initializer_range": 0.02, - "intermediate_size": 2880, - "layer_types": [ - "sliding_attention", - "full_attention", - "sliding_attention", - "full_attention", - "sliding_attention", - "full_attention", - "sliding_attention", - "full_attention", - "sliding_attention", - "full_attention", - "sliding_attention", - "full_attention", - "sliding_attention", - "full_attention", - "sliding_attention", - "full_attention", - "sliding_attention", - "full_attention", - "sliding_attention", - "full_attention", - "sliding_attention", - "full_attention", - "sliding_attention", - "full_attention" - ], - "max_position_embeddings": 131072, - "model_type": "gpt_oss", - "num_attention_heads": 64, - "num_experts_per_tok": 4, - "num_hidden_layers": 24, - "num_key_value_heads": 8, - "num_local_experts": 32, - "output_router_logits": false, - "pad_token_id": 199999, - "quantization_config": { - "modules_to_not_convert": [ - "model.layers.*.self_attn", - "model.layers.*.mlp.router", - "model.embed_tokens", - "lm_head" - ], - "quant_method": "mxfp4" - }, - "rms_norm_eps": 1e-05, - "rope_scaling": { - 
"beta_fast": 32.0, - "beta_slow": 1.0, - "factor": 32.0, - "original_max_position_embeddings": 4096, - "rope_type": "yarn", - "truncate": false - }, - "rope_theta": 150000, - "router_aux_loss_coef": 0.9, - "sliding_window": 128, - "swiglu_limit": 7.0, - "tie_word_embeddings": false, - "transformers_version": "4.55.0.dev0", - "use_cache": true, - "vocab_size": 201088 -} diff --git a/services/core/models/tests/integration/parallelism/test_parallelism_hf_model_config.py b/services/core/models/tests/integration/parallelism/test_parallelism_hf_model_config.py index 967e9f49d4..304974dd35 100644 --- a/services/core/models/tests/integration/parallelism/test_parallelism_hf_model_config.py +++ b/services/core/models/tests/integration/parallelism/test_parallelism_hf_model_config.py @@ -9,8 +9,9 @@ from nmp.core.models.parallelism.api import infer_model_cfg_from_hf +# TODO: Mock HuggingFace API calls instead of accessing real gated models REQUIRES_HF_TOKEN = pytest.mark.skip( - reason="Gated HuggingFace models require authentication (no fixture available)", + reason="Gated HuggingFace models require mocking (not yet implemented)", ) GATED_MODEL_IDS = frozenset({"meta-llama/Llama-3.1-8B"}) diff --git a/services/core/models/tests/integration/parallelism/test_recent_models.py b/services/core/models/tests/integration/parallelism/test_recent_models.py index c405d11a3e..5b32c86d19 100644 --- a/services/core/models/tests/integration/parallelism/test_recent_models.py +++ b/services/core/models/tests/integration/parallelism/test_recent_models.py @@ -14,8 +14,9 @@ from nmp.core.models.parallelism.api import estimate_parallelization, find_minimum_gpus +# TODO: Mock HuggingFace API calls instead of accessing real gated models REQUIRES_HF_TOKEN = pytest.mark.skip( - reason="Gated HuggingFace models require authentication (no fixture available)", + reason="Gated HuggingFace models require mocking (not yet implemented)", ) GATED_MODEL_IDS = frozenset({"meta-llama/Llama-3.3-70B-Instruct"}) 
diff --git a/services/core/models/tests/parallelism/nemo_validation_data.py b/services/core/models/tests/parallelism/nemo_validation_data.py index 12947f4d0b..606f78ccec 100644 --- a/services/core/models/tests/parallelism/nemo_validation_data.py +++ b/services/core/models/tests/parallelism/nemo_validation_data.py @@ -10,8 +10,9 @@ import pytest +# TODO: Mock HuggingFace API calls instead of accessing real gated models REQUIRES_HF_TOKEN = pytest.mark.skip( - reason="Gated HuggingFace models require authentication (no fixture available)", + reason="Gated HuggingFace models require mocking (not yet implemented)", ) GATED_MODEL_IDS = frozenset( {