From e4df4da6c98646da33eb7828f03901c9c437a3a5 Mon Sep 17 00:00:00 2001
From: Matthew Grossman <mgrossman@nvidia.com>
Date: Wed, 10 Jun 2026 09:53:39 -0700
Subject: [PATCH 1/4] fix(tests): Mock HF in integration tests

Signed-off-by: Matthew Grossman <mgrossman@nvidia.com>
---
 .../test_parallelism_hf_model_config.py        |  3 +--
 .../parallelism/test_recent_models.py          |  3 +--
 .../tests/integration/test_models_with_auth.py | 18 +++++++++++++++++-
 .../tests/parallelism/nemo_validation_data.py  |  3 +--
 4 files changed, 20 insertions(+), 7 deletions(-)

diff --git a/services/core/models/tests/integration/parallelism/test_parallelism_hf_model_config.py b/services/core/models/tests/integration/parallelism/test_parallelism_hf_model_config.py
index 304974dd35..967e9f49d4 100644
--- a/services/core/models/tests/integration/parallelism/test_parallelism_hf_model_config.py
+++ b/services/core/models/tests/integration/parallelism/test_parallelism_hf_model_config.py
@@ -9,9 +9,8 @@
 
 from nmp.core.models.parallelism.api import infer_model_cfg_from_hf
 
-# TODO: Mock HuggingFace API calls instead of accessing real gated models
 REQUIRES_HF_TOKEN = pytest.mark.skip(
-    reason="Gated HuggingFace models require mocking (not yet implemented)",
+    reason="Gated HuggingFace models require authentication (no fixture available)",
 )
 GATED_MODEL_IDS = frozenset({"meta-llama/Llama-3.1-8B"})
 
diff --git a/services/core/models/tests/integration/parallelism/test_recent_models.py b/services/core/models/tests/integration/parallelism/test_recent_models.py
index 5b32c86d19..c405d11a3e 100644
--- a/services/core/models/tests/integration/parallelism/test_recent_models.py
+++ b/services/core/models/tests/integration/parallelism/test_recent_models.py
@@ -14,9 +14,8 @@
 
 from nmp.core.models.parallelism.api import estimate_parallelization, find_minimum_gpus
 
-# TODO: Mock HuggingFace API calls instead of accessing real gated models
 REQUIRES_HF_TOKEN = pytest.mark.skip(
-    reason="Gated HuggingFace models require mocking (not yet implemented)",
+    reason="Gated HuggingFace models require authentication (no fixture available)",
 )
 GATED_MODEL_IDS = frozenset({"meta-llama/Llama-3.3-70B-Instruct"})
 
diff --git a/services/core/models/tests/integration/test_models_with_auth.py b/services/core/models/tests/integration/test_models_with_auth.py
index 902e991342..a5f378637b 100644
--- a/services/core/models/tests/integration/test_models_with_auth.py
+++ b/services/core/models/tests/integration/test_models_with_auth.py
@@ -19,7 +19,7 @@
 
 from contextlib import contextmanager
 from typing import Generator
-from unittest.mock import patch
+from unittest.mock import Mock, patch
 
 import pytest
 from nemo_platform import NeMoPlatform, PermissionDeniedError
@@ -1292,6 +1292,22 @@ class TestTrustRemoteCodePermission:
     trust_remote_code=True requires models.trust-remote-code.set.
     """
 
+    @pytest.fixture(autouse=True)
+    def _mock_hf_storage(self):
+        """Prevent real HuggingFace API calls during fileset creation.
+
+        These tests verify authorization logic, not HF connectivity.
+        Mocking the HfApi avoids rate-limit failures in CI.
+        """
+        with patch("nmp.core.files.app.backends.huggingface.HfApi") as mock_cls:
+            mock_api = Mock()
+            mock_repo_info = Mock()
+            mock_repo_info.sha = "abc123mocked"
+            mock_repo_info.siblings = []  # skip file metadata check
+            mock_api.repo_info.return_value = mock_repo_info
+            mock_cls.return_value = mock_api
+            yield
+
     def test_create_model_trust_remote_code_true_has_permission_succeeds(self, sdk: NeMoPlatform):
         """Create with trust_remote_code=True succeeds when principal has models.trust-remote-code.set (repo not on allow list)."""
         workspace = short_unique_name("trc-has")
diff --git a/services/core/models/tests/parallelism/nemo_validation_data.py b/services/core/models/tests/parallelism/nemo_validation_data.py
index 606f78ccec..12947f4d0b 100644
--- a/services/core/models/tests/parallelism/nemo_validation_data.py
+++ b/services/core/models/tests/parallelism/nemo_validation_data.py
@@ -10,9 +10,8 @@
 
 import pytest
 
-# TODO: Mock HuggingFace API calls instead of accessing real gated models
 REQUIRES_HF_TOKEN = pytest.mark.skip(
-    reason="Gated HuggingFace models require mocking (not yet implemented)",
+    reason="Gated HuggingFace models require authentication (no fixture available)",
 )
 GATED_MODEL_IDS = frozenset(
     {

From 50602f58628b367a45d293f88b5d2dbb6385d269 Mon Sep 17 00:00:00 2001
From: Matthew Grossman <mgrossman@nvidia.com>
Date: Wed, 10 Jun 2026 09:53:49 -0700
Subject: [PATCH 2/4] add fixtures

Signed-off-by: Matthew Grossman <mgrossman@nvidia.com>
---
 .../fixtures/EleutherAI/gpt-j-6b/config.json  |   40 +
 .../EleutherAI/gpt-neox-20b/config.json       |   25 +
 .../Qwen/Qwen2.5-72B-Instruct/config.json     |   27 +
 .../fixtures/Qwen/Qwen2.5-72B/config.json     |   27 +
 .../fixtures/Qwen/Qwen2.5-7B/config.json      |   28 +
 .../fixtures/Qwen/Qwen3-4B-SafeRL/config.json |   30 +
 .../fixtures/Qwen/Qwen3-8B/config.json        |   30 +
 .../deepseek-ai/DeepSeek-V3-Base/config.json  |   67 +
 .../configuration_deepseek.py                 |  199 ++
 .../DeepSeek-V3-Base/modeling_deepseek.py     | 1848 +++++++++++++++++
 .../deepseek-llm-67b-base/config.json         |   25 +
 .../deepseek-llm-7b-base/config.json          |   25 +
 .../parallelism/fixtures/gpt2/config.json     |   31 +
 .../parallelism/fixtures/manifest.json        |   69 +
 .../fixtures/microsoft/phi-2/config.json      |   30 +
 .../fixtures/microsoft/phi-4/config.json      |   32 +
 .../mistralai/Devstral-Small-2505/config.json |   26 +
 .../mistralai/Mistral-7B-v0.1/config.json     |   24 +
 .../mistralai/Mixtral-8x7B-v0.1/config.json   |   29 +
 .../config.json                               |   28 +
 .../NVIDIA-Nemotron-Nano-9B-v2/config.json    |   56 +
 .../configuration_nemotron_h.py               |  245 +++
 .../modeling_nemotron_h.py                    | 1643 +++++++++++++++
 .../model_config.yaml                         |  261 +++
 .../fixtures/openai/gpt-oss-120b/config.json  |   88 +
 .../fixtures/openai/gpt-oss-20b/config.json   |   76 +
 26 files changed, 5009 insertions(+)
 create mode 100644 services/core/models/tests/integration/parallelism/fixtures/EleutherAI/gpt-j-6b/config.json
 create mode 100644 services/core/models/tests/integration/parallelism/fixtures/EleutherAI/gpt-neox-20b/config.json
 create mode 100644 services/core/models/tests/integration/parallelism/fixtures/Qwen/Qwen2.5-72B-Instruct/config.json
 create mode 100644 services/core/models/tests/integration/parallelism/fixtures/Qwen/Qwen2.5-72B/config.json
 create mode 100644 services/core/models/tests/integration/parallelism/fixtures/Qwen/Qwen2.5-7B/config.json
 create mode 100644 services/core/models/tests/integration/parallelism/fixtures/Qwen/Qwen3-4B-SafeRL/config.json
 create mode 100644 services/core/models/tests/integration/parallelism/fixtures/Qwen/Qwen3-8B/config.json
 create mode 100644 services/core/models/tests/integration/parallelism/fixtures/deepseek-ai/DeepSeek-V3-Base/config.json
 create mode 100644 services/core/models/tests/integration/parallelism/fixtures/deepseek-ai/DeepSeek-V3-Base/configuration_deepseek.py
 create mode 100644 services/core/models/tests/integration/parallelism/fixtures/deepseek-ai/DeepSeek-V3-Base/modeling_deepseek.py
 create mode 100644 services/core/models/tests/integration/parallelism/fixtures/deepseek-ai/deepseek-llm-67b-base/config.json
 create mode 100644 services/core/models/tests/integration/parallelism/fixtures/deepseek-ai/deepseek-llm-7b-base/config.json
 create mode 100644 services/core/models/tests/integration/parallelism/fixtures/gpt2/config.json
 create mode 100644 services/core/models/tests/integration/parallelism/fixtures/manifest.json
 create mode 100644 services/core/models/tests/integration/parallelism/fixtures/microsoft/phi-2/config.json
 create mode 100644 services/core/models/tests/integration/parallelism/fixtures/microsoft/phi-4/config.json
 create mode 100644 services/core/models/tests/integration/parallelism/fixtures/mistralai/Devstral-Small-2505/config.json
 create mode 100644 services/core/models/tests/integration/parallelism/fixtures/mistralai/Mistral-7B-v0.1/config.json
 create mode 100644 services/core/models/tests/integration/parallelism/fixtures/mistralai/Mixtral-8x7B-v0.1/config.json
 create mode 100644 services/core/models/tests/integration/parallelism/fixtures/nvidia/Mistral-NeMo-Minitron-8B-Instruct/config.json
 create mode 100644 services/core/models/tests/integration/parallelism/fixtures/nvidia/NVIDIA-Nemotron-Nano-9B-v2/config.json
 create mode 100644 services/core/models/tests/integration/parallelism/fixtures/nvidia/NVIDIA-Nemotron-Nano-9B-v2/configuration_nemotron_h.py
 create mode 100644 services/core/models/tests/integration/parallelism/fixtures/nvidia/NVIDIA-Nemotron-Nano-9B-v2/modeling_nemotron_h.py
 create mode 100644 services/core/models/tests/integration/parallelism/fixtures/nvidia/nemotron-4-340b-instruct/model_config.yaml
 create mode 100644 services/core/models/tests/integration/parallelism/fixtures/openai/gpt-oss-120b/config.json
 create mode 100644 services/core/models/tests/integration/parallelism/fixtures/openai/gpt-oss-20b/config.json

diff --git a/services/core/models/tests/integration/parallelism/fixtures/EleutherAI/gpt-j-6b/config.json b/services/core/models/tests/integration/parallelism/fixtures/EleutherAI/gpt-j-6b/config.json
new file mode 100644
index 0000000000..614ae4f4e0
--- /dev/null
+++ b/services/core/models/tests/integration/parallelism/fixtures/EleutherAI/gpt-j-6b/config.json
@@ -0,0 +1,40 @@
+{
+  "activation_function": "gelu_new",
+  "architectures": [
+    "GPTJForCausalLM"
+  ],
+  "attn_pdrop": 0.0,
+  "bos_token_id": 50256,
+  "embd_pdrop": 0.0,
+  "eos_token_id": 50256,
+  "gradient_checkpointing": false,
+  "initializer_range": 0.02,
+  "layer_norm_epsilon": 1e-05,
+  "model_type": "gptj",
+  "n_embd": 4096,
+  "n_head": 16,
+  "n_inner": null,
+  "n_layer": 28,
+  "n_positions": 2048,
+  "resid_pdrop": 0.0,
+  "rotary": true,
+  "rotary_dim": 64,
+  "scale_attn_weights": true,
+  "summary_activation": null,
+  "summary_first_dropout": 0.1,
+  "summary_proj_to_labels": true,
+  "summary_type": "cls_index",
+  "summary_use_proj": true,
+  "task_specific_params": {
+    "text-generation": {
+      "do_sample": true,
+      "max_length": 50,
+      "temperature": 1.0
+    }
+  },
+  "tie_word_embeddings": false,
+  "tokenizer_class": "GPT2Tokenizer",
+  "transformers_version": "4.18.0.dev0",
+  "use_cache": true,
+  "vocab_size": 50400
+}
diff --git a/services/core/models/tests/integration/parallelism/fixtures/EleutherAI/gpt-neox-20b/config.json b/services/core/models/tests/integration/parallelism/fixtures/EleutherAI/gpt-neox-20b/config.json
new file mode 100644
index 0000000000..54d3633ef9
--- /dev/null
+++ b/services/core/models/tests/integration/parallelism/fixtures/EleutherAI/gpt-neox-20b/config.json
@@ -0,0 +1,25 @@
+{
+  "architectures": [
+    "GPTNeoXForCausalLM"
+  ],
+  "attention_probs_dropout_prob": 0,
+  "bos_token_id": 0,
+  "eos_token_id": 0,
+  "hidden_act": "gelu_fast",
+  "hidden_dropout_prob": 0,
+  "hidden_size": 6144,
+  "initializer_range": 0.02,
+  "intermediate_size": 24576,
+  "layer_norm_eps": 1e-05,
+  "max_position_embeddings": 2048,
+  "model_type": "gpt_neox",
+  "num_attention_heads": 64,
+  "num_hidden_layers": 44,
+  "rotary_emb_base": 10000,
+  "rotary_pct": 0.25,
+  "tie_word_embeddings": false,
+  "torch_dtype": "float16",
+  "transformers_version": "4.19.0.dev0",
+  "use_cache": true,
+  "vocab_size": 50432
+}
diff --git a/services/core/models/tests/integration/parallelism/fixtures/Qwen/Qwen2.5-72B-Instruct/config.json b/services/core/models/tests/integration/parallelism/fixtures/Qwen/Qwen2.5-72B-Instruct/config.json
new file mode 100644
index 0000000000..ec6ea340e5
--- /dev/null
+++ b/services/core/models/tests/integration/parallelism/fixtures/Qwen/Qwen2.5-72B-Instruct/config.json
@@ -0,0 +1,27 @@
+{
+  "architectures": [
+    "Qwen2ForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": 151643,
+  "eos_token_id": 151645,
+  "hidden_act": "silu",
+  "hidden_size": 8192,
+  "initializer_range": 0.02,
+  "intermediate_size": 29568,
+  "max_position_embeddings": 32768,
+  "max_window_layers": 70,
+  "model_type": "qwen2",
+  "num_attention_heads": 64,
+  "num_hidden_layers": 80,
+  "num_key_value_heads": 8,
+  "rms_norm_eps": 1e-06,
+  "rope_theta": 1000000.0,
+  "sliding_window": 131072,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.43.1",
+  "use_cache": true,
+  "use_sliding_window": false,
+  "vocab_size": 152064
+}
diff --git a/services/core/models/tests/integration/parallelism/fixtures/Qwen/Qwen2.5-72B/config.json b/services/core/models/tests/integration/parallelism/fixtures/Qwen/Qwen2.5-72B/config.json
new file mode 100644
index 0000000000..67663e297b
--- /dev/null
+++ b/services/core/models/tests/integration/parallelism/fixtures/Qwen/Qwen2.5-72B/config.json
@@ -0,0 +1,27 @@
+{
+  "architectures": [
+    "Qwen2ForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": 151643,
+  "eos_token_id": 151643,
+  "hidden_act": "silu",
+  "hidden_size": 8192,
+  "initializer_range": 0.02,
+  "intermediate_size": 29568,
+  "max_position_embeddings": 131072,
+  "max_window_layers": 80,
+  "model_type": "qwen2",
+  "num_attention_heads": 64,
+  "num_hidden_layers": 80,
+  "num_key_value_heads": 8,
+  "rms_norm_eps": 1e-05,
+  "rope_theta": 1000000.0,
+  "sliding_window": 131072,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.43.1",
+  "use_cache": true,
+  "use_sliding_window": false,
+  "vocab_size": 152064
+}
diff --git a/services/core/models/tests/integration/parallelism/fixtures/Qwen/Qwen2.5-7B/config.json b/services/core/models/tests/integration/parallelism/fixtures/Qwen/Qwen2.5-7B/config.json
new file mode 100644
index 0000000000..1a90713f0e
--- /dev/null
+++ b/services/core/models/tests/integration/parallelism/fixtures/Qwen/Qwen2.5-7B/config.json
@@ -0,0 +1,28 @@
+{
+  "architectures": [
+    "Qwen2ForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": 151643,
+  "eos_token_id": 151643,
+  "hidden_act": "silu",
+  "hidden_size": 3584,
+  "initializer_range": 0.02,
+  "intermediate_size": 18944,
+  "max_position_embeddings": 131072,
+  "max_window_layers": 28,
+  "model_type": "qwen2",
+  "num_attention_heads": 28,
+  "num_hidden_layers": 28,
+  "num_key_value_heads": 4,
+  "rms_norm_eps": 1e-06,
+  "rope_theta": 1000000.0,
+  "sliding_window": 131072,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.40.1",
+  "use_cache": true,
+  "use_mrope": false,
+  "use_sliding_window": false,
+  "vocab_size": 152064
+}
diff --git a/services/core/models/tests/integration/parallelism/fixtures/Qwen/Qwen3-4B-SafeRL/config.json b/services/core/models/tests/integration/parallelism/fixtures/Qwen/Qwen3-4B-SafeRL/config.json
new file mode 100644
index 0000000000..e49eccdc32
--- /dev/null
+++ b/services/core/models/tests/integration/parallelism/fixtures/Qwen/Qwen3-4B-SafeRL/config.json
@@ -0,0 +1,30 @@
+{
+  "architectures": [
+    "Qwen3ForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 151643,
+  "eos_token_id": 151645,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 2560,
+  "initializer_range": 0.02,
+  "intermediate_size": 9728,
+  "max_position_embeddings": 40960,
+  "max_window_layers": 36,
+  "model_type": "qwen3",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 36,
+  "num_key_value_heads": 8,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 1000000,
+  "sliding_window": null,
+  "tie_word_embeddings": true,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.51.0",
+  "use_cache": true,
+  "use_sliding_window": false,
+  "vocab_size": 151936
+}
\ No newline at end of file
diff --git a/services/core/models/tests/integration/parallelism/fixtures/Qwen/Qwen3-8B/config.json b/services/core/models/tests/integration/parallelism/fixtures/Qwen/Qwen3-8B/config.json
new file mode 100644
index 0000000000..d46195ac87
--- /dev/null
+++ b/services/core/models/tests/integration/parallelism/fixtures/Qwen/Qwen3-8B/config.json
@@ -0,0 +1,30 @@
+{
+  "architectures": [
+    "Qwen3ForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 151643,
+  "eos_token_id": 151645,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "initializer_range": 0.02,
+  "intermediate_size": 12288,
+  "max_position_embeddings": 40960,
+  "max_window_layers": 36,
+  "model_type": "qwen3",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 36,
+  "num_key_value_heads": 8,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 1000000,
+  "sliding_window": null,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.51.0",
+  "use_cache": true,
+  "use_sliding_window": false,
+  "vocab_size": 151936
+}
\ No newline at end of file
diff --git a/services/core/models/tests/integration/parallelism/fixtures/deepseek-ai/DeepSeek-V3-Base/config.json b/services/core/models/tests/integration/parallelism/fixtures/deepseek-ai/DeepSeek-V3-Base/config.json
new file mode 100644
index 0000000000..aec35a75d9
--- /dev/null
+++ b/services/core/models/tests/integration/parallelism/fixtures/deepseek-ai/DeepSeek-V3-Base/config.json
@@ -0,0 +1,67 @@
+{
+  "architectures": [
+    "DeepseekV3ForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_deepseek.DeepseekV3Config",
+    "AutoModel": "modeling_deepseek.DeepseekV3Model",
+    "AutoModelForCausalLM": "modeling_deepseek.DeepseekV3ForCausalLM"
+  },
+  "bos_token_id": 0,
+  "eos_token_id": 1,
+  "ep_size": 1,
+  "first_k_dense_replace": 3,
+  "hidden_act": "silu",
+  "hidden_size": 7168,
+  "initializer_range": 0.02,
+  "intermediate_size": 18432,
+  "kv_lora_rank": 512,
+  "max_position_embeddings": 163840,
+  "model_type": "deepseek_v3",
+  "moe_intermediate_size": 2048,
+  "moe_layer_freq": 1,
+  "n_group": 8,
+  "n_routed_experts": 256,
+  "n_shared_experts": 1,
+  "norm_topk_prob": true,
+  "num_attention_heads": 128,
+  "num_experts_per_tok": 8,
+  "num_hidden_layers": 61,
+  "num_key_value_heads": 128,
+  "num_nextn_predict_layers": 1,
+  "q_lora_rank": 1536,
+  "qk_nope_head_dim": 128,
+  "qk_rope_head_dim": 64,
+  "quantization_config": {
+    "activation_scheme": "dynamic",
+    "fmt": "e4m3",
+    "quant_method": "fp8",
+    "weight_block_size": [
+      128,
+      128
+    ]
+  },
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": {
+    "beta_fast": 32,
+    "beta_slow": 1,
+    "factor": 40,
+    "mscale": 1.0,
+    "mscale_all_dim": 1.0,
+    "original_max_position_embeddings": 4096,
+    "type": "yarn"
+  },
+  "rope_theta": 10000,
+  "routed_scaling_factor": 2.5,
+  "scoring_func": "sigmoid",
+  "tie_word_embeddings": false,
+  "topk_group": 4,
+  "topk_method": "noaux_tc",
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.33.1",
+  "use_cache": true,
+  "v_head_dim": 128,
+  "vocab_size": 129280
+}
diff --git a/services/core/models/tests/integration/parallelism/fixtures/deepseek-ai/DeepSeek-V3-Base/configuration_deepseek.py b/services/core/models/tests/integration/parallelism/fixtures/deepseek-ai/DeepSeek-V3-Base/configuration_deepseek.py
new file mode 100644
index 0000000000..f549f2b17d
--- /dev/null
+++ b/services/core/models/tests/integration/parallelism/fixtures/deepseek-ai/DeepSeek-V3-Base/configuration_deepseek.py
@@ -0,0 +1,199 @@
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+
+logger = logging.get_logger(__name__)
+
+DEEPSEEK_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
+class DeepseekV3Config(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`DeepseekV3Model`]. It is used to instantiate a DeepSeek
+    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+    defaults will yield a similar configuration to that of the DeepSeek-V3.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 129280):
+            Vocabulary size of the DeepSeek model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`DeepseekV3Model`]
+        hidden_size (`int`, *optional*, defaults to 7168):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 18432):
+            Dimension of the MLP representations.
+        moe_intermediate_size (`int`, *optional*, defaults to 2048):
+            Dimension of the MoE representations.
+        num_hidden_layers (`int`, *optional*, defaults to 61):
+            Number of hidden layers in the Transformer decoder.
+        num_nextn_predict_layers (`int`, *optional*, defaults to 1):
+            Number of nextn predict layers in the DeepSeekV3 Model.
+        num_attention_heads (`int`, *optional*, defaults to 128):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        n_shared_experts (`int`, *optional*, defaults to 1):
+            Number of shared experts, None means dense model.
+        n_routed_experts (`int`, *optional*, defaults to 256):
+            Number of routed experts, None means dense model.
+        routed_scaling_factor (`float`, *optional*, defaults to 2.5):
+            Scaling factor for routed experts.
+        topk_method (`str`, *optional*, defaults to `"noaux_tc"`):
+            Topk method used in routed gate.
+        n_group (`int`, *optional*, defaults to 8):
+            Number of groups for routed experts.
+        topk_group (`int`, *optional*, defaults to 4):
+            Number of selected groups for each token(for each token, ensuring the selected experts is only within `topk_group` groups).
+        num_experts_per_tok (`int`, *optional*, defaults to 8):
+            Number of selected experts, None means dense model.
+        moe_layer_freq (`int`, *optional*, defaults to 1):
+            The frequency of the MoE layer: one expert layer for every `moe_layer_freq - 1` dense layers.
+        first_k_dense_replace (`int`, *optional*, defaults to 3):
+            Number of dense layers in shallow layers(embed->dense->dense->...->dense->moe->moe...->lm_head).
+                                                            \--k dense layers--/
+        norm_topk_prob (`bool`, *optional*, defaults to True):
+            Whether to normalize the weights of the routed experts.
+        scoring_func (`str`, *optional*, defaults to 'sigmoid'):
+            Method of computing expert weights.
+        aux_loss_alpha (`float`, *optional*, defaults to 0.001):
+            Auxiliary loss weight coefficient.
+        seq_aux (`bool`, *optional*, defaults to True):
+            Whether to compute the auxiliary loss for each individual sample.
+        num_key_value_heads (`int`, *optional*):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details checkout [this
+            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
+            `num_attention_heads`.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the decoder.
+        max_position_embeddings (`int`, *optional*, defaults to 4096):
+            The maximum sequence length that this model might ever be used with.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        pad_token_id (`int`, *optional*):
+            Padding token id.
+        bos_token_id (`int`, *optional*, defaults to 0):
+            Beginning of stream token id.
+        eos_token_id (`int`, *optional*, defaults to 1):
+            End of stream token id.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether to tie weight embeddings
+        rope_theta (`float`, *optional*, defaults to 10000.0):
+            The base period of the RoPE embeddings.
+        rope_scaling (`Dict`, *optional*):
+            Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
+            strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is
+            `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
+            `max_position_embeddings` to the expected new maximum.
+        attention_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use a bias in the query, key, value and output projection layers during self-attention.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+
+    ```python
+    >>> from transformers import DeepseekV3Model, DeepseekV3Config
+
+    >>> # Initializing a Deepseek-V3 style configuration
+    >>> configuration = DeepseekV3Config()
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "deepseek_v3"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        vocab_size=129280,
+        hidden_size=7168,
+        intermediate_size=18432,
+        moe_intermediate_size = 2048,
+        num_hidden_layers=61,
+        num_nextn_predict_layers=1,
+        num_attention_heads=128,
+        num_key_value_heads=128,
+        n_shared_experts = 1,
+        n_routed_experts = 256,
+        ep_size = 1,
+        routed_scaling_factor = 2.5,
+        kv_lora_rank = 512,
+        q_lora_rank = 1536,
+        qk_rope_head_dim = 64,
+        v_head_dim = 128,
+        qk_nope_head_dim = 128,
+        topk_method = 'noaux_tc',
+        n_group = 8,
+        topk_group = 4,
+        num_experts_per_tok = 8,
+        moe_layer_freq = 1,
+        first_k_dense_replace = 3,
+        norm_topk_prob = True,
+        scoring_func = 'sigmoid',
+        hidden_act="silu",
+        max_position_embeddings=4096,
+        initializer_range=0.02,
+        rms_norm_eps=1e-6,
+        use_cache=True,
+        pad_token_id=None,
+        bos_token_id=0,
+        eos_token_id=1,
+        tie_word_embeddings=False,
+        rope_theta=10000.0,
+        rope_scaling=None,
+        attention_bias=False,
+        attention_dropout=0.0,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.moe_intermediate_size = moe_intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_nextn_predict_layers = num_nextn_predict_layers
+        self.num_attention_heads = num_attention_heads
+        self.n_shared_experts = n_shared_experts
+        self.n_routed_experts = n_routed_experts
+        self.ep_size = ep_size
+        self.routed_scaling_factor = routed_scaling_factor
+        self.kv_lora_rank = kv_lora_rank
+        self.q_lora_rank = q_lora_rank
+        self.qk_rope_head_dim = qk_rope_head_dim
+        self.v_head_dim = v_head_dim
+        self.qk_nope_head_dim = qk_nope_head_dim
+        self.topk_method = topk_method
+        self.n_group = n_group
+        self.topk_group = topk_group
+        self.num_experts_per_tok = num_experts_per_tok
+        self.moe_layer_freq = moe_layer_freq
+        self.first_k_dense_replace = first_k_dense_replace
+        self.norm_topk_prob = norm_topk_prob
+        self.scoring_func = scoring_func
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
\ No newline at end of file
diff --git a/services/core/models/tests/integration/parallelism/fixtures/deepseek-ai/DeepSeek-V3-Base/modeling_deepseek.py b/services/core/models/tests/integration/parallelism/fixtures/deepseek-ai/DeepSeek-V3-Base/modeling_deepseek.py
new file mode 100644
index 0000000000..28d9ea27aa
--- /dev/null
+++ b/services/core/models/tests/integration/parallelism/fixtures/deepseek-ai/DeepSeek-V3-Base/modeling_deepseek.py
@@ -0,0 +1,1848 @@
+# coding=utf-8
+# Copyright 2023 DeepSeek-AI and The HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" PyTorch DeepSeek model."""
+import math
+import warnings
+from typing import List, Optional, Tuple, Union
+
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+
+from transformers.activations import ACT2FN
+from transformers.cache_utils import Cache, DynamicCache
+from transformers.modeling_attn_mask_utils import (
+    AttentionMaskConverter,
+    _prepare_4d_attention_mask,
+    _prepare_4d_causal_attention_mask,
+)
+from transformers.modeling_outputs import (
+    BaseModelOutputWithPast,
+    CausalLMOutputWithPast,
+    SequenceClassifierOutputWithPast,
+)
+from transformers.modeling_utils import PreTrainedModel
+from transformers.pytorch_utils import (
+    ALL_LAYERNORM_LAYERS,
+    is_torch_greater_or_equal_than_1_13,
+)
+from transformers.utils import (
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    is_flash_attn_2_available,
+    is_flash_attn_greater_or_equal_2_10,
+    logging,
+    replace_return_docstrings,
+)
+from transformers.utils.import_utils import is_torch_fx_available
+from .configuration_deepseek import DeepseekV3Config
+import torch.distributed as dist
+import numpy as np
+
+if is_flash_attn_2_available():
+    from flash_attn import flash_attn_func, flash_attn_varlen_func
+    from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input  # noqa
+
+
+# Register `_prepare_4d_causal_attention_mask` as a leaf function for torch.fx tracing.
+# The tracer will not trace through it; the call simply appears as one node in the graph.
+if is_torch_fx_available():
+    if not is_torch_greater_or_equal_than_1_13:
+        import torch.fx  # explicit submodule import is only performed for torch < 1.13
+
+    _prepare_4d_causal_attention_mask = torch.fx.wrap(_prepare_4d_causal_attention_mask)
+
+
+logger = logging.get_logger(__name__)
+
+_CONFIG_FOR_DOC = "DeepseekV3Config"
+
+
+def _get_unpad_data(attention_mask):
+    """Return the flat indices, cumulative lengths and max length of the unmasked tokens.
+
+    `attention_mask` is summed over its last dim to get per-row token counts, so it is
+    presumably a (batch, seq_len) 0/1 mask -- TODO confirm against the caller.
+    Returns `(indices, cu_seqlens, max_seqlen_in_batch)`; `cu_seqlens` starts with a 0.
+    """
+    seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
+    indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
+    max_seqlen_in_batch = seqlens_in_batch.max().item()
+    cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
+    return indices, cu_seqlens, max_seqlen_in_batch
+
+
+class DeepseekV3RMSNorm(nn.Module):
+    """Root-mean-square norm with a learned per-feature scale (equivalent to T5LayerNorm)."""
+
+    def __init__(self, hidden_size, eps=1e-6):
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps  # added to the mean square before the rsqrt
+
+    def forward(self, hidden_states):
+        """Normalize over the last dim in float32, cast back, then multiply by `weight`."""
+        orig_dtype = hidden_states.dtype
+        upcast = hidden_states.to(torch.float32)
+        mean_square = upcast.pow(2).mean(-1, keepdim=True)
+        normed = upcast * torch.rsqrt(mean_square + self.variance_epsilon)
+        return self.weight * normed.to(orig_dtype)
+
+
+ALL_LAYERNORM_LAYERS.append(DeepseekV3RMSNorm)  # add the norm class to transformers' list of layer-norm types
+
+
+class DeepseekV3RotaryEmbedding(nn.Module):  # rotary position embedding: caches cos/sin tables indexed by position
+    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
+        super().__init__()
+
+        self.dim = dim
+        self.max_position_embeddings = max_position_embeddings
+        self.base = base
+        inv_freq = 1.0 / (  # one inverse frequency per pair of dims: base ** (-2i / dim)
+            self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)
+        )
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+
+        # Build the cos/sin buffers eagerly so that `torch.jit.trace` works.
+        self._set_cos_sin_cache(
+            seq_len=max_position_embeddings,
+            device=self.inv_freq.device,
+            dtype=torch.get_default_dtype(),
+        )
+        self.max_seq_len_cached = None  # marks the cache stale: the first forward() rebuilds it with x's device/dtype
+
+    def _set_cos_sin_cache(self, seq_len, device, dtype):  # (re)build cos_cached/sin_cached for positions 0..seq_len-1
+        self.max_seq_len_cached = seq_len
+        t = torch.arange(
+            self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype
+        )
+
+        freqs = torch.outer(t, self.inv_freq.to(t.device))  # [seq_len, dim // 2] rotation angles
+        # Different from the paper: the halves are concatenated, a permutation that yields the same calculation
+        emb = torch.cat((freqs, freqs), dim=-1)
+        self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
+        self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
+
+    def forward(self, x, seq_len=None):  # NOTE(review): the comparison below needs an int; the None default is not handled
+        # x: [bs, num_attention_heads, seq_len, head_size]; only its device and dtype are read here
+        if self.max_seq_len_cached is None or seq_len > self.max_seq_len_cached:
+            self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
+
+        return (  # (cos, sin) tables truncated to seq_len rows and cast to x.dtype
+            self.cos_cached[:seq_len].to(dtype=x.dtype),
+            self.sin_cached[:seq_len].to(dtype=x.dtype),
+        )
+
+
+# Copied from transformers.models.llama.modeling_llama.LlamaLinearScalingRotaryEmbedding with Llama->DeepseekV3
+class DeepseekV3LinearScalingRotaryEmbedding(DeepseekV3RotaryEmbedding):
+    """Rotary embedding whose position indices are divided by `scaling_factor`.
+
+    Linear position scaling; credits to the Reddit user /u/kaiokendev.
+    """
+
+    def __init__(
+        self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0
+    ):
+        # Must be set before the parent constructor runs: it calls `_set_cos_sin_cache`.
+        self.scaling_factor = scaling_factor
+        super().__init__(dim, max_position_embeddings, base, device)
+
+    def _set_cos_sin_cache(self, seq_len, device, dtype):
+        """Rebuild the cached cos/sin tables for `seq_len` scaled positions."""
+        self.max_seq_len_cached = seq_len
+        positions = torch.arange(seq_len, device=device, dtype=self.inv_freq.dtype)
+        positions = positions / self.scaling_factor
+
+        # The outer product of positions and inverse frequencies gives the rotation angles.
+        angles = torch.outer(positions, self.inv_freq)
+        # The two halves are concatenated (not interleaved): a different permutation
+        # from the paper that yields the same calculation.
+        table = torch.cat((angles, angles), dim=-1)
+        self.register_buffer("cos_cached", table.cos().to(dtype), persistent=False)
+        self.register_buffer("sin_cached", table.sin().to(dtype), persistent=False)
+
+
+# Copied from transformers.models.llama.modeling_llama.LlamaDynamicNTKScalingRotaryEmbedding with Llama->DeepseekV3
+class DeepseekV3DynamicNTKScalingRotaryEmbedding(DeepseekV3RotaryEmbedding):
+    """Rotary embedding that enlarges the frequency base once `seq_len` exceeds `max_position_embeddings`.
+
+    Dynamic NTK scaling; credits to the Reddit users /u/bloc97 and /u/emozilla.
+    """
+
+    def __init__(
+        self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0
+    ):
+        # Must be set before the parent constructor runs: it calls `_set_cos_sin_cache`.
+        self.scaling_factor = scaling_factor
+        super().__init__(dim, max_position_embeddings, base, device)
+
+    def _set_cos_sin_cache(self, seq_len, device, dtype):
+        """Rebuild the cos/sin tables, recomputing `inv_freq` first for over-long sequences."""
+        self.max_seq_len_cached = seq_len
+
+        if seq_len > self.max_position_embeddings:
+            # Grow the base with the ratio of the requested to the configured length.
+            growth = (
+                self.scaling_factor * seq_len / self.max_position_embeddings
+            ) - (self.scaling_factor - 1)
+            scaled_base = self.base * growth ** (self.dim / (self.dim - 2))
+            exponents = torch.arange(0, self.dim, 2).float().to(device) / self.dim
+            inv_freq = 1.0 / (scaled_base**exponents)
+            self.register_buffer("inv_freq", inv_freq, persistent=False)
+
+        positions = torch.arange(seq_len, device=device, dtype=self.inv_freq.dtype)
+
+        # The outer product of positions and inverse frequencies gives the rotation angles.
+        angles = torch.outer(positions, self.inv_freq)
+        # The two halves are concatenated (not interleaved): a different permutation
+        # from the paper that yields the same calculation.
+        table = torch.cat((angles, angles), dim=-1)
+        self.register_buffer("cos_cached", table.cos().to(dtype), persistent=False)
+        self.register_buffer("sin_cached", table.sin().to(dtype), persistent=False)
+
+
+def yarn_find_correction_dim(
+    num_rotations, dim, base=10000, max_position_embeddings=2048
+):
+    """Inverse dim formula: return the (possibly fractional) dim for `num_rotations` rotations."""
+    length_ratio = max_position_embeddings / (num_rotations * 2 * math.pi)
+    numerator = dim * math.log(length_ratio)
+    return numerator / (2 * math.log(base))
+
+
+def yarn_find_correction_range(
+    low_rot, high_rot, dim, base=10000, max_position_embeddings=2048
+):
+    """Return the `(low, high)` dim bounds that correspond to the two rotation counts.
+
+    The bound for `low_rot` is floored and kept at 0 or above; the bound for
+    `high_rot` is ceiled and kept at `dim - 1` or below (clamped just in case).
+    """
+    lower = math.floor(yarn_find_correction_dim(low_rot, dim, base, max_position_embeddings))
+    upper = math.ceil(yarn_find_correction_dim(high_rot, dim, base, max_position_embeddings))
+    return max(lower, 0), min(upper, dim - 1)
+
+
+def yarn_get_mscale(scale=1, mscale=1):
+    """Return 1.0 when `scale <= 1`, otherwise `0.1 * mscale * log(scale) + 1.0`."""
+    no_scaling = scale <= 1
+    return 1.0 if no_scaling else 0.1 * mscale * math.log(scale) + 1.0
+
+
+def yarn_linear_ramp_mask(min, max, dim):
+    """Return a length-`dim` float32 ramp: 0 up to index `min`, rising linearly to 1 at index `max`."""
+    if min == max:
+        max += 0.001  # Prevent singularity
+    positions = torch.arange(dim, dtype=torch.float32)
+    ramp = (positions - min) / (max - min)
+    return torch.clamp(ramp, 0, 1)
+
+
+class DeepseekV3YarnRotaryEmbedding(DeepseekV3RotaryEmbedding):
+    """Rotary embedding that blends unscaled and scaled inverse frequencies and rescales cos/sin by `_mscale`."""
+    def __init__(
+        self,
+        dim,
+        max_position_embeddings=2048,
+        base=10000,
+        device=None,
+        scaling_factor=1.0,
+        original_max_position_embeddings=4096,
+        beta_fast=32,
+        beta_slow=1,
+        mscale=1,
+        mscale_all_dim=0,
+    ):
+        self.scaling_factor = scaling_factor  # attributes are set first: the parent constructor builds the cache
+        self.original_max_position_embeddings = original_max_position_embeddings
+        self.beta_fast = beta_fast
+        self.beta_slow = beta_slow
+        self.mscale = mscale
+        self.mscale_all_dim = mscale_all_dim
+        super().__init__(dim, max_position_embeddings, base, device)
+
+    def _set_cos_sin_cache(self, seq_len, device, dtype):  # rebuild inv_freq, cos_cached and sin_cached for seq_len positions
+        self.max_seq_len_cached = seq_len
+        dim = self.dim
+
+        freq_extra = 1.0 / (  # inverse frequencies without any scaling
+            self.base
+            ** (torch.arange(0, dim, 2, dtype=torch.float32, device=device) / dim)
+        )
+        freq_inter = 1.0 / (  # the same frequencies divided by `scaling_factor`
+            self.scaling_factor
+            * self.base
+            ** (torch.arange(0, dim, 2, dtype=torch.float32, device=device) / dim)
+        )
+
+        low, high = yarn_find_correction_range(  # dim-index range over which the two sets are blended
+            self.beta_fast,
+            self.beta_slow,
+            dim,
+            self.base,
+            self.original_max_position_embeddings,
+        )
+        inv_freq_mask = 1.0 - yarn_linear_ramp_mask(low, high, dim // 2).to(  # 1 up to `low`, falling to 0 at `high`
+            device=device, dtype=torch.float32
+        )
+        inv_freq = freq_inter * (1 - inv_freq_mask) + freq_extra * inv_freq_mask  # mask 1 -> unscaled, mask 0 -> scaled
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+
+        t = torch.arange(seq_len, device=device, dtype=torch.float32)
+
+        freqs = torch.outer(t, inv_freq)  # [seq_len, dim // 2] rotation angles
+
+        _mscale = float(  # ratio of the two mscale terms; multiplies both tables below
+            yarn_get_mscale(self.scaling_factor, self.mscale)
+            / yarn_get_mscale(self.scaling_factor, self.mscale_all_dim)
+        )
+
+        emb = torch.cat((freqs, freqs), dim=-1)  # angles repeated for both halves of the rotary dims
+        self.register_buffer(
+            "cos_cached", (emb.cos() * _mscale).to(dtype), persistent=False
+        )
+        self.register_buffer(
+            "sin_cached", (emb.sin() * _mscale).to(dtype), persistent=False
+        )
+
+
+# Copied from transformers.models.llama.modeling_llama.rotate_half
+def rotate_half(x):
+    """Swap the two halves of the last dim, negating the half that moves to the front."""
+    half = x.shape[-1] // 2
+    front, back = x[..., :half], x[..., half:]
+    return torch.cat((-back, front), dim=-1)
+
+
+# Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
+    """Apply rotary position embedding to the query and key tensors.
+
+    Before rotating, the last dim of `q` and `k` is regrouped from interleaved pairs
+    (x0, y0, x1, y1, ...) into two halves (x0, x1, ..., y0, y1, ...), the layout `rotate_half` works on.
+
+    Args:
+        q (`torch.Tensor`): The query tensor; it is unpacked as a 4-dimensional shape.
+        k (`torch.Tensor`): The key tensor; it is unpacked as a 4-dimensional shape.
+        cos (`torch.Tensor`): The cosine table of the rotary embedding, indexed by position.
+        sin (`torch.Tensor`): The sine table of the rotary embedding, indexed by position.
+        position_ids (`torch.Tensor`):
+            Positions of the tokens in `q` and `k`, used to index `cos` and `sin` (for example
+            offset positions when decoding with a KV cache).
+        unsqueeze_dim (`int`, *optional*, defaults to 1):
+            Axis inserted into `cos[position_ids]` and `sin[position_ids]` so that they broadcast
+            against `q` and `k`.
+
+    Returns:
+        `tuple(torch.Tensor)`: the rotated query and key tensors.
+    """
+    cos_sel = cos[position_ids].unsqueeze(unsqueeze_dim)
+    sin_sel = sin[position_ids].unsqueeze(unsqueeze_dim)
+
+    # Regroup the interleaved pairs of the last dim into two contiguous halves.
+    q_b, q_h, q_s, q_d = q.shape
+    q_halves = q.view(q_b, q_h, q_s, q_d // 2, 2).transpose(4, 3).reshape(q_b, q_h, q_s, q_d)
+    k_b, k_h, k_s, k_d = k.shape
+    k_halves = k.view(k_b, k_h, k_s, k_d // 2, 2).transpose(4, 3).reshape(k_b, k_h, k_s, k_d)
+
+    q_embed = (q_halves * cos_sel) + (rotate_half(q_halves) * sin_sel)
+    k_embed = (k_halves * cos_sel) + (rotate_half(k_halves) * sin_sel)
+    return q_embed, k_embed
+
+
+class DeepseekV3MLP(nn.Module):
+    """Gated feed-forward block: `down_proj(act_fn(gate_proj(x)) * up_proj(x))`, all projections bias-free."""
+    def __init__(self, config, hidden_size=None, intermediate_size=None):
+        super().__init__()
+        self.config = config
+        self.hidden_size = hidden_size if hidden_size is not None else config.hidden_size
+        self.intermediate_size = (
+            intermediate_size if intermediate_size is not None else config.intermediate_size
+        )
+        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
+        self.act_fn = ACT2FN[config.hidden_act]
+
+    def forward(self, x):
+        gated = self.act_fn(self.gate_proj(x)) * self.up_proj(x)
+        return self.down_proj(gated)
+
+
+class MoEGate(nn.Module):  # router: picks `top_k` routed experts per token and the weight given to each
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.top_k = config.num_experts_per_tok
+        self.n_routed_experts = config.n_routed_experts
+        self.routed_scaling_factor = config.routed_scaling_factor
+        self.scoring_func = config.scoring_func
+        self.topk_method = config.topk_method
+        self.n_group = config.n_group
+        self.topk_group = config.topk_group
+
+        # top-k selection settings and the gating projection
+        self.norm_topk_prob = config.norm_topk_prob
+        self.gating_dim = config.hidden_size
+        self.weight = nn.Parameter(  # [n_routed_experts, hidden_size]; filled in by reset_parameters()
+            torch.empty((self.n_routed_experts, self.gating_dim))
+        )
+        if self.topk_method == "noaux_tc":
+            self.e_score_correction_bias = nn.Parameter(  # NOTE(review): never initialized here -- presumably loaded from a checkpoint
+                torch.empty((self.n_routed_experts))
+            )
+        self.reset_parameters()
+
+    def reset_parameters(self) -> None:  # initializes `weight` only
+        import torch.nn.init as init
+
+        init.kaiming_uniform_(self.weight, a=math.sqrt(5))
+
+    def forward(self, hidden_states):  # returns (topk_idx, topk_weight), one row per flattened token
+        bsz, seq_len, h = hidden_states.shape
+        ### compute the gating score of every token for every routed expert, in float32
+        hidden_states = hidden_states.view(-1, h)
+        logits = F.linear(
+            hidden_states.type(torch.float32), self.weight.type(torch.float32), None
+        )
+        if self.scoring_func == "sigmoid":
+            scores = logits.sigmoid()
+        else:
+            raise NotImplementedError(
+                f"insupportable scoring function for MoE gating: {self.scoring_func}"
+            )
+
+        ### select top-k experts, restricted to the `topk_group` best-scoring expert groups
+        if self.topk_method == "noaux_tc":
+            assert not self.training  # this selection path is only allowed in eval mode
+            scores_for_choice = scores.view(bsz * seq_len, -1) + self.e_score_correction_bias.unsqueeze(0)
+            group_scores = (  # score of a group = sum of its two highest biased expert scores
+                scores_for_choice.view(bsz * seq_len, self.n_group, -1).topk(2, dim=-1)[0].sum(dim = -1)
+            )  # [n, n_group]
+            group_idx = torch.topk(
+                group_scores, k=self.topk_group, dim=-1, sorted=False
+            )[
+                1
+            ]  # [n, top_k_group]
+            group_mask = torch.zeros_like(group_scores)  # [n, n_group]
+            group_mask.scatter_(1, group_idx, 1)  # [n, n_group]
+            score_mask = (  # broadcast each group's 0/1 flag to all experts of that group
+                group_mask.unsqueeze(-1)
+                .expand(
+                    bsz * seq_len, self.n_group, self.n_routed_experts // self.n_group
+                )
+                .reshape(bsz * seq_len, -1)
+            )  # [n, e]
+            tmp_scores = scores_for_choice.masked_fill(~score_mask.bool(), float("-inf"))  # [n, e]
+            _, topk_idx = torch.topk(
+                tmp_scores, k=self.top_k, dim=-1, sorted=False
+            )
+            topk_weight = scores.gather(1, topk_idx)  # weights use the unbiased scores; the bias only affects selection
+        else:
+            raise NotImplementedError(
+                f"insupportable TopK function for MoE gating: {self.topk_method}"
+            )
+
+        ### normalize the selected weights so that they sum to 1 per token
+        if self.top_k > 1 and self.norm_topk_prob:
+            denominator = topk_weight.sum(dim=-1, keepdim=True) + 1e-20
+            topk_weight = topk_weight / denominator
+        topk_weight = topk_weight * self.routed_scaling_factor # must multiply the scaling factor
+
+        return topk_idx, topk_weight
+
+class DeepseekV3MoE(nn.Module):
+    """
+    Mixture-of-experts block: gate-routed experts plus optional always-on shared experts (inference only).
+    """
+
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.num_experts_per_tok = config.num_experts_per_tok
+
+        if hasattr(config, "ep_size") and config.ep_size > 1:
+            assert config.ep_size == dist.get_world_size()  # expert parallelism spans the whole process group
+            self.ep_size = config.ep_size
+            self.experts_per_rank = config.n_routed_experts // config.ep_size
+            self.ep_rank = dist.get_rank()
+            self.experts = nn.ModuleList(  # only this rank's slice is instantiated; other slots hold None
+                [
+                    (
+                        DeepseekV3MLP(
+                            config, intermediate_size=config.moe_intermediate_size
+                        )
+                        if i >= self.ep_rank * self.experts_per_rank
+                        and i < (self.ep_rank + 1) * self.experts_per_rank
+                        else None
+                    )
+                    for i in range(config.n_routed_experts)
+                ]
+            )
+        else:
+            self.ep_size = 1
+            self.experts_per_rank = config.n_routed_experts
+            self.ep_rank = 0
+            self.experts = nn.ModuleList(  # without expert parallelism every expert lives on this rank
+                [
+                    DeepseekV3MLP(
+                        config, intermediate_size=config.moe_intermediate_size
+                    )
+                    for i in range(config.n_routed_experts)
+                ]
+            )
+        self.gate = MoEGate(config)
+        if config.n_shared_experts is not None:
+            intermediate_size = config.moe_intermediate_size * config.n_shared_experts
+            self.shared_experts = DeepseekV3MLP(  # one MLP as wide as all shared experts together
+                config=config, intermediate_size=intermediate_size
+            )
+
+    def forward(self, hidden_states):
+        """Route tokens through their selected experts, then add the shared experts' output if configured."""
+        orig_shape = hidden_states.shape
+        topk_idx, topk_weight = self.gate(hidden_states)
+        if self.training:  # no training path exists; this used to end in an unbound `y`
+            raise NotImplementedError("DeepseekV3MoE only implements the inference path; call .eval() first")
+        flat_states = hidden_states.view(-1, orig_shape[-1])
+        y = self.moe_infer(flat_states, topk_idx, topk_weight).view(*orig_shape)
+        if self.config.n_shared_experts is not None:
+            y = y + self.shared_experts(hidden_states)
+        return y
+
+    @torch.no_grad()
+    def moe_infer(self, x, topk_ids, topk_weight):  # x: flattened tokens; returns the weighted sum of expert outputs per token
+        cnts = topk_ids.new_zeros((topk_ids.shape[0], len(self.experts)))
+        cnts.scatter_(1, topk_ids, 1)  # mark the experts selected by each token
+        tokens_per_expert = cnts.sum(dim=0)
+        idxs = topk_ids.view(-1).argsort()  # flat (token, slot) entries ordered by expert id
+        sorted_tokens = x[idxs // topk_ids.shape[1]]  # flat index // top_k recovers the token row
+        sorted_tokens_shape = sorted_tokens.shape
+        if self.ep_size > 1:  # expert parallelism: ship tokens to the ranks that own their experts
+            tokens_per_ep_rank = tokens_per_expert.view(self.ep_size, -1).sum(dim=1)
+            tokens_per_expert_group = tokens_per_expert.new_empty(
+                tokens_per_expert.shape[0]
+            )
+            dist.all_to_all_single(tokens_per_expert_group, tokens_per_expert)  # exchange per-expert counts between ranks
+            output_splits = (
+                tokens_per_expert_group.view(self.ep_size, -1)
+                .sum(1)
+                .cpu()
+                .numpy()
+                .tolist()
+            )
+            gathered_tokens = sorted_tokens.new_empty(
+                tokens_per_expert_group.sum(dim=0).cpu().item(), sorted_tokens.shape[1]
+            )
+            input_split_sizes = tokens_per_ep_rank.cpu().numpy().tolist()
+            dist.all_to_all(  # exchange the token rows themselves
+                list(gathered_tokens.split(output_splits)),
+                list(sorted_tokens.split(input_split_sizes)),
+            )
+            tokens_per_expert_post_gather = tokens_per_expert_group.view(
+                self.ep_size, self.experts_per_rank
+            ).sum(dim=0)
+            gatherd_idxs = np.zeros(shape=(gathered_tokens.shape[0],), dtype=np.int32)
+            s = 0
+            for i, k in enumerate(tokens_per_expert_group.cpu().numpy()):
+                gatherd_idxs[s : s + k] = i % self.experts_per_rank  # local expert id of each received row
+                s += k
+            gatherd_idxs = gatherd_idxs.argsort()  # order received rows by local expert id
+            sorted_tokens = gathered_tokens[gatherd_idxs]
+            tokens_per_expert = tokens_per_expert_post_gather
+        tokens_per_expert = tokens_per_expert.cpu().numpy()
+
+        outputs = []
+        start_idx = 0
+        for i, num_tokens in enumerate(tokens_per_expert):  # each expert handles one contiguous slice of sorted_tokens
+            end_idx = start_idx + num_tokens
+            if num_tokens == 0:
+                continue
+            expert = self.experts[i + self.ep_rank * self.experts_per_rank]
+            tokens_for_this_expert = sorted_tokens[start_idx:end_idx]
+            expert_out = expert(tokens_for_this_expert)
+            outputs.append(expert_out)
+            start_idx = end_idx
+
+        outs = torch.cat(outputs, dim=0) if len(outputs) else sorted_tokens.new_empty(0)
+        if self.ep_size > 1:  # send the expert outputs back with the mirrored split sizes
+            new_x = torch.empty_like(outs)
+            new_x[gatherd_idxs] = outs
+            gathered_tokens = new_x.new_empty(*sorted_tokens_shape)
+            dist.all_to_all(
+                list(gathered_tokens.split(input_split_sizes)),
+                list(new_x.split(output_splits)),
+            )
+            outs = gathered_tokens
+
+        new_x = torch.empty_like(outs)
+        new_x[idxs] = outs  # undo the argsort so the rows line up with topk_ids again
+        final_out = (  # weight each selected expert's output and sum over the top-k slots
+            new_x.view(*topk_ids.shape, -1)
+            .type(topk_weight.dtype)
+            .mul_(topk_weight.unsqueeze(dim=-1))
+            .sum(dim=1)
+            .type(new_x.dtype)
+        )
+        return final_out
+
+
+# Copied from transformers.models.llama.modeling_llama.repeat_kv
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    Repeat every key/value head `n_rep` times: (batch, num_key_value_heads, seqlen, head_dim) becomes
+    (batch, num_key_value_heads * n_rep, seqlen, head_dim), like torch.repeat_interleave(x, dim=1, repeats=n_rep).
+    """
+    bsz, kv_heads, seq_len, head_dim = hidden_states.shape
+    if n_rep == 1:
+        return hidden_states
+    # Insert a singleton axis after the head axis and broadcast it to n_rep.
+    expanded = hidden_states.unsqueeze(2).expand(bsz, kv_heads, n_rep, seq_len, head_dim)
+    # Folding the new axis into the head axis keeps the copies of a head adjacent.
+    return expanded.reshape(bsz, kv_heads * n_rep, seq_len, head_dim)
+
+
+# Copied from transformers.models.llama.modeling_llama.LlamaAttention with Llama->DeepseekV3
+class DeepseekV3Attention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+    def __init__(self, config: DeepseekV3Config, layer_idx: Optional[int] = None):
+        """Build the q/kv projections, the output projection, the rotary embedding and the softmax scale."""
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        if layer_idx is None:
+            logger.warning_once(
+                f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will "
+                "lead to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` "
+                "when creating this class."
+            )
+
+        self.attention_dropout = config.attention_dropout
+        self.hidden_size = config.hidden_size
+        self.num_heads = config.num_attention_heads
+
+        self.max_position_embeddings = config.max_position_embeddings
+        self.rope_theta = config.rope_theta
+        self.q_lora_rank = config.q_lora_rank
+        self.qk_rope_head_dim = config.qk_rope_head_dim
+        self.kv_lora_rank = config.kv_lora_rank
+        self.v_head_dim = config.v_head_dim
+        self.qk_nope_head_dim = config.qk_nope_head_dim
+        self.q_head_dim = config.qk_nope_head_dim + config.qk_rope_head_dim  # non-rotary part + rotary part
+
+        self.is_causal = True
+
+        if self.q_lora_rank is None:
+            # No low-rank bottleneck configured: a single full query projection.
+            self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.q_head_dim, bias=False)
+        else:
+            self.q_a_proj = nn.Linear(  # down-projection to q_lora_rank ...
+                self.hidden_size, config.q_lora_rank, bias=config.attention_bias
+            )
+            self.q_a_layernorm = DeepseekV3RMSNorm(config.q_lora_rank)
+            self.q_b_proj = nn.Linear(  # ... and up-projection to all query heads
+                config.q_lora_rank, self.num_heads * self.q_head_dim, bias=False
+            )
+
+        self.kv_a_proj_with_mqa = nn.Linear(  # compressed kv (kv_lora_rank) plus one rotary key of qk_rope_head_dim
+            self.hidden_size,
+            config.kv_lora_rank + config.qk_rope_head_dim,
+            bias=config.attention_bias,
+        )
+        self.kv_a_layernorm = DeepseekV3RMSNorm(config.kv_lora_rank)
+        self.kv_b_proj = nn.Linear(  # expands the compressed kv to per-head non-rotary keys and values
+            config.kv_lora_rank,
+            self.num_heads
+            * (self.q_head_dim - self.qk_rope_head_dim + self.v_head_dim),
+            bias=False,
+        )
+
+        self.o_proj = nn.Linear(
+            self.num_heads * self.v_head_dim,
+            self.hidden_size,
+            bias=config.attention_bias,
+        )
+        self._init_rope()
+
+        self.softmax_scale = self.q_head_dim ** (-0.5)
+        if self.config.rope_scaling is not None:
+            mscale_all_dim = self.config.rope_scaling.get("mscale_all_dim", 0)
+            scaling_factor = self.config.rope_scaling["factor"]
+            if mscale_all_dim:  # a missing or zero value leaves the scale untouched
+                mscale = yarn_get_mscale(scaling_factor, mscale_all_dim)
+                self.softmax_scale = self.softmax_scale * mscale * mscale
+
+    def _init_rope(self):
+        """Instantiate `self.rotary_emb` for the configured RoPE scaling.
+
+        With `config.rope_scaling` unset the plain rotary embedding is used; otherwise its
+        `"type"` entry selects the linear, dynamic or yarn variant and `"factor"` is passed
+        on as `scaling_factor`.
+
+        Raises:
+            ValueError: if the scaling type is not one of the three known names.
+        """
+        rope_cfg = self.config.rope_scaling
+        if rope_cfg is None:
+            self.rotary_emb = DeepseekV3RotaryEmbedding(
+                self.qk_rope_head_dim,
+                max_position_embeddings=self.max_position_embeddings,
+                base=self.rope_theta,
+            )
+            return
+        scaling_type = rope_cfg["type"]
+        scaling_factor = rope_cfg["factor"]
+        extra_kwargs = {}
+        if scaling_type == "linear":
+            rope_cls = DeepseekV3LinearScalingRotaryEmbedding
+        elif scaling_type == "dynamic":
+            rope_cls = DeepseekV3DynamicNTKScalingRotaryEmbedding
+        elif scaling_type == "yarn":
+            rope_cls = DeepseekV3YarnRotaryEmbedding
+            # Forward only the yarn options that the config actually provides.
+            yarn_options = (
+                "original_max_position_embeddings",
+                "beta_fast",
+                "beta_slow",
+                "mscale",
+                "mscale_all_dim",
+            )
+            extra_kwargs = {name: rope_cfg[name] for name in yarn_options if name in rope_cfg}
+        else:
+            raise ValueError(f"Unknown RoPE scaling type {scaling_type}")
+        self.rotary_emb = rope_cls(
+            self.qk_rope_head_dim,
+            max_position_embeddings=self.max_position_embeddings,
+            scaling_factor=scaling_factor,
+            base=self.rope_theta,
+            **extra_kwargs,
+        )
+
+    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
+        """View `tensor` as (bsz, seq_len, num_heads, v_head_dim) and return it heads-first and contiguous."""
+        per_head = tensor.view(bsz, seq_len, self.num_heads, self.v_head_dim)
+        heads_first = per_head.transpose(1, 2)
+        # transpose() only returns a view, so materialize the new layout.
+        return heads_first.contiguous()
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Cache] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+        **kwargs,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        """Run eager (matmul + softmax) attention over `hidden_states`.
+
+        Queries are projected directly, or through the low-rank `q_a_proj`/`q_b_proj` pair when
+        `q_lora_rank` is set. Keys and values are expanded from the compressed projection
+        `kv_a_proj_with_mqa`. Rotary embeddings are applied only to the `qk_rope_head_dim` slice
+        of queries and keys; the remaining `qk_nope_head_dim` slice is used unrotated.
+
+        Args:
+            hidden_states: 3-D input; its first two sizes are read as `(bsz, q_len)`.
+            attention_mask: Additive mask of size `(bsz, 1, q_len, kv_seq_len)`. The parameter
+                defaults to `None` for signature compatibility, but a mask is mandatory here.
+            position_ids: Forwarded to `apply_rotary_pos_emb`.
+            past_key_value: Optional cache. When given, the new key/value states are merged into
+                it through `update` and the merged states are used for attention.
+            output_attentions: When false, the returned attention weights are `None`.
+            use_cache: Accepted for interface compatibility; not read by this method.
+            **kwargs: Only inspected for the deprecated `padding_mask` key.
+
+        Returns:
+            `(attn_output, attn_weights, past_key_value)`, where `attn_output` is the output of
+            `o_proj` and `attn_weights` is `None` unless `output_attentions` is true.
+
+        Raises:
+            ValueError: If a cache is passed but `layer_idx` is `None`, if `attention_mask` is
+                missing, or if the attention weights, mask or output have an unexpected size.
+        """
+        if "padding_mask" in kwargs:
+            warnings.warn(
+                "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
+            )
+        bsz, q_len, _ = hidden_states.size()
+
+        # Query projection: full-rank, or low-rank with an intermediate layer norm.
+        if self.q_lora_rank is None:
+            q = self.q_proj(hidden_states)
+        else:
+            q = self.q_b_proj(self.q_a_layernorm(self.q_a_proj(hidden_states)))
+        q = q.view(bsz, q_len, self.num_heads, self.q_head_dim).transpose(1, 2)
+        q_nope, q_pe = torch.split(
+            q, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1
+        )
+
+        # The compressed projection carries the latent KV plus a single rotary key slice
+        # (viewed with a head dimension of 1) that is shared by every head.
+        compressed_kv = self.kv_a_proj_with_mqa(hidden_states)
+        compressed_kv, k_pe = torch.split(
+            compressed_kv, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1
+        )
+        k_pe = k_pe.view(bsz, q_len, 1, self.qk_rope_head_dim).transpose(1, 2)
+        kv = (
+            self.kv_b_proj(self.kv_a_layernorm(compressed_kv))
+            .view(bsz, q_len, self.num_heads, self.qk_nope_head_dim + self.v_head_dim)
+            .transpose(1, 2)
+        )
+
+        k_nope, value_states = torch.split(
+            kv, [self.qk_nope_head_dim, self.v_head_dim], dim=-1
+        )
+        kv_seq_len = value_states.shape[-2]
+        if past_key_value is not None:
+            if self.layer_idx is None:
+                raise ValueError(
+                    f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
+                    "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
+                    "with a layer index."
+                )
+            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
+        cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
+
+        q_pe, k_pe = apply_rotary_pos_emb(q_pe, k_pe, cos, sin, position_ids)
+
+        # Re-assemble full query/key heads as [non-rotary part | rotary part].
+        query_states = k_pe.new_empty(bsz, self.num_heads, q_len, self.q_head_dim)
+        query_states[:, :, :, : self.qk_nope_head_dim] = q_nope
+        query_states[:, :, :, self.qk_nope_head_dim :] = q_pe
+
+        key_states = k_pe.new_empty(bsz, self.num_heads, q_len, self.q_head_dim)
+        key_states[:, :, :, : self.qk_nope_head_dim] = k_nope
+        key_states[:, :, :, self.qk_nope_head_dim :] = k_pe
+        if past_key_value is not None:
+            cache_kwargs = {"sin": sin, "cos": cos}  # Specific to RoPE models
+            key_states, value_states = past_key_value.update(
+                key_states, value_states, self.layer_idx, cache_kwargs
+            )
+
+        attn_weights = (
+            torch.matmul(query_states, key_states.transpose(2, 3)) * self.softmax_scale
+        )
+
+        if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
+            raise ValueError(
+                f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
+                f" {attn_weights.size()}"
+            )
+        if attention_mask is None:
+            # Raised explicitly instead of via `assert`: assertions are stripped under
+            # `python -O`, which would silently run the softmax without any masking.
+            raise ValueError(
+                "`attention_mask` must not be None for the eager attention implementation."
+            )
+        if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
+            raise ValueError(
+                f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
+            )
+        attn_weights = attn_weights + attention_mask
+
+        # upcast attention to fp32
+        attn_weights = nn.functional.softmax(
+            attn_weights, dim=-1, dtype=torch.float32
+        ).to(query_states.dtype)
+        attn_weights = nn.functional.dropout(
+            attn_weights, p=self.attention_dropout, training=self.training
+        )
+        attn_output = torch.matmul(attn_weights, value_states)
+
+        if attn_output.size() != (bsz, self.num_heads, q_len, self.v_head_dim):
+            raise ValueError(
+                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.v_head_dim)}, but is"
+                f" {attn_output.size()}"
+            )
+
+        # Back to (bsz, q_len, num_heads * v_head_dim) for the output projection.
+        attn_output = attn_output.transpose(1, 2).contiguous()
+
+        attn_output = attn_output.reshape(bsz, q_len, self.num_heads * self.v_head_dim)
+
+        attn_output = self.o_proj(attn_output)
+
+        if not output_attentions:
+            attn_weights = None
+
+        return attn_output, attn_weights, past_key_value
+
+
+# Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2 with Llama->DeepseekV3
+class DeepseekV3FlashAttention2(DeepseekV3Attention):
+    """
+    DeepseekV3 flash attention module. This module inherits from `DeepseekV3Attention` as the weights of the module stays
+    untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
+    flash attention and deal with padding tokens in case the input contains any of them.
+    """
+
+    def __init__(self, *args, **kwargs):
+        """Forward all arguments to `DeepseekV3Attention` and record the flash-attn mask alignment."""
+        super().__init__(*args, **kwargs)
+
+        # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
+        # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
+        # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
+        self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Cache] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+        **kwargs,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        """Run attention through the flash-attention kernels.
+
+        Args:
+            hidden_states: 3-D input; its first two sizes are read as `(bsz, q_len)`.
+            attention_mask: Optional padding mask handed to `_flash_attention_forward`. It is
+                replaced by `kwargs["padding_mask"]` when that deprecated key is supplied.
+            position_ids: Forwarded to `apply_rotary_pos_emb`.
+            past_key_value: Optional cache, updated in place with the new key/value states.
+            output_attentions: Ignored; this implementation never returns attention weights.
+            use_cache: Accepted for interface compatibility; not read by this method.
+
+        Returns:
+            `(attn_output, None, past_key_value)`.
+
+        Raises:
+            ValueError: If a cache is passed but the module was built without a `layer_idx`.
+        """
+        # DeepseekV3FlashAttention2 attention does not support output_attentions
+        if "padding_mask" in kwargs:
+            warnings.warn(
+                "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
+            )
+
+            # overwrite attention_mask with padding_mask
+            attention_mask = kwargs.pop("padding_mask")
+
+        output_attentions = False
+
+        bsz, q_len, _ = hidden_states.size()
+
+        if self.q_lora_rank is None:
+            q = self.q_proj(hidden_states)
+        else:
+            q = self.q_b_proj(self.q_a_layernorm(self.q_a_proj(hidden_states)))
+        q = q.view(bsz, q_len, self.num_heads, self.q_head_dim).transpose(1, 2)
+        q_nope, q_pe = torch.split(
+            q, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1
+        )
+
+        # Flash attention expects (batch_size, seq_length, num_heads, head_dim). The tensors are
+        # built head-major here (for RoPE and the KV cache) and are transposed back right before
+        # the flash-attention call below.
+        compressed_kv = self.kv_a_proj_with_mqa(hidden_states)
+        compressed_kv, k_pe = torch.split(
+            compressed_kv, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1
+        )
+        k_pe = k_pe.view(bsz, q_len, 1, self.qk_rope_head_dim).transpose(1, 2)
+        kv = (
+            self.kv_b_proj(self.kv_a_layernorm(compressed_kv))
+            .view(bsz, q_len, self.num_heads, self.qk_nope_head_dim + self.v_head_dim)
+            .transpose(1, 2)
+        )
+
+        k_nope, value_states = torch.split(
+            kv, [self.qk_nope_head_dim, self.v_head_dim], dim=-1
+        )
+
+        kv_seq_len = value_states.shape[-2]
+        if past_key_value is not None:
+            # Same guard as the eager implementation: the cache is indexed by layer.
+            if self.layer_idx is None:
+                raise ValueError(
+                    f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
+                    "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
+                    "with a layer index."
+                )
+            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
+
+        cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
+        q_pe, k_pe = apply_rotary_pos_emb(q_pe, k_pe, cos, sin, position_ids)
+
+        # Re-assemble full query/key heads as [non-rotary part | rotary part].
+        query_states = k_pe.new_empty(bsz, self.num_heads, q_len, self.q_head_dim)
+        query_states[:, :, :, : self.qk_nope_head_dim] = q_nope
+        query_states[:, :, :, self.qk_nope_head_dim :] = q_pe
+
+        key_states = k_pe.new_empty(bsz, self.num_heads, q_len, self.q_head_dim)
+        key_states[:, :, :, : self.qk_nope_head_dim] = k_nope
+        key_states[:, :, :, self.qk_nope_head_dim :] = k_pe
+
+        # Values are zero-padded up to the query/key head size before the kernel call; the
+        # padding is sliced off the output again further down.
+        if self.q_head_dim != self.v_head_dim:
+            value_states = F.pad(value_states, [0, self.q_head_dim - self.v_head_dim])
+
+        if past_key_value is not None:
+            cache_kwargs = {"sin": sin, "cos": cos}  # Specific to RoPE models
+            key_states, value_states = past_key_value.update(
+                key_states, value_states, self.layer_idx, cache_kwargs
+            )
+
+        # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache
+        # to be able to avoid many of these transpose/reshape/view.
+        query_states = query_states.transpose(1, 2)
+        key_states = key_states.transpose(1, 2)
+        value_states = value_states.transpose(1, 2)
+
+        dropout_rate = self.attention_dropout if self.training else 0.0
+
+        # In PEFT, usually we cast the layer norms in float32 for training stability reasons
+        # therefore the input hidden states gets silently casted in float32. Hence, we need
+        # cast them back in the correct dtype just to be sure everything works as expected.
+        # This might slowdown training & inference so it is recommended to not cast the LayerNorms
+        # in fp32. (DeepseekV3RMSNorm handles it correctly)
+
+        input_dtype = query_states.dtype
+        if input_dtype == torch.float32:
+            # Handle the case where the model is quantized
+            if hasattr(self.config, "_pre_quantization_dtype"):
+                target_dtype = self.config._pre_quantization_dtype
+            elif torch.is_autocast_enabled():
+                target_dtype = torch.get_autocast_gpu_dtype()
+            else:
+                target_dtype = (
+                    self.q_proj.weight.dtype
+                    if self.q_lora_rank is None
+                    else self.q_a_proj.weight.dtype
+                )
+
+            logger.warning_once(
+                f"The input hidden states seems to be silently casted in float32, this might be related to"
+                f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
+                f" {target_dtype}."
+            )
+
+            query_states = query_states.to(target_dtype)
+            key_states = key_states.to(target_dtype)
+            value_states = value_states.to(target_dtype)
+
+        attn_output = self._flash_attention_forward(
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            q_len,
+            dropout=dropout_rate,
+            softmax_scale=self.softmax_scale,
+        )
+        # Drop the zero padding that was appended to the value head dimension above.
+        if self.q_head_dim != self.v_head_dim:
+            attn_output = attn_output[:, :, :, : self.v_head_dim]
+
+        attn_output = attn_output.reshape(
+            bsz, q_len, self.num_heads * self.v_head_dim
+        ).contiguous()
+        attn_output = self.o_proj(attn_output)
+
+        if not output_attentions:
+            attn_weights = None
+
+        return attn_output, attn_weights, past_key_value
+
+    def _flash_attention_forward(
+        self,
+        query_states,
+        key_states,
+        value_states,
+        attention_mask,
+        query_length,
+        dropout=0.0,
+        softmax_scale=None,
+    ):
+        """
+        Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
+        first unpad the input, then computes the attention scores and pad the final attention scores.
+
+        Args:
+            query_states (`torch.Tensor`):
+                Input query states to be passed to Flash Attention API
+            key_states (`torch.Tensor`):
+                Input key states to be passed to Flash Attention API
+            value_states (`torch.Tensor`):
+                Input value states to be passed to Flash Attention API
+            attention_mask (`torch.Tensor`):
+                The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
+                position of padding tokens and 1 for the position of non-padding tokens.
+            query_length (`int`):
+                Length of the query sequence. Selects the unpadding strategy, is used to re-pad the output, and
+                disables the causal flag when it equals 1 on flash-attn builds that use a top-left aligned mask.
+            dropout (`float`, *optional*):
+                Attention dropout
+            softmax_scale (`float`, *optional*):
+                The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
+
+        Returns:
+            The attention output produced by `flash_attn_func`, or by `flash_attn_varlen_func` followed by
+            `pad_input` when an `attention_mask` is given.
+        """
+        if not self._flash_attn_uses_top_left_mask:
+            causal = self.is_causal
+        else:
+            # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in DeepseekV3FlashAttention2 __init__.
+            causal = self.is_causal and query_length != 1
+
+        # Contains at least one padding token in the sequence
+        if attention_mask is not None:
+            batch_size = query_states.shape[0]
+            (
+                query_states,
+                key_states,
+                value_states,
+                indices_q,
+                cu_seq_lens,
+                max_seq_lens,
+            ) = self._upad_input(
+                query_states, key_states, value_states, attention_mask, query_length
+            )
+
+            cu_seqlens_q, cu_seqlens_k = cu_seq_lens
+            max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens
+
+            attn_output_unpad = flash_attn_varlen_func(
+                query_states,
+                key_states,
+                value_states,
+                cu_seqlens_q=cu_seqlens_q,
+                cu_seqlens_k=cu_seqlens_k,
+                max_seqlen_q=max_seqlen_in_batch_q,
+                max_seqlen_k=max_seqlen_in_batch_k,
+                dropout_p=dropout,
+                softmax_scale=softmax_scale,
+                causal=causal,
+            )
+
+            attn_output = pad_input(
+                attn_output_unpad, indices_q, batch_size, query_length
+            )
+        else:
+            attn_output = flash_attn_func(
+                query_states,
+                key_states,
+                value_states,
+                dropout,
+                softmax_scale=softmax_scale,
+                causal=causal,
+            )
+
+        return attn_output
+
+    def _upad_input(
+        self, query_layer, key_layer, value_layer, attention_mask, query_length
+    ):
+        """Strip padded positions from the query/key/value tensors for the varlen kernel.
+
+        Keys and values are flattened over `(batch, kv_seq_len)` and gathered with the indices
+        returned by `_get_unpad_data(attention_mask)`. The query is handled in one of three ways:
+
+        - `query_length == kv_seq_len`: gathered with the same indices as the keys;
+        - `query_length == 1`: one query row per batch element, no gathering needed;
+        - otherwise: unpadded with the last `query_length` columns of the mask.
+
+        Returns:
+            `(query, key, value, indices_q, (cu_seqlens_q, cu_seqlens_k),
+            (max_seqlen_in_batch_q, max_seqlen_in_batch_k))`.
+        """
+        indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
+        batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape
+
+        key_layer = index_first_axis(
+            key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim),
+            indices_k,
+        )
+        value_layer = index_first_axis(
+            value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim),
+            indices_k,
+        )
+        if query_length == kv_seq_len:
+            query_layer = index_first_axis(
+                query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim),
+                indices_k,
+            )
+            cu_seqlens_q = cu_seqlens_k
+            max_seqlen_in_batch_q = max_seqlen_in_batch_k
+            indices_q = indices_k
+        elif query_length == 1:
+            max_seqlen_in_batch_q = 1
+            cu_seqlens_q = torch.arange(
+                batch_size + 1, dtype=torch.int32, device=query_layer.device
+            )  # There is a memcpy here, that is very bad.
+            indices_q = cu_seqlens_q[:-1]
+            query_layer = query_layer.squeeze(1)
+        else:
+            # The -q_len: slice assumes left padding.
+            attention_mask = attention_mask[:, -query_length:]
+            query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(
+                query_layer, attention_mask
+            )
+
+        return (
+            query_layer,
+            key_layer,
+            value_layer,
+            indices_q,
+            (cu_seqlens_q, cu_seqlens_k),
+            (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
+        )
+
+
+# Maps an attention implementation name to the attention module class to instantiate.
+# `DeepseekV3DecoderLayer` indexes this table with `config._attn_implementation`.
+ATTENTION_CLASSES = {
+    "eager": DeepseekV3Attention,
+    "flash_attention_2": DeepseekV3FlashAttention2,
+}
+
+
+class DeepseekV3DecoderLayer(nn.Module):
+    """A single decoder block: self-attention then a feed-forward, each pre-normed with a residual."""
+
+    def __init__(self, config: DeepseekV3Config, layer_idx: int):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+
+        # The attention flavour is chosen by name from the module-level registry.
+        attention_cls = ATTENTION_CLASSES[config._attn_implementation]
+        self.self_attn = attention_cls(config=config, layer_idx=layer_idx)
+
+        # A layer uses the mixture-of-experts feed-forward only when experts are configured,
+        # the layer lies past the leading dense layers, and it falls on the MoE stride.
+        use_moe = (
+            config.n_routed_experts is not None
+            and layer_idx >= config.first_k_dense_replace
+            and layer_idx % config.moe_layer_freq == 0
+        )
+        if use_moe:
+            self.mlp = DeepseekV3MoE(config)
+        else:
+            self.mlp = DeepseekV3MLP(config)
+
+        norm_eps = config.rms_norm_eps
+        self.input_layernorm = DeepseekV3RMSNorm(config.hidden_size, eps=norm_eps)
+        self.post_attention_layernorm = DeepseekV3RMSNorm(
+            config.hidden_size, eps=norm_eps
+        )
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        output_attentions: Optional[bool] = False,
+        use_cache: Optional[bool] = False,
+        **kwargs,
+    ) -> Tuple[
+        torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]
+    ]:
+        """
+        Apply the attention and feed-forward sub-blocks to `hidden_states`.
+
+        Args:
+            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+            attention_mask (`torch.FloatTensor`, *optional*):
+                attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
+                query_sequence_length, key_sequence_length)` if default attention is used.
+            position_ids (`torch.LongTensor`, *optional*): passed through unchanged to the attention module.
+            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
+            output_attentions (`bool`, *optional*):
+                Whether or not to append the attention weights to the returned tuple.
+            use_cache (`bool`, *optional*):
+                Whether or not to append the key/value cache returned by the attention module to the returned tuple.
+
+        Returns:
+            A tuple starting with the new hidden states, followed by the attention weights when
+            `output_attentions` is set and by the key/value cache when `use_cache` is set.
+        """
+        if "padding_mask" in kwargs:
+            warnings.warn(
+                "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
+            )
+
+        # Attention sub-block: normalise, attend, add back onto the un-normalised input.
+        attn_input = self.input_layernorm(hidden_states)
+        attn_out, self_attn_weights, present_key_value = self.self_attn(
+            hidden_states=attn_input,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_value=past_key_value,
+            output_attentions=output_attentions,
+            use_cache=use_cache,
+            **kwargs,
+        )
+        hidden_states = hidden_states + attn_out
+
+        # Feed-forward sub-block, same pre-norm + residual pattern.
+        mlp_out = self.mlp(self.post_attention_layernorm(hidden_states))
+        hidden_states = hidden_states + mlp_out
+
+        outputs = (hidden_states,)
+        if output_attentions:
+            outputs = outputs + (self_attn_weights,)
+        if use_cache:
+            outputs = outputs + (present_key_value,)
+        return outputs
+
+
+# Docstring text handed to `add_start_docstrings` when decorating the model classes in
+# this module.
+DeepseekV3_START_DOCSTRING = r"""
+    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+
+    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
+    and behavior.
+
+    Parameters:
+        config ([`DeepseekV3Config`]):
+            Model configuration class with all the parameters of the model. Initializing with a config file does not
+            load the weights associated with the model, only the configuration. Check out the
+            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+
+@add_start_docstrings(
+    "The bare DeepseekV3 Model outputting raw hidden-states without any specific head on top.",
+    DeepseekV3_START_DOCSTRING,
+)
+class DeepseekV3PreTrainedModel(PreTrainedModel):
+    # Class-level settings read by the `PreTrainedModel` machinery.
+    config_class = DeepseekV3Config
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["DeepseekV3DecoderLayer"]
+    _skip_keys_device_placement = "past_key_values"
+    _supports_flash_attn_2 = True
+    _supports_cache_class = True
+
+    def _init_weights(self, module):
+        """Initialise `module` in place; only `nn.Linear` and `nn.Embedding` are touched.
+
+        Weights are drawn from a normal distribution with mean 0 and standard deviation
+        `config.initializer_range`. Linear biases and the embedding row at `padding_idx`
+        (when present) are set to zero.
+        """
+        init_std = self.config.initializer_range
+        if not isinstance(module, (nn.Linear, nn.Embedding)):
+            return
+
+        module.weight.data.normal_(mean=0.0, std=init_std)
+        if isinstance(module, nn.Linear):
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif module.padding_idx is not None:
+            module.weight.data[module.padding_idx].zero_()
+
+
+# Argument documentation injected into the models' `forward` docstrings through
+# `add_start_docstrings_to_model_forward`.
+DeepseekV3_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
+            it.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
+            `past_key_values`).
+
+            If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
+            and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
+            information on the default strategy.
+        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+            config.n_positions - 1]`.
+
+            [What are position IDs?](../glossary#position-ids)
+        past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
+            Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
+            blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
+            returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
+
+            Two formats are allowed:
+            - a [`~cache_utils.Cache`] instance;
+            - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
+            shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
+            cache format.
+
+            The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
+            legacy cache format will be returned.
+
+            If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
+            have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
+            of shape `(batch_size, sequence_length)`.
+        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+            model's internal embedding lookup matrix.
+        use_cache (`bool`, *optional*):
+            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+            `past_key_values`).
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+@add_start_docstrings(
+    "The bare DeepseekV3 Model outputting raw hidden-states without any specific head on top.",
+    DeepseekV3_START_DOCSTRING,
+)
+class DeepseekV3Model(DeepseekV3PreTrainedModel):
+    """
+    Decoder-only transformer: a token embedding, a stack of *config.num_hidden_layers*
+    [`DeepseekV3DecoderLayer`] blocks and a final RMS norm.
+
+    Args:
+        config: DeepseekV3Config
+    """
+
+    def __init__(self, config: DeepseekV3Config):
+        super().__init__(config)
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+
+        self.embed_tokens = nn.Embedding(
+            config.vocab_size, config.hidden_size, self.padding_idx
+        )
+        decoder_blocks = [
+            DeepseekV3DecoderLayer(config, layer_idx)
+            for layer_idx in range(config.num_hidden_layers)
+        ]
+        self.layers = nn.ModuleList(decoder_blocks)
+        # Remembered so `forward` knows which mask format the attention layers expect.
+        self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
+        self.norm = DeepseekV3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+        self.gradient_checkpointing = False
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        """Return the token embedding module."""
+        return self.embed_tokens
+
+    def set_input_embeddings(self, value):
+        """Replace the token embedding module with `value`."""
+        self.embed_tokens = value
+
+    @add_start_docstrings_to_model_forward(DeepseekV3_INPUTS_DOCSTRING)
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, BaseModelOutputWithPast]:
+        # Every flag the caller left unset falls back to the model config.
+        if output_attentions is None:
+            output_attentions = self.config.output_attentions
+        if output_hidden_states is None:
+            output_hidden_states = self.config.output_hidden_states
+        if use_cache is None:
+            use_cache = self.config.use_cache
+        if return_dict is None:
+            return_dict = self.config.use_return_dict
+
+        # Exactly one of `input_ids` / `inputs_embeds` must be given.
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError(
+                "You cannot specify both input_ids and inputs_embeds at the same time"
+            )
+        if input_ids is None and inputs_embeds is None:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+        model_input = input_ids if input_ids is not None else inputs_embeds
+        batch_size, seq_length = model_input.shape[:2]
+
+        # Legacy (tuple) caches are wrapped in a `DynamicCache` for the duration of the call.
+        past_key_values_length = 0
+        if use_cache:
+            use_legacy_cache = not isinstance(past_key_values, Cache)
+            if use_legacy_cache:
+                past_key_values = DynamicCache.from_legacy_cache(past_key_values)
+            past_key_values_length = past_key_values.get_usable_length(seq_length)
+
+        if position_ids is None:
+            # Default positions continue right after the cached prefix.
+            device = input_ids.device if input_ids is not None else inputs_embeds.device
+            position_ids = torch.arange(
+                past_key_values_length,
+                seq_length + past_key_values_length,
+                dtype=torch.long,
+                device=device,
+            ).unsqueeze(0)
+
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+
+        if self._use_flash_attention_2:
+            # 2d mask is passed through the layers, and only when it actually masks something
+            has_padding = attention_mask is not None and 0 in attention_mask
+            if not has_padding:
+                attention_mask = None
+        else:
+            # 4d mask is passed through the layers
+            attention_mask = _prepare_4d_causal_attention_mask(
+                attention_mask,
+                (batch_size, seq_length),
+                inputs_embeds,
+                past_key_values_length,
+            )
+
+        hidden_states = inputs_embeds
+
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+        next_decoder_cache = None
+        # Each layer returns (hidden, [attn_weights], [cache]); the cache slot shifts by one
+        # when attention weights are requested.
+        cache_slot = 2 if output_attentions else 1
+
+        for decoder_layer in self.layers:
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)
+
+            layer_result = decoder_layer(
+                hidden_states,
+                attention_mask=attention_mask,
+                position_ids=position_ids,
+                past_key_value=past_key_values,
+                output_attentions=output_attentions,
+                use_cache=use_cache,
+            )
+            hidden_states = layer_result[0]
+
+            if use_cache:
+                next_decoder_cache = layer_result[cache_slot]
+            if output_attentions:
+                all_self_attns = all_self_attns + (layer_result[1],)
+
+        hidden_states = self.norm(hidden_states)
+
+        # add hidden states from the last decoder layer
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+
+        # Hand the cache back in the same format the caller supplied.
+        next_cache = None
+        if use_cache:
+            if use_legacy_cache:
+                next_cache = next_decoder_cache.to_legacy_cache()
+            else:
+                next_cache = next_decoder_cache
+
+        if return_dict:
+            return BaseModelOutputWithPast(
+                last_hidden_state=hidden_states,
+                past_key_values=next_cache,
+                hidden_states=all_hidden_states,
+                attentions=all_self_attns,
+            )
+        candidates = [hidden_states, next_cache, all_hidden_states, all_self_attns]
+        return tuple(item for item in candidates if item is not None)
+
+
+class DeepseekV3ForCausalLM(DeepseekV3PreTrainedModel):
+    _tied_weights_keys = ["lm_head.weight"]
+
+    def __init__(self, config):
+        """Build the `DeepseekV3Model` backbone and a bias-free linear head of size `hidden_size -> vocab_size`."""
+        super().__init__(config)
+        self.model = DeepseekV3Model(config)
+        self.vocab_size = config.vocab_size
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        """Return the token embedding module of the wrapped base model."""
+        return self.model.embed_tokens
+
+    def set_input_embeddings(self, value):
+        """Replace the token embedding module of the wrapped base model with `value`."""
+        self.model.embed_tokens = value
+
+    def get_output_embeddings(self):
+        return self.lm_head
+
+    def set_output_embeddings(self, new_embeddings):
+        self.lm_head = new_embeddings
+
+    def set_decoder(self, decoder):
+        self.model = decoder
+
+    def get_decoder(self):
+        return self.model
+
+    @add_start_docstrings_to_model_forward(DeepseekV3_INPUTS_DOCSTRING)
+    @replace_return_docstrings(
+        output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC
+    )
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, CausalLMOutputWithPast]:
+        r"""
+        Args:
+            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Labels for computing the masked language modeling loss. Indices should either be in `[0, transformers.,
+                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+                (masked), the loss is only computed for the tokens with labels in `[0, transformers., config.vocab_size]`.
+
+        Returns:
+
+        Example:
+
+        ```python
+        >>> from transformers import AutoTokenizer, DeepseekV3ForCausalLM
+
+        >>> model = DeepseekV3ForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
+        >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
+
+        >>> prompt = "Hey, are you conscious? Can you talk to me?"
+        >>> inputs = tokenizer(prompt, return_tensors="pt")
+
+        >>> # Generate
+        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
+        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
+        ```"""
+        output_attentions = (
+            output_attentions
+            if output_attentions is not None
+            else self.config.output_attentions
+        )
+        output_hidden_states = (
+            output_hidden_states
+            if output_hidden_states is not None
+            else self.config.output_hidden_states
+        )
+        return_dict = (
+            return_dict if return_dict is not None else self.config.use_return_dict
+        )
+
+        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
+        outputs = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        hidden_states = outputs[0]
+        logits = self.lm_head(hidden_states)
+        logits = logits.float()
+
+        loss = None
+        if labels is not None:
+            # Shift so that tokens < n predict n
+            shift_logits = logits[..., :-1, :].contiguous()
+            shift_labels = labels[..., 1:].contiguous()
+            # Flatten the tokens
+            loss_fct = CrossEntropyLoss()
+            shift_logits = shift_logits.view(-1, self.config.vocab_size)
+            shift_labels = shift_labels.view(-1)
+            # Enable model parallelism
+            shift_labels = shift_labels.to(shift_logits.device)
+            loss = loss_fct(shift_logits, shift_labels)
+
+        if not return_dict:
+            output = (logits,) + outputs[1:]
+            return (loss,) + output if loss is not None else output
+
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+    def prepare_inputs_for_generation(
+        self,
+        input_ids,
+        past_key_values=None,
+        attention_mask=None,
+        inputs_embeds=None,
+        **kwargs,
+    ):
+        if past_key_values is not None:
+            if isinstance(past_key_values, Cache):
+                cache_length = past_key_values.get_seq_length()
+                past_length = past_key_values.seen_tokens
+                max_cache_length = past_key_values.get_max_length()
+            else:
+                cache_length = past_length = past_key_values[0][0].shape[2]
+                max_cache_length = None
+
+            # Keep only the unprocessed tokens:
+            # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
+            # some of the inputs are exclusivelly passed as part of the cache (e.g. when passing input_embeds as
+            # input)
+            if (
+                attention_mask is not None
+                and attention_mask.shape[1] > input_ids.shape[1]
+            ):
+                input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
+            # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
+            # input_ids based on the past_length.
+            elif past_length < input_ids.shape[1]:
+                input_ids = input_ids[:, past_length:]
+            # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.
+
+            # If we are about to go beyond the maximum cache length, we need to crop the input attention mask.
+            if (
+                max_cache_length is not None
+                and attention_mask is not None
+                and cache_length + input_ids.shape[1] > max_cache_length
+            ):
+                attention_mask = attention_mask[:, -max_cache_length:]
+
+        position_ids = kwargs.get("position_ids", None)
+        if attention_mask is not None and position_ids is None:
+            # create position_ids on the fly for batch generation
+            position_ids = attention_mask.long().cumsum(-1) - 1
+            position_ids.masked_fill_(attention_mask == 0, 1)
+            if past_key_values:
+                position_ids = position_ids[:, -input_ids.shape[1] :]
+
+        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
+        if inputs_embeds is not None and past_key_values is None:
+            model_inputs = {"inputs_embeds": inputs_embeds}
+        else:
+            model_inputs = {"input_ids": input_ids}
+
+        model_inputs.update(
+            {
+                "position_ids": position_ids,
+                "past_key_values": past_key_values,
+                "use_cache": kwargs.get("use_cache"),
+                "attention_mask": attention_mask,
+            }
+        )
+        return model_inputs
+
+    @staticmethod
+    def _reorder_cache(past_key_values, beam_idx):
+        reordered_past = ()
+        for layer_past in past_key_values:
+            reordered_past += (
+                tuple(
+                    past_state.index_select(0, beam_idx.to(past_state.device))
+                    for past_state in layer_past
+                ),
+            )
+        return reordered_past
+
+
+@add_start_docstrings(
+    """
+    The DeepseekV3 Model transformer with a sequence classification head on top (linear layer).
+
+    [`DeepseekV3ForSequenceClassification`] uses the last token in order to do the classification, as other causal models
+    (e.g. GPT-2) do.
+
+    Since it does classification on the last token, it requires to know the position of the last token. If a
+    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
+    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
+    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
+    each row of the batch).
+    """,
+    DeepseekV3_START_DOCSTRING,
+)
+class DeepseekV3ForSequenceClassification(DeepseekV3PreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+        self.model = DeepseekV3Model(config)
+        self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)  # per-position classification head; pooled in forward()
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.model.embed_tokens
+
+    def set_input_embeddings(self, value):
+        self.model.embed_tokens = value
+
+    @add_start_docstrings_to_model_forward(DeepseekV3_INPUTS_DOCSTRING)
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+        """
+        return_dict = (
+            return_dict if return_dict is not None else self.config.use_return_dict
+        )
+
+        transformer_outputs = self.model(
+            input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        hidden_states = transformer_outputs[0]
+        logits = self.score(hidden_states)  # scores for every position; one position per row is selected below
+
+        if input_ids is not None:
+            batch_size = input_ids.shape[0]
+        else:
+            batch_size = inputs_embeds.shape[0]
+
+        if self.config.pad_token_id is None and batch_size != 1:
+            raise ValueError(
+                "Cannot handle batch sizes > 1 if no padding token is defined."
+            )
+        if self.config.pad_token_id is None:
+            sequence_lengths = -1  # no pad token configured: pool the last position
+        else:
+            if input_ids is not None:
+                sequence_lengths = (
+                    torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1  # index before the first pad; -1 (last position) if the row has no pad or starts with one
+                ).to(logits.device)
+            else:
+                sequence_lengths = -1  # inputs_embeds path: padding cannot be detected, pool the last position
+
+        pooled_logits = logits[
+            torch.arange(batch_size, device=logits.device), sequence_lengths
+        ]
+
+        loss = None
+        if labels is not None:
+            labels = labels.to(logits.device)
+            if self.config.problem_type is None:  # infer the problem type and store it on the config (mutates self.config)
+                if self.num_labels == 1:
+                    self.config.problem_type = "regression"
+                elif self.num_labels > 1 and (
+                    labels.dtype == torch.long or labels.dtype == torch.int
+                ):
+                    self.config.problem_type = "single_label_classification"
+                else:
+                    self.config.problem_type = "multi_label_classification"
+
+            if self.config.problem_type == "regression":
+                loss_fct = MSELoss()
+                if self.num_labels == 1:
+                    loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())  # drop size-1 dims so both operands line up
+                else:
+                    loss = loss_fct(pooled_logits, labels)
+            elif self.config.problem_type == "single_label_classification":
+                loss_fct = CrossEntropyLoss()
+                loss = loss_fct(
+                    pooled_logits.view(-1, self.num_labels), labels.view(-1)
+                )
+            elif self.config.problem_type == "multi_label_classification":
+                loss_fct = BCEWithLogitsLoss()
+                loss = loss_fct(pooled_logits, labels)
+        if not return_dict:
+            output = (pooled_logits,) + transformer_outputs[1:]  # tuple mode: pooled logits replace the hidden states
+            return ((loss,) + output) if loss is not None else output
+
+        return SequenceClassifierOutputWithPast(
+            loss=loss,
+            logits=pooled_logits,
+            past_key_values=transformer_outputs.past_key_values,
+            hidden_states=transformer_outputs.hidden_states,
+            attentions=transformer_outputs.attentions,
+        )
diff --git a/services/core/models/tests/integration/parallelism/fixtures/deepseek-ai/deepseek-llm-67b-base/config.json b/services/core/models/tests/integration/parallelism/fixtures/deepseek-ai/deepseek-llm-67b-base/config.json
new file mode 100644
index 0000000000..031a7856cc
--- /dev/null
+++ b/services/core/models/tests/integration/parallelism/fixtures/deepseek-ai/deepseek-llm-67b-base/config.json
@@ -0,0 +1,25 @@
+{
+  "architectures": [
+    "LlamaForCausalLM"
+  ],
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "hidden_act": "silu",
+  "hidden_size": 8192,
+  "initializer_range": 0.02,
+  "intermediate_size": 22016,
+  "max_position_embeddings": 4096,
+  "model_type": "llama",
+  "num_attention_heads": 64,
+  "num_hidden_layers": 95,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 10000.0,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.33.1",
+  "use_cache": true,
+  "vocab_size": 102400
+}
diff --git a/services/core/models/tests/integration/parallelism/fixtures/deepseek-ai/deepseek-llm-7b-base/config.json b/services/core/models/tests/integration/parallelism/fixtures/deepseek-ai/deepseek-llm-7b-base/config.json
new file mode 100644
index 0000000000..208956063b
--- /dev/null
+++ b/services/core/models/tests/integration/parallelism/fixtures/deepseek-ai/deepseek-llm-7b-base/config.json
@@ -0,0 +1,25 @@
+{
+  "architectures": [
+    "LlamaForCausalLM"
+  ],
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "initializer_range": 0.02,
+  "intermediate_size": 11008,
+  "max_position_embeddings": 4096,
+  "model_type": "llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 30,
+  "num_key_value_heads": 32,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 10000.0,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.33.1",
+  "use_cache": true,
+  "vocab_size": 102400
+}
diff --git a/services/core/models/tests/integration/parallelism/fixtures/gpt2/config.json b/services/core/models/tests/integration/parallelism/fixtures/gpt2/config.json
new file mode 100644
index 0000000000..10c66461e4
--- /dev/null
+++ b/services/core/models/tests/integration/parallelism/fixtures/gpt2/config.json
@@ -0,0 +1,31 @@
+{
+  "activation_function": "gelu_new",
+  "architectures": [
+    "GPT2LMHeadModel"
+  ],
+  "attn_pdrop": 0.1,
+  "bos_token_id": 50256,
+  "embd_pdrop": 0.1,
+  "eos_token_id": 50256,
+  "initializer_range": 0.02,
+  "layer_norm_epsilon": 1e-05,
+  "model_type": "gpt2",
+  "n_ctx": 1024,
+  "n_embd": 768,
+  "n_head": 12,
+  "n_layer": 12,
+  "n_positions": 1024,
+  "resid_pdrop": 0.1,
+  "summary_activation": null,
+  "summary_first_dropout": 0.1,
+  "summary_proj_to_labels": true,
+  "summary_type": "cls_index",
+  "summary_use_proj": true,
+  "task_specific_params": {
+    "text-generation": {
+      "do_sample": true,
+      "max_length": 50
+    }
+  },
+  "vocab_size": 50257
+}
\ No newline at end of file
diff --git a/services/core/models/tests/integration/parallelism/fixtures/manifest.json b/services/core/models/tests/integration/parallelism/fixtures/manifest.json
new file mode 100644
index 0000000000..baddd3d114
--- /dev/null
+++ b/services/core/models/tests/integration/parallelism/fixtures/manifest.json
@@ -0,0 +1,69 @@
+{
+  "EleutherAI/gpt-j-6b": [
+    "config.json"
+  ],
+  "EleutherAI/gpt-neox-20b": [
+    "config.json"
+  ],
+  "Qwen/Qwen2.5-72B": [
+    "config.json"
+  ],
+  "Qwen/Qwen2.5-72B-Instruct": [
+    "config.json"
+  ],
+  "Qwen/Qwen2.5-7B": [
+    "config.json"
+  ],
+  "Qwen/Qwen3-4B-SafeRL": [
+    "config.json"
+  ],
+  "Qwen/Qwen3-8B": [
+    "config.json"
+  ],
+  "deepseek-ai/DeepSeek-V3-Base": [
+    "config.json",
+    "configuration_deepseek.py",
+    "modeling_deepseek.py"
+  ],
+  "deepseek-ai/deepseek-llm-67b-base": [
+    "config.json"
+  ],
+  "deepseek-ai/deepseek-llm-7b-base": [
+    "config.json"
+  ],
+  "gpt2": [
+    "config.json"
+  ],
+  "microsoft/phi-2": [
+    "config.json"
+  ],
+  "microsoft/phi-4": [
+    "config.json"
+  ],
+  "mistralai/Devstral-Small-2505": [
+    "config.json"
+  ],
+  "mistralai/Mistral-7B-v0.1": [
+    "config.json"
+  ],
+  "mistralai/Mixtral-8x7B-v0.1": [
+    "config.json"
+  ],
+  "nvidia/Mistral-NeMo-Minitron-8B-Instruct": [
+    "config.json"
+  ],
+  "nvidia/NVIDIA-Nemotron-Nano-9B-v2": [
+    "config.json",
+    "configuration_nemotron_h.py",
+    "modeling_nemotron_h.py"
+  ],
+  "nvidia/nemotron-4-340b-instruct": [
+    "model_config.yaml"
+  ],
+  "openai/gpt-oss-120b": [
+    "config.json"
+  ],
+  "openai/gpt-oss-20b": [
+    "config.json"
+  ]
+}
diff --git a/services/core/models/tests/integration/parallelism/fixtures/microsoft/phi-2/config.json b/services/core/models/tests/integration/parallelism/fixtures/microsoft/phi-2/config.json
new file mode 100644
index 0000000000..011968cc02
--- /dev/null
+++ b/services/core/models/tests/integration/parallelism/fixtures/microsoft/phi-2/config.json
@@ -0,0 +1,30 @@
+{
+  "_name_or_path": "microsoft/phi-2",
+  "architectures": [
+    "PhiForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": 50256,
+  "embd_pdrop": 0.0,
+  "eos_token_id": 50256,
+  "hidden_act": "gelu_new",
+  "hidden_size": 2560,
+  "initializer_range": 0.02,
+  "intermediate_size": 10240,
+  "layer_norm_eps": 1e-05,
+  "max_position_embeddings": 2048,
+  "model_type": "phi",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 32,
+  "partial_rotary_factor": 0.4,
+  "qk_layernorm": false,
+  "resid_pdrop": 0.1,
+  "rope_scaling": null,
+  "rope_theta": 10000.0,
+  "tie_word_embeddings": false,
+  "torch_dtype": "float16",
+  "transformers_version": "4.37.0",
+  "use_cache": true,
+  "vocab_size": 51200
+}
diff --git a/services/core/models/tests/integration/parallelism/fixtures/microsoft/phi-4/config.json b/services/core/models/tests/integration/parallelism/fixtures/microsoft/phi-4/config.json
new file mode 100644
index 0000000000..ab17e0b583
--- /dev/null
+++ b/services/core/models/tests/integration/parallelism/fixtures/microsoft/phi-4/config.json
@@ -0,0 +1,32 @@
+{
+  "_name_or_path": "microsoft/phi-4",
+  "architectures": [
+    "Phi3ForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 100257,
+  "embd_pdrop": 0.0,
+  "eos_token_id": 100265,
+  "hidden_act": "silu",
+  "hidden_size": 5120,
+  "initializer_range": 0.02,
+  "intermediate_size": 17920,
+  "max_position_embeddings": 16384,
+  "model_type": "phi3",
+  "num_attention_heads": 40,
+  "num_hidden_layers": 40,
+  "num_key_value_heads": 10,
+  "original_max_position_embeddings": 16384,
+  "pad_token_id": 100349,
+  "resid_pdrop": 0.0,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 250000,
+  "sliding_window": null,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.47.0",
+  "use_cache": true,
+  "vocab_size": 100352
+}
diff --git a/services/core/models/tests/integration/parallelism/fixtures/mistralai/Devstral-Small-2505/config.json b/services/core/models/tests/integration/parallelism/fixtures/mistralai/Devstral-Small-2505/config.json
new file mode 100644
index 0000000000..dae01ddab0
--- /dev/null
+++ b/services/core/models/tests/integration/parallelism/fixtures/mistralai/Devstral-Small-2505/config.json
@@ -0,0 +1,26 @@
+{
+  "architectures": [
+    "MistralForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 5120,
+  "initializer_range": 0.02,
+  "intermediate_size": 32768,
+  "max_position_embeddings": 131072,
+  "model_type": "mistral",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 40,
+  "num_key_value_heads": 8,
+  "rms_norm_eps": 1e-05,
+  "rope_theta": 1000000000.0,
+  "sliding_window": null,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.51.3",
+  "use_cache": true,
+  "vocab_size": 131072
+}
diff --git a/services/core/models/tests/integration/parallelism/fixtures/mistralai/Mistral-7B-v0.1/config.json b/services/core/models/tests/integration/parallelism/fixtures/mistralai/Mistral-7B-v0.1/config.json
new file mode 100644
index 0000000000..f4989f072a
--- /dev/null
+++ b/services/core/models/tests/integration/parallelism/fixtures/mistralai/Mistral-7B-v0.1/config.json
@@ -0,0 +1,24 @@
+{
+  "architectures": [
+    "MistralForCausalLM"
+  ],
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 32768,
+  "model_type": "mistral",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "rms_norm_eps": 1e-05,
+  "rope_theta": 10000.0,
+  "sliding_window": 4096,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.34.0.dev0",
+  "use_cache": true,
+  "vocab_size": 32000
+}
diff --git a/services/core/models/tests/integration/parallelism/fixtures/mistralai/Mixtral-8x7B-v0.1/config.json b/services/core/models/tests/integration/parallelism/fixtures/mistralai/Mixtral-8x7B-v0.1/config.json
new file mode 100644
index 0000000000..de132a80b2
--- /dev/null
+++ b/services/core/models/tests/integration/parallelism/fixtures/mistralai/Mixtral-8x7B-v0.1/config.json
@@ -0,0 +1,29 @@
+{
+  "architectures": [
+    "MixtralForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 32768,
+  "model_type": "mixtral",
+  "num_attention_heads": 32,
+  "num_experts_per_tok": 2,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "num_local_experts": 8,
+  "output_router_logits": false,
+  "rms_norm_eps": 1e-05,
+  "rope_theta": 1000000.0,
+  "router_aux_loss_coef": 0.02,
+  "sliding_window": null,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.36.0.dev0",
+  "use_cache": true,
+  "vocab_size": 32000
+}
diff --git a/services/core/models/tests/integration/parallelism/fixtures/nvidia/Mistral-NeMo-Minitron-8B-Instruct/config.json b/services/core/models/tests/integration/parallelism/fixtures/nvidia/Mistral-NeMo-Minitron-8B-Instruct/config.json
new file mode 100644
index 0000000000..55d6cbbe0b
--- /dev/null
+++ b/services/core/models/tests/integration/parallelism/fixtures/nvidia/Mistral-NeMo-Minitron-8B-Instruct/config.json
@@ -0,0 +1,28 @@
+{
+  "_name_or_path": "nvidia/Mistral-NeMo-Minitron-8B-Instruct",
+  "activation": "silu",
+  "architectures": [
+    "MistralForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "initializer_range": 0.02,
+  "intermediate_size": 11520,
+  "max_position_embeddings": 8192,
+  "model_type": "mistral",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 40,
+  "num_key_value_heads": 8,
+  "rms_norm_eps": 1e-05,
+  "rope_theta": 1000000.0,
+  "sliding_window": null,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.44.0",
+  "use_cache": true,
+  "vocab_size": 131072
+}
diff --git a/services/core/models/tests/integration/parallelism/fixtures/nvidia/NVIDIA-Nemotron-Nano-9B-v2/config.json b/services/core/models/tests/integration/parallelism/fixtures/nvidia/NVIDIA-Nemotron-Nano-9B-v2/config.json
new file mode 100644
index 0000000000..4f56c18a20
--- /dev/null
+++ b/services/core/models/tests/integration/parallelism/fixtures/nvidia/NVIDIA-Nemotron-Nano-9B-v2/config.json
@@ -0,0 +1,56 @@
+{
+  "architectures": [
+    "NemotronHForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_nemotron_h.NemotronHConfig",
+    "AutoModelForCausalLM": "modeling_nemotron_h.NemotronHForCausalLM"
+  },
+  "bos_token_id": 1,
+  "chunk_size": 128,
+  "conv_kernel": 4,
+  "eos_token_id": 12,
+  "head_dim": 128,
+  "hidden_dropout": 0.0,
+  "hidden_size": 4480,
+  "hybrid_override_pattern": "M-M-M-MM-M-M-M*-M-M-M*-M-M-M-M*-M-M-M-M*-M-MM-M-M-M-M-M-",
+  "initializer_range": 0.02,
+  "intermediate_size": 15680,
+  "layer_norm_epsilon": 1e-05,
+  "mamba_head_dim": 80,
+  "mamba_hidden_act": "silu",
+  "mamba_num_groups": 8,
+  "mamba_num_heads": 128,
+  "mamba_proj_bias": false,
+  "mamba_state_dim": 128,
+  "max_position_embeddings": 131072,
+  "mlp_bias": false,
+  "mlp_hidden_act": "relu2",
+  "model_type": "nemotron_h",
+  "n_groups": 8,
+  "num_attention_heads": 40,
+  "num_hidden_layers": 56,
+  "num_key_value_heads": 8,
+  "num_logits_to_keep": 1,
+  "num_query_groups": 8,
+  "pad_token_id": 0,
+  "rescale_prenorm_residual": true,
+  "residual_in_fp32": false,
+  "rms_norm_eps": 1e-05,
+  "sliding_window": null,
+  "ssm_state_size": 128,
+  "tie_word_embeddings": false,
+  "time_step_floor": 0.0001,
+  "time_step_max": 0.1,
+  "time_step_min": 0.001,
+  "time_step_rank": 256,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.51.3",
+  "use_bias": false,
+  "use_cache": true,
+  "use_conv_bias": true,
+  "use_mamba_kernels": true,
+  "vocab_size": 131072
+}
diff --git a/services/core/models/tests/integration/parallelism/fixtures/nvidia/NVIDIA-Nemotron-Nano-9B-v2/configuration_nemotron_h.py b/services/core/models/tests/integration/parallelism/fixtures/nvidia/NVIDIA-Nemotron-Nano-9B-v2/configuration_nemotron_h.py
new file mode 100644
index 0000000000..2b5c451b4a
--- /dev/null
+++ b/services/core/models/tests/integration/parallelism/fixtures/nvidia/NVIDIA-Nemotron-Nano-9B-v2/configuration_nemotron_h.py
@@ -0,0 +1,245 @@
+# coding=utf-8
+# Copyright 2024 AI21 Labs Ltd. and the HuggingFace Inc. team. All rights reserved.
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""NemotronH model configuration"""
+
+import re
+
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class NemotronHConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`NemotronHModel`]. It is used to instantiate a
+    NemotronH model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the NemotronH-v0.1 model.
+
+    [todo](todo)
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 131072):
+            Vocabulary size of the NemotronH model. Defines the number of different tokens that can be represented by the
+            `input_ids` passed when calling [`NemotronHModel`]
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether the model's input and output word embeddings should be tied. Note that this is only relevant if the
+            model has an output word embedding layer.
+        hidden_size (`int`, *optional*, defaults to 4096):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 21504):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 52):
+            Number of hidden layers in the Transformer encoder.
+        hybrid_override_pattern (`str`, *optional*, defaults to `"M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M-"`):
+            The pattern of the hybrid model. The pattern is a string of characters where each character represents M: Mamba2, *: Attention, -: MLP
+        num_attention_heads (`int`, *optional*, defaults to 32):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        head_dim (`int`, *optional*, defaults to 128):
+            Dimension of each attention head.
+        num_key_value_heads (`int`, *optional*, defaults to 8):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used.
+        mlp_hidden_act (`str`, *optional*, defaults to "relu2"):
+            The non-linear activation function in the MLP layers.
+        attention_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use bias in attention layers.
+        mlp_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use bias in MLP layers.
+        use_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use bias in the model.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
+            The epsilon used by the layer normalization layers.
+        residual_in_fp32 (`bool`, *optional*, defaults to `False`):
+            Whether or not residuals should be in `float32`. If set to `False` residuals will keep the same `dtype` as the rest of the model.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        num_logits_to_keep (`int` or `None`, *optional*, defaults to 1):
+            Number of prompt logits to calculate during generation. If `None`, all logits will be calculated. If an
+            integer value, only last `num_logits_to_keep` logits will be calculated.
+        pad_token_id (`int`, *optional*, defaults to 0):
+            The id of the padding token.
+        bos_token_id (`int`, *optional*, defaults to 1):
+            The id of the "beginning-of-sequence" token.
+        eos_token_id (`int`, *optional*, defaults to 2):
+            The id of the "end-of-sequence" token.
+        sliding_window (`int`, *optional*, defaults to None):
+            Sliding window attention window size.
+        max_position_embeddings (`int`, *optional*, defaults to 4096):
+            The maximum sequence length that this model might ever be used with.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        hidden_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the hidden states.
+        use_mamba_kernels (`bool`, *optional*, defaults to `True`):
+            Flag indicating whether or not to use the fast mamba kernels. These are available only if `mamba-ssm` and
+            `causal-conv1d` are installed, and the mamba modules are running on a CUDA device.
+        ssm_state_size (`int`, *optional*, defaults to 128):
+            The dimension of the mamba state space latents.
+        mamba_num_heads (`int`, *optional*, defaults to 128):
+            Number of heads in Mamba layers.
+        mamba_n_groups (`int`, *optional*, defaults to 8):
+            Number of groups in Mamba layers.
+        mamba_head_dim (`int`, *optional*, defaults to 64):
+            Dimension of each Mamba head.
+        mamba_d_conv (`int`, *optional*, defaults to 4):
+            The size of the mamba convolution kernel.
+        mamba_expand (`int`, *optional*, defaults to 2):
+            Expanding factor used to determine the mamba intermediate size.
+        mamba_hidden_act (`str`, *optional*, defaults to "silu"):
+            The non-linear activation function in the Mamba layers.
+        mamba_dt_min (`float`, *optional*, defaults to 0.001):
+            Minimum value for the time step in Mamba.
+        mamba_dt_max (`float`, *optional*, defaults to 0.1):
+            Maximum value for the time step in Mamba.
+        mamba_dt_limit (`tuple`, *optional*, defaults to (0.0, float("inf"))):
+            Limits for the time step in Mamba.
+        mamba_dt_init_floor (`float`, *optional*, defaults to 1e-4):
+            Floor value for time step initialization in Mamba.
+        mamba_conv_bias (`bool`, *optional*, defaults to `True`):
+            Whether to use bias in the convolution layer of the mamba mixer block.
+        mamba_proj_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use bias in the input and output projections of the mamba mixer block.
+        mamba_chunk_size (`int`, *optional*, defaults to 256):
+            Size of chunks for Mamba processing.
+        rescale_prenorm_residual (`bool`, *optional*, defaults to `True`):
+            Whether to rescale the pre-normalization residual connections.
+    """
+
+    model_type = "nemotron_h"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        vocab_size=131072,
+        tie_word_embeddings=False,
+        hidden_size=4096,
+        intermediate_size=21504,
+        num_hidden_layers=52,
+        hybrid_override_pattern="M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M-",
+        num_attention_heads=32,
+        # NOTE: the per-head attention width is named `head_dim` (there is no `attention_head_dim` argument).
+        head_dim=128,
+        num_key_value_heads=8,  # nemo: num_query_groups
+        mlp_hidden_act="relu2",
+        attention_bias=False,
+        mlp_bias=False,
+        use_bias=False,
+        initializer_range=0.02,  # nemo: init_method_std
+        layer_norm_epsilon=1e-5,  # nemo: layernorm_epsilon
+        residual_in_fp32=False,  # Megatron Core default value
+        use_cache=True,
+        num_logits_to_keep=1,
+        pad_token_id=0,
+        bos_token_id=1,
+        eos_token_id=2,
+        sliding_window=None,
+        max_position_embeddings=4096,
+        attention_dropout=0.0,
+        hidden_dropout=0.0,  # * ADDED
+        use_mamba_kernels=True,
+        ssm_state_size=128,  # mamba_state_size
+        mamba_num_heads=128,
+        mamba_n_groups=8,  # nemo: mamba_ssm_ngroups = num_heads
+        mamba_head_dim=64,
+        mamba_d_conv=4,
+        mamba_expand=2,
+        mamba_hidden_act="silu",
+        mamba_dt_min=0.001,
+        mamba_dt_max=0.1,
+        mamba_dt_limit=(0.0, float("inf")),
+        mamba_dt_init_floor=1e-4,
+        mamba_conv_bias=True,
+        mamba_proj_bias=False,
+        mamba_chunk_size=256,
+        rescale_prenorm_residual=True,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.tie_word_embeddings = tie_word_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.hybrid_override_pattern = hybrid_override_pattern
+        self.num_attention_heads = num_attention_heads
+        # Per-head attention width, stored under the name `head_dim`.
+        self.head_dim = head_dim
+        self.sliding_window = sliding_window
+        self.max_position_embeddings = max_position_embeddings
+        self.attention_dropout = attention_dropout
+        self.hidden_dropout = hidden_dropout
+
+        # Validate hybrid_override_pattern (NOTE: assert statements are skipped when Python runs with -O).
+        # M: Mamba2, *: Attention, -: MLP. '-' is placed last in the class so it is a literal, not a range.
+        assert len(self.hybrid_override_pattern) == self.num_hidden_layers, "hybrid_override_pattern must have the same length as num_hidden_layers"
+        assert re.match(r"^[M*-]+$", self.hybrid_override_pattern), "hybrid_override_pattern must only contain characters 'M', '*', or '-'"
+
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.mlp_hidden_act = mlp_hidden_act
+        self.attention_bias = attention_bias
+        self.mlp_bias = mlp_bias
+        self.use_bias = use_bias
+        self.initializer_range = initializer_range
+        self.layer_norm_epsilon = layer_norm_epsilon
+        self.residual_in_fp32 = residual_in_fp32
+
+        self.use_cache = use_cache
+        self.num_logits_to_keep = num_logits_to_keep
+
+        self.use_mamba_kernels = use_mamba_kernels
+        self.n_groups = mamba_n_groups
+        self.mamba_head_dim = mamba_head_dim
+        self.ssm_state_size = ssm_state_size
+        self.mamba_num_heads = mamba_num_heads
+        self.conv_kernel = mamba_d_conv
+        self.expand = mamba_expand
+        self.mamba_hidden_act = mamba_hidden_act
+        self.time_step_min = mamba_dt_min
+        self.time_step_max = mamba_dt_max
+        self.time_step_limit = mamba_dt_limit
+        self.time_step_floor = mamba_dt_init_floor
+        self.use_conv_bias = mamba_conv_bias
+        self.mamba_proj_bias = mamba_proj_bias
+        self.chunk_size = mamba_chunk_size
+        self.rescale_prenorm_residual = rescale_prenorm_residual
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+    @property
+    def layers_block_type(self):
+        """Per-layer block kind ("mamba", "attention" or "mlp") decoded from `hybrid_override_pattern`."""
+        kinds = {"M": "mamba", "*": "attention"}
+        pattern = self.hybrid_override_pattern
+        return [kinds.get(pattern[i], "mlp") for i in range(self.num_hidden_layers)]
\ No newline at end of file
diff --git a/services/core/models/tests/integration/parallelism/fixtures/nvidia/NVIDIA-Nemotron-Nano-9B-v2/modeling_nemotron_h.py b/services/core/models/tests/integration/parallelism/fixtures/nvidia/NVIDIA-Nemotron-Nano-9B-v2/modeling_nemotron_h.py
new file mode 100644
index 0000000000..7ac44be3f8
--- /dev/null
+++ b/services/core/models/tests/integration/parallelism/fixtures/nvidia/NVIDIA-Nemotron-Nano-9B-v2/modeling_nemotron_h.py
@@ -0,0 +1,1643 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc. team.
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch NemotronH model."""
+
+import math
+from dataclasses import dataclass
+from typing import Any, Dict, Optional, Tuple, Union
+
+import torch
+import torch.utils.checkpoint
+from torch import nn
+from torch.nn import CrossEntropyLoss
+
+from transformers.activations import ACT2FN
+from transformers.cache_utils import DynamicCache  # we need __iter__ and __len__ of pkv
+from transformers.generation import GenerationMixin
+from transformers.modeling_attn_mask_utils import (
+    AttentionMaskConverter,
+)
+from transformers.modeling_utils import PreTrainedModel
+from transformers.utils import (
+    ModelOutput,
+    add_code_sample_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+)
+from transformers.utils.import_utils import (
+    is_causal_conv1d_available,
+    is_flash_attn_2_available,
+    is_flash_attn_greater_or_equal_2_10,
+    is_mamba_2_ssm_available,
+)
+from .configuration_nemotron_h import NemotronHConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+# Copied from transformers.models.mamba.modeling_mamba2.modeling_mamba2.py with MAMBA2->NEMOTRONH,Mamba2->NemotronH
+# For Mamba2 components Mamba2->NemotronHMamba2
+if is_mamba_2_ssm_available():
+    from mamba_ssm.ops.triton.selective_state_update import selective_state_update
+    from mamba_ssm.ops.triton.ssd_combined import mamba_chunk_scan_combined, mamba_split_conv1d_scan_combined
+else:
+    mamba_chunk_scan_combined, mamba_split_conv1d_scan_combined, selective_state_update = None, None, None
+
+try:
+    # rmsnorm_fn backs MambaRMSNormGated.forward and, unlike the optional kernels, has no fallback here.
+    from mamba_ssm.ops.triton.layernorm_gated import rmsnorm_fn
+except ImportError as err:
+    raise ImportError("mamba-ssm is required by the Mamba model but cannot be imported") from err
+
+if is_causal_conv1d_available():
+    from causal_conv1d import causal_conv1d_fn, causal_conv1d_update
+else:
+    causal_conv1d_update, causal_conv1d_fn = None, None
+
+if is_flash_attn_2_available():
+    from transformers.modeling_flash_attention_utils import _flash_attention_forward
+
+is_fast_path_available = all(
+    (
+        selective_state_update,
+        mamba_chunk_scan_combined,
+        mamba_split_conv1d_scan_combined,
+        causal_conv1d_fn,
+        causal_conv1d_update,
+    )
+)
+
+
+_CHECKPOINT_FOR_DOC = "nvidia/Nemotron-H-56B-Base-8K"
+_CONFIG_FOR_DOC = "NemotronHConfig"
+
+
+# Helper methods for segment sum computation
+
+
+def pad_tensor_by_size(input_tensor: torch.Tensor, pad_size: int):
+    """Zero-pad `input_tensor` with `pad_size` trailing entries along dim 1 (seq_len).
+
+    Only rank-4 input gets the 8-element pad spec; any other rank uses the rank-3 spec."""
+    if input_tensor.dim() == 4:
+        spec = (0, 0, 0, 0, 0, pad_size, 0, 0)
+    else:
+        spec = (0, 0, 0, pad_size, 0, 0)
+    return torch.nn.functional.pad(input_tensor, spec, mode="constant", value=0)
+
+
+def reshape_into_chunks(input_tensor, pad_size, chunk_size):
+    """
+    Pad `input_tensor` by `pad_size` on dim 1 via `pad_tensor_by_size`, then split
+    dim 1 into chunks of `chunk_size`.
+
+    Rank-3 input yields [shape[0], -1, chunk_size, shape[2]]; any other rank is
+    treated as rank 4 and yields [shape[0], -1, chunk_size, shape[2], shape[3]].
+    """
+    padded = pad_tensor_by_size(input_tensor, pad_size)
+    dims = padded.shape
+    # Leading dim is kept; -1 lets reshape infer the number of chunks.
+    chunked_shape = [dims[0], -1, chunk_size, dims[2]]
+
+    if len(dims) != 3:
+        # Carry the extra trailing dim through (head_dim or state_size, per the upstream comment).
+        chunked_shape.append(dims[3])
+
+    return padded.reshape(*chunked_shape)
+
+
+def segment_sum(input_tensor):
+    """
+    Segment sums built from a masked cumulative sum rather than direct subtraction.
+
+    Input [..., T] -> output [..., T, T]; entries above the diagonal are -inf.
+    """
+    length = input_tensor.size(-1)
+    ones = torch.ones(length, length, device=input_tensor.device, dtype=torch.bool)
+    strictly_lower = torch.tril(ones, diagonal=-1)
+    lower_incl_diag = torch.tril(ones, diagonal=0)
+
+    # Repeat the values along a new trailing axis: [..., T] -> [..., T, T].
+    repeated = input_tensor[..., None].expand(*input_tensor.size(), length)
+    # Zero everything on/above the diagonal so the cumsum only adds strictly-lower terms.
+    repeated = repeated.masked_fill(~strictly_lower, 0)
+    running = torch.cumsum(repeated, dim=-2)
+    # Keep the lower triangle (diagonal included); fill the rest with -inf.
+    return running.masked_fill(~lower_incl_diag, -torch.inf)
+
+
+def apply_mask_to_padding_states(hidden_states, attention_mask):
+    """
+    Multiply hidden states by the attention mask, see https://github.com/state-spaces/mamba/issues/66
+
+    The mask is applied only when it is given and both of its first two dims exceed 1.
+    """
+    if attention_mask is None or attention_mask.shape[1] <= 1 or attention_mask.shape[0] <= 1:
+        return hidden_states
+    return (hidden_states * attention_mask[:, :, None]).to(hidden_states.dtype)
+
+# Copied from https://github.com/huggingface/transformers/blob/main/src/transformers/models/jamba/modeling_jamba.py
+class HybridMambaAttentionDynamicCache(DynamicCache):
+    """
+    A dynamic cache that can handle both the attention cache (which has a seq_len dimension) and the mamba cache
+    (which has a constant shape regardless of seq_len).
+
+    This cache has two sets of lists of tensors: `key_cache` and `value_cache` for attention cache and `conv_states`
+    and `ssm_states` for mamba cache. Each of these lists has `num_layers` tensors. The expected shape of each tensor
+    depends on the layer type. For attention layers, `key_cache` and `value_cache` have a shape of `(batch_size, num_heads, seq_len, head_dim)`,
+    while `conv_states` and `ssm_states` have a shape of `(batch_size, 0)` (empty tensors).
+    For mamba layers, `key_cache` and `value_cache` have a shape of `(batch_size, 0)` (empty tensors),
+    while `conv_states` represents the convolution state and has a shape of `(batch_size, d_inner, d_conv)`,
+    and `ssm_states` represents the ssm state and has a shape of `(batch_size, d_inner, d_state)`.
+    """
+
+    def __init__(self, config, batch_size, dtype=torch.float16, device=None):
+        super().__init__()
+        self.dtype = dtype
+        self.hybrid_override_pattern = config.hybrid_override_pattern
+        self.has_previous_state = False  # only used by mamba
+        # Mamba inner width comes from the head layout, not from config.expand * config.hidden_size.
+        intermediate_size = config.mamba_num_heads * config.mamba_head_dim
+        ssm_state_size = config.ssm_state_size
+        conv_kernel_size = config.conv_kernel
+        self.conv_states = []
+        self.ssm_states = []
+        self.transformer_layers = []
+        for i in range(config.num_hidden_layers):
+            if self.hybrid_override_pattern[i] == "M":
+                # Mamba layer
+                self.conv_states += [
+                    torch.zeros(batch_size, intermediate_size, conv_kernel_size, device=device, dtype=dtype)
+                ]
+                self.ssm_states += [
+                    torch.zeros(batch_size, intermediate_size, ssm_state_size, device=device, dtype=dtype)
+                ]
+            else:
+                # Attention or MLP layer: empty (batch_size, 0) placeholders keep the lists index-aligned
+                self.conv_states += [torch.tensor([[]] * batch_size, device=device)]
+                self.ssm_states += [torch.tensor([[]] * batch_size, device=device)]
+                self.transformer_layers.append(i)
+
+        self.key_cache = [torch.tensor([[]] * batch_size, device=device) for _ in range(config.num_hidden_layers)]
+        self.value_cache = [torch.tensor([[]] * batch_size, device=device) for _ in range(config.num_hidden_layers)]
+
+    def update(
+        self,
+        key_states: torch.Tensor,
+        value_states: torch.Tensor,
+        layer_idx: int,
+        cache_kwargs: Optional[Dict[str, Any]] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        # First write for a layer replaces the empty placeholder; later writes append along dim 2.
+        if self.key_cache[layer_idx].shape[-1] == 0:
+            self.key_cache[layer_idx] = key_states
+            self.value_cache[layer_idx] = value_states
+        else:
+            self.key_cache[layer_idx] = torch.cat([self.key_cache[layer_idx], key_states], dim=2)
+            self.value_cache[layer_idx] = torch.cat([self.value_cache[layer_idx], value_states], dim=2)
+
+        return self.key_cache[layer_idx], self.value_cache[layer_idx]
+
+    def reorder_cache(self, beam_idx: torch.LongTensor):
+        """Reorders the cache for beam search, given the selected beam indices."""
+        for layer_idx in range(len(self.key_cache)):
+            device = self.key_cache[layer_idx].device
+            self.key_cache[layer_idx] = self.key_cache[layer_idx].index_select(0, beam_idx.to(device))
+            device = self.value_cache[layer_idx].device
+            self.value_cache[layer_idx] = self.value_cache[layer_idx].index_select(0, beam_idx.to(device))
+
+            device = self.conv_states[layer_idx].device
+            self.conv_states[layer_idx] = self.conv_states[layer_idx].index_select(0, beam_idx.to(device))
+            device = self.ssm_states[layer_idx].device
+            self.ssm_states[layer_idx] = self.ssm_states[layer_idx].index_select(0, beam_idx.to(device))
+
+    def get_seq_length(self, layer_idx: Optional[int] = 0) -> int:
+        """Returns the sequence length of the cached states. A layer index can be optionally passed."""
+        # take any layer that contains cache and not empty tensor
+        layer_idx = self.transformer_layers[0] if layer_idx not in self.transformer_layers else layer_idx
+        if len(self.key_cache) <= layer_idx:
+            return 0
+        return self.key_cache[layer_idx].shape[-2]
+
+    def to_legacy_cache(self) -> Tuple[Tuple[torch.Tensor], Tuple[torch.Tensor]]:
+        raise NotImplementedError("HybridMambaAttentionDynamicCache does not have a legacy cache equivalent.")
+
+    @classmethod
+    def from_legacy_cache(cls, past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None) -> "DynamicCache":
+        raise NotImplementedError("HybridMambaAttentionDynamicCache does not have a legacy cache equivalent.")
+
+    # Adapted from modeling_mamba2.py; states live in Python lists here, so devices are read per layer.
+    def update_conv_state(
+        self, layer_idx: int, new_conv_state: torch.Tensor, cache_init: bool = False
+    ) -> torch.Tensor:
+        if cache_init:
+            self.conv_states[layer_idx] = new_conv_state.to(self.conv_states[layer_idx].device)
+        else:
+            self.conv_states[layer_idx] = self.conv_states[layer_idx].roll(shifts=-1, dims=-1)
+            self.conv_states[layer_idx][:, :, -1] = new_conv_state[:, 0, :].to(self.conv_states[layer_idx].device)
+        return self.conv_states[layer_idx]
+
+    def update_ssm_state(self, layer_idx: int, new_ssm_state: torch.Tensor):
+        self.ssm_states[layer_idx] = new_ssm_state.to(self.ssm_states[layer_idx].device)
+        return self.ssm_states[layer_idx]
+
+    def reset(self):
+        for state in (*self.conv_states, *self.ssm_states):
+            state.zero_()  # in place; the list containers themselves have no zero_()
+
+class MambaRMSNormGated(torch.nn.Module):
+    """Holds a learnable weight and delegates the gated norm to mamba-ssm's `rmsnorm_fn`."""
+
+    def __init__(self, hidden_size, group_size, eps=1e-5):
+        super().__init__()
+        self.group_size = group_size
+        self.variance_epsilon = eps
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+
+    def forward(self, hidden_states, gate=None):
+        """Call `rmsnorm_fn` with `gate` forwarded as `z`, no bias, and `norm_before_gate=False`."""
+        kernel_kwargs = {
+            "x": hidden_states, "weight": self.weight, "bias": None, "z": gate,
+            "eps": self.variance_epsilon, "group_size": self.group_size,
+            "norm_before_gate": False,
+        }
+        return rmsnorm_fn(**kernel_kwargs)
+
+class NemotronHMamba2Mixer(nn.Module):
+    """
+    Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`.
+    A, D are input independent (see Mamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective)
+    ∆, B, C are input-dependent (this is a key difference between Mamba and the linear time invariant S4,
+    and is why Mamba is called **selective** state spaces)
+    """
+
+    def __init__(self, config: NemotronHConfig, layer_idx: int):
+        super().__init__()
+        self.num_heads = config.mamba_num_heads
+        self.hidden_size = config.hidden_size
+        self.ssm_state_size = config.ssm_state_size
+        self.conv_kernel_size = config.conv_kernel
+        self.intermediate_size = config.mamba_num_heads * config.mamba_head_dim
+        self.layer_idx = layer_idx
+        self.use_conv_bias = config.use_conv_bias
+        self.activation = config.mamba_hidden_act
+        self.act = ACT2FN[config.mamba_hidden_act]
+
+        self.layer_norm_epsilon = config.layer_norm_epsilon
+
+        self.n_groups = config.n_groups
+        self.head_dim = config.mamba_head_dim
+        self.chunk_size = config.chunk_size
+
+        self.time_step_limit = config.time_step_limit
+        self.time_step_min = config.time_step_min
+        self.time_step_max = config.time_step_max
+
+        self.conv_dim = self.intermediate_size + 2 * self.n_groups * self.ssm_state_size
+        self.conv1d = nn.Conv1d(
+            in_channels=self.conv_dim,
+            out_channels=self.conv_dim,
+            bias=config.use_conv_bias,
+            kernel_size=config.conv_kernel,
+            groups=self.conv_dim,
+            padding=config.conv_kernel - 1,
+        )
+
+        # projection of the input hidden states
+        projection_size = self.intermediate_size + self.conv_dim + self.num_heads
+        self.in_proj = nn.Linear(
+            self.hidden_size,
+            projection_size,
+            bias=config.use_bias,
+        )
+        # selective projection used to make dt, B and C input dependant
+
+        # time step projection (discretization)
+        # instantiate once and copy inv_dt in init_weights of PretrainedModel
+        self.dt_bias = nn.Parameter(torch.ones(self.num_heads))
+
+        # S4D real initialization. These are not discretized!
+        # The core is to load them, compute the discrete states, then write the updated state. Keeps the memory bounded
+        A = torch.arange(1, self.num_heads + 1)
+        self.A_log = nn.Parameter(torch.log(A))
+        self.A_log._no_weight_decay = True
+        self.norm = MambaRMSNormGated(self.intermediate_size, eps=self.layer_norm_epsilon, group_size=self.intermediate_size // self.n_groups)
+        self.D = nn.Parameter(torch.ones(self.num_heads))
+        self.D._no_weight_decay = True
+
+        self.out_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.use_bias)
+        self.use_bias = config.use_bias
+
+        if not is_fast_path_available:
+            logger.warning_once(
+                "The fast path is not available because on of `(selective_state_update, causal_conv1d_fn, causal_conv1d_update)`"
+                " is None. Falling back to the naive implementation. To install follow https://github.com/state-spaces/mamba/#installation and"
+                " https://github.com/Dao-AILab/causal-conv1d"
+            )
+
+    def cuda_kernels_forward(
+        self,
+        hidden_states: torch.Tensor,
+        cache_params: Optional[HybridMambaAttentionDynamicCache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+    ):
+        # 1. Gated MLP's linear projection
+        hidden_states = apply_mask_to_padding_states(hidden_states, attention_mask)
+        projected_states = self.in_proj(hidden_states)
+
+        # Set up dimensions for reshapes later
+        batch_size, seq_len, _ = hidden_states.shape
+        groups_time_state_size = self.n_groups * self.ssm_state_size
+        d_mlp = (
+            projected_states.shape[-1]
+            - 2 * self.intermediate_size
+            - 2 * self.n_groups * self.ssm_state_size
+            - self.num_heads
+        ) // 2
+
+        # Single step calculations via cache
+        if cache_params is not None and cache_position is not None and cache_position[0] > 0:
+            _, _, gate, hidden_states_B_C, dt = projected_states.squeeze(1).split(
+                [d_mlp, d_mlp, self.intermediate_size, self.conv_dim, self.num_heads], dim=-1
+            )
+
+            # 2. Convolution sequence transformation
+            hidden_states_B_C = causal_conv1d_update(
+                hidden_states_B_C,
+                cache_params.conv_states[self.layer_idx],
+                self.conv1d.weight.squeeze(1),
+                self.conv1d.bias,
+                self.activation,
+            )
+
+            hidden_states, B, C = torch.split(
+                hidden_states_B_C,
+                [self.intermediate_size, groups_time_state_size, groups_time_state_size],
+                dim=-1,
+            )
+
+            # 3. SSM transformation
+            A = -torch.exp(self.A_log.float())  # (nheads,)
+            A = A[:, None, ...][:, :, None].expand(-1, self.head_dim, self.ssm_state_size).to(dtype=torch.float32)
+            dt = dt[:, :, None].expand(-1, -1, self.head_dim)
+            dt_bias = self.dt_bias[:, None, ...].expand(-1, self.head_dim)
+            D = self.D[:, None, ...].expand(-1, self.head_dim)
+            B = B.view(batch_size, self.n_groups, B.shape[1] // self.n_groups)
+            C = C.view(batch_size, self.n_groups, C.shape[1] // self.n_groups)
+            hidden_states_reshaped = hidden_states.view(batch_size, self.num_heads, self.head_dim)
+            hidden_states = selective_state_update(
+                cache_params.ssm_states[self.layer_idx],
+                hidden_states_reshaped,
+                dt,
+                A,
+                B,
+                C,
+                D,
+                z=None,
+                dt_bias=dt_bias,
+                dt_softplus=True,
+            )
+            hidden_states = hidden_states.view(batch_size, self.num_heads * self.head_dim)
+            hidden_states = self.norm(hidden_states, gate)
+
+            # 4. Final linear projection
+            out = self.out_proj(hidden_states)[:, None, ...]
+
+        # Fused calculations or step by step if no initialized cache is found
+        else:
+            A = -torch.exp(self.A_log.float())  # (num_heads) or (intermediate_size, state_size)
+            dt_limit_kwargs = {} if self.time_step_limit == (0.0, float("inf")) else {"dt_limit": self.time_step_limit}
+
+            # 2-4. Fused kernel for conv1d, SSM, and the final projection
+            if self.training and cache_params is None:
+                out = mamba_split_conv1d_scan_combined(
+                    projected_states,
+                    self.conv1d.weight.squeeze(1),
+                    self.conv1d.bias,
+                    self.dt_bias,
+                    A,
+                    D=self.D,
+                    chunk_size=self.chunk_size,
+                    seq_idx=None,  # was seq_idx
+                    activation=self.activation,
+                    rmsnorm_weight=self.norm.weight,
+                    rmsnorm_eps=self.norm.variance_epsilon,
+                    outproj_weight=self.out_proj.weight,
+                    outproj_bias=self.out_proj.bias,
+                    headdim=self.head_dim,
+                    ngroups=self.n_groups,
+                    norm_before_gate=False,
+                    return_final_states=False,
+                    **dt_limit_kwargs,
+                )
+
+            else:
+                _, _, gate, hidden_states_B_C, dt = projected_states.split(
+                    [d_mlp, d_mlp, self.intermediate_size, self.conv_dim, self.num_heads], dim=-1
+                )
+
+                # 2. Convolution sequence transformation
+                # Init cache
+                if cache_params is not None:
+                    hidden_states_B_C_transposed = hidden_states_B_C.transpose(1, 2)
+                    conv_states = nn.functional.pad(
+                        hidden_states_B_C_transposed,
+                        (cache_params.conv_kernel_size - hidden_states_B_C_transposed.shape[-1], 0),
+                    )
+                    cache_params.update_conv_state(
+                        layer_idx=self.layer_idx, new_conv_state=conv_states, cache_init=True
+                    )
+
+                if self.activation not in ["silu", "swish"]:
+                    hidden_states_B_C = self.act(
+                        self.conv1d(hidden_states_B_C.transpose(1, 2))[..., :seq_len].transpose(1, 2)
+                    )
+                else:
+                    hidden_states_B_C = causal_conv1d_fn(
+                        x=hidden_states_B_C.transpose(1, 2),
+                        weight=self.conv1d.weight.squeeze(1),
+                        bias=self.conv1d.bias,
+                        activation=self.activation,
+                    ).transpose(1, 2)
+                hidden_states_B_C = apply_mask_to_padding_states(hidden_states_B_C, attention_mask)
+                hidden_states, B, C = torch.split(
+                    hidden_states_B_C,
+                    [self.intermediate_size, groups_time_state_size, groups_time_state_size],
+                    dim=-1,
+                )
+
+                # 3. SSM transformation
+                scan_output, ssm_state = mamba_chunk_scan_combined(
+                    hidden_states.view(batch_size, seq_len, -1, self.head_dim),
+                    dt,
+                    A,
+                    B.view(batch_size, seq_len, self.n_groups, -1),
+                    C.view(batch_size, seq_len, self.n_groups, -1),
+                    chunk_size=self.chunk_size,
+                    D=self.D,
+                    z=None,
+                    seq_idx=None,
+                    return_final_states=True,
+                    dt_bias=self.dt_bias,
+                    dt_softplus=True,
+                    **dt_limit_kwargs,
+                )
+
+                # Init cache
+                if ssm_state is not None and cache_params is not None:
+                    cache_params.update_ssm_state(layer_idx=self.layer_idx, new_ssm_state=ssm_state)
+
+                scan_output = scan_output.view(batch_size, seq_len, -1)
+
+                # Multiply "gate" branch and apply extra normalization layer
+                scan_output = self.norm(scan_output, gate)
+
+                # 4. Final linear projection
+                out = self.out_proj(scan_output)
+        return out
+
+    # fmt: off
+    def torch_forward(self, input_states, cache_params: Optional[HybridMambaAttentionDynamicCache]=None, cache_position:Optional[torch.LongTensor]=None, attention_mask: Optional[torch.Tensor]=None):
+        """Pure-PyTorch mixer pass: one-token cached update when `cache_position[0] > 0`, otherwise a chunked scan over the whole sequence."""
+        batch_size, seq_len, _ = input_states.shape
+        dtype = input_states.dtype
+
+        # 1. Gated MLP's linear projection
+        input_states = apply_mask_to_padding_states(input_states, attention_mask)
+        projected_states = self.in_proj(input_states)
+        d_mlp = (projected_states.shape[-1] - 2 * self.intermediate_size - 2 * self.n_groups * self.ssm_state_size-self.num_heads) // 2
+        _, _, gate, hidden_states_B_C, dt = projected_states.split([d_mlp, d_mlp, self.intermediate_size, self.conv_dim, self.num_heads], dim=-1)
+
+        # 2. Convolution sequence transformation
+        if cache_params is not None and cache_position is not None and cache_position[0] > 0:
+            cache_params.update_conv_state(layer_idx=self.layer_idx, new_conv_state=hidden_states_B_C, cache_init=False)
+
+            # We need to guarantee that anything regarding the cache is on the same device
+            conv_states = cache_params.conv_states[self.layer_idx].to(device=self.conv1d.weight.device)
+
+            hidden_states_B_C = torch.sum(
+                conv_states * self.conv1d.weight.squeeze(1), dim=-1
+            )
+            if self.use_conv_bias:
+                hidden_states_B_C = hidden_states_B_C + self.conv1d.bias
+            hidden_states_B_C = self.act(hidden_states_B_C)
+        else:
+            # Init cache
+            if cache_params is not None:
+                hidden_states_B_C_transposed = hidden_states_B_C.transpose(1, 2)
+                conv_states = nn.functional.pad(
+                    hidden_states_B_C_transposed, (cache_params.conv_kernel_size - hidden_states_B_C_transposed.shape[-1], 0)
+                )
+                cache_params.update_conv_state(layer_idx=self.layer_idx, new_conv_state=conv_states, cache_init=True)
+
+            hidden_states_B_C = self.act(self.conv1d(hidden_states_B_C.transpose(1, 2))[..., :seq_len].transpose(1, 2))
+
+        hidden_states_B_C = apply_mask_to_padding_states(hidden_states_B_C, attention_mask)
+        hidden_states, B, C = torch.split(
+            hidden_states_B_C,
+            [self.intermediate_size, self.n_groups * self.ssm_state_size, self.n_groups * self.ssm_state_size],
+            dim=-1
+        )
+
+        # 3. SSM transformation
+        A = -torch.exp(self.A_log.float())                            # [num_heads]
+        if cache_params is not None and cache_position is not None and cache_position[0] > 0:
+            # We need to guarantee that anything regarding the cache is on the same device
+            # (`ssm_states` is indexed per layer everywhere else in this method, so take this layer's device)
+            cache_device = cache_params.ssm_states[self.layer_idx].device
+
+            # Note: there is no need to pad parameter matrices here, as there is just one new token
+            # for batched generation
+            dt = dt[:, 0, :][:, None, ...]
+            dt = dt.transpose(1, 2).expand(batch_size, dt.shape[-1], self.head_dim)
+            # [num_heads] -> [num_heads, head_dim]
+            dt_bias = self.dt_bias[..., None].expand(self.dt_bias.shape[0], self.head_dim)
+
+            dt = torch.nn.functional.softplus(dt + dt_bias.to(dt.dtype))
+            dt = torch.clamp(dt, self.time_step_limit[0], self.time_step_limit[1])
+            A = A[..., None, None].expand(self.num_heads, self.head_dim, self.ssm_state_size).to(dtype=torch.float32)
+            # [bsz, num_heads, head_dim, state_size]
+            dA = (torch.exp(dt[..., None] * A)).to(device=cache_device)
+
+            # Discretize B
+            # [bsz, n_groups * state_size] -> [bsz, n_groups, 1, state_size] ->
+            # -> [bsz, n_groups, group to head repetition factor, state_size] -> [bsz, num_heads, state_size]
+            B = B.reshape(batch_size, self.n_groups, -1)[..., None, :]
+            B = B.expand(batch_size, self.n_groups, self.num_heads // self.n_groups, B.shape[-1]).contiguous()
+            B = B.reshape(batch_size, -1, B.shape[-1])
+            # [bsz, num_heads, head_dim, state_size]
+            dB = dt[..., None] * B[..., None, :]
+
+            # Discretize x into dB
+            # [bsz, intermediate_size] -> [bsz, num_heads, head_dim]
+            hidden_states = hidden_states.reshape(batch_size, -1, self.head_dim)
+            dBx = (dB * hidden_states[..., None]).to(device=cache_device)
+
+            # State calculation
+            cache_params.update_ssm_state(
+                layer_idx=self.layer_idx,
+                new_ssm_state=cache_params.ssm_states[self.layer_idx] * dA + dBx
+            )
+
+            # Subsequent output
+            # [bsz, n_groups * state_size] -> [bsz, num_heads, state_size]
+            C = C.reshape(batch_size, self.n_groups, -1)[..., None, :]
+            C = C.expand(batch_size, self.n_groups, self.num_heads // self.n_groups, C.shape[-1]).contiguous()
+            C = C.reshape(batch_size, -1, C.shape[-1])
+            # [bsz, num_heads, head_dim]
+
+            ssm_states = cache_params.ssm_states[self.layer_idx].to(device=C.device, dtype=C.dtype)  # Shape: [b, h, d, n]
+            # Reshape ssm_states to merge the first two dimensions
+            ssm_states_reshaped = ssm_states.view(batch_size * self.num_heads, self.head_dim, self.ssm_state_size)  # Shape: [b*h, d, n]
+            C_reshaped = C.view(batch_size * self.num_heads, self.ssm_state_size, 1)  # Shape: [b*h, n, 1]
+            y = torch.bmm(ssm_states_reshaped, C_reshaped)
+            y = y.view(batch_size, self.num_heads, self.head_dim)
+
+            # D skip connection
+            # [num_heads] -> [num_heads, head_dim]
+            D = self.D[..., None].expand(self.D.shape[0], self.head_dim)
+            y = (y + hidden_states * D).to(y.dtype)
+
+            # [bsz, num_heads, head_dim] -> [bsz, 1, intermediate_size]
+            y = y.reshape(batch_size, -1)[:, None, ...]
+        else:
+            # begin ssd naive implementation without einsums
+            dt = nn.functional.softplus(dt + self.dt_bias)
+            dt = torch.clamp(dt, self.time_step_limit[0], self.time_step_limit[1])
+            hidden_states = hidden_states.reshape(batch_size, seq_len, -1, self.head_dim).float()
+            B = B.reshape(batch_size, seq_len, -1, self.ssm_state_size).float()
+            C = C.reshape(batch_size, seq_len, -1, self.ssm_state_size).float()
+            B = B.repeat(1, 1, self.num_heads // self.n_groups, 1)
+            C = C.repeat(1, 1, self.num_heads // self.n_groups, 1)
+            pad_size = (self.chunk_size - seq_len % self.chunk_size) % self.chunk_size
+
+            D_residual = self.D[..., None] * pad_tensor_by_size(hidden_states, pad_size)
+
+            # Discretize x and A
+            hidden_states = hidden_states * dt[..., None]
+            A = A.to(hidden_states.dtype) * dt
+
+            # Rearrange into blocks/chunks
+            hidden_states, A, B, C = [reshape_into_chunks(t, pad_size, self.chunk_size) for t in (hidden_states, A, B, C)]
+
+            # [bsz, -1, chunk_size, num_heads] -> [bsz, num_heads, -1, chunk_size]
+            A = A.permute(0, 3, 1, 2)
+            A_cumsum = torch.cumsum(A, dim=-1)
+
+            # 1. Compute the output for each intra-chunk (diagonal blocks)
+            # This is the analog of a causal mask
+            L = torch.exp(segment_sum(A))
+
+            # Contraction of C and B to get G (attention-weights like)
+            G_intermediate = C[:, :, :, None, :, :] * B[:, :, None, :, :, :]  # shape: (b, c, l, s, h, n)
+            G = G_intermediate.sum(dim=-1)  # shape: (b, c, l, s, h)
+
+            # Compute M, equivalent to applying attention mask to weights
+            M_intermediate = G[..., None] * L.permute(0, 2, 3, 4, 1)[..., None]
+            M = M_intermediate.sum(dim=-1)
+
+            # Compute Y_diag (apply to values)
+            Y_diag = (M[..., None] * hidden_states[:, :, None]).sum(dim=3)
+
+            # 2. Compute the state for each intra-chunk
+            # (right term of low-rank factorization of off-diagonal blocks; B terms)
+            decay_states = torch.exp((A_cumsum[:, :, :, -1:] - A_cumsum))
+            B_decay = B * decay_states.permute(0, -2, -1, 1)[..., None]
+            states = (B_decay[..., None, :] * hidden_states[..., None]).sum(dim=2)
+
+            # 3. Compute the inter-chunk SSM recurrence; produces correct SSM states at chunk boundaries
+            # (middle term of factorization of off-diag blocks; A terms)
+            if cache_params is not None and cache_position is not None and cache_position[0] > 0:
+                previous_states = cache_params.ssm_states[self.layer_idx][:, None, ...].to(device=states.device)
+            else:
+                previous_states = torch.zeros_like(states[:, :1])
+            states = torch.cat([previous_states, states], dim=1)
+            decay_chunk = torch.exp(segment_sum(nn.functional.pad(A_cumsum[:, :, :, -1], (1, 0))))
+            decay_chunk = decay_chunk.transpose(1, 3)
+            new_states = (decay_chunk[..., None, None] * states[:, :, None, ...]).sum(dim=1)
+            states, ssm_state = new_states[:, :-1], new_states[:, -1]
+
+            # 4. Compute state -> output conversion per chunk
+            # (left term of low-rank factorization of off-diagonal blocks; C terms)
+            state_decay_out = torch.exp(A_cumsum)
+            C_times_states = (C[..., None, :] * states[:, :, None, ...])
+            state_decay_out_permuted = state_decay_out.permute(0, 2, 3, 1)
+            Y_off = (C_times_states.sum(-1) * state_decay_out_permuted[..., None])
+
+            # Add output of intra-chunk and inter-chunk terms (diagonal and off-diagonal blocks)
+            y = Y_diag + Y_off
+            # [bsz, -1, self.chunk_size, num_heads, head_dim] -> [bsz, (padded) seq_len, num_heads, head_dim]
+            y = y.reshape(batch_size, -1, self.num_heads, self.head_dim)
+
+            y = y + D_residual
+            # Cutting off padded chunks
+            if pad_size > 0:
+                y = y[:, :seq_len, :, :]
+            y = y.reshape(batch_size, seq_len, -1)
+
+            # Init cache
+            if ssm_state is not None and cache_params is not None:
+                cache_params.update_ssm_state(layer_idx=self.layer_idx, new_ssm_state=ssm_state)
+
+        scan_output = self.norm(y, gate)
+
+        # end ssd naive
+
+        # 4. Final linear projection
+        contextualized_states = self.out_proj(scan_output.to(dtype))  # [batch, seq_len, hidden_size]
+        return contextualized_states
+    # fmt: on
+
+    def forward(
+        self,
+        hidden_states,
+        cache_params: Optional[HybridMambaAttentionDynamicCache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+    ):
+        """Use the fused CUDA kernels when they are available and the weights live on CUDA, else the torch path."""
+        if is_fast_path_available and "cuda" in self.in_proj.weight.device.type:
+            return self.cuda_kernels_forward(hidden_states, cache_params, cache_position, attention_mask)
+        # tune out hidden states for pad tokens, see https://github.com/state-spaces/mamba/issues/66
+        needs_masking = attention_mask is not None and attention_mask.shape[1] > 1 and attention_mask.shape[0] > 1
+        if needs_masking:
+            hidden_states = (hidden_states * attention_mask[:, :, None]).to(hidden_states.dtype)
+        return self.torch_forward(hidden_states, cache_params, cache_position, attention_mask)
+
+
+class NemotronHRMSNorm(nn.Module):
+    """Root-mean-square layer norm (no mean-centering, no bias); equivalent to T5LayerNorm and LlamaRMSNorm."""
+
+    def __init__(self, hidden_size, eps=1e-6):
+        super().__init__()
+        self.variance_epsilon = eps
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+
+    def forward(self, hidden_states):
+        """Normalize over the last dim in float32, scale by `weight`, and cast back to the input dtype."""
+        orig_dtype = hidden_states.dtype
+        x = hidden_states.to(torch.float32)
+        mean_sq = x.pow(2).mean(-1, keepdim=True)
+        x = x * torch.rsqrt(mean_sq + self.variance_epsilon)
+        # Scale in float32 before casting back
+        return (self.weight.to(torch.float32) * x).to(orig_dtype)
+
+class NemotronHBlock(nn.Module):
+    """Pre-norm residual block around one mixer (Mamba2, attention or MLP) selected by `config.layers_block_type`."""
+
+    def __init__(self, config, layer_idx):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        self.residual_in_fp32 = config.residual_in_fp32
+        self.norm = NemotronHRMSNorm(config.hidden_size, eps=config.layer_norm_epsilon)
+
+        # M: Mamba2, *: Attention, -: MLP
+        self.block_type = config.layers_block_type[layer_idx]
+        if self.block_type == "mamba":
+            self.mixer = NemotronHMamba2Mixer(config, layer_idx=layer_idx)
+        elif self.block_type == "attention":
+            self.mixer = NEMOTRONH_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx=layer_idx)
+        elif self.block_type == "mlp":
+            self.mixer = NemotronHMLP(config, layer_idx=layer_idx)
+        else:
+            raise ValueError(f"Invalid layer pattern {config.hybrid_override_pattern[layer_idx]}")
+
+    def forward(
+        self,
+        hidden_states,
+        cache_params: Optional[HybridMambaAttentionDynamicCache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+    ):
+        """Return `residual + mixer(norm(hidden_states))`; `attention_mask` is accepted but not passed to the mixer."""
+        if hidden_states.device.type != "cuda":
+            # torch.cuda.default_stream() rejects non-CUDA devices, so run directly there.
+            return self._forward_impl(hidden_states, cache_params, cache_position)
+        # * Use torch.cuda.stream() to avoid NaN issues when using multiple GPUs
+        with torch.cuda.stream(torch.cuda.default_stream(hidden_states.device)):
+            return self._forward_impl(hidden_states, cache_params, cache_position)
+
+    def _forward_impl(self, hidden_states, cache_params, cache_position):
+        """Norm -> mixer -> residual add, on whatever stream the caller selected."""
+        residual = hidden_states
+        hidden_states = self.norm(hidden_states.to(dtype=self.norm.weight.dtype))
+        if self.residual_in_fp32:
+            residual = residual.to(torch.float32)
+        if self.block_type == "mamba":
+            hidden_states = self.mixer(hidden_states, cache_params=cache_params, cache_position=cache_position)
+        elif self.block_type == "attention":
+            hidden_states = self.mixer(hidden_states, cache_position=cache_position)[0]  # (output, weights, cache)
+        elif self.block_type == "mlp":
+            hidden_states = self.mixer(hidden_states)
+        else:
+            raise ValueError(f"Invalid block_type: {self.block_type}")
+        return residual + hidden_states
+
+
+# Copied from transformers.models.nemotron.modeling_nemotron Nemotron->NemotronH
+class NemotronHMLP(nn.Module):
+    """Non-gated two-layer MLP: `down_proj(act_fn(up_proj(x)))`."""
+    def __init__(self, config, layer_idx: Optional[int] = None):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        if layer_idx is None:
+            logger.warning_once(
+                f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
+                "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
+                "when creating this class."
+            )
+        self.hidden_size, self.intermediate_size = config.hidden_size, config.intermediate_size
+        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
+        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.mlp_bias)
+        self.act_fn = ACT2FN[config.mlp_hidden_act]
+
+    def forward(self, x):
+        activated = self.act_fn(self.up_proj(x))
+        return self.down_proj(activated)
+
+
+# Copied from transformers.models.llama.modeling_llama.repeat_kv
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    Repeat every key/value head `n_rep` times along dim 1, matching torch.repeat_interleave(x, dim=1, repeats=n_rep):
+    (batch, num_key_value_heads, seqlen, head_dim) -> (batch, num_key_value_heads * n_rep, seqlen, head_dim).
+    """
+    bsz, kv_heads, seq_len, dim = hidden_states.shape
+    if n_rep == 1:
+        return hidden_states
+    expanded = hidden_states.unsqueeze(2).expand(bsz, kv_heads, n_rep, seq_len, dim)
+    return expanded.reshape(bsz, kv_heads * n_rep, seq_len, dim)
+
+
+class NemotronHAttention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+    def __init__(self, config: NemotronHConfig, layer_idx: Optional[int] = None):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        if layer_idx is None:
+            logger.warning_once(
+                f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
+                "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
+                "when creating this class."
+            )
+
+        self.attention_dropout = config.attention_dropout
+        self.hidden_size = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        # An explicit `config.head_dim` wins; otherwise split the hidden size evenly across heads.
+        self.head_dim = config.head_dim if config.head_dim is not None else self.hidden_size // self.num_heads
+        self.num_key_value_heads = config.num_key_value_heads
+        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
+        self.max_position_embeddings = config.max_position_embeddings
+        self.is_causal = True
+
+        q_width = self.num_heads * self.head_dim
+        kv_width = self.num_key_value_heads * self.head_dim
+        self.q_proj = nn.Linear(self.hidden_size, q_width, bias=config.attention_bias)
+        self.k_proj = nn.Linear(self.hidden_size, kv_width, bias=config.attention_bias)
+        self.v_proj = nn.Linear(self.hidden_size, kv_width, bias=config.attention_bias)
+        self.o_proj = nn.Linear(q_width, self.hidden_size, bias=config.attention_bias)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        # position_embeddings: Tuple[torch.Tensor, torch.Tensor], #TODO
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[HybridMambaAttentionDynamicCache] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+        cache_position: Optional[torch.LongTensor] = None,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        """
+        Run SDPA over the q/k/v projections; returns `(attn_output, None, past_key_value)`.
+        """
+        bsz, q_len, _ = hidden_states.size()
+
+        def split_heads(projected: torch.Tensor, n_heads: int) -> torch.Tensor:
+            # (bsz, q_len, n_heads * head_dim) -> (bsz, n_heads, q_len, head_dim)
+            return projected.view(bsz, q_len, n_heads, self.head_dim).transpose(1, 2)
+
+        query_states = split_heads(self.q_proj(hidden_states), self.num_heads)
+        key_states = split_heads(self.k_proj(hidden_states), self.num_key_value_heads)
+        value_states = split_heads(self.v_proj(hidden_states), self.num_key_value_heads)
+
+        if past_key_value is not None:
+            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx)
+
+        key_states = repeat_kv(key_states, self.num_key_value_groups)
+        value_states = repeat_kv(value_states, self.num_key_value_groups)
+
+        # no matter the length, we just slice the mask to the key length
+        causal_mask = None if attention_mask is None else attention_mask[:, :, :, : key_states.shape[-2]]
+
+        if query_states.device.type == "cuda" and attention_mask is not None:
+            query_states = query_states.contiguous()
+            key_states = key_states.contiguous()
+            value_states = value_states.contiguous()
+
+        # `is_causal` is only set when no explicit mask is given and there is more than one query token.
+        attn_output = torch.nn.functional.scaled_dot_product_attention(
+            query_states,
+            key_states,
+            value_states,
+            attn_mask=causal_mask,
+            dropout_p=self.attention_dropout if self.training else 0.0,
+            is_causal=causal_mask is None and q_len > 1,
+        )
+        attn_output = attn_output.transpose(1, 2).contiguous()
+        attn_output = attn_output.view(bsz, q_len, self.num_heads * self.head_dim)
+
+        attn_output = self.o_proj(attn_output)
+
+        return attn_output, None, past_key_value
+
+
+# Adapted from transformers.models.mistral.modeling_mistral.MistralFlashAttention2 with Mistral->Jamba
+#class JambaFlashAttention2(JambaAttention):
+class NemotronHFlashAttention2(NemotronHAttention):
+    """
+    NemotronH flash attention module. This module inherits from `NemotronHAttention` as the weights of the module stays
+    untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
+    flash attention and deal with padding tokens in case the input contains any of them.
+    """
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
+        # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
+        # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
+        self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[HybridMambaAttentionDynamicCache] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
+    ):
+        bsz, q_len, _ = hidden_states.size()
+
+        query_states = self.q_proj(hidden_states)
+        key_states = self.k_proj(hidden_states)
+        value_states = self.v_proj(hidden_states)
+
+        # Flash attention requires the input to have the shape
+        # batch_size x seq_length x num_heads x head_dim
+        # therefore we just need to keep the original shape
+        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim)
+        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+
+        if past_key_value is not None:
+            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx)
+
+        # repeat k/v heads if n_kv_heads < n_heads
+        key_states = repeat_kv(key_states, self.num_key_value_groups)
+        value_states = repeat_kv(value_states, self.num_key_value_groups)
+        dropout_rate = 0.0 if not self.training else self.attention_dropout
+
+        # In PEFT, usually we cast the layer norms in float32 for training stability reasons
+        # therefore the input hidden states gets silently casted in float32. Hence, we need
+        # cast them back in float16 just to be sure everything works as expected.
+        input_dtype = query_states.dtype
+        if input_dtype == torch.float32:
+            if torch.is_autocast_enabled():
+                target_dtype = torch.get_autocast_gpu_dtype()
+            # Handle the case where the model is quantized
+            elif hasattr(self.config, "_pre_quantization_dtype"):
+                target_dtype = self.config._pre_quantization_dtype
+            else:
+                target_dtype = self.q_proj.weight.dtype
+
+            logger.warning_once(
+                f"The input hidden states seems to be silently casted in float32, this might be related to"
+                f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
+                f" {target_dtype}."
+            )
+
+            query_states = query_states.to(target_dtype)
+            key_states = key_states.to(target_dtype)
+            value_states = value_states.to(target_dtype)
+
+        # Reshape to the expected shape for Flash Attention
+        key_states = key_states.transpose(1, 2)
+        value_states = value_states.transpose(1, 2)
+
+        attn_output = _flash_attention_forward(
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            q_len,
+            dropout=dropout_rate,
+            sliding_window=getattr(self.config, "sliding_window", None),
+            is_causal=self.is_causal,
+            use_top_left_mask=self._flash_attn_uses_top_left_mask,
+        )
+
+        # The projected width is num_heads * head_dim, which need not equal hidden_size when config.head_dim is set.
+        attn_output = attn_output.reshape(bsz, q_len, self.num_heads * self.head_dim).contiguous()
+        attn_output = self.o_proj(attn_output)
+
+        # No attention weights are computed on this path, so return None even when `output_attentions=True`
+        attn_weights = None
+
+        return attn_output, attn_weights, past_key_value
+
+
+# Adapted from transformers.models.mistral.modeling_mistral.MistralSdpaAttention with Mistral->Jamba->NemotronH
+# (the upstream header this class was ported from was `class JambaSdpaAttention(JambaAttention)`).
+class NemotronHSdpaAttention(NemotronHAttention):
+    """
+    NemotronH attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
+    `NemotronHAttention` and defines no parameters of its own, so the weights of the module stay untouched. The only
+    changes are on the forward pass, to adapt to the SDPA API.
+    """
+
+    # Adapted from NemotronHAttention.forward
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[HybridMambaAttentionDynamicCache] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+        cache_position: Optional[torch.LongTensor] = None,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        if output_attentions:
+            # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
+            logger.warning_once(
+                "NemotronHModel is using NemotronHSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
+                'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
+            )
+            return super().forward(
+                hidden_states=hidden_states,
+                attention_mask=attention_mask,
+                position_ids=position_ids,
+                past_key_value=past_key_value,
+                output_attentions=output_attentions,
+                use_cache=use_cache,  # NOTE(review): `cache_position` is not forwarded to the eager path — confirm it is not needed
+            )
+
+        bsz, q_len, _ = hidden_states.size()
+
+        query_states = self.q_proj(hidden_states)
+        key_states = self.k_proj(hidden_states)
+        value_states = self.v_proj(hidden_states)
+
+        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)  # -> (bsz, num_heads, q_len, head_dim)
+        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+
+        if past_key_value is not None:  # the cache hands back the K/V to attend over (presumably past + current — TODO confirm)
+            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx)
+
+        key_states = repeat_kv(key_states, self.num_key_value_groups)  # helper defined outside this chunk; presumably tiles K/V heads to match num_heads — TODO confirm
+        value_states = repeat_kv(value_states, self.num_key_value_groups)
+
+        causal_mask = attention_mask
+        if attention_mask is not None:
+            causal_mask = causal_mask[:, :, :, : key_states.shape[-2]]  # crop the mask's last dim to the key length
+
+        # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
+        # Reference: https://github.com/pytorch/pytorch/issues/112577.
+        if query_states.device.type == "cuda" and attention_mask is not None:
+            query_states = query_states.contiguous()
+            key_states = key_states.contiguous()
+            value_states = value_states.contiguous()
+
+        # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
+        # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
+        # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1.
+        is_causal = True if self.is_causal and causal_mask is None and q_len > 1 else False
+
+        attn_output = torch.nn.functional.scaled_dot_product_attention(
+            query_states,
+            key_states,
+            value_states,
+            attn_mask=causal_mask,
+            dropout_p=self.attention_dropout if self.training else 0.0,
+            is_causal=is_causal,
+        )
+
+        attn_output = attn_output.transpose(1, 2).contiguous()
+        attn_output = attn_output.view(bsz, q_len, self.hidden_size)
+
+        attn_output = self.o_proj(attn_output)
+
+        return attn_output, None, past_key_value  # the SDPA path never returns attention weights
+
+
+NEMOTRONH_ATTENTION_CLASSES = {  # attention-implementation name -> attention module class
+    "eager": NemotronHAttention,
+    "flash_attention_2": NemotronHFlashAttention2,
+    "sdpa": NemotronHSdpaAttention,
+}
+
+# Copied from transformers.models.mamba2.modeling_mamba2.Mamba2PreTrainedModel
+class NemotronHPreTrainedModel(PreTrainedModel):
+    """
+    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+    models.
+    """
+
+    config_class = NemotronHConfig
+    base_model_prefix = "backbone"  # same name as the `backbone` attribute set in NemotronHForCausalLM.__init__
+    _no_split_modules = ["NemotronHBlock"]
+    supports_gradient_checkpointing = True
+    _is_stateful = True
+    _supports_flash_attn_2 = True  # NOTE(review): an "sdpa" attention class is registered but no `_supports_sdpa` flag is set here — confirm
+
+    def _init_weights(self, module):
+        """Initialize `module` in place: Mamba2 mixer `dt_bias`, Linear biases, Embedding weights, optional `out_proj` rescale."""
+        if isinstance(module, NemotronHMamba2Mixer):
+            if getattr(module.dt_bias, "_is_hf_initialized", False):
+                return  # leaves this mixer untouched; this also skips the rescale step further down
+            module.A_log._no_weight_decay = True  # marker attribute; presumably read by the optimizer setup — TODO confirm
+            module.D._no_weight_decay = True
+
+            dt = torch.exp(  # log-uniform sample in [time_step_min, time_step_max], then floored at time_step_floor
+                torch.rand(self.config.mamba_num_heads)
+                * (math.log(self.config.time_step_max) - math.log(self.config.time_step_min))
+                + math.log(self.config.time_step_min)
+            ).clamp(min=self.config.time_step_floor)
+
+            # Inverse of softplus (softplus(inv_dt) == dt): https://github.com/pytorch/pytorch/issues/72759
+            inv_dt = dt + torch.log(-torch.expm1(-dt))
+            with torch.no_grad():
+                module.dt_bias.copy_(inv_dt)
+            module.dt_bias._no_reinit = True  # same flag the nn.Linear branch below checks before zeroing a bias
+
+        if isinstance(module, nn.Linear):
+            if module.bias is not None:
+                if not getattr(module.bias, "_no_reinit", False):
+                    nn.init.zeros_(module.bias)
+        elif isinstance(module, nn.Embedding):
+            nn.init.normal_(module.weight, std=self.config.initializer_range)
+
+        # TODO: Check
+        if self.config.rescale_prenorm_residual:
+            # Reinitialize selected weights subject to the OpenAI GPT-2 Paper Scheme:
+            #   > A modified initialization which accounts for the accumulation on the residual path with model depth. Scale
+            #   > the weights of residual layers at initialization by a factor of 1/√N where N is the # of residual layers.
+            #   >   -- GPT-2 :: https://openai.com/blog/better-language-models/
+            #
+            # Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py
+            for name, p in module.named_parameters():
+                if getattr(p, "_is_hf_initialized", False):
+                    continue
+                if name in ["out_proj.weight"]:
+                    # Special Scaled Initialization, applied only to `out_proj.weight`.
+                    # Following Pytorch init, except scale by 1/sqrt(n_layer), with n_layer = config.num_hidden_layers
+                    # We need to reinit p since this code could be called multiple times
+                    # Having just p *= scale would repeatedly scale it down
+                    nn.init.kaiming_uniform_(p, a=math.sqrt(5))
+                    with torch.no_grad():
+                        p /= math.sqrt(self.config.num_hidden_layers)
+
+
+@dataclass
+# Copied from transformers.models.mamba2.modeling_mamba2.Mamba2Output with MAMBA2->NemotronH,Mamba2->NemotronH
+class NemotronHOutput(ModelOutput):
+    """
+    Class for the NemotronH model outputs.
+
+    Args:
+        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+            Sequence of hidden-states at the output of the last layer of the model.
+        cache_params (`HybridMambaAttentionDynamicCache`):
+            The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
+            avoid providing the old `input_ids`.
+
+            Includes both the State space model state matrices after the selective scan, and the Convolutional states
+        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
+            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
+            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
+        attentions (`tuple(torch.FloatTensor)`, *optional*): Slot for attention weights; `NemotronHModel.forward` does not populate it yet.
+    """
+
+    last_hidden_state: Optional[torch.FloatTensor] = None
+    cache_params: Optional[HybridMambaAttentionDynamicCache] = None
+    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+    attentions: Optional[Tuple[torch.FloatTensor]] = None
+
+
+@dataclass
+# Copied from transformers.models.mamba2.modeling_mamba2.Mamba2CausalLMOutput with Mamba2->NemotronH
+class NemotronHCausalLMOutput(ModelOutput):
+    """
+    Base class for causal language model (or autoregressive) outputs.
+
+    Args:
+        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+            Language modeling loss (for next-token prediction).
+        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
+            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+        cache_params (`HybridMambaAttentionDynamicCache`):
+            The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
+            avoid providing the old `input_ids`.
+
+            Includes both the State space model state matrices after the selective scan, and the Convolutional states
+        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
+            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
+            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
+        attentions (`tuple(torch.FloatTensor)`, *optional*): Passed through unchanged from the backbone output's `attentions`.
+    """
+
+    loss: Optional[torch.FloatTensor] = None
+    logits: Optional[torch.FloatTensor] = None
+    cache_params: Optional[HybridMambaAttentionDynamicCache] = None
+    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+    attentions: Optional[Tuple[torch.FloatTensor]] = None
+
+
+NEMOTRONH_START_DOCSTRING = r"""
+
+    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+
+    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
+    and behavior.
+
+    Parameters:
+        config ([`NemotronHConfig`]): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+NEMOTRONH_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`, *optional*):
+            Indices of input sequence tokens in the vocabulary.
+
+            If `cache_params.seqlen_offset>0`, only `input_ids` that do not have their past calculated should be passed as
+            `input_ids`.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+            model's internal embedding lookup matrix.
+        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Indices of positions of each input sequence tokens in the position embeddings.
+        cache_params (`HybridMambaAttentionDynamicCache`, *optional*):
+            If passed along, the model uses the previous state in all the blocks (which will give the output for the
+            `input_ids` provided as if the model had received `state_input_ids + input_ids` as context).
+        use_cache (`bool`, *optional*):
+            If set to `True`, the `cache_params` is returned and can be used to quickly generate the next logits.
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+        cache_position (`torch.LongTensor` of shape `(sequence_length,)`, *optional*):
+            The position of the current input in the cache. This is used to ensure that the cache is correctly updated.
+            If `cache_params` is passed, `cache_position` should also be passed.
+        attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+"""
+
+
+@add_start_docstrings(
+    "The bare NemotronH Model transformer outputting raw hidden-states without any specific head on top.",
+    NEMOTRONH_START_DOCSTRING,
+)
+class NemotronHModel(NemotronHPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+
+        self.embeddings = nn.Embedding(config.vocab_size, config.hidden_size)
+        self.layers = nn.ModuleList([NemotronHBlock(config, layer_idx=idx) for idx in range(config.num_hidden_layers)])
+
+        self.gradient_checkpointing = False
+        self.norm_f = NemotronHRMSNorm(config.hidden_size, eps=config.layer_norm_epsilon)
+        # Initialize weights and apply final processing
+        self._register_load_state_dict_pre_hook(self.load_hook)  # renames an `embedding.` checkpoint key on load
+        self.post_init()
+
+    def load_hook(self, state_dict, prefix, *args):  # state-dict pre-hook: maps `embedding.` -> `embeddings.`
+        for k in state_dict:  # mutating while iterating is safe only because we break right after the first rename
+            if "embedding." in k:
+                state_dict[k.replace("embedding.", "embeddings.")] = state_dict.pop(k)
+                break
+
+    def get_input_embeddings(self):
+        return self.embeddings
+
+    def set_input_embeddings(self, new_embeddings):
+        self.embeddings = new_embeddings
+
+    @add_start_docstrings_to_model_forward(NEMOTRONH_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=NemotronHOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        cache_params: Optional[HybridMambaAttentionDynamicCache] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        **kwargs,
+    ) -> Union[Tuple, NemotronHOutput]:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        # When the caller does not choose, follow config.use_cache at inference and disable caching while training.
+        use_cache = use_cache if use_cache is not None else (self.config.use_cache if not self.training else False)
+
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if (input_ids is None) ^ (inputs_embeds is not None):  # ^ is python for xor
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+        if inputs_embeds is None:
+            inputs_embeds = self.embeddings(input_ids)
+
+        if self.gradient_checkpointing and self.training and use_cache:
+            logger.warning_once(
+                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
+            )
+            use_cache = False
+
+        # From zamba_modeling.py
+        if use_cache and cache_params is None:
+            logger.warning_once(
+                "NemotronH requires an initialized `HybridMambaAttentionDynamicCache` to return a cache. None was "
+                "provided, so no cache will be returned."
+            )
+
+        hidden_states = inputs_embeds
+
+        if cache_position is None:
+            cache_position = torch.arange(hidden_states.shape[1], device=hidden_states.device)
+        if position_ids is None:
+            position_ids = cache_position.unsqueeze(0)
+
+        causal_mask = self._update_causal_mask(attention_mask, inputs_embeds, cache_position)
+        mamba_mask = self._update_mamba_mask(attention_mask, cache_position)
+
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+        # Walk the hybrid stack; every block receives the mask that fits its block type.
+
+        for layer_idx, mixer_block in enumerate(self.layers):
+            # Depending on the layer type we opt for 2D base attention mask (Mamba) or 4D causal mask (Attention)
+            if mixer_block.block_type == "mamba":
+                layer_mask = mamba_mask
+            elif mixer_block.block_type == "attention":
+                layer_mask = causal_mask
+            elif mixer_block.block_type == "mlp":
+                layer_mask = None
+            else:
+                raise ValueError(f"Invalid block_type: {mixer_block.block_type}")
+
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+
+            if self.gradient_checkpointing and self.training:
+                hidden_states = self._gradient_checkpointing_func(
+                    mixer_block.__call__, hidden_states, cache_params, cache_position, layer_mask
+                )
+            else:
+                hidden_states = mixer_block(
+                    hidden_states,
+                    cache_params=cache_params,
+                    cache_position=cache_position,
+                    attention_mask=layer_mask,
+                )
+
+            # TODO: Store attentions. The block call above yields only hidden states, so `all_self_attns`
+            # is never appended to (it stays `()` when `output_attentions=True`). Reference sketch:
+            # if output_attentions:
+            #     if layer_outputs[1] is not None:
+            #         all_self_attns += (layer_outputs[1],)  # Mamba layers return `None` as the attention weights
+
+            # TODO (Check): hidden states are recorded before each block runs (top of this loop) and
+            # once more after `norm_f`; confirm this is the intended `output_hidden_states` layout
+            # rather than recording after each block.
+
+        hidden_states = self.norm_f(hidden_states)
+
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+
+        if not return_dict:  # NOTE(review): unlike the dict path, this returns cache_params even when use_cache is False and omits attentions
+            return tuple(v for v in [hidden_states, cache_params, all_hidden_states] if v is not None)
+
+        return NemotronHOutput(
+            last_hidden_state=hidden_states,
+            cache_params=cache_params if use_cache else None,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+        )
+
+    # Copied from transformers.models.jamba.modeling_jamba.JambaModel._update_causal_mask
+    def _update_causal_mask(self, attention_mask, input_tensor, cache_position):
+        if self.config._attn_implementation == "flash_attention_2":
+            if attention_mask is not None and 0.0 in attention_mask:
+                return attention_mask
+            return None
+
+        dtype, device = input_tensor.dtype, input_tensor.device
+        min_dtype = torch.finfo(dtype).min
+        sequence_length = input_tensor.shape[1]
+        target_length = cache_position[-1] + 1
+
+        causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device)
+        if sequence_length != 1:
+            causal_mask = torch.triu(causal_mask, diagonal=1)
+        causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
+        causal_mask = causal_mask[None, None, :, :].expand(input_tensor.shape[0], 1, -1, -1)
+        if attention_mask is not None:
+            causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
+            if attention_mask.dim() == 2:
+                mask_length = attention_mask.shape[-1]
+                padding_mask = causal_mask[..., :mask_length].eq(0.0) * attention_mask[:, None, None, :].eq(0.0)
+                causal_mask[..., :mask_length] = causal_mask[..., :mask_length].masked_fill(padding_mask, min_dtype)
+
+        if (
+            self.config._attn_implementation == "sdpa"
+            and attention_mask is not None
+            and attention_mask.device.type == "cuda"
+        ):
+            # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
+            # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
+            # Details: https://github.com/pytorch/pytorch/issues/110213
+            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)
+
+        return causal_mask
+
+    def _update_mamba_mask(self, attention_mask, cache_position):
+        """
+        No need for zeroing states when
+            1. Cached forward
+            2. Attending to all inputs
+        """
+        mamba_mask = attention_mask
+        if cache_position[0] > 0 or (attention_mask is not None and torch.all(attention_mask == 1)):
+            mamba_mask = None
+        return mamba_mask
+
+
+@add_start_docstrings(
+    """
+    The NEMOTRONH Model transformer with a language modeling head on top (linear layer with weights not tied to the input
+    embeddings).
+    """,
+    NEMOTRONH_START_DOCSTRING,
+)
+class NemotronHForCausalLM(NemotronHPreTrainedModel, GenerationMixin):
+    _tied_weights_keys = ["lm_head.weight"]  # NOTE(review): the class description above says the head is not tied — confirm
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.backbone = NemotronHModel(config)
+        self.vocab_size = config.vocab_size
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.backbone.get_input_embeddings()
+
+    def set_input_embeddings(self, new_embeddings):
+        return self.backbone.set_input_embeddings(new_embeddings)
+
+    def get_output_embeddings(self):
+        return self.lm_head
+
+    def set_output_embeddings(self, new_embeddings):
+        self.lm_head = new_embeddings
+
+    def get_decoder(self):
+        return self.backbone
+
+    def set_decoder(self, decoder):
+        self.backbone = decoder
+
+    def prepare_inputs_for_generation(
+        self,
+        input_ids,
+        past_key_values=None,
+        attention_mask=None,
+        inputs_embeds=None,
+        cache_position=None,
+        position_ids=None,
+        use_cache=True,
+        **kwargs,
+    ):
+        # Copy from https://github.com/huggingface/transformers/blob/main/src/transformers/models/jamba/modeling_jamba.py
+        # Overwritten -- the model's forward() takes `cache_params` as opposed to `past_key_values`
+        empty_past_kv = past_key_values is None
+
+        # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens
+        # Exception 1: when passing input_embeds, input_ids may be missing entries
+        # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here
+        # Exception 3: with synced GPUs cache_position may go out of bounds, but we only want dummy token in that case.
+        #              (we can't check exception 3 while compiling)
+        if not empty_past_kv:
+            if (
+                inputs_embeds is not None  # Exception 1
+                or cache_position[-1] >= input_ids.shape[1]  # Exception 3
+            ):
+                input_ids = input_ids[:, -cache_position.shape[0] :]
+            elif input_ids.shape[1] != cache_position.shape[0]:  # Default case (the "else", a no op, is Exception 2)
+                input_ids = input_ids[:, cache_position]
+        else:
+            past_key_values = HybridMambaAttentionDynamicCache(
+                self.config, input_ids.shape[0], self.dtype, device=self.device
+            )
+
+        if attention_mask is not None and position_ids is None:
+            # create position_ids on the fly for batch generation
+            position_ids = attention_mask.long().cumsum(-1) - 1
+            position_ids.masked_fill_(attention_mask == 0, 1)
+            if not empty_past_kv:
+                position_ids = position_ids[:, -input_ids.shape[1] :]
+
+        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
+        if inputs_embeds is not None and empty_past_kv:
+            # TODO(pjin): workaround fix for properly extending inputs_embeds;
+            # longer term, may be better handled elsewhere in .generate().
+            if input_ids is not None and inputs_embeds.shape[1] < input_ids.shape[1]:
+                new_token_embeds = self.get_input_embeddings()(input_ids[:, inputs_embeds.shape[1] :])
+                inputs_embeds = torch.cat([inputs_embeds, new_token_embeds], dim=1)
+            model_inputs = {"inputs_embeds": inputs_embeds}
+        else:
+            model_inputs = {"input_ids": input_ids.contiguous()}  # `contiguous()` needed for compilation use cases
+
+        model_inputs.update(
+            {
+                "position_ids": position_ids,
+                "past_key_values": past_key_values,  # NOTE(review): forward() names this argument `cache_params`, so this key lands in its **kwargs — confirm
+                "use_cache": use_cache,
+                "attention_mask": attention_mask,
+                "logits_to_keep": self.config.num_logits_to_keep,
+                "cache_position": cache_position,
+            }
+        )
+        return model_inputs
+
+    @add_start_docstrings_to_model_forward(NEMOTRONH_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=NemotronHCausalLMOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        cache_params: Optional[HybridMambaAttentionDynamicCache] = None,
+        labels: Optional[torch.LongTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        use_cache: Optional[bool] = None,
+        cache_position: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        **kwargs,  # for now we need this for generation
+    ) -> Union[Tuple, NemotronHCausalLMOutput]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
+            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
+            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        nemotron_h_outputs = self.backbone(
+            input_ids,
+            cache_params=cache_params,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            attention_mask=attention_mask,
+        )
+        hidden_states = nemotron_h_outputs[0]
+
+        # TODO: Check zamba_modeling.py: https://github.com/huggingface/transformers/blob/d7188ba600e36d3fd191b12e19f1b3bb81a8404f/src/transformers/models/zamba/modeling_zamba.py#L1284C1-L1286C2
+        # Cast the hidden states to the head's weight dtype before projecting, then upcast the logits with `.float()`.
+        logits = self.lm_head(hidden_states.to(self.lm_head.weight.dtype)).float()
+
+        loss = None
+        if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(logits.device)
+            # Shift so that tokens < n predict n
+            shift_logits = logits[..., :-1, :].contiguous()
+            shift_labels = labels[..., 1:].contiguous()
+            # Flatten the tokens
+            loss_fct = CrossEntropyLoss()
+            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
+
+        if not return_dict:
+            output = (logits,) + nemotron_h_outputs[1:]
+            return ((loss,) + output) if loss is not None else output
+
+        return NemotronHCausalLMOutput(
+            loss=loss,
+            logits=logits,
+            cache_params=nemotron_h_outputs.cache_params,
+            hidden_states=nemotron_h_outputs.hidden_states,
+            attentions=nemotron_h_outputs.attentions,
+        )
diff --git a/services/core/models/tests/integration/parallelism/fixtures/nvidia/nemotron-4-340b-instruct/model_config.yaml b/services/core/models/tests/integration/parallelism/fixtures/nvidia/nemotron-4-340b-instruct/model_config.yaml
new file mode 100644
index 0000000000..dec06eac9c
--- /dev/null
+++ b/services/core/models/tests/integration/parallelism/fixtures/nvidia/nemotron-4-340b-instruct/model_config.yaml
@@ -0,0 +1,261 @@
+mcore_gpt: true
+micro_batch_size: 1
+global_batch_size: 256
+tensor_model_parallel_size: 8
+pipeline_model_parallel_size: 4
+virtual_pipeline_model_parallel_size: null
+encoder_seq_length: 4096
+max_position_embeddings: 4096
+num_layers: 96
+hidden_size: 18432
+ffn_hidden_size: 73728
+num_attention_heads: 96
+init_method_std: 0.0063
+use_scaled_init_method: true
+hidden_dropout: 0.0
+attention_dropout: 0.0
+ffn_dropout: 0.0
+kv_channels: null
+apply_query_key_layer_scaling: true
+normalization: layernorm1p
+layernorm_epsilon: 1.0e-05
+do_layer_norm_weight_decay: false
+make_vocab_size_divisible_by: 128
+pre_process: true
+post_process: true
+persist_layer_norm: true
+bias: false
+activation: squared-relu
+headscale: false
+transformer_block_type: pre_ln
+openai_gelu: false
+normalize_attention_scores: true
+position_embedding_type: rope
+rotary_percentage: 0.5
+attention_type: multihead
+share_embeddings_and_output_weights: false
+num_query_groups: 8
+tokenizer:
+  library: sentencepiece
+  type: null
+  model: nemo:8223bf8eaa194eb8920af568bb52e2d0_megatron_2.model
+  vocab_file: null
+  merge_file: null
+  tokenizer_model: nemo:eb5528fdec5c4083affa2c97958eeef7_megatron_2.model
+  sentencepiece_legacy: false
+native_amp_init_scale: 4294967296
+native_amp_growth_interval: 1000
+hysteresis: 2
+fp32_residual_connection: false
+fp16_lm_cross_entropy: false
+megatron_amp_O2: true
+grad_allreduce_chunk_size_mb: 125
+grad_div_ar_fusion: true
+gradient_accumulation_fusion: false
+bias_activation_fusion: false
+bias_dropout_add_fusion: false
+masked_softmax_fusion: true
+seed: 1234
+resume_from_checkpoint: null
+use_cpu_initialization: false
+onnx_safe: false
+apex_transformer_log_level: 30
+gradient_as_bucket_view: false
+sync_batch_comm: false
+activations_checkpoint_granularity: null
+activations_checkpoint_method: null
+activations_checkpoint_num_layers: 1
+num_micro_batches_with_partial_activation_checkpoints: null
+activations_checkpoint_layers_per_pipeline: null
+sequence_parallel: false
+transformer_engine: false
+fp8: false
+fp8_e4m3: false
+fp8_hybrid: false
+fp8_margin: 0
+fp8_interval: 1
+fp8_amax_history_len: 1
+fp8_amax_compute_algo: most_recent
+reduce_amax: true
+use_emha: false
+optim:
+  name: distributed_fused_adam
+  lr: 3.001e-07
+  weight_decay: 0.1
+  betas:
+  - 0.9
+  - 0.98
+  sched:
+    name: CosineAnnealing
+    warmup_steps: 10
+    constant_steps: 400
+    min_lr: 3.0e-07
+  bucket_cap_mb: 200
+  overlap_grad_sync: false
+  contiguous_grad_buffer: true
+precision: bf16-mixed
+data:
+  chat: true
+  chat_prompt_tokens:
+    system_turn_start: <extra_id_0>
+    turn_start: <extra_id_1>
+    label_start: <extra_id_2>
+    end_of_turn: '
+
+      '
+    end_of_name: '
+
+      '
+  sample: true
+  num_workers: 2
+  dataloader_type: single
+  train_ds:
+    file_path: /dataset/train.jsonl
+    global_batch_size: 128
+    micro_batch_size: 1
+    shuffle: true
+    memmap_workers: null
+    max_seq_length: 4096
+    min_seq_length: 1
+    drop_last: true
+    concat_sampling_probabilities: null
+    label_key: output
+    add_eos: false
+    add_sep: false
+    add_bos: false
+    truncation_field: input
+    index_mapping_dir: /indexmap_dir
+    prompt_template: '<extra_id_0>System
+
+      {system message}
+
+      <extra_id_1>User
+
+      {turn 1 user message}
+
+      <extra_id_1>Assistant
+
+      <extra_id_2>{turn 1 assistant label}
+
+      {turn 1 assistant message}
+
+      <extra_id_1>User
+
+      {turn 2 user message}
+
+      <extra_id_1>Assistant
+
+      <extra_id_2>{turn 2 assistant label}
+
+      {turn 2 assistant message}
+
+      <extra_id_1>'
+    hf_dataset: true
+    truncation_method: right
+  validation_ds:
+    file_path: /dataset/val.jsonl
+    names: null
+    global_batch_size: 128
+    micro_batch_size: 1
+    shuffle: false
+    memmap_workers: null
+    max_seq_length: 4096
+    min_seq_length: 1
+    drop_last: false
+    label_key: output
+    add_eos: false
+    add_sep: false
+    add_bos: false
+    write_predictions_to_file: false
+    output_file_path_prefix: null
+    truncation_field: input
+    index_mapping_dir: /indexmap_dir
+    prompt_template: '<extra_id_0>System
+
+      {system message}
+
+      <extra_id_1>User
+
+      {turn 1 user message}
+
+      <extra_id_1>Assistant
+
+      <extra_id_2>{turn 1 assistant label}
+
+      {turn 1 assistant message}
+
+      <extra_id_1>User
+
+      {turn 2 user message}
+
+      <extra_id_1>Assistant
+
+      <extra_id_2>{turn 2 assistant label}
+
+      {turn 2 assistant message}
+
+      <extra_id_1>'
+    tokens_to_generate: 32
+    hf_dataset: true
+    truncation_method: right
+    metric:
+      name: loss
+      average: null
+      num_classes: null
+  test_ds:
+    prompt_template: '<extra_id_0>System
+
+      {system message}
+
+      <extra_id_1>User
+
+      {turn 1 user message}
+
+      <extra_id_1>Assistant
+
+      <extra_id_2>{turn 1 assistant label}
+
+      {turn 1 assistant message}
+
+      <extra_id_1>User
+
+      {turn 2 user message}
+
+      <extra_id_1>Assistant
+
+      <extra_id_2>{turn 2 assistant label}
+
+      {turn 2 assistant message}
+
+      <extra_id_1>'
+  data_impl: jsonl
+  splits_string: null
+  seq_length: 4096
+  skip_warmup: true
+  reset_position_ids: false
+  reset_attention_mask: false
+  eod_mask_loss: false
+  index_mapping_dir: /indexmap_dir
+  data_prefix:
+    train:
+    - /datasets/train.jsonl
+    validation:
+    - /datasets/val.jsonl
+    test:
+    - /datasets/val.jsonl
+answer_only_loss: true
+restore_from_path: /models/340B_base
+save_nemo_on_validation_end: true
+use_flash_attention: null
+pipeline_model_parallel_split_rank: 0
+dpo:
+  log_prob_forward_micro_batch_size: 2
+  ref_policy_kl_penalty: 0.3
+  average_log_probs: false
+  sft_loss_coeff: 1.0e-05
+  optimize_ref_policy_kl_penalty: false
+  preference_loss: reward_rev_dpo
+  gt_reward_scale: 1.0
+apply_rope_fusion: false
+target: nemo.collections.nlp.models.language_modeling.megatron_gpt_model.MegatronGPTModel
+nemo_version: 1.22.0
diff --git a/services/core/models/tests/integration/parallelism/fixtures/openai/gpt-oss-120b/config.json b/services/core/models/tests/integration/parallelism/fixtures/openai/gpt-oss-120b/config.json
new file mode 100644
index 0000000000..42300b8993
--- /dev/null
+++ b/services/core/models/tests/integration/parallelism/fixtures/openai/gpt-oss-120b/config.json
@@ -0,0 +1,88 @@
+{
+  "architectures": [
+    "GptOssForCausalLM"
+  ],
+  "attention_bias": true,
+  "attention_dropout": 0.0,
+  "eos_token_id": 200002,
+  "experts_per_token": 4,
+  "head_dim": 64,
+  "hidden_act": "silu",
+  "hidden_size": 2880,
+  "initial_context_length": 4096,
+  "initializer_range": 0.02,
+  "intermediate_size": 2880,
+  "layer_types": [
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 131072,
+  "model_type": "gpt_oss",
+  "num_attention_heads": 64,
+  "num_experts_per_tok": 4,
+  "num_hidden_layers": 36,
+  "num_key_value_heads": 8,
+  "num_local_experts": 128,
+  "output_router_logits": false,
+  "pad_token_id": 199999,
+  "quantization_config": {
+    "modules_to_not_convert": [
+      "model.layers.*.self_attn",
+      "model.layers.*.mlp.router",
+      "model.embed_tokens",
+      "lm_head"
+    ],
+    "quant_method": "mxfp4"
+  },
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": {
+    "beta_fast": 32.0,
+    "beta_slow": 1.0,
+    "factor": 32.0,
+    "original_max_position_embeddings": 4096,
+    "rope_type": "yarn",
+    "truncate": false
+  },
+  "rope_theta": 150000,
+  "router_aux_loss_coef": 0.9,
+  "sliding_window": 128,
+  "swiglu_limit": 7.0,
+  "tie_word_embeddings": false,
+  "transformers_version": "4.55.0.dev0",
+  "use_cache": true,
+  "vocab_size": 201088
+}
diff --git a/services/core/models/tests/integration/parallelism/fixtures/openai/gpt-oss-20b/config.json b/services/core/models/tests/integration/parallelism/fixtures/openai/gpt-oss-20b/config.json
new file mode 100644
index 0000000000..8fb5a4a033
--- /dev/null
+++ b/services/core/models/tests/integration/parallelism/fixtures/openai/gpt-oss-20b/config.json
@@ -0,0 +1,76 @@
+{
+  "architectures": [
+    "GptOssForCausalLM"
+  ],
+  "attention_bias": true,
+  "attention_dropout": 0.0,
+  "eos_token_id": 200002,
+  "experts_per_token": 4,
+  "head_dim": 64,
+  "hidden_act": "silu",
+  "hidden_size": 2880,
+  "initial_context_length": 4096,
+  "initializer_range": 0.02,
+  "intermediate_size": 2880,
+  "layer_types": [
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 131072,
+  "model_type": "gpt_oss",
+  "num_attention_heads": 64,
+  "num_experts_per_tok": 4,
+  "num_hidden_layers": 24,
+  "num_key_value_heads": 8,
+  "num_local_experts": 32,
+  "output_router_logits": false,
+  "pad_token_id": 199999,
+  "quantization_config": {
+    "modules_to_not_convert": [
+      "model.layers.*.self_attn",
+      "model.layers.*.mlp.router",
+      "model.embed_tokens",
+      "lm_head"
+    ],
+    "quant_method": "mxfp4"
+  },
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": {
+    "beta_fast": 32.0,
+    "beta_slow": 1.0,
+    "factor": 32.0,
+    "original_max_position_embeddings": 4096,
+    "rope_type": "yarn",
+    "truncate": false
+  },
+  "rope_theta": 150000,
+  "router_aux_loss_coef": 0.9,
+  "sliding_window": 128,
+  "swiglu_limit": 7.0,
+  "tie_word_embeddings": false,
+  "transformers_version": "4.55.0.dev0",
+  "use_cache": true,
+  "vocab_size": 201088
+}

From 62a425239493438a2d0ed10bff652c3997561392 Mon Sep 17 00:00:00 2001
From: Matthew Grossman <mgrossman@nvidia.com>
Date: Wed, 10 Jun 2026 09:54:49 -0700
Subject: [PATCH 3/4] fix(tests): add parallelism conftest and fixture download
 script

The conftest.py patches hf_hub_download and AutoConfig.from_pretrained
to serve from local fixtures, preventing HuggingFace API calls.
The download_fixtures.py script regenerates fixtures when new models
are added to tests.

Signed-off-by: Matthew Grossman <mgrossman@nvidia.com>
---
 .../tests/integration/parallelism/conftest.py |  78 +++++++++++++
 .../parallelism/download_fixtures.py          | 107 ++++++++++++++++++
 2 files changed, 185 insertions(+)
 create mode 100644 services/core/models/tests/integration/parallelism/conftest.py
 create mode 100644 services/core/models/tests/integration/parallelism/download_fixtures.py

diff --git a/services/core/models/tests/integration/parallelism/conftest.py b/services/core/models/tests/integration/parallelism/conftest.py
new file mode 100644
index 0000000000..2acbbcdede
--- /dev/null
+++ b/services/core/models/tests/integration/parallelism/conftest.py
@@ -0,0 +1,78 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Fixtures that redirect HuggingFace Hub calls to local config fixtures.
+
+This prevents parallelism integration tests from making real network calls to
+huggingface.co, avoiding rate-limit failures in CI.
+
+To regenerate fixtures after adding new models to tests, run:
+    uv run python services/core/models/tests/integration/parallelism/download_fixtures.py
+"""
+
+from pathlib import Path
+from unittest.mock import patch
+
+import pytest
+
+FIXTURES_DIR = Path(__file__).parent / "fixtures"
+
+
+def _fixture_path(model_id: str) -> Path:
+    """Return the local fixture directory for a model ID."""
+    return FIXTURES_DIR / model_id
+
+
+def _has_fixture(model_id: str) -> bool:
+    """Check if we have a local fixture for this model."""
+    d = _fixture_path(model_id)
+    return d.is_dir() and ((d / "config.json").exists() or (d / "model_config.yaml").exists())
+
+
+def _mock_hf_hub_download(repo_id: str, filename: str, **kwargs):
+    """Return path to fixture file instead of downloading from HF Hub."""
+    fixture_file = _fixture_path(repo_id) / filename
+    if fixture_file.exists():
+        return str(fixture_file)
+    raise FileNotFoundError(f"Fixture not found: {fixture_file}. Run download_fixtures.py to regenerate.")
+
+
+_real_auto_config_from_pretrained = None
+
+
+def _mock_auto_config_from_pretrained(pretrained_model_name_or_path, **kwargs):
+    """Redirect remote model IDs to local fixture directories."""
+    path_str = str(pretrained_model_name_or_path)
+    # Only intercept remote model IDs (not local paths)
+    if not Path(path_str).exists() and _has_fixture(path_str):
+        return _real_auto_config_from_pretrained(str(_fixture_path(path_str)), **kwargs)
+    return _real_auto_config_from_pretrained(pretrained_model_name_or_path, **kwargs)
+
+
+@pytest.fixture(autouse=True)
+def _offline_hf(monkeypatch):
+    """Patch HF Hub calls to use local fixtures for all parallelism tests."""
+    global _real_auto_config_from_pretrained
+
+    from transformers import AutoConfig
+
+    _real_auto_config_from_pretrained = AutoConfig.from_pretrained
+
+    # Clear the model spec cache so stale entries from prior tests don't bypass our mocks
+    from nmp.core.models.parallelism.api import _model_spec_cache
+
+    _model_spec_cache.clear()
+
+    with (
+        patch(
+            "nmp.core.models.parallelism.models.hf_hub_download",
+            side_effect=_mock_hf_hub_download,
+        ),
+        patch(
+            "transformers.AutoConfig.from_pretrained",
+            side_effect=_mock_auto_config_from_pretrained,
+        ),
+    ):
+        yield
+
+    _model_spec_cache.clear()
diff --git a/services/core/models/tests/integration/parallelism/download_fixtures.py b/services/core/models/tests/integration/parallelism/download_fixtures.py
new file mode 100644
index 0000000000..c38affd8b2
--- /dev/null
+++ b/services/core/models/tests/integration/parallelism/download_fixtures.py
@@ -0,0 +1,107 @@
+#!/usr/bin/env python3
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""
+One-time script to download HuggingFace model config files for offline test fixtures.
+
+Run this script when adding new models to parallelism tests:
+    uv run python services/core/models/tests/integration/parallelism/download_fixtures.py
+
+It downloads config.json (and model_config.yaml if present) for each model
+into the fixtures/ directory, enabling tests to run without network access.
+"""
+
+import json
+import shutil
+from pathlib import Path
+
+from huggingface_hub import hf_hub_download
+
+FIXTURES_DIR = Path(__file__).parent / "fixtures"
+
+# All non-gated model IDs used across parallelism integration tests.
+# Gated models (meta-llama/*, google/gemma-*) are already skipped via REQUIRES_HF_TOKEN.
+MODEL_IDS = [
+    "gpt2",
+    "microsoft/phi-2",
+    "microsoft/phi-4",
+    "mistralai/Mixtral-8x7B-v0.1",
+    "mistralai/Mistral-7B-v0.1",
+    "mistralai/Devstral-Small-2505",
+    "nvidia/Mistral-NeMo-Minitron-8B-Instruct",
+    "nvidia/NVIDIA-Nemotron-Nano-9B-v2",
+    "nvidia/nemotron-4-340b-instruct",
+    "openai/gpt-oss-20b",
+    "openai/gpt-oss-120b",
+    "EleutherAI/gpt-j-6b",
+    "EleutherAI/gpt-neox-20b",
+    "Qwen/Qwen3-8B",
+    "Qwen/Qwen3-4B-SafeRL",
+    "Qwen/Qwen2.5-7B",
+    "Qwen/Qwen2.5-72B",
+    "Qwen/Qwen2.5-72B-Instruct",
+    "deepseek-ai/deepseek-llm-7b-base",
+    "deepseek-ai/deepseek-llm-67b-base",
+    "deepseek-ai/DeepSeek-V3-Base",
+]
+
+
+def _model_dir(model_id: str) -> Path:
+    """Return the fixture directory for a model, e.g. fixtures/gpt2 or fixtures/microsoft/phi-4."""
+    return FIXTURES_DIR / model_id
+
+
+def download_model_configs(model_id: str) -> None:
+    dest = _model_dir(model_id)
+    dest.mkdir(parents=True, exist_ok=True)
+
+    got_config_json = False
+    for filename in ("config.json", "model_config.yaml"):
+        try:
+            cached_path = hf_hub_download(model_id, filename)
+            shutil.copy2(cached_path, dest / filename)
+            print(f"  [OK] {model_id}/{filename}")
+            if filename == "config.json":
+                got_config_json = True
+        except Exception:
+            # model_config.yaml is optional; config.json is required unless
+            # model_config.yaml exists (NeMo YAML-only models like Nemotron-4-340B)
+            pass
+
+    if not got_config_json and not (dest / "model_config.yaml").exists():
+        raise RuntimeError(f"Neither config.json nor model_config.yaml found for {model_id}")
+
+    # Download custom config/model Python files referenced by auto_map.
+    # These are needed for AutoConfig.from_pretrained(trust_remote_code=True).
+    if got_config_json:
+        config = json.loads((dest / "config.json").read_text())
+        auto_map = config.get("auto_map", {})
+        for key, value in auto_map.items():
+            # value is like "configuration_nemotron_h.NemotronHConfig"
+            module_name = value.split(".")[0]
+            py_file = f"{module_name}.py"
+            try:
+                cached_path = hf_hub_download(model_id, py_file)
+                shutil.copy2(cached_path, dest / py_file)
+                print(f"  [OK] {model_id}/{py_file}")
+            except Exception:
+                print(f"  [WARN] {model_id}/{py_file} not found (auto_map: {key}={value})")
+
+
+def main() -> None:
+    print(f"Downloading config fixtures to {FIXTURES_DIR}/\n")
+    for model_id in MODEL_IDS:
+        print(f"Downloading {model_id}...")
+        download_model_configs(model_id)
+    print("\nDone! Fixtures are ready for offline tests.")
+
+    # Write a manifest so the conftest can validate completeness
+    manifest = {mid: sorted(str(p.name) for p in _model_dir(mid).iterdir()) for mid in MODEL_IDS}
+    manifest_path = FIXTURES_DIR / "manifest.json"
+    manifest_path.write_text(json.dumps(manifest, indent=2, sort_keys=True) + "\n")
+    print(f"Wrote manifest to {manifest_path}")
+
+
+if __name__ == "__main__":
+    main()

From 2c2f64171a8ab5da5de6e29972bfbd10bfe8f063 Mon Sep 17 00:00:00 2001
From: Matthew Grossman <mgrossman@nvidia.com>
Date: Wed, 10 Jun 2026 09:56:20 -0700
Subject: [PATCH 4/4] fix(tests): remove parallelism fixtures, keep only auth
 test mock
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The 5K lines of HF config fixtures were unnecessary — the parallelism
tests are always skipped in CI (torch not installed). Stripped down to
only the change that fixes the actual CI failures: mocking HfApi in
TestTrustRemoteCodePermission.

Signed-off-by: Matthew Grossman <mgrossman@nvidia.com>
---
 .../tests/integration/parallelism/conftest.py |   78 -
 .../parallelism/download_fixtures.py          |  107 -
 .../fixtures/EleutherAI/gpt-j-6b/config.json  |   40 -
 .../EleutherAI/gpt-neox-20b/config.json       |   25 -
 .../Qwen/Qwen2.5-72B-Instruct/config.json     |   27 -
 .../fixtures/Qwen/Qwen2.5-72B/config.json     |   27 -
 .../fixtures/Qwen/Qwen2.5-7B/config.json      |   28 -
 .../fixtures/Qwen/Qwen3-4B-SafeRL/config.json |   30 -
 .../fixtures/Qwen/Qwen3-8B/config.json        |   30 -
 .../deepseek-ai/DeepSeek-V3-Base/config.json  |   67 -
 .../configuration_deepseek.py                 |  199 --
 .../DeepSeek-V3-Base/modeling_deepseek.py     | 1848 -----------------
 .../deepseek-llm-67b-base/config.json         |   25 -
 .../deepseek-llm-7b-base/config.json          |   25 -
 .../parallelism/fixtures/gpt2/config.json     |   31 -
 .../parallelism/fixtures/manifest.json        |   69 -
 .../fixtures/microsoft/phi-2/config.json      |   30 -
 .../fixtures/microsoft/phi-4/config.json      |   32 -
 .../mistralai/Devstral-Small-2505/config.json |   26 -
 .../mistralai/Mistral-7B-v0.1/config.json     |   24 -
 .../mistralai/Mixtral-8x7B-v0.1/config.json   |   29 -
 .../config.json                               |   28 -
 .../NVIDIA-Nemotron-Nano-9B-v2/config.json    |   56 -
 .../configuration_nemotron_h.py               |  245 ---
 .../modeling_nemotron_h.py                    | 1643 ---------------
 .../model_config.yaml                         |  261 ---
 .../fixtures/openai/gpt-oss-120b/config.json  |   88 -
 .../fixtures/openai/gpt-oss-20b/config.json   |   76 -
 .../test_parallelism_hf_model_config.py       |    3 +-
 .../parallelism/test_recent_models.py         |    3 +-
 .../tests/parallelism/nemo_validation_data.py |    3 +-
 31 files changed, 6 insertions(+), 5197 deletions(-)
 delete mode 100644 services/core/models/tests/integration/parallelism/conftest.py
 delete mode 100644 services/core/models/tests/integration/parallelism/download_fixtures.py
 delete mode 100644 services/core/models/tests/integration/parallelism/fixtures/EleutherAI/gpt-j-6b/config.json
 delete mode 100644 services/core/models/tests/integration/parallelism/fixtures/EleutherAI/gpt-neox-20b/config.json
 delete mode 100644 services/core/models/tests/integration/parallelism/fixtures/Qwen/Qwen2.5-72B-Instruct/config.json
 delete mode 100644 services/core/models/tests/integration/parallelism/fixtures/Qwen/Qwen2.5-72B/config.json
 delete mode 100644 services/core/models/tests/integration/parallelism/fixtures/Qwen/Qwen2.5-7B/config.json
 delete mode 100644 services/core/models/tests/integration/parallelism/fixtures/Qwen/Qwen3-4B-SafeRL/config.json
 delete mode 100644 services/core/models/tests/integration/parallelism/fixtures/Qwen/Qwen3-8B/config.json
 delete mode 100644 services/core/models/tests/integration/parallelism/fixtures/deepseek-ai/DeepSeek-V3-Base/config.json
 delete mode 100644 services/core/models/tests/integration/parallelism/fixtures/deepseek-ai/DeepSeek-V3-Base/configuration_deepseek.py
 delete mode 100644 services/core/models/tests/integration/parallelism/fixtures/deepseek-ai/DeepSeek-V3-Base/modeling_deepseek.py
 delete mode 100644 services/core/models/tests/integration/parallelism/fixtures/deepseek-ai/deepseek-llm-67b-base/config.json
 delete mode 100644 services/core/models/tests/integration/parallelism/fixtures/deepseek-ai/deepseek-llm-7b-base/config.json
 delete mode 100644 services/core/models/tests/integration/parallelism/fixtures/gpt2/config.json
 delete mode 100644 services/core/models/tests/integration/parallelism/fixtures/manifest.json
 delete mode 100644 services/core/models/tests/integration/parallelism/fixtures/microsoft/phi-2/config.json
 delete mode 100644 services/core/models/tests/integration/parallelism/fixtures/microsoft/phi-4/config.json
 delete mode 100644 services/core/models/tests/integration/parallelism/fixtures/mistralai/Devstral-Small-2505/config.json
 delete mode 100644 services/core/models/tests/integration/parallelism/fixtures/mistralai/Mistral-7B-v0.1/config.json
 delete mode 100644 services/core/models/tests/integration/parallelism/fixtures/mistralai/Mixtral-8x7B-v0.1/config.json
 delete mode 100644 services/core/models/tests/integration/parallelism/fixtures/nvidia/Mistral-NeMo-Minitron-8B-Instruct/config.json
 delete mode 100644 services/core/models/tests/integration/parallelism/fixtures/nvidia/NVIDIA-Nemotron-Nano-9B-v2/config.json
 delete mode 100644 services/core/models/tests/integration/parallelism/fixtures/nvidia/NVIDIA-Nemotron-Nano-9B-v2/configuration_nemotron_h.py
 delete mode 100644 services/core/models/tests/integration/parallelism/fixtures/nvidia/NVIDIA-Nemotron-Nano-9B-v2/modeling_nemotron_h.py
 delete mode 100644 services/core/models/tests/integration/parallelism/fixtures/nvidia/nemotron-4-340b-instruct/model_config.yaml
 delete mode 100644 services/core/models/tests/integration/parallelism/fixtures/openai/gpt-oss-120b/config.json
 delete mode 100644 services/core/models/tests/integration/parallelism/fixtures/openai/gpt-oss-20b/config.json

diff --git a/services/core/models/tests/integration/parallelism/conftest.py b/services/core/models/tests/integration/parallelism/conftest.py
deleted file mode 100644
index 2acbbcdede..0000000000
--- a/services/core/models/tests/integration/parallelism/conftest.py
+++ /dev/null
@@ -1,78 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-
-"""Fixtures that redirect HuggingFace Hub calls to local config fixtures.
-
-This prevents parallelism integration tests from making real network calls to
-huggingface.co, avoiding rate-limit failures in CI.
-
-To regenerate fixtures after adding new models to tests, run:
-    uv run python services/core/models/tests/integration/parallelism/download_fixtures.py
-"""
-
-from pathlib import Path
-from unittest.mock import patch
-
-import pytest
-
-FIXTURES_DIR = Path(__file__).parent / "fixtures"
-
-
-def _fixture_path(model_id: str) -> Path:
-    """Return the local fixture directory for a model ID."""
-    return FIXTURES_DIR / model_id
-
-
-def _has_fixture(model_id: str) -> bool:
-    """Check if we have a local fixture for this model."""
-    d = _fixture_path(model_id)
-    return d.is_dir() and ((d / "config.json").exists() or (d / "model_config.yaml").exists())
-
-
-def _mock_hf_hub_download(repo_id: str, filename: str, **kwargs):
-    """Return path to fixture file instead of downloading from HF Hub."""
-    fixture_file = _fixture_path(repo_id) / filename
-    if fixture_file.exists():
-        return str(fixture_file)
-    raise FileNotFoundError(f"Fixture not found: {fixture_file}. Run download_fixtures.py to regenerate.")
-
-
-_real_auto_config_from_pretrained = None
-
-
-def _mock_auto_config_from_pretrained(pretrained_model_name_or_path, **kwargs):
-    """Redirect remote model IDs to local fixture directories."""
-    path_str = str(pretrained_model_name_or_path)
-    # Only intercept remote model IDs (not local paths)
-    if not Path(path_str).exists() and _has_fixture(path_str):
-        return _real_auto_config_from_pretrained(str(_fixture_path(path_str)), **kwargs)
-    return _real_auto_config_from_pretrained(pretrained_model_name_or_path, **kwargs)
-
-
-@pytest.fixture(autouse=True)
-def _offline_hf(monkeypatch):
-    """Patch HF Hub calls to use local fixtures for all parallelism tests."""
-    global _real_auto_config_from_pretrained
-
-    from transformers import AutoConfig
-
-    _real_auto_config_from_pretrained = AutoConfig.from_pretrained
-
-    # Clear the model spec cache so stale entries from prior tests don't bypass our mocks
-    from nmp.core.models.parallelism.api import _model_spec_cache
-
-    _model_spec_cache.clear()
-
-    with (
-        patch(
-            "nmp.core.models.parallelism.models.hf_hub_download",
-            side_effect=_mock_hf_hub_download,
-        ),
-        patch(
-            "transformers.AutoConfig.from_pretrained",
-            side_effect=_mock_auto_config_from_pretrained,
-        ),
-    ):
-        yield
-
-    _model_spec_cache.clear()
diff --git a/services/core/models/tests/integration/parallelism/download_fixtures.py b/services/core/models/tests/integration/parallelism/download_fixtures.py
deleted file mode 100644
index c38affd8b2..0000000000
--- a/services/core/models/tests/integration/parallelism/download_fixtures.py
+++ /dev/null
@@ -1,107 +0,0 @@
-#!/usr/bin/env python3
-# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-
-"""
-One-time script to download HuggingFace model config files for offline test fixtures.
-
-Run this script when adding new models to parallelism tests:
-    uv run python services/core/models/tests/integration/parallelism/download_fixtures.py
-
-It downloads config.json (and model_config.yaml if present) for each model
-into the fixtures/ directory, enabling tests to run without network access.
-"""
-
-import json
-import shutil
-from pathlib import Path
-
-from huggingface_hub import hf_hub_download
-
-FIXTURES_DIR = Path(__file__).parent / "fixtures"
-
-# All non-gated model IDs used across parallelism integration tests.
-# Gated models (meta-llama/*, google/gemma-*) are already skipped via REQUIRES_HF_TOKEN.
-MODEL_IDS = [
-    "gpt2",
-    "microsoft/phi-2",
-    "microsoft/phi-4",
-    "mistralai/Mixtral-8x7B-v0.1",
-    "mistralai/Mistral-7B-v0.1",
-    "mistralai/Devstral-Small-2505",
-    "nvidia/Mistral-NeMo-Minitron-8B-Instruct",
-    "nvidia/NVIDIA-Nemotron-Nano-9B-v2",
-    "nvidia/nemotron-4-340b-instruct",
-    "openai/gpt-oss-20b",
-    "openai/gpt-oss-120b",
-    "EleutherAI/gpt-j-6b",
-    "EleutherAI/gpt-neox-20b",
-    "Qwen/Qwen3-8B",
-    "Qwen/Qwen3-4B-SafeRL",
-    "Qwen/Qwen2.5-7B",
-    "Qwen/Qwen2.5-72B",
-    "Qwen/Qwen2.5-72B-Instruct",
-    "deepseek-ai/deepseek-llm-7b-base",
-    "deepseek-ai/deepseek-llm-67b-base",
-    "deepseek-ai/DeepSeek-V3-Base",
-]
-
-
-def _model_dir(model_id: str) -> Path:
-    """Return the fixture directory for a model, e.g. fixtures/gpt2 or fixtures/microsoft/phi-4."""
-    return FIXTURES_DIR / model_id
-
-
-def download_model_configs(model_id: str) -> None:
-    dest = _model_dir(model_id)
-    dest.mkdir(parents=True, exist_ok=True)
-
-    got_config_json = False
-    for filename in ("config.json", "model_config.yaml"):
-        try:
-            cached_path = hf_hub_download(model_id, filename)
-            shutil.copy2(cached_path, dest / filename)
-            print(f"  [OK] {model_id}/{filename}")
-            if filename == "config.json":
-                got_config_json = True
-        except Exception:
-            # model_config.yaml is optional; config.json is required unless
-            # model_config.yaml exists (NeMo YAML-only models like Nemotron-4-340B)
-            pass
-
-    if not got_config_json and not (dest / "model_config.yaml").exists():
-        raise RuntimeError(f"Neither config.json nor model_config.yaml found for {model_id}")
-
-    # Download custom config/model Python files referenced by auto_map.
-    # These are needed for AutoConfig.from_pretrained(trust_remote_code=True).
-    if got_config_json:
-        config = json.loads((dest / "config.json").read_text())
-        auto_map = config.get("auto_map", {})
-        for key, value in auto_map.items():
-            # value is like "configuration_nemotron_h.NemotronHConfig"
-            module_name = value.split(".")[0]
-            py_file = f"{module_name}.py"
-            try:
-                cached_path = hf_hub_download(model_id, py_file)
-                shutil.copy2(cached_path, dest / py_file)
-                print(f"  [OK] {model_id}/{py_file}")
-            except Exception:
-                print(f"  [WARN] {model_id}/{py_file} not found (auto_map: {key}={value})")
-
-
-def main() -> None:
-    print(f"Downloading config fixtures to {FIXTURES_DIR}/\n")
-    for model_id in MODEL_IDS:
-        print(f"Downloading {model_id}...")
-        download_model_configs(model_id)
-    print("\nDone! Fixtures are ready for offline tests.")
-
-    # Write a manifest so the conftest can validate completeness
-    manifest = {mid: sorted(str(p.name) for p in _model_dir(mid).iterdir()) for mid in MODEL_IDS}
-    manifest_path = FIXTURES_DIR / "manifest.json"
-    manifest_path.write_text(json.dumps(manifest, indent=2, sort_keys=True) + "\n")
-    print(f"Wrote manifest to {manifest_path}")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/services/core/models/tests/integration/parallelism/fixtures/EleutherAI/gpt-j-6b/config.json b/services/core/models/tests/integration/parallelism/fixtures/EleutherAI/gpt-j-6b/config.json
deleted file mode 100644
index 614ae4f4e0..0000000000
--- a/services/core/models/tests/integration/parallelism/fixtures/EleutherAI/gpt-j-6b/config.json
+++ /dev/null
@@ -1,40 +0,0 @@
-{
-  "activation_function": "gelu_new",
-  "architectures": [
-    "GPTJForCausalLM"
-  ],
-  "attn_pdrop": 0.0,
-  "bos_token_id": 50256,
-  "embd_pdrop": 0.0,
-  "eos_token_id": 50256,
-  "gradient_checkpointing": false,
-  "initializer_range": 0.02,
-  "layer_norm_epsilon": 1e-05,
-  "model_type": "gptj",
-  "n_embd": 4096,
-  "n_head": 16,
-  "n_inner": null,
-  "n_layer": 28,
-  "n_positions": 2048,
-  "resid_pdrop": 0.0,
-  "rotary": true,
-  "rotary_dim": 64,
-  "scale_attn_weights": true,
-  "summary_activation": null,
-  "summary_first_dropout": 0.1,
-  "summary_proj_to_labels": true,
-  "summary_type": "cls_index",
-  "summary_use_proj": true,
-  "task_specific_params": {
-    "text-generation": {
-      "do_sample": true,
-      "max_length": 50,
-      "temperature": 1.0
-    }
-  },
-  "tie_word_embeddings": false,
-  "tokenizer_class": "GPT2Tokenizer",
-  "transformers_version": "4.18.0.dev0",
-  "use_cache": true,
-  "vocab_size": 50400
-}
diff --git a/services/core/models/tests/integration/parallelism/fixtures/EleutherAI/gpt-neox-20b/config.json b/services/core/models/tests/integration/parallelism/fixtures/EleutherAI/gpt-neox-20b/config.json
deleted file mode 100644
index 54d3633ef9..0000000000
--- a/services/core/models/tests/integration/parallelism/fixtures/EleutherAI/gpt-neox-20b/config.json
+++ /dev/null
@@ -1,25 +0,0 @@
-{
-  "architectures": [
-    "GPTNeoXForCausalLM"
-  ],
-  "attention_probs_dropout_prob": 0,
-  "bos_token_id": 0,
-  "eos_token_id": 0,
-  "hidden_act": "gelu_fast",
-  "hidden_dropout_prob": 0,
-  "hidden_size": 6144,
-  "initializer_range": 0.02,
-  "intermediate_size": 24576,
-  "layer_norm_eps": 1e-05,
-  "max_position_embeddings": 2048,
-  "model_type": "gpt_neox",
-  "num_attention_heads": 64,
-  "num_hidden_layers": 44,
-  "rotary_emb_base": 10000,
-  "rotary_pct": 0.25,
-  "tie_word_embeddings": false,
-  "torch_dtype": "float16",
-  "transformers_version": "4.19.0.dev0",
-  "use_cache": true,
-  "vocab_size": 50432
-}
diff --git a/services/core/models/tests/integration/parallelism/fixtures/Qwen/Qwen2.5-72B-Instruct/config.json b/services/core/models/tests/integration/parallelism/fixtures/Qwen/Qwen2.5-72B-Instruct/config.json
deleted file mode 100644
index ec6ea340e5..0000000000
--- a/services/core/models/tests/integration/parallelism/fixtures/Qwen/Qwen2.5-72B-Instruct/config.json
+++ /dev/null
@@ -1,27 +0,0 @@
-{
-  "architectures": [
-    "Qwen2ForCausalLM"
-  ],
-  "attention_dropout": 0.0,
-  "bos_token_id": 151643,
-  "eos_token_id": 151645,
-  "hidden_act": "silu",
-  "hidden_size": 8192,
-  "initializer_range": 0.02,
-  "intermediate_size": 29568,
-  "max_position_embeddings": 32768,
-  "max_window_layers": 70,
-  "model_type": "qwen2",
-  "num_attention_heads": 64,
-  "num_hidden_layers": 80,
-  "num_key_value_heads": 8,
-  "rms_norm_eps": 1e-06,
-  "rope_theta": 1000000.0,
-  "sliding_window": 131072,
-  "tie_word_embeddings": false,
-  "torch_dtype": "bfloat16",
-  "transformers_version": "4.43.1",
-  "use_cache": true,
-  "use_sliding_window": false,
-  "vocab_size": 152064
-}
diff --git a/services/core/models/tests/integration/parallelism/fixtures/Qwen/Qwen2.5-72B/config.json b/services/core/models/tests/integration/parallelism/fixtures/Qwen/Qwen2.5-72B/config.json
deleted file mode 100644
index 67663e297b..0000000000
--- a/services/core/models/tests/integration/parallelism/fixtures/Qwen/Qwen2.5-72B/config.json
+++ /dev/null
@@ -1,27 +0,0 @@
-{
-  "architectures": [
-    "Qwen2ForCausalLM"
-  ],
-  "attention_dropout": 0.0,
-  "bos_token_id": 151643,
-  "eos_token_id": 151643,
-  "hidden_act": "silu",
-  "hidden_size": 8192,
-  "initializer_range": 0.02,
-  "intermediate_size": 29568,
-  "max_position_embeddings": 131072,
-  "max_window_layers": 80,
-  "model_type": "qwen2",
-  "num_attention_heads": 64,
-  "num_hidden_layers": 80,
-  "num_key_value_heads": 8,
-  "rms_norm_eps": 1e-05,
-  "rope_theta": 1000000.0,
-  "sliding_window": 131072,
-  "tie_word_embeddings": false,
-  "torch_dtype": "bfloat16",
-  "transformers_version": "4.43.1",
-  "use_cache": true,
-  "use_sliding_window": false,
-  "vocab_size": 152064
-}
diff --git a/services/core/models/tests/integration/parallelism/fixtures/Qwen/Qwen2.5-7B/config.json b/services/core/models/tests/integration/parallelism/fixtures/Qwen/Qwen2.5-7B/config.json
deleted file mode 100644
index 1a90713f0e..0000000000
--- a/services/core/models/tests/integration/parallelism/fixtures/Qwen/Qwen2.5-7B/config.json
+++ /dev/null
@@ -1,28 +0,0 @@
-{
-  "architectures": [
-    "Qwen2ForCausalLM"
-  ],
-  "attention_dropout": 0.0,
-  "bos_token_id": 151643,
-  "eos_token_id": 151643,
-  "hidden_act": "silu",
-  "hidden_size": 3584,
-  "initializer_range": 0.02,
-  "intermediate_size": 18944,
-  "max_position_embeddings": 131072,
-  "max_window_layers": 28,
-  "model_type": "qwen2",
-  "num_attention_heads": 28,
-  "num_hidden_layers": 28,
-  "num_key_value_heads": 4,
-  "rms_norm_eps": 1e-06,
-  "rope_theta": 1000000.0,
-  "sliding_window": 131072,
-  "tie_word_embeddings": false,
-  "torch_dtype": "bfloat16",
-  "transformers_version": "4.40.1",
-  "use_cache": true,
-  "use_mrope": false,
-  "use_sliding_window": false,
-  "vocab_size": 152064
-}
diff --git a/services/core/models/tests/integration/parallelism/fixtures/Qwen/Qwen3-4B-SafeRL/config.json b/services/core/models/tests/integration/parallelism/fixtures/Qwen/Qwen3-4B-SafeRL/config.json
deleted file mode 100644
index e49eccdc32..0000000000
--- a/services/core/models/tests/integration/parallelism/fixtures/Qwen/Qwen3-4B-SafeRL/config.json
+++ /dev/null
@@ -1,30 +0,0 @@
-{
-  "architectures": [
-    "Qwen3ForCausalLM"
-  ],
-  "attention_bias": false,
-  "attention_dropout": 0.0,
-  "bos_token_id": 151643,
-  "eos_token_id": 151645,
-  "head_dim": 128,
-  "hidden_act": "silu",
-  "hidden_size": 2560,
-  "initializer_range": 0.02,
-  "intermediate_size": 9728,
-  "max_position_embeddings": 40960,
-  "max_window_layers": 36,
-  "model_type": "qwen3",
-  "num_attention_heads": 32,
-  "num_hidden_layers": 36,
-  "num_key_value_heads": 8,
-  "rms_norm_eps": 1e-06,
-  "rope_scaling": null,
-  "rope_theta": 1000000,
-  "sliding_window": null,
-  "tie_word_embeddings": true,
-  "torch_dtype": "bfloat16",
-  "transformers_version": "4.51.0",
-  "use_cache": true,
-  "use_sliding_window": false,
-  "vocab_size": 151936
-}
\ No newline at end of file
diff --git a/services/core/models/tests/integration/parallelism/fixtures/Qwen/Qwen3-8B/config.json b/services/core/models/tests/integration/parallelism/fixtures/Qwen/Qwen3-8B/config.json
deleted file mode 100644
index d46195ac87..0000000000
--- a/services/core/models/tests/integration/parallelism/fixtures/Qwen/Qwen3-8B/config.json
+++ /dev/null
@@ -1,30 +0,0 @@
-{
-  "architectures": [
-    "Qwen3ForCausalLM"
-  ],
-  "attention_bias": false,
-  "attention_dropout": 0.0,
-  "bos_token_id": 151643,
-  "eos_token_id": 151645,
-  "head_dim": 128,
-  "hidden_act": "silu",
-  "hidden_size": 4096,
-  "initializer_range": 0.02,
-  "intermediate_size": 12288,
-  "max_position_embeddings": 40960,
-  "max_window_layers": 36,
-  "model_type": "qwen3",
-  "num_attention_heads": 32,
-  "num_hidden_layers": 36,
-  "num_key_value_heads": 8,
-  "rms_norm_eps": 1e-06,
-  "rope_scaling": null,
-  "rope_theta": 1000000,
-  "sliding_window": null,
-  "tie_word_embeddings": false,
-  "torch_dtype": "bfloat16",
-  "transformers_version": "4.51.0",
-  "use_cache": true,
-  "use_sliding_window": false,
-  "vocab_size": 151936
-}
\ No newline at end of file
diff --git a/services/core/models/tests/integration/parallelism/fixtures/deepseek-ai/DeepSeek-V3-Base/config.json b/services/core/models/tests/integration/parallelism/fixtures/deepseek-ai/DeepSeek-V3-Base/config.json
deleted file mode 100644
index aec35a75d9..0000000000
--- a/services/core/models/tests/integration/parallelism/fixtures/deepseek-ai/DeepSeek-V3-Base/config.json
+++ /dev/null
@@ -1,67 +0,0 @@
-{
-  "architectures": [
-    "DeepseekV3ForCausalLM"
-  ],
-  "attention_bias": false,
-  "attention_dropout": 0.0,
-  "auto_map": {
-    "AutoConfig": "configuration_deepseek.DeepseekV3Config",
-    "AutoModel": "modeling_deepseek.DeepseekV3Model",
-    "AutoModelForCausalLM": "modeling_deepseek.DeepseekV3ForCausalLM"
-  },
-  "bos_token_id": 0,
-  "eos_token_id": 1,
-  "ep_size": 1,
-  "first_k_dense_replace": 3,
-  "hidden_act": "silu",
-  "hidden_size": 7168,
-  "initializer_range": 0.02,
-  "intermediate_size": 18432,
-  "kv_lora_rank": 512,
-  "max_position_embeddings": 163840,
-  "model_type": "deepseek_v3",
-  "moe_intermediate_size": 2048,
-  "moe_layer_freq": 1,
-  "n_group": 8,
-  "n_routed_experts": 256,
-  "n_shared_experts": 1,
-  "norm_topk_prob": true,
-  "num_attention_heads": 128,
-  "num_experts_per_tok": 8,
-  "num_hidden_layers": 61,
-  "num_key_value_heads": 128,
-  "num_nextn_predict_layers": 1,
-  "q_lora_rank": 1536,
-  "qk_nope_head_dim": 128,
-  "qk_rope_head_dim": 64,
-  "quantization_config": {
-    "activation_scheme": "dynamic",
-    "fmt": "e4m3",
-    "quant_method": "fp8",
-    "weight_block_size": [
-      128,
-      128
-    ]
-  },
-  "rms_norm_eps": 1e-06,
-  "rope_scaling": {
-    "beta_fast": 32,
-    "beta_slow": 1,
-    "factor": 40,
-    "mscale": 1.0,
-    "mscale_all_dim": 1.0,
-    "original_max_position_embeddings": 4096,
-    "type": "yarn"
-  },
-  "rope_theta": 10000,
-  "routed_scaling_factor": 2.5,
-  "scoring_func": "sigmoid",
-  "tie_word_embeddings": false,
-  "topk_group": 4,
-  "topk_method": "noaux_tc",
-  "torch_dtype": "bfloat16",
-  "transformers_version": "4.33.1",
-  "use_cache": true,
-  "v_head_dim": 128,
-  "vocab_size": 129280
-}
diff --git a/services/core/models/tests/integration/parallelism/fixtures/deepseek-ai/DeepSeek-V3-Base/configuration_deepseek.py b/services/core/models/tests/integration/parallelism/fixtures/deepseek-ai/DeepSeek-V3-Base/configuration_deepseek.py
deleted file mode 100644
index f549f2b17d..0000000000
--- a/services/core/models/tests/integration/parallelism/fixtures/deepseek-ai/DeepSeek-V3-Base/configuration_deepseek.py
+++ /dev/null
@@ -1,199 +0,0 @@
-from transformers.configuration_utils import PretrainedConfig
-from transformers.utils import logging
-
-logger = logging.get_logger(__name__)
-
-DEEPSEEK_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
-class DeepseekV3Config(PretrainedConfig):
-    r"""
-    This is the configuration class to store the configuration of a [`DeepseekV3Model`]. It is used to instantiate an DeepSeek
-    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
-    defaults will yield a similar configuration to that of the DeepSeek-V3.
-
-    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
-    documentation from [`PretrainedConfig`] for more information.
-
-
-    Args:
-        vocab_size (`int`, *optional*, defaults to 129280):
-            Vocabulary size of the Deep model. Defines the number of different tokens that can be represented by the
-            `inputs_ids` passed when calling [`DeepseekV3Model`]
-        hidden_size (`int`, *optional*, defaults to 4096):
-            Dimension of the hidden representations.
-        intermediate_size (`int`, *optional*, defaults to 11008):
-            Dimension of the MLP representations.
-        moe_intermediate_size (`int`, *optional*, defaults to 1407):
-            Dimension of the MoE representations.
-        num_hidden_layers (`int`, *optional*, defaults to 32):
-            Number of hidden layers in the Transformer decoder.
-        num_nextn_predict_layers (`int`, *optional*, defaults to 1):
-            Number of nextn predict layers in the DeepSeekV3 Model.
-        num_attention_heads (`int`, *optional*, defaults to 32):
-            Number of attention heads for each attention layer in the Transformer decoder.
-        n_shared_experts (`int`, *optional*, defaults to None):
-            Number of shared experts, None means dense model.
-        n_routed_experts (`int`, *optional*, defaults to None):
-            Number of routed experts, None means dense model.
-        routed_scaling_factor (`float`, *optional*, defaults to 1.0):
-            Scaling factor or routed experts.
-        topk_method (`str`, *optional*, defaults to `gready`):
-            Topk method used in routed gate.
-        n_group (`int`, *optional*, defaults to None):
-            Number of groups for routed experts.
-        topk_group (`int`, *optional*, defaults to None):
-            Number of selected groups for each token(for each token, ensuring the selected experts is only within `topk_group` groups).
-        num_experts_per_tok (`int`, *optional*, defaults to None):
-            Number of selected experts, None means dense model.
-        moe_layer_freq (`int`, *optional*, defaults to 1):
-            The frequency of the MoE layer: one expert layer for every `moe_layer_freq - 1` dense layers.
-        first_k_dense_replace (`int`, *optional*, defaults to 0):
-            Number of dense layers in shallow layers(embed->dense->dense->...->dense->moe->moe...->lm_head).
-                                                            \--k dense layers--/
-        norm_topk_prob (`bool`, *optional*, defaults to False):
-            Whether to normalize the weights of the routed experts.
-        scoring_func (`str`, *optional*, defaults to 'softmax'):
-            Method of computing expert weights.
-        aux_loss_alpha (`float`, *optional*, defaults to 0.001):
-            Auxiliary loss weight coefficient.
-        seq_aux = (`bool`, *optional*, defaults to True):
-            Whether to compute the auxiliary loss for each individual sample.
-        num_key_value_heads (`int`, *optional*):
-            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
-            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
-            `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When
-            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
-            by meanpooling all the original heads within that group. For more details checkout [this
-            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
-            `num_attention_heads`.
-        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
-            The non-linear activation function (function or string) in the decoder.
-        max_position_embeddings (`int`, *optional*, defaults to 2048):
-            The maximum sequence length that this model might ever be used with.
-        initializer_range (`float`, *optional*, defaults to 0.02):
-            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
-            The epsilon used by the rms normalization layers.
-        use_cache (`bool`, *optional*, defaults to `True`):
-            Whether or not the model should return the last key/values attentions (not used by all models). Only
-            relevant if `config.is_decoder=True`.
-        pad_token_id (`int`, *optional*):
-            Padding token id.
-        bos_token_id (`int`, *optional*, defaults to 1):
-            Beginning of stream token id.
-        eos_token_id (`int`, *optional*, defaults to 2):
-            End of stream token id.
-        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
-            Whether to tie weight embeddings
-        rope_theta (`float`, *optional*, defaults to 10000.0):
-            The base period of the RoPE embeddings.
-        rope_scaling (`Dict`, *optional*):
-            Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
-            strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is
-            `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
-            `max_position_embeddings` to the expected new maximum.
-        attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
-            Whether to use a bias in the query, key, value and output projection layers during self-attention.
-        attention_dropout (`float`, *optional*, defaults to 0.0):
-            The dropout ratio for the attention probabilities.
-
-    ```python
-    >>> from transformers import DeepseekV3Model, DeepseekV3Config
-
-    >>> # Initializing a Deepseek-V3 style configuration
-    >>> configuration = DeepseekV3Config()
-
-    >>> # Accessing the model configuration
-    >>> configuration = model.config
-    ```"""
-
-    model_type = "deepseek_v3"
-    keys_to_ignore_at_inference = ["past_key_values"]
-
-    def __init__(
-        self,
-        vocab_size=129280,
-        hidden_size=7168,
-        intermediate_size=18432,
-        moe_intermediate_size = 2048,
-        num_hidden_layers=61,
-        num_nextn_predict_layers=1,
-        num_attention_heads=128,
-        num_key_value_heads=128,
-        n_shared_experts = 1,
-        n_routed_experts = 256,
-        ep_size = 1,
-        routed_scaling_factor = 2.5,
-        kv_lora_rank = 512,
-        q_lora_rank = 1536,
-        qk_rope_head_dim = 64,
-        v_head_dim = 128,
-        qk_nope_head_dim = 128,
-        topk_method = 'noaux_tc',
-        n_group = 8,
-        topk_group = 4,
-        num_experts_per_tok = 8,
-        moe_layer_freq = 1,
-        first_k_dense_replace = 3,
-        norm_topk_prob = True,
-        scoring_func = 'sigmoid',
-        hidden_act="silu",
-        max_position_embeddings=4096,
-        initializer_range=0.02,
-        rms_norm_eps=1e-6,
-        use_cache=True,
-        pad_token_id=None,
-        bos_token_id=0,
-        eos_token_id=1,
-        tie_word_embeddings=False,
-        rope_theta=10000.0,
-        rope_scaling=None,
-        attention_bias=False,
-        attention_dropout=0.0,
-        **kwargs,
-    ):
-        self.vocab_size = vocab_size
-        self.max_position_embeddings = max_position_embeddings
-        self.hidden_size = hidden_size
-        self.intermediate_size = intermediate_size
-        self.moe_intermediate_size = moe_intermediate_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_nextn_predict_layers = num_nextn_predict_layers
-        self.num_attention_heads = num_attention_heads
-        self.n_shared_experts = n_shared_experts
-        self.n_routed_experts = n_routed_experts
-        self.ep_size = ep_size
-        self.routed_scaling_factor = routed_scaling_factor
-        self.kv_lora_rank = kv_lora_rank
-        self.q_lora_rank = q_lora_rank
-        self.qk_rope_head_dim = qk_rope_head_dim
-        self.v_head_dim = v_head_dim
-        self.qk_nope_head_dim = qk_nope_head_dim
-        self.topk_method = topk_method
-        self.n_group = n_group
-        self.topk_group = topk_group
-        self.num_experts_per_tok = num_experts_per_tok
-        self.moe_layer_freq = moe_layer_freq
-        self.first_k_dense_replace = first_k_dense_replace
-        self.norm_topk_prob = norm_topk_prob
-        self.scoring_func = scoring_func
-        # for backward compatibility
-        if num_key_value_heads is None:
-            num_key_value_heads = num_attention_heads
-
-        self.num_key_value_heads = num_key_value_heads
-        self.hidden_act = hidden_act
-        self.initializer_range = initializer_range
-        self.rms_norm_eps = rms_norm_eps
-        self.use_cache = use_cache
-        self.rope_theta = rope_theta
-        self.rope_scaling = rope_scaling
-        self.attention_bias = attention_bias
-        self.attention_dropout = attention_dropout
-
-        super().__init__(
-            pad_token_id=pad_token_id,
-            bos_token_id=bos_token_id,
-            eos_token_id=eos_token_id,
-            tie_word_embeddings=tie_word_embeddings,
-            **kwargs,
-        )
\ No newline at end of file
diff --git a/services/core/models/tests/integration/parallelism/fixtures/deepseek-ai/DeepSeek-V3-Base/modeling_deepseek.py b/services/core/models/tests/integration/parallelism/fixtures/deepseek-ai/DeepSeek-V3-Base/modeling_deepseek.py
deleted file mode 100644
index 28d9ea27aa..0000000000
--- a/services/core/models/tests/integration/parallelism/fixtures/deepseek-ai/DeepSeek-V3-Base/modeling_deepseek.py
+++ /dev/null
@@ -1,1848 +0,0 @@
-# coding=utf-8
-# Copyright 2023 DeepSeek-AI and The HuggingFace Inc. team. All rights reserved.
-#
-# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
-# and OPT implementations in this library. It has been modified from its
-# original forms to accommodate minor architectural differences compared
-# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" PyTorch DeepSeek model."""
-import math
-import warnings
-from typing import List, Optional, Tuple, Union
-
-import torch
-import torch.nn.functional as F
-import torch.utils.checkpoint
-from torch import nn
-from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
-
-from transformers.activations import ACT2FN
-from transformers.cache_utils import Cache, DynamicCache
-from transformers.modeling_attn_mask_utils import (
-    AttentionMaskConverter,
-    _prepare_4d_attention_mask,
-    _prepare_4d_causal_attention_mask,
-)
-from transformers.modeling_outputs import (
-    BaseModelOutputWithPast,
-    CausalLMOutputWithPast,
-    SequenceClassifierOutputWithPast,
-)
-from transformers.modeling_utils import PreTrainedModel
-from transformers.pytorch_utils import (
-    ALL_LAYERNORM_LAYERS,
-    is_torch_greater_or_equal_than_1_13,
-)
-from transformers.utils import (
-    add_start_docstrings,
-    add_start_docstrings_to_model_forward,
-    is_flash_attn_2_available,
-    is_flash_attn_greater_or_equal_2_10,
-    logging,
-    replace_return_docstrings,
-)
-from transformers.utils.import_utils import is_torch_fx_available
-from .configuration_deepseek import DeepseekV3Config
-import torch.distributed as dist
-import numpy as np
-
-if is_flash_attn_2_available():
-    from flash_attn import flash_attn_func, flash_attn_varlen_func
-    from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input  # noqa
-
-
-# This makes `_prepare_4d_causal_attention_mask` a leaf function in the FX graph.
-# It means that the function will not be traced through and simply appear as a node in the graph.
-if is_torch_fx_available():
-    if not is_torch_greater_or_equal_than_1_13:
-        import torch.fx
-
-    _prepare_4d_causal_attention_mask = torch.fx.wrap(_prepare_4d_causal_attention_mask)
-
-
-logger = logging.get_logger(__name__)
-
-_CONFIG_FOR_DOC = "DeepseekV3Config"
-
-
-def _get_unpad_data(attention_mask):
-    seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
-    indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
-    max_seqlen_in_batch = seqlens_in_batch.max().item()
-    cu_seqlens = F.pad(
-        torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.torch.int32), (1, 0)
-    )
-    return (
-        indices,
-        cu_seqlens,
-        max_seqlen_in_batch,
-    )
-
-
-class DeepseekV3RMSNorm(nn.Module):
-    def __init__(self, hidden_size, eps=1e-6):
-        """
-        DeepseekV3RMSNorm is equivalent to T5LayerNorm
-        """
-        super().__init__()
-        self.weight = nn.Parameter(torch.ones(hidden_size))
-        self.variance_epsilon = eps
-
-    def forward(self, hidden_states):
-        input_dtype = hidden_states.dtype
-        hidden_states = hidden_states.to(torch.float32)
-        variance = hidden_states.pow(2).mean(-1, keepdim=True)
-        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
-        return self.weight * hidden_states.to(input_dtype)
-
-
-ALL_LAYERNORM_LAYERS.append(DeepseekV3RMSNorm)
-
-
-class DeepseekV3RotaryEmbedding(nn.Module):
-    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
-        super().__init__()
-
-        self.dim = dim
-        self.max_position_embeddings = max_position_embeddings
-        self.base = base
-        inv_freq = 1.0 / (
-            self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)
-        )
-        self.register_buffer("inv_freq", inv_freq, persistent=False)
-
-        # Build here to make `torch.jit.trace` work.
-        self._set_cos_sin_cache(
-            seq_len=max_position_embeddings,
-            device=self.inv_freq.device,
-            dtype=torch.get_default_dtype(),
-        )
-        self.max_seq_len_cached = None
-
-    def _set_cos_sin_cache(self, seq_len, device, dtype):
-        self.max_seq_len_cached = seq_len
-        t = torch.arange(
-            self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype
-        )
-
-        freqs = torch.outer(t, self.inv_freq.to(t.device))
-        # Different from paper, but it uses a different permutation in order to obtain the same calculation
-        emb = torch.cat((freqs, freqs), dim=-1)
-        self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
-        self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
-
-    def forward(self, x, seq_len=None):
-        # x: [bs, num_attention_heads, seq_len, head_size]
-        if self.max_seq_len_cached is None or seq_len > self.max_seq_len_cached:
-            self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
-
-        return (
-            self.cos_cached[:seq_len].to(dtype=x.dtype),
-            self.sin_cached[:seq_len].to(dtype=x.dtype),
-        )
-
-
-# Copied from transformers.models.llama.modeling_llama.LlamaLinearScalingRotaryEmbedding with Llama->DeepseekV3
-class DeepseekV3LinearScalingRotaryEmbedding(DeepseekV3RotaryEmbedding):
-    """DeepseekV3RotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""
-
-    def __init__(
-        self,
-        dim,
-        max_position_embeddings=2048,
-        base=10000,
-        device=None,
-        scaling_factor=1.0,
-    ):
-        self.scaling_factor = scaling_factor
-        super().__init__(dim, max_position_embeddings, base, device)
-
-    def _set_cos_sin_cache(self, seq_len, device, dtype):
-        self.max_seq_len_cached = seq_len
-        t = torch.arange(
-            self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype
-        )
-        t = t / self.scaling_factor
-
-        freqs = torch.outer(t, self.inv_freq)
-        # Different from paper, but it uses a different permutation in order to obtain the same calculation
-        emb = torch.cat((freqs, freqs), dim=-1)
-        self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
-        self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
-
-
-# Copied from transformers.models.llama.modeling_llama.LlamaDynamicNTKScalingRotaryEmbedding with Llama->DeepseekV3
-class DeepseekV3DynamicNTKScalingRotaryEmbedding(DeepseekV3RotaryEmbedding):
-    """DeepseekV3RotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""
-
-    def __init__(
-        self,
-        dim,
-        max_position_embeddings=2048,
-        base=10000,
-        device=None,
-        scaling_factor=1.0,
-    ):
-        self.scaling_factor = scaling_factor
-        super().__init__(dim, max_position_embeddings, base, device)
-
-    def _set_cos_sin_cache(self, seq_len, device, dtype):
-        self.max_seq_len_cached = seq_len
-
-        if seq_len > self.max_position_embeddings:
-            base = self.base * (
-                (self.scaling_factor * seq_len / self.max_position_embeddings)
-                - (self.scaling_factor - 1)
-            ) ** (self.dim / (self.dim - 2))
-            inv_freq = 1.0 / (
-                base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)
-            )
-            self.register_buffer("inv_freq", inv_freq, persistent=False)
-
-        t = torch.arange(
-            self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype
-        )
-
-        freqs = torch.outer(t, self.inv_freq)
-        # Different from paper, but it uses a different permutation in order to obtain the same calculation
-        emb = torch.cat((freqs, freqs), dim=-1)
-        self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
-        self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
-
-
-# Inverse dim formula to find dim based on number of rotations
-def yarn_find_correction_dim(
-    num_rotations, dim, base=10000, max_position_embeddings=2048
-):
-    return (dim * math.log(max_position_embeddings / (num_rotations * 2 * math.pi))) / (
-        2 * math.log(base)
-    )
-
-
-# Find dim range bounds based on rotations
-def yarn_find_correction_range(
-    low_rot, high_rot, dim, base=10000, max_position_embeddings=2048
-):
-    low = math.floor(
-        yarn_find_correction_dim(low_rot, dim, base, max_position_embeddings)
-    )
-    high = math.ceil(
-        yarn_find_correction_dim(high_rot, dim, base, max_position_embeddings)
-    )
-    return max(low, 0), min(high, dim - 1)  # Clamp values just in case
-
-
-def yarn_get_mscale(scale=1, mscale=1):
-    if scale <= 1:
-        return 1.0
-    return 0.1 * mscale * math.log(scale) + 1.0
-
-
-def yarn_linear_ramp_mask(min, max, dim):
-    if min == max:
-        max += 0.001  # Prevent singularity
-
-    linear_func = (torch.arange(dim, dtype=torch.float32) - min) / (max - min)
-    ramp_func = torch.clamp(linear_func, 0, 1)
-    return ramp_func
-
-
-class DeepseekV3YarnRotaryEmbedding(DeepseekV3RotaryEmbedding):
-
-    def __init__(
-        self,
-        dim,
-        max_position_embeddings=2048,
-        base=10000,
-        device=None,
-        scaling_factor=1.0,
-        original_max_position_embeddings=4096,
-        beta_fast=32,
-        beta_slow=1,
-        mscale=1,
-        mscale_all_dim=0,
-    ):
-        self.scaling_factor = scaling_factor
-        self.original_max_position_embeddings = original_max_position_embeddings
-        self.beta_fast = beta_fast
-        self.beta_slow = beta_slow
-        self.mscale = mscale
-        self.mscale_all_dim = mscale_all_dim
-        super().__init__(dim, max_position_embeddings, base, device)
-
-    def _set_cos_sin_cache(self, seq_len, device, dtype):
-        self.max_seq_len_cached = seq_len
-        dim = self.dim
-
-        freq_extra = 1.0 / (
-            self.base
-            ** (torch.arange(0, dim, 2, dtype=torch.float32, device=device) / dim)
-        )
-        freq_inter = 1.0 / (
-            self.scaling_factor
-            * self.base
-            ** (torch.arange(0, dim, 2, dtype=torch.float32, device=device) / dim)
-        )
-
-        low, high = yarn_find_correction_range(
-            self.beta_fast,
-            self.beta_slow,
-            dim,
-            self.base,
-            self.original_max_position_embeddings,
-        )
-        inv_freq_mask = 1.0 - yarn_linear_ramp_mask(low, high, dim // 2).to(
-            device=device, dtype=torch.float32
-        )
-        inv_freq = freq_inter * (1 - inv_freq_mask) + freq_extra * inv_freq_mask
-        self.register_buffer("inv_freq", inv_freq, persistent=False)
-
-        t = torch.arange(seq_len, device=device, dtype=torch.float32)
-
-        freqs = torch.outer(t, inv_freq)
-
-        _mscale = float(
-            yarn_get_mscale(self.scaling_factor, self.mscale)
-            / yarn_get_mscale(self.scaling_factor, self.mscale_all_dim)
-        )
-
-        emb = torch.cat((freqs, freqs), dim=-1)
-        self.register_buffer(
-            "cos_cached", (emb.cos() * _mscale).to(dtype), persistent=False
-        )
-        self.register_buffer(
-            "sin_cached", (emb.sin() * _mscale).to(dtype), persistent=False
-        )
-
-
-# Copied from transformers.models.llama.modeling_llama.rotate_half
-def rotate_half(x):
-    """Rotates half the hidden dims of the input."""
-    x1 = x[..., : x.shape[-1] // 2]
-    x2 = x[..., x.shape[-1] // 2 :]
-    return torch.cat((-x2, x1), dim=-1)
-
-
-# Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb
-def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
-    """Applies Rotary Position Embedding to the query and key tensors.
-
-    Args:
-        q (`torch.Tensor`): The query tensor.
-        k (`torch.Tensor`): The key tensor.
-        cos (`torch.Tensor`): The cosine part of the rotary embedding.
-        sin (`torch.Tensor`): The sine part of the rotary embedding.
-        position_ids (`torch.Tensor`):
-            The position indices of the tokens corresponding to the query and key tensors. For example, this can be
-            used to pass offsetted position ids when working with a KV-cache.
-        unsqueeze_dim (`int`, *optional*, defaults to 1):
-            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
-            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
-            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
-            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
-            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
-            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
-    Returns:
-        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
-    """
-    cos = cos[position_ids].unsqueeze(unsqueeze_dim)
-    sin = sin[position_ids].unsqueeze(unsqueeze_dim)
-
-    b, h, s, d = q.shape
-    q = q.view(b, h, s, d // 2, 2).transpose(4, 3).reshape(b, h, s, d)
-
-    b, h, s, d = k.shape
-    k = k.view(b, h, s, d // 2, 2).transpose(4, 3).reshape(b, h, s, d)
-
-    q_embed = (q * cos) + (rotate_half(q) * sin)
-    k_embed = (k * cos) + (rotate_half(k) * sin)
-    return q_embed, k_embed
-
-
-class DeepseekV3MLP(nn.Module):
-    def __init__(self, config, hidden_size=None, intermediate_size=None):
-        super().__init__()
-        self.config = config
-        self.hidden_size = config.hidden_size if hidden_size is None else hidden_size
-        self.intermediate_size = (
-            config.intermediate_size if intermediate_size is None else intermediate_size
-        )
-
-        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
-        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
-        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
-        self.act_fn = ACT2FN[config.hidden_act]
-
-    def forward(self, x):
-        down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
-        return down_proj
-
-
-class MoEGate(nn.Module):
-    def __init__(self, config):
-        super().__init__()
-        self.config = config
-        self.top_k = config.num_experts_per_tok
-        self.n_routed_experts = config.n_routed_experts
-        self.routed_scaling_factor = config.routed_scaling_factor
-        self.scoring_func = config.scoring_func
-        self.topk_method = config.topk_method
-        self.n_group = config.n_group
-        self.topk_group = config.topk_group
-
-        # topk selection algorithm
-        self.norm_topk_prob = config.norm_topk_prob
-        self.gating_dim = config.hidden_size
-        self.weight = nn.Parameter(
-            torch.empty((self.n_routed_experts, self.gating_dim))
-        )
-        if self.topk_method == "noaux_tc":
-            self.e_score_correction_bias = nn.Parameter(
-                torch.empty((self.n_routed_experts))
-            )
-        self.reset_parameters()
-
-    def reset_parameters(self) -> None:
-        import torch.nn.init as init
-
-        init.kaiming_uniform_(self.weight, a=math.sqrt(5))
-
-    def forward(self, hidden_states):
-        bsz, seq_len, h = hidden_states.shape
-        ### compute gating score
-        hidden_states = hidden_states.view(-1, h)
-        logits = F.linear(
-            hidden_states.type(torch.float32), self.weight.type(torch.float32), None
-        )
-        if self.scoring_func == "sigmoid":
-            scores = logits.sigmoid()
-        else:
-            raise NotImplementedError(
-                f"insupportable scoring function for MoE gating: {self.scoring_func}"
-            )
-
-        ### select top-k experts
-        if self.topk_method == "noaux_tc":
-            assert not self.training
-            scores_for_choice = scores.view(bsz * seq_len, -1) + self.e_score_correction_bias.unsqueeze(0)
-            group_scores = (
-                scores_for_choice.view(bsz * seq_len, self.n_group, -1).topk(2, dim=-1)[0].sum(dim = -1)
-            )  # [n, n_group]
-            group_idx = torch.topk(
-                group_scores, k=self.topk_group, dim=-1, sorted=False
-            )[
-                1
-            ]  # [n, top_k_group]
-            group_mask = torch.zeros_like(group_scores)  # [n, n_group]
-            group_mask.scatter_(1, group_idx, 1)  # [n, n_group]
-            score_mask = (
-                group_mask.unsqueeze(-1)
-                .expand(
-                    bsz * seq_len, self.n_group, self.n_routed_experts // self.n_group
-                )
-                .reshape(bsz * seq_len, -1)
-            )  # [n, e]
-            tmp_scores = scores_for_choice.masked_fill(~score_mask.bool(), float("-inf"))  # [n, e]
-            _, topk_idx = torch.topk(
-                tmp_scores, k=self.top_k, dim=-1, sorted=False
-            )
-            topk_weight = scores.gather(1, topk_idx)
-        else:
-            raise NotImplementedError(
-                f"insupportable TopK function for MoE gating: {self.topk_method}"
-            )
-
-        ### norm gate to sum 1
-        if self.top_k > 1 and self.norm_topk_prob:
-            denominator = topk_weight.sum(dim=-1, keepdim=True) + 1e-20
-            topk_weight = topk_weight / denominator
-        topk_weight = topk_weight * self.routed_scaling_factor # must multiply the scaling factor
-
-        return topk_idx, topk_weight
-
-class DeepseekV3MoE(nn.Module):
-    """
-    A mixed expert module containing shared experts.
-    """
-
-    def __init__(self, config):
-        super().__init__()
-        self.config = config
-        self.num_experts_per_tok = config.num_experts_per_tok
-
-        if hasattr(config, "ep_size") and config.ep_size > 1:
-            assert config.ep_size == dist.get_world_size()
-            self.ep_size = config.ep_size
-            self.experts_per_rank = config.n_routed_experts // config.ep_size
-            self.ep_rank = dist.get_rank()
-            self.experts = nn.ModuleList(
-                [
-                    (
-                        DeepseekV3MLP(
-                            config, intermediate_size=config.moe_intermediate_size
-                        )
-                        if i >= self.ep_rank * self.experts_per_rank
-                        and i < (self.ep_rank + 1) * self.experts_per_rank
-                        else None
-                    )
-                    for i in range(config.n_routed_experts)
-                ]
-            )
-        else:
-            self.ep_size = 1
-            self.experts_per_rank = config.n_routed_experts
-            self.ep_rank = 0
-            self.experts = nn.ModuleList(
-                [
-                    DeepseekV3MLP(
-                        config, intermediate_size=config.moe_intermediate_size
-                    )
-                    for i in range(config.n_routed_experts)
-                ]
-            )
-        self.gate = MoEGate(config)
-        if config.n_shared_experts is not None:
-            intermediate_size = config.moe_intermediate_size * config.n_shared_experts
-            self.shared_experts = DeepseekV3MLP(
-                config=config, intermediate_size=intermediate_size
-            )
-
-    def forward(self, hidden_states):
-        identity = hidden_states
-        orig_shape = hidden_states.shape
-        topk_idx, topk_weight = self.gate(hidden_states)
-        hidden_states = hidden_states.view(-1, hidden_states.shape[-1])
-        flat_topk_idx = topk_idx.view(-1)
-        if not self.training:
-            y = self.moe_infer(hidden_states, topk_idx, topk_weight).view(*orig_shape)
-        if self.config.n_shared_experts is not None:
-            y = y + self.shared_experts(identity)
-        return y
-
-    @torch.no_grad()
-    def moe_infer(self, x, topk_ids, topk_weight):
-        cnts = topk_ids.new_zeros((topk_ids.shape[0], len(self.experts)))
-        cnts.scatter_(1, topk_ids, 1)
-        tokens_per_expert = cnts.sum(dim=0)
-        idxs = topk_ids.view(-1).argsort()
-        sorted_tokens = x[idxs // topk_ids.shape[1]]
-        sorted_tokens_shape = sorted_tokens.shape
-        if self.ep_size > 1:
-            tokens_per_ep_rank = tokens_per_expert.view(self.ep_size, -1).sum(dim=1)
-            tokens_per_expert_group = tokens_per_expert.new_empty(
-                tokens_per_expert.shape[0]
-            )
-            dist.all_to_all_single(tokens_per_expert_group, tokens_per_expert)
-            output_splits = (
-                tokens_per_expert_group.view(self.ep_size, -1)
-                .sum(1)
-                .cpu()
-                .numpy()
-                .tolist()
-            )
-            gathered_tokens = sorted_tokens.new_empty(
-                tokens_per_expert_group.sum(dim=0).cpu().item(), sorted_tokens.shape[1]
-            )
-            input_split_sizes = tokens_per_ep_rank.cpu().numpy().tolist()
-            dist.all_to_all(
-                list(gathered_tokens.split(output_splits)),
-                list(sorted_tokens.split(input_split_sizes)),
-            )
-            tokens_per_expert_post_gather = tokens_per_expert_group.view(
-                self.ep_size, self.experts_per_rank
-            ).sum(dim=0)
-            gatherd_idxs = np.zeros(shape=(gathered_tokens.shape[0],), dtype=np.int32)
-            s = 0
-            for i, k in enumerate(tokens_per_expert_group.cpu().numpy()):
-                gatherd_idxs[s : s + k] = i % self.experts_per_rank
-                s += k
-            gatherd_idxs = gatherd_idxs.argsort()
-            sorted_tokens = gathered_tokens[gatherd_idxs]
-            tokens_per_expert = tokens_per_expert_post_gather
-        tokens_per_expert = tokens_per_expert.cpu().numpy()
-
-        outputs = []
-        start_idx = 0
-        for i, num_tokens in enumerate(tokens_per_expert):
-            end_idx = start_idx + num_tokens
-            if num_tokens == 0:
-                continue
-            expert = self.experts[i + self.ep_rank * self.experts_per_rank]
-            tokens_for_this_expert = sorted_tokens[start_idx:end_idx]
-            expert_out = expert(tokens_for_this_expert)
-            outputs.append(expert_out)
-            start_idx = end_idx
-
-        outs = torch.cat(outputs, dim=0) if len(outputs) else sorted_tokens.new_empty(0)
-        if self.ep_size > 1:
-            new_x = torch.empty_like(outs)
-            new_x[gatherd_idxs] = outs
-            gathered_tokens = new_x.new_empty(*sorted_tokens_shape)
-            dist.all_to_all(
-                list(gathered_tokens.split(input_split_sizes)),
-                list(new_x.split(output_splits)),
-            )
-            outs = gathered_tokens
-
-        new_x = torch.empty_like(outs)
-        new_x[idxs] = outs
-        final_out = (
-            new_x.view(*topk_ids.shape, -1)
-            .type(topk_weight.dtype)
-            .mul_(topk_weight.unsqueeze(dim=-1))
-            .sum(dim=1)
-            .type(new_x.dtype)
-        )
-        return final_out
-
-
-# Copied from transformers.models.llama.modeling_llama.repeat_kv
-def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
-    """
-    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
-    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
-    """
-    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
-    if n_rep == 1:
-        return hidden_states
-    hidden_states = hidden_states[:, :, None, :, :].expand(
-        batch, num_key_value_heads, n_rep, slen, head_dim
-    )
-    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
-
-
-# Copied from transformers.models.llama.modeling_llama.LlamaAttention with Llama->DeepseekV3
-class DeepseekV3Attention(nn.Module):
-    """Multi-headed attention from 'Attention Is All You Need' paper"""
-
-    def __init__(self, config: DeepseekV3Config, layer_idx: Optional[int] = None):
-        super().__init__()
-        self.config = config
-        self.layer_idx = layer_idx
-        if layer_idx is None:
-            logger.warning_once(
-                f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will "
-                "to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` "
-                "when creating this class."
-            )
-
-        self.attention_dropout = config.attention_dropout
-        self.hidden_size = config.hidden_size
-        self.num_heads = config.num_attention_heads
-
-        self.max_position_embeddings = config.max_position_embeddings
-        self.rope_theta = config.rope_theta
-        self.q_lora_rank = config.q_lora_rank
-        self.qk_rope_head_dim = config.qk_rope_head_dim
-        self.kv_lora_rank = config.kv_lora_rank
-        self.v_head_dim = config.v_head_dim
-        self.qk_nope_head_dim = config.qk_nope_head_dim
-        self.q_head_dim = config.qk_nope_head_dim + config.qk_rope_head_dim
-
-        self.is_causal = True
-
-        if self.q_lora_rank is None:
-            self.q_proj = nn.Linear(
-                self.hidden_size, self.num_heads * self.q_head_dim, bias=False
-            )
-        else:
-            self.q_a_proj = nn.Linear(
-                self.hidden_size, config.q_lora_rank, bias=config.attention_bias
-            )
-            self.q_a_layernorm = DeepseekV3RMSNorm(config.q_lora_rank)
-            self.q_b_proj = nn.Linear(
-                config.q_lora_rank, self.num_heads * self.q_head_dim, bias=False
-            )
-
-        self.kv_a_proj_with_mqa = nn.Linear(
-            self.hidden_size,
-            config.kv_lora_rank + config.qk_rope_head_dim,
-            bias=config.attention_bias,
-        )
-        self.kv_a_layernorm = DeepseekV3RMSNorm(config.kv_lora_rank)
-        self.kv_b_proj = nn.Linear(
-            config.kv_lora_rank,
-            self.num_heads
-            * (self.q_head_dim - self.qk_rope_head_dim + self.v_head_dim),
-            bias=False,
-        )
-
-        self.o_proj = nn.Linear(
-            self.num_heads * self.v_head_dim,
-            self.hidden_size,
-            bias=config.attention_bias,
-        )
-        self._init_rope()
-
-        self.softmax_scale = self.q_head_dim ** (-0.5)
-        if self.config.rope_scaling is not None:
-            mscale_all_dim = self.config.rope_scaling.get("mscale_all_dim", 0)
-            scaling_factor = self.config.rope_scaling["factor"]
-            if mscale_all_dim:
-                mscale = yarn_get_mscale(scaling_factor, mscale_all_dim)
-                self.softmax_scale = self.softmax_scale * mscale * mscale
-
-    def _init_rope(self):
-        if self.config.rope_scaling is None:
-            self.rotary_emb = DeepseekV3RotaryEmbedding(
-                self.qk_rope_head_dim,
-                max_position_embeddings=self.max_position_embeddings,
-                base=self.rope_theta,
-            )
-        else:
-            scaling_type = self.config.rope_scaling["type"]
-            scaling_factor = self.config.rope_scaling["factor"]
-            if scaling_type == "linear":
-                self.rotary_emb = DeepseekV3LinearScalingRotaryEmbedding(
-                    self.qk_rope_head_dim,
-                    max_position_embeddings=self.max_position_embeddings,
-                    scaling_factor=scaling_factor,
-                    base=self.rope_theta,
-                )
-            elif scaling_type == "dynamic":
-                self.rotary_emb = DeepseekV3DynamicNTKScalingRotaryEmbedding(
-                    self.qk_rope_head_dim,
-                    max_position_embeddings=self.max_position_embeddings,
-                    scaling_factor=scaling_factor,
-                    base=self.rope_theta,
-                )
-            elif scaling_type == "yarn":
-                kwargs = {
-                    key: self.config.rope_scaling[key]
-                    for key in [
-                        "original_max_position_embeddings",
-                        "beta_fast",
-                        "beta_slow",
-                        "mscale",
-                        "mscale_all_dim",
-                    ]
-                    if key in self.config.rope_scaling
-                }
-                self.rotary_emb = DeepseekV3YarnRotaryEmbedding(
-                    self.qk_rope_head_dim,
-                    max_position_embeddings=self.max_position_embeddings,
-                    scaling_factor=scaling_factor,
-                    base=self.rope_theta,
-                    **kwargs,
-                )
-            else:
-                raise ValueError(f"Unknown RoPE scaling type {scaling_type}")
-
-    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
-        return (
-            tensor.view(bsz, seq_len, self.num_heads, self.v_head_dim)
-            .transpose(1, 2)
-            .contiguous()
-        )
-
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        attention_mask: Optional[torch.Tensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_value: Optional[Cache] = None,
-        output_attentions: bool = False,
-        use_cache: bool = False,
-        **kwargs,
-    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
-        if "padding_mask" in kwargs:
-            warnings.warn(
-                "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
-            )
-        bsz, q_len, _ = hidden_states.size()
-
-        if self.q_lora_rank is None:
-            q = self.q_proj(hidden_states)
-        else:
-            q = self.q_b_proj(self.q_a_layernorm(self.q_a_proj(hidden_states)))
-        q = q.view(bsz, q_len, self.num_heads, self.q_head_dim).transpose(1, 2)
-        q_nope, q_pe = torch.split(
-            q, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1
-        )
-
-        compressed_kv = self.kv_a_proj_with_mqa(hidden_states)
-        compressed_kv, k_pe = torch.split(
-            compressed_kv, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1
-        )
-        k_pe = k_pe.view(bsz, q_len, 1, self.qk_rope_head_dim).transpose(1, 2)
-        kv = (
-            self.kv_b_proj(self.kv_a_layernorm(compressed_kv))
-            .view(bsz, q_len, self.num_heads, self.qk_nope_head_dim + self.v_head_dim)
-            .transpose(1, 2)
-        )
-
-        k_nope, value_states = torch.split(
-            kv, [self.qk_nope_head_dim, self.v_head_dim], dim=-1
-        )
-        kv_seq_len = value_states.shape[-2]
-        if past_key_value is not None:
-            if self.layer_idx is None:
-                raise ValueError(
-                    f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
-                    "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
-                    "with a layer index."
-                )
-            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
-        cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
-
-        q_pe, k_pe = apply_rotary_pos_emb(q_pe, k_pe, cos, sin, position_ids)
-
-        query_states = k_pe.new_empty(bsz, self.num_heads, q_len, self.q_head_dim)
-        query_states[:, :, :, : self.qk_nope_head_dim] = q_nope
-        query_states[:, :, :, self.qk_nope_head_dim :] = q_pe
-
-        key_states = k_pe.new_empty(bsz, self.num_heads, q_len, self.q_head_dim)
-        key_states[:, :, :, : self.qk_nope_head_dim] = k_nope
-        key_states[:, :, :, self.qk_nope_head_dim :] = k_pe
-        if past_key_value is not None:
-            cache_kwargs = {"sin": sin, "cos": cos}  # Specific to RoPE models
-            key_states, value_states = past_key_value.update(
-                key_states, value_states, self.layer_idx, cache_kwargs
-            )
-
-        attn_weights = (
-            torch.matmul(query_states, key_states.transpose(2, 3)) * self.softmax_scale
-        )
-
-        if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
-            raise ValueError(
-                f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
-                f" {attn_weights.size()}"
-            )
-        assert attention_mask is not None
-        if attention_mask is not None:
-            if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
-                raise ValueError(
-                    f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
-                )
-            attn_weights = attn_weights + attention_mask
-
-        # upcast attention to fp32
-        attn_weights = nn.functional.softmax(
-            attn_weights, dim=-1, dtype=torch.float32
-        ).to(query_states.dtype)
-        attn_weights = nn.functional.dropout(
-            attn_weights, p=self.attention_dropout, training=self.training
-        )
-        attn_output = torch.matmul(attn_weights, value_states)
-
-        if attn_output.size() != (bsz, self.num_heads, q_len, self.v_head_dim):
-            raise ValueError(
-                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.v_head_dim)}, but is"
-                f" {attn_output.size()}"
-            )
-
-        attn_output = attn_output.transpose(1, 2).contiguous()
-
-        attn_output = attn_output.reshape(bsz, q_len, self.num_heads * self.v_head_dim)
-
-        attn_output = self.o_proj(attn_output)
-
-        if not output_attentions:
-            attn_weights = None
-
-        return attn_output, attn_weights, past_key_value
-
-
-# Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2 with Llama->DeepseekV3
-class DeepseekV3FlashAttention2(DeepseekV3Attention):
-    """
-    DeepseekV3 flash attention module. This module inherits from `DeepseekV3Attention` as the weights of the module stays
-    untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
-    flash attention and deal with padding tokens in case the input contains any of them.
-    """
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-
-        # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
-        # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
-        # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
-        self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
-
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        attention_mask: Optional[torch.LongTensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_value: Optional[Cache] = None,
-        output_attentions: bool = False,
-        use_cache: bool = False,
-        **kwargs,
-    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
-        # DeepseekV3FlashAttention2 attention does not support output_attentions
-        if "padding_mask" in kwargs:
-            warnings.warn(
-                "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
-            )
-
-            # overwrite attention_mask with padding_mask
-            attention_mask = kwargs.pop("padding_mask")
-
-        output_attentions = False
-
-        bsz, q_len, _ = hidden_states.size()
-
-        if self.q_lora_rank is None:
-            q = self.q_proj(hidden_states)
-        else:
-            q = self.q_b_proj(self.q_a_layernorm(self.q_a_proj(hidden_states)))
-        q = q.view(bsz, q_len, self.num_heads, self.q_head_dim).transpose(1, 2)
-        q_nope, q_pe = torch.split(
-            q, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1
-        )
-
-        # Flash attention requires the input to have the shape
-        # batch_size x seq_length x head_dim x hidden_dim
-        # therefore we just need to keep the original shape
-        compressed_kv = self.kv_a_proj_with_mqa(hidden_states)
-        compressed_kv, k_pe = torch.split(
-            compressed_kv, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1
-        )
-        k_pe = k_pe.view(bsz, q_len, 1, self.qk_rope_head_dim).transpose(1, 2)
-        kv = (
-            self.kv_b_proj(self.kv_a_layernorm(compressed_kv))
-            .view(bsz, q_len, self.num_heads, self.qk_nope_head_dim + self.v_head_dim)
-            .transpose(1, 2)
-        )
-
-        k_nope, value_states = torch.split(
-            kv, [self.qk_nope_head_dim, self.v_head_dim], dim=-1
-        )
-        kv_seq_len = value_states.shape[-2]
-
-        kv_seq_len = value_states.shape[-2]
-        if past_key_value is not None:
-            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
-
-        cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
-        q_pe, k_pe = apply_rotary_pos_emb(q_pe, k_pe, cos, sin, position_ids)
-
-        query_states = k_pe.new_empty(bsz, self.num_heads, q_len, self.q_head_dim)
-        query_states[:, :, :, : self.qk_nope_head_dim] = q_nope
-        query_states[:, :, :, self.qk_nope_head_dim :] = q_pe
-
-        key_states = k_pe.new_empty(bsz, self.num_heads, q_len, self.q_head_dim)
-        key_states[:, :, :, : self.qk_nope_head_dim] = k_nope
-        key_states[:, :, :, self.qk_nope_head_dim :] = k_pe
-
-        if self.q_head_dim != self.v_head_dim:
-            value_states = F.pad(value_states, [0, self.q_head_dim - self.v_head_dim])
-
-        if past_key_value is not None:
-            cache_kwargs = {"sin": sin, "cos": cos}  # Specific to RoPE models
-            key_states, value_states = past_key_value.update(
-                key_states, value_states, self.layer_idx, cache_kwargs
-            )
-
-        # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache
-        # to be able to avoid many of these transpose/reshape/view.
-        query_states = query_states.transpose(1, 2)
-        key_states = key_states.transpose(1, 2)
-        value_states = value_states.transpose(1, 2)
-
-        dropout_rate = self.attention_dropout if self.training else 0.0
-
-        # In PEFT, usually we cast the layer norms in float32 for training stability reasons
-        # therefore the input hidden states gets silently casted in float32. Hence, we need
-        # cast them back in the correct dtype just to be sure everything works as expected.
-        # This might slowdown training & inference so it is recommended to not cast the LayerNorms
-        # in fp32. (DeepseekV3RMSNorm handles it correctly)
-
-        input_dtype = query_states.dtype
-        if input_dtype == torch.float32:
-            # Handle the case where the model is quantized
-            if hasattr(self.config, "_pre_quantization_dtype"):
-                target_dtype = self.config._pre_quantization_dtype
-            elif torch.is_autocast_enabled():
-                target_dtype = torch.get_autocast_gpu_dtype()
-            else:
-                target_dtype = (
-                    self.q_proj.weight.dtype
-                    if self.q_lora_rank is None
-                    else self.q_a_proj.weight.dtype
-                )
-
-            logger.warning_once(
-                f"The input hidden states seems to be silently casted in float32, this might be related to"
-                f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
-                f" {target_dtype}."
-            )
-
-            query_states = query_states.to(target_dtype)
-            key_states = key_states.to(target_dtype)
-            value_states = value_states.to(target_dtype)
-
-        attn_output = self._flash_attention_forward(
-            query_states,
-            key_states,
-            value_states,
-            attention_mask,
-            q_len,
-            dropout=dropout_rate,
-            softmax_scale=self.softmax_scale,
-        )
-        if self.q_head_dim != self.v_head_dim:
-            attn_output = attn_output[:, :, :, : self.v_head_dim]
-
-        attn_output = attn_output.reshape(
-            bsz, q_len, self.num_heads * self.v_head_dim
-        ).contiguous()
-        attn_output = self.o_proj(attn_output)
-
-        if not output_attentions:
-            attn_weights = None
-
-        return attn_output, attn_weights, past_key_value
-
-    def _flash_attention_forward(
-        self,
-        query_states,
-        key_states,
-        value_states,
-        attention_mask,
-        query_length,
-        dropout=0.0,
-        softmax_scale=None,
-    ):
-        """
-        Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
-        first unpad the input, then computes the attention scores and pad the final attention scores.
-
-        Args:
-            query_states (`torch.Tensor`):
-                Input query states to be passed to Flash Attention API
-            key_states (`torch.Tensor`):
-                Input key states to be passed to Flash Attention API
-            value_states (`torch.Tensor`):
-                Input value states to be passed to Flash Attention API
-            attention_mask (`torch.Tensor`):
-                The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
-                position of padding tokens and 1 for the position of non-padding tokens.
-            dropout (`int`, *optional*):
-                Attention dropout
-            softmax_scale (`float`, *optional*):
-                The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
-        """
-        if not self._flash_attn_uses_top_left_mask:
-            causal = self.is_causal
-        else:
-            # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in DeepseekV3FlashAttention2 __init__.
-            causal = self.is_causal and query_length != 1
-
-        # Contains at least one padding token in the sequence
-        if attention_mask is not None:
-            batch_size = query_states.shape[0]
-            (
-                query_states,
-                key_states,
-                value_states,
-                indices_q,
-                cu_seq_lens,
-                max_seq_lens,
-            ) = self._upad_input(
-                query_states, key_states, value_states, attention_mask, query_length
-            )
-
-            cu_seqlens_q, cu_seqlens_k = cu_seq_lens
-            max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens
-
-            attn_output_unpad = flash_attn_varlen_func(
-                query_states,
-                key_states,
-                value_states,
-                cu_seqlens_q=cu_seqlens_q,
-                cu_seqlens_k=cu_seqlens_k,
-                max_seqlen_q=max_seqlen_in_batch_q,
-                max_seqlen_k=max_seqlen_in_batch_k,
-                dropout_p=dropout,
-                softmax_scale=softmax_scale,
-                causal=causal,
-            )
-
-            attn_output = pad_input(
-                attn_output_unpad, indices_q, batch_size, query_length
-            )
-        else:
-            attn_output = flash_attn_func(
-                query_states,
-                key_states,
-                value_states,
-                dropout,
-                softmax_scale=softmax_scale,
-                causal=causal,
-            )
-
-        return attn_output
-
-    def _upad_input(
-        self, query_layer, key_layer, value_layer, attention_mask, query_length
-    ):
-        indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
-        batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape
-
-        key_layer = index_first_axis(
-            key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim),
-            indices_k,
-        )
-        value_layer = index_first_axis(
-            value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim),
-            indices_k,
-        )
-        if query_length == kv_seq_len:
-            query_layer = index_first_axis(
-                query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim),
-                indices_k,
-            )
-            cu_seqlens_q = cu_seqlens_k
-            max_seqlen_in_batch_q = max_seqlen_in_batch_k
-            indices_q = indices_k
-        elif query_length == 1:
-            max_seqlen_in_batch_q = 1
-            cu_seqlens_q = torch.arange(
-                batch_size + 1, dtype=torch.int32, device=query_layer.device
-            )  # There is a memcpy here, that is very bad.
-            indices_q = cu_seqlens_q[:-1]
-            query_layer = query_layer.squeeze(1)
-        else:
-            # The -q_len: slice assumes left padding.
-            attention_mask = attention_mask[:, -query_length:]
-            query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(
-                query_layer, attention_mask
-            )
-
-        return (
-            query_layer,
-            key_layer,
-            value_layer,
-            indices_q,
-            (cu_seqlens_q, cu_seqlens_k),
-            (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
-        )
-
-
-ATTENTION_CLASSES = {
-    "eager": DeepseekV3Attention,
-    "flash_attention_2": DeepseekV3FlashAttention2,
-}
-
-
-class DeepseekV3DecoderLayer(nn.Module):
-    def __init__(self, config: DeepseekV3Config, layer_idx: int):
-        super().__init__()
-        self.hidden_size = config.hidden_size
-
-        self.self_attn = ATTENTION_CLASSES[config._attn_implementation](
-            config=config, layer_idx=layer_idx
-        )
-
-        self.mlp = (
-            DeepseekV3MoE(config)
-            if (
-                config.n_routed_experts is not None
-                and layer_idx >= config.first_k_dense_replace
-                and layer_idx % config.moe_layer_freq == 0
-            )
-            else DeepseekV3MLP(config)
-        )
-        self.input_layernorm = DeepseekV3RMSNorm(
-            config.hidden_size, eps=config.rms_norm_eps
-        )
-        self.post_attention_layernorm = DeepseekV3RMSNorm(
-            config.hidden_size, eps=config.rms_norm_eps
-        )
-
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        attention_mask: Optional[torch.Tensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_value: Optional[Tuple[torch.Tensor]] = None,
-        output_attentions: Optional[bool] = False,
-        use_cache: Optional[bool] = False,
-        **kwargs,
-    ) -> Tuple[
-        torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]
-    ]:
-        """
-        Args:
-            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
-            attention_mask (`torch.FloatTensor`, *optional*):
-                attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
-                query_sequence_length, key_sequence_length)` if default attention is used.
-            output_attentions (`bool`, *optional*):
-                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
-                returned tensors for more detail.
-            use_cache (`bool`, *optional*):
-                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
-                (see `past_key_values`).
-            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
-        """
-        if "padding_mask" in kwargs:
-            warnings.warn(
-                "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
-            )
-        residual = hidden_states
-
-        hidden_states = self.input_layernorm(hidden_states)
-
-        # Self Attention
-        hidden_states, self_attn_weights, present_key_value = self.self_attn(
-            hidden_states=hidden_states,
-            attention_mask=attention_mask,
-            position_ids=position_ids,
-            past_key_value=past_key_value,
-            output_attentions=output_attentions,
-            use_cache=use_cache,
-            **kwargs,
-        )
-        hidden_states = residual + hidden_states
-
-        # Fully Connected
-        residual = hidden_states
-        hidden_states = self.post_attention_layernorm(hidden_states)
-        hidden_states = self.mlp(hidden_states)
-        hidden_states = residual + hidden_states
-
-        outputs = (hidden_states,)
-
-        if output_attentions:
-            outputs += (self_attn_weights,)
-
-        if use_cache:
-            outputs += (present_key_value,)
-
-        return outputs
-
-
-DeepseekV3_START_DOCSTRING = r"""
-    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
-    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
-    etc.)
-
-    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
-    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
-    and behavior.
-
-    Parameters:
-        config ([`DeepseekV3Config`]):
-            Model configuration class with all the parameters of the model. Initializing with a config file does not
-            load the weights associated with the model, only the configuration. Check out the
-            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
-"""
-
-
-@add_start_docstrings(
-    "The bare DeepseekV3 Model outputting raw hidden-states without any specific head on top.",
-    DeepseekV3_START_DOCSTRING,
-)
-class DeepseekV3PreTrainedModel(PreTrainedModel):
-    config_class = DeepseekV3Config
-    base_model_prefix = "model"
-    supports_gradient_checkpointing = True
-    _no_split_modules = ["DeepseekV3DecoderLayer"]
-    _skip_keys_device_placement = "past_key_values"
-    _supports_flash_attn_2 = True
-    _supports_cache_class = True
-
-    def _init_weights(self, module):
-        std = self.config.initializer_range
-        if isinstance(module, nn.Linear):
-            module.weight.data.normal_(mean=0.0, std=std)
-            if module.bias is not None:
-                module.bias.data.zero_()
-        elif isinstance(module, nn.Embedding):
-            module.weight.data.normal_(mean=0.0, std=std)
-            if module.padding_idx is not None:
-                module.weight.data[module.padding_idx].zero_()
-
-
-DeepseekV3_INPUTS_DOCSTRING = r"""
-    Args:
-        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
-            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
-            it.
-
-            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
-            [`PreTrainedTokenizer.__call__`] for details.
-
-            [What are input IDs?](../glossary#input-ids)
-        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
-            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
-
-            - 1 for tokens that are **not masked**,
-            - 0 for tokens that are **masked**.
-
-            [What are attention masks?](../glossary#attention-mask)
-
-            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
-            [`PreTrainedTokenizer.__call__`] for details.
-
-            If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
-            `past_key_values`).
-
-            If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
-            and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
-            information on the default strategy.
-
-            - 1 indicates the head is **not masked**,
-            - 0 indicates the head is **masked**.
-        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
-            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
-            config.n_positions - 1]`.
-
-            [What are position IDs?](../glossary#position-ids)
-        past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
-            Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
-            blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
-            returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
-
-            Two formats are allowed:
-            - a [`~cache_utils.Cache`] instance;
-            - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
-            shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
-            cache format.
-
-            The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
-            legacy cache format will be returned.
-
-            If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
-            have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
-            of shape `(batch_size, sequence_length)`.
-        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
-            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
-            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
-            model's internal embedding lookup matrix.
-        use_cache (`bool`, *optional*):
-            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
-            `past_key_values`).
-        output_attentions (`bool`, *optional*):
-            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
-            tensors for more detail.
-        output_hidden_states (`bool`, *optional*):
-            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
-            more detail.
-        return_dict (`bool`, *optional*):
-            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
-"""
-
-
-@add_start_docstrings(
-    "The bare DeepseekV3 Model outputting raw hidden-states without any specific head on top.",
-    DeepseekV3_START_DOCSTRING,
-)
-class DeepseekV3Model(DeepseekV3PreTrainedModel):
-    """
-    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`DeepseekV3DecoderLayer`]
-
-    Args:
-        config: DeepseekV3Config
-    """
-
-    def __init__(self, config: DeepseekV3Config):
-        super().__init__(config)
-        self.padding_idx = config.pad_token_id
-        self.vocab_size = config.vocab_size
-
-        self.embed_tokens = nn.Embedding(
-            config.vocab_size, config.hidden_size, self.padding_idx
-        )
-        self.layers = nn.ModuleList(
-            [
-                DeepseekV3DecoderLayer(config, layer_idx)
-                for layer_idx in range(config.num_hidden_layers)
-            ]
-        )
-        self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
-        self.norm = DeepseekV3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
-
-        self.gradient_checkpointing = False
-        # Initialize weights and apply final processing
-        self.post_init()
-
-    def get_input_embeddings(self):
-        return self.embed_tokens
-
-    def set_input_embeddings(self, value):
-        self.embed_tokens = value
-
-    @add_start_docstrings_to_model_forward(DeepseekV3_INPUTS_DOCSTRING)
-    def forward(
-        self,
-        input_ids: torch.LongTensor = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_values: Optional[List[torch.FloatTensor]] = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
-        use_cache: Optional[bool] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
-    ) -> Union[Tuple, BaseModelOutputWithPast]:
-        output_attentions = (
-            output_attentions
-            if output_attentions is not None
-            else self.config.output_attentions
-        )
-        output_hidden_states = (
-            output_hidden_states
-            if output_hidden_states is not None
-            else self.config.output_hidden_states
-        )
-        use_cache = use_cache if use_cache is not None else self.config.use_cache
-
-        return_dict = (
-            return_dict if return_dict is not None else self.config.use_return_dict
-        )
-
-        # retrieve input_ids and inputs_embeds
-        if input_ids is not None and inputs_embeds is not None:
-            raise ValueError(
-                "You cannot specify both input_ids and inputs_embeds at the same time"
-            )
-        elif input_ids is not None:
-            batch_size, seq_length = input_ids.shape[:2]
-        elif inputs_embeds is not None:
-            batch_size, seq_length = inputs_embeds.shape[:2]
-        else:
-            raise ValueError("You have to specify either input_ids or inputs_embeds")
-
-        past_key_values_length = 0
-        if use_cache:
-            use_legacy_cache = not isinstance(past_key_values, Cache)
-            if use_legacy_cache:
-                past_key_values = DynamicCache.from_legacy_cache(past_key_values)
-            past_key_values_length = past_key_values.get_usable_length(seq_length)
-
-        if position_ids is None:
-            device = input_ids.device if input_ids is not None else inputs_embeds.device
-            position_ids = torch.arange(
-                past_key_values_length,
-                seq_length + past_key_values_length,
-                dtype=torch.long,
-                device=device,
-            )
-            position_ids = position_ids.unsqueeze(0)
-
-        if inputs_embeds is None:
-            inputs_embeds = self.embed_tokens(input_ids)
-
-        if self._use_flash_attention_2:
-            # 2d mask is passed through the layers
-            attention_mask = (
-                attention_mask
-                if (attention_mask is not None and 0 in attention_mask)
-                else None
-            )
-        else:
-            # 4d mask is passed through the layers
-            attention_mask = _prepare_4d_causal_attention_mask(
-                attention_mask,
-                (batch_size, seq_length),
-                inputs_embeds,
-                past_key_values_length,
-            )
-
-        # embed positions
-        hidden_states = inputs_embeds
-
-        # decoder layers
-        all_hidden_states = () if output_hidden_states else None
-        all_self_attns = () if output_attentions else None
-        next_decoder_cache = None
-
-        for decoder_layer in self.layers:
-            if output_hidden_states:
-                all_hidden_states += (hidden_states,)
-
-            layer_outputs = decoder_layer(
-                hidden_states,
-                attention_mask=attention_mask,
-                position_ids=position_ids,
-                past_key_value=past_key_values,
-                output_attentions=output_attentions,
-                use_cache=use_cache,
-            )
-
-            hidden_states = layer_outputs[0]
-
-            if use_cache:
-                next_decoder_cache = layer_outputs[2 if output_attentions else 1]
-
-            if output_attentions:
-                all_self_attns += (layer_outputs[1],)
-
-        hidden_states = self.norm(hidden_states)
-
-        # add hidden states from the last decoder layer
-        if output_hidden_states:
-            all_hidden_states += (hidden_states,)
-
-        next_cache = None
-        if use_cache:
-            next_cache = (
-                next_decoder_cache.to_legacy_cache()
-                if use_legacy_cache
-                else next_decoder_cache
-            )
-        if not return_dict:
-            return tuple(
-                v
-                for v in [hidden_states, next_cache, all_hidden_states, all_self_attns]
-                if v is not None
-            )
-        return BaseModelOutputWithPast(
-            last_hidden_state=hidden_states,
-            past_key_values=next_cache,
-            hidden_states=all_hidden_states,
-            attentions=all_self_attns,
-        )
-
-
-class DeepseekV3ForCausalLM(DeepseekV3PreTrainedModel):
-    _tied_weights_keys = ["lm_head.weight"]
-
-    def __init__(self, config):
-        super().__init__(config)
-        self.model = DeepseekV3Model(config)
-        self.vocab_size = config.vocab_size
-        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
-
-        # Initialize weights and apply final processing
-        self.post_init()
-
-    def get_input_embeddings(self):
-        return self.model.embed_tokens
-
-    def set_input_embeddings(self, value):
-        self.model.embed_tokens = value
-
-    def get_output_embeddings(self):
-        return self.lm_head
-
-    def set_output_embeddings(self, new_embeddings):
-        self.lm_head = new_embeddings
-
-    def set_decoder(self, decoder):
-        self.model = decoder
-
-    def get_decoder(self):
-        return self.model
-
-    @add_start_docstrings_to_model_forward(DeepseekV3_INPUTS_DOCSTRING)
-    @replace_return_docstrings(
-        output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC
-    )
-    def forward(
-        self,
-        input_ids: torch.LongTensor = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_values: Optional[List[torch.FloatTensor]] = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
-        labels: Optional[torch.LongTensor] = None,
-        use_cache: Optional[bool] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
-    ) -> Union[Tuple, CausalLMOutputWithPast]:
-        r"""
-        Args:
-            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
-                Labels for computing the masked language modeling loss. Indices should either be in `[0, transformers.,
-                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
-                (masked), the loss is only computed for the tokens with labels in `[0, transformers., config.vocab_size]`.
-
-        Returns:
-
-        Example:
-
-        ```python
-        >>> from transformers import AutoTokenizer, DeepseekV3ForCausalLM
-
-        >>> model = DeepseekV3ForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
-        >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
-
-        >>> prompt = "Hey, are you conscious? Can you talk to me?"
-        >>> inputs = tokenizer(prompt, return_tensors="pt")
-
-        >>> # Generate
-        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
-        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
-        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
-        ```"""
-        output_attentions = (
-            output_attentions
-            if output_attentions is not None
-            else self.config.output_attentions
-        )
-        output_hidden_states = (
-            output_hidden_states
-            if output_hidden_states is not None
-            else self.config.output_hidden_states
-        )
-        return_dict = (
-            return_dict if return_dict is not None else self.config.use_return_dict
-        )
-
-        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
-        outputs = self.model(
-            input_ids=input_ids,
-            attention_mask=attention_mask,
-            position_ids=position_ids,
-            past_key_values=past_key_values,
-            inputs_embeds=inputs_embeds,
-            use_cache=use_cache,
-            output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
-        )
-
-        hidden_states = outputs[0]
-        logits = self.lm_head(hidden_states)
-        logits = logits.float()
-
-        loss = None
-        if labels is not None:
-            # Shift so that tokens < n predict n
-            shift_logits = logits[..., :-1, :].contiguous()
-            shift_labels = labels[..., 1:].contiguous()
-            # Flatten the tokens
-            loss_fct = CrossEntropyLoss()
-            shift_logits = shift_logits.view(-1, self.config.vocab_size)
-            shift_labels = shift_labels.view(-1)
-            # Enable model parallelism
-            shift_labels = shift_labels.to(shift_logits.device)
-            loss = loss_fct(shift_logits, shift_labels)
-
-        if not return_dict:
-            output = (logits,) + outputs[1:]
-            return (loss,) + output if loss is not None else output
-
-        return CausalLMOutputWithPast(
-            loss=loss,
-            logits=logits,
-            past_key_values=outputs.past_key_values,
-            hidden_states=outputs.hidden_states,
-            attentions=outputs.attentions,
-        )
-
-    def prepare_inputs_for_generation(
-        self,
-        input_ids,
-        past_key_values=None,
-        attention_mask=None,
-        inputs_embeds=None,
-        **kwargs,
-    ):
-        if past_key_values is not None:
-            if isinstance(past_key_values, Cache):
-                cache_length = past_key_values.get_seq_length()
-                past_length = past_key_values.seen_tokens
-                max_cache_length = past_key_values.get_max_length()
-            else:
-                cache_length = past_length = past_key_values[0][0].shape[2]
-                max_cache_length = None
-
-            # Keep only the unprocessed tokens:
-            # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
-            # some of the inputs are exclusivelly passed as part of the cache (e.g. when passing input_embeds as
-            # input)
-            if (
-                attention_mask is not None
-                and attention_mask.shape[1] > input_ids.shape[1]
-            ):
-                input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
-            # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
-            # input_ids based on the past_length.
-            elif past_length < input_ids.shape[1]:
-                input_ids = input_ids[:, past_length:]
-            # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.
-
-            # If we are about to go beyond the maximum cache length, we need to crop the input attention mask.
-            if (
-                max_cache_length is not None
-                and attention_mask is not None
-                and cache_length + input_ids.shape[1] > max_cache_length
-            ):
-                attention_mask = attention_mask[:, -max_cache_length:]
-
-        position_ids = kwargs.get("position_ids", None)
-        if attention_mask is not None and position_ids is None:
-            # create position_ids on the fly for batch generation
-            position_ids = attention_mask.long().cumsum(-1) - 1
-            position_ids.masked_fill_(attention_mask == 0, 1)
-            if past_key_values:
-                position_ids = position_ids[:, -input_ids.shape[1] :]
-
-        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
-        if inputs_embeds is not None and past_key_values is None:
-            model_inputs = {"inputs_embeds": inputs_embeds}
-        else:
-            model_inputs = {"input_ids": input_ids}
-
-        model_inputs.update(
-            {
-                "position_ids": position_ids,
-                "past_key_values": past_key_values,
-                "use_cache": kwargs.get("use_cache"),
-                "attention_mask": attention_mask,
-            }
-        )
-        return model_inputs
-
-    @staticmethod
-    def _reorder_cache(past_key_values, beam_idx):
-        reordered_past = ()
-        for layer_past in past_key_values:
-            reordered_past += (
-                tuple(
-                    past_state.index_select(0, beam_idx.to(past_state.device))
-                    for past_state in layer_past
-                ),
-            )
-        return reordered_past
-
-
-@add_start_docstrings(
-    """
-    The DeepseekV3 Model transformer with a sequence classification head on top (linear layer).
-
-    [`DeepseekV3ForSequenceClassification`] uses the last token in order to do the classification, as other causal models
-    (e.g. GPT-2) do.
-
-    Since it does classification on the last token, it requires to know the position of the last token. If a
-    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
-    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
-    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
-    each row of the batch).
-    """,
-    DeepseekV3_START_DOCSTRING,
-)
-class DeepseekV3ForSequenceClassification(DeepseekV3PreTrainedModel):
-    def __init__(self, config):
-        super().__init__(config)
-        self.num_labels = config.num_labels
-        self.model = DeepseekV3Model(config)
-        self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
-
-        # Initialize weights and apply final processing
-        self.post_init()
-
-    def get_input_embeddings(self):
-        return self.model.embed_tokens
-
-    def set_input_embeddings(self, value):
-        self.model.embed_tokens = value
-
-    @add_start_docstrings_to_model_forward(DeepseekV3_INPUTS_DOCSTRING)
-    def forward(
-        self,
-        input_ids: torch.LongTensor = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_values: Optional[List[torch.FloatTensor]] = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
-        labels: Optional[torch.LongTensor] = None,
-        use_cache: Optional[bool] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
-    ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
-        r"""
-        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
-            Labels for computing the sequence classification/regression loss. Indices should be in `[0, transformers.,
-            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
-            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
-        """
-        return_dict = (
-            return_dict if return_dict is not None else self.config.use_return_dict
-        )
-
-        transformer_outputs = self.model(
-            input_ids,
-            attention_mask=attention_mask,
-            position_ids=position_ids,
-            past_key_values=past_key_values,
-            inputs_embeds=inputs_embeds,
-            use_cache=use_cache,
-            output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
-        )
-        hidden_states = transformer_outputs[0]
-        logits = self.score(hidden_states)
-
-        if input_ids is not None:
-            batch_size = input_ids.shape[0]
-        else:
-            batch_size = inputs_embeds.shape[0]
-
-        if self.config.pad_token_id is None and batch_size != 1:
-            raise ValueError(
-                "Cannot handle batch sizes > 1 if no padding token is defined."
-            )
-        if self.config.pad_token_id is None:
-            sequence_lengths = -1
-        else:
-            if input_ids is not None:
-                sequence_lengths = (
-                    torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1
-                ).to(logits.device)
-            else:
-                sequence_lengths = -1
-
-        pooled_logits = logits[
-            torch.arange(batch_size, device=logits.device), sequence_lengths
-        ]
-
-        loss = None
-        if labels is not None:
-            labels = labels.to(logits.device)
-            if self.config.problem_type is None:
-                if self.num_labels == 1:
-                    self.config.problem_type = "regression"
-                elif self.num_labels > 1 and (
-                    labels.dtype == torch.long or labels.dtype == torch.int
-                ):
-                    self.config.problem_type = "single_label_classification"
-                else:
-                    self.config.problem_type = "multi_label_classification"
-
-            if self.config.problem_type == "regression":
-                loss_fct = MSELoss()
-                if self.num_labels == 1:
-                    loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
-                else:
-                    loss = loss_fct(pooled_logits, labels)
-            elif self.config.problem_type == "single_label_classification":
-                loss_fct = CrossEntropyLoss()
-                loss = loss_fct(
-                    pooled_logits.view(-1, self.num_labels), labels.view(-1)
-                )
-            elif self.config.problem_type == "multi_label_classification":
-                loss_fct = BCEWithLogitsLoss()
-                loss = loss_fct(pooled_logits, labels)
-        if not return_dict:
-            output = (pooled_logits,) + transformer_outputs[1:]
-            return ((loss,) + output) if loss is not None else output
-
-        return SequenceClassifierOutputWithPast(
-            loss=loss,
-            logits=pooled_logits,
-            past_key_values=transformer_outputs.past_key_values,
-            hidden_states=transformer_outputs.hidden_states,
-            attentions=transformer_outputs.attentions,
-        )
diff --git a/services/core/models/tests/integration/parallelism/fixtures/deepseek-ai/deepseek-llm-67b-base/config.json b/services/core/models/tests/integration/parallelism/fixtures/deepseek-ai/deepseek-llm-67b-base/config.json
deleted file mode 100644
index 031a7856cc..0000000000
--- a/services/core/models/tests/integration/parallelism/fixtures/deepseek-ai/deepseek-llm-67b-base/config.json
+++ /dev/null
@@ -1,25 +0,0 @@
-{
-  "architectures": [
-    "LlamaForCausalLM"
-  ],
-  "bos_token_id": 1,
-  "eos_token_id": 2,
-  "hidden_act": "silu",
-  "hidden_size": 8192,
-  "initializer_range": 0.02,
-  "intermediate_size": 22016,
-  "max_position_embeddings": 4096,
-  "model_type": "llama",
-  "num_attention_heads": 64,
-  "num_hidden_layers": 95,
-  "num_key_value_heads": 8,
-  "pretraining_tp": 1,
-  "rms_norm_eps": 1e-06,
-  "rope_scaling": null,
-  "rope_theta": 10000.0,
-  "tie_word_embeddings": false,
-  "torch_dtype": "bfloat16",
-  "transformers_version": "4.33.1",
-  "use_cache": true,
-  "vocab_size": 102400
-}
diff --git a/services/core/models/tests/integration/parallelism/fixtures/deepseek-ai/deepseek-llm-7b-base/config.json b/services/core/models/tests/integration/parallelism/fixtures/deepseek-ai/deepseek-llm-7b-base/config.json
deleted file mode 100644
index 208956063b..0000000000
--- a/services/core/models/tests/integration/parallelism/fixtures/deepseek-ai/deepseek-llm-7b-base/config.json
+++ /dev/null
@@ -1,25 +0,0 @@
-{
-  "architectures": [
-    "LlamaForCausalLM"
-  ],
-  "bos_token_id": 1,
-  "eos_token_id": 2,
-  "hidden_act": "silu",
-  "hidden_size": 4096,
-  "initializer_range": 0.02,
-  "intermediate_size": 11008,
-  "max_position_embeddings": 4096,
-  "model_type": "llama",
-  "num_attention_heads": 32,
-  "num_hidden_layers": 30,
-  "num_key_value_heads": 32,
-  "pretraining_tp": 1,
-  "rms_norm_eps": 1e-06,
-  "rope_scaling": null,
-  "rope_theta": 10000.0,
-  "tie_word_embeddings": false,
-  "torch_dtype": "bfloat16",
-  "transformers_version": "4.33.1",
-  "use_cache": true,
-  "vocab_size": 102400
-}
diff --git a/services/core/models/tests/integration/parallelism/fixtures/gpt2/config.json b/services/core/models/tests/integration/parallelism/fixtures/gpt2/config.json
deleted file mode 100644
index 10c66461e4..0000000000
--- a/services/core/models/tests/integration/parallelism/fixtures/gpt2/config.json
+++ /dev/null
@@ -1,31 +0,0 @@
-{
-  "activation_function": "gelu_new",
-  "architectures": [
-    "GPT2LMHeadModel"
-  ],
-  "attn_pdrop": 0.1,
-  "bos_token_id": 50256,
-  "embd_pdrop": 0.1,
-  "eos_token_id": 50256,
-  "initializer_range": 0.02,
-  "layer_norm_epsilon": 1e-05,
-  "model_type": "gpt2",
-  "n_ctx": 1024,
-  "n_embd": 768,
-  "n_head": 12,
-  "n_layer": 12,
-  "n_positions": 1024,
-  "resid_pdrop": 0.1,
-  "summary_activation": null,
-  "summary_first_dropout": 0.1,
-  "summary_proj_to_labels": true,
-  "summary_type": "cls_index",
-  "summary_use_proj": true,
-  "task_specific_params": {
-    "text-generation": {
-      "do_sample": true,
-      "max_length": 50
-    }
-  },
-  "vocab_size": 50257
-}
\ No newline at end of file
diff --git a/services/core/models/tests/integration/parallelism/fixtures/manifest.json b/services/core/models/tests/integration/parallelism/fixtures/manifest.json
deleted file mode 100644
index baddd3d114..0000000000
--- a/services/core/models/tests/integration/parallelism/fixtures/manifest.json
+++ /dev/null
@@ -1,69 +0,0 @@
-{
-  "EleutherAI/gpt-j-6b": [
-    "config.json"
-  ],
-  "EleutherAI/gpt-neox-20b": [
-    "config.json"
-  ],
-  "Qwen/Qwen2.5-72B": [
-    "config.json"
-  ],
-  "Qwen/Qwen2.5-72B-Instruct": [
-    "config.json"
-  ],
-  "Qwen/Qwen2.5-7B": [
-    "config.json"
-  ],
-  "Qwen/Qwen3-4B-SafeRL": [
-    "config.json"
-  ],
-  "Qwen/Qwen3-8B": [
-    "config.json"
-  ],
-  "deepseek-ai/DeepSeek-V3-Base": [
-    "config.json",
-    "configuration_deepseek.py",
-    "modeling_deepseek.py"
-  ],
-  "deepseek-ai/deepseek-llm-67b-base": [
-    "config.json"
-  ],
-  "deepseek-ai/deepseek-llm-7b-base": [
-    "config.json"
-  ],
-  "gpt2": [
-    "config.json"
-  ],
-  "microsoft/phi-2": [
-    "config.json"
-  ],
-  "microsoft/phi-4": [
-    "config.json"
-  ],
-  "mistralai/Devstral-Small-2505": [
-    "config.json"
-  ],
-  "mistralai/Mistral-7B-v0.1": [
-    "config.json"
-  ],
-  "mistralai/Mixtral-8x7B-v0.1": [
-    "config.json"
-  ],
-  "nvidia/Mistral-NeMo-Minitron-8B-Instruct": [
-    "config.json"
-  ],
-  "nvidia/NVIDIA-Nemotron-Nano-9B-v2": [
-    "config.json",
-    "configuration_nemotron_h.py",
-    "modeling_nemotron_h.py"
-  ],
-  "nvidia/nemotron-4-340b-instruct": [
-    "model_config.yaml"
-  ],
-  "openai/gpt-oss-120b": [
-    "config.json"
-  ],
-  "openai/gpt-oss-20b": [
-    "config.json"
-  ]
-}
diff --git a/services/core/models/tests/integration/parallelism/fixtures/microsoft/phi-2/config.json b/services/core/models/tests/integration/parallelism/fixtures/microsoft/phi-2/config.json
deleted file mode 100644
index 011968cc02..0000000000
--- a/services/core/models/tests/integration/parallelism/fixtures/microsoft/phi-2/config.json
+++ /dev/null
@@ -1,30 +0,0 @@
-{
-  "_name_or_path": "microsoft/phi-2",
-  "architectures": [
-    "PhiForCausalLM"
-  ],
-  "attention_dropout": 0.0,
-  "bos_token_id": 50256,
-  "embd_pdrop": 0.0,
-  "eos_token_id": 50256,
-  "hidden_act": "gelu_new",
-  "hidden_size": 2560,
-  "initializer_range": 0.02,
-  "intermediate_size": 10240,
-  "layer_norm_eps": 1e-05,
-  "max_position_embeddings": 2048,
-  "model_type": "phi",
-  "num_attention_heads": 32,
-  "num_hidden_layers": 32,
-  "num_key_value_heads": 32,
-  "partial_rotary_factor": 0.4,
-  "qk_layernorm": false,
-  "resid_pdrop": 0.1,
-  "rope_scaling": null,
-  "rope_theta": 10000.0,
-  "tie_word_embeddings": false,
-  "torch_dtype": "float16",
-  "transformers_version": "4.37.0",
-  "use_cache": true,
-  "vocab_size": 51200
-}
diff --git a/services/core/models/tests/integration/parallelism/fixtures/microsoft/phi-4/config.json b/services/core/models/tests/integration/parallelism/fixtures/microsoft/phi-4/config.json
deleted file mode 100644
index ab17e0b583..0000000000
--- a/services/core/models/tests/integration/parallelism/fixtures/microsoft/phi-4/config.json
+++ /dev/null
@@ -1,32 +0,0 @@
-{
-  "_name_or_path": "microsoft/phi-4",
-  "architectures": [
-    "Phi3ForCausalLM"
-  ],
-  "attention_bias": false,
-  "attention_dropout": 0.0,
-  "bos_token_id": 100257,
-  "embd_pdrop": 0.0,
-  "eos_token_id": 100265,
-  "hidden_act": "silu",
-  "hidden_size": 5120,
-  "initializer_range": 0.02,
-  "intermediate_size": 17920,
-  "max_position_embeddings": 16384,
-  "model_type": "phi3",
-  "num_attention_heads": 40,
-  "num_hidden_layers": 40,
-  "num_key_value_heads": 10,
-  "original_max_position_embeddings": 16384,
-  "pad_token_id": 100349,
-  "resid_pdrop": 0.0,
-  "rms_norm_eps": 1e-05,
-  "rope_scaling": null,
-  "rope_theta": 250000,
-  "sliding_window": null,
-  "tie_word_embeddings": false,
-  "torch_dtype": "bfloat16",
-  "transformers_version": "4.47.0",
-  "use_cache": true,
-  "vocab_size": 100352
-}
diff --git a/services/core/models/tests/integration/parallelism/fixtures/mistralai/Devstral-Small-2505/config.json b/services/core/models/tests/integration/parallelism/fixtures/mistralai/Devstral-Small-2505/config.json
deleted file mode 100644
index dae01ddab0..0000000000
--- a/services/core/models/tests/integration/parallelism/fixtures/mistralai/Devstral-Small-2505/config.json
+++ /dev/null
@@ -1,26 +0,0 @@
-{
-  "architectures": [
-    "MistralForCausalLM"
-  ],
-  "attention_dropout": 0.0,
-  "bos_token_id": 1,
-  "eos_token_id": 2,
-  "head_dim": 128,
-  "hidden_act": "silu",
-  "hidden_size": 5120,
-  "initializer_range": 0.02,
-  "intermediate_size": 32768,
-  "max_position_embeddings": 131072,
-  "model_type": "mistral",
-  "num_attention_heads": 32,
-  "num_hidden_layers": 40,
-  "num_key_value_heads": 8,
-  "rms_norm_eps": 1e-05,
-  "rope_theta": 1000000000.0,
-  "sliding_window": null,
-  "tie_word_embeddings": false,
-  "torch_dtype": "bfloat16",
-  "transformers_version": "4.51.3",
-  "use_cache": true,
-  "vocab_size": 131072
-}
diff --git a/services/core/models/tests/integration/parallelism/fixtures/mistralai/Mistral-7B-v0.1/config.json b/services/core/models/tests/integration/parallelism/fixtures/mistralai/Mistral-7B-v0.1/config.json
deleted file mode 100644
index f4989f072a..0000000000
--- a/services/core/models/tests/integration/parallelism/fixtures/mistralai/Mistral-7B-v0.1/config.json
+++ /dev/null
@@ -1,24 +0,0 @@
-{
-  "architectures": [
-    "MistralForCausalLM"
-  ],
-  "bos_token_id": 1,
-  "eos_token_id": 2,
-  "hidden_act": "silu",
-  "hidden_size": 4096,
-  "initializer_range": 0.02,
-  "intermediate_size": 14336,
-  "max_position_embeddings": 32768,
-  "model_type": "mistral",
-  "num_attention_heads": 32,
-  "num_hidden_layers": 32,
-  "num_key_value_heads": 8,
-  "rms_norm_eps": 1e-05,
-  "rope_theta": 10000.0,
-  "sliding_window": 4096,
-  "tie_word_embeddings": false,
-  "torch_dtype": "bfloat16",
-  "transformers_version": "4.34.0.dev0",
-  "use_cache": true,
-  "vocab_size": 32000
-}
diff --git a/services/core/models/tests/integration/parallelism/fixtures/mistralai/Mixtral-8x7B-v0.1/config.json b/services/core/models/tests/integration/parallelism/fixtures/mistralai/Mixtral-8x7B-v0.1/config.json
deleted file mode 100644
index de132a80b2..0000000000
--- a/services/core/models/tests/integration/parallelism/fixtures/mistralai/Mixtral-8x7B-v0.1/config.json
+++ /dev/null
@@ -1,29 +0,0 @@
-{
-  "architectures": [
-    "MixtralForCausalLM"
-  ],
-  "attention_dropout": 0.0,
-  "bos_token_id": 1,
-  "eos_token_id": 2,
-  "hidden_act": "silu",
-  "hidden_size": 4096,
-  "initializer_range": 0.02,
-  "intermediate_size": 14336,
-  "max_position_embeddings": 32768,
-  "model_type": "mixtral",
-  "num_attention_heads": 32,
-  "num_experts_per_tok": 2,
-  "num_hidden_layers": 32,
-  "num_key_value_heads": 8,
-  "num_local_experts": 8,
-  "output_router_logits": false,
-  "rms_norm_eps": 1e-05,
-  "rope_theta": 1000000.0,
-  "router_aux_loss_coef": 0.02,
-  "sliding_window": null,
-  "tie_word_embeddings": false,
-  "torch_dtype": "bfloat16",
-  "transformers_version": "4.36.0.dev0",
-  "use_cache": true,
-  "vocab_size": 32000
-}
diff --git a/services/core/models/tests/integration/parallelism/fixtures/nvidia/Mistral-NeMo-Minitron-8B-Instruct/config.json b/services/core/models/tests/integration/parallelism/fixtures/nvidia/Mistral-NeMo-Minitron-8B-Instruct/config.json
deleted file mode 100644
index 55d6cbbe0b..0000000000
--- a/services/core/models/tests/integration/parallelism/fixtures/nvidia/Mistral-NeMo-Minitron-8B-Instruct/config.json
+++ /dev/null
@@ -1,28 +0,0 @@
-{
-  "_name_or_path": "nvidia/Mistral-NeMo-Minitron-8B-Instruct",
-  "activation": "silu",
-  "architectures": [
-    "MistralForCausalLM"
-  ],
-  "attention_dropout": 0.0,
-  "bos_token_id": 1,
-  "eos_token_id": 2,
-  "head_dim": 128,
-  "hidden_act": "silu",
-  "hidden_size": 4096,
-  "initializer_range": 0.02,
-  "intermediate_size": 11520,
-  "max_position_embeddings": 8192,
-  "model_type": "mistral",
-  "num_attention_heads": 32,
-  "num_hidden_layers": 40,
-  "num_key_value_heads": 8,
-  "rms_norm_eps": 1e-05,
-  "rope_theta": 1000000.0,
-  "sliding_window": null,
-  "tie_word_embeddings": false,
-  "torch_dtype": "bfloat16",
-  "transformers_version": "4.44.0",
-  "use_cache": true,
-  "vocab_size": 131072
-}
diff --git a/services/core/models/tests/integration/parallelism/fixtures/nvidia/NVIDIA-Nemotron-Nano-9B-v2/config.json b/services/core/models/tests/integration/parallelism/fixtures/nvidia/NVIDIA-Nemotron-Nano-9B-v2/config.json
deleted file mode 100644
index 4f56c18a20..0000000000
--- a/services/core/models/tests/integration/parallelism/fixtures/nvidia/NVIDIA-Nemotron-Nano-9B-v2/config.json
+++ /dev/null
@@ -1,56 +0,0 @@
-{
-  "architectures": [
-    "NemotronHForCausalLM"
-  ],
-  "attention_bias": false,
-  "attention_dropout": 0.0,
-  "auto_map": {
-    "AutoConfig": "configuration_nemotron_h.NemotronHConfig",
-    "AutoModelForCausalLM": "modeling_nemotron_h.NemotronHForCausalLM"
-  },
-  "bos_token_id": 1,
-  "chunk_size": 128,
-  "conv_kernel": 4,
-  "eos_token_id": 12,
-  "head_dim": 128,
-  "hidden_dropout": 0.0,
-  "hidden_size": 4480,
-  "hybrid_override_pattern": "M-M-M-MM-M-M-M*-M-M-M*-M-M-M-M*-M-M-M-M*-M-MM-M-M-M-M-M-",
-  "initializer_range": 0.02,
-  "intermediate_size": 15680,
-  "layer_norm_epsilon": 1e-05,
-  "mamba_head_dim": 80,
-  "mamba_hidden_act": "silu",
-  "mamba_num_groups": 8,
-  "mamba_num_heads": 128,
-  "mamba_proj_bias": false,
-  "mamba_state_dim": 128,
-  "max_position_embeddings": 131072,
-  "mlp_bias": false,
-  "mlp_hidden_act": "relu2",
-  "model_type": "nemotron_h",
-  "n_groups": 8,
-  "num_attention_heads": 40,
-  "num_hidden_layers": 56,
-  "num_key_value_heads": 8,
-  "num_logits_to_keep": 1,
-  "num_query_groups": 8,
-  "pad_token_id": 0,
-  "rescale_prenorm_residual": true,
-  "residual_in_fp32": false,
-  "rms_norm_eps": 1e-05,
-  "sliding_window": null,
-  "ssm_state_size": 128,
-  "tie_word_embeddings": false,
-  "time_step_floor": 0.0001,
-  "time_step_max": 0.1,
-  "time_step_min": 0.001,
-  "time_step_rank": 256,
-  "torch_dtype": "bfloat16",
-  "transformers_version": "4.51.3",
-  "use_bias": false,
-  "use_cache": true,
-  "use_conv_bias": true,
-  "use_mamba_kernels": true,
-  "vocab_size": 131072
-}
diff --git a/services/core/models/tests/integration/parallelism/fixtures/nvidia/NVIDIA-Nemotron-Nano-9B-v2/configuration_nemotron_h.py b/services/core/models/tests/integration/parallelism/fixtures/nvidia/NVIDIA-Nemotron-Nano-9B-v2/configuration_nemotron_h.py
deleted file mode 100644
index 2b5c451b4a..0000000000
--- a/services/core/models/tests/integration/parallelism/fixtures/nvidia/NVIDIA-Nemotron-Nano-9B-v2/configuration_nemotron_h.py
+++ /dev/null
@@ -1,245 +0,0 @@
-# coding=utf-8
-# Copyright 2024 AI21 Labs Ltd. and the HuggingFace Inc. team. All rights reserved.
-# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""NemotronH model configuration"""
-
-import re
-
-from transformers.configuration_utils import PretrainedConfig
-from transformers.utils import logging
-
-
-logger = logging.get_logger(__name__)
-
-
-class NemotronHConfig(PretrainedConfig):
-    r"""
-    This is the configuration class to store the configuration of a [`NemotronHModel`]. It is used to instantiate a
-    NemotronH model according to the specified arguments, defining the model architecture. Instantiating a configuration
-    with the defaults will yield a similar configuration to that of the NemotronH-v0.1 model.
-
-    [todo](todo)
-
-    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
-    documentation from [`PretrainedConfig`] for more information.
-
-
-    Args:
-        vocab_size (`int`, *optional*, defaults to 131072):
-            Vocabulary size of the NemotronH model. Defines the number of different tokens that can be represented by the
-            `inputs_ids` passed when calling [`NemotronHModel`]
-        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
-            Whether the model's input and output word embeddings should be tied. Note that this is only relevant if the
-            model has a output word embedding layer.
-        hidden_size (`int`, *optional*, defaults to 4096):
-            Dimension of the hidden representations.
-        intermediate_size (`int`, *optional*, defaults to 21504):
-            Dimension of the MLP representations.
-        num_hidden_layers (`int`, *optional*, defaults to 52):
-            Number of hidden layers in the Transformer encoder.
-        hybrid_override_pattern (`str`, *optional*, defaults to `"M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M-"`):
-            The pattern of the hybrid model. The pattern is a string of characters where each character represents M: Mamba2, *: Attention, -: MLP
-        num_attention_heads (`int`, *optional*, defaults to 32):
-            Number of attention heads for each attention layer in the Transformer encoder.
-        attention_head_dim (`int`, *optional*, defaults to 128):
-            Dimension of each attention head.
-        num_key_value_heads (`int`, *optional*, defaults to 8):
-            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
-            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
-            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used.
-        mlp_hidden_act (`str`, *optional*, defaults to "relu2"):
-            The non-linear activation function in the MLP layers.
-        attention_bias (`bool`, *optional*, defaults to `False`):
-            Whether to use bias in attention layers.
-        mlp_bias (`bool`, *optional*, defaults to `False`):
-            Whether to use bias in MLP layers.
-        use_bias (`bool`, *optional*, defaults to `False`):
-            Whether to use bias in the model.
-        initializer_range (`float`, *optional*, defaults to 0.02):
-            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
-            The epsilon used by the layer normalization layers.
-        residual_in_fp32 (`bool`, *optional*, defaults to `False`):
-            Whether or not residuals should be in `float32`. If set to `False` residuals will keep the same `dtype` as the rest of the model.
-        use_cache (`bool`, *optional*, defaults to `True`):
-            Whether or not the model should return the last key/values attentions (not used by all models). Only
-            relevant if `config.is_decoder=True`.
-        num_logits_to_keep (`int` or `None`, *optional*, defaults to 1):
-            Number of prompt logits to calculate during generation. If `None`, all logits will be calculated. If an
-            integer value, only last `num_logits_to_keep` logits will be calculated.
-        pad_token_id (`int`, *optional*, defaults to 0):
-            The id of the padding token.
-        bos_token_id (`int`, *optional*, defaults to 1):
-            The id of the "beginning-of-sequence" token.
-        eos_token_id (`int`, *optional*, defaults to 2):
-            The id of the "end-of-sequence" token.
-        sliding_window (`int`, *optional*, defaults to None):
-            Sliding window attention window size.
-        max_position_embeddings (`int`, *optional*, defaults to 4096):
-            The maximum sequence length that this model might ever be used with.
-        attention_dropout (`float`, *optional*, defaults to 0.0):
-            The dropout ratio for the attention probabilities.
-        hidden_dropout (`float`, *optional*, defaults to 0.0):
-            The dropout ratio for the hidden states.
-        use_mamba_kernels (`bool`, *optional*, defaults to `True`):
-            Flag indicating whether or not to use the fast mamba kernels. These are available only if `mamba-ssm` and
-            `causal-conv1d` are installed, and the mamba modules are running on a CUDA device.
-        ssm_state_size (`int`, *optional*, defaults to 128):
-            The dimension of the mamba state space latents.
-        mamba_num_heads (`int`, *optional*, defaults to 128):
-            Number of heads in Mamba layers.
-        mamba_n_groups (`int`, *optional*, defaults to 8):
-            Number of groups in Mamba layers.
-        mamba_head_dim (`int`, *optional*, defaults to 64):
-            Dimension of each Mamba head.
-        mamba_d_conv (`int`, *optional*, defaults to 4):
-            The size of the mamba convolution kernel.
-        mamba_expand (`int`, *optional*, defaults to 2):
-            Expanding factor used to determine the mamba intermediate size.
-        mamba_hidden_act (`str`, *optional*, defaults to "silu"):
-            The non-linear activation function in the Mamba layers.
-        mamba_dt_min (`float`, *optional*, defaults to 0.001):
-            Minimum value for the time step in Mamba.
-        mamba_dt_max (`float`, *optional*, defaults to 0.1):
-            Maximum value for the time step in Mamba.
-        mamba_dt_limit (`tuple`, *optional*, defaults to (0.0, float("inf"))):
-            Limits for the time step in Mamba.
-        mamba_dt_init_floor (`float`, *optional*, defaults to 1e-4):
-            Floor value for time step initialization in Mamba.
-        mamba_conv_bias (`bool`, *optional*, defaults to `True`):
-            Whether to use bias in the convolution layer of the mamba mixer block.
-        mamba_proj_bias (`bool`, *optional*, defaults to `False`):
-            Whether to use bias in the input and output projections of the mamba mixer block.
-        mamba_chunk_size (`int`, *optional*, defaults to 256):
-            Size of chunks for Mamba processing.
-        rescale_prenorm_residual (`bool`, *optional*, defaults to `True`):
-            Whether to rescale the pre-normalization residual connections.
-    """
-
-    model_type = "nemotron_h"
-    keys_to_ignore_at_inference = ["past_key_values"]
-
-    def __init__(
-        self,
-        vocab_size=131072,
-        tie_word_embeddings=False,
-        hidden_size=4096,
-        intermediate_size=21504,
-        num_hidden_layers=52,
-        hybrid_override_pattern="M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M-",
-        num_attention_heads=32,
-        #attention_head_dim=128,
-        head_dim=128,
-        num_key_value_heads=8,  # nemo: num_query_groups
-        mlp_hidden_act="relu2",
-        attention_bias=False,
-        mlp_bias=False,
-        use_bias=False,
-        initializer_range=0.02, # nemo: init_method_std
-        layer_norm_epsilon=1e-5, # nemo: layernorm_epsilon
-        residual_in_fp32=False,  #  Megatron Core default value
-        use_cache=True,
-        num_logits_to_keep=1,
-        pad_token_id=0,
-        bos_token_id=1,
-        eos_token_id=2,
-        sliding_window=None,
-        max_position_embeddings=4096,
-        attention_dropout=0.0,
-        hidden_dropout=0.0, # * ADDED
-        use_mamba_kernels=True,
-        ssm_state_size=128, # mamba_state_size
-        mamba_num_heads=128,
-        mamba_n_groups=8,  # nemo: mamba_ssm_ngroups = num_heads
-        mamba_head_dim=64,
-        mamba_d_conv=4,
-        mamba_expand=2,
-        mamba_hidden_act="silu",
-        mamba_dt_min=0.001,
-        mamba_dt_max=0.1,
-        mamba_dt_limit=(0.0, float("inf")),
-        mamba_dt_init_floor=1e-4,
-        mamba_conv_bias=True,
-        mamba_proj_bias=False,
-        mamba_chunk_size=256,
-        rescale_prenorm_residual=True,
-        **kwargs,
-    ):
-        self.vocab_size = vocab_size
-        self.tie_word_embeddings = tie_word_embeddings
-        self.hidden_size = hidden_size
-        self.intermediate_size = intermediate_size
-        self.num_hidden_layers = num_hidden_layers
-        self.hybrid_override_pattern = hybrid_override_pattern
-        self.num_attention_heads = num_attention_heads
-        #self.attention_head_dim = attention_head_dim
-        self.head_dim = head_dim
-        self.sliding_window = sliding_window
-        self.max_position_embeddings = max_position_embeddings
-        self.attention_dropout = attention_dropout
-        self.hidden_dropout = hidden_dropout
-
-        # Validate hybrid_override_pattern
-        # M: Mamba2, *: Attention, -: MLP
-        assert len(self.hybrid_override_pattern) == self.num_hidden_layers, "hybrid_override_pattern must have the same length as num_hidden_layers"
-        assert re.match(r"^[*-M]+$", self.hybrid_override_pattern), "hybrid_override_pattern must only contain characters 'M', '*', or '-'"
-
-        # for backward compatibility
-        if num_key_value_heads is None:
-            num_key_value_heads = num_attention_heads
-
-        self.num_key_value_heads = num_key_value_heads
-        self.mlp_hidden_act = mlp_hidden_act
-        self.attention_bias = attention_bias
-        self.mlp_bias = mlp_bias
-        self.use_bias = use_bias
-        self.initializer_range = initializer_range
-        self.layer_norm_epsilon = layer_norm_epsilon
-        self.residual_in_fp32 = residual_in_fp32
-
-        self.use_cache = use_cache
-        self.num_logits_to_keep = num_logits_to_keep
-
-        self.use_mamba_kernels = use_mamba_kernels
-        self.n_groups = mamba_n_groups
-        self.mamba_head_dim = mamba_head_dim
-        self.ssm_state_size = ssm_state_size
-        self.mamba_num_heads = mamba_num_heads
-        self.conv_kernel = mamba_d_conv
-        self.expand = mamba_expand
-        self.mamba_hidden_act = mamba_hidden_act
-        self.time_step_min = mamba_dt_min
-        self.time_step_max = mamba_dt_max
-        self.time_step_limit = mamba_dt_limit
-        self.time_step_floor = mamba_dt_init_floor
-        self.use_conv_bias = mamba_conv_bias
-        self.mamba_proj_bias = mamba_proj_bias
-        self.chunk_size = mamba_chunk_size
-        self.rescale_prenorm_residual = rescale_prenorm_residual
-
-        super().__init__(
-            pad_token_id=pad_token_id,
-            bos_token_id=bos_token_id,
-            eos_token_id=eos_token_id,
-            tie_word_embeddings=tie_word_embeddings,
-            **kwargs,
-        )
-
-    @property
-    def layers_block_type(self):
-        return [
-            "mamba" if self.hybrid_override_pattern[i] == "M" else
-            "attention" if self.hybrid_override_pattern[i] == "*" else "mlp"
-            for i in range(self.num_hidden_layers)]
\ No newline at end of file
diff --git a/services/core/models/tests/integration/parallelism/fixtures/nvidia/NVIDIA-Nemotron-Nano-9B-v2/modeling_nemotron_h.py b/services/core/models/tests/integration/parallelism/fixtures/nvidia/NVIDIA-Nemotron-Nano-9B-v2/modeling_nemotron_h.py
deleted file mode 100644
index 7ac44be3f8..0000000000
--- a/services/core/models/tests/integration/parallelism/fixtures/nvidia/NVIDIA-Nemotron-Nano-9B-v2/modeling_nemotron_h.py
+++ /dev/null
@@ -1,1643 +0,0 @@
-# coding=utf-8
-# Copyright 2024 HuggingFace Inc. team.
-# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""PyTorch NemotronH model."""
-
-import math
-from dataclasses import dataclass
-from typing import Any, Dict, Optional, Tuple, Union
-
-import torch
-import torch.utils.checkpoint
-from torch import nn
-from torch.nn import CrossEntropyLoss
-
-from transformers.activations import ACT2FN
-from transformers.cache_utils import DynamicCache  # we need __iter__ and __len__ of pkv
-from transformers.generation import GenerationMixin
-from transformers.modeling_attn_mask_utils import (
-    AttentionMaskConverter,
-)
-from transformers.modeling_utils import PreTrainedModel
-from transformers.utils import (
-    ModelOutput,
-    add_code_sample_docstrings,
-    add_start_docstrings,
-    add_start_docstrings_to_model_forward,
-    logging,
-)
-from transformers.utils.import_utils import (
-    is_causal_conv1d_available,
-    is_flash_attn_2_available,
-    is_flash_attn_greater_or_equal_2_10,
-    is_mamba_2_ssm_available,
-)
-from .configuration_nemotron_h import NemotronHConfig
-
-
-logger = logging.get_logger(__name__)
-
-
-# Copied from transformers.models.mamba.modeling_mamba2.modeling_mamba2.py with MAMBA2->NEMOTRONH,Mamba2->NemotronH
-# For Mamba2 components Mamba2->NemotronHMamba2
-if is_mamba_2_ssm_available():
-    from mamba_ssm.ops.triton.selective_state_update import selective_state_update
-    from mamba_ssm.ops.triton.ssd_combined import mamba_chunk_scan_combined, mamba_split_conv1d_scan_combined
-else:
-    mamba_chunk_scan_combined, mamba_split_conv1d_scan_combined, selective_state_update = None, None, None
-
-try:
-    #from mamba_ssm.ops.triton.layernorm_gated import RMSNorm as RMSNormGated
-    from mamba_ssm.ops.triton.layernorm_gated import rmsnorm_fn
-except ImportError:
-    raise ImportError("mamba-ssm is required by the Mamba model but cannot be imported")
-
-if is_causal_conv1d_available():
-    from causal_conv1d import causal_conv1d_fn, causal_conv1d_update
-else:
-    causal_conv1d_update, causal_conv1d_fn = None, None
-
-if is_flash_attn_2_available():
-    from transformers.modeling_flash_attention_utils import _flash_attention_forward
-
-is_fast_path_available = all(
-    (
-        selective_state_update,
-        mamba_chunk_scan_combined,
-        mamba_split_conv1d_scan_combined,
-        causal_conv1d_fn,
-        causal_conv1d_update,
-    )
-)
-
-
-_CHECKPOINT_FOR_DOC = "nvidia/Nemotron-H-56B-Base-8K"
-_CONFIG_FOR_DOC = "NemotronHConfig"
-
-
-# Helper methods for segment sum computation
-
-
-def pad_tensor_by_size(input_tensor: torch.Tensor, pad_size: int):
-    """
-    Padding x tensor with `pad_size` on the seq_len dim (dim=1)
-
-    Assumes that we only have tensors of either size 4 or 3
-    """
-    pad_shape = (0, 0, 0, 0, 0, pad_size, 0, 0) if len(input_tensor.shape) == 4 else (0, 0, 0, pad_size, 0, 0)
-
-    return torch.nn.functional.pad(input_tensor, pad_shape, mode="constant", value=0)
-
-
-def reshape_into_chunks(input_tensor, pad_size, chunk_size):
-    """
-    Padding input_tensor with `pad_size` on the seq_len dim (dim=1) and
-    simultaneously splitting it into chunk sequences.
-
-    Assumes that we only have tensors of either size 4 or 3
-    """
-    # [bsz, seq_len, ...] -> [bsz, seq_len multiple of chunk_size, ...]
-    input_tensor = pad_tensor_by_size(input_tensor, pad_size)
-
-    if len(input_tensor.shape) == 3:
-        # [bsz, seq_len multiple of chunk_size, num_heads] -> [bsz, -1, chunk_size, num_heads]
-        return input_tensor.reshape(input_tensor.shape[0], -1, chunk_size, input_tensor.shape[2])
-    else:
-        # [bsz, seq_len multiple of chunk_size, num_heads, head_dim or state_size] -> [bsz, -1, chunk_size, num_heads, head_dim or state_size]
-        return input_tensor.reshape(
-            input_tensor.shape[0], -1, chunk_size, input_tensor.shape[2], input_tensor.shape[3]
-        )
-
-
-def segment_sum(input_tensor):
-    """
-    More stable segment sum calculation. Uses cumulative sums and masking instead of direct subtractions.
-    """
-    chunk_size = input_tensor.size(-1)
-    # 1. expand input tensor to have an additional dimension and repeat along that dimension
-    # [..., chunk_size] -> [..., chunk_size, chunk_size]
-    input_tensor = input_tensor[..., None].expand(*input_tensor.size(), chunk_size)
-    # 2. create a lower triangular mask with the diagonal set to 0 to 0 out elements above diag
-    mask = torch.tril(torch.ones(chunk_size, chunk_size, device=input_tensor.device, dtype=torch.bool), diagonal=-1)
-    input_tensor = input_tensor.masked_fill(~mask, 0)
-    # 3. compute actual cumsum
-    tensor_segsum = torch.cumsum(input_tensor, dim=-2)
-
-    # 4. apply mask to keep only the lower triangular part of the cumulative sum result (incl diagonal this time)
-    mask = torch.tril(torch.ones(chunk_size, chunk_size, device=input_tensor.device, dtype=torch.bool), diagonal=0)
-    tensor_segsum = tensor_segsum.masked_fill(~mask, -torch.inf)
-    return tensor_segsum
-
-
-def apply_mask_to_padding_states(hidden_states, attention_mask):
-    """
-    Tunes out the hidden states for padding tokens, see https://github.com/state-spaces/mamba/issues/66
-    """
-    if attention_mask is not None and attention_mask.shape[1] > 1 and attention_mask.shape[0] > 1:
-        dtype = hidden_states.dtype
-        hidden_states = (hidden_states * attention_mask[:, :, None]).to(dtype)
-
-    return hidden_states
-
-# Copied from https://github.com/huggingface/transformers/blob/main/src/transformers/models/jamba/modeling_jamba.py
-class HybridMambaAttentionDynamicCache(DynamicCache):
-    """
-    A dynamic cache that can handle both the attention cache (which has a seq_len dimension) and the mamba cache
-    (which has a constant shape regardless of seq_len).
-
-    This cache has two sets of lists of tensors: `key_cache` and `value_cache` for attention cache and `conv_states`
-    and `ssm_states` for mamba cache. Each of these lists has `num_layers` tensors. The expected shape for each tensor
-    For attention layers, `key_cache` and `value_cache` have a shape of `(batch_size, num_heads, seq_len, head_dim)`,
-    while `conv_states` and `ssm_states` have a shape of `(batch_size, 0)` (empty tensors).
-    For mamba layers, `key_cache` and `value_cache` have a shape of `(batch_size, 0)` (empty tensors),
-    while `conv_states` represents the convolution state and has a shape of `(batch_size, d_inner, d_conv)`,
-    and `ssm_states` represents the ssm state and has a shape of `(batch_size, d_inner, d_state)`.
-    """
-
-    def __init__(self, config, batch_size, dtype=torch.float16, device=None):
-        super().__init__()
-        self.dtype = dtype
-        self.hybrid_override_pattern = config.hybrid_override_pattern
-        self.has_previous_state = False  # only used by mamba
-        #intermediate_size = config.expand * config.hidden_size
-        intermediate_size = config.mamba_num_heads * config.mamba_head_dim
-        ssm_state_size = config.ssm_state_size
-        conv_kernel_size = config.conv_kernel
-        self.conv_states = []
-        self.ssm_states = []
-        self.transformer_layers = []
-        for i in range(config.num_hidden_layers):
-            if self.hybrid_override_pattern[i] == "M":
-                # Mamba layer
-                self.conv_states += [
-                    torch.zeros(batch_size, intermediate_size, conv_kernel_size, device=device, dtype=dtype)
-                ]
-                self.ssm_states += [
-                    torch.zeros(batch_size, intermediate_size, ssm_state_size, device=device, dtype=dtype)
-                ]
-            else:
-                # Attention or MLP layer
-                self.conv_states += [torch.tensor([[]] * batch_size, device=device)]
-                self.ssm_states += [torch.tensor([[]] * batch_size, device=device)]
-                self.transformer_layers.append(i)
-
-        self.key_cache = [torch.tensor([[]] * batch_size, device=device) for _ in range(config.num_hidden_layers)]
-        self.value_cache = [torch.tensor([[]] * batch_size, device=device) for _ in range(config.num_hidden_layers)]
-
-    def update(
-        self,
-        key_states: torch.Tensor,
-        value_states: torch.Tensor,
-        layer_idx: int,
-        cache_kwargs: Optional[Dict[str, Any]] = None,
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
-        # Update the cache
-        if self.key_cache[layer_idx].shape[-1] == 0:
-            self.key_cache[layer_idx] = key_states
-            self.value_cache[layer_idx] = value_states
-        else:
-            self.key_cache[layer_idx] = torch.cat([self.key_cache[layer_idx], key_states], dim=2)
-            self.value_cache[layer_idx] = torch.cat([self.value_cache[layer_idx], value_states], dim=2)
-
-        return self.key_cache[layer_idx], self.value_cache[layer_idx]
-
-    def reorder_cache(self, beam_idx: torch.LongTensor):
-        """Reorders the cache for beam search, given the selected beam indices."""
-        for layer_idx in range(len(self.key_cache)):
-            device = self.key_cache[layer_idx].device
-            self.key_cache[layer_idx] = self.key_cache[layer_idx].index_select(0, beam_idx.to(device))
-            device = self.value_cache[layer_idx].device
-            self.value_cache[layer_idx] = self.value_cache[layer_idx].index_select(0, beam_idx.to(device))
-
-            device = self.conv_states[layer_idx].device
-            self.conv_states[layer_idx] = self.conv_states[layer_idx].index_select(0, beam_idx.to(device))
-            device = self.ssm_states[layer_idx].device
-            self.ssm_states[layer_idx] = self.ssm_states[layer_idx].index_select(0, beam_idx.to(device))
-
-    def get_seq_length(self, layer_idx: Optional[int] = 0) -> int:
-        """Returns the sequence length of the cached states. A layer index can be optionally passed."""
-        # take any layer that contains cache and not empty tensor
-        layer_idx = self.transformer_layers[0] if layer_idx not in self.transformer_layers else layer_idx
-        if len(self.key_cache) <= layer_idx:
-            return 0
-        return self.key_cache[layer_idx].shape[-2]
-
-    def to_legacy_cache(self) -> Tuple[Tuple[torch.Tensor], Tuple[torch.Tensor]]:
-        raise NotImplementedError("HybridMambaAttentionDynamicCache does not have a legacy cache equivalent.")
-
-    @classmethod
-    def from_legacy_cache(cls, past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None) -> "DynamicCache":
-        raise NotImplementedError("HybridMambaAttentionDynamicCache does not have a legacy cache equivalent.")
-
-    # Copied from modeling_mamba2.py
-    def update_conv_state(
-        self, layer_idx: int, new_conv_state: torch.Tensor, cache_init: bool = False
-    ) -> torch.Tensor:
-        if cache_init:
-            self.conv_states[layer_idx] = new_conv_state.to(self.conv_states.device)
-        else:
-            self.conv_states[layer_idx] = self.conv_states[layer_idx].roll(shifts=-1, dims=-1)
-            self.conv_states[layer_idx][:, :, -1] = new_conv_state[:, 0, :].to(self.conv_states.device)
-        return self.conv_states[layer_idx]
-
-    def update_ssm_state(self, layer_idx: int, new_ssm_state: torch.Tensor):
-        self.ssm_states[layer_idx] = new_ssm_state.to(self.ssm_states.device)
-        return self.ssm_states[layer_idx]
-
-    def reset(self):
-        self.conv_states.zero_()
-        self.ssm_states.zero_()
-
-class MambaRMSNormGated(torch.nn.Module):
-    def __init__(self, hidden_size, group_size, eps=1e-5):
-        super().__init__()
-        self.weight = nn.Parameter(torch.ones(hidden_size))
-        self.variance_epsilon = eps
-        self.group_size = group_size
-
-    # jan28b version
-    def forward(self, hidden_states, gate=None):
-        return rmsnorm_fn(x=hidden_states,
-                          weight=self.weight,
-                          bias=None, # No bias
-                          z=gate,
-                          eps=self.variance_epsilon,
-                          group_size=self.group_size,
-                          norm_before_gate=False
-        )
-
-class NemotronHMamba2Mixer(nn.Module):
-    """
-    Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`.
-    A, D are input independent (see Mamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective)
-    ∆, B, C are input-dependent (this is a key difference between Mamba and the linear time invariant S4,
-    and is why Mamba is called **selective** state spaces)
-    """
-
-    def __init__(self, config: NemotronHConfig, layer_idx: int):
-        super().__init__()
-        self.num_heads = config.mamba_num_heads
-        self.hidden_size = config.hidden_size
-        self.ssm_state_size = config.ssm_state_size
-        self.conv_kernel_size = config.conv_kernel
-        self.intermediate_size = config.mamba_num_heads * config.mamba_head_dim
-        self.layer_idx = layer_idx
-        self.use_conv_bias = config.use_conv_bias
-        self.activation = config.mamba_hidden_act
-        self.act = ACT2FN[config.mamba_hidden_act]
-
-        self.layer_norm_epsilon = config.layer_norm_epsilon
-
-        self.n_groups = config.n_groups
-        self.head_dim = config.mamba_head_dim
-        self.chunk_size = config.chunk_size
-
-        self.time_step_limit = config.time_step_limit
-        self.time_step_min = config.time_step_min
-        self.time_step_max = config.time_step_max
-
-        self.conv_dim = self.intermediate_size + 2 * self.n_groups * self.ssm_state_size
-        self.conv1d = nn.Conv1d(
-            in_channels=self.conv_dim,
-            out_channels=self.conv_dim,
-            bias=config.use_conv_bias,
-            kernel_size=config.conv_kernel,
-            groups=self.conv_dim,
-            padding=config.conv_kernel - 1,
-        )
-
-        # projection of the input hidden states
-        projection_size = self.intermediate_size + self.conv_dim + self.num_heads
-        self.in_proj = nn.Linear(
-            self.hidden_size,
-            projection_size,
-            bias=config.use_bias,
-        )
-        # selective projection used to make dt, B and C input dependant
-
-        # time step projection (discretization)
-        # instantiate once and copy inv_dt in init_weights of PretrainedModel
-        self.dt_bias = nn.Parameter(torch.ones(self.num_heads))
-
-        # S4D real initialization. These are not discretized!
-        # The core is to load them, compute the discrete states, then write the updated state. Keeps the memory bounded
-        A = torch.arange(1, self.num_heads + 1)
-        self.A_log = nn.Parameter(torch.log(A))
-        self.A_log._no_weight_decay = True
-        self.norm = MambaRMSNormGated(self.intermediate_size, eps=self.layer_norm_epsilon, group_size=self.intermediate_size // self.n_groups)
-        self.D = nn.Parameter(torch.ones(self.num_heads))
-        self.D._no_weight_decay = True
-
-        self.out_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.use_bias)
-        self.use_bias = config.use_bias
-
-        if not is_fast_path_available:
-            logger.warning_once(
-                "The fast path is not available because on of `(selective_state_update, causal_conv1d_fn, causal_conv1d_update)`"
-                " is None. Falling back to the naive implementation. To install follow https://github.com/state-spaces/mamba/#installation and"
-                " https://github.com/Dao-AILab/causal-conv1d"
-            )
-
-    def cuda_kernels_forward(
-        self,
-        hidden_states: torch.Tensor,
-        cache_params: Optional[HybridMambaAttentionDynamicCache] = None,
-        cache_position: Optional[torch.LongTensor] = None,
-        attention_mask: Optional[torch.Tensor] = None,
-    ):
-        # 1. Gated MLP's linear projection
-        hidden_states = apply_mask_to_padding_states(hidden_states, attention_mask)
-        projected_states = self.in_proj(hidden_states)
-
-        # Set up dimensions for reshapes later
-        batch_size, seq_len, _ = hidden_states.shape
-        groups_time_state_size = self.n_groups * self.ssm_state_size
-        d_mlp = (
-            projected_states.shape[-1]
-            - 2 * self.intermediate_size
-            - 2 * self.n_groups * self.ssm_state_size
-            - self.num_heads
-        ) // 2
-
-        # Single step calculations via cache
-        if cache_params is not None and cache_position is not None and cache_position[0] > 0:
-            _, _, gate, hidden_states_B_C, dt = projected_states.squeeze(1).split(
-                [d_mlp, d_mlp, self.intermediate_size, self.conv_dim, self.num_heads], dim=-1
-            )
-
-            # 2. Convolution sequence transformation
-            hidden_states_B_C = causal_conv1d_update(
-                hidden_states_B_C,
-                cache_params.conv_states[self.layer_idx],
-                self.conv1d.weight.squeeze(1),
-                self.conv1d.bias,
-                self.activation,
-            )
-
-            hidden_states, B, C = torch.split(
-                hidden_states_B_C,
-                [self.intermediate_size, groups_time_state_size, groups_time_state_size],
-                dim=-1,
-            )
-
-            # 3. SSM transformation
-            A = -torch.exp(self.A_log.float())  # (nheads,)
-            A = A[:, None, ...][:, :, None].expand(-1, self.head_dim, self.ssm_state_size).to(dtype=torch.float32)
-            dt = dt[:, :, None].expand(-1, -1, self.head_dim)
-            dt_bias = self.dt_bias[:, None, ...].expand(-1, self.head_dim)
-            D = self.D[:, None, ...].expand(-1, self.head_dim)
-            B = B.view(batch_size, self.n_groups, B.shape[1] // self.n_groups)
-            C = C.view(batch_size, self.n_groups, C.shape[1] // self.n_groups)
-            hidden_states_reshaped = hidden_states.view(batch_size, self.num_heads, self.head_dim)
-            hidden_states = selective_state_update(
-                cache_params.ssm_states[self.layer_idx],
-                hidden_states_reshaped,
-                dt,
-                A,
-                B,
-                C,
-                D,
-                z=None,
-                dt_bias=dt_bias,
-                dt_softplus=True,
-            )
-            hidden_states = hidden_states.view(batch_size, self.num_heads * self.head_dim)
-            hidden_states = self.norm(hidden_states, gate)
-
-            # 4. Final linear projection
-            out = self.out_proj(hidden_states)[:, None, ...]
-
-        # Fused calculations or step by step if no initialized cache is found
-        else:
-            A = -torch.exp(self.A_log.float())  # (num_heads) or (intermediate_size, state_size)
-            dt_limit_kwargs = {} if self.time_step_limit == (0.0, float("inf")) else {"dt_limit": self.time_step_limit}
-
-            # 2-4. Fused kernel for conv1d, SSM, and the final projection
-            if self.training and cache_params is None:
-                out = mamba_split_conv1d_scan_combined(
-                    projected_states,
-                    self.conv1d.weight.squeeze(1),
-                    self.conv1d.bias,
-                    self.dt_bias,
-                    A,
-                    D=self.D,
-                    chunk_size=self.chunk_size,
-                    seq_idx=None,  # was seq_idx
-                    activation=self.activation,
-                    rmsnorm_weight=self.norm.weight,
-                    rmsnorm_eps=self.norm.variance_epsilon,
-                    outproj_weight=self.out_proj.weight,
-                    outproj_bias=self.out_proj.bias,
-                    headdim=self.head_dim,
-                    ngroups=self.n_groups,
-                    norm_before_gate=False,
-                    return_final_states=False,
-                    **dt_limit_kwargs,
-                )
-
-            else:
-                _, _, gate, hidden_states_B_C, dt = projected_states.split(
-                    [d_mlp, d_mlp, self.intermediate_size, self.conv_dim, self.num_heads], dim=-1
-                )
-
-                # 2. Convolution sequence transformation
-                # Init cache
-                if cache_params is not None:
-                    hidden_states_B_C_transposed = hidden_states_B_C.transpose(1, 2)
-                    conv_states = nn.functional.pad(
-                        hidden_states_B_C_transposed,
-                        (cache_params.conv_kernel_size - hidden_states_B_C_transposed.shape[-1], 0),
-                    )
-                    cache_params.update_conv_state(
-                        layer_idx=self.layer_idx, new_conv_state=conv_states, cache_init=True
-                    )
-
-                if self.activation not in ["silu", "swish"]:
-                    hidden_states_B_C = self.act(
-                        self.conv1d(hidden_states_B_C.transpose(1, 2))[..., :seq_len].transpose(1, 2)
-                    )
-                else:
-                    hidden_states_B_C = causal_conv1d_fn(
-                        x=hidden_states_B_C.transpose(1, 2),
-                        weight=self.conv1d.weight.squeeze(1),
-                        bias=self.conv1d.bias,
-                        activation=self.activation,
-                    ).transpose(1, 2)
-                hidden_states_B_C = apply_mask_to_padding_states(hidden_states_B_C, attention_mask)
-                hidden_states, B, C = torch.split(
-                    hidden_states_B_C,
-                    [self.intermediate_size, groups_time_state_size, groups_time_state_size],
-                    dim=-1,
-                )
-
-                # 3. SSM transformation
-                scan_output, ssm_state = mamba_chunk_scan_combined(
-                    hidden_states.view(batch_size, seq_len, -1, self.head_dim),
-                    dt,
-                    A,
-                    B.view(batch_size, seq_len, self.n_groups, -1),
-                    C.view(batch_size, seq_len, self.n_groups, -1),
-                    chunk_size=self.chunk_size,
-                    D=self.D,
-                    z=None,
-                    seq_idx=None,
-                    return_final_states=True,
-                    dt_bias=self.dt_bias,
-                    dt_softplus=True,
-                    **dt_limit_kwargs,
-                )
-
-                # Init cache
-                if ssm_state is not None and cache_params is not None:
-                    cache_params.update_ssm_state(layer_idx=self.layer_idx, new_ssm_state=ssm_state)
-
-                scan_output = scan_output.view(batch_size, seq_len, -1)
-
-                # Multiply "gate" branch and apply extra normalization layer
-                scan_output = self.norm(scan_output, gate)
-
-                # 4. Final linear projection
-                out = self.out_proj(scan_output)
-        return out
-
-    # fmt: off
-    def torch_forward(self, input_states, cache_params: Optional[HybridMambaAttentionDynamicCache]=None, cache_position:Optional[torch.LongTensor]=None, attention_mask: Optional[torch.Tensor]=None):
-        batch_size, seq_len, _ = input_states.shape
-        dtype = input_states.dtype
-
-        # 1. Gated MLP's linear projection
-        input_states = apply_mask_to_padding_states(input_states, attention_mask)
-        projected_states = self.in_proj(input_states)
-        d_mlp = (projected_states.shape[-1] - 2 * self.intermediate_size - 2 * self.n_groups * self.ssm_state_size-self.num_heads) // 2
-        _, _, gate, hidden_states_B_C, dt = projected_states.split(
-                [d_mlp, d_mlp, self.intermediate_size,  self.conv_dim, self.num_heads], dim=-1
-        )
-
-        # 2. Convolution sequence transformation
-        if cache_params is not None and cache_position is not None and cache_position[0] > 0:
-            cache_params.update_conv_state(layer_idx=self.layer_idx, new_conv_state=hidden_states_B_C, cache_init=False)
-
-            # We need to guarantee that anything regarding the cache is on the same device
-            conv_states = cache_params.conv_states[self.layer_idx].to(device=self.conv1d.weight.device)
-
-            hidden_states_B_C = torch.sum(
-                conv_states * self.conv1d.weight.squeeze(1), dim=-1
-            )
-            if self.use_conv_bias:
-                hidden_states_B_C = hidden_states_B_C + self.conv1d.bias
-            hidden_states_B_C = self.act(hidden_states_B_C)
-        else:
-            # Init cache
-            if cache_params is not None:
-                hidden_states_B_C_transposed = hidden_states_B_C.transpose(1, 2)
-                conv_states = nn.functional.pad(
-                    hidden_states_B_C_transposed, (cache_params.conv_kernel_size - hidden_states_B_C_transposed.shape[-1], 0)
-                )
-                cache_params.update_conv_state(layer_idx=self.layer_idx, new_conv_state=conv_states, cache_init=True)
-
-            hidden_states_B_C = self.act(self.conv1d(hidden_states_B_C.transpose(1, 2))[..., :seq_len].transpose(1, 2))
-
-        hidden_states_B_C = apply_mask_to_padding_states(hidden_states_B_C, attention_mask)
-        hidden_states, B, C = torch.split(
-            hidden_states_B_C,
-            [self.intermediate_size, self.n_groups * self.ssm_state_size, self.n_groups * self.ssm_state_size],
-            dim=-1
-        )
-
-        # 3. SSM transformation
-        A = -torch.exp(self.A_log.float())                            # [num_heads]
-        if cache_params is not None and cache_position is not None and cache_position[0] > 0:
-            # We need to guarantee that anything regarding the cache is on the same device
-            cache_device = cache_params.ssm_states.device
-
-            # Note: there is no need to pad parameter matrices here, as there is just one new token
-            # for batched generation
-            dt = dt[:, 0, :][:, None, ...]
-            dt = dt.transpose(1, 2).expand(batch_size, dt.shape[-1], self.head_dim)
-            # [num_heads] -> [num_heads, head_dim]
-            dt_bias = self.dt_bias[..., None].expand(self.dt_bias.shape[0], self.head_dim)
-
-            dt = torch.nn.functional.softplus(dt + dt_bias.to(dt.dtype))
-            dt = torch.clamp(dt, self.time_step_limit[0], self.time_step_limit[1])
-            A = A[..., None, None].expand(self.num_heads, self.head_dim, self.ssm_state_size).to(dtype=torch.float32)
-            # [bsz, num_heads, head_dim, state_size]
-            dA = (torch.exp(dt[..., None] * A)).to(device=cache_device)
-
-            # Discretize B
-            # [bsz, n_groups * state_size] -> [bsz, n_groups, 1, state_size] ->
-            # -> [bsz, n_groups, group to head repetition factor, state_size] -> [bsz, num_heads, state_size]
-            B = B.reshape(batch_size, self.n_groups, -1)[..., None, :]
-            B = B.expand(batch_size, self.n_groups, self.num_heads // self.n_groups, B.shape[-1]).contiguous()
-            B = B.reshape(batch_size, -1, B.shape[-1])
-            # [bsz, num_heads, head_dim, state_size]
-            dB = dt[..., None] * B[..., None, :]
-
-            # Discretize x into dB
-            # [bsz, intermediate_size] -> [bsz, num_heads, head_dim]
-            hidden_states = hidden_states.reshape(batch_size, -1, self.head_dim)
-            dBx = (dB * hidden_states[..., None]).to(device=cache_device)
-
-            # State calculation
-            cache_params.update_ssm_state(
-                layer_idx=self.layer_idx,
-                new_ssm_state=cache_params.ssm_states[self.layer_idx] * dA + dBx
-            )
-
-            # Subsequent output
-            # [bsz, n_groups * state_size] -> [bsz, num_heads, state_size]
-            C = C.reshape(batch_size, self.n_groups, -1)[..., None, :]
-            C = C.expand(batch_size, self.n_groups, self.num_heads // self.n_groups, C.shape[-1]).contiguous()
-            C = C.reshape(batch_size, -1, C.shape[-1])
-            # [bsz, num_heads, head_dim]
-
-            ssm_states = cache_params.ssm_states[self.layer_idx].to(device=C.device, dtype=C.dtype)  # Shape: [b, h, d, n]
-            # Reshape ssm_states to merge the first two dimensions
-            ssm_states_reshaped = ssm_states.view(batch_size * self.num_heads, self.head_dim, self.ssm_state_size)  # Shape: [b*h, d, n]
-            C_reshaped = C.view(batch_size * self.num_heads, self.ssm_state_size, 1)  # Shape: [b*h, n, 1]
-            y = torch.bmm(ssm_states_reshaped, C_reshaped)
-            y = y.view(batch_size, self.num_heads, self.head_dim)
-
-            # D skip connection
-            # [num_heads] -> [num_heads, head_dim]
-            D = self.D[..., None].expand(self.D.shape[0], self.head_dim)
-            y = (y + hidden_states * D).to(y.dtype)
-
-            # [bsz, num_heads, head_dim] -> [bsz, 1, intermediate_size]
-            y = y.reshape(batch_size, -1)[:, None, ...]
-        else:
-            # begin ssd naive implementation without einsums
-            dt = nn.functional.softplus(dt + self.dt_bias)
-            dt = torch.clamp(dt, self.time_step_limit[0], self.time_step_limit[1])
-            hidden_states = hidden_states.reshape(batch_size, seq_len, -1, self.head_dim).float()
-            B = B.reshape(batch_size, seq_len, -1, self.ssm_state_size).float()
-            C = C.reshape(batch_size, seq_len, -1, self.ssm_state_size).float()
-            B = B.repeat(1, 1, self.num_heads // self.n_groups, 1)
-            C = C.repeat(1, 1, self.num_heads // self.n_groups, 1)
-            pad_size = (self.chunk_size - seq_len % self.chunk_size) % self.chunk_size
-
-            D_residual = self.D[..., None] * pad_tensor_by_size(hidden_states, pad_size)
-
-            # Discretize x and A
-            hidden_states = hidden_states * dt[..., None]
-            A = A.to(hidden_states.dtype) * dt
-
-            # Rearrange into blocks/chunks
-            hidden_states, A, B, C = [reshape_into_chunks(t, pad_size, self.chunk_size) for t in (hidden_states, A, B, C)]
-
-            # [bsz, -1, chunk_size, num_heads] -> [bsz, num_heads, -1, chunk_size]
-            A = A.permute(0, 3, 1, 2)
-            A_cumsum = torch.cumsum(A, dim=-1)
-
-            # 1. Compute the output for each intra-chunk (diagonal blocks)
-            # This is the analog of a causal mask
-            L = torch.exp(segment_sum(A))
-
-            # Contraction of C and B to get G (attention-weights like)
-            G_intermediate = C[:, :, :, None, :, :] * B[:, :, None, :, :, :]  # shape: (b, c, l, s, h, n)
-            G = G_intermediate.sum(dim=-1)  # shape: (b, c, l, s, h)
-
-            # Compute M, equivalent to applying attention mask to weights
-            M_intermediate = G[..., None] * L.permute(0, 2, 3, 4, 1)[..., None]
-            M = M_intermediate.sum(dim=-1)
-
-            # Compute Y_diag (apply to values)
-            Y_diag = (M[..., None] * hidden_states[:, :, None]).sum(dim=3)
-
-            # 2. Compute the state for each intra-chunk
-            # (right term of low-rank factorization of off-diagonal blocks; B terms)
-            decay_states = torch.exp((A_cumsum[:, :, :, -1:] - A_cumsum))
-            B_decay = B * decay_states.permute(0, -2, -1, 1)[..., None]
-            states = (B_decay[..., None, :] * hidden_states[..., None]).sum(dim=2)
-
-            # 3. Compute the inter-chunk SSM recurrence; produces correct SSM states at chunk boundaries
-            # (middle term of factorization of off-diag blocks; A terms)
-            if cache_params is not None and cache_position is not None and cache_position[0] > 0:
-                previous_states = cache_params.ssm_states[self.layer_idx][:, None, ...].to(device=states.device)
-            else:
-                previous_states = torch.zeros_like(states[:, :1])
-            states = torch.cat([previous_states, states], dim=1)
-            decay_chunk = torch.exp(segment_sum(nn.functional.pad(A_cumsum[:, :, :, -1], (1, 0))))
-            decay_chunk = decay_chunk.transpose(1, 3)
-            new_states = (decay_chunk[..., None, None] * states[:, :, None, ...]).sum(dim=1)
-            states, ssm_state = new_states[:, :-1], new_states[:, -1]
-
-            # 4. Compute state -> output conversion per chunk
-            # (left term of low-rank factorization of off-diagonal blocks; C terms)
-            state_decay_out = torch.exp(A_cumsum)
-            C_times_states = (C[..., None, :] * states[:, :, None, ...])
-            state_decay_out_permuted = state_decay_out.permute(0, 2, 3, 1)
-            Y_off = (C_times_states.sum(-1) * state_decay_out_permuted[..., None])
-
-            # Add output of intra-chunk and inter-chunk terms (diagonal and off-diagonal blocks)
-            y = Y_diag + Y_off
-            # [bsz, -1, self.chunk_size, num_heads, head_dim] -> [bsz, (padded) seq_len, num_heads, head_dim]
-            y = y.reshape(batch_size, -1, self.num_heads, self.head_dim)
-
-            y = y + D_residual
-            # Cutting off padded chunks
-            if pad_size > 0:
-                y = y[:, :seq_len, :, :]
-            y = y.reshape(batch_size, seq_len, -1)
-
-            # Init cache
-            if ssm_state is not None and cache_params is not None:
-                cache_params.update_ssm_state(layer_idx=self.layer_idx, new_ssm_state=ssm_state)
-
-        scan_output = self.norm(y, gate)
-
-        # end ssd naive
-
-        # 4. Final linear projection
-        contextualized_states = self.out_proj(scan_output.to(dtype))  # [batch, seq_len, hidden_size]
-        return contextualized_states
-    # fmt: on
-
-    def forward(
-        self,
-        hidden_states,
-        cache_params: Optional[HybridMambaAttentionDynamicCache] = None,
-        cache_position: Optional[torch.LongTensor] = None,
-        attention_mask: Optional[torch.Tensor] = None,
-    ):
-        if is_fast_path_available and "cuda" in self.in_proj.weight.device.type:
-            return self.cuda_kernels_forward(hidden_states, cache_params, cache_position, attention_mask)
-        dtype = hidden_states.dtype
-        if attention_mask is not None and attention_mask.shape[1] > 1 and attention_mask.shape[0] > 1:
-            # tune out hidden states for pad tokens, see https://github.com/state-spaces/mamba/issues/66
-            hidden_states = (hidden_states * attention_mask[:, :, None]).to(dtype)
-
-        return self.torch_forward(hidden_states, cache_params, cache_position, attention_mask)
-
-
-class NemotronHRMSNorm(nn.Module):
-    def __init__(self, hidden_size, eps=1e-6):
-        """
-        NemotronHRMSNorm is equivalent to T5LayerNorm and LlamaRMSNorm
-        """
-        super().__init__()
-        self.weight = nn.Parameter(torch.ones(hidden_size))
-        self.variance_epsilon = eps
-
-    def forward(self, hidden_states):
-        input_dtype = hidden_states.dtype
-        hidden_states = hidden_states.to(torch.float32)
-        variance = hidden_states.pow(2).mean(-1, keepdim=True)
-        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
-        # Weights are in float32
-        return (self.weight.to(torch.float32) * hidden_states).to(input_dtype)
-
-class NemotronHBlock(nn.Module):
-    def __init__(self, config, layer_idx):
-        super().__init__()
-        self.config = config
-        self.layer_idx = layer_idx
-        self.residual_in_fp32 = config.residual_in_fp32
-        self.norm = NemotronHRMSNorm(config.hidden_size, eps=config.layer_norm_epsilon)
-
-        # M: Mamba2, *: Attention, -: MLP
-        self.block_type = config.layers_block_type[layer_idx]
-        if self.block_type == "mamba":
-            self.mixer = NemotronHMamba2Mixer(config, layer_idx=layer_idx)
-        elif self.block_type == "attention":
-            self.mixer = NEMOTRONH_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx=layer_idx)
-        elif self.block_type == "mlp":
-            self.mixer = NemotronHMLP(config, layer_idx=layer_idx)
-        else:
-            raise ValueError(f"Invalid layer pattern {config.hybrid_override_pattern[layer_idx]}")
-
-    def forward(
-        self,
-        hidden_states,
-        cache_params: Optional[HybridMambaAttentionDynamicCache] = None,
-        cache_position: Optional[torch.LongTensor] = None,
-        attention_mask: Optional[torch.Tensor] = None,
-    ):
-        with torch.cuda.stream(torch.cuda.default_stream(hidden_states.device)):
-            # * Use torch.cuda.stream() to avoid NaN issues when using multiple GPUs
-            residual = hidden_states
-            hidden_states = self.norm(hidden_states.to(dtype=self.norm.weight.dtype))
-            if self.residual_in_fp32:
-                residual = residual.to(torch.float32)
-
-            if self.block_type == "mamba":
-                hidden_states = self.mixer(
-                    hidden_states, cache_params=cache_params, cache_position=cache_position
-                )
-            elif self.block_type == "attention":
-                hidden_states = self.mixer(
-                    hidden_states, cache_position=cache_position
-                )
-                hidden_states = hidden_states[0]
-            elif self.block_type == "mlp":
-                hidden_states = self.mixer(
-                    hidden_states
-                )
-            else:
-                raise ValueError(f"Invalid block_type: {self.block_type}")
-
-            hidden_states = residual + hidden_states
-            return hidden_states
-
-
-# Copied from transformers.models.nemotron.modeling_nemotron Nemotron->NemotronH
-class NemotronHMLP(nn.Module):
-    def __init__(self, config, layer_idx: Optional[int] = None):
-        super().__init__()
-        self.config = config
-        self.layer_idx = layer_idx
-        if layer_idx is None:
-            logger.warning_once(
-                f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
-                "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
-                "when creating this class."
-            )
-        self.hidden_size = config.hidden_size
-        #intermediate_size = config.expand * config.hidden_size
-        self.intermediate_size = config.intermediate_size
-        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
-        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.mlp_bias)
-        self.act_fn = ACT2FN[config.mlp_hidden_act]
-
-    def forward(self, x):
-        return self.down_proj(self.act_fn(self.up_proj(x)))
-
-
-# Copied from transformers.models.llama.modeling_llama.repeat_kv
-def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
-    """
-    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
-    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
-    """
-    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
-    if n_rep == 1:
-        return hidden_states
-    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
-    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
-
-
-class NemotronHAttention(nn.Module):
-    """Multi-headed attention from 'Attention Is All You Need' paper"""
-
-    def __init__(self, config: NemotronHConfig, layer_idx: Optional[int] = None):
-        super().__init__()
-        self.config = config
-        self.layer_idx = layer_idx
-        if layer_idx is None:
-            logger.warning_once(
-                f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
-                "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
-                "when creating this class."
-            )
-
-        self.attention_dropout = config.attention_dropout
-        self.hidden_size = config.hidden_size
-        self.num_heads = config.num_attention_heads
-        if config.head_dim is not None:
-            self.head_dim = config.head_dim
-        else:
-            self.head_dim = config.hidden_size // config.num_attention_heads
-        self.num_key_value_heads = config.num_key_value_heads
-        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
-        self.max_position_embeddings = config.max_position_embeddings
-        self.is_causal = True
-
-        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias)
-        self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
-        self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
-        self.o_proj = nn.Linear(self.head_dim * self.num_heads, self.hidden_size, bias=config.attention_bias)
-
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        # position_embeddings: Tuple[torch.Tensor, torch.Tensor], #TODO
-        attention_mask: Optional[torch.Tensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_value: Optional[HybridMambaAttentionDynamicCache] = None,
-        output_attentions: bool = False,
-        use_cache: bool = False,
-        cache_position: Optional[torch.LongTensor] = None,
-    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
-        bsz, q_len, _ = hidden_states.size()
-
-        query_states = self.q_proj(hidden_states)
-        key_states = self.k_proj(hidden_states)
-        value_states = self.v_proj(hidden_states)
-
-        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
-        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
-        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
-
-        if past_key_value is not None:
-            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx)
-
-        key_states = repeat_kv(key_states, self.num_key_value_groups)
-        value_states = repeat_kv(value_states, self.num_key_value_groups)
-
-        causal_mask = attention_mask
-        if attention_mask is not None:  # no matter the length, we just slice it
-            causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
-
-        if query_states.device.type == "cuda" and attention_mask is not None:
-            query_states = query_states.contiguous()
-            key_states = key_states.contiguous()
-            value_states = value_states.contiguous()
-
-        is_causal = True if causal_mask is None and q_len > 1 else False
-
-        attn_output = torch.nn.functional.scaled_dot_product_attention(
-            query_states,
-            key_states,
-            value_states,
-            attn_mask=causal_mask,
-            dropout_p=self.attention_dropout if self.training else 0.0,
-            is_causal=is_causal,
-        )
-        attn_output = attn_output.transpose(1, 2).contiguous()
-        #attn_output = attn_output.view(bsz, q_len, self.hidden_size)
-        attn_output = attn_output.view(bsz, q_len, self.num_heads * self.head_dim)
-
-        attn_output = self.o_proj(attn_output)
-
-        return attn_output, None, past_key_value
-
-
-# Adapted from transformers.models.mistral.modeling_mistral.MistralFlashAttention2 with Mistral->Jamba
-#class JambaFlashAttention2(JambaAttention):
-class NemotronHFlashAttention2(NemotronHAttention):
-    """
-    Jamba flash attention module. This module inherits from `JambaAttention` as the weights of the module stays
-    untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
-    flash attention and deal with padding tokens in case the input contains any of them.
-    """
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-
-        # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
-        # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
-        # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
-        self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
-
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        attention_mask: Optional[torch.Tensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_value: Optional[HybridMambaAttentionDynamicCache] = None,
-        output_attentions: bool = False,
-        use_cache: bool = False,
-        cache_position: Optional[torch.LongTensor] = None,
-        **kwargs,
-    ):
-        bsz, q_len, _ = hidden_states.size()
-
-        query_states = self.q_proj(hidden_states)
-        key_states = self.k_proj(hidden_states)
-        value_states = self.v_proj(hidden_states)
-
-        # Flash attention requires the input to have the shape
-        # batch_size x seq_length x head_dim x hidden_dim
-        # therefore we just need to keep the original shape
-        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim)
-        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
-        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
-
-        if past_key_value is not None:
-            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx)
-
-        # repeat k/v heads if n_kv_heads < n_heads
-        key_states = repeat_kv(key_states, self.num_key_value_groups)
-        value_states = repeat_kv(value_states, self.num_key_value_groups)
-        dropout_rate = 0.0 if not self.training else self.attention_dropout
-
-        # In PEFT, usually we cast the layer norms in float32 for training stability reasons
-        # therefore the input hidden states gets silently casted in float32. Hence, we need
-        # cast them back in float16 just to be sure everything works as expected.
-        input_dtype = query_states.dtype
-        if input_dtype == torch.float32:
-            if torch.is_autocast_enabled():
-                target_dtype = torch.get_autocast_gpu_dtype()
-            # Handle the case where the model is quantized
-            elif hasattr(self.config, "_pre_quantization_dtype"):
-                target_dtype = self.config._pre_quantization_dtype
-            else:
-                target_dtype = self.q_proj.weight.dtype
-
-            logger.warning_once(
-                f"The input hidden states seems to be silently casted in float32, this might be related to"
-                f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
-                f" {target_dtype}."
-            )
-
-            query_states = query_states.to(target_dtype)
-            key_states = key_states.to(target_dtype)
-            value_states = value_states.to(target_dtype)
-
-        # Reashape to the expected shape for Flash Attention
-        key_states = key_states.transpose(1, 2)
-        value_states = value_states.transpose(1, 2)
-
-        attn_output = _flash_attention_forward(
-            query_states,
-            key_states,
-            value_states,
-            attention_mask,
-            q_len,
-            dropout=dropout_rate,
-            sliding_window=getattr(self.config, "sliding_window", None),
-            is_causal=self.is_causal,
-            use_top_left_mask=self._flash_attn_uses_top_left_mask,
-        )
-
-        #attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous()
-        attn_output = attn_output.reshape(bsz, q_len, self.num_heads * self.head_dim).contiguous()
-        attn_output = self.o_proj(attn_output)
-
-        if not output_attentions:
-            attn_weights = None
-
-        return attn_output, attn_weights, past_key_value
-
-
-# Adapted from transformers.models.mistral.modeling_mistral.MistralSdpaAttention with Mistral->Jamba
-#class JambaSdpaAttention(JambaAttention):
-class NemotronHSdpaAttention(NemotronHAttention):
-    """
-    Jamba attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
-    `JambaAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to
-    SDPA API.
-    """
-
-    # Adapted from NemotronHAttention.forward
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        attention_mask: Optional[torch.Tensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_value: Optional[HybridMambaAttentionDynamicCache] = None,
-        output_attentions: bool = False,
-        use_cache: bool = False,
-        cache_position: Optional[torch.LongTensor] = None,
-    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
-        if output_attentions:
-            # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
-            logger.warning_once(
-                "NemotronHModel is using NemotronHSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
-                'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
-            )
-            return super().forward(
-                hidden_states=hidden_states,
-                attention_mask=attention_mask,
-                position_ids=position_ids,
-                past_key_value=past_key_value,
-                output_attentions=output_attentions,
-                use_cache=use_cache,
-            )
-
-        bsz, q_len, _ = hidden_states.size()
-
-        query_states = self.q_proj(hidden_states)
-        key_states = self.k_proj(hidden_states)
-        value_states = self.v_proj(hidden_states)
-
-        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
-        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
-        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
-
-        if past_key_value is not None:
-            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx)
-
-        key_states = repeat_kv(key_states, self.num_key_value_groups)
-        value_states = repeat_kv(value_states, self.num_key_value_groups)
-
-        causal_mask = attention_mask
-        if attention_mask is not None:
-            causal_mask = causal_mask[:, :, :, : key_states.shape[-2]]
-
-        # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
-        # Reference: https://github.com/pytorch/pytorch/issues/112577.
-        if query_states.device.type == "cuda" and attention_mask is not None:
-            query_states = query_states.contiguous()
-            key_states = key_states.contiguous()
-            value_states = value_states.contiguous()
-
-        # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
-        # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
-        # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1.
-        is_causal = True if self.is_causal and causal_mask is None and q_len > 1 else False
-
-        attn_output = torch.nn.functional.scaled_dot_product_attention(
-            query_states,
-            key_states,
-            value_states,
-            attn_mask=causal_mask,
-            dropout_p=self.attention_dropout if self.training else 0.0,
-            is_causal=is_causal,
-        )
-
-        attn_output = attn_output.transpose(1, 2).contiguous()
-        attn_output = attn_output.view(bsz, q_len, self.hidden_size)
-
-        attn_output = self.o_proj(attn_output)
-
-        return attn_output, None, past_key_value
-
-
-NEMOTRONH_ATTENTION_CLASSES = {
-    "eager": NemotronHAttention,
-    "flash_attention_2": NemotronHFlashAttention2,
-    "sdpa": NemotronHSdpaAttention,
-}
-
-# Copied from transformers.models.mamba.modeling_mamba2.Mamba2PreTrainedModel
-class NemotronHPreTrainedModel(PreTrainedModel):
-    """
-    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
-    models.
-    """
-
-    config_class = NemotronHConfig
-    base_model_prefix = "backbone"
-    _no_split_modules = ["NemotronHBlock"]
-    supports_gradient_checkpointing = True
-    _is_stateful = True
-    _supports_flash_attn_2 = True
-
-    def _init_weights(self, module):
-        """Initialize the weights."""
-        if isinstance(module, NemotronHMamba2Mixer):
-            if getattr(module.dt_bias, "_is_hf_initialized", False):
-                return
-            module.A_log._no_weight_decay = True
-            module.D._no_weight_decay = True
-
-            dt = torch.exp(
-                torch.rand(self.config.mamba_num_heads)
-                * (math.log(self.config.time_step_max) - math.log(self.config.time_step_min))
-                + math.log(self.config.time_step_min)
-            ).clamp(min=self.config.time_step_floor)
-
-            # # Inverse of softplus: https://github.com/pytorch/pytorch/issues/72759
-            inv_dt = dt + torch.log(-torch.expm1(-dt))
-            with torch.no_grad():
-                module.dt_bias.copy_(inv_dt)
-            module.dt_bias._no_reinit = True
-
-        if isinstance(module, nn.Linear):
-            if module.bias is not None:
-                if not getattr(module.bias, "_no_reinit", False):
-                    nn.init.zeros_(module.bias)
-        elif isinstance(module, nn.Embedding):
-            nn.init.normal_(module.weight, std=self.config.initializer_range)
-
-        # TODO: Check
-        if self.config.rescale_prenorm_residual:
-            # Reinitialize selected weights subject to the OpenAI GPT-2 Paper Scheme:
-            #   > A modified initialization which accounts for the accumulation on the residual path with model depth. Scale
-            #   > the weights of residual layers at initialization by a factor of 1/√N where N is the # of residual layers.
-            #   >   -- GPT-2 :: https://openai.com/blog/better-language-models/
-            #
-            # Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py
-            for name, p in module.named_parameters():
-                if getattr(p, "_is_hf_initialized", False):
-                    continue
-                if name in ["out_proj.weight"]:
-                    # Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block
-                    # Following Pytorch init, except scale by 1/sqrt(2 * n_layer)
-                    # We need to reinit p since this code could be called multiple times
-                    # Having just p *= scale would repeatedly scale it down
-                    nn.init.kaiming_uniform_(p, a=math.sqrt(5))
-                    with torch.no_grad():
-                        p /= math.sqrt(self.config.num_hidden_layers)
-
-
-@dataclass
-# Copied from transformers.models.mamba.modeling_mamba2.Mamba2Output with MAMBA2->NemotronH,Mamba2->NemotronH
-class NemotronHOutput(ModelOutput):
-    """
-    Class for the NemotronH model outputs.
-
-    Args:
-        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
-            Sequence of hidden-states at the output of the last layer of the model.
-        cache_params (`HybridMambaAttentionDynamicCache`):
-            The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
-            avoid providing the old `input_ids`.
-
-            Includes both the State space model state matrices after the selective scan, and the Convolutional states
-        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
-            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
-    """
-
-    last_hidden_state: Optional[torch.FloatTensor] = None
-    cache_params: Optional[HybridMambaAttentionDynamicCache] = None
-    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
-    attentions: Optional[Tuple[torch.FloatTensor]] = None
-
-
-@dataclass
-# Copied from transformers.models.mamba2.modeling_mamba2.MambaCausalLMOutput with Mamba2->NemotronH
-class NemotronHCausalLMOutput(ModelOutput):
-    """
-    Base class for causal language model (or autoregressive) outputs.
-
-    Args:
-        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
-            Language modeling loss (for next-token prediction).
-        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
-            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
-        cache_params (`HybridMambaAttentionDynamicCache`):
-            The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
-            avoid providing the old `input_ids`.
-
-            Includes both the State space model state matrices after the selective scan, and the Convolutional states
-        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
-            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
-
-            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
-    """
-
-    loss: Optional[torch.FloatTensor] = None
-    logits: Optional[torch.FloatTensor] = None
-    cache_params: Optional[HybridMambaAttentionDynamicCache] = None
-    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
-    attentions: Optional[Tuple[torch.FloatTensor]] = None
-
-
-NEMOTRONH_START_DOCSTRING = r"""
-
-    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
-    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
-    etc.)
-
-    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
-    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
-    and behavior.
-
-    Parameters:
-        config ([`NemotronHConfig`]): Model configuration class with all the parameters of the model.
-            Initializing with a config file does not load the weights associated with the model, only the
-            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
-"""
-
-NEMOTRONH_INPUTS_DOCSTRING = r"""
-    Args:
-        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`, *optional*):
-            Indices of input sequence tokens in the vocabulary.
-
-            If `cache_params.seqlen_offset>0`, only `input_ids` that do not have their past calculated should be passed as
-            `input_ids`.
-
-            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
-            [`PreTrainedTokenizer.__call__`] for details.
-
-            [What are input IDs?](../glossary#input-ids)
-        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
-            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
-            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
-            model's internal embedding lookup matrix.
-        position_ids (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
-            Indices of positions of each input sequence tokens in the position embeddings.
-        cache_params (`HybridMambaAttentionDynamicCache`, *optional*):
-            If passed along, the model uses the previous state in all the blocks (which will give the output for the
-            `input_ids` provided as if the model add `state_input_ids + input_ids` as context).
-        use_cache (`bool`, *optional*):
-            If set to `True`, the `cache_params` is returned and can be used to quickly generate the next logits.
-        output_attentions (`bool`, *optional*):
-            Whether or not to return the attentions tensors of all attention layers.
-        output_hidden_states (`bool`, *optional*):
-            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
-            more detail.
-        return_dict (`bool`, *optional*):
-            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
-        cache_position (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
-            The position of the current input in the cache. This is used to ensure that the cache is correctly updated.
-            If `cache_params` is passed, `cache_position` should also be passed.
-        attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
-            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
-
-            - 1 for tokens that are **not masked**,
-            - 0 for tokens that are **masked**.
-
-            [What are attention masks?](../glossary#attention-mask)
-"""
-
-
-@add_start_docstrings(
-    "The bare NemotronH Model transformer outputting raw hidden-states without any specific head on top.",
-    NEMOTRONH_START_DOCSTRING,
-)
-class NemotronHModel(NemotronHPreTrainedModel):
-    def __init__(self, config):
-        super().__init__(config)
-
-        self.embeddings = nn.Embedding(config.vocab_size, config.hidden_size)
-        self.layers = nn.ModuleList([NemotronHBlock(config, layer_idx=idx) for idx in range(config.num_hidden_layers)])
-
-        self.gradient_checkpointing = False
-        self.norm_f = NemotronHRMSNorm(config.hidden_size, eps=config.layer_norm_epsilon)
-        # Initialize weights and apply final processing
-        self._register_load_state_dict_pre_hook(self.load_hook)
-        self.post_init()
-
-    def load_hook(self, state_dict, prefix, *args):
-        for k in state_dict:
-            if "embedding." in k:
-                state_dict[k.replace("embedding.", "embeddings.")] = state_dict.pop(k)
-                break
-
-    def get_input_embeddings(self):
-        return self.embeddings
-
-    def set_input_embeddings(self, new_embeddings):
-        self.embeddings = new_embeddings
-
-    @add_start_docstrings_to_model_forward(NEMOTRONH_INPUTS_DOCSTRING)
-    @add_code_sample_docstrings(
-        checkpoint=_CHECKPOINT_FOR_DOC,
-        output_type=NemotronHOutput,
-        config_class=_CONFIG_FOR_DOC,
-    )
-    def forward(
-        self,
-        input_ids: Optional[torch.LongTensor] = None,
-        inputs_embeds: Optional[torch.LongTensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        cache_params: Optional[HybridMambaAttentionDynamicCache] = None,
-        use_cache: Optional[bool] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
-        cache_position: Optional[torch.LongTensor] = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        **kwargs,
-    ) -> Union[Tuple, NemotronHOutput]:
-        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
-        output_hidden_states = (
-            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
-        )
-        # use_cache = use_cache if use_cache is not None else self.config.use_cache
-        use_cache = use_cache if use_cache is not None else (self.config.use_cache if not self.training else False)
-
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-
-        if (input_ids is None) ^ (inputs_embeds is not None):  # ^ is python for xor
-            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
-
-        if inputs_embeds is None:
-            inputs_embeds = self.embeddings(input_ids)
-
-        if self.gradient_checkpointing and self.training and use_cache:
-            logger.warning_once(
-                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
-            )
-            use_cache = False
-
-        # From zamba_modeling.py
-        if use_cache and cache_params is None:
-            logger.warning_once(
-                "NemotronH requires an initialized `NemotronHHybridDynamicCache` to return a cache. None was "
-                "provided, so no cache will be returned."
-            )
-
-        hidden_states = inputs_embeds
-
-        if cache_position is None:
-            cache_position = torch.arange(hidden_states.shape[1], device=hidden_states.device)
-        if position_ids is None:
-            position_ids = cache_position.unsqueeze(0)
-
-        causal_mask = self._update_causal_mask(attention_mask, inputs_embeds, cache_position)
-        mamba_mask = self._update_mamba_mask(attention_mask, cache_position)
-
-        all_hidden_states = () if output_hidden_states else None
-        all_self_attns = () if output_attentions else None
-        # Until HERE
-
-        for layer_idx, mixer_block in enumerate(self.layers):
-            # Depending on the layer type we opt for 2D base attention mask (Mamba) or 4D causal mask (Attention)
-            if mixer_block.block_type == "mamba":
-                layer_mask = mamba_mask
-            elif mixer_block.block_type == "attention":
-                layer_mask = causal_mask
-            elif mixer_block.block_type == "mlp":
-                layer_mask = None
-            else:
-                raise ValueError(f"Invalid block_type: {self.block_type}")
-
-            if output_hidden_states:
-                all_hidden_states += (hidden_states,)
-
-            if self.gradient_checkpointing and self.training:
-                hidden_states = self._gradient_checkpointing_func(
-                    mixer_block.__call__, hidden_states, cache_params, cache_position, layer_mask
-                )
-            else:
-                hidden_states = mixer_block(
-                    hidden_states,
-                    cache_params=cache_params,
-                    cache_position=cache_position,
-                    attention_mask=layer_mask,
-                )
-
-            # TODO: Store attentions
-            # if output_attentions:
-            #     if layer_outputs[1] is not None:
-            #         # append attentions only of attention layers. Mamba layers return `None` as the attention weights
-            #         all_self_attns += (layer_outputs[1],)
-
-            # TODO (Check): should it happen before the forward pass?
-            # if output_hidden_states:
-            #     all_hidden_states = all_hidden_states + (hidden_states,)
-
-        hidden_states = self.norm_f(hidden_states)
-
-        if output_hidden_states:
-            all_hidden_states = all_hidden_states + (hidden_states,)
-
-        if not return_dict:
-            return tuple(v for v in [hidden_states, cache_params, all_hidden_states] if v is not None)
-
-        return NemotronHOutput(
-            last_hidden_state=hidden_states,
-            cache_params=cache_params if use_cache else None,
-            hidden_states=all_hidden_states,
-            attentions=all_self_attns,
-        )
-
-    # Copied from transformers.models.jamba.modeling_jamba.JambaModel._update_causal_mask
-    def _update_causal_mask(self, attention_mask, input_tensor, cache_position):
-        if self.config._attn_implementation == "flash_attention_2":
-            if attention_mask is not None and 0.0 in attention_mask:
-                return attention_mask
-            return None
-
-        dtype, device = input_tensor.dtype, input_tensor.device
-        min_dtype = torch.finfo(dtype).min
-        sequence_length = input_tensor.shape[1]
-        target_length = cache_position[-1] + 1
-
-        causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device)
-        if sequence_length != 1:
-            causal_mask = torch.triu(causal_mask, diagonal=1)
-        causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
-        causal_mask = causal_mask[None, None, :, :].expand(input_tensor.shape[0], 1, -1, -1)
-        if attention_mask is not None:
-            causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
-            if attention_mask.dim() == 2:
-                mask_length = attention_mask.shape[-1]
-                padding_mask = causal_mask[..., :mask_length].eq(0.0) * attention_mask[:, None, None, :].eq(0.0)
-                causal_mask[..., :mask_length] = causal_mask[..., :mask_length].masked_fill(padding_mask, min_dtype)
-
-        if (
-            self.config._attn_implementation == "sdpa"
-            and attention_mask is not None
-            and attention_mask.device.type == "cuda"
-        ):
-            # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
-            # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
-            # Details: https://github.com/pytorch/pytorch/issues/110213
-            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)
-
-        return causal_mask
-
-    def _update_mamba_mask(self, attention_mask, cache_position):
-        """
-        No need for zeroing states when
-            1. Cached forward
-            2. Attending to all inputs
-        """
-        mamba_mask = attention_mask
-        if cache_position[0] > 0 or (attention_mask is not None and torch.all(attention_mask == 1)):
-            mamba_mask = None
-        return mamba_mask
-
-
-@add_start_docstrings(
-    """
-    The NEMOTRONH Model transformer with a language modeling head on top (linear layer with weights not tied to the input
-    embeddings).
-    """,
-    NEMOTRONH_START_DOCSTRING,
-)
-class NemotronHForCausalLM(NemotronHPreTrainedModel, GenerationMixin):
-    _tied_weights_keys = ["lm_head.weight"]
-
-    def __init__(self, config):
-        super().__init__(config)
-        self.backbone = NemotronHModel(config)
-        self.vocab_size = config.vocab_size
-        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
-
-        # Initialize weights and apply final processing
-        self.post_init()
-
-    def get_input_embeddings(self):
-        return self.backbone.get_input_embeddings()
-
-    def set_input_embeddings(self, new_embeddings):
-        return self.backbone.set_input_embeddings(new_embeddings)
-
-    def get_output_embeddings(self):
-        return self.lm_head
-
-    def set_output_embeddings(self, new_embeddings):
-        self.lm_head = new_embeddings
-
-    def get_decoder(self):
-        return self.model
-
-    def set_decoder(self, decoder):
-        self.model = decoder
-
-    def prepare_inputs_for_generation(
-        self,
-        input_ids,
-        past_key_values=None,
-        attention_mask=None,
-        inputs_embeds=None,
-        cache_position=None,
-        position_ids=None,
-        use_cache=True,
-        **kwargs,
-    ):
-        # Copy from https://github.com/huggingface/transformers/blob/main/src/transformers/models/jamba/modeling_jamba.py
-        # Overwitten -- uses `cache_params` as opposed to `past_key_values`
-        empty_past_kv = past_key_values is None
-
-        # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens
-        # Exception 1: when passing input_embeds, input_ids may be missing entries
-        # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here
-        # Exception 3: with synced GPUs cache_position may go out of bounds, but we only want dummy token in that case.
-        #              (we can't check exception 3 while compiling)
-        if not empty_past_kv:
-            if (
-                inputs_embeds is not None  # Exception 1
-                or cache_position[-1] >= input_ids.shape[1]  # Exception 3
-            ):
-                input_ids = input_ids[:, -cache_position.shape[0] :]
-            elif input_ids.shape[1] != cache_position.shape[0]:  # Default case (the "else", a no op, is Exception 2)
-                input_ids = input_ids[:, cache_position]
-        else:
-            past_key_values = HybridMambaAttentionDynamicCache(
-                self.config, input_ids.shape[0], self.dtype, device=self.device
-            )
-
-        if attention_mask is not None and position_ids is None:
-            # create position_ids on the fly for batch generation
-            position_ids = attention_mask.long().cumsum(-1) - 1
-            position_ids.masked_fill_(attention_mask == 0, 1)
-            if not empty_past_kv:
-                position_ids = position_ids[:, -input_ids.shape[1] :]
-
-        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
-        if inputs_embeds is not None and empty_past_kv:
-            # TODO(pjin): workaround fix for properly extending inputs_embeds;
-            # longer term, may be better handled elsewhere in .generate().
-            if input_ids is not None and inputs_embeds.shape[1] < input_ids.shape[1]:
-                new_token_embeds = self.get_input_embeddings()(input_ids[:,inputs_embeds.shape[1]:])
-                inputs_embeds = torch.cat([inputs_embeds, new_token_embeds], dim=1)
-            model_inputs = {"inputs_embeds": inputs_embeds}
-        else:
-            model_inputs = {"input_ids": input_ids.contiguous()}  # `contiguous()` needed for compilation use cases
-
-        model_inputs.update(
-            {
-                "position_ids": position_ids,
-                "past_key_values": past_key_values,
-                "use_cache": use_cache,
-                "attention_mask": attention_mask,
-                "logits_to_keep": self.config.num_logits_to_keep,
-                "cache_position": cache_position,
-            }
-        )
-        return model_inputs
-
-    @add_start_docstrings_to_model_forward(NEMOTRONH_INPUTS_DOCSTRING)
-    @add_code_sample_docstrings(
-        checkpoint=_CHECKPOINT_FOR_DOC,
-        output_type=NemotronHCausalLMOutput,
-        config_class=_CONFIG_FOR_DOC,
-    )
-    def forward(
-        self,
-        input_ids: Optional[torch.LongTensor] = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        cache_params: Optional[HybridMambaAttentionDynamicCache] = None,
-        labels: Optional[torch.LongTensor] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
-        use_cache: Optional[bool] = None,
-        cache_position: Optional[torch.Tensor] = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        **kwargs,  # for now we need this for generation
-    ) -> Union[Tuple, NemotronHCausalLMOutput]:
-        r"""
-        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
-            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
-            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
-            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
-        """
-        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
-
-        output_hidden_states = (
-            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
-        )
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-
-        nemotron_h_outputs = self.backbone(
-            input_ids,
-            cache_params=cache_params,
-            inputs_embeds=inputs_embeds,
-            output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
-            use_cache=use_cache,
-            cache_position=cache_position,
-            attention_mask=attention_mask,
-        )
-        hidden_states = nemotron_h_outputs[0]
-
-        # TODO: Check zamba_modeling.py: https://github.com/huggingface/transformers/blob/d7188ba600e36d3fd191b12e19f1b3bb81a8404f/src/transformers/models/zamba/modeling_zamba.py#L1284C1-L1286C2
-        #logits = self.lm_head(hidden_states.to(self.lm_head.weight.dtype)).float()
-        logits = self.lm_head(hidden_states.to(self.lm_head.weight.dtype)).float()
-
-        loss = None
-        if labels is not None:
-            # move labels to correct device to enable model parallelism
-            labels = labels.to(logits.device)
-            # Shift so that tokens < n predict n
-            shift_logits = logits[..., :-1, :].contiguous()
-            shift_labels = labels[..., 1:].contiguous()
-            # Flatten the tokens
-            loss_fct = CrossEntropyLoss()
-            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
-
-        if not return_dict:
-            output = (logits,) + nemotron_h_outputs[1:]
-            return ((loss,) + output) if loss is not None else output
-
-        return NemotronHCausalLMOutput(
-            loss=loss,
-            logits=logits,
-            cache_params=nemotron_h_outputs.cache_params,
-            hidden_states=nemotron_h_outputs.hidden_states,
-            attentions=nemotron_h_outputs.attentions,
-        )
diff --git a/services/core/models/tests/integration/parallelism/fixtures/nvidia/nemotron-4-340b-instruct/model_config.yaml b/services/core/models/tests/integration/parallelism/fixtures/nvidia/nemotron-4-340b-instruct/model_config.yaml
deleted file mode 100644
index dec06eac9c..0000000000
--- a/services/core/models/tests/integration/parallelism/fixtures/nvidia/nemotron-4-340b-instruct/model_config.yaml
+++ /dev/null
@@ -1,261 +0,0 @@
-mcore_gpt: true
-micro_batch_size: 1
-global_batch_size: 256
-tensor_model_parallel_size: 8
-pipeline_model_parallel_size: 4
-virtual_pipeline_model_parallel_size: null
-encoder_seq_length: 4096
-max_position_embeddings: 4096
-num_layers: 96
-hidden_size: 18432
-ffn_hidden_size: 73728
-num_attention_heads: 96
-init_method_std: 0.0063
-use_scaled_init_method: true
-hidden_dropout: 0.0
-attention_dropout: 0.0
-ffn_dropout: 0.0
-kv_channels: null
-apply_query_key_layer_scaling: true
-normalization: layernorm1p
-layernorm_epsilon: 1.0e-05
-do_layer_norm_weight_decay: false
-make_vocab_size_divisible_by: 128
-pre_process: true
-post_process: true
-persist_layer_norm: true
-bias: false
-activation: squared-relu
-headscale: false
-transformer_block_type: pre_ln
-openai_gelu: false
-normalize_attention_scores: true
-position_embedding_type: rope
-rotary_percentage: 0.5
-attention_type: multihead
-share_embeddings_and_output_weights: false
-num_query_groups: 8
-tokenizer:
-  library: sentencepiece
-  type: null
-  model: nemo:8223bf8eaa194eb8920af568bb52e2d0_megatron_2.model
-  vocab_file: null
-  merge_file: null
-  tokenizer_model: nemo:eb5528fdec5c4083affa2c97958eeef7_megatron_2.model
-  sentencepiece_legacy: false
-native_amp_init_scale: 4294967296
-native_amp_growth_interval: 1000
-hysteresis: 2
-fp32_residual_connection: false
-fp16_lm_cross_entropy: false
-megatron_amp_O2: true
-grad_allreduce_chunk_size_mb: 125
-grad_div_ar_fusion: true
-gradient_accumulation_fusion: false
-bias_activation_fusion: false
-bias_dropout_add_fusion: false
-masked_softmax_fusion: true
-seed: 1234
-resume_from_checkpoint: null
-use_cpu_initialization: false
-onnx_safe: false
-apex_transformer_log_level: 30
-gradient_as_bucket_view: false
-sync_batch_comm: false
-activations_checkpoint_granularity: null
-activations_checkpoint_method: null
-activations_checkpoint_num_layers: 1
-num_micro_batches_with_partial_activation_checkpoints: null
-activations_checkpoint_layers_per_pipeline: null
-sequence_parallel: false
-transformer_engine: false
-fp8: false
-fp8_e4m3: false
-fp8_hybrid: false
-fp8_margin: 0
-fp8_interval: 1
-fp8_amax_history_len: 1
-fp8_amax_compute_algo: most_recent
-reduce_amax: true
-use_emha: false
-optim:
-  name: distributed_fused_adam
-  lr: 3.001e-07
-  weight_decay: 0.1
-  betas:
-  - 0.9
-  - 0.98
-  sched:
-    name: CosineAnnealing
-    warmup_steps: 10
-    constant_steps: 400
-    min_lr: 3.0e-07
-  bucket_cap_mb: 200
-  overlap_grad_sync: false
-  contiguous_grad_buffer: true
-precision: bf16-mixed
-data:
-  chat: true
-  chat_prompt_tokens:
-    system_turn_start: <extra_id_0>
-    turn_start: <extra_id_1>
-    label_start: <extra_id_2>
-    end_of_turn: '
-
-      '
-    end_of_name: '
-
-      '
-  sample: true
-  num_workers: 2
-  dataloader_type: single
-  train_ds:
-    file_path: /dataset/train.jsonl
-    global_batch_size: 128
-    micro_batch_size: 1
-    shuffle: true
-    memmap_workers: null
-    max_seq_length: 4096
-    min_seq_length: 1
-    drop_last: true
-    concat_sampling_probabilities: null
-    label_key: output
-    add_eos: false
-    add_sep: false
-    add_bos: false
-    truncation_field: input
-    index_mapping_dir: /indexmap_dir
-    prompt_template: '<extra_id_0>System
-
-      {system message}
-
-      <extra_id_1>User
-
-      {turn 1 user message}
-
-      <extra_id_1>Assistant
-
-      <extra_id_2>{turn 1 assistant label}
-
-      {turn 1 assistant message}
-
-      <extra_id_1>User
-
-      {turn 2 user message}
-
-      <extra_id_1>Assistant
-
-      <extra_id_2>{turn 2 assistant label}
-
-      {turn 2 assistant message}
-
-      <extra_id_1>'
-    hf_dataset: true
-    truncation_method: right
-  validation_ds:
-    file_path: /dataset/val.jsonl
-    names: null
-    global_batch_size: 128
-    micro_batch_size: 1
-    shuffle: false
-    memmap_workers: null
-    max_seq_length: 4096
-    min_seq_length: 1
-    drop_last: false
-    label_key: output
-    add_eos: false
-    add_sep: false
-    add_bos: false
-    write_predictions_to_file: false
-    output_file_path_prefix: null
-    truncation_field: input
-    index_mapping_dir: /indexmap_dir
-    prompt_template: '<extra_id_0>System
-
-      {system message}
-
-      <extra_id_1>User
-
-      {turn 1 user message}
-
-      <extra_id_1>Assistant
-
-      <extra_id_2>{turn 1 assistant label}
-
-      {turn 1 assistant message}
-
-      <extra_id_1>User
-
-      {turn 2 user message}
-
-      <extra_id_1>Assistant
-
-      <extra_id_2>{turn 2 assistant label}
-
-      {turn 2 assistant message}
-
-      <extra_id_1>'
-    tokens_to_generate: 32
-    hf_dataset: true
-    truncation_method: right
-    metric:
-      name: loss
-      average: null
-      num_classes: null
-  test_ds:
-    prompt_template: '<extra_id_0>System
-
-      {system message}
-
-      <extra_id_1>User
-
-      {turn 1 user message}
-
-      <extra_id_1>Assistant
-
-      <extra_id_2>{turn 1 assistant label}
-
-      {turn 1 assistant message}
-
-      <extra_id_1>User
-
-      {turn 2 user message}
-
-      <extra_id_1>Assistant
-
-      <extra_id_2>{turn 2 assistant label}
-
-      {turn 2 assistant message}
-
-      <extra_id_1>'
-  data_impl: jsonl
-  splits_string: null
-  seq_length: 4096
-  skip_warmup: true
-  reset_position_ids: false
-  reset_attention_mask: false
-  eod_mask_loss: false
-  index_mapping_dir: /indexmap_dir
-  data_prefix:
-    train:
-    - /datasets/train.jsonl
-    validation:
-    - /datasets/val.jsonl
-    test:
-    - /datasets/val.jsonl
-answer_only_loss: true
-restore_from_path: /models/340B_base
-save_nemo_on_validation_end: true
-use_flash_attention: null
-pipeline_model_parallel_split_rank: 0
-dpo:
-  log_prob_forward_micro_batch_size: 2
-  ref_policy_kl_penalty: 0.3
-  average_log_probs: false
-  sft_loss_coeff: 1.0e-05
-  optimize_ref_policy_kl_penalty: false
-  preference_loss: reward_rev_dpo
-  gt_reward_scale: 1.0
-apply_rope_fusion: false
-target: nemo.collections.nlp.models.language_modeling.megatron_gpt_model.MegatronGPTModel
-nemo_version: 1.22.0
diff --git a/services/core/models/tests/integration/parallelism/fixtures/openai/gpt-oss-120b/config.json b/services/core/models/tests/integration/parallelism/fixtures/openai/gpt-oss-120b/config.json
deleted file mode 100644
index 42300b8993..0000000000
--- a/services/core/models/tests/integration/parallelism/fixtures/openai/gpt-oss-120b/config.json
+++ /dev/null
@@ -1,88 +0,0 @@
-{
-  "architectures": [
-    "GptOssForCausalLM"
-  ],
-  "attention_bias": true,
-  "attention_dropout": 0.0,
-  "eos_token_id": 200002,
-  "experts_per_token": 4,
-  "head_dim": 64,
-  "hidden_act": "silu",
-  "hidden_size": 2880,
-  "initial_context_length": 4096,
-  "initializer_range": 0.02,
-  "intermediate_size": 2880,
-  "layer_types": [
-    "sliding_attention",
-    "full_attention",
-    "sliding_attention",
-    "full_attention",
-    "sliding_attention",
-    "full_attention",
-    "sliding_attention",
-    "full_attention",
-    "sliding_attention",
-    "full_attention",
-    "sliding_attention",
-    "full_attention",
-    "sliding_attention",
-    "full_attention",
-    "sliding_attention",
-    "full_attention",
-    "sliding_attention",
-    "full_attention",
-    "sliding_attention",
-    "full_attention",
-    "sliding_attention",
-    "full_attention",
-    "sliding_attention",
-    "full_attention",
-    "sliding_attention",
-    "full_attention",
-    "sliding_attention",
-    "full_attention",
-    "sliding_attention",
-    "full_attention",
-    "sliding_attention",
-    "full_attention",
-    "sliding_attention",
-    "full_attention",
-    "sliding_attention",
-    "full_attention"
-  ],
-  "max_position_embeddings": 131072,
-  "model_type": "gpt_oss",
-  "num_attention_heads": 64,
-  "num_experts_per_tok": 4,
-  "num_hidden_layers": 36,
-  "num_key_value_heads": 8,
-  "num_local_experts": 128,
-  "output_router_logits": false,
-  "pad_token_id": 199999,
-  "quantization_config": {
-    "modules_to_not_convert": [
-      "model.layers.*.self_attn",
-      "model.layers.*.mlp.router",
-      "model.embed_tokens",
-      "lm_head"
-    ],
-    "quant_method": "mxfp4"
-  },
-  "rms_norm_eps": 1e-05,
-  "rope_scaling": {
-    "beta_fast": 32.0,
-    "beta_slow": 1.0,
-    "factor": 32.0,
-    "original_max_position_embeddings": 4096,
-    "rope_type": "yarn",
-    "truncate": false
-  },
-  "rope_theta": 150000,
-  "router_aux_loss_coef": 0.9,
-  "sliding_window": 128,
-  "swiglu_limit": 7.0,
-  "tie_word_embeddings": false,
-  "transformers_version": "4.55.0.dev0",
-  "use_cache": true,
-  "vocab_size": 201088
-}
diff --git a/services/core/models/tests/integration/parallelism/fixtures/openai/gpt-oss-20b/config.json b/services/core/models/tests/integration/parallelism/fixtures/openai/gpt-oss-20b/config.json
deleted file mode 100644
index 8fb5a4a033..0000000000
--- a/services/core/models/tests/integration/parallelism/fixtures/openai/gpt-oss-20b/config.json
+++ /dev/null
@@ -1,76 +0,0 @@
-{
-  "architectures": [
-    "GptOssForCausalLM"
-  ],
-  "attention_bias": true,
-  "attention_dropout": 0.0,
-  "eos_token_id": 200002,
-  "experts_per_token": 4,
-  "head_dim": 64,
-  "hidden_act": "silu",
-  "hidden_size": 2880,
-  "initial_context_length": 4096,
-  "initializer_range": 0.02,
-  "intermediate_size": 2880,
-  "layer_types": [
-    "sliding_attention",
-    "full_attention",
-    "sliding_attention",
-    "full_attention",
-    "sliding_attention",
-    "full_attention",
-    "sliding_attention",
-    "full_attention",
-    "sliding_attention",
-    "full_attention",
-    "sliding_attention",
-    "full_attention",
-    "sliding_attention",
-    "full_attention",
-    "sliding_attention",
-    "full_attention",
-    "sliding_attention",
-    "full_attention",
-    "sliding_attention",
-    "full_attention",
-    "sliding_attention",
-    "full_attention",
-    "sliding_attention",
-    "full_attention"
-  ],
-  "max_position_embeddings": 131072,
-  "model_type": "gpt_oss",
-  "num_attention_heads": 64,
-  "num_experts_per_tok": 4,
-  "num_hidden_layers": 24,
-  "num_key_value_heads": 8,
-  "num_local_experts": 32,
-  "output_router_logits": false,
-  "pad_token_id": 199999,
-  "quantization_config": {
-    "modules_to_not_convert": [
-      "model.layers.*.self_attn",
-      "model.layers.*.mlp.router",
-      "model.embed_tokens",
-      "lm_head"
-    ],
-    "quant_method": "mxfp4"
-  },
-  "rms_norm_eps": 1e-05,
-  "rope_scaling": {
-    "beta_fast": 32.0,
-    "beta_slow": 1.0,
-    "factor": 32.0,
-    "original_max_position_embeddings": 4096,
-    "rope_type": "yarn",
-    "truncate": false
-  },
-  "rope_theta": 150000,
-  "router_aux_loss_coef": 0.9,
-  "sliding_window": 128,
-  "swiglu_limit": 7.0,
-  "tie_word_embeddings": false,
-  "transformers_version": "4.55.0.dev0",
-  "use_cache": true,
-  "vocab_size": 201088
-}
diff --git a/services/core/models/tests/integration/parallelism/test_parallelism_hf_model_config.py b/services/core/models/tests/integration/parallelism/test_parallelism_hf_model_config.py
index 967e9f49d4..304974dd35 100644
--- a/services/core/models/tests/integration/parallelism/test_parallelism_hf_model_config.py
+++ b/services/core/models/tests/integration/parallelism/test_parallelism_hf_model_config.py
@@ -9,8 +9,9 @@
 
 from nmp.core.models.parallelism.api import infer_model_cfg_from_hf
 
+# TODO: Mock HuggingFace API calls instead of accessing real gated models
 REQUIRES_HF_TOKEN = pytest.mark.skip(
-    reason="Gated HuggingFace models require authentication (no fixture available)",
+    reason="Gated HuggingFace models require mocking (not yet implemented)",
 )
 GATED_MODEL_IDS = frozenset({"meta-llama/Llama-3.1-8B"})
 
diff --git a/services/core/models/tests/integration/parallelism/test_recent_models.py b/services/core/models/tests/integration/parallelism/test_recent_models.py
index c405d11a3e..5b32c86d19 100644
--- a/services/core/models/tests/integration/parallelism/test_recent_models.py
+++ b/services/core/models/tests/integration/parallelism/test_recent_models.py
@@ -14,8 +14,9 @@
 
 from nmp.core.models.parallelism.api import estimate_parallelization, find_minimum_gpus
 
+# TODO: Mock HuggingFace API calls instead of accessing real gated models
 REQUIRES_HF_TOKEN = pytest.mark.skip(
-    reason="Gated HuggingFace models require authentication (no fixture available)",
+    reason="Gated HuggingFace models require mocking (not yet implemented)",
 )
 GATED_MODEL_IDS = frozenset({"meta-llama/Llama-3.3-70B-Instruct"})
 
diff --git a/services/core/models/tests/parallelism/nemo_validation_data.py b/services/core/models/tests/parallelism/nemo_validation_data.py
index 12947f4d0b..606f78ccec 100644
--- a/services/core/models/tests/parallelism/nemo_validation_data.py
+++ b/services/core/models/tests/parallelism/nemo_validation_data.py
@@ -10,8 +10,9 @@
 
 import pytest
 
+# TODO: Mock HuggingFace API calls instead of accessing real gated models
 REQUIRES_HF_TOKEN = pytest.mark.skip(
-    reason="Gated HuggingFace models require authentication (no fixture available)",
+    reason="Gated HuggingFace models require mocking (not yet implemented)",
 )
 GATED_MODEL_IDS = frozenset(
     {