diff --git a/sd_installer/__init__.py b/sd_installer/__init__.py
index 6c059c8..0cbabfc 100644
--- a/sd_installer/__init__.py
+++ b/sd_installer/__init__.py
@@ -16,4 +16,5 @@
 from .installer import Installer
 from .verifier import Verifier
 
+
 __all__ = ["Installer", "Verifier", "__version__"]
diff --git a/sd_installer/__main__.py b/sd_installer/__main__.py
index 2a0fbda..e878b7a 100644
--- a/sd_installer/__main__.py
+++ b/sd_installer/__main__.py
@@ -2,5 +2,6 @@
 
 from .cli import main
 
+
 if __name__ == "__main__":
     exit(main())
diff --git a/sd_installer/cli.py b/sd_installer/cli.py
index 9834f44..3f7110c 100644
--- a/sd_installer/cli.py
+++ b/sd_installer/cli.py
@@ -15,13 +15,12 @@
 """
 
 import argparse
-import os
 import sys
 from pathlib import Path
 
 
 def find_base_folder() -> Path:
-    """
+    r"""
     Find the StreamDiffusion base folder (where setup.py lives).
 
     Runtime structure:
@@ -47,9 +46,9 @@ def find_base_folder() -> Path:
     # __file__ = .../StreamDiffusion-installer/sd_installer/cli.py
     # We want: .../StreamDiffusion/
     this_file = Path(__file__).resolve()
-    sd_installer_pkg = this_file.parent          # sd_installer/
-    installer_repo = sd_installer_pkg.parent     # StreamDiffusion-installer/
-    base = installer_repo.parent                 # StreamDiffusion/
+    sd_installer_pkg = this_file.parent  # sd_installer/
+    installer_repo = sd_installer_pkg.parent  # StreamDiffusion-installer/
+    base = installer_repo.parent  # StreamDiffusion/
     if (base / "setup.py").exists():
         return base
 
@@ -85,7 +84,7 @@ def cmd_check(args):
         if venv_path.exists():
             print(f"Venv: Found at {venv_path}")
         else:
-            print(f"Venv: Not found (will be created during install)")
+            print("Venv: Not found (will be created during install)")
 
         # Check StreamDiffusion setup.py (base folder IS StreamDiffusion)
         setup_py = base / "setup.py"
@@ -197,7 +196,7 @@ def cmd_diagnose(args):
         print(f"  [{status}] {check['name']}")
         if check["error"]:
             # Print just the last line of the error
-            error_line = check["error"].split('\n')[-1][:60]
+            error_line = check["error"].split("\n")[-1][:60]
             print(f"         {error_line}")
 
     return 0
@@ -205,7 +204,7 @@ def cmd_diagnose(args):
 
 def cmd_repair(args):
     """Auto-fix known issues."""
-    from .verifier import Verifier, KNOWN_ERRORS
+    from .verifier import Verifier
 
     try:
         base = Path(args.base_folder) if args.base_folder else find_base_folder()
@@ -236,6 +235,7 @@ def cmd_repair(args):
     for check in info["checks"]:
         if not check["passed"] and check["error"]:
             from .verifier import match_known_error
+
             fix = match_known_error(check["error"])
             if fix:
                 fixes_needed.append((check["name"], fix))
@@ -245,10 +245,15 @@ def cmd_repair(args):
         # numpy 2.x
         numpy_ver = info["versions"].get("numpy", "")
         if numpy_ver.startswith("2."):
-            fixes_needed.append(("numpy version", {
-                "cause": f"numpy {numpy_ver} detected (2.x breaks things)",
-                "fix": "pip install numpy==1.26.4 --force-reinstall"
-            }))
+            fixes_needed.append(
+                (
+                    "numpy version",
+                    {
+                        "cause": f"numpy {numpy_ver} detected (2.x breaks things)",
+                        "fix": "pip install numpy==1.26.4 --force-reinstall",
+                    },
+                )
+            )
 
     if not fixes_needed:
         print("No known issues detected that can be auto-fixed.")
@@ -264,18 +269,19 @@ def cmd_repair(args):
 
     if not args.yes:
         response = input("Apply fixes? [y/N]: ")
-        if response.lower() != 'y':
+        if response.lower() != "y":
             print("Aborted.")
             return 0
 
     # Apply fixes
     import subprocess
+
     for name, fix in fixes_needed:
         print(f"Applying fix for {name}...")
         cmd = [str(python_exe), "-m", "pip"] + fix["fix"].replace("pip ", "").split()
         result = subprocess.run(cmd, capture_output=True, text=True)
         if result.returncode == 0:
-            print(f"  OK")
+            print("  OK")
         else:
             print(f"  FAILED: {result.stderr}")
 
@@ -314,7 +320,7 @@ def cmd_generate_bat(args):
 
 def cmd_install_tensorrt(args):
     """Install TensorRT packages."""
-    from .tensorrt import install, get_cuda_version_from_torch
+    from .tensorrt import get_cuda_version_from_torch, install
 
     print("StreamDiffusionTD TensorRT Installation")
     print("=" * 40)
@@ -381,7 +387,8 @@ def main():
     # repair command
     repair_parser = subparsers.add_parser("repair", help="Auto-fix known issues")
     repair_parser.add_argument(
-        "-y", "--yes",
+        "-y",
+        "--yes",
         action="store_true",
         help="Apply fixes without prompting",
     )
diff --git a/sd_installer/installer.py b/sd_installer/installer.py
index 11214f0..d982d87 100644
--- a/sd_installer/installer.py
+++ b/sd_installer/installer.py
@@ -11,12 +11,11 @@
 5. Verify imports - Catch failures immediately
 """
 
-import os
-import sys
 import subprocess
-import shutil
+import sys
 from pathlib import Path
-from typing import Optional, Callable
+from typing import Callable, Optional
+
 
 # Version pins - packages NOT in setup.py that must be manually pinned
 MANUAL_PINS = {
@@ -65,7 +64,7 @@
     "cu128": {
         "torch": "2.7.0",
         "torchvision": "0.22.0",
-        "torchaudio": "2.7.0",
+        "torchaudio": None,
         "index_url": "https://download.pytorch.org/whl/cu128",
         "cuda_python": "12.9.0",
         "xformers": None,  # Not needed - PyTorch 2.7+ has native SDPA
@@ -103,10 +102,7 @@ def __init__(
 
         # Validate CUDA version
         if cuda_version not in PYTORCH_CONFIGS:
-            raise ValueError(
-                f"Unsupported CUDA version: {cuda_version}. "
-                f"Supported: {list(PYTORCH_CONFIGS.keys())}"
-            )
+            raise ValueError(f"Unsupported CUDA version: {cuda_version}. Supported: {list(PYTORCH_CONFIGS.keys())}")
 
         self.pytorch_config = PYTORCH_CONFIGS[cuda_version]
 
@@ -238,7 +234,7 @@ def phase3b_insightface(self):
 
         version_str = result.stdout.strip()
         try:
-            major, minor = map(int, version_str.split('.'))
+            major, minor = map(int, version_str.split("."))
             py_version = (major, minor)
         except ValueError:
             print(f"  WARNING: Could not parse Python version '{version_str}', skipping insightface pre-install")
@@ -269,10 +265,13 @@ def phase5_missing_pins(self):
 
         # Force reinstall varshith15 diffusers (other deps may have overwritten it)
         self._report_progress("Ensuring varshith15 diffusers fork with kvo_cache support...", 5, 8)
-        self._run_pip([
-            "--force-reinstall", "--no-deps",
-            "diffusers @ git+https://github.com/varshith15/diffusers.git@3e3b72f557e91546894340edabc845e894f00922"
-        ])
+        self._run_pip(
+            [
+                "--force-reinstall",
+                "--no-deps",
+                "diffusers @ git+https://github.com/varshith15/diffusers.git@3e3b72f557e91546894340edabc845e894f00922",
+            ]
+        )
 
     def phase6_conflict_prone(self):
         """Phase 6: Fix conflict-prone packages with --no-deps."""
@@ -280,8 +279,7 @@ def phase6_conflict_prone(self):
 
         # Remove conflicting opencv variants
         subprocess.run(
-            [str(self.python_exe), "-m", "pip", "uninstall", "-y",
-             "opencv-python-headless", "opencv-contrib-python"],
+            [str(self.python_exe), "-m", "pip", "uninstall", "-y", "opencv-python-headless", "opencv-contrib-python"],
             capture_output=True,
         )
 
@@ -386,7 +384,7 @@ def generate_batch_file(self, output_path: Optional[str] = None, python_exe: Opt
 pause
 '''
 
-        with open(output_path, 'w', encoding='utf-8') as f:
+        with open(output_path, "w", encoding="utf-8") as f:
             f.write(content)
 
         print(f"Generated batch file: {output_path}")
diff --git a/sd_installer/tensorrt.py b/sd_installer/tensorrt.py
index d4b46e1..a048f67 100644
--- a/sd_installer/tensorrt.py
+++ b/sd_installer/tensorrt.py
@@ -3,9 +3,10 @@
 
 Standalone module that doesn't rely on streamdiffusion package imports.
 """
+
+import platform
 import subprocess
 import sys
-import platform
 from typing import Optional
 
 
@@ -17,7 +18,7 @@ def run_pip(command: str):
 def is_installed(package_name: str) -> bool:
     """Check if a package is installed"""
     try:
-        __import__(package_name.replace('-', '_'))
+        __import__(package_name.replace("-", "_"))
         return True
     except ImportError:
         return False
@@ -27,6 +28,7 @@ def version(package_name: str) -> Optional[str]:
     """Get version of installed package"""
     try:
         import importlib.metadata
+
         return importlib.metadata.version(package_name)
     except Exception:
         return None
@@ -74,6 +76,7 @@ def install(cu: Optional[str] = None):
         if current_version_str:
             try:
                 from packaging.version import Version
+
                 current_version = Version(current_version_str)
                 if current_version < Version("10.8.0"):
                     print("Uninstalling old TensorRT version...")
@@ -84,9 +87,11 @@ def install(cu: Optional[str] = None):
                     print("Uninstalling old TensorRT version...")
                     run_pip("uninstall -y tensorrt")
 
-    # For CUDA 12.8+ (RTX 5090/Blackwell support), use TensorRT 10.12+
+    # For CUDA 12.8+ (RTX 5090/Blackwell support), use TensorRT 10.16+
+    # 10.16.1.11 is the first Blackwell-Windows-production release and fixes
+    # the 78% FP8 perf regression that shipped in 10.12–10.13 on SM_120.
     if cuda_version_float >= 12.8:
-        print("Installing TensorRT 10.12+ for CUDA 12.8+ (Blackwell GPU support)...")
+        print("Installing TensorRT 10.16+ for CUDA 12.8+ (Blackwell GPU support)...")
 
         # Install cuDNN 9 for CUDA 12
         cudnn_name = "nvidia-cudnn-cu12==9.7.1.26"
@@ -96,7 +101,7 @@ def install(cu: Optional[str] = None):
         # tensorrt_cu12 is the CUDA 12 wrapper that owns tensorrt/__init__.py
         # and depends on tensorrt_cu12_libs + tensorrt_cu12_bindings.
         # All three are normal wheels with Requires-Dist (no pip-inside-pip).
-        trt_version = "10.12.0.36"
+        trt_version = "10.16.1.11"
         print(f"Installing TensorRT {trt_version} for CUDA {cu}...")
         run_pip(f"install --extra-index-url https://pypi.nvidia.com tensorrt_cu12=={trt_version} --no-cache-dir")
 
@@ -111,7 +116,7 @@ def install(cu: Optional[str] = None):
         # tensorrt_cu12 is the CUDA 12 wrapper that owns tensorrt/__init__.py
         # and depends on tensorrt_cu12_libs + tensorrt_cu12_bindings.
         # All three are normal wheels with Requires-Dist (no pip-inside-pip).
-        trt_version = "10.12.0.36"
+        trt_version = "10.16.1.11"
         print(f"Installing TensorRT {trt_version} for CUDA {cu}...")
         run_pip(f"install --extra-index-url https://pypi.nvidia.com tensorrt_cu12=={trt_version} --no-cache-dir")
 
@@ -126,9 +131,7 @@ def install(cu: Optional[str] = None):
         # Install TensorRT for CUDA 11
         tensorrt_version = "tensorrt==9.0.1.post11.dev4"
         print(f"Installing TensorRT for CUDA {cu}: {tensorrt_version}")
-        run_pip(
-            f"install --extra-index-url https://pypi.nvidia.com {tensorrt_version} --no-cache-dir"
-        )
+        run_pip(f"install --extra-index-url https://pypi.nvidia.com {tensorrt_version} --no-cache-dir")
     else:
         print(f"Unsupported CUDA version: {cu}")
         print("Supported versions: CUDA 11.x, 12.x, 12.8+")
@@ -137,18 +140,25 @@ def install(cu: Optional[str] = None):
     # Install additional TensorRT tools
     if not is_installed("polygraphy"):
         print("Installing polygraphy...")
-        run_pip(
-            "install polygraphy==0.49.24 --extra-index-url https://pypi.ngc.nvidia.com --no-cache-dir"
-        )
+        run_pip("install polygraphy==0.49.26 --extra-index-url https://pypi.ngc.nvidia.com --no-cache-dir")
     if not is_installed("onnx_graphsurgeon"):
         print("Installing onnx-graphsurgeon...")
-        run_pip(
-            "install onnx-graphsurgeon==0.5.8 --extra-index-url https://pypi.ngc.nvidia.com --no-cache-dir"
-        )
-    if platform.system() == 'Windows' and not is_installed("pywin32"):
+        run_pip("install onnx-graphsurgeon==0.6.1 --extra-index-url https://pypi.ngc.nvidia.com --no-cache-dir")
+
+    # FP8 quantization dependencies (CUDA 12 only).
+    # Previously missing — caused ImportError in fp8_quantize.py when users enabled FP8.
+    # Aligns with FLUX pyproject.toml (nvidia-modelopt >= 0.19.0).
+    if cuda_major == "12":
+        print("Installing FP8 quantization dependencies (modelopt, cupy)...")
+        run_pip("install nvidia-modelopt[onnx]>=0.19.0 cupy-cuda12x==13.6.0 numpy==1.26.4 --no-cache-dir")
+        # modelopt's resolver downgrades onnxruntime-gpu to 1.22.0; re-assert 1.24.4.
+        # --no-deps avoids triggering a conflicting re-solve.
+        run_pip("install onnxruntime-gpu==1.24.4 --no-deps --no-cache-dir")
+
+    if platform.system() == "Windows" and not is_installed("pywin32"):
         print("Installing pywin32...")
         run_pip("install pywin32==306 --no-cache-dir")
-    if platform.system() == 'Windows' and not is_installed("triton"):
+    if platform.system() == "Windows" and not is_installed("triton"):
         print("Installing triton-windows...")
         run_pip("install triton-windows==3.4.0.post21 --no-cache-dir")
 
diff --git a/sd_installer/verifier.py b/sd_installer/verifier.py
index 2cc484c..fa23d60 100644
--- a/sd_installer/verifier.py
+++ b/sd_installer/verifier.py
@@ -12,6 +12,7 @@
 @dataclass
 class VerificationResult:
     """Result of a single verification check."""
+
     name: str
     passed: bool
     message: str
@@ -23,67 +24,39 @@ class VerificationResult:
     (
         "torch CUDA",
         "import torch; assert torch.cuda.is_available(), 'CUDA not available'; print(f'{torch.__version__}+cu{torch.version.cuda} | {torch.cuda.get_device_name(0)}')",
-        "PyTorch with CUDA"
-    ),
-    (
-        "StreamDiffusion",
-        "from streamdiffusion.config import load_config; print('OK')",
-        "StreamDiffusion core"
-    ),
-    (
-        "timm RotaryEmbedding",
-        "from timm.layers import RotaryEmbedding; print('OK')",
-        "timm (>=1.0.24 required)"
-    ),
-    (
-        "mediapipe",
-        "import mediapipe as mp; mp.solutions.drawing_utils; print('OK')",
-        "mediapipe solutions"
-    ),
-    (
-        "transformers MT5",
-        "from transformers import MT5Tokenizer; print('OK')",
-        "transformers (MT5Tokenizer)"
-    ),
-    (
-        "huggingface_hub",
-        "from huggingface_hub import hf_hub_download; print('OK')",
-        "huggingface_hub"
+        "PyTorch with CUDA",
     ),
+    ("StreamDiffusion", "from streamdiffusion.config import load_config; print('OK')", "StreamDiffusion core"),
+    ("timm RotaryEmbedding", "from timm.layers import RotaryEmbedding; print('OK')", "timm (>=1.0.24 required)"),
+    ("mediapipe", "import mediapipe as mp; mp.solutions.drawing_utils; print('OK')", "mediapipe solutions"),
+    ("transformers MT5", "from transformers import MT5Tokenizer; print('OK')", "transformers (MT5Tokenizer)"),
+    ("huggingface_hub", "from huggingface_hub import hf_hub_download; print('OK')", "huggingface_hub"),
     (
         "numpy version",
         "import numpy; v = numpy.__version__; assert v.startswith('1.'), f'numpy 2.x detected: {v}'; print(v)",
-        "numpy (<2.0.0 required)"
+        "numpy (<2.0.0 required)",
     ),
     (
         "diffusers fork",
         "import inspect; from diffusers.models.attention_processor import Attention; assert 'kvo_cache' in inspect.signature(Attention.forward).parameters, 'Missing kvo_cache'; print('OK')",
-        "diffusers (varshith15 fork with kvo_cache)"
-    ),
-    (
-        "accelerate",
-        "from accelerate import Accelerator; print('OK')",
-        "accelerate"
-    ),
-    (
-        "controlnet_aux",
-        "from controlnet_aux import OpenposeDetector; print('OK')",
-        "controlnet_aux"
+        "diffusers (varshith15 fork with kvo_cache)",
     ),
+    ("accelerate", "from accelerate import Accelerator; print('OK')", "accelerate"),
+    ("controlnet_aux", "from controlnet_aux import OpenposeDetector; print('OK')", "controlnet_aux"),
     (
         "peft (USE_PEFT_BACKEND)",
         "from diffusers.utils import USE_PEFT_BACKEND; assert USE_PEFT_BACKEND, 'peft not detected'; print('OK')",
-        "peft (required for Cached Attention/StreamV2V)"
+        "peft (required for Cached Attention/StreamV2V)",
     ),
     (
         "protobuf version",
         "import google.protobuf; v = google.protobuf.__version__; major = int(v.split('.')[0]); assert major < 5, f'protobuf {v} (>=5.x breaks TRT engine builds)'; print(v)",
-        "protobuf (<5.0 required for TRT)"
+        "protobuf (<5.0 required for TRT)",
     ),
     (
         "onnx version",
         "import onnx; v = onnx.__version__; parts = [int(x) for x in v.split('.')[:2]]; assert parts[0] == 1 and parts[1] < 20, f'onnx {v} (>=1.20 removes float32_to_bfloat16)'; print(v)",
-        "onnx (<1.20 required for TRT)"
+        "onnx (<1.20 required for TRT)",
     ),
 ]
 
@@ -178,7 +151,7 @@ def run_all(self, verbose: bool = True) -> bool:
                     print(f"FAIL: {result.message}")
                     if result.error:
-                        # Print first line of error
-                        error_line = result.error.split('\n')[-1]
+                        # Print last line of error
+                        error_line = result.error.split("\n")[-1]
                         print(f"      {error_line}")
 
         if verbose:
@@ -211,10 +184,12 @@ def diagnose(self) -> dict:
         try:
             result = subprocess.run(
                 [self.python_exe, "-c", gpu_code],
-                capture_output=True, text=True, timeout=30,
+                capture_output=True,
+                text=True,
+                timeout=30,
             )
             if result.returncode == 0:
-                lines = result.stdout.strip().split('\n')
+                lines = result.stdout.strip().split("\n")
                 info["gpu"]["name"] = lines[0]
                 info["gpu"]["vram_mb"] = int(lines[1])
                 info["gpu"]["compute_capability"] = lines[2]
@@ -224,12 +199,14 @@ def diagnose(self) -> dict:
         # Run all checks and collect detailed info
         for name, code, description in VERIFICATION_CHECKS:
             result = self.check(name, code, description)
-            info["checks"].append({
-                "name": name,
-                "passed": result.passed,
-                "message": result.message,
-                "error": result.error,
-            })
+            info["checks"].append(
+                {
+                    "name": name,
+                    "passed": result.passed,
+                    "message": result.message,
+                    "error": result.error,
+                }
+            )
 
         # Get version information for key packages
         version_checks = [
@@ -290,8 +267,8 @@ def diagnose(self) -> dict:
         "fix": "pip install accelerate==1.10.0",
     },
     "'onnx.helper' has no attribute 'float32_to_bfloat16'": {
-        "cause": "onnx version too new",
-        "fix": "pip install onnx==1.18.0",
+        "cause": "onnx-graphsurgeon too old for onnx>=1.19 (float32_to_bfloat16 was removed)",
+        "fix": "pip install onnx-graphsurgeon==0.6.1 --extra-index-url https://pypi.ngc.nvidia.com",
     },
     "Missing kvo_cache": {
         "cause": "Wrong diffusers installed (vanilla instead of varshith15 fork)",