dotsimulate · forkni · Apr 26, 2026
diff --git a/.gitignore b/.gitignore
@@ -221,4 +221,43 @@ images/inputs/*
 controlnet_test_*
 
 # Demo uploads directory
-demo/realtime-img2img/uploads/
+demo/realtime-img2img/uploads/logs/
+.cgw.conf
+
+# Local Claude / session state (per-user, never committed)
+.claude/
+
+# Test-install debug dumps and snapshots
+Debug/
+
+# Sibling repo (tracked separately at dotsimulate/StreamDiffusion-installer)
+StreamDiffusion-installer/
+
+# TD-component dev copy (feeds .tox re-export, lives in dotsimulate/StreamDiffusionTD)
+StreamDiffusionTD/
+
+# TD custom processors (TD-side, not part of the StreamDiffusion pip package)
+custom_processors/
+
+# Generated by sd_installer.generate_batch_file — CUDA-variant and path specific
+Install_StreamDiffusion.bat
+Install_TensorRT.bat
+Start_StreamDiffusion.bat
+
+# Stray file from a previous shell redirect (pip install ...>=0.19.0)
+=0.19.0
+
+# NVIDIA Nsight profiling outputs
+*.nsys-rep
+*.qdrep
+*.qdstrm
+*.ncu-rep
+profiles/
+profiler_logs/
+logs/ncu_*
+
+# Per-session work log (local only, never committed; tracked separately by user)
+SESSION_LOG.md
+
+# Profiling/audit CSV exports (Nsight summaries, kernel stats — generated artifacts)
+audit_reports/
diff --git a/configs/profiling/profiling_fp16_cached.yaml b/configs/profiling/profiling_fp16_cached.yaml
@@ -0,0 +1,54 @@
+# Profile D — FP16 + cached_attn, no ControlNet, no IPAdapter
+# Engine: sdxl-turbo--tiny_vae-True--min_batch-1--max_batch-4--use_cached_attn-True--mode-img2img--trt10.16.1.11--cc89--res-512x512
+# IMPORTANT: build_engines_if_missing=false — fail if engine not present rather than rebuild
+
+model_id: "stabilityai/sdxl-turbo"
+
+t_index_list: [16, 35]
+width: 512
+height: 512
+device: "cuda"
+dtype: "float16"
+
+guidance_scale: 1.0
+num_inference_steps: 50
+seed: 3115577
+delta: 1.0
+
+prompt: "default fancy banana wall"
+negative_prompt: ""
+
+mode: "img2img"
+frame_buffer_size: 1
+use_denoising_batch: true
+use_tiny_vae: true
+acceleration: "tensorrt"
+cfg_type: "self"
+do_add_noise: false
+warmup: 10
+use_safety_checker: false
+skip_diffusion: false
+compile_engines_only: false
+build_engines_if_missing: false
+static_shapes: true
+fp8: false
+fp8_allow_fp16_fallback: false
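+builder_optimization_level: 3  # NOTE(review): the engine name in this file's header has no --optlvl3 tag, unlike the other level-3 profiles — confirm the engine/level pairing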
+
+scheduler: "lcm"
+sampler: "normal"
+
+use_cached_attn: true
+cache_maxframes: 2
+cache_interval: 1
+
+enable_similar_image_filter: false
+similar_image_filter_threshold: 0.99
+similar_image_filter_max_skip_frame: 1
+
+hf_cache: ""
+
+engine_dir: "D:/dev/SD_3_0_1/test_Install_dev/StreamDiffusion/StreamDiffusion/engines/td"  # machine-specific absolute path; point this at your local engines directory
+
+use_controlnet: false
+use_ipadapter: false
diff --git a/configs/profiling/profiling_fp16_flexible.yaml b/configs/profiling/profiling_fp16_flexible.yaml
@@ -0,0 +1,54 @@
+# Profile — FP16 Flexible (TrtProfile=Flexible: static_shapes=false + optlvl=3)
+# Matches engine: sdxl-turbo--tiny_vae-True--min_batch-1--max_batch-4--use_cached_attn-False--optlvl3--mode-img2img--trt10.16.1.11--cc89--res-512x512
+# IMPORTANT: build_engines_if_missing=false — fail if engine not present rather than rebuild
+
+model_id: "stabilityai/sdxl-turbo"
+
+t_index_list: [6, 25]
+width: 512
+height: 512
+device: "cuda"
+dtype: "float16"
+
+guidance_scale: 1.0
+num_inference_steps: 50
+seed: 3115577
+delta: 1.0
+
+prompt: "default fancy banana wall"
+negative_prompt: ""
+
+mode: "img2img"
+frame_buffer_size: 1
+use_denoising_batch: true
+use_tiny_vae: true
+acceleration: "tensorrt"
+cfg_type: "self"
+do_add_noise: false
+warmup: 10
+use_safety_checker: false
+skip_diffusion: false
+compile_engines_only: false
+build_engines_if_missing: false
+static_shapes: false
+fp8: false
+fp8_allow_fp16_fallback: false
+builder_optimization_level: 3
+
+scheduler: "lcm"
+sampler: "normal"
+
+use_cached_attn: false
+cache_maxframes: 2
+cache_interval: 1
+
+enable_similar_image_filter: false
+similar_image_filter_threshold: 0.99
+similar_image_filter_max_skip_frame: 1
+
+hf_cache: ""
+
+engine_dir: "D:/dev/SD_3_0_1/test_Install_dev/StreamDiffusion/StreamDiffusion/engines/td"  # machine-specific absolute path; point this at your local engines directory
+
+use_controlnet: false
+use_ipadapter: false
diff --git a/configs/profiling/profiling_fp16_full.yaml b/configs/profiling/profiling_fp16_full.yaml
@@ -0,0 +1,66 @@
+# Profile A — FP16 + cached_attn + ControlNet + IPAdapter (full production config)
+# Engine: sdxl-turbo--tiny_vae-True--min_batch-1--max_batch-4--tokens4--use_cached_attn-True--controlnet--optlvl3--mode-img2img--trt10.16.1.11--cc89--res-512x512
+# IMPORTANT: build_engines_if_missing=false — fail if engine not present rather than rebuild
+
+model_id: "stabilityai/sdxl-turbo"
+
+t_index_list: [16, 35]
+width: 512
+height: 512
+device: "cuda"
+dtype: "float16"
+
+guidance_scale: 1.0
+num_inference_steps: 50
+seed: 3115577
+delta: 1.0
+
+prompt: "default fancy banana wall"
+negative_prompt: ""
+
+mode: "img2img"
+frame_buffer_size: 1
+use_denoising_batch: true
+use_tiny_vae: true
+acceleration: "tensorrt"
+cfg_type: "self"
+do_add_noise: false
+warmup: 10
+use_safety_checker: false
+skip_diffusion: false
+compile_engines_only: false
+build_engines_if_missing: false
+static_shapes: true
+fp8: false
+fp8_allow_fp16_fallback: false
+builder_optimization_level: 3
+
+scheduler: "lcm"
+sampler: "normal"
+
+use_cached_attn: true
+cache_maxframes: 2
+cache_interval: 1
+
+enable_similar_image_filter: false
+similar_image_filter_threshold: 0.99
+similar_image_filter_max_skip_frame: 1
+
+hf_cache: ""
+
+engine_dir: "D:/dev/SD_3_0_1/test_Install_dev/StreamDiffusion/StreamDiffusion/engines/td"  # machine-specific absolute path; point this at your local engines directory
+
+use_controlnet: true
+controlnets:
+  - model_id: "xinsir/controlnet-canny-sdxl-1.0"
+    conditioning_scale: 0.3
+    preprocessor: "canny"
+    enabled: true
+
+use_ipadapter: true
+ipadapters:
+  - ipadapter_model_path: "h94/IP-Adapter/sdxl_models/ip-adapter_sdxl.bin"
+    image_encoder_path: "h94/IP-Adapter/sdxl_models/image_encoder"
+    scale: 0.25
+    enabled: true
+    type: regular
diff --git a/configs/profiling/profiling_fp16_plain.yaml b/configs/profiling/profiling_fp16_plain.yaml
@@ -0,0 +1,55 @@
+# Profile C (fresh) — FP16 plain, single denoising step, no cached_attn, no ControlNet, no IPAdapter
+# t_index_list=[16] gives denoising_steps_num=1 → trt_unet_batch_size=1 → compatible with batch=1 static engine
+# Engine: sdxl-turbo--tiny_vae-True--min_batch-1--max_batch-4--use_cached_attn-False--mode-img2img--trt10.16.1.11--cc89--res-512x512
+# IMPORTANT: build_engines_if_missing=false — fail if engine not present rather than rebuild
+
+model_id: "stabilityai/sdxl-turbo"
+
+t_index_list: [16]
+width: 512
+height: 512
+device: "cuda"
+dtype: "float16"
+
+guidance_scale: 1.0
+num_inference_steps: 50
+seed: 3115577
+delta: 1.0
+
+prompt: "default fancy banana wall"
+negative_prompt: ""
+
+mode: "img2img"
+frame_buffer_size: 1
+use_denoising_batch: true
+use_tiny_vae: true
+acceleration: "tensorrt"
+cfg_type: "self"
+do_add_noise: false
+warmup: 10
+use_safety_checker: false
+skip_diffusion: false
+compile_engines_only: false
+build_engines_if_missing: false
+static_shapes: true
+fp8: false
+fp8_allow_fp16_fallback: false
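+builder_optimization_level:  # empty value = YAML null; NOTE(review): other profiles set 3 here — confirm leaving it unset is intentional (the engine name in the header has no --optlvl tag)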
+
+scheduler: "lcm"
+sampler: "normal"
+
+use_cached_attn: false
+cache_maxframes: 2
+cache_interval: 1
+
+enable_similar_image_filter: false
+similar_image_filter_threshold: 0.99
+similar_image_filter_max_skip_frame: 1
+
+hf_cache: ""
+
+engine_dir: "D:/dev/SD_3_0_1/test_Install_dev/StreamDiffusion/StreamDiffusion/engines/td"  # machine-specific absolute path; point this at your local engines directory
+
+use_controlnet: false
+use_ipadapter: false
diff --git a/configs/profiling/profiling_fp8v3.yaml b/configs/profiling/profiling_fp8v3.yaml
@@ -0,0 +1,54 @@
+# Profile B — FP8v3, single denoising step (t_index_list=[16]), no cached_attn, no ControlNet, no IPAdapter
+# Engine: sdxl-turbo--tiny_vae-True--min_batch-1--max_batch-4--use_cached_attn-False--fp8v3--mode-img2img--trt10.16.1.11--cc89--res-512x512
+# IMPORTANT: build_engines_if_missing=false — fail if engine not present rather than rebuild
+
+model_id: "stabilityai/sdxl-turbo"
+
+t_index_list: [16]
+width: 512
+height: 512
+device: "cuda"
+dtype: "float16"
+
+guidance_scale: 1.0
+num_inference_steps: 50
+seed: 3115577
+delta: 1.0
+
+prompt: "default fancy banana wall"
+negative_prompt: ""
+
+mode: "img2img"
+frame_buffer_size: 1
+use_denoising_batch: true
+use_tiny_vae: true
+acceleration: "tensorrt"
+cfg_type: "self"
+do_add_noise: false
+warmup: 10
+use_safety_checker: false
+skip_diffusion: false
+compile_engines_only: false
+build_engines_if_missing: false
+static_shapes: true
+fp8: true
+fp8_allow_fp16_fallback: false
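+builder_optimization_level:  # empty value = YAML null; NOTE(review): other profiles set 3 here — confirm leaving it unset is intentional (the engine name in the header has no --optlvl tag)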
+
+scheduler: "lcm"
+sampler: "normal"
+
+use_cached_attn: false
+cache_maxframes: 2
+cache_interval: 1
+
+enable_similar_image_filter: false
+similar_image_filter_threshold: 0.99
+similar_image_filter_max_skip_frame: 1
+
+hf_cache: ""
+
+engine_dir: "D:/dev/SD_3_0_1/test_Install_dev/StreamDiffusion/StreamDiffusion/engines/td"  # machine-specific absolute path; point this at your local engines directory
+
+use_controlnet: false
+use_ipadapter: false
diff --git a/configs/profiling/profiling_quality_cn.yaml b/configs/profiling/profiling_quality_cn.yaml
@@ -0,0 +1,61 @@
+# Profile — Quality FP16 + ControlNet (canny) only (cached_attn=false, no ipadapter)
+# Matches engine: sdxl-turbo--tiny_vae-True--min_batch-1--max_batch-4--use_cached_attn-False--controlnet--optlvl3--mode-img2img--trt10.16.1.11--cc89--res-512x512
+# B2-2 isolation test: does ControlNet alone reproduce the per-frame bool-op overhead?
+# IMPORTANT: build_engines_if_missing=false — fail if engine not present rather than rebuild
+
+model_id: "stabilityai/sdxl-turbo"
+
+t_index_list: [16, 35]
+width: 512
+height: 512
+device: "cuda"
+dtype: "float16"
+
+guidance_scale: 1.0
+num_inference_steps: 50
+seed: 3115577
+delta: 1.0
+
+prompt: "default fancy banana wall"
+negative_prompt: ""
+
+mode: "img2img"
+frame_buffer_size: 1
+use_denoising_batch: true
+use_tiny_vae: true
+acceleration: "tensorrt"
+cfg_type: "self"
+do_add_noise: false
+warmup: 10
+use_safety_checker: false
+skip_diffusion: false
+compile_engines_only: false
+build_engines_if_missing: false
+static_shapes: true
+fp8: false
+fp8_allow_fp16_fallback: false
+builder_optimization_level: 3
+
+scheduler: "lcm"
+sampler: "normal"
+
+use_cached_attn: false
+cache_maxframes: 2
+cache_interval: 1
+
+enable_similar_image_filter: false
+similar_image_filter_threshold: 0.99
+similar_image_filter_max_skip_frame: 1
+
+hf_cache: ""
+
+engine_dir: "D:/dev/SD_3_0_1/test_Install_dev/StreamDiffusion/StreamDiffusion/engines/td"  # machine-specific absolute path; point this at your local engines directory
+
+use_controlnet: true
+controlnets:
+  - model_id: "xinsir/controlnet-canny-sdxl-1.0"
+    conditioning_scale: 0.3
+    preprocessor: "canny"
+    enabled: true
+
+use_ipadapter: false