Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 40 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -221,4 +221,43 @@ images/inputs/*
controlnet_test_*

# Demo uploads directory
demo/realtime-img2img/uploads/
demo/realtime-img2img/uploads/logs/
.cgw.conf

# Local Claude / session state (per-user, never committed)
.claude/

# Test-install debug dumps and snapshots
Debug/

# Sibling repo (tracked separately at dotsimulate/StreamDiffusion-installer)
StreamDiffusion-installer/

# TD-component dev copy (feeds .tox re-export, lives in dotsimulate/StreamDiffusionTD)
StreamDiffusionTD/

# TD custom processors (TD-side, not part of the StreamDiffusion pip package)
custom_processors/

# Generated by sd_installer.generate_batch_file — CUDA-variant and path specific
Install_StreamDiffusion.bat
Install_TensorRT.bat
Start_StreamDiffusion.bat

# Stray file from a previous shell redirect (pip install ...>=0.19.0)
=0.19.0

# NVIDIA Nsight profiling outputs
*.nsys-rep
*.qdrep
*.qdstrm
*.ncu-rep
profiles/
profiler_logs/
logs/ncu_*

# Per-session work log (local only, never committed; tracked separately by user)
SESSION_LOG.md

# Profiling/audit CSV exports (Nsight summaries, kernel stats — generated artifacts)
audit_reports/
54 changes: 54 additions & 0 deletions configs/profiling/profiling_fp16_cached.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# Profile D: FP16 with cached attention enabled; ControlNet and IPAdapter are off.
# Expected engine:
#   sdxl-turbo--tiny_vae-True--min_batch-1--max_batch-4--use_cached_attn-True--mode-img2img--trt10.16.1.11--cc89--res-512x512
# IMPORTANT: build_engines_if_missing is false, so a missing engine is a
# failure rather than a rebuild.
# NOTE(review): the engine name above carries no `optlvl3` segment although
# builder_optimization_level is 3 below; confirm the name matches the engine
# that is actually looked up.

# Model
model_id: 'stabilityai/sdxl-turbo'

# Timesteps, resolution, device, precision
t_index_list: [16, 35]
width: 512
height: 512
device: 'cuda'
dtype: 'float16'

# Sampling parameters
guidance_scale: 1.0
num_inference_steps: 50
seed: 3115577
delta: 1.0

# Prompts
prompt: 'default fancy banana wall'
negative_prompt: ''

# Pipeline and TensorRT settings
mode: 'img2img'
frame_buffer_size: 1
use_denoising_batch: true
use_tiny_vae: true
acceleration: 'tensorrt'
cfg_type: 'self'
do_add_noise: false
warmup: 10
use_safety_checker: false
skip_diffusion: false
compile_engines_only: false
build_engines_if_missing: false
static_shapes: true
fp8: false
fp8_allow_fp16_fallback: false
builder_optimization_level: 3

# Scheduler and sampler
scheduler: 'lcm'
sampler: 'normal'

# Cached attention (enabled for this profile)
use_cached_attn: true
cache_maxframes: 2
cache_interval: 1

# Similar-image filter (disabled)
enable_similar_image_filter: false
similar_image_filter_threshold: 0.99
similar_image_filter_max_skip_frame: 1

# Empty string: no Hugging Face cache path is given here.
hf_cache: ''

# NOTE(review): absolute, machine-specific Windows path; adjust to the local
# checkout before running on another machine.
engine_dir: 'D:/dev/SD_3_0_1/test_Install_dev/StreamDiffusion/StreamDiffusion/engines/td'

# Optional modules (both off in this profile)
use_controlnet: false
use_ipadapter: false
54 changes: 54 additions & 0 deletions configs/profiling/profiling_fp16_flexible.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# Profile: FP16 Flexible (TrtProfile=Flexible: static_shapes=false + optlvl=3).
# Matches engine:
#   sdxl-turbo--tiny_vae-True--min_batch-1--max_batch-4--use_cached_attn-False--optlvl3--mode-img2img--trt10.16.1.11--cc89--res-512x512
# IMPORTANT: build_engines_if_missing is false, so a missing engine is a
# failure rather than a rebuild.

# Model
model_id: 'stabilityai/sdxl-turbo'

# Timesteps, resolution, device, precision
# NOTE(review): [6, 25] differs from the [16, 35] used by the other two-step
# profiling configs; confirm this is intentional.
t_index_list: [6, 25]
width: 512
height: 512
device: 'cuda'
dtype: 'float16'

# Sampling parameters
guidance_scale: 1.0
num_inference_steps: 50
seed: 3115577
delta: 1.0

# Prompts
prompt: 'default fancy banana wall'
negative_prompt: ''

# Pipeline and TensorRT settings
mode: 'img2img'
frame_buffer_size: 1
use_denoising_batch: true
use_tiny_vae: true
acceleration: 'tensorrt'
cfg_type: 'self'
do_add_noise: false
warmup: 10
use_safety_checker: false
skip_diffusion: false
compile_engines_only: false
build_engines_if_missing: false
# static_shapes=false together with optimization level 3 is what the header
# calls the "Flexible" profile.
static_shapes: false
fp8: false
fp8_allow_fp16_fallback: false
builder_optimization_level: 3

# Scheduler and sampler
scheduler: 'lcm'
sampler: 'normal'

# Cached attention (disabled for this profile)
use_cached_attn: false
cache_maxframes: 2
cache_interval: 1

# Similar-image filter (disabled)
enable_similar_image_filter: false
similar_image_filter_threshold: 0.99
similar_image_filter_max_skip_frame: 1

# Empty string: no Hugging Face cache path is given here.
hf_cache: ''

# NOTE(review): absolute, machine-specific Windows path; adjust to the local
# checkout before running on another machine.
engine_dir: 'D:/dev/SD_3_0_1/test_Install_dev/StreamDiffusion/StreamDiffusion/engines/td'

# Optional modules (both off in this profile)
use_controlnet: false
use_ipadapter: false
66 changes: 66 additions & 0 deletions configs/profiling/profiling_fp16_full.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
# Profile A — FP16 + cached_attn + ControlNet + IPAdapter (full production config)
# Engine: sdxl-turbo--tiny_vae-True--min_batch-1--max_batch-4--tokens4--use_cached_attn-True--controlnet--optlvl3--mode-img2img--trt10.16.1.11--cc89--res-512x512
# IMPORTANT: build_engines_if_missing=false — fail if engine not present rather than rebuild

model_id: "stabilityai/sdxl-turbo"

# Timesteps, resolution, device, precision
t_index_list: [16, 35]
width: 512
height: 512
device: "cuda"
dtype: "float16"

# Sampling parameters
guidance_scale: 1.0
num_inference_steps: 50
seed: 3115577
delta: 1.0

# Prompts
prompt: "default fancy banana wall"
negative_prompt: ""

# Pipeline and TensorRT settings
mode: "img2img"
frame_buffer_size: 1
use_denoising_batch: true
use_tiny_vae: true
acceleration: "tensorrt"
cfg_type: "self"
do_add_noise: false
warmup: 10
use_safety_checker: false
skip_diffusion: false
compile_engines_only: false
build_engines_if_missing: false
static_shapes: true
fp8: false
fp8_allow_fp16_fallback: false
builder_optimization_level: 3

scheduler: "lcm"
sampler: "normal"

# Cached attention (enabled for this profile)
use_cached_attn: true
cache_maxframes: 2
cache_interval: 1

# Similar-image filter (disabled)
enable_similar_image_filter: false
similar_image_filter_threshold: 0.99
similar_image_filter_max_skip_frame: 1

hf_cache: ""

# NOTE(review): absolute, machine-specific Windows path; adjust to the local
# checkout before running on another machine.
engine_dir: "D:/dev/SD_3_0_1/test_Install_dev/StreamDiffusion/StreamDiffusion/engines/td"

# ControlNet: one canny entry. The entry's fields must be indented under the
# list item; at column 0 they would parse as top-level keys instead.
use_controlnet: true
controlnets:
  - model_id: "xinsir/controlnet-canny-sdxl-1.0"
    conditioning_scale: 0.3
    preprocessor: "canny"
    enabled: true

# IPAdapter: one entry. Same nesting rule as above; keeping `enabled` inside
# each list item also avoids a duplicate top-level `enabled` key.
use_ipadapter: true
ipadapters:
  - ipadapter_model_path: "h94/IP-Adapter/sdxl_models/ip-adapter_sdxl.bin"
    image_encoder_path: "h94/IP-Adapter/sdxl_models/image_encoder"
    scale: 0.25
    enabled: true
    type: "regular"
55 changes: 55 additions & 0 deletions configs/profiling/profiling_fp16_plain.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
# Profile C (fresh) — FP16 plain, single denoising step, no cached_attn, no ControlNet, no IPAdapter
# t_index_list=[16] gives denoising_steps_num=1 → trt_unet_batch_size=1 → compatible with batch=1 static engine
# Engine: sdxl-turbo--tiny_vae-True--min_batch-1--max_batch-4--use_cached_attn-False--mode-img2img--trt10.16.1.11--cc89--res-512x512
# IMPORTANT: build_engines_if_missing=false — fail if engine not present rather than rebuild

model_id: "stabilityai/sdxl-turbo"

# Timesteps, resolution, device, precision
t_index_list: [16]
width: 512
height: 512
device: "cuda"
dtype: "float16"

# Sampling parameters
guidance_scale: 1.0
num_inference_steps: 50
seed: 3115577
delta: 1.0

# Prompts
prompt: "default fancy banana wall"
negative_prompt: ""

# Pipeline and TensorRT settings
mode: "img2img"
frame_buffer_size: 1
use_denoising_batch: true
use_tiny_vae: true
acceleration: "tensorrt"
cfg_type: "self"
do_add_noise: false
warmup: 10
use_safety_checker: false
skip_diffusion: false
compile_engines_only: false
build_engines_if_missing: false
static_shapes: true
fp8: false
fp8_allow_fp16_fallback: false
# Explicit null: this was a bare empty value, which YAML also loads as null,
# so the parsed config is unchanged.
# NOTE(review): the profiles whose engine names contain `optlvl3` set this to
# 3; the engine name above has no such segment — confirm null is intended.
builder_optimization_level: null

scheduler: "lcm"
sampler: "normal"

# Cached attention (disabled for this profile)
use_cached_attn: false
cache_maxframes: 2
cache_interval: 1

# Similar-image filter (disabled)
enable_similar_image_filter: false
similar_image_filter_threshold: 0.99
similar_image_filter_max_skip_frame: 1

hf_cache: ""

# NOTE(review): absolute, machine-specific Windows path; adjust to the local
# checkout before running on another machine.
engine_dir: "D:/dev/SD_3_0_1/test_Install_dev/StreamDiffusion/StreamDiffusion/engines/td"

# Optional modules (both off in this profile)
use_controlnet: false
use_ipadapter: false
54 changes: 54 additions & 0 deletions configs/profiling/profiling_fp8v3.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# Profile B — FP8v3, no cached_attn, no ControlNet, no IPAdapter
# Engine: sdxl-turbo--tiny_vae-True--min_batch-1--max_batch-4--use_cached_attn-False--fp8v3--mode-img2img--trt10.16.1.11--cc89--res-512x512
# IMPORTANT: build_engines_if_missing=false — fail if engine not present rather than rebuild

model_id: "stabilityai/sdxl-turbo"

# Timesteps, resolution, device, precision
t_index_list: [16]
width: 512
height: 512
device: "cuda"
dtype: "float16"

# Sampling parameters
guidance_scale: 1.0
num_inference_steps: 50
seed: 3115577
delta: 1.0

# Prompts
prompt: "default fancy banana wall"
negative_prompt: ""

# Pipeline and TensorRT settings
mode: "img2img"
frame_buffer_size: 1
use_denoising_batch: true
use_tiny_vae: true
acceleration: "tensorrt"
cfg_type: "self"
do_add_noise: false
warmup: 10
use_safety_checker: false
skip_diffusion: false
compile_engines_only: false
build_engines_if_missing: false
static_shapes: true
# FP8 is on for this profile, with the FP16 fallback flag off.
fp8: true
fp8_allow_fp16_fallback: false
# Explicit null: this was a bare empty value, which YAML also loads as null,
# so the parsed config is unchanged.
# NOTE(review): the profiles whose engine names contain `optlvl3` set this to
# 3; the engine name above has no such segment — confirm null is intended.
builder_optimization_level: null

scheduler: "lcm"
sampler: "normal"

# Cached attention (disabled for this profile)
use_cached_attn: false
cache_maxframes: 2
cache_interval: 1

# Similar-image filter (disabled)
enable_similar_image_filter: false
similar_image_filter_threshold: 0.99
similar_image_filter_max_skip_frame: 1

hf_cache: ""

# NOTE(review): absolute, machine-specific Windows path; adjust to the local
# checkout before running on another machine.
engine_dir: "D:/dev/SD_3_0_1/test_Install_dev/StreamDiffusion/StreamDiffusion/engines/td"

# Optional modules (both off in this profile)
use_controlnet: false
use_ipadapter: false
61 changes: 61 additions & 0 deletions configs/profiling/profiling_quality_cn.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
# Profile — Quality FP16 + ControlNet (canny) only (cached_attn=false, no ipadapter)
# Matches engine: sdxl-turbo--tiny_vae-True--min_batch-1--max_batch-4--use_cached_attn-False--controlnet--optlvl3--mode-img2img--trt10.16.1.11--cc89--res-512x512
# B2-2 isolation test: does ControlNet alone reproduce the per-frame bool-op overhead?
# IMPORTANT: build_engines_if_missing=false — fail if engine not present rather than rebuild

model_id: "stabilityai/sdxl-turbo"

# Timesteps, resolution, device, precision
t_index_list: [16, 35]
width: 512
height: 512
device: "cuda"
dtype: "float16"

# Sampling parameters
guidance_scale: 1.0
num_inference_steps: 50
seed: 3115577
delta: 1.0

# Prompts
prompt: "default fancy banana wall"
negative_prompt: ""

# Pipeline and TensorRT settings
mode: "img2img"
frame_buffer_size: 1
use_denoising_batch: true
use_tiny_vae: true
acceleration: "tensorrt"
cfg_type: "self"
do_add_noise: false
warmup: 10
use_safety_checker: false
skip_diffusion: false
compile_engines_only: false
build_engines_if_missing: false
static_shapes: true
fp8: false
fp8_allow_fp16_fallback: false
builder_optimization_level: 3

scheduler: "lcm"
sampler: "normal"

# Cached attention (disabled for this profile)
use_cached_attn: false
cache_maxframes: 2
cache_interval: 1

# Similar-image filter (disabled)
enable_similar_image_filter: false
similar_image_filter_threshold: 0.99
similar_image_filter_max_skip_frame: 1

hf_cache: ""

# NOTE(review): absolute, machine-specific Windows path; adjust to the local
# checkout before running on another machine.
engine_dir: "D:/dev/SD_3_0_1/test_Install_dev/StreamDiffusion/StreamDiffusion/engines/td"

# ControlNet: one canny entry. The entry's fields must be indented under the
# list item; at column 0 they would parse as top-level keys instead.
use_controlnet: true
controlnets:
  - model_id: "xinsir/controlnet-canny-sdxl-1.0"
    conditioning_scale: 0.3
    preprocessor: "canny"
    enabled: true

use_ipadapter: false
Loading