Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions .gitattributes
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Normalize all text files to LF in the repository
* text=auto eol=lf

# Explicitly mark binary files
*.png binary
*.jpg binary
*.jpeg binary
*.gif binary
*.ico binary
*.pdf binary
*.zip binary
*.gz binary
*.tar binary
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
.claude
venv
__pycache__
model_cache
logs
984 changes: 492 additions & 492 deletions README.md

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions activate_environment.bat
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
%~dp0venv\Scripts\activate.bat
1,578 changes: 789 additions & 789 deletions config.py

Large diffs are not rendered by default.

175 changes: 38 additions & 137 deletions config.yaml
Original file line number Diff line number Diff line change
@@ -1,137 +1,38 @@
# -----------------------------------------------------------------------------
# Kitten TTS Server Configuration File (config.yaml)
#
# This file controls all the settings for the server.
# Changes to sections like 'server', 'tts_engine', or 'paths'
# typically require a server restart to take effect.
# -----------------------------------------------------------------------------

# --- Server Settings ---
# Controls the web server's network behavior, security, and logging.
server:
# The IP address for the server to listen on.
# - "0.0.0.0": Makes the server accessible from other devices on your network. (Recommended for Docker)
# - "127.0.0.1" or "localhost": The server will only be accessible from your own computer.
host: 0.0.0.0

# The network port the server will run on.
# If you get a "port already in use" error, change this to another number (e.g., 8006).
port: 8005

# --- Logging Configuration ---
# Path to the server's log file, relative to the project root directory.
log_file_path: logs/tts_server.log

# The maximum size of a single log file in megabytes (MB) before it is rotated.
# This prevents log files from growing indefinitely.
log_file_max_size_mb: 10

# The number of old log files to keep as backups.
# For example, if this is 5, you will have 'tts_server.log' and up to 5 older backups.
log_file_backup_count: 5

# --- Model Settings ---
# Specifies the core AI model to be used by the TTS engine.
model:
# The repository ID of the model on the Hugging Face Hub.
# You can change this to use a different compatible ONNX model in the future.
repo_id: KittenML/kitten-tts-nano-0.1

# --- TTS Engine Settings ---
# Configures the hardware and core settings for the speech synthesis engine.
tts_engine:
# Determines which hardware to use for inference. This is a critical performance setting.
# Valid options: "auto", "cuda", "gpu", "cpu"
# - "auto": (Recommended) Automatically uses an NVIDIA GPU if one is detected, otherwise falls back to the CPU.
# - "cuda" or "gpu": Explicitly forces the use of an NVIDIA GPU. The server will fail to start if one is not available.
# - "cpu": Forces the use of the CPU, even if a powerful GPU is present.
device: auto

# --- File Path Settings ---
# Defines where the application should store various files.
paths:
# The directory where downloaded model files will be cached.
model_cache: model_cache

# The default directory where generated audio files will be saved.
output: outputs

# --- Default Generation Parameters ---
# Default values for the speech generation process. These can be overridden in the UI or via API calls.
generation_defaults:
# The default speed of the generated speech.
# 1.0 is normal speed. > 1.0 is faster, < 1.0 is slower.
speed: 1.1

# Default language for the phonemizer. The current model is trained for English.
language: en

# This is a legacy/duplicate setting. Please use the 'speed' setting above.
speed_factor: 1.1

# --- Audio Output Settings ---
# Controls the format and quality of the final audio file.
audio_output:
# The default audio format for the output file.
# - "wav": Highest quality, uncompressed, large file size. Best for processing.
# - "mp3": Good quality, compressed, small file size. Best for sharing and listening.
# - "opus": Excellent quality, highly compressed, smallest file size. Best for streaming/web.
format: wav

# The sample rate of the output audio in Hz.
# 24000 is the native sample rate of the KittenTTS model.
# Higher values (e.g., 48000) will resample the audio but won't add more detail.
sample_rate: 24000

# --- UI State Persistence ---
# Saves the state of the web interface between sessions so you don't lose your work.
ui_state:
# The last text that was entered into the main text box.
last_text: 'The solar system consists of the Sun and the astronomical objects gravitationally
bound in orbit around it.

Mars, often called the Red Planet, is the fourth planet from the Sun. It is a
terrestrial planet with a thin atmosphere, having surface features reminiscent
both of the impact craters of the Moon and the volcanoes, valleys, deserts, and
polar ice caps of Earth.

'
# The ID of the last voice you selected from the dropdown.
last_voice: expr-voice-2-m
# The last value you set for the "Chunk Size" slider, used for large text processing.
last_chunk_size: 200

# Remembers whether the "Split text into chunks" checkbox was enabled.
last_split_text_enabled: true

# Set to 'true' to permanently hide the one-time warning about voice consistency when using chunking.
hide_chunk_warning: false

# Set to 'true' to permanently hide the one-time general notice about generation quality.
hide_generation_warning: false

# The theme for the web interface. Options: "dark", "light".
theme: dark

# --- General UI Settings ---
# Controls the appearance and static elements of the web interface.
ui:
# The title that appears in the browser tab.
title: Kitten TTS Server

# Controls whether the language selection dropdown is visible in the UI.
# Set to 'false' to hide it if you only ever use one language.
show_language_select: true

# (For future use) If you had hundreds of voices, this would limit the number shown in the dropdown to prevent UI lag.
max_predefined_voices_in_dropdown: 50

# --- Debugging Settings ---
# Tools for troubleshooting and development.
debug:
# If 'true' and text chunking is enabled, the server will save each individual audio chunk
# as a separate file in the 'outputs' directory before they are stitched together.
# This is extremely useful for diagnosing issues with a specific part of a long text.
save_intermediate_audio: false


server:
host: 0.0.0.0
port: 8005
use_ngrok: false
use_auth: false
auth_username: user
auth_password: password
log_file_path: logs/tts_server.log
log_file_max_size_mb: 10
log_file_backup_count: 5
model:
repo_id: KittenML/kitten-tts-mini-0.8
tts_engine:
device: cuda
paths:
model_cache: model_cache
output: outputs
generation_defaults:
speed: 1
language: en
speed_factor: 1.1
audio_output:
format: ogg
sample_rate: 24000
ui_state:
last_text: "You don't have any new unread emails."
last_voice: expr-voice-2-m
last_chunk_size: 200
last_split_text_enabled: true
hide_chunk_warning: false
hide_generation_warning: true
theme: dark
ui:
title: Kitten TTS Server
show_language_select: true
max_predefined_voices_in_dropdown: 50
debug:
save_intermediate_audio: false
56 changes: 28 additions & 28 deletions docker-compose-cpu.yml
Original file line number Diff line number Diff line change
@@ -1,29 +1,29 @@
version: '3.8'
services:
kitten-tts-server:
build:
context: .
dockerfile: Dockerfile
args:
# This build argument ensures only CPU dependencies are installed
- RUNTIME=cpu
ports:
- "${PORT:-8005}:8005"
volumes:
# Mount local config file for persistence
- ./config.yaml:/app/config.yaml
# Mount local directories for persistent app data
- ./outputs:/app/outputs
- ./logs:/app/logs
# Named volume for Hugging Face model cache to persist across container rebuilds
- hf_cache:/app/hf_cache
restart: unless-stopped
environment:
# Enable faster Hugging Face downloads inside the container
- HF_HUB_ENABLE_HF_TRANSFER=1
# Define the named volume for the Hugging Face cache
volumes:
version: '3.8'

services:
kitten-tts-server:
build:
context: .
dockerfile: Dockerfile
args:
# This build argument ensures only CPU dependencies are installed
- RUNTIME=cpu
ports:
- "${PORT:-8005}:8005"
volumes:
# Mount local config file for persistence
- ./config.yaml:/app/config.yaml
# Mount local directories for persistent app data
- ./outputs:/app/outputs
- ./logs:/app/logs
# Named volume for Hugging Face model cache to persist across container rebuilds
- hf_cache:/app/hf_cache

restart: unless-stopped
environment:
# Enable faster Hugging Face downloads inside the container
- HF_HUB_ENABLE_HF_TRANSFER=1

# Define the named volume for the Hugging Face cache
volumes:
hf_cache:
96 changes: 48 additions & 48 deletions docker-compose.yml
Original file line number Diff line number Diff line change
@@ -1,49 +1,49 @@
version: '3.8'
services:
kitten-tts-server:
build:
args:
# Can be nvidia or cpu; Default is Nvidia
- RUNTIME=nvidia
context: .
dockerfile: Dockerfile
ports:
- "${PORT:-8005}:8005"
volumes:
# Mount local config file for persistence
- ./config.yaml:/app/config.yaml
# Mount local directories for persistent app data
- ./outputs:/app/outputs
- ./logs:/app/logs
# Named volume for Hugging Face model cache to persist across container rebuilds
- hf_cache:/app/hf_cache
# --- GPU Support (NVIDIA) ---
# The 'deploy' key is the modern way to request GPU resources.
# If you get a 'CDI device injection failed' error, comment out the 'deploy' section
# and uncomment the 'runtime: nvidia' line below.
# Method 1: Modern Docker Compose (Recommended)
deploy:
resources:
reservations:
devices:
- driver: nvidia
count: 1
capabilities: [gpu]
# Method 2: Legacy Docker Compose (for older setups)
# runtime: nvidia
restart: unless-stopped
environment:
# Enable faster Hugging Face downloads inside the container
- HF_HUB_ENABLE_HF_TRANSFER=1
# Make NVIDIA GPUs visible and specify capabilities for PyTorch
- NVIDIA_VISIBLE_DEVICES=all
- NVIDIA_DRIVER_CAPABILITIES=compute,utility
# Define the named volume for the Hugging Face cache
volumes:
version: '3.8'

services:
kitten-tts-server:
build:
args:
# Can be nvidia or cpu; Default is Nvidia
- RUNTIME=nvidia
context: .
dockerfile: Dockerfile
ports:
- "${PORT:-8005}:8005"
volumes:
# Mount local config file for persistence
- ./config.yaml:/app/config.yaml
# Mount local directories for persistent app data
- ./outputs:/app/outputs
- ./logs:/app/logs
# Named volume for Hugging Face model cache to persist across container rebuilds
- hf_cache:/app/hf_cache

# --- GPU Support (NVIDIA) ---
# The 'deploy' key is the modern way to request GPU resources.
# If you get a 'CDI device injection failed' error, comment out the 'deploy' section
# and uncomment the 'runtime: nvidia' line below.

# Method 1: Modern Docker Compose (Recommended)
deploy:
resources:
reservations:
devices:
- driver: nvidia
count: 1
capabilities: [gpu]

# Method 2: Legacy Docker Compose (for older setups)
# runtime: nvidia

restart: unless-stopped
environment:
# Enable faster Hugging Face downloads inside the container
- HF_HUB_ENABLE_HF_TRANSFER=1
# Make NVIDIA GPUs visible and specify capabilities for PyTorch
- NVIDIA_VISIBLE_DEVICES=all
- NVIDIA_DRIVER_CAPABILITIES=compute,utility

# Define the named volume for the Hugging Face cache
volumes:
hf_cache:
Loading