Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions .gitattributes
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Normalize all text files to LF in the repository
* text=auto eol=lf

# Explicitly mark binary files
*.png binary
*.jpg binary
*.jpeg binary
*.gif binary
*.ico binary
*.pdf binary
*.zip binary
*.gz binary
*.tar binary
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
.claude
venv
__pycache__
model_cache
logs
984 changes: 492 additions & 492 deletions README.md

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions activate_environment.bat
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
%~dp0venv\Scripts\activate.bat
1,578 changes: 789 additions & 789 deletions config.py

Large diffs are not rendered by default.

175 changes: 38 additions & 137 deletions config.yaml
Original file line number Diff line number Diff line change
@@ -1,137 +1,38 @@
# -----------------------------------------------------------------------------
# Kitten TTS Server Configuration File (config.yaml)
#
# This file controls all the settings for the server.
# Changes to sections like 'server', 'tts_engine', or 'paths'
# typically require a server restart to take effect.
# -----------------------------------------------------------------------------

# --- Server Settings ---
# Controls the web server's network behavior, security, and logging.
server:
# The IP address for the server to listen on.
# - "0.0.0.0": Makes the server accessible from other devices on your network. (Recommended for Docker)
# - "127.0.0.1" or "localhost": The server will only be accessible from your own computer.
host: 0.0.0.0

# The network port the server will run on.
# If you get a "port already in use" error, change this to another number (e.g., 8006).
port: 8005

# --- Logging Configuration ---
# Path to the server's log file, relative to the project root directory.
log_file_path: logs/tts_server.log

# The maximum size of a single log file in megabytes (MB) before it is rotated.
# This prevents log files from growing indefinitely.
log_file_max_size_mb: 10

# The number of old log files to keep as backups.
# For example, if this is 5, you will have 'tts_server.log' and up to 5 older backups.
log_file_backup_count: 5

# --- Model Settings ---
# Specifies the core AI model to be used by the TTS engine.
model:
# The repository ID of the model on the Hugging Face Hub.
# You can change this to use a different compatible ONNX model in the future.
repo_id: KittenML/kitten-tts-nano-0.1

# --- TTS Engine Settings ---
# Configures the hardware and core settings for the speech synthesis engine.
tts_engine:
# Determines which hardware to use for inference. This is a critical performance setting.
# Valid options: "auto", "cuda", "gpu", "cpu"
# - "auto": (Recommended) Automatically uses an NVIDIA GPU if one is detected, otherwise falls back to the CPU.
# - "cuda" or "gpu": Explicitly forces the use of an NVIDIA GPU. The server will fail to start if one is not available.
# - "cpu": Forces the use of the CPU, even if a powerful GPU is present.
device: auto

# --- File Path Settings ---
# Defines where the application should store various files.
paths:
# The directory where downloaded model files will be cached.
model_cache: model_cache

# The default directory where generated audio files will be saved.
output: outputs

# --- Default Generation Parameters ---
# Default values for the speech generation process. These can be overridden in the UI or via API calls.
generation_defaults:
# The default speed of the generated speech.
# 1.0 is normal speed. > 1.0 is faster, < 1.0 is slower.
speed: 1.1

# Default language for the phonemizer. The current model is trained for English.
language: en

# This is a legacy/duplicate setting. Please use the 'speed' setting above.
speed_factor: 1.1

# --- Audio Output Settings ---
# Controls the format and quality of the final audio file.
audio_output:
# The default audio format for the output file.
# - "wav": Highest quality, uncompressed, large file size. Best for processing.
# - "mp3": Good quality, compressed, small file size. Best for sharing and listening.
# - "opus": Excellent quality, highly compressed, smallest file size. Best for streaming/web.
format: wav

# The sample rate of the output audio in Hz.
# 24000 is the native sample rate of the KittenTTS model.
# Higher values (e.g., 48000) will resample the audio but won't add more detail.
sample_rate: 24000

# --- UI State Persistence ---
# Saves the state of the web interface between sessions so you don't lose your work.
ui_state:
# The last text that was entered into the main text box.
last_text: 'The solar system consists of the Sun and the astronomical objects gravitationally
bound in orbit around it.

Mars, often called the Red Planet, is the fourth planet from the Sun. It is a
terrestrial planet with a thin atmosphere, having surface features reminiscent
both of the impact craters of the Moon and the volcanoes, valleys, deserts, and
polar ice caps of Earth.

'
# The ID of the last voice you selected from the dropdown.
last_voice: expr-voice-2-m
# The last value you set for the "Chunk Size" slider, used for large text processing.
last_chunk_size: 200

# Remembers whether the "Split text into chunks" checkbox was enabled.
last_split_text_enabled: true

# Set to 'true' to permanently hide the one-time warning about voice consistency when using chunking.
hide_chunk_warning: false

# Set to 'true' to permanently hide the one-time general notice about generation quality.
hide_generation_warning: false

# The theme for the web interface. Options: "dark", "light".
theme: dark

# --- General UI Settings ---
# Controls the appearance and static elements of the web interface.
ui:
# The title that appears in the browser tab.
title: Kitten TTS Server

# Controls whether the language selection dropdown is visible in the UI.
# Set to 'false' to hide it if you only ever use one language.
show_language_select: true

# (For future use) If you had hundreds of voices, this would limit the number shown in the dropdown to prevent UI lag.
max_predefined_voices_in_dropdown: 50

# --- Debugging Settings ---
# Tools for troubleshooting and development.
debug:
# If 'true' and text chunking is enabled, the server will save each individual audio chunk
# as a separate file in the 'outputs' directory before they are stitched together.
# This is extremely useful for diagnosing issues with a specific part of a long text.
save_intermediate_audio: false


server:
host: 0.0.0.0
port: 8005
use_ngrok: false
use_auth: false
auth_username: user
auth_password: password
log_file_path: logs/tts_server.log
log_file_max_size_mb: 10
log_file_backup_count: 5
model:
repo_id: KittenML/kitten-tts-mini-0.8
tts_engine:
device: cuda
paths:
model_cache: model_cache
output: outputs
generation_defaults:
speed: 1
language: en
speed_factor: 1.1
audio_output:
format: ogg
sample_rate: 24000
ui_state:
last_text: "You don't have any new unread emails."
last_voice: expr-voice-2-m
last_chunk_size: 200
last_split_text_enabled: true
hide_chunk_warning: false
hide_generation_warning: true
theme: dark
ui:
title: Kitten TTS Server
show_language_select: true
max_predefined_voices_in_dropdown: 50
debug:
save_intermediate_audio: false
56 changes: 28 additions & 28 deletions docker-compose-cpu.yml
Original file line number Diff line number Diff line change
@@ -1,29 +1,29 @@
version: '3.8'
services:
kitten-tts-server:
build:
context: .
dockerfile: Dockerfile
args:
# This build argument ensures only CPU dependencies are installed
- RUNTIME=cpu
ports:
- "${PORT:-8005}:8005"
volumes:
# Mount local config file for persistence
- ./config.yaml:/app/config.yaml
# Mount local directories for persistent app data
- ./outputs:/app/outputs
- ./logs:/app/logs
# Named volume for Hugging Face model cache to persist across container rebuilds
- hf_cache:/app/hf_cache
restart: unless-stopped
environment:
# Enable faster Hugging Face downloads inside the container
- HF_HUB_ENABLE_HF_TRANSFER=1
# Define the named volume for the Hugging Face cache
volumes:
version: '3.8'

services:
kitten-tts-server:
build:
context: .
dockerfile: Dockerfile
args:
# This build argument ensures only CPU dependencies are installed
- RUNTIME=cpu
ports:
- "${PORT:-8005}:8005"
volumes:
# Mount local config file for persistence
- ./config.yaml:/app/config.yaml
# Mount local directories for persistent app data
- ./outputs:/app/outputs
- ./logs:/app/logs
# Named volume for Hugging Face model cache to persist across container rebuilds
- hf_cache:/app/hf_cache

restart: unless-stopped
environment:
# Enable faster Hugging Face downloads inside the container
- HF_HUB_ENABLE_HF_TRANSFER=1

# Define the named volume for the Hugging Face cache
volumes:
hf_cache:
96 changes: 48 additions & 48 deletions docker-compose.yml
Original file line number Diff line number Diff line change
@@ -1,49 +1,49 @@
version: '3.8'
services:
kitten-tts-server:
build:
args:
# Can be nvidia or cpu; Default is Nvidia
- RUNTIME=nvidia
context: .
dockerfile: Dockerfile
ports:
- "${PORT:-8005}:8005"
volumes:
# Mount local config file for persistence
- ./config.yaml:/app/config.yaml
# Mount local directories for persistent app data
- ./outputs:/app/outputs
- ./logs:/app/logs
# Named volume for Hugging Face model cache to persist across container rebuilds
- hf_cache:/app/hf_cache
# --- GPU Support (NVIDIA) ---
# The 'deploy' key is the modern way to request GPU resources.
# If you get a 'CDI device injection failed' error, comment out the 'deploy' section
# and uncomment the 'runtime: nvidia' line below.
# Method 1: Modern Docker Compose (Recommended)
deploy:
resources:
reservations:
devices:
- driver: nvidia
count: 1
capabilities: [gpu]
# Method 2: Legacy Docker Compose (for older setups)
# runtime: nvidia
restart: unless-stopped
environment:
# Enable faster Hugging Face downloads inside the container
- HF_HUB_ENABLE_HF_TRANSFER=1
# Make NVIDIA GPUs visible and specify capabilities for PyTorch
- NVIDIA_VISIBLE_DEVICES=all
- NVIDIA_DRIVER_CAPABILITIES=compute,utility
# Define the named volume for the Hugging Face cache
volumes:
version: '3.8'

services:
kitten-tts-server:
build:
args:
# Can be nvidia or cpu; Default is Nvidia
- RUNTIME=nvidia
context: .
dockerfile: Dockerfile
ports:
- "${PORT:-8005}:8005"
volumes:
# Mount local config file for persistence
- ./config.yaml:/app/config.yaml
# Mount local directories for persistent app data
- ./outputs:/app/outputs
- ./logs:/app/logs
# Named volume for Hugging Face model cache to persist across container rebuilds
- hf_cache:/app/hf_cache

# --- GPU Support (NVIDIA) ---
# The 'deploy' key is the modern way to request GPU resources.
# If you get a 'CDI device injection failed' error, comment out the 'deploy' section
# and uncomment the 'runtime: nvidia' line below.

# Method 1: Modern Docker Compose (Recommended)
deploy:
resources:
reservations:
devices:
- driver: nvidia
count: 1
capabilities: [gpu]

# Method 2: Legacy Docker Compose (for older setups)
# runtime: nvidia

restart: unless-stopped
environment:
# Enable faster Hugging Face downloads inside the container
- HF_HUB_ENABLE_HF_TRANSFER=1
# Make NVIDIA GPUs visible and specify capabilities for PyTorch
- NVIDIA_VISIBLE_DEVICES=all
- NVIDIA_DRIVER_CAPABILITIES=compute,utility

# Define the named volume for the Hugging Face cache
volumes:
hf_cache:
Loading