diff --git a/.gitignore b/.gitignore
index 194578f..a289e30 100644
--- a/.gitignore
+++ b/.gitignore
@@ -19,6 +19,10 @@ models/kokoro/*
# Ignore audio files
static/audio_cache/*
+# Ignore PDF position caches
+users/**/.pdf_cache/
+**/.pdf_cache/
+
# Ignore build dirs
.flatpak-builder
build-dir
diff --git a/.idea/.gitignore b/.idea/.gitignore
new file mode 100644
index 0000000..13566b8
--- /dev/null
+++ b/.idea/.gitignore
@@ -0,0 +1,8 @@
+# Default ignored files
+/shelf/
+/workspace.xml
+# Editor-based HTTP Client requests
+/httpRequests/
+# Datasource local storage ignored files
+/dataSources/
+/dataSources.local.xml
diff --git a/.idea/OpenWebTTS.iml b/.idea/OpenWebTTS.iml
new file mode 100644
index 0000000..a28a3c0
--- /dev/null
+++ b/.idea/OpenWebTTS.iml
@@ -0,0 +1,21 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml
new file mode 100644
index 0000000..981364f
--- /dev/null
+++ b/.idea/inspectionProfiles/Project_Default.xml
@@ -0,0 +1,15 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml
new file mode 100644
index 0000000..105ce2d
--- /dev/null
+++ b/.idea/inspectionProfiles/profiles_settings.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
new file mode 100644
index 0000000..d75895f
--- /dev/null
+++ b/.idea/misc.xml
@@ -0,0 +1,4 @@
+
+
+
+
\ No newline at end of file
diff --git a/.idea/modules.xml b/.idea/modules.xml
new file mode 100644
index 0000000..19bf7ae
--- /dev/null
+++ b/.idea/modules.xml
@@ -0,0 +1,8 @@
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
new file mode 100644
index 0000000..94a25f7
--- /dev/null
+++ b/.idea/vcs.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/BUILD.md b/BUILD.md
index 2894920..264a9e8 100644
--- a/BUILD.md
+++ b/BUILD.md
@@ -50,6 +50,10 @@ OpenWebTTS/
2. `pip` for managing dependencies (usually comes with Python).
3. `espeak-ng` for fallback.
4. `ffmpeg` or `libav` for audio processing.
+5. **Tesseract OCR** for PDF text extraction (scanned PDFs). [Download for Windows](https://github.com/UB-Mannheim/tesseract/wiki)
+6. **Poppler** for PDF rendering and OCR preprocessing. [Download for Windows](https://github.com/oschwartz10612/poppler-windows/releases/)
+ - After installation, add Poppler's `bin` directory to your system PATH
+ - Or use Chocolatey: `choco install poppler` (requires admin)
> **Note:** Other Python versions might not be fully compatible due to dependencies. Later version might work, but use at your own risk.
diff --git a/Dockerfile b/Dockerfile
index aada43a..4729eef 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -10,8 +10,14 @@ RUN apt-get update && apt-get install -y \
ffmpeg \
pkg-config \
espeak-ng \
+ curl \
&& rm -rf /var/lib/apt/lists/*
+# Install Node.js and npm
+RUN curl -fsSL https://deb.nodesource.com/setup_20.x | bash - && \
+ apt-get install -y nodejs && \
+ rm -rf /var/lib/apt/lists/*
+
# Set working directory
WORKDIR /app
@@ -19,7 +25,7 @@ WORKDIR /app
COPY requirements.txt .
# Install Python dependencies
-RUN pip install --no-cache-dir -r requirements.txt
+RUN pip install -r requirements.txt
RUN pip install https://github.com/KittenML/KittenTTS/releases/download/0.1/kittentts-0.1.0-py3-none-any.whl
RUN pip install https://github.com/rsxdalv/chatterbox/releases/download/v0.4.4/tts_webui_chatterbox_tts-0.4.4-py3-none-any.whl
diff --git a/README.md b/README.md
index 2186854..65f2b68 100644
--- a/README.md
+++ b/README.md
@@ -26,6 +26,33 @@ Features marked with an `*` are *paid* on other platforms!
See `BUILD.md` for detailed instructions. If you know what you're doing: clone the repo, install Python dependencies with a venv and build with `npm`.
+## Browser Extension 🔊
+
+OpenWebTTS now includes browser extensions for **Chrome** and **Firefox** that let you read any webpage aloud with word-by-word highlighting!
+
+### Features:
+- 📖 Read entire webpages or just selected text
+- 🎯 Real-time word-by-word highlighting as text is spoken
+- 🎨 Customizable highlight colors (yellow, green, blue, pink, orange)
+- ⚡ Adjustable reading speed (0.5x to 2.0x)
+- 🔄 Auto-scroll to keep reading position visible
+- 🎭 Support for all OpenWebTTS voice engines
+
+### Quick Install:
+
+1. **Start the backend:**
+ ```bash
+ python app.py
+ ```
+
+2. **Load extension:**
+ - **Chrome:** Navigate to `chrome://extensions/`, enable Developer mode, click "Load unpacked", select `browser-extension/` folder
+ - **Firefox:** Navigate to `about:debugging#/runtime/this-firefox`, click "Load Temporary Add-on", select `browser-extension/manifest_firefox.json`
+
+3. **Start reading:** Click the extension icon, navigate to any webpage, and click "📖 Read Page"!
+
+See [browser-extension/README.md](browser-extension/README.md) for complete installation guide and features.
+
## Using TTS models
### Piper
diff --git a/app.py b/app.py
index 93f54e0..4ecb18f 100644
--- a/app.py
+++ b/app.py
@@ -4,18 +4,68 @@
import threading
import time
import socket
+import mimetypes
+import logging
+import warnings
from fastapi import FastAPI
from fastapi.staticfiles import StaticFiles
+from starlette.responses import JSONResponse
# Import config and router
import config
from functions.routes import router
from functions.openai_api import openai_api_router
+# Suppress noisy connection reset errors (benign - occur when browser cancels requests). NOTE: this hides every asyncio log record below CRITICAL, not only connection resets.
+logging.getLogger("asyncio").setLevel(logging.CRITICAL)
+warnings.filterwarnings("ignore", message=".*Connection reset.*")
+
+# Configure MIME types for JavaScript modules
+mimetypes.add_type('application/javascript', '.mjs')
+mimetypes.add_type('application/javascript', '.js')
+
+# Custom StaticFiles with better error handling and CORS support
+class AudioStaticFiles(StaticFiles):  # StaticFiles subclass: adds CORS headers to responses and maps a truncated-file RuntimeError to a 503
+ async def __call__(self, scope, receive, send):  # ASGI entry point; `send` is wrapped so response headers can be extended
+ response_started = False  # becomes True once an "http.response.start" message passes through the wrapper
+
+ async def send_with_cors(message):
+ nonlocal response_started
+ if message["type"] == "http.response.start":
+ response_started = True
+ # Append permissive CORS headers: any origin, GET/OPTIONS, any request header, private-network access
+ headers = list(message.get("headers", []))
+ headers.append((b"access-control-allow-origin", b"*"))
+ headers.append((b"access-control-allow-methods", b"GET, OPTIONS"))
+ headers.append((b"access-control-allow-headers", b"*"))
+ headers.append((b"access-control-allow-private-network", b"true"))
+ message["headers"] = headers
+ await send(message)  # every message is forwarded, whether or not it was modified
+
+ try:
+ await super().__call__(scope, receive, send_with_cors)
+ except RuntimeError as e:
+ error_msg = str(e)
+ if "Response content shorter than Content-Length" in error_msg:
+ # Treated here as a file that was read while it was still being written
+ if not response_started:
+ # Headers have not been sent yet, so a 503 asking the client to retry can still be returned
+ response = JSONResponse(
+ status_code=503,
+ content={"detail": "Audio file still being generated, please retry"}
+ )
+ await response(scope, receive, send_with_cors)
+ else:
+ # Response already started, so no error response can be sent; the error is dropped silently (nothing is logged here)
+ pass
+ else:
+ raise  # any other RuntimeError propagates unchanged
+
# --- FastAPI Setup ---
app = FastAPI()
-app.mount("/static", StaticFiles(directory=config.STATIC_DIR), name="static")
+app.mount("/static", AudioStaticFiles(directory=config.STATIC_DIR), name="static")
+app.mount("/audio_cache", AudioStaticFiles(directory=config.AUDIO_CACHE_DIR), name="audio_cache")
app.include_router(router)
app.include_router(openai_api_router)
diff --git a/browser-extension/.gitignore b/browser-extension/.gitignore
new file mode 100644
index 0000000..1406aa6
--- /dev/null
+++ b/browser-extension/.gitignore
@@ -0,0 +1,31 @@
+# Node modules
+node_modules/
+npm-debug.log
+package-lock.json
+
+# Build artifacts
+dist/
+*.zip
+*.xpi
+*.crx
+
+# Generated icons (if regenerated)
+icons/*.png
+
+# Development files
+.web-ext-config.js
+web-ext-artifacts/
+
+# OS files
+.DS_Store
+Thumbs.db
+desktop.ini
+
+# IDE
+.vscode/
+.idea/
+*.swp
+*.swo
+
+# Logs
+*.log
diff --git a/browser-extension/INSTALL.md b/browser-extension/INSTALL.md
new file mode 100644
index 0000000..230abba
--- /dev/null
+++ b/browser-extension/INSTALL.md
@@ -0,0 +1,141 @@
+# OpenWebTTS Browser Extension - Installation Guide
+
+## Quick Start
+
+### 1. Ensure Backend is Running
+
+```bash
+cd OpenWebTTS
+python app.py
+```
+
+Verify it's running by visiting: http://localhost:8000
+
+### 2. Install Extension
+
+#### For Chrome/Edge/Brave:
+
+1. Open browser and go to extensions page:
+ - Chrome: `chrome://extensions/`
+ - Edge: `edge://extensions/`
+ - Brave: `brave://extensions/`
+
+2. **Enable "Developer mode"** (toggle in top-right)
+
+3. Click **"Load unpacked"**
+
+4. Navigate to and select: `OpenWebTTS/browser-extension/`
+
+5. Extension icon appears in toolbar! 🎉
+
+#### For Firefox:
+
+1. Open: `about:debugging#/runtime/this-firefox`
+
+2. Click **"Load Temporary Add-on..."**
+
+3. Navigate to: `OpenWebTTS/browser-extension/`
+
+4. Select file: `manifest_firefox.json`
+
+5. Extension loaded! (temporary until browser restart)
+
+### 3. Generate Icons (Optional)
+
+The extension includes SVG icons which work in modern browsers. To create PNG versions:
+
+```bash
+cd browser-extension
+python generate_icons.py
+```
+
+This will create `icon16.png`, `icon48.png`, and `icon128.png` in the `icons/` folder.
+
+**Requirements:** `pip install pillow`
+
+### 4. Test the Extension
+
+1. **Click extension icon** in toolbar
+2. Should show: "Connected to backend" (green)
+3. Navigate to any webpage with text
+4. Click **"📖 Read Page"**
+5. Watch as text is highlighted and read aloud! 🔊
+
+## Troubleshooting
+
+### ❌ "Cannot reach backend"
+
+**Fix:**
+```bash
+# Make sure backend is running
+cd OpenWebTTS
+python app.py
+
+# Should see: Uvicorn running on http://127.0.0.1:8000
+```
+
+### ❌ Icons not showing
+
+**Options:**
+1. Use SVG icons (already included, work in modern browsers)
+2. Generate PNG: `python generate_icons.py`
+3. Download PNG icons from a converter service
+
+### ❌ Extension not loading
+
+**Chrome:**
+- Check for manifest errors in extensions page
+- Make sure you selected the `browser-extension` folder (not a file)
+
+**Firefox:**
+- Make sure you selected `manifest_firefox.json` (not `manifest.json`)
+- For permanent install, see README for signing instructions
+
+### ❌ No audio playing
+
+**Fix:**
+1. Click somewhere on the page first (browser autoplay policy)
+2. Check backend console for errors
+3. Verify voice/TTS engine is working:
+ ```bash
+ curl -X POST http://localhost:8000/api/generate_speech \
+ -H "Content-Type: application/json" \
+ -d '{"text": "Hello world", "voice": "piper"}'
+ ```
+
+## Features to Try
+
+1. **Read Selection**
+ - Select any text on a page
+ - Click "📝 Read Selection"
+
+2. **Adjust Speed**
+ - Move speed slider (0.5x to 2.0x)
+ - Click "Save Settings"
+
+3. **Change Highlight Color**
+ - Try yellow, green, blue, pink, or orange
+ - Watch how highlighting changes
+
+4. **Auto-scroll**
+ - Enable/disable auto-scroll
+ - Page follows along as it reads
+
+5. **Word Highlighting**
+ - Toggle word-by-word option
+ - See individual words light up as spoken
+
+## Next Steps
+
+See full [README.md](README.md) for:
+- Complete feature list
+- API documentation
+- Development guide
+- Contributing guidelines
+
+---
+
+**Need Help?**
+- Check main OpenWebTTS documentation
+- Open an issue on GitHub
+- Review browser console for errors (`F12` → Console tab)
diff --git a/browser-extension/QUICK_START.md b/browser-extension/QUICK_START.md
new file mode 100644
index 0000000..ab85571
--- /dev/null
+++ b/browser-extension/QUICK_START.md
@@ -0,0 +1,190 @@
+# 🚀 Quick Start Guide - Browser Extension
+
+Get your OpenWebTTS browser extension up and running in 5 minutes!
+
+## Step 1: Start the Backend (30 seconds)
+
+```bash
+cd OpenWebTTS
+python app.py
+```
+
+**Expected output:**
+```
+✅ Piper configured
+✅ Kokoro configured
+INFO: Started server process
+INFO: Uvicorn running on http://127.0.0.1:8000
+```
+
+**Verify:** Open http://localhost:8000 in browser - should see OpenWebTTS interface
+
+## Step 2: Install Extension (2 minutes)
+
+### For Chrome/Edge/Brave:
+
+1. **Open extensions page:**
+ - Type in address bar: `chrome://extensions/`
+ - Or: Menu → Extensions → Manage Extensions
+
+2. **Enable Developer Mode:**
+ - Toggle switch in top-right corner
+
+3. **Load extension:**
+ - Click "Load unpacked"
+ - Navigate to: `OpenWebTTS/browser-extension/`
+ - Click "Select Folder"
+
+4. **Pin to toolbar (optional):**
+ - Click puzzle icon in toolbar
+ - Click pin icon next to OpenWebTTS
+
+### For Firefox:
+
+1. **Open debugging page:**
+ - Type in address bar: `about:debugging#/runtime/this-firefox`
+
+2. **Load extension:**
+ - Click "Load Temporary Add-on..."
+ - Navigate to: `OpenWebTTS/browser-extension/`
+ - Select file: `manifest_firefox.json`
+ - Click "Open"
+
+**Note:** Firefox temporary extensions are removed on restart. For permanent install, see [README.md](README.md).
+
+## Step 3: Test Extension (2 minutes)
+
+1. **Open test page:**
+ - File → Open File (or Ctrl+O)
+ - Navigate to: `OpenWebTTS/browser-extension/test-page.html`
+ - Or just drag the file into your browser
+
+2. **Click extension icon** in toolbar
+ - Should see popup with settings
+
+3. **Verify connection:**
+ - Status should show: "🟢 Connected to backend"
+ - If not, click "Test Connection"
+
+4. **Start reading:**
+ - Click "📖 Read Page" button
+ - Audio should start playing
+ - Text should highlight as it reads
+
+5. **Test controls:**
+ - Watch highlighting follow along
+ - Try "⏹️ Stop" button
+ - Select some text and try "📝 Read Selection"
+
+## Step 4: Try on Real Websites (30 seconds)
+
+1. **Navigate to any website:**
+ - Wikipedia: https://en.wikipedia.org/wiki/Artificial_intelligence
+ - News site: https://news.ycombinator.com
+ - Blog: https://blog.google/
+ - Any webpage with text!
+
+2. **Click extension icon**
+
+3. **Click "📖 Read Page"**
+
+4. **Enjoy hands-free reading!** 🎉
+
+## Common Quick Fixes
+
+### ❌ "Cannot reach backend"
+**Fix:** Make sure backend is running
+```bash
+cd OpenWebTTS
+python app.py
+```
+
+### ❌ Extension icon not showing
+**Fix:** Pin it to toolbar
+- Click puzzle icon in toolbar
+- Find "OpenWebTTS - Text to Speech Reader"
+- Click pin icon
+
+### ❌ No audio playing
+**Fix:** Click somewhere on the page first (browser autoplay policy)
+
+### ❌ Icons showing as puzzle pieces
+**Fix:** Generate PNG icons:
+```bash
+cd browser-extension
+python generate_icons.py
+# OR just use the SVG icons - they work fine!
+```
+
+## Customize Settings
+
+Click extension icon to access settings:
+
+- **Voice:** Choose Piper (fast), Kokoro (quality), Coqui (cloning), or OpenAI (cloud)
+- **Speed:** Drag slider from 0.5x (slow) to 2.0x (fast)
+- **Colors:** Pick yellow, green, blue, pink, or orange highlights
+- **Auto-scroll:** Toggle on/off to follow reading position
+- **Word Highlight:** Toggle word-by-word highlighting effect
+
+Click "Save Settings" to persist your preferences!
+
+## What's Next?
+
+### Explore Features:
+- Read long articles hands-free while doing other tasks
+- Use selection reading to focus on specific paragraphs
+- Try different voices to find your favorite
+- Experiment with reading speeds for different content types
+- Use auto-scroll for comfortable reading experience
+
+### Advanced Usage:
+- Read research papers and documentation
+- Listen while exercising or commuting
+- Help with proofreading by hearing your own writing
+- Accessibility aid for visual impairments or dyslexia
+- Language learning tool (pronunciation)
+
+### Get Help:
+- 📖 Full documentation: [README.md](README.md)
+- 🔧 Installation guide: [INSTALL.md](INSTALL.md)
+- 💻 Developer guide: [DEVELOPMENT.md](DEVELOPMENT.md)
+- ✅ Verification: [VERIFICATION_CHECKLIST.md](VERIFICATION_CHECKLIST.md)
+
+## Showcase Example
+
+**Try this now:**
+1. Go to: https://en.wikipedia.org/wiki/OpenAI
+2. Click extension icon
+3. Click "Read Page"
+4. Watch the magic happen! ✨
+
+The extension will:
+- Extract all readable content
+- Generate speech in real-time
+- Highlight sentences and words
+- Scroll to keep position visible
+- Show progress and time remaining
+
+## Share & Contribute
+
+Love the extension? Consider:
+- ⭐ Star the repo on GitHub
+- 🐛 Report bugs or request features
+- 📝 Improve documentation
+- 💻 Contribute code improvements
+- 📢 Share with others who might benefit
+
+## Support
+
+Need help? Check:
+1. [INSTALL.md](INSTALL.md) - Detailed installation instructions
+2. [README.md](README.md) - Complete feature documentation
+3. [Troubleshooting section](README.md#troubleshooting) - Common issues
+4. Browser console (F12) - Error messages
+5. GitHub Issues - Report problems
+
+---
+
+**You're all set! Happy reading (or listening)! 🎧📚**
+
+The OpenWebTTS extension transforms any webpage into an audiobook with synchronized highlighting. Enjoy hands-free web browsing with complete privacy and control.
diff --git a/browser-extension/README.md b/browser-extension/README.md
new file mode 100644
index 0000000..e0229a3
--- /dev/null
+++ b/browser-extension/README.md
@@ -0,0 +1,448 @@
+# OpenWebTTS Browser Extension
+
+🔊 A powerful browser extension that reads any webpage aloud with word-by-word highlighting, powered by your local OpenWebTTS backend.
+
+## ✨ New in v1.1.0 - Enhanced UI/UX
+
+🎯 **Floating Paragraph Buttons** - Hover over paragraphs to reveal play buttons
+🎛️ **Draggable Floating Menu** - Full-featured control panel you can move anywhere
+⌨️ **Keyboard Shortcuts** - Quick access with Ctrl+Shift combinations
+🖱️ **Context Menu Integration** - Right-click selected text to read it
+🎯 **Ctrl+Click Reading** - Click any paragraph while holding Ctrl to start from there
+🎨 **Professional UI** - Glass morphism effects, smooth animations, gradient themes
+♿ **Accessibility** - High contrast mode, reduced motion support, keyboard navigation
+
+## Features
+
+### Reading Features
+✨ **Read Any Webpage** - Convert any web content to speech with a single click
+📝 **Smart Text Selection** - Read selected paragraphs or the entire page
+🎯 **Word-by-Word Highlighting** - Follow along with synchronized highlighting
+🎯 **Click to Read** - Ctrl+Click on any paragraph to start reading from there
+📍 **Paragraph Buttons** - Floating play buttons appear on hover for instant reading
+
+### UI/UX Features
+🎛️ **Floating Menu** - Draggable control panel with live progress tracking
+⌨️ **Keyboard Shortcuts** - Fast control with Ctrl+Shift+R/S/X/M
+🖱️ **Context Menu** - Right-click "Read this text" option
+🎨 **Modern Design** - Purple gradient theme with glass morphism
+📊 **Progress Tracking** - Visual progress bar with chunk counter
+
+### Customization
+🎨 **Customizable Colors** - Choose from 5 highlight color schemes (Yellow, Green, Blue, Pink, Orange)
+⚡ **Adjustable Speed** - Control reading speed from 0.5x to 2.0x
+🚀 **Auto-scroll** - Automatically scroll to keep reading position visible
+🎭 **Multiple Voices** - Support for Piper, Kokoro, Coqui, and OpenAI TTS
+
+### Privacy & Performance
+🔐 **Privacy First** - All processing done locally on your backend (port 8000)
+⚡ **Smart Chunking** - Optimized 50-word chunks for better synchronization
+💾 **Audio Caching** - Cached audio for improved performance
+
+## Installation
+
+### Prerequisites
+
+1. **OpenWebTTS Backend Running**
+ - Make sure your OpenWebTTS backend is running on `http://localhost:8000`
+ - Start it with: `python app.py`
+
+### Chrome Installation
+
+1. Open Chrome and navigate to `chrome://extensions/`
+2. Enable "Developer mode" (toggle in top-right corner)
+3. Click "Load unpacked"
+4. Navigate to and select the `browser-extension` folder
+5. The extension icon will appear in your toolbar!
+
+### Firefox Installation
+
+1. Open Firefox and navigate to `about:debugging#/runtime/this-firefox`
+2. Click "Load Temporary Add-on..."
+3. Navigate to the `browser-extension` folder
+4. Select the `manifest_firefox.json` file
+5. The extension will be loaded temporarily
+ - **Note:** For permanent installation, you need to sign the extension
+
+#### Making Firefox Extension Permanent
+
+To use the extension permanently in Firefox:
+
+1. **Package the extension:**
+ ```bash
+ cd browser-extension
+ # Rename manifest for Firefox
+ mv manifest.json manifest_chrome.json
+ mv manifest_firefox.json manifest.json
+ # Create zip file
+ zip -r openwebtts-extension.zip * -x "*.md" -x "manifest_chrome.json"
+ ```
+
+2. **Sign on Firefox Add-ons:**
+ - Go to [https://addons.mozilla.org/developers/](https://addons.mozilla.org/developers/)
+ - Create an account and submit for signing
+ - OR use web-ext for self-distribution
+
+3. **Or use web-ext for development:**
+ ```bash
+ npm install -g web-ext
+ cd browser-extension
+ web-ext run --firefox-profile=your-profile
+ ```
+
+## Usage
+
+### Quick Start
+
+#### Method 1: Floating Paragraph Buttons (NEW! ⭐)
+1. Navigate to any webpage
+2. Hover your mouse over any paragraph
+3. A purple play button (▶) appears on the left
+4. Click the button to start reading from that paragraph
+
+#### Method 2: Ctrl+Click (NEW! 🎯)
+1. Hold down `Ctrl` (or `Cmd` on Mac)
+2. Notice the blue indicator at the top of the page
+3. Click on any paragraph
+4. Reading starts from that paragraph to the end
+
+#### Method 3: Context Menu (NEW! 🖱️)
+1. Select any text on the page
+2. Right-click on the selection
+3. Choose "🔊 Read this text"
+4. Selected text is read aloud
+
+#### Method 4: Floating Menu (NEW! 🎛️)
+1. Look for the floating purple menu on the right side
+2. Use the control buttons:
+ - **📖 Read Page** - Read entire page
+ - **📝 Read Selection** - Read selected text
+ - **⏹️ Stop** - Stop reading
+3. Drag the menu header to reposition it anywhere
+4. Track progress with the visual progress bar
+
+#### Method 5: Keyboard Shortcuts (NEW! ⌨️)
+- `Ctrl+Shift+R` - Read entire page
+- `Ctrl+Shift+S` - Read selected text
+- `Ctrl+Shift+X` - Stop reading
+- `Ctrl+Shift+M` - Toggle floating menu visibility
+
+### Configuration
+
+#### Backend URL
+- Default: `http://localhost:8000` (changed from 5000)
+- Change if your backend runs on different host/port
+- Click "Test Connection" to verify connectivity
+
+#### Voice Selection
+- **Piper** - Fast, high-quality neural TTS (default)
+- **Kokoro** - Alternative high-quality voice
+- **Coqui** - Open-source TTS engine
+- **OpenAI** - Cloud-based (requires API key)
+
+#### Reading Settings
+- **Speed:** 0.5x to 2.0x playback speed
+- **Chunk Size:** Default 50 words (optimized for better sync)
+- **Auto-scroll:** Automatically scroll to current reading position
+- **Word Highlight:** Show word-by-word highlighting
+
+#### Highlight Colors
+- Yellow (default)
+- Green
+- Blue
+- Pink
+- Orange
+
+### Advanced Features
+
+#### Progress Tracking
+- Visual progress bar shows completion percentage
+- Chunk counter displays "Chunk X/Y"
+- Estimated time remaining
+- Real-time status updates
+
+#### Smart Text Extraction
+- Automatically filters navigation, headers, footers
+- Focuses on main content
+- Skips hidden and invisible elements
+- Respects semantic HTML structure
+
+#### Audio Caching
+- Caches generated audio for better performance
+- Reduces backend calls for repeated text
+- Automatically cleans up on stop
+
+### Keyboard Shortcuts
+
+The extension includes these built-in shortcuts:
+
+| Shortcut | Action | Description |
+|----------|--------|-------------|
+| `Ctrl+Shift+R` | Read Page | Start reading entire page |
+| `Ctrl+Shift+S` | Read Selection | Read currently selected text |
+| `Ctrl+Shift+X` | Stop | Stop reading immediately |
+| `Ctrl+Shift+M` | Toggle Menu | Show/hide floating menu |
+| `Ctrl+Click` | Read from Here | Start reading from clicked paragraph |
+
+**Customizing Shortcuts:**
+
+**Chrome/Edge:**
+1. Go to `chrome://extensions/shortcuts`
+2. Find "OpenWebTTS"
+3. Modify the default shortcuts if desired
+
+**Firefox:**
+1. Go to `about:addons`
+2. Click gear icon → "Manage Extension Shortcuts"
+3. Customize the OpenWebTTS shortcuts
+
+## How It Works
+
+### Architecture
+
+```
+┌─────────────┐
+│ Web Page │
+│ (Content) │
+└──────┬──────┘
+ │
+ │ Extract Text
+ ▼
+┌─────────────────┐
+│ Content Script │
+│ - Text Extract │
+│ - Highlighting │
+│ - Audio Playback│
+└──────┬──────────┘
+ │
+ │ TTS Request
+ ▼
+┌─────────────────┐
+│ OpenWebTTS │
+│ Backend │
+│ (localhost:8000)│
+└──────┬──────────┘
+ │
+ │ Audio Stream
+ ▼
+┌─────────────────┐
+│ Browser Audio │
+│ (Playback) │
+└─────────────────┘
+```
+
+### Text Extraction
+
+The content script intelligently extracts readable text:
+- Identifies paragraphs, headings, lists, and blockquotes
+- Ignores scripts, styles, hidden elements
+- Maintains document structure
+- Filters out navigation and UI elements
+
+### Highlighting System
+
+**Two-tier highlighting:**
+1. **Chunk/Sentence Level (25% opacity)** - Shows current sentence being read
+2. **Word Level (65% opacity)** - Highlights the specific word being spoken
+
+**Timing:**
+- Calculates word position based on audio progress
+- Updates highlighting in real-time (using `timeupdate` events)
+- Smooth transitions between words
+
+### API Communication
+
+**Endpoint:** `POST /api/generate_speech`
+
+**Request:**
+```json
+{
+ "text": "Text to convert to speech",
+ "voice": "piper",
+ "speed": 1.0
+}
+```
+
+**Response:** Audio file (WAV/MP3) as blob
+
+## Troubleshooting
+
+### "Cannot reach backend" Error
+
+**Causes:**
+- Backend not running
+- Wrong URL configured
+- CORS issues
+
+**Solutions:**
+1. Start backend: `python app.py`
+2. Verify URL in extension settings
+3. Test connection with "Test Connection" button
+4. Check browser console for CORS errors
+
+### No Text Being Read
+
+**Causes:**
+- Page has no readable content
+- Content is in iframes or shadow DOM
+- JavaScript-rendered content not loaded
+
+**Solutions:**
+1. Wait for page to fully load
+2. Try selecting specific text manually
+3. Check if page has readable paragraphs
+
+### Highlighting Not Working
+
+**Causes:**
+- Word highlight disabled in settings
+- Page CSS conflicts
+- Dynamic content changes
+
+**Solutions:**
+1. Enable "Word-by-word highlighting" in settings
+2. Try different highlight colors
+3. Refresh the page and try again
+
+### Audio Not Playing
+
+**Causes:**
+- Browser autoplay policy
+- Audio format not supported
+- Backend audio generation failed
+
+**Solutions:**
+1. Click somewhere on the page first (user interaction required)
+2. Check backend logs for errors
+3. Try different voice/TTS engine
+
+## Development
+
+### Project Structure
+
+```
+browser-extension/
+├── manifest.json # Chrome manifest
+├── manifest_firefox.json # Firefox manifest
+├── popup/
+│ ├── popup.html # Extension popup UI
+│ ├── popup.css # Popup styles
+│ └── popup.js # Popup logic
+├── content/
+│ ├── content.js # Content script (runs on pages)
+│ └── content.css # Highlighting styles
+├── background/
+│ └── background.js # Background service worker
+├── icons/
+│ ├── icon16.svg # 16x16 icon
+│ ├── icon48.svg # 48x48 icon
+│ └── icon128.svg # 128x128 icon
+└── README.md # This file
+```
+
+### Building Icons (Optional)
+
+The extension includes SVG icons. To convert to PNG:
+
+```bash
+# Install ImageMagick or use online converter
+convert icon128.svg -resize 128x128 icon128.png
+convert icon48.svg -resize 48x48 icon48.png
+convert icon16.svg -resize 16x16 icon16.png
+```
+
+Or use provided SVGs directly (modern browsers support this).
+
+### Modifying the Extension
+
+1. **Edit Files**
+ - Modify popup UI: `popup/popup.html`, `popup/popup.css`
+ - Change logic: `popup/popup.js`, `content/content.js`
+ - Update styles: `content/content.css`
+
+2. **Reload Extension**
+ - Chrome: Go to `chrome://extensions/` and click reload icon
+ - Firefox: Click "Reload" in `about:debugging`
+
+3. **Test Changes**
+ - Open popup to test UI changes
+ - Navigate to webpage to test content script
+ - Check browser console for errors
+
+## API Reference
+
+### Content Script Messages
+
+**Start Reading:**
+```javascript
+browser.tabs.sendMessage(tabId, {
+ action: 'startReading',
+ mode: 'page' | 'selection',
+ settings: {...}
+});
+```
+
+**Stop Reading:**
+```javascript
+browser.tabs.sendMessage(tabId, {
+ action: 'stopReading'
+});
+```
+
+### Background Script Messages
+
+**Update Progress:**
+```javascript
+browser.runtime.sendMessage({
+ action: 'updateProgress',
+ current: 5,
+ total: 20,
+ timeRemaining: 45
+});
+```
+
+**Reading Complete:**
+```javascript
+browser.runtime.sendMessage({
+ action: 'readingComplete'
+});
+```
+
+**Reading Error:**
+```javascript
+browser.runtime.sendMessage({
+ action: 'readingError',
+ error: 'Error message'
+});
+```
+
+## Contributing
+
+Contributions welcome! Areas for improvement:
+
+- [ ] Better text extraction for complex layouts
+- [ ] Support for PDF reading in browser
+- [ ] Playback controls (pause, skip, rewind)
+- [ ] Reading history and bookmarks
+- [ ] Custom voice settings per website
+- [ ] Keyboard shortcuts for controls
+- [ ] Dictionary/pronunciation customization
+- [ ] Multi-language support UI
+
+## License
+
+MIT License - Same as OpenWebTTS main project
+
+## Credits
+
+Part of the [OpenWebTTS](https://github.com/Gyyyn/OpenWebTTS) project.
+
+## Support
+
+- **Issues:** Report bugs on GitHub
+- **Docs:** See main OpenWebTTS documentation
+- **Backend:** Ensure OpenWebTTS backend is properly configured
+
+---
+
+**Made with ❤️ for accessible web reading**
diff --git a/browser-extension/TESTING_GUIDE.md b/browser-extension/TESTING_GUIDE.md
new file mode 100644
index 0000000..9aa7899
--- /dev/null
+++ b/browser-extension/TESTING_GUIDE.md
@@ -0,0 +1,497 @@
+# 🧪 OpenWebTTS Enhanced Extension - Testing Guide
+
+## ✅ Pre-Testing Checklist
+
+Before testing the extension, make sure:
+- [ ] Backend server is running on `http://localhost:8000`
+- [ ] Extension is loaded in browser (Chrome or Firefox)
+- [ ] Test page or real website is open
+- [ ] Extension icon appears in browser toolbar
+
+## 🔧 Backend Setup
+
+### Start the Backend
+```powershell
+# Navigate to project directory (adjust the path to wherever you cloned the repo)
+cd d:\tts\OpenWebTTS
+
+# Start the backend server
+python app.py
+```
+
+Expected output:
+```
+INFO: Started server process
+INFO: Uvicorn running on http://localhost:8000
+```
+
+### Test Backend Health
+```powershell
+# In another terminal
+curl http://localhost:8000/api/health
+```
+
+Expected: `{"status": "healthy", "timestamp": "..."}`
+
+## 🔄 Load Extension
+
+### For Chrome/Edge
+1. Open `chrome://extensions/`
+2. Enable "Developer mode" (toggle in top right)
+3. Click "Load unpacked"
+4. Select folder: `d:\tts\OpenWebTTS\browser-extension`
+5. Extension should appear with purple icon
+
+### For Firefox
+1. Open `about:debugging#/runtime/this-firefox`
+2. Click "Load Temporary Add-on"
+3. Select: `d:\tts\OpenWebTTS\browser-extension\manifest_firefox.json`
+4. Extension should appear
+
+## 📝 Test Plan
+
+### Test 1: Floating Paragraph Buttons ⭐
+
+**Steps:**
+1. Open test page: `d:\tts\OpenWebTTS\browser-extension\test-page.html`
+2. Hover mouse over any paragraph
+3. Look for purple play button appearing on the left
+
+**Expected Result:**
+- ✅ Button appears with smooth fade-in animation
+- ✅ Button is purple with white play icon (▶)
+- ✅ Button position: about 40px to the left of paragraph
+- ✅ Button scales up on hover
+
+**Test clicking button:**
+4. Click the play button on first paragraph
+
+**Expected Result:**
+- ✅ Reading starts immediately
+- ✅ Paragraph highlights with yellow background
+- ✅ Words highlight individually as they're spoken
+- ✅ Auto-scrolls if paragraph is off-screen
+
+**Status:** ⬜ Pass | ⬜ Fail | ⬜ Partial
+
+---
+
+### Test 2: Floating Menu 🎛️
+
+**Steps:**
+1. Look for floating menu on right side of page
+2. Should appear automatically (purple gradient box)
+
+**Expected Result:**
+- ✅ Menu visible on right side
+- ✅ Purple gradient background
+- ✅ Shows "Ready to read" status
+- ✅ Has 3 buttons: "Read Page", "Read Selection", "Stop"
+- ✅ Has close button (×) in top-right
+
+**Test dragging:**
+3. Click and hold on menu header ("🔊 OpenWebTTS")
+4. Drag menu to different position
+5. Release mouse
+
+**Expected Result:**
+- ✅ Menu follows cursor smoothly
+- ✅ Position updates in real-time
+- ✅ Menu stays in new position
+
+**Test buttons:**
+6. Click "Read Page" button
+
+**Expected Result:**
+- ✅ Reading starts
+- ✅ Status changes to "Reading X chunks..."
+- ✅ Progress bar appears and updates
+- ✅ "Stop" button becomes visible
+- ✅ Chunk counter updates (e.g., "Chunk 3/25")
+
+7. Click "Stop" button
+
+**Expected Result:**
+- ✅ Reading stops immediately
+- ✅ Highlights disappear
+- ✅ Progress bar resets
+- ✅ Status returns to "Ready to read"
+
+**Status:** ⬜ Pass | ⬜ Fail | ⬜ Partial
+
+---
+
+### Test 3: Keyboard Shortcuts ⌨️
+
+**Test Shortcut 1: Read Page**
+1. Refresh the page
+2. Press `Ctrl+Shift+R`
+
+**Expected Result:**
+- ✅ Reading starts immediately
+- ✅ Same behavior as clicking "Read Page" button
+
+**Test Shortcut 2: Stop Reading**
+3. While reading, press `Ctrl+Shift+X`
+
+**Expected Result:**
+- ✅ Reading stops immediately
+- ✅ Highlights clear
+
+**Test Shortcut 3: Read Selection**
+4. Select some text with mouse
+5. Press `Ctrl+Shift+S`
+
+**Expected Result:**
+- ✅ Only selected text is read
+- ✅ Highlights only appear on selected text
+
+**Test Shortcut 4: Toggle Menu**
+6. Press `Ctrl+Shift+M`
+
+**Expected Result:**
+- ✅ Menu disappears (if visible)
+
+7. Press `Ctrl+Shift+M` again
+
+**Expected Result:**
+- ✅ Menu reappears in same position
+
+**Status:** ⬜ Pass | ⬜ Fail | ⬜ Partial
+
+---
+
+### Test 4: Context Menu 🖱️
+
+**Steps:**
+1. Select a paragraph or sentence with mouse
+2. Right-click on the selected text
+3. Look for menu item "🔊 Read this text"
+
+**Expected Result:**
+- ✅ Menu item appears in context menu
+- ✅ Has speaker icon (🔊)
+
+4. Click "🔊 Read this text"
+
+**Expected Result:**
+- ✅ Reading starts immediately
+- ✅ Only selected text is read
+- ✅ Highlighting appears on selected text
+
+**Test without selection:**
+5. Click anywhere to deselect text
+6. Right-click
+
+**Expected Result:**
+- ✅ "Read this text" option should NOT appear (or be grayed out)
+
+**Status:** ⬜ Pass | ⬜ Fail | ⬜ Partial
+
+---
+
+### Test 5: Ctrl+Click Reading 🎯
+
+**Test Ctrl indicator:**
+1. Stop any ongoing reading
+2. Press and hold `Ctrl` key
+
+**Expected Result:**
+- ✅ Blue indicator appears at top-center of page
+- ✅ Shows message: "Ctrl held - Click text to start reading from there"
+
+3. Release `Ctrl` key
+
+**Expected Result:**
+- ✅ Indicator disappears
+
+**Test Ctrl+Click:**
+4. Hold `Ctrl` key
+5. Click on any paragraph in the middle of the page
+6. Release `Ctrl`
+
+**Expected Result:**
+- ✅ Reading starts from clicked paragraph
+- ✅ Continues to end of page
+- ✅ Skips paragraphs before clicked one
+
+**Status:** ⬜ Pass | ⬜ Fail | ⬜ Partial
+
+---
+
+### Test 6: Visual Highlighting 🎨
+
+**Test chunk highlighting:**
+1. Start reading (any method)
+2. Observe paragraph currently being read
+
+**Expected Result:**
+- ✅ Paragraph has yellow background (25% opacity)
+- ✅ Has subtle glow/shadow around edges
+- ✅ Background color is semi-transparent
+- ✅ Text remains fully readable
+
+**Test word highlighting:**
+3. Watch individual words within paragraph
+
+**Expected Result:**
+- ✅ Each word highlights individually (65% opacity)
+- ✅ Word highlighting is brighter than paragraph
+- ✅ Word highlight moves smoothly across text
+- ✅ Only one word highlighted at a time
+- ✅ Slight scale effect on current word
+
+**Test color change** (via popup):
+4. Click extension icon in toolbar
+5. Change "Highlight Color" to "Green"
+6. Start reading
+
+**Expected Result:**
+- ✅ Highlights now appear in green
+- ✅ Both chunk and word highlights use new color
+
+**Status:** ⬜ Pass | ⬜ Fail | ⬜ Partial
+
+---
+
+### Test 7: Progress Tracking 📊
+
+**Steps:**
+1. Start reading a page with multiple paragraphs
+2. Watch the floating menu
+
+**Expected Result:**
+- ✅ Progress bar fills from left to right
+- ✅ Percentage increases smoothly
+- ✅ Chunk counter shows: "Chunk X/Y"
+- ✅ Time remaining shows estimate: "MM:SS"
+- ✅ Status text updates
+
+**Test at completion:**
+3. Let reading complete naturally
+
+**Expected Result:**
+- ✅ Progress bar reaches 100%
+- ✅ Counter shows "Chunk Y/Y" (max/max)
+- ✅ Status changes to "Ready to read"
+- ✅ Stop button disappears
+- ✅ Read buttons reappear
+
+**Status:** ⬜ Pass | ⬜ Fail | ⬜ Partial
+
+---
+
+### Test 8: Settings & Configuration ⚙️
+
+**Test popup:**
+1. Click extension icon in toolbar
+2. Popup should open
+
+**Expected Result:**
+- ✅ Popup shows all settings
+- ✅ Backend URL is "http://localhost:8000"
+- ✅ Chunk Size is "50"
+- ✅ Voice dropdown has options (Piper, Kokoro, etc.)
+- ✅ Speed slider (0.5x - 2.0x)
+
+**Test connection:**
+3. Click "Test Connection" button
+
+**Expected Result:**
+- ✅ Status shows "Testing..."
+- ✅ Changes to "✅ Connected" (green)
+- ✅ If backend offline: Shows "❌ Connection failed" (red)
+
+**Test settings save:**
+4. Change chunk size to 100
+5. Change speed to 1.5x
+6. Change highlight color to Blue
+7. Click "Save Settings"
+
+**Expected Result:**
+- ✅ Success message appears
+- ✅ Settings persist after closing popup
+- ✅ Reload page and verify settings still applied
+
+**Status:** ⬜ Pass | ⬜ Fail | ⬜ Partial
+
+---
+
+### Test 9: Error Handling 🚨
+
+**Test backend offline:**
+1. Stop the backend server (Ctrl+C in terminal)
+2. Try to read text with extension
+
+**Expected Result:**
+- ✅ Shows error in floating menu status
+- ✅ Error message: "Backend error: 500" or similar
+- ✅ Doesn't cause extension to crash
+- ✅ Can retry after restarting backend
+
+**Test invalid text:**
+3. Try to read a page with no text content
+
+**Expected Result:**
+- ✅ Shows "No readable content found"
+- ✅ Doesn't crash
+- ✅ Returns to ready state
+
+**Test rapid clicking:**
+4. Start reading, immediately click stop, then start again rapidly
+
+**Expected Result:**
+- ✅ Handles rapid state changes gracefully
+- ✅ No audio overlap
+- ✅ Highlights clear properly
+
+**Status:** ⬜ Pass | ⬜ Fail | ⬜ Partial
+
+---
+
+### Test 10: Cross-Browser Compatibility 🌐
+
+**Test on Chrome:**
+- [ ] All features work
+- [ ] Keyboard shortcuts work
+- [ ] Context menu works
+- [ ] Visual design correct
+
+**Test on Firefox:**
+- [ ] All features work
+- [ ] Keyboard shortcuts work
+- [ ] Context menu works
+- [ ] Visual design correct
+
+**Status:** ⬜ Pass | ⬜ Fail | ⬜ Partial
+
+---
+
+## 📸 Visual Inspection Checklist
+
+### UI/UX Quality
+- [ ] Floating menu has smooth gradient
+- [ ] Buttons have hover effects
+- [ ] Animations are smooth (not janky)
+- [ ] Text is readable
+- [ ] Icons are crisp
+- [ ] No visual glitches
+- [ ] Shadows look professional
+- [ ] Colors are consistent
+- [ ] Responsive to window resize
+
+### Accessibility
+- [ ] Can tab through all controls
+- [ ] Focus indicators visible
+- [ ] Keyboard shortcuts work
+- [ ] Contrast is sufficient
+- [ ] Works with screen readers (if available)
+
+## 🐛 Bug Reporting Template
+
+If you find issues, document them:
+
+```
+**Bug Title**: [Short description]
+
+**Steps to Reproduce**:
+1.
+2.
+3.
+
+**Expected Behavior**:
+[What should happen]
+
+**Actual Behavior**:
+[What actually happens]
+
+**Browser**: Chrome/Firefox [version]
+**OS**: Windows [version]
+**Backend**: Running/Not running
+
+**Screenshots**: [If applicable]
+
+**Console Errors**: [Press F12, check Console tab]
+```
+
+## 📊 Test Summary
+
+Fill out after completing all tests:
+
+| Test | Status | Notes |
+|------|--------|-------|
+| 1. Paragraph Buttons | ⬜ Pass | |
+| 2. Floating Menu | ⬜ Pass | |
+| 3. Keyboard Shortcuts | ⬜ Pass | |
+| 4. Context Menu | ⬜ Pass | |
+| 5. Ctrl+Click | ⬜ Pass | |
+| 6. Highlighting | ⬜ Pass | |
+| 7. Progress Tracking | ⬜ Pass | |
+| 8. Settings | ⬜ Pass | |
+| 9. Error Handling | ⬜ Pass | |
+| 10. Cross-Browser | ⬜ Pass | |
+
+**Overall Status**: ⬜ All Pass | ⬜ Minor Issues | ⬜ Major Issues
+
+**Notes**:
+_[Add any additional observations]_
+
+---
+
+## 🚀 Quick Test Commands
+
+### PowerShell Helper Commands
+
+```powershell
+# Start backend
+cd d:\tts\OpenWebTTS; python app.py
+
+# Test health endpoint
+curl http://localhost:8000/api/health
+
+# Check if backend is running
+Test-NetConnection -ComputerName localhost -Port 8000
+
+# Kill backend if stuck
+Get-Process -Name python | Where-Object {$_.Path -like "*OpenWebTTS*"} | Stop-Process
+```
+
+### Browser Console Tests
+
+Press F12, paste in Console tab:
+
+```javascript
+// Test if content script loaded
+console.log('Testing OpenWebTTS...');
+
+// Check for floating menu
+document.querySelector('.owtts-floating-menu') ?
+ console.log('✅ Menu found') :
+ console.error('❌ Menu not found');
+
+// Check for paragraph buttons
+document.querySelectorAll('.owtts-para-btn').length > 0 ?
+ console.log('✅ Buttons found') :
+ console.warn('⚠️ No buttons found');
+
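+// Force show menu. Run this from the extension's console context: in the plain
+// page context the extension messaging API is usually not exposed.
+var owttsRuntime = (window.browser || window.chrome || {}).runtime;
+owttsRuntime && owttsRuntime.sendMessage ?
+    owttsRuntime.sendMessage({ action: 'toggleMenu' }) :
+    console.warn('⚠️ Extension messaging not available in this console context');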
+```
+
+## 📞 Support
+
+If tests fail:
+1. Check browser console (F12) for errors
+2. Check backend terminal for errors
+3. Verify backend is on port 8000
+4. Try reloading the extension
+5. Try a hard refresh of the page (Ctrl+F5; `Ctrl+Shift+R` may be captured by the extension's "Read Page" shortcut)
+6. Check CORS headers in Network tab
+
+---
+
+**Happy Testing! 🎉**
diff --git a/browser-extension/background/background.js b/browser-extension/background/background.js
new file mode 100644
index 0000000..9c7d863
--- /dev/null
+++ b/browser-extension/background/background.js
@@ -0,0 +1,132 @@
+// Background script for OpenWebTTS browser extension
+// Handles communication, keyboard shortcuts, and context menus
+
+// Pick the extension API handle: `browser` when the host exposes it, otherwise `chrome`.
+// Service workers don't have 'window', so look on 'self' and fall back to the bare global 'chrome'.
+const browser = self.browser || self.chrome || chrome;
+
+console.log('🔊 OpenWebTTS background service initialized');
+
+// Handle extension installation
+browser.runtime.onInstalled.addListener((details) => {
+ console.log('OpenWebTTS extension installed/updated');
+
+ // Create context menu
+ browser.contextMenus.create({
+ id: 'owtts-read-selection',
+ title: '🔊 Read this text',
+ contexts: ['selection']
+ });
+
+ if (details.reason === 'install') {
+ // Set default settings on first install
+ const defaultSettings = {
+ backendUrl: 'http://localhost:8000',
+ voice: 'piper',
+ speed: 1.0,
+ chunkSize: 50,
+ autoScroll: true,
+ wordHighlight: true,
+ highlightColor: 'yellow'
+ };
+
+ browser.storage.local.set({ settings: defaultSettings });
+ console.log('Default settings initialized');
+ }
+});
+
+// Handle context menu clicks
+browser.contextMenus.onClicked.addListener((info, tab) => {
+ if (info.menuItemId === 'owtts-read-selection' && info.selectionText) {
+ console.log('Context menu clicked - reading selection');
+
+ // Send message to content script to read the selection
+ browser.tabs.sendMessage(tab.id, {
+ action: 'startReading',
+ mode: 'selection'
+ });
+ }
+});
+
+// Handle keyboard shortcuts
+browser.commands.onCommand.addListener((command) => {
+ console.log('Keyboard command received:', command);
+
+ browser.tabs.query({active: true, currentWindow: true}, (tabs) => {
+ if (tabs[0]) {
+ // Forward command to content script
+ browser.tabs.sendMessage(tabs[0].id, {
+ command: command
+ });
+ }
+ });
+});
+
+// Handle extension icon click
+browser.action.onClicked.addListener((tab) => {
+ console.log('Extension icon clicked on tab:', tab.id);
+});
+
+// Listen for messages from content scripts and popup
+browser.runtime.onMessage.addListener((message, sender, sendResponse) => {
+ console.log('Background received message:', message);
+
+ // Handle different message types
+ if (message.action === 'getState') {
+ // Return current state
+ browser.storage.local.get('state').then(result => {
+ sendResponse(result.state || {});
+ });
+ return true; // Keep channel open for async response
+ }
+
+ if (message.action === 'setState') {
+ // Save state
+ browser.storage.local.set({ state: message.state }).then(() => {
+ sendResponse({ success: true });
+ });
+ return true;
+ }
+
+ // Forward reading actions to active tab
+ if (message.action === 'startReading' || message.action === 'stopReading' || message.action === 'toggleMenu') {
+ browser.tabs.query({active: true, currentWindow: true}, (tabs) => {
+ if (tabs[0]) {
+ browser.tabs.sendMessage(tabs[0].id, message, (response) => {
+ sendResponse(response || {success: true});
+ });
+ }
+ });
+ return true;
+ }
+
+ // Forward progress messages to popup
+ if (message.action === 'updateProgress' ||
+ message.action === 'readingComplete' ||
+ message.action === 'readingError') {
+ browser.runtime.sendMessage(message);
+ }
+
+ // Health check
+ if (message.action === 'ping') {
+ sendResponse({pong: true});
+ }
+});
+
+// Log when an extension update becomes available (no cleanup is performed here)
+browser.runtime.onUpdateAvailable.addListener((details) => {
+ console.log('Extension update available:', details);
+});
+
+// Periodic alarm intended to keep the service worker alive (Chrome specific)
+if (typeof chrome !== 'undefined' && chrome.alarms) {
+ // Create a keepalive alarm that fires once per minute
+ chrome.alarms.create('keepalive', { periodInMinutes: 1 });
+
+ chrome.alarms.onAlarm.addListener((alarm) => {
+ if (alarm.name === 'keepalive') {
+ // The handler only logs. NOTE(review): confirm a one-minute wake-up is enough to keep the worker from being suspended
+ console.log('Service worker keepalive ping');
+ }
+ });
+}
diff --git a/browser-extension/content/content.css b/browser-extension/content/content.css
new file mode 100644
index 0000000..d37d2ce
--- /dev/null
+++ b/browser-extension/content/content.css
@@ -0,0 +1,313 @@
+/*
+ * Content styles for OpenWebTTS browser extension
+ * Applied to web pages for highlighting during reading
+ */
+
+/* Chunk-level highlighting (sentence/paragraph) */
+.owtts-chunk-highlight {
+ background-color: transparent !important;
+ border-radius: 4px !important;
+ padding: 2px 0 !important;
+ transition: background-color 0.3s ease !important;
+}
+
+/* Word-level highlighting */
+.owtts-word-highlight {
+ background-color: transparent !important;
+ border-radius: 3px !important;
+ padding: 1px 2px !important;
+ transition: background-color 0.15s ease !important;
+ box-shadow: 0 1px 3px rgba(0, 0, 0, 0.1) !important;
+ display: inline-block !important;
+ transform: scale(1.02) !important;
+}
+
+/* Word span wrapper */
+.owtts-word {
+ display: inline !important;
+ transition: all 0.15s ease !important;
+}
+
+/* Color variants for chunk highlighting */
+.owtts-chunk-highlight.owtts-color-yellow {
+ background-color: rgba(255, 248, 0, 0.25) !important;
+ box-shadow: 0 0 0 2px rgba(255, 248, 0, 0.15) !important;
+}
+
+.owtts-chunk-highlight.owtts-color-green {
+ background-color: rgba(72, 255, 0, 0.25) !important;
+ box-shadow: 0 0 0 2px rgba(72, 255, 0, 0.15) !important;
+}
+
+.owtts-chunk-highlight.owtts-color-blue {
+ background-color: rgba(0, 196, 255, 0.25) !important;
+ box-shadow: 0 0 0 2px rgba(0, 196, 255, 0.15) !important;
+}
+
+.owtts-chunk-highlight.owtts-color-pink {
+ background-color: rgba(249, 168, 212, 0.35) !important;
+ box-shadow: 0 0 0 2px rgba(249, 168, 212, 0.2) !important;
+}
+
+.owtts-chunk-highlight.owtts-color-orange {
+ background-color: rgba(253, 186, 116, 0.35) !important;
+ box-shadow: 0 0 0 2px rgba(253, 186, 116, 0.2) !important;
+}
+
+/* Color variants for word highlighting */
+.owtts-word-highlight.owtts-word-color-yellow {
+ background-color: rgba(255, 248, 0, 0.65) !important;
+}
+
+.owtts-word-highlight.owtts-word-color-green {
+ background-color: rgba(72, 255, 0, 0.65) !important;
+}
+
+.owtts-word-highlight.owtts-word-color-blue {
+ background-color: rgba(0, 196, 255, 0.65) !important;
+}
+
+.owtts-word-highlight.owtts-word-color-pink {
+ background-color: rgba(249, 168, 212, 0.75) !important;
+}
+
+.owtts-word-highlight.owtts-word-color-orange {
+ background-color: rgba(253, 186, 116, 0.75) !important;
+}
+
+/* Legacy support - keep old color classes for backward compatibility */
+.owtts-word-highlight.owtts-color-yellow {
+ background-color: rgba(255, 248, 0, 0.65) !important;
+}
+
+.owtts-word-highlight.owtts-color-green {
+ background-color: rgba(72, 255, 0, 0.65) !important;
+}
+
+.owtts-word-highlight.owtts-color-blue {
+ background-color: rgba(0, 196, 255, 0.65) !important;
+}
+
+.owtts-word-highlight.owtts-color-pink {
+ background-color: rgba(249, 168, 212, 0.75) !important;
+}
+
+.owtts-word-highlight.owtts-color-orange {
+ background-color: rgba(253, 186, 116, 0.75) !important;
+}
+
+/* Animation for highlights */
+@keyframes owtts-highlight-fade-in {
+ from {
+ opacity: 0;
+ transform: scale(0.98);
+ }
+ to {
+ opacity: 1;
+ transform: scale(1);
+ }
+}
+
+.owtts-chunk-highlight {
+ animation: owtts-highlight-fade-in 0.2s ease-out !important;
+}
+
+.owtts-word-highlight {
+ animation: owtts-highlight-fade-in 0.1s ease-out !important;
+}
+
+/* Ensure highlights work on different backgrounds */
+.owtts-chunk-highlight,
+.owtts-word-highlight {
+ -webkit-text-fill-color: inherit !important;
+ color: inherit !important;
+ text-decoration: inherit !important;
+}
+
+/* Dark mode compatibility */
+@media (prefers-color-scheme: dark) {
+ .owtts-chunk-highlight {
+ box-shadow: 0 0 0 2px rgba(255, 248, 0, 0.25) !important;
+ }
+
+ .owtts-word-highlight {
+ box-shadow: 0 1px 3px rgba(255, 255, 255, 0.1) !important;
+ }
+}
+
+/* Print: hide highlights */
+@media print {
+ .owtts-chunk-highlight,
+ .owtts-word-highlight {
+ background-color: transparent !important;
+ box-shadow: none !important;
+ }
+}
+/* ========== Enhanced UI Elements ========== */
+
+/* Paragraph floating button (single reusable button) */
+.owtts-para-btn {
+ font-family: Arial, sans-serif !important;
+}
+
+.owtts-para-btn:hover {
+ transform: scale(1.1) !important;
+ box-shadow: 0 6px 20px rgba(102, 126, 234, 0.6) !important;
+}
+
+.owtts-para-btn:active {
+ transform: scale(1.0) !important;
+}
+
+/* Ctrl indicator */
+.owtts-ctrl-indicator {
+ font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Oxygen, Ubuntu, Cantarell, sans-serif !important;
+}
+
+/* Floating menu animations */
+@keyframes owtts-slide-in {
+ from {
+ opacity: 0;
+ transform: translateX(20px);
+ }
+ to {
+ opacity: 1;
+ transform: translateX(0);
+ }
+}
+
+.owtts-floating-menu {
+ animation: owtts-slide-in 0.3s ease-out !important;
+}
+
+.owtts-floating-menu * {
+ box-sizing: border-box !important;
+}
+
+/* Custom scrollbar for menu */
+.owtts-floating-menu::-webkit-scrollbar {
+ width: 6px !important;
+}
+
+.owtts-floating-menu::-webkit-scrollbar-track {
+ background: rgba(255, 255, 255, 0.1) !important;
+ border-radius: 3px !important;
+}
+
+.owtts-floating-menu::-webkit-scrollbar-thumb {
+ background: rgba(255, 255, 255, 0.3) !important;
+ border-radius: 3px !important;
+}
+
+.owtts-floating-menu::-webkit-scrollbar-thumb:hover {
+ background: rgba(255, 255, 255, 0.4) !important;
+}
+
+/* Button hover effects */
+.owtts-btn {
+ transition: all 0.2s cubic-bezier(0.4, 0, 0.2, 1) !important;
+}
+
+.owtts-btn-primary:hover {
+ background: rgba(255, 255, 255, 0.95) !important;
+ transform: translateY(-2px) !important;
+ box-shadow: 0 6px 16px rgba(0, 0, 0, 0.15) !important;
+}
+
+.owtts-btn-secondary:hover {
+ background: rgba(255, 255, 255, 0.3) !important;
+ transform: translateY(-2px) !important;
+ box-shadow: 0 4px 12px rgba(0, 0, 0, 0.2) !important;
+}
+
+.owtts-btn-danger:hover {
+ background: #dc2626 !important;
+ transform: translateY(-2px) !important;
+ box-shadow: 0 4px 12px rgba(220, 38, 38, 0.4) !important;
+}
+
+/* kbd element styling - scoped to the floating menu so <kbd> elements of the host page stay untouched */
+.owtts-floating-menu kbd {
+    background: rgba(255, 255, 255, 0.2) !important;
+    padding: 2px 6px !important;
+    border-radius: 3px !important;
+    font-size: 10px !important;
+    font-family: 'Courier New', Courier, monospace !important;
+    border: 1px solid rgba(255, 255, 255, 0.3) !important;
+    box-shadow: 0 1px 2px rgba(0, 0, 0, 0.1) !important;
+}
+
+/* Focus states for accessibility */
+.owtts-btn:focus {
+ outline: 2px solid white !important;
+ outline-offset: 2px !important;
+}
+
+.owtts-para-btn:focus {
+ outline: 2px solid #667eea !important;
+ outline-offset: 2px !important;
+}
+
+/* Smooth transitions for all interactive elements */
+.owtts-btn,
+.owtts-para-btn,
+#owtts-close-menu {
+ will-change: transform !important;
+}
+
+/* Prevent text selection in UI elements */
+.owtts-floating-menu,
+.owtts-para-btn,
+.owtts-ctrl-indicator {
+ user-select: none !important;
+ -webkit-user-select: none !important;
+ -moz-user-select: none !important;
+ -ms-user-select: none !important;
+}
+
+/* Glass morphism effect for menu */
+.owtts-floating-menu::before {
+ content: '' !important;
+ position: absolute !important;
+ inset: 0 !important;
+ background: linear-gradient(135deg, rgba(255, 255, 255, 0.1), rgba(255, 255, 255, 0.05)) !important;
+ border-radius: 16px !important;
+ pointer-events: none !important;
+}
+
+/* Responsive adjustments: smaller menu and paragraph button on narrow viewports */
+@media (max-width: 768px) {
+    .owtts-floating-menu {
+        width: 240px !important;
+        padding: 15px !important;
+    }
+
+    .owtts-para-btn {
+        width: 28px !important;
+        height: 28px !important;
+        /* No `left` here: the fixed button is positioned via inline style, which an !important rule would override and push off-screen */
+    }
+}
+
+/* High contrast mode support */
+@media (prefers-contrast: high) {
+ .owtts-floating-menu {
+ border: 2px solid white !important;
+ }
+
+ .owtts-para-btn {
+ border: 3px solid white !important;
+ }
+}
+
+/* Reduced motion support */
+@media (prefers-reduced-motion: reduce) {
+ .owtts-chunk-highlight,
+ .owtts-word-highlight,
+ .owtts-floating-menu,
+ .owtts-btn,
+ .owtts-para-btn {
+ animation: none !important;
+ transition: none !important;
+ }
+}
\ No newline at end of file
diff --git a/browser-extension/content/content.js b/browser-extension/content/content.js
new file mode 100644
index 0000000..3183c0d
--- /dev/null
+++ b/browser-extension/content/content.js
@@ -0,0 +1,1250 @@
+// Enhanced Content script for OpenWebTTS browser extension
+// Features: Floating buttons, draggable menu, clickable text, improved UI/UX
+
+(function() {
+ 'use strict';
+
+ // Get browser API (works for both Chrome and Firefox)
+ const browser = window.browser || window.chrome;
+
+ // State management: one mutable object shared by the functions in this script
+ const state = {
+ isReading: false, // set to true by readTextNodes when a read starts
+ currentChunkIndex: 0, // reset to 0 at the start of each read
+ chunks: [], // rebuilt by readTextNodes from the TTS timing response
+ audioQueue: [],
+ currentAudio: null,
+ settings: null, // filled from browser.storage.local by createFloatingMenu
+ highlightedElements: [],
+ currentWordElement: null,
+ previousChunkElement: null,
+ audioBlobUrls: [], // Track blob URLs for CSP bypass
+ floatingMenu: null, // the menu element once createFloatingMenu has run
+ isMenuVisible: false,
+ menuPosition: { x: window.innerWidth - 320, y: 100 } // y seeds the menu's initial top; x and y are updated while dragging
+ };
+
+ // Initialize extension
+ console.log('🔊 OpenWebTTS Enhanced Extension loaded');
+
+ // Create the floating menu once: build the element, wire its buttons, then load settings
+ function createFloatingMenu() {
+ if (state.floatingMenu) return; // already built, nothing to do
+
+ const menu = document.createElement('div');
+ menu.id = 'owtts-floating-menu';
+ menu.className = 'owtts-floating-menu';
+ menu.style.cssText = `
+ position: fixed;
+ top: ${state.menuPosition.y}px;
+ right: 20px;
+ width: 280px;
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+ border-radius: 16px;
+ box-shadow: 0 8px 32px rgba(0, 0, 0, 0.3);
+ z-index: 999999;
+ padding: 20px;
+ color: white;
+ font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
+ display: none;
+ backdrop-filter: blur(10px);
+ transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1);
+ `;
+
+ menu.innerHTML = `
+
+
+ `;
+
+ document.body.appendChild(menu); // must be attached before the getElementById lookups below
+ state.floatingMenu = menu;
+
+ // Make draggable
+ makeDraggable(menu);
+
+ // Add event listeners (these ids are expected to exist in the menu markup above)
+ document.getElementById('owtts-close-menu').addEventListener('click', () => {
+ menu.style.display = 'none';
+ state.isMenuVisible = false;
+ });
+
+ document.getElementById('owtts-read-page').addEventListener('click', () => {
+ startReading('page', state.settings);
+ });
+
+ document.getElementById('owtts-read-selection').addEventListener('click', () => {
+ startReading('selection', state.settings);
+ });
+
+ document.getElementById('owtts-stop-reading').addEventListener('click', () => {
+ stopReading();
+ });
+
+ // Load settings; fall back to built-in defaults when nothing has been stored yet
+ browser.storage.local.get('settings').then(result => {
+ state.settings = result.settings || {
+ backendUrl: 'http://localhost:8000',
+ voice: 'piper',
+ speed: 1.0,
+ chunkSize: 50,
+ autoScroll: true,
+ wordHighlight: true,
+ highlightColor: 'yellow',
+ wordHighlightColor: 'yellow'
+ };
+ });
+ }
+
+ // Make element draggable
+ function makeDraggable(element) {
+ const header = element.querySelector('.owtts-menu-header');
+ let isDragging = false;
+ let currentX, currentY, initialX, initialY;
+
+ header.addEventListener('mousedown', (e) => {
+ if (e.target.id === 'owtts-close-menu') return;
+ isDragging = true;
+ initialX = e.clientX - state.menuPosition.x;
+ initialY = e.clientY - state.menuPosition.y;
+ element.style.transition = 'none';
+ });
+
+ document.addEventListener('mousemove', (e) => {
+ if (!isDragging) return;
+ e.preventDefault();
+ currentX = e.clientX - initialX;
+ currentY = e.clientY - initialY;
+ state.menuPosition.x = currentX;
+ state.menuPosition.y = currentY;
+ element.style.right = 'auto';
+ element.style.left = currentX + 'px';
+ element.style.top = currentY + 'px';
+ });
+
+ document.addEventListener('mouseup', () => {
+ if (isDragging) {
+ isDragging = false;
+ element.style.transition = 'all 0.3s cubic-bezier(0.4, 0, 0.2, 1)';
+ }
+ });
+ }
+
+ // Add the paragraph play button: one shared button that is moved next to whichever readable element is hovered
+ function addParagraphButtons() {
+ // Remove existing button if any
+ const existingBtn = document.querySelector('.owtts-para-btn');
+ if (existingBtn) existingBtn.remove();
+
+ // Create a single reusable button
+ const button = document.createElement('button');
+ button.className = 'owtts-para-btn';
+ button.innerHTML = '▶';
+ button.title = 'Click to read from this paragraph';
+ button.style.cssText = `
+ position: fixed;
+ width: 32px;
+ height: 32px;
+ border-radius: 50%;
+ background: linear-gradient(135deg, #667eea, #764ba2);
+ color: white;
+ border: 2px solid white;
+ box-shadow: 0 4px 12px rgba(102, 126, 234, 0.4);
+ cursor: pointer;
+ opacity: 0;
+ pointer-events: none;
+ transition: opacity 0.2s ease;
+ z-index: 999998;
+ font-size: 12px;
+ display: flex;
+ align-items: center;
+ justify-content: center;
+ font-family: Arial, sans-serif;
+ `;
+ document.body.appendChild(button);
+
+ let currentTarget = null; // element the button currently belongs to
+ let hideTimeout = null; // pending delayed hide, cancelled whenever the button should stay visible
+
+ // Use event delegation for hover detection
+ document.addEventListener('mouseover', (e) => {
+ const target = e.target.closest('p, h1, h2, h3, h4, h5, h6, li, blockquote');
+ if (!target) {
+ // Hide button when not over readable element (delayed so the pointer can reach the button)
+ if (hideTimeout) clearTimeout(hideTimeout);
+ hideTimeout = setTimeout(() => {
+ button.style.opacity = '0';
+ button.style.pointerEvents = 'none';
+ currentTarget = null;
+ }, 300);
+ return;
+ }
+
+ // Skip short elements (fewer than 20 characters of trimmed text)
+ if (target.textContent.trim().length < 20) return;
+
+ // Check if visible
+ const style = window.getComputedStyle(target);
+ if (style.display === 'none' || style.visibility === 'hidden') return;
+
+ // Clear hide timeout
+ if (hideTimeout) clearTimeout(hideTimeout);
+
+ // Position button next to element: vertically centred (16 = half the 32px height), 45px to the left but never closer than 10px to the viewport edge
+ const rect = target.getBoundingClientRect();
+ button.style.top = (rect.top + rect.height / 2 - 16) + 'px';
+ button.style.left = Math.max(10, rect.left - 45) + 'px';
+ button.style.opacity = '1';
+ button.style.pointerEvents = 'auto';
+ currentTarget = target;
+ }, { passive: true });
+
+ // Keep button visible when hovering over it
+ button.addEventListener('mouseenter', () => {
+ if (hideTimeout) clearTimeout(hideTimeout);
+ button.style.opacity = '1';
+ button.style.pointerEvents = 'auto';
+ });
+
+ button.addEventListener('mouseleave', () => {
+ if (hideTimeout) clearTimeout(hideTimeout);
+ hideTimeout = setTimeout(() => {
+ button.style.opacity = '0';
+ button.style.pointerEvents = 'none';
+ currentTarget = null;
+ }, 200);
+ });
+
+ // Handle button click: passes only the hovered element to readTextNodes
+ button.addEventListener('click', async (e) => {
+ e.stopPropagation();
+ if (!currentTarget) return;
+
+ const textNodes = [{
+ element: currentTarget,
+ text: currentTarget.textContent.trim()
+ }];
+ await readTextNodes(textNodes);
+ });
+ }
+
+ // Make text clickable (like PDF)
+ function makeTextClickable() {
+ document.addEventListener('click', async (e) => {
+ if (!e.ctrlKey && !e.metaKey) return;
+
+ // Find the closest readable element
+ let target = e.target;
+ const readableTags = ['P', 'H1', 'H2', 'H3', 'H4', 'H5', 'H6', 'LI', 'BLOCKQUOTE', 'TD', 'TH', 'DIV', 'SPAN'];
+
+ while (target && !readableTags.includes(target.tagName)) {
+ target = target.parentElement;
+ if (target === document.body) return;
+ }
+
+ if (!target || !target.textContent.trim()) return;
+
+ // Find all readable content starting from this element
+ const allElements = Array.from(document.querySelectorAll('p, h1, h2, h3, h4, h5, h6, li, blockquote'));
+ const startIndex = allElements.indexOf(target);
+ if (startIndex === -1) return;
+
+ // Get elements from clicked one to the end
+ const elementsToRead = allElements.slice(startIndex).filter(el => {
+ const style = window.getComputedStyle(el);
+ return style.display !== 'none' && style.visibility !== 'hidden' && el.textContent.trim().length > 0;
+ });
+
+ const textNodes = elementsToRead.map(el => ({
+ element: el,
+ text: el.textContent.trim()
+ }));
+
+ await readTextNodes(textNodes);
+ });
+
+ // Show Ctrl indicator
+ let ctrlIndicator = null;
+ document.addEventListener('keydown', (e) => {
+ if ((e.ctrlKey || e.metaKey) && !ctrlIndicator) {
+ ctrlIndicator = document.createElement('div');
+ ctrlIndicator.className = 'owtts-ctrl-indicator';
+ ctrlIndicator.textContent = 'Ctrl held - Click text to start reading from there';
+ ctrlIndicator.style.cssText = `
+ position: fixed;
+ top: 20px;
+ left: 50%;
+ transform: translateX(-50%);
+ background: linear-gradient(135deg, rgba(59, 130, 246, 0.95), rgba(37, 99, 235, 0.95));
+ color: white;
+ padding: 10px 20px;
+ border-radius: 8px;
+ font-size: 13px;
+ font-weight: 600;
+ z-index: 1000000;
+ box-shadow: 0 4px 16px rgba(0, 0, 0, 0.2);
+ animation: owtts-slide-down 0.3s ease-out;
+ pointer-events: none;
+ font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
+ `;
+ document.body.appendChild(ctrlIndicator);
+ }
+ });
+
+ document.addEventListener('keyup', (e) => {
+ if ((!e.ctrlKey && !e.metaKey) && ctrlIndicator) {
+ ctrlIndicator.remove();
+ ctrlIndicator = null;
+ }
+ });
+ }
+
+ // Extract text from specific nodes
+ async function readTextNodes(textNodes) {
+ if (state.isReading) {
+ stopReading();
+ return;
+ }
+
+ state.isReading = true;
+ state.settings = state.settings || await loadSettings();
+ state.currentChunkIndex = 0;
+ state.chunks = [];
+ state.audioQueue = [];
+
+ updateMenuStatus('Processing text...', 'loading');
+
+ try {
+ // Combine all text for timing-based processing
+ const fullText = textNodes.map(node => node.text).join(' ');
+
+ console.log(`📖 Starting to read with timing-based sync`);
+
+ // Request TTS with timing data (separate audio for each chunk)
+ const timingData = await requestTTSWithTiming(fullText, state.settings);
+
+ console.log(`🎵 Received ${timingData.chunks.length} chunks with timing data`);
+
+ // Map timing chunks to DOM elements
+ state.chunks = timingData.chunks.map((chunkData, index) => {
+ // Find the best matching element for this chunk
+ // For now, use first element (can improve matching later)
+ const element = textNodes[0]?.element || document.body;
+
+ return {
+ text: chunkData.text,
+ element: element,
+ words: chunkData.words.map(w => w.word),
+ startOffset: chunkData.startOffset,
+ endOffset: chunkData.endOffset,
+ audioUrl: chunkData.audioUrl, // Each chunk has its own audio
+ timingData: chunkData // Store timing data with chunk
+ };
+ });
+
+ if (state.chunks.length === 0) {
+ throw new Error('No readable content found');
+ }
+
+ updateMenuStatus(`Reading ${state.chunks.length} chunks...`, 'reading');
+ showProgress();
+
+ // Play chunks sequentially with their own audio and timing data
+ for (let i = 0; i < state.chunks.length; i++) {
+ if (!state.isReading) break;
+
+ state.currentChunkIndex = i;
+ updateProgress(i + 1, state.chunks.length);
+
+ await playChunkWithTiming(state.chunks[i], i);
+ }
+
+ // Reading complete
+ console.log('✅ Reading complete');
+ stopReading();
+
+ } catch (error) {
+ console.error('Reading error:', error);
+ stopReading();
+ updateMenuStatus('Error: ' + error.message, 'error');
+ }
+ }
+
+ // Load settings
+ async function loadSettings() {
+ const result = await browser.storage.local.get('settings');
+ return result.settings || {
+ backendUrl: 'http://localhost:8000',
+ voice: 'piper',
+ speed: 1.0,
+ chunkSize: 50,
+ autoScroll: true,
+ wordHighlight: true,
+ highlightColor: 'yellow',
+ wordHighlightColor: 'yellow'
+ };
+ }
+
+ // Update menu status
+ function updateMenuStatus(message, type = 'idle') {
+ const statusEl = document.getElementById('owtts-status');
+ if (!statusEl) return;
+
+ statusEl.textContent = message;
+ statusEl.style.background = {
+ 'idle': 'rgba(255, 255, 255, 0.15)',
+ 'loading': 'rgba(255, 193, 7, 0.3)',
+ 'reading': 'rgba(76, 175, 80, 0.3)',
+ 'error': 'rgba(244, 67, 54, 0.3)'
+ }[type] || 'rgba(255, 255, 255, 0.15)';
+ }
+
+ // Show progress
+ function showProgress() {
+ const progressEl = document.getElementById('owtts-progress');
+ const stopBtn = document.getElementById('owtts-stop-reading');
+ const readBtn = document.getElementById('owtts-read-page');
+ const selBtn = document.getElementById('owtts-read-selection');
+
+ if (progressEl) progressEl.style.display = 'block';
+ if (stopBtn) stopBtn.style.display = 'block';
+ if (readBtn) readBtn.style.display = 'none';
+ if (selBtn) selBtn.style.display = 'none';
+ }
+
+ // Hide progress
+ function hideProgress() {
+ const progressEl = document.getElementById('owtts-progress');
+ const stopBtn = document.getElementById('owtts-stop-reading');
+ const readBtn = document.getElementById('owtts-read-page');
+ const selBtn = document.getElementById('owtts-read-selection');
+
+ if (progressEl) progressEl.style.display = 'none';
+ if (stopBtn) stopBtn.style.display = 'none';
+ if (readBtn) readBtn.style.display = 'block';
+ if (selBtn) selBtn.style.display = 'block';
+ }
+
+ // Update progress
+ function updateProgress(current, total) {
+ const progressBar = document.getElementById('owtts-progress-bar');
+ const chunkCounter = document.getElementById('owtts-chunk-counter');
+ const timeRemaining = document.getElementById('owtts-time-remaining');
+
+ if (progressBar) {
+ const percent = (current / total) * 100;
+ progressBar.style.width = `${percent}%`;
+ }
+
+ if (chunkCounter) {
+ chunkCounter.textContent = `Chunk ${current}/${total}`;
+ }
+
+ if (timeRemaining) {
+ const remaining = (total - current) * 3; // Rough estimate
+ const minutes = Math.floor(remaining / 60);
+ const seconds = remaining % 60;
+ timeRemaining.textContent = `${minutes}:${seconds.toString().padStart(2, '0')}`;
+ }
+ }
+
+ // Extract readable text from page
+function extractPageText() {
+ const elementsToExclude = ['script', 'style', 'noscript', 'iframe', 'svg', 'nav', 'header', 'footer'];
+ const readableElements = ['P', 'H1', 'H2', 'H3', 'H4', 'H5', 'H6', 'LI', 'BLOCKQUOTE', 'TD', 'TH', 'ARTICLE', 'SECTION'];
+
+ const allElements = Array.from(document.body.getElementsByTagName('*'));
+ const textNodes = [];
+
+ for (const element of allElements) {
+ if (elementsToExclude.includes(element.tagName.toLowerCase())) continue;
+ if (!readableElements.includes(element.tagName)) continue;
+
+ const style = window.getComputedStyle(element);
+ if (style.display === 'none' || style.visibility === 'hidden' || style.opacity === '0') continue;
+
+ const text = element.textContent.trim();
+ if (text.length === 0) continue;
+
+ // Avoid duplicates (parent and child both selected)
+ const hasReadableParent = textNodes.some(node => node.element.contains(element));
+ if (hasReadableParent) continue;
+
+ textNodes.push({
+ element: element,
+ text: text
+ });
+ }
+
+ return textNodes;
+ }
+
+ // Extract selected text
+ function extractSelection() {
+ const selection = window.getSelection();
+ const selectedText = selection.toString().trim();
+
+ if (!selectedText) return null;
+
+ const range = selection.getRangeAt(0);
+ const container = range.commonAncestorContainer;
+ const element = container.nodeType === Node.TEXT_NODE ? container.parentElement : container;
+
+ return [{
+ element: element,
+ text: selectedText
+ }];
+ }
+
+ // Split text into chunks
+ function splitIntoChunks(textNodes, chunkSize) {
+ const chunks = [];
+
+ for (const { element, text } of textNodes) {
+ if (text.length <= chunkSize) {
+ chunks.push({
+ text: text,
+ element: element,
+ words: text.split(/\s+/).filter(w => w.length > 0),
+ startOffset: 0,
+ endOffset: text.length
+ });
+ } else {
+ // Split by sentences
+ const sentences = text.match(/[^.!?]+[.!?]+/g) || [text];
+ let currentChunk = '';
+ let startOffset = 0;
+
+ for (const sentence of sentences) {
+ if ((currentChunk + sentence).length <= chunkSize) {
+ currentChunk += sentence;
+ } else {
+ if (currentChunk) {
+ const trimmed = currentChunk.trim();
+ const endPos = text.indexOf(trimmed, startOffset) + trimmed.length;
+ chunks.push({
+ text: trimmed,
+ element: element,
+ words: trimmed.split(/\s+/).filter(w => w.length > 0),
+ startOffset: text.indexOf(trimmed, startOffset),
+ endOffset: endPos
+ });
+ startOffset = endPos;
+ }
+ currentChunk = sentence;
+ }
+ }
+
+ if (currentChunk) {
+ const trimmed = currentChunk.trim();
+ chunks.push({
+ text: trimmed,
+ element: element,
+ words: trimmed.split(/\s+/).filter(w => w.length > 0),
+ startOffset: text.indexOf(trimmed, startOffset),
+ endOffset: text.indexOf(trimmed, startOffset) + trimmed.length
+ });
+ }
+ }
+ }
+
+ return chunks;
+ }
+
+ // Request TTS audio from backend
+ async function requestTTS(text, settings) {
+ const backendUrl = settings.backendUrl || 'http://localhost:8000';
+
+ try {
+ const response = await fetch(`${backendUrl}/api/generate_speech`, {
+ method: 'POST',
+ headers: {
+ 'Content-Type': 'application/json',
+ },
+ body: JSON.stringify({
+ text: text,
+ voice: settings.voice || 'piper',
+ speed: settings.speed || 1.0
+ })
+ });
+
+ if (!response.ok) {
+ throw new Error(`Backend error: ${response.status}`);
+ }
+
+ const blob = await response.blob();
+ const audioUrl = URL.createObjectURL(blob);
+
+ return audioUrl;
+ } catch (error) {
+ console.error('TTS request failed:', error);
+ throw error;
+ }
+ }
+
+ // Request TTS audio with timing data from backend
+ async function requestTTSWithTiming(text, settings) {
+ const backendUrl = settings.backendUrl || 'http://localhost:8000';
+
+ try {
+ const response = await fetch(`${backendUrl}/api/generate_speech_with_timing`, {
+ method: 'POST',
+ headers: {
+ 'Content-Type': 'application/json',
+ },
+ body: JSON.stringify({
+ text: text,
+ voice: settings.voice || 'piper',
+ speed: settings.speed || 1.0,
+ chunkSize: settings.chunkSize || 50
+ })
+ });
+
+ if (!response.ok) {
+ throw new Error(`Backend error: ${response.status}`);
+ }
+
+ const data = await response.json();
+
+ // Convert each chunk's audio URL to a blob URL to bypass CSP restrictions
+ for (const chunk of data.chunks) {
+ const audioUrl = `${backendUrl}${chunk.audioUrl}`;
+ try {
+ const audioResponse = await fetch(audioUrl);
+ if (!audioResponse.ok) {
+ console.warn(`⚠️ Failed to fetch audio: ${audioUrl}`);
+ continue;
+ }
+ const audioBlob = await audioResponse.blob();
+ const blobUrl = URL.createObjectURL(audioBlob);
+ chunk.audioUrl = blobUrl; // Replace with blob URL
+ state.audioBlobUrls.push(blobUrl); // Track for cleanup
+ } catch (error) {
+ console.error(`Error converting audio to blob: ${error}`);
+ }
+ }
+
+ return {
+ audioUrl: data.audioUrl, // Not used, kept for compatibility
+ duration: data.duration,
+ chunks: data.chunks,
+ normalizedText: data.normalizedText
+ };
+ } catch (error) {
+ console.error('TTS with timing request failed:', error);
+ throw error;
+ }
+ }
+
+ // Highlight chunk
+ /**
+  * Highlight the given chunk inside its element and drop the highlight of the
+  * previously shown chunk and word.
+  *
+  * Two modes, chosen by whether the element carries `data-owtts-processed`:
+  *  - word spans exist: the chunk's `.owtts-word` spans get the highlight classes;
+  *  - otherwise: the chunk text is located in the element's text and wrapped in
+  *    a span marked `data-owtts-highlight="chunk"`.
+  * Every element that received a highlight is recorded in
+  * `state.highlightedElements`. Scrolls the element into view when
+  * `state.settings.autoScroll` is set.
+  *
+  * @param {Object} chunk Chunk with `element`, `text`, `words` and optional `startOffset`.
+  */
+ function highlightChunk(chunk) {
+ // Clear previous chunk highlights (but not word wrapping)
+ state.highlightedElements.forEach(el => {
+ if (el.dataset.owttsHighlight === 'chunk') {
+ // Unwrap highlight span
+ // (move its children in front of it, then remove the emptied span)
+ const parent = el.parentNode;
+ while (el.firstChild) {
+ parent.insertBefore(el.firstChild, el);
+ }
+ parent.removeChild(el);
+ } else {
+ el.classList.remove('owtts-chunk-highlight');
+ // Remove all possible color classes
+ el.classList.remove('owtts-color-yellow', 'owtts-color-green', 'owtts-color-blue', 'owtts-color-pink', 'owtts-color-orange');
+ }
+ });
+ state.highlightedElements = [];
+
+ // Clear word highlighting (but keep word wrapping)
+ if (state.currentWordElement) {
+ state.currentWordElement.classList.remove('owtts-word-highlight');
+ // Remove all possible word color classes
+ state.currentWordElement.classList.remove('owtts-word-color-yellow', 'owtts-word-color-green', 'owtts-word-color-blue', 'owtts-word-color-pink', 'owtts-word-color-orange');
+ state.currentWordElement.classList.remove('owtts-color-yellow', 'owtts-color-green', 'owtts-color-blue', 'owtts-color-pink', 'owtts-color-orange');
+ state.currentWordElement = null;
+ }
+
+ const element = chunk.element;
+
+ // If word wrapping is already applied, highlight word spans instead of wrapping text
+ if (element.dataset.owttsProcessed) {
+ // Index of the chunk's first word among the element's word spans
+ const wordOffset = parseInt(element.dataset.owttsWordOffset || '0', 10);
+ const chunkWordCount = chunk.words.length;
+ const wordSpans = element.querySelectorAll('.owtts-word');
+
+ // Add chunk highlight class to all words in this chunk
+ for (let i = 0; i < chunkWordCount; i++) {
+ const span = wordSpans[wordOffset + i];
+ if (span) {
+ span.classList.add('owtts-chunk-highlight', `owtts-color-${state.settings.highlightColor}`);
+ state.highlightedElements.push(span);
+ }
+ }
+ } else {
+ // No word wrapping, use traditional range-based chunk highlighting
+ const fullText = element.textContent;
+ const startIdx = fullText.indexOf(chunk.text, chunk.startOffset || 0);
+
+ if (startIdx === -1) {
+ // Fallback: highlight whole element
+ element.classList.add('owtts-chunk-highlight', `owtts-color-${state.settings.highlightColor}`);
+ state.highlightedElements.push(element);
+ } else {
+ // Wrap only the chunk text in a highlight span
+ const range = document.createRange();
+ const walker = document.createTreeWalker(element, NodeFilter.SHOW_TEXT);
+
+ // charCount = number of characters in the text nodes visited before the current one
+ let charCount = 0;
+ let startNode = null, startNodeOffset = 0;
+ let endNode = null, endNodeOffset = 0;
+
+ // Translate [startIdx, startIdx + chunk.text.length) from element-text
+ // coordinates into (text node, offset) pairs for the range
+ while (walker.nextNode()) {
+ const node = walker.currentNode;
+ const nodeLength = node.textContent.length;
+
+ if (startNode === null && charCount + nodeLength > startIdx) {
+ startNode = node;
+ startNodeOffset = startIdx - charCount;
+ }
+
+ if (startNode !== null && charCount + nodeLength >= startIdx + chunk.text.length) {
+ endNode = node;
+ endNodeOffset = startIdx + chunk.text.length - charCount;
+ break;
+ }
+
+ charCount += nodeLength;
+ }
+
+ // NOTE: when the walk ends without finding both boundaries nothing is
+ // highlighted; there is no fallback on this path.
+ if (startNode && endNode) {
+ try {
+ range.setStart(startNode, startNodeOffset);
+ range.setEnd(endNode, endNodeOffset);
+
+ const span = document.createElement('span');
+ span.className = `owtts-chunk-highlight owtts-color-${state.settings.highlightColor}`;
+ span.dataset.owttsHighlight = 'chunk';
+ // surroundContents() throws when the range covers a non-text node only
+ // partially (e.g. the chunk starts inside a link and ends outside it)
+ range.surroundContents(span);
+ state.highlightedElements.push(span);
+ } catch (e) {
+ // Fallback if range wrapping fails
+ element.classList.add('owtts-chunk-highlight', `owtts-color-${state.settings.highlightColor}`);
+ state.highlightedElements.push(element);
+ }
+ }
+ }
+ }
+
+ // Scroll to element if auto-scroll is enabled
+ if (state.settings.autoScroll) {
+ element.scrollIntoView({
+ behavior: 'smooth',
+ block: 'center'
+ });
+ }
+ }
+
+ // Prepare word highlighting by wrapping words in advance
+ function prepareWordHighlighting(chunk) {
+ // If switching to a different element, clear its word wrapping
+ if (state.previousChunkElement && state.previousChunkElement !== chunk.element) {
+ if (state.previousChunkElement.dataset.owttsOriginal) {
+ state.previousChunkElement.innerHTML = state.previousChunkElement.dataset.owttsOriginal;
+ delete state.previousChunkElement.dataset.owttsOriginal;
+ delete state.previousChunkElement.dataset.owttsProcessed;
+ delete state.previousChunkElement.dataset.owttsChunkId;
+ delete state.previousChunkElement.dataset.owttsWordOffset;
+ }
+ }
+
+ // If same element but different chunk index, recalculate offset
+ const currentChunkId = state.currentChunkIndex.toString();
+ if (chunk.element.dataset.owttsProcessed && chunk.element.dataset.owttsChunkId !== currentChunkId) {
+ // Element already processed for a different chunk - recalculate offset
+ const textBeforeChunk = chunk.element.textContent.slice(0, chunk.startOffset || 0);
+ const wordsBeforeChunk = textBeforeChunk.split(/\s+/).filter(w => w.trim().length > 0).length;
+ chunk.element.dataset.owttsWordOffset = wordsBeforeChunk.toString();
+ chunk.element.dataset.owttsChunkId = currentChunkId;
+ }
+
+ // Process current chunk if not already processed
+ if (!chunk.element.dataset.owttsProcessed) {
+ wrapWordsInSpans(chunk);
+ chunk.element.dataset.owttsChunkId = currentChunkId;
+
+ // Calculate word offset: count words before chunk's startOffset
+ const textBeforeChunk = chunk.element.textContent.slice(0, chunk.startOffset || 0);
+ const wordsBeforeChunk = textBeforeChunk.split(/\s+/).filter(w => w.trim().length > 0).length;
+ chunk.element.dataset.owttsWordOffset = wordsBeforeChunk.toString();
+
+ state.previousChunkElement = chunk.element;
+ }
+ }
+
+ // Highlight individual word
+ function highlightWord(chunk, wordIndex) {
+ if (state.currentWordElement) {
+ state.currentWordElement.classList.remove('owtts-word-highlight');
+ // Remove all possible word color classes
+ state.currentWordElement.classList.remove('owtts-word-color-yellow', 'owtts-word-color-green', 'owtts-word-color-blue', 'owtts-word-color-pink', 'owtts-word-color-orange');
+ state.currentWordElement.classList.remove('owtts-color-yellow', 'owtts-color-green', 'owtts-color-blue', 'owtts-color-pink', 'owtts-color-orange');
+
+ // Restore chunk highlighting to previous word
+ if (state.currentWordElement.classList.contains('owtts-word')) {
+ state.currentWordElement.classList.add('owtts-chunk-highlight', `owtts-color-${state.settings.highlightColor}`);
+ }
+ }
+
+ if (!state.settings.wordHighlight) return;
+
+ // Apply the word offset to get the correct position in the element
+ const wordOffset = parseInt(chunk.element.dataset.owttsWordOffset || '0', 10);
+ const actualWordIndex = wordOffset + wordIndex;
+
+ const wordSpans = chunk.element.querySelectorAll('.owtts-word');
+ if (wordSpans[actualWordIndex]) {
+ // Remove chunk highlighting from current word to show only word color
+ wordSpans[actualWordIndex].classList.remove('owtts-chunk-highlight');
+ wordSpans[actualWordIndex].classList.remove('owtts-color-yellow', 'owtts-color-green', 'owtts-color-blue', 'owtts-color-pink', 'owtts-color-orange');
+
+ // Apply word highlighting
+ const wordColor = state.settings.wordHighlightColor || state.settings.highlightColor;
+ wordSpans[actualWordIndex].classList.add('owtts-word-highlight', `owtts-word-color-${wordColor}`);
+ state.currentWordElement = wordSpans[actualWordIndex];
+ }
+ }
+
+ // Wrap words in spans
+ function wrapWordsInSpans(chunk) {
+ const element = chunk.element;
+ element.dataset.owttsOriginal = element.innerHTML;
+ element.dataset.owttsProcessed = 'true';
+
+ const walker = document.createTreeWalker(element, NodeFilter.SHOW_TEXT, null);
+ const textNodes = [];
+ while (walker.nextNode()) {
+ if (walker.currentNode.textContent.trim().length > 0) {
+ textNodes.push(walker.currentNode);
+ }
+ }
+
+ for (const textNode of textNodes) {
+ const words = textNode.textContent.split(/(\s+)/);
+ const fragment = document.createDocumentFragment();
+
+ words.forEach(word => {
+ if (word.trim().length > 0) {
+ const span = document.createElement('span');
+ span.className = 'owtts-word';
+ span.textContent = word;
+ fragment.appendChild(span);
+ } else if (word.length > 0) {
+ fragment.appendChild(document.createTextNode(word));
+ }
+ });
+
+ textNode.parentNode.replaceChild(fragment, textNode);
+ }
+ }
+
+ // Clear highlights
+ function clearHighlights() {
+ state.highlightedElements.forEach(el => {
+ if (el.dataset.owttsHighlight === 'chunk') {
+ // Unwrap highlight span
+ const parent = el.parentNode;
+ while (el.firstChild) {
+ parent.insertBefore(el.firstChild, el);
+ }
+ parent.removeChild(el);
+ } else {
+ el.classList.remove('owtts-chunk-highlight');
+ // Remove all possible color classes
+ el.classList.remove('owtts-color-yellow', 'owtts-color-green', 'owtts-color-blue', 'owtts-color-pink', 'owtts-color-orange');
+ }
+ });
+ state.highlightedElements = [];
+
+ if (state.currentWordElement) {
+ state.currentWordElement.classList.remove('owtts-word-highlight');
+ // Remove all possible word color classes
+ state.currentWordElement.classList.remove('owtts-word-color-yellow', 'owtts-word-color-green', 'owtts-word-color-blue', 'owtts-word-color-pink', 'owtts-word-color-orange');
+ state.currentWordElement.classList.remove('owtts-color-yellow', 'owtts-color-green', 'owtts-color-blue', 'owtts-color-pink', 'owtts-color-orange');
+ state.currentWordElement = null;
+ }
+
+ document.querySelectorAll('[data-owtts-processed]').forEach(el => {
+ if (el.dataset.owttsOriginal) {
+ el.innerHTML = el.dataset.owttsOriginal;
+ delete el.dataset.owttsOriginal;
+ delete el.dataset.owttsProcessed;
+ delete el.dataset.owttsChunkId;
+ delete el.dataset.owttsWordOffset;
+ }
+ });
+
+ // Normalize text nodes
+ document.querySelectorAll('p, h1, h2, h3, h4, h5, h6, li, blockquote').forEach(el => {
+ if (el.normalize) el.normalize();
+ });
+ }
+
+ // Play audio chunk with timing data
+ /**
+  * Play one chunk's audio and keep the word highlight in sync with playback
+  * using the chunk's timing data.
+  *
+  * The audio element is stored in `state.currentAudio`.
+  *
+  * @param {Object} chunk Chunk with `element`, `audioUrl` and `timingData`.
+  * @param {number} chunkIndex Index of the chunk (not used inside this function).
+  * @returns {Promise<void>} Resolves on the audio's `ended` event; rejects on its
+  *     `error` event or when `play()` is refused. A pause does not settle the promise.
+  */
+ async function playChunkWithTiming(chunk, chunkIndex) {
+ // Prepare word wrapping if needed
+ if (state.settings.wordHighlight) {
+ prepareWordHighlighting(chunk);
+ }
+
+ highlightChunk(chunk);
+
+ // Get audio URL from chunk (already converted to blob URL)
+ const audioUrl = chunk.audioUrl; // Already a blob URL from requestTTSWithTiming
+ const chunkTimingData = chunk.timingData;
+
+ return new Promise((resolve, reject) => {
+ const audio = new Audio(audioUrl);
+ audio.playbackRate = state.settings.speed || 1.0;
+ state.currentAudio = audio;
+
+ // lastWordIndex: last index passed to highlightWord; animationFrame: id of the pending frame
+ let lastWordIndex = -1;
+ let animationFrame = null;
+
+ // Use precise timing data from backend
+ // Runs once per animation frame and always re-schedules itself; the loop is
+ // only stopped by the ended/pause/error handlers below.
+ const updateWordHighlight = () => {
+ if (!state.settings.wordHighlight) {
+ animationFrame = requestAnimationFrame(updateWordHighlight);
+ return;
+ }
+
+ // Nothing to sync while metadata is missing or playback is not running
+ if (!audio.duration || audio.duration === 0 || audio.paused || audio.ended) {
+ animationFrame = requestAnimationFrame(updateWordHighlight);
+ return;
+ }
+
+ const currentTime = audio.currentTime;
+
+ // Find the current word based on timing data
+ if (!chunkTimingData || !chunkTimingData.words) {
+ animationFrame = requestAnimationFrame(updateWordHighlight);
+ return;
+ }
+
+ // Find which word should be highlighted at current time
+ let wordIndex = -1;
+ for (let i = 0; i < chunkTimingData.words.length; i++) {
+ const wordTiming = chunkTimingData.words[i];
+
+ // Skip citation markers and other marked skip regions
+ if (wordTiming.skip) {
+ continue;
+ }
+
+ if (currentTime >= wordTiming.startTime && currentTime <= wordTiming.endTime) {
+ wordIndex = i;
+ break;
+ } else if (currentTime < wordTiming.startTime) {
+ // We haven't reached this word yet, find previous non-skip word
+ // (stays -1 when there is none, i.e. before the first word starts)
+ for (let j = i - 1; j >= 0; j--) {
+ if (!chunkTimingData.words[j].skip) {
+ wordIndex = j;
+ break;
+ }
+ }
+ break;
+ }
+ }
+
+ // If past all words, use last non-skip word
+ if (wordIndex === -1 && chunkTimingData.words.length > 0) {
+ const lastWordTiming = chunkTimingData.words[chunkTimingData.words.length - 1];
+ if (currentTime > lastWordTiming.endTime) {
+ // Find last non-skip word
+ for (let i = chunkTimingData.words.length - 1; i >= 0; i--) {
+ if (!chunkTimingData.words[i].skip) {
+ wordIndex = i;
+ break;
+ }
+ }
+ }
+ }
+
+ // Update highlight if word changed
+ if (wordIndex !== lastWordIndex && wordIndex >= 0) {
+ lastWordIndex = wordIndex;
+ highlightWord(chunk, wordIndex);
+ }
+
+ animationFrame = requestAnimationFrame(updateWordHighlight);
+ };
+
+ // NOTE: this log line reads chunkTimingData.words without a guard, unlike the loop above
+ audio.addEventListener('loadedmetadata', () => {
+ console.log(`🎵 Audio loaded: ${audio.duration}s, ${chunkTimingData.words.length} words`);
+ });
+
+ // (Re)start the highlight loop whenever playback starts
+ audio.addEventListener('play', () => {
+ if (animationFrame) cancelAnimationFrame(animationFrame);
+ animationFrame = requestAnimationFrame(updateWordHighlight);
+ });
+
+ audio.addEventListener('playing', () => {
+ if (!animationFrame) {
+ animationFrame = requestAnimationFrame(updateWordHighlight);
+ }
+ });
+
+ // Normal completion: stop the loop and let the caller continue with the next chunk
+ audio.addEventListener('ended', () => {
+ if (animationFrame) cancelAnimationFrame(animationFrame);
+ animationFrame = null;
+ resolve();
+ });
+
+ // A pause only stops the highlight loop; the promise stays pending
+ audio.addEventListener('pause', () => {
+ if (animationFrame) cancelAnimationFrame(animationFrame);
+ animationFrame = null;
+ });
+
+ audio.addEventListener('error', (error) => {
+ if (animationFrame) cancelAnimationFrame(animationFrame);
+ animationFrame = null;
+ reject(error);
+ });
+
+ // play() returns a promise; a refusal to start rejects the outer promise
+ audio.play().catch(reject);
+ });
+ }
+
+ // Main reading function
+ async function startReading(mode, settings) {
+ if (state.isReading) {
+ console.warn('Already reading, stopping current...');
+ stopReading();
+ // Small delay to ensure cleanup
+ await new Promise(resolve => setTimeout(resolve, 100));
+ }
+
+ state.settings = settings || await loadSettings();
+
+ let textNodes;
+ if (mode === 'selection') {
+ textNodes = extractSelection();
+ if (!textNodes || textNodes.length === 0) {
+ updateMenuStatus('No text selected', 'error');
+ return;
+ }
+ } else {
+ textNodes = extractPageText();
+ }
+
+ await readTextNodes(textNodes);
+ }
+
+ // Stop reading
+ function stopReading() {
+ state.isReading = false;
+
+ if (state.currentAudio) {
+ state.currentAudio.pause();
+ state.currentAudio = null;
+ }
+
+ clearHighlights();
+
+ // Delay blob URL cleanup to prevent ERR_REQUEST_RANGE_NOT_SATISFIABLE
+ setTimeout(() => {
+ // Clean up old audioQueue blob URLs
+ state.audioQueue.forEach(url => {
+ try {
+ URL.revokeObjectURL(url);
+ } catch (e) {
+ // Ignore errors during cleanup
+ }
+ });
+ state.audioQueue = [];
+
+ // Clean up audioBlobUrls (timing-based audio)
+ state.audioBlobUrls.forEach(url => {
+ try {
+ URL.revokeObjectURL(url);
+ } catch (e) {
+ // Ignore errors during cleanup
+ }
+ });
+ state.audioBlobUrls = [];
+ }, 500);
+
+ state.chunks = [];
+ state.currentChunkIndex = 0;
+ state.previousChunkElement = null;
+
+ updateMenuStatus('Ready to read', 'idle');
+ hideProgress();
+
+ console.log('⏹️ Reading stopped');
+ }
+
+ // Toggle menu visibility
+ function toggleMenu() {
+ if (!state.floatingMenu) {
+ createFloatingMenu();
+ }
+
+ const menu = state.floatingMenu;
+ if (menu.style.display === 'none' || !state.isMenuVisible) {
+ menu.style.display = 'block';
+ state.isMenuVisible = true;
+ } else {
+ menu.style.display = 'none';
+ state.isMenuVisible = false;
+ }
+ }
+
+ // Listen for messages from popup and background
+ browser.runtime.onMessage.addListener((message, sender, sendResponse) => {
+ if (message.action === 'startReading') {
+ startReading(message.mode, message.settings);
+ sendResponse({ success: true });
+ } else if (message.action === 'stopReading') {
+ stopReading();
+ sendResponse({ success: true });
+ } else if (message.action === 'toggleMenu') {
+ toggleMenu();
+ sendResponse({ success: true });
+ } else if (message.action === 'readFromHere') {
+ // This will be handled by click listener
+ sendResponse({ success: true });
+ } else if (message.action === 'updateSettings') {
+ // Update settings immediately without page refresh
+ state.settings = message.settings;
+
+ // If currently reading, update highlight colors immediately
+ if (state.isReading) {
+ // Reapply chunk highlight with new color
+ const currentChunk = state.chunks[state.currentChunkIndex];
+ if (currentChunk) {
+ // Clear old highlights with old colors
+ state.highlightedElements.forEach(el => {
+ el.classList.remove('owtts-chunk-highlight');
+ el.classList.remove('owtts-color-yellow', 'owtts-color-green', 'owtts-color-blue', 'owtts-color-pink', 'owtts-color-orange');
+ });
+ state.highlightedElements = [];
+
+ // Reapply with new color
+ highlightChunk(currentChunk);
+
+ // Update word highlight color if active
+ if (state.currentWordElement) {
+ state.currentWordElement.classList.remove('owtts-word-color-yellow', 'owtts-word-color-green', 'owtts-word-color-blue', 'owtts-word-color-pink', 'owtts-word-color-orange');
+ const wordColor = state.settings.wordHighlightColor || state.settings.highlightColor;
+ state.currentWordElement.classList.add(`owtts-word-color-${wordColor}`);
+ }
+ }
+ }
+
+ sendResponse({ success: true });
+ }
+
+ return true;
+ });
+
+ // Listen for keyboard commands
+ browser.runtime.onMessage.addListener((message) => {
+ if (message.command) {
+ switch (message.command) {
+ case 'read-page':
+ startReading('page', state.settings);
+ break;
+ case 'read-selection':
+ startReading('selection', state.settings);
+ break;
+ case 'stop-reading':
+ stopReading();
+ break;
+ case 'toggle-menu':
+ toggleMenu();
+ break;
+ }
+ }
+ });
+
+ // Initialize
+ setTimeout(() => {
+ createFloatingMenu();
+ addParagraphButtons();
+ makeTextClickable();
+
+ // Show menu by default
+ toggleMenu();
+ }, 1000);
+
+ // Add CSS animation
+ // Inject the extension's stylesheet into the host page's head. It provides the
+ // `owtts-slide-down` keyframes used by the Ctrl indicator's animation, the
+ // hover/active effects of `.owtts-btn` and the hover effect of `#owtts-close-menu`.
+ // NOTE(review): the bare `kbd` rule is not scoped to extension elements, so it
+ // looks like it also restyles kbd elements of the host page; confirm whether
+ // it should be limited to the floating menu.
+ const style = document.createElement('style');
+ style.textContent = `
+ @keyframes owtts-slide-down {
+ from {
+ opacity: 0;
+ transform: translateX(-50%) translateY(-10px);
+ }
+ to {
+ opacity: 1;
+ transform: translateX(-50%) translateY(0);
+ }
+ }
+
+ kbd {
+ background: rgba(255, 255, 255, 0.2);
+ padding: 2px 6px;
+ border-radius: 3px;
+ font-size: 10px;
+ font-family: monospace;
+ }
+
+ .owtts-btn:hover {
+ transform: translateY(-2px);
+ box-shadow: 0 4px 12px rgba(0, 0, 0, 0.2);
+ }
+
+ .owtts-btn:active {
+ transform: translateY(0);
+ }
+
+ #owtts-close-menu:hover {
+ background: rgba(255, 255, 255, 0.3);
+ transform: rotate(90deg);
+ }
+ `;
+ document.head.appendChild(style);
+
+})();
diff --git a/browser-extension/generate_icons.py b/browser-extension/generate_icons.py
new file mode 100644
index 0000000..c23e81a
--- /dev/null
+++ b/browser-extension/generate_icons.py
@@ -0,0 +1,102 @@
+"""
+Icon generator for OpenWebTTS browser extension
+Draws the PNG icons with Pillow (the SVG icons are not read or converted)
+"""
+
+from PIL import Image, ImageDraw, ImageFont
+import os
+
+def create_icon(size, output_path):
+ """Create a simple icon with gradient background and speaker emoji"""
+
+ # Create image with gradient
+ img = Image.new('RGBA', (size, size), (0, 0, 0, 0))
+ draw = ImageDraw.Draw(img)
+
+ # Draw gradient background (purple gradient)
+ for y in range(size):
+ # Interpolate between two colors
+ ratio = y / size
+ r = int(102 + (118 - 102) * ratio)
+ g = int(126 + (75 - 126) * ratio)
+ b = int(234 + (162 - 234) * ratio)
+ draw.rectangle([(0, y), (size, y + 1)], fill=(r, g, b, 255))
+
+ # Add rounded corners
+ corner_radius = max(3, size // 8)
+ mask = Image.new('L', (size, size), 0)
+ mask_draw = ImageDraw.Draw(mask)
+ mask_draw.rounded_rectangle([(0, 0), (size, size)], corner_radius, fill=255)
+
+ # Apply mask
+ img.putalpha(mask)
+
+ # Add speaker icon (simple geometric shape)
+ # Draw speaker cone
+ speaker_size = size // 2
+ left = size // 4
+ top = size // 4
+
+ # Speaker body (rectangle)
+ draw.rectangle(
+ [(left, top + speaker_size // 3),
+ (left + speaker_size // 3, top + 2 * speaker_size // 3)],
+ fill=(255, 255, 255, 255)
+ )
+
+ # Speaker cone (polygon)
+ points = [
+ (left + speaker_size // 3, top + speaker_size // 4),
+ (left + 2 * speaker_size // 3, top),
+ (left + 2 * speaker_size // 3, top + speaker_size),
+ (left + speaker_size // 3, top + 3 * speaker_size // 4)
+ ]
+ draw.polygon(points, fill=(255, 255, 255, 255))
+
+ # Sound waves
+ if size >= 48:
+ wave_start_x = left + 2 * speaker_size // 3 + 5
+ mid_y = top + speaker_size // 2
+
+ # Small wave
+ draw.arc(
+ [(wave_start_x, mid_y - 8), (wave_start_x + 16, mid_y + 8)],
+ -30, 30, fill=(255, 255, 255, 200), width=2
+ )
+
+ # Medium wave
+ if size >= 128:
+ draw.arc(
+ [(wave_start_x + 5, mid_y - 14), (wave_start_x + 26, mid_y + 14)],
+ -30, 30, fill=(255, 255, 255, 150), width=2
+ )
+
+ # Save
+ img.save(output_path, 'PNG')
+ print(f"✅ Created {output_path} ({size}x{size})")
+
+def main():
+ """Generate all icon sizes"""
+ script_dir = os.path.dirname(os.path.abspath(__file__))
+ icons_dir = os.path.join(script_dir, 'icons')
+
+ # Create icons directory if it doesn't exist
+ os.makedirs(icons_dir, exist_ok=True)
+
+ # Generate icons
+ sizes = [16, 48, 128]
+
+ for size in sizes:
+ output_path = os.path.join(icons_dir, f'icon{size}.png')
+ create_icon(size, output_path)
+
+ print("\n✨ All icons generated successfully!")
+ print("📁 Location: browser-extension/icons/")
+
+if __name__ == '__main__':
+ # Script entry point: generate the icons and report any failure as a short
+ # message with a hint instead of a traceback.
+ # NOTE: the `from PIL import ...` line at the top of the file executes before
+ # this `try`, so a missing Pillow installation raises ImportError there and
+ # never reaches the hint printed below.
+ try:
+ main()
+ except Exception as e:
+ print(f"❌ Error generating icons: {e}")
+ print("\n💡 If PIL is not installed, run: pip install pillow")
+ print("Or use the provided SVG icons directly - modern browsers support them!")
diff --git a/browser-extension/icons/icon128.svg b/browser-extension/icons/icon128.svg
new file mode 100644
index 0000000..51f039b
--- /dev/null
+++ b/browser-extension/icons/icon128.svg
@@ -0,0 +1,10 @@
+
+
+
+
+
+
+
+
+ 🔊
+
diff --git a/browser-extension/icons/icon16.svg b/browser-extension/icons/icon16.svg
new file mode 100644
index 0000000..578c649
--- /dev/null
+++ b/browser-extension/icons/icon16.svg
@@ -0,0 +1,11 @@
+
+
+
+
+
+
+
+
+
+
+
diff --git a/browser-extension/icons/icon48.svg b/browser-extension/icons/icon48.svg
new file mode 100644
index 0000000..8e88448
--- /dev/null
+++ b/browser-extension/icons/icon48.svg
@@ -0,0 +1,10 @@
+
+
+
+
+
+
+
+
+ 🔊
+
diff --git a/browser-extension/manifest.json b/browser-extension/manifest.json
new file mode 100644
index 0000000..c58f5f0
--- /dev/null
+++ b/browser-extension/manifest.json
@@ -0,0 +1,76 @@
+{
+ "manifest_version": 3,
+ "name": "OpenWebTTS - Text to Speech Reader",
+ "version": "1.0.0",
+ "description": "Read any webpage aloud with word-by-word highlighting. Powered by OpenWebTTS.",
+ "permissions": [
+ "activeTab",
+ "storage",
+ "scripting",
+ "contextMenus"
+ ],
+ "host_permissions": [
+ "http://localhost:8000/*",
+ "http://127.0.0.1:8000/*"
+ ],
+ "commands": {
+ "read-page": {
+ "suggested_key": {
+ "default": "Ctrl+Shift+R",
+ "mac": "Command+Shift+R"
+ },
+ "description": "Read entire page"
+ },
+ "read-selection": {
+ "suggested_key": {
+ "default": "Ctrl+Shift+S",
+ "mac": "Command+Shift+S"
+ },
+ "description": "Read selected text"
+ },
+ "stop-reading": {
+ "suggested_key": {
+ "default": "Ctrl+Shift+X",
+ "mac": "Command+Shift+X"
+ },
+ "description": "Stop reading"
+ },
+ "toggle-menu": {
+ "suggested_key": {
+ "default": "Ctrl+Shift+M",
+ "mac": "Command+Shift+M"
+ },
+ "description": "Toggle floating menu"
+ }
+ },
+ "action": {
+ "default_popup": "popup/popup.html",
+ "default_icon": {
+ "16": "icons/icon16.svg",
+ "48": "icons/icon48.svg",
+ "128": "icons/icon128.svg"
+ }
+ },
+ "background": {
+ "service_worker": "background/background.js"
+ },
+ "content_scripts": [
+ {
+ "matches": ["<all_urls>"],
+ "js": ["content/content.js"],
+ "css": ["content/content.css"],
+ "run_at": "document_idle"
+ }
+ ],
+ "icons": {
+ "16": "icons/icon16.svg",
+ "48": "icons/icon48.svg",
+ "128": "icons/icon128.svg"
+ },
+ "web_accessible_resources": [
+ {
+ "resources": ["icons/*.svg", "icons/*.png"],
+ "matches": ["<all_urls>"]
+ }
+ ]
+}
diff --git a/browser-extension/manifest_firefox.json b/browser-extension/manifest_firefox.json
new file mode 100644
index 0000000..43db7c2
--- /dev/null
+++ b/browser-extension/manifest_firefox.json
@@ -0,0 +1,71 @@
+{
+ "manifest_version": 2,
+ "name": "OpenWebTTS - Text to Speech Reader",
+ "version": "1.0.0",
+ "description": "Read any webpage aloud with word-by-word highlighting. Powered by OpenWebTTS.",
+ "permissions": [
+ "activeTab",
+ "storage",
+ "<all_urls>",
+ "contextMenus",
+ "http://localhost:8000/*",
+ "http://127.0.0.1:8000/*"
+ ],
+ "commands": {
+ "read-page": {
+ "suggested_key": {
+ "default": "Ctrl+Shift+R"
+ },
+ "description": "Read entire page"
+ },
+ "read-selection": {
+ "suggested_key": {
+ "default": "Ctrl+Shift+S"
+ },
+ "description": "Read selected text"
+ },
+ "stop-reading": {
+ "suggested_key": {
+ "default": "Ctrl+Shift+X"
+ },
+ "description": "Stop reading"
+ },
+ "toggle-menu": {
+ "suggested_key": {
+ "default": "Ctrl+Shift+M"
+ },
+ "description": "Toggle floating menu"
+ }
+ },
+ "browser_action": {
+ "default_popup": "popup/popup.html",
+ "default_icon": {
+ "16": "icons/icon16.png",
+ "48": "icons/icon48.png",
+ "128": "icons/icon128.png"
+ }
+ },
+ "background": {
+ "scripts": ["background/background.js"]
+ },
+ "content_scripts": [
+ {
+ "matches": ["<all_urls>"],
+ "js": ["content/content.js"],
+ "css": ["content/content.css"],
+ "run_at": "document_idle"
+ }
+ ],
+ "icons": {
+ "16": "icons/icon16.png",
+ "48": "icons/icon48.png",
+ "128": "icons/icon128.png"
+ },
+ "web_accessible_resources": ["icons/*.png"],
+ "browser_specific_settings": {
+ "gecko": {
+ "id": "openwebtts@extension.local",
+ "strict_min_version": "57.0"
+ }
+ }
+}
diff --git a/browser-extension/package.json b/browser-extension/package.json
new file mode 100644
index 0000000..7fcb004
--- /dev/null
+++ b/browser-extension/package.json
@@ -0,0 +1,28 @@
+{
+ "name": "openwebtts-extension",
+ "version": "1.0.0",
+ "description": "Browser extension for OpenWebTTS - Text to Speech Reader",
+ "scripts": {
+ "start:chrome": "echo 'Load extension manually in chrome://extensions/'",
+ "start:firefox": "web-ext run --source-dir=. --firefox-profile=dev",
+ "build:firefox": "web-ext build --source-dir=. --artifacts-dir=../dist --overwrite-dest",
+ "lint": "web-ext lint --source-dir=.",
+ "package": "npm run build:firefox"
+ },
+ "devDependencies": {
+ "web-ext": "^7.11.0"
+ },
+ "repository": {
+ "type": "git",
+ "url": "https://github.com/Gyyyn/OpenWebTTS.git"
+ },
+ "keywords": [
+ "tts",
+ "text-to-speech",
+ "accessibility",
+ "browser-extension",
+ "speech-synthesis"
+ ],
+ "author": "OpenWebTTS Contributors",
+ "license": "MIT"
+}
diff --git a/browser-extension/popup/popup.css b/browser-extension/popup/popup.css
new file mode 100644
index 0000000..37f39bb
--- /dev/null
+++ b/browser-extension/popup/popup.css
@@ -0,0 +1,302 @@
+* {
+ margin: 0;
+ padding: 0;
+ box-sizing: border-box;
+}
+
+body {
+ width: 380px;
+ min-height: 500px;
+ font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Oxygen, Ubuntu, Cantarell, sans-serif;
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+ color: #333;
+}
+
+.container {
+ background: white;
+ border-radius: 0;
+ overflow: hidden;
+}
+
+.header {
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+ color: white;
+ padding: 20px;
+ text-align: center;
+}
+
+.header h1 {
+ font-size: 24px;
+ font-weight: 700;
+ margin-bottom: 5px;
+}
+
+.subtitle {
+ font-size: 13px;
+ opacity: 0.9;
+}
+
+.status-section {
+ padding: 15px 20px;
+ background: #f8f9fa;
+ border-bottom: 1px solid #e9ecef;
+}
+
+.status {
+ display: flex;
+ align-items: center;
+ gap: 8px;
+ font-size: 13px;
+ font-weight: 500;
+}
+
+.status-dot {
+ width: 10px;
+ height: 10px;
+ border-radius: 50%;
+ animation: pulse 2s infinite;
+}
+
+.status.connecting .status-dot {
+ background: #ffc107;
+}
+
+.status.connected .status-dot {
+ background: #28a745;
+}
+
+.status.disconnected .status-dot {
+ background: #dc3545;
+}
+
+@keyframes pulse {
+ 0%, 100% { opacity: 1; }
+ 50% { opacity: 0.5; }
+}
+
+.settings-section {
+ padding: 20px;
+ max-height: 350px;
+ overflow-y: auto;
+}
+
+.setting-group {
+ margin-bottom: 15px;
+}
+
+.setting-group label {
+ display: block;
+ font-size: 13px;
+ font-weight: 600;
+ color: #495057;
+ margin-bottom: 6px;
+}
+
+.setting-group input[type="text"],
+.setting-group input[type="number"],
+.setting-group select {
+ width: 100%;
+ padding: 8px 12px;
+ border: 1px solid #ced4da;
+ border-radius: 6px;
+ font-size: 13px;
+ transition: border-color 0.2s;
+}
+
+.setting-group input:focus,
+.setting-group select:focus {
+ outline: none;
+ border-color: #667eea;
+ box-shadow: 0 0 0 3px rgba(102, 126, 234, 0.1);
+}
+
+.setting-group input[type="range"] {
+ width: 100%;
+ height: 6px;
+ -webkit-appearance: none;
+ appearance: none;
+ background: #e9ecef;
+ border-radius: 3px;
+ outline: none;
+}
+
+.setting-group input[type="range"]::-webkit-slider-thumb {
+ -webkit-appearance: none;
+ appearance: none;
+ width: 18px;
+ height: 18px;
+ background: #667eea;
+ border-radius: 50%;
+ cursor: pointer;
+ transition: background 0.2s;
+}
+
+.setting-group input[type="range"]::-webkit-slider-thumb:hover {
+ background: #5568d3;
+}
+
+.setting-group input[type="range"]::-moz-range-thumb {
+ width: 18px;
+ height: 18px;
+ background: #667eea;
+ border-radius: 50%;
+ border: none;
+ cursor: pointer;
+ transition: background 0.2s;
+}
+
+.checkbox-label {
+ display: flex;
+ align-items: center;
+ gap: 8px;
+ cursor: pointer;
+ user-select: none;
+}
+
+.checkbox-label input[type="checkbox"] {
+ width: 18px;
+ height: 18px;
+ cursor: pointer;
+}
+
+.checkbox-label span {
+ font-size: 13px;
+ font-weight: 500;
+}
+
+.controls-section {
+ padding: 20px;
+ display: flex;
+ flex-direction: column;
+ gap: 10px;
+ border-top: 1px solid #e9ecef;
+}
+
+.btn {
+ padding: 12px 20px;
+ border: none;
+ border-radius: 8px;
+ font-size: 14px;
+ font-weight: 600;
+ cursor: pointer;
+ transition: all 0.2s;
+ display: flex;
+ align-items: center;
+ justify-content: center;
+ gap: 8px;
+}
+
+.btn:disabled {
+ opacity: 0.5;
+ cursor: not-allowed;
+}
+
+.btn-primary {
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+ color: white;
+}
+
+.btn-primary:hover:not(:disabled) {
+ transform: translateY(-2px);
+ box-shadow: 0 4px 12px rgba(102, 126, 234, 0.4);
+}
+
+.btn-secondary {
+ background: #6c757d;
+ color: white;
+}
+
+.btn-secondary:hover:not(:disabled) {
+ background: #5a6268;
+ transform: translateY(-2px);
+ box-shadow: 0 4px 12px rgba(108, 117, 125, 0.4);
+}
+
+.btn-danger {
+ background: #dc3545;
+ color: white;
+}
+
+.btn-danger:hover:not(:disabled) {
+ background: #c82333;
+ transform: translateY(-2px);
+ box-shadow: 0 4px 12px rgba(220, 53, 69, 0.4);
+}
+
+.playback-info {
+ padding: 15px 20px;
+ background: #f8f9fa;
+ border-top: 1px solid #e9ecef;
+}
+
+.progress-bar {
+ width: 100%;
+ height: 6px;
+ background: #e9ecef;
+ border-radius: 3px;
+ overflow: hidden;
+ margin-bottom: 10px;
+}
+
+.progress-fill {
+ height: 100%;
+ background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
+ width: 0%;
+ transition: width 0.3s;
+}
+
+.playback-stats {
+ display: flex;
+ justify-content: space-between;
+ font-size: 12px;
+ color: #6c757d;
+}
+
+.footer {
+ padding: 15px 20px;
+ background: #f8f9fa;
+ border-top: 1px solid #e9ecef;
+ display: flex;
+ justify-content: center;
+ align-items: center;
+ gap: 10px;
+}
+
+.btn-link {
+ background: none;
+ border: none;
+ color: #667eea;
+ font-size: 12px;
+ font-weight: 600;
+ cursor: pointer;
+ text-decoration: none;
+ transition: color 0.2s;
+}
+
+.btn-link:hover {
+ color: #5568d3;
+ text-decoration: underline;
+}
+
+.separator {
+ color: #ced4da;
+ font-size: 12px;
+}
+
+/* Scrollbar styling */
+.settings-section::-webkit-scrollbar {
+ width: 6px;
+}
+
+.settings-section::-webkit-scrollbar-track {
+ background: #f1f3f5;
+}
+
+.settings-section::-webkit-scrollbar-thumb {
+ background: #ced4da;
+ border-radius: 3px;
+}
+
+.settings-section::-webkit-scrollbar-thumb:hover {
+ background: #adb5bd;
+}
diff --git a/browser-extension/popup/popup.html b/browser-extension/popup/popup.html
new file mode 100644
index 0000000..e5f779b
--- /dev/null
+++ b/browser-extension/popup/popup.html
@@ -0,0 +1,119 @@
+
+
+
+
+
+ OpenWebTTS
+
+
+
+
+
+
+
+
+
+
+ Backend URL:
+
+
+
+
+ Voice:
+
+ Piper (Default)
+ Kokoro
+ Coqui TTS
+ OpenAI TTS
+
+
+
+
+ Speed: 1.0x
+
+
+
+
+ Chunk Size:
+
+
+
+
+
+
+ Auto-scroll while reading
+
+
+
+
+
+
+ Word-by-word highlighting
+
+
+
+
+ Highlight Color:
+
+ Yellow
+ Green
+ Blue
+ Pink
+ Orange
+
+
+
+
+ Word Highlight Color:
+
+ Yellow
+ Green
+ Blue
+ Pink
+ Orange
+
+
+
+
+
+
+ 📖 Read Page
+
+
+ 📝 Read Selection
+
+
+ ⏹️ Stop
+
+
+
+
+
+
+
+ Chunk 0/0
+ --:--
+
+
+
+
+
+
+
+
+
+
diff --git a/browser-extension/popup/popup.js b/browser-extension/popup/popup.js
new file mode 100644
index 0000000..5a6cbf6
--- /dev/null
+++ b/browser-extension/popup/popup.js
@@ -0,0 +1,281 @@
+// Popup script for OpenWebTTS browser extension
+
+// Get browser API (works for both Chrome and Firefox)
+const browser = window.browser || window.chrome;
+
+// DOM Elements
+const statusElement = document.getElementById('status');
+const statusText = document.getElementById('status-text');
+const backendUrlInput = document.getElementById('backend-url');
+const voiceSelect = document.getElementById('voice-select');
+const speedSlider = document.getElementById('speed-slider');
+const speedValue = document.getElementById('speed-value');
+const chunkSizeInput = document.getElementById('chunk-size');
+const autoScrollCheckbox = document.getElementById('auto-scroll');
+const wordHighlightCheckbox = document.getElementById('word-highlight');
+const highlightColorSelect = document.getElementById('highlight-color');
+const wordHighlightColorSelect = document.getElementById('word-highlight-color');
+const readPageBtn = document.getElementById('read-page');
+const readSelectionBtn = document.getElementById('read-selection');
+const stopReadingBtn = document.getElementById('stop-reading');
+const testConnectionBtn = document.getElementById('test-connection');
+const saveSettingsBtn = document.getElementById('save-settings');
+const playbackInfo = document.getElementById('playback-info');
+const progressFill = document.getElementById('progress-fill');
+const currentChunkSpan = document.getElementById('current-chunk');
+const timeRemainingSpan = document.getElementById('time-remaining');
+
+// Default settings: used by loadSettings() when extension storage holds no
+// saved `settings` object, and returned by it when storage cannot be read.
+const defaultSettings = {
+ // Base URL of the OpenWebTTS backend; same origin the extension manifests
+ // request host access for.
+ backendUrl: 'http://localhost:8000',
+ // Initial value for the popup's voice dropdown.
+ voice: 'piper',
+ // Initial speed slider value; the popup shows it with an "x" suffix.
+ speed: 1.0,
+ // NOTE(review): unit (words vs. characters) is not visible in this file —
+ // confirm against the content script that consumes it.
+ chunkSize: 50,
+ autoScroll: true,
+ wordHighlight: true,
+ highlightColor: 'yellow',
+ // loadSettings() falls back to highlightColor when this key is missing.
+ wordHighlightColor: 'yellow'
+};
+
+// Load settings from storage
+async function loadSettings() {
+ try {
+ const result = await browser.storage.local.get('settings');
+ const settings = result.settings || defaultSettings;
+
+ backendUrlInput.value = settings.backendUrl;
+ voiceSelect.value = settings.voice;
+ speedSlider.value = settings.speed;
+ speedValue.textContent = `${settings.speed}x`;
+ chunkSizeInput.value = settings.chunkSize;
+ autoScrollCheckbox.checked = settings.autoScroll;
+ wordHighlightCheckbox.checked = settings.wordHighlight;
+ highlightColorSelect.value = settings.highlightColor;
+ wordHighlightColorSelect.value = settings.wordHighlightColor || settings.highlightColor;
+
+ return settings;
+ } catch (error) {
+ console.error('Error loading settings:', error);
+ return defaultSettings;
+ }
+}
+
+// Save settings to storage
+async function saveSettings() {
+ const settings = {
+ backendUrl: backendUrlInput.value.trim(),
+ voice: voiceSelect.value,
+ speed: parseFloat(speedSlider.value),
+ chunkSize: parseInt(chunkSizeInput.value),
+ autoScroll: autoScrollCheckbox.checked,
+ wordHighlight: wordHighlightCheckbox.checked,
+ highlightColor: highlightColorSelect.value,
+ wordHighlightColor: wordHighlightColorSelect.value
+ };
+
+ try {
+ await browser.storage.local.set({ settings });
+
+ // Notify all tabs about settings change for immediate effect
+ try {
+ const tabs = await browser.tabs.query({ active: true, currentWindow: true });
+ for (const tab of tabs) {
+ browser.tabs.sendMessage(tab.id, {
+ action: 'updateSettings',
+ settings: settings
+ }).catch(() => {
+ // Tab might not have content script, ignore error
+ });
+ }
+ } catch (e) {
+ // Ignore message sending errors
+ }
+
+ showStatus('connected', 'Settings saved!');
+ setTimeout(() => testConnection(), 1000);
+ return settings;
+ } catch (error) {
+ console.error('Error saving settings:', error);
+ showStatus('disconnected', 'Failed to save settings');
+ }
+}
+
+// Update status display
+function showStatus(state, message) {
+ statusElement.className = `status ${state}`;
+ statusText.textContent = message;
+}
+
+// Test backend connection
+async function testConnection() {
+ const backendUrl = backendUrlInput.value.trim();
+ showStatus('connecting', 'Testing connection...');
+
+ try {
+ const response = await fetch(`${backendUrl}/api/health`, {
+ method: 'GET',
+ headers: { 'Content-Type': 'application/json' }
+ });
+
+ if (response.ok) {
+ showStatus('connected', 'Connected to backend');
+ return true;
+ } else {
+ showStatus('disconnected', 'Backend not responding');
+ return false;
+ }
+ } catch (error) {
+ console.error('Connection error:', error);
+ showStatus('disconnected', 'Cannot reach backend');
+ return false;
+ }
+}
+
+// Get current tab
+async function getCurrentTab() {
+ const tabs = await browser.tabs.query({ active: true, currentWindow: true });
+ return tabs[0];
+}
+
+// Send message to content script
+async function sendMessageToContent(message) {
+ const tab = await getCurrentTab();
+ return browser.tabs.sendMessage(tab.id, message);
+}
+
+// Read entire page
+async function readPage() {
+ const settings = await loadSettings();
+
+ showStatus('connecting', 'Starting to read page...');
+ readPageBtn.disabled = true;
+ readSelectionBtn.disabled = true;
+ stopReadingBtn.disabled = false;
+ playbackInfo.style.display = 'block';
+
+ try {
+ await sendMessageToContent({
+ action: 'startReading',
+ mode: 'page',
+ settings: settings
+ });
+
+ showStatus('connected', 'Reading page...');
+ } catch (error) {
+ console.error('Error starting reading:', error);
+ showStatus('disconnected', 'Failed to start reading');
+ readPageBtn.disabled = false;
+ readSelectionBtn.disabled = false;
+ stopReadingBtn.disabled = true;
+ playbackInfo.style.display = 'none';
+ }
+}
+
+// Read selected text
+async function readSelection() {
+ const settings = await loadSettings();
+
+ showStatus('connecting', 'Starting to read selection...');
+ readPageBtn.disabled = true;
+ readSelectionBtn.disabled = true;
+ stopReadingBtn.disabled = false;
+ playbackInfo.style.display = 'block';
+
+ try {
+ await sendMessageToContent({
+ action: 'startReading',
+ mode: 'selection',
+ settings: settings
+ });
+
+ showStatus('connected', 'Reading selection...');
+ } catch (error) {
+ console.error('Error starting reading:', error);
+ showStatus('disconnected', 'Failed to start reading');
+ readPageBtn.disabled = false;
+ readSelectionBtn.disabled = false;
+ stopReadingBtn.disabled = true;
+ playbackInfo.style.display = 'none';
+ }
+}
+
+// Stop reading
+async function stopReading() {
+ try {
+ await sendMessageToContent({ action: 'stopReading' });
+
+ showStatus('connected', 'Stopped reading');
+ readPageBtn.disabled = false;
+ readSelectionBtn.disabled = false;
+ stopReadingBtn.disabled = true;
+ playbackInfo.style.display = 'none';
+ progressFill.style.width = '0%';
+ } catch (error) {
+ console.error('Error stopping reading:', error);
+ }
+}
+
+// Update playback progress
+function updateProgress(current, total, timeRemaining) {
+ const percent = total > 0 ? (current / total) * 100 : 0;
+ progressFill.style.width = `${percent}%`;
+ currentChunkSpan.textContent = `Chunk ${current}/${total}`;
+
+ if (timeRemaining) {
+ const minutes = Math.floor(timeRemaining / 60);
+ const seconds = Math.floor(timeRemaining % 60);
+ timeRemainingSpan.textContent = `${minutes}:${seconds.toString().padStart(2, '0')}`;
+ }
+}
+
+// Listen for messages from content script
+browser.runtime.onMessage.addListener((message, sender, sendResponse) => {
+ if (message.action === 'updateProgress') {
+ updateProgress(message.current, message.total, message.timeRemaining);
+ } else if (message.action === 'readingComplete') {
+ showStatus('connected', 'Reading complete!');
+ readPageBtn.disabled = false;
+ readSelectionBtn.disabled = false;
+ stopReadingBtn.disabled = true;
+ setTimeout(() => {
+ playbackInfo.style.display = 'none';
+ progressFill.style.width = '0%';
+ }, 2000);
+ } else if (message.action === 'readingError') {
+ showStatus('disconnected', message.error || 'Reading error');
+ readPageBtn.disabled = false;
+ readSelectionBtn.disabled = false;
+ stopReadingBtn.disabled = true;
+ playbackInfo.style.display = 'none';
+ }
+});
+
+// Event listeners
+speedSlider.addEventListener('input', (e) => {
+ speedValue.textContent = `${e.target.value}x`;
+});
+
+testConnectionBtn.addEventListener('click', testConnection);
+saveSettingsBtn.addEventListener('click', saveSettings);
+readPageBtn.addEventListener('click', readPage);
+readSelectionBtn.addEventListener('click', readSelection);
+stopReadingBtn.addEventListener('click', stopReading);
+
+// Auto-save settings on change
+[voiceSelect, autoScrollCheckbox, wordHighlightCheckbox, highlightColorSelect, wordHighlightColorSelect, speedSlider, chunkSizeInput].forEach(element => {
+ element.addEventListener('change', saveSettings);
+});
+
+// Save backend URL on blur (when user finishes typing)
+backendUrlInput.addEventListener('blur', saveSettings);
+
+// Also save on input for immediate feedback on sliders
+speedSlider.addEventListener('input', () => {
+ speedValue.textContent = speedSlider.value + 'x';
+});
+
+// Initialize: fill the form from storage first, then probe the backend so
+// the health check uses the restored backend URL.
+loadSettings().then(() => testConnection());
diff --git a/browser-extension/test-page.html b/browser-extension/test-page.html
new file mode 100644
index 0000000..7b2e24a
--- /dev/null
+++ b/browser-extension/test-page.html
@@ -0,0 +1,255 @@
+
+
+
+
+
+ OpenWebTTS Extension Test Page
+
+
+
+
+
OpenWebTTS Browser Extension Test Page
+
+
+ Welcome to the OpenWebTTS browser extension test page! This page contains various types of text content
+ to help you test all the features of the extension, including paragraph reading, heading detection,
+ and word-by-word highlighting.
+
+
+
About Text-to-Speech Technology
+
+
+ Text-to-speech (TTS) technology has revolutionized how we consume written content. From helping individuals
+ with visual impairments to allowing multitasking while learning, TTS has become an essential accessibility
+ tool in the modern digital age.
+
+
+
+ OpenWebTTS takes this technology a step further by providing a completely open-source, privacy-respecting
+ solution that runs entirely on your local machine. Unlike cloud-based services that track your reading
+ habits and upload your documents, OpenWebTTS ensures your data never leaves your computer.
+
+
+
Key Features to Test
+
+
+ Testing Instructions: Click the OpenWebTTS extension icon in your browser toolbar,
+ then click "Read Page" to have this entire page read aloud with real-time highlighting.
+
+
+
1. Paragraph Reading
+
+
+ The extension should seamlessly read through paragraphs like this one, highlighting each sentence as it
+ reads. Notice how the highlighting follows along with the spoken words, making it easy to track where
+ you are in the text.
+
+
+
2. List Reading
+
+
Here are some benefits of using OpenWebTTS:
+
+
+ Completely free and open source with no hidden costs or subscriptions
+ Privacy-first design that keeps all your data local and secure
+ Multiple voice engine options including Piper, Kokoro, and Coqui
+ Customizable reading speed from half-speed to double-speed
+ Word-by-word highlighting with multiple color themes to choose from
+ Works offline without requiring an internet connection
+
+
+
3. Ordered Lists
+
+
Follow these steps to get the best experience:
+
+
+ Ensure your OpenWebTTS backend is running on localhost:8000
+ Install the browser extension following the installation guide
+ Open the extension popup and verify the connection status shows green
+ Navigate to any webpage with text content you want to read
+ Click the "Read Page" button or select specific text for targeted reading
+ Adjust settings like speed and highlighting colors to your preference
+
+
+
Advanced Features
+
+
+ "The power of open source lies in its accessibility and transparency. OpenWebTTS embodies these principles
+ by providing enterprise-grade text-to-speech capabilities to everyone, completely free."
+
+
+
+
Selection Reading Test
+
+ Try selecting just this paragraph with your mouse, then click "Read Selection" in the extension popup.
+ The extension should read only the selected text, ignoring everything else on the page. This is perfect
+ for reading specific sections without having to listen to the entire document.
+
+
+
+
Technical Details
+
+
+ The OpenWebTTS extension uses several advanced techniques to provide the best reading experience. The content
+ script intelligently extracts readable text from web pages, filtering out navigation menus, advertisements,
+ and other non-essential content. It focuses on main paragraphs, headings, lists, and blockquotes.
+
+
+
+ For highlighting, the extension implements a two-tier system. First, it highlights the entire sentence being
+ read with a subtle background color at 25% opacity. Then, as the audio progresses, it highlights individual
+ words with a more prominent 65% opacity. This creates a smooth, easy-to-follow reading experience.
+
+
+
Performance Optimization
+
+
+ Behind the scenes, the extension caches audio files to avoid regenerating the same content multiple times.
+ It uses efficient DOM manipulation techniques to minimize impact on page performance. The highlighting system
+ is powered by CSS classes that are applied and removed dynamically as words are spoken.
+
+
+
Accessibility Considerations
+
+
+ Text-to-speech technology serves many important purposes beyond convenience. For individuals with dyslexia,
+ visual impairments, or other reading difficulties, TTS can be a transformative tool. It can help with
+ comprehension, reduce eye strain, and make consuming written content more enjoyable.
+
+
+
+ The visual highlighting feature adds an extra layer of accessibility by helping users track along with
+ the audio. This multi-sensory approach (hearing + seeing) can improve comprehension and retention,
+ especially for complex or technical content.
+
+
+
Supported Voice Engines
+
+
+ OpenWebTTS supports multiple TTS engines, each with its own strengths:
+
+
+
+ Piper: A fast, high-quality neural TTS engine that works completely offline
+ Kokoro: An alternative engine with excellent voice quality and emotional expression
+ Coqui: Perfect for voice cloning, allowing you to create custom voices
+ OpenAI: Cloud-based option for users who prefer cutting-edge AI voices
+
+
+
Customization Options
+
+
+ The extension provides extensive customization to suit your preferences. You can adjust the reading speed
+ anywhere from 0.5x (slow and deliberate) to 2.0x (quick skim reading). The chunk size determines how much
+ text is processed at once, with smaller chunks providing more frequent pauses.
+
+
+
+ Highlight colors can be changed to match your preference or mood. Choose from yellow (classic), green
+ (easy on the eyes), blue (calm and focused), pink (vibrant), or orange (energetic). The auto-scroll feature
+ keeps the currently reading section centered on your screen automatically.
+
+
+
Privacy and Security
+
+
+ One of the key advantages of OpenWebTTS over commercial alternatives is its commitment to privacy. All
+ processing happens locally on your machine. The text you read never gets uploaded to external servers
+ (unless you specifically choose to use cloud-based voices like OpenAI).
+
+
+
+ Your reading history, preferences, and document content remain completely private. There's no tracking,
+ no analytics, and no data collection. You have full control over your information at all times.
+
+
+
Conclusion
+
+
+ The OpenWebTTS browser extension brings powerful text-to-speech capabilities to your everyday web browsing.
+ Whether you're reading articles, documentation, research papers, or just catching up on news, the extension
+ makes it all more accessible and convenient.
+
+
+
+ Best of all, it's completely free, open source, and respects your privacy. Give it a try on this page
+ or any other website, and experience a new way of consuming written content. Happy reading (or listening)!
+
+
+
+ Ready to test? Click the extension icon now and select "Read Page" to hear this entire
+ document read aloud with synchronized highlighting. Or select any paragraph above and use "Read Selection"
+ to test targeted reading!
+
+
+
+
+
+ OpenWebTTS Browser Extension Test Page • Version 1.0 • Open Source • Privacy-First
+
+
+
+
diff --git a/desktop_tts_app.py b/desktop_tts_app.py
new file mode 100644
index 0000000..e8bf6d4
--- /dev/null
+++ b/desktop_tts_app.py
@@ -0,0 +1,1676 @@
+"""
+━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
+ SPEECHIFY CLONE - 2026 LOCAL VOICE AI READER
+━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
+
+100% Local/Offline Neural TTS Desktop Application
+Modern, cross-platform text-to-speech reader with AI features
+
+═══════════════════════════════════════════════════════════════════════════════
+📦 INSTALLATION INSTRUCTIONS
+═══════════════════════════════════════════════════════════════════════════════
+
+1. REQUIRED PYTHON PACKAGES:
+ pip install PyQt6 PyQt6-WebEngine numpy soundfile pydub
+ pip install PyMuPDF python-docx ebooklib beautifulsoup4 lxml
+ pip install requests trafilatura readability-lxml
+ pip install pygame # For audio playback
+
+2. TTS ENGINE (Choose ONE - Piper recommended):
+
+ A) PIPER TTS (RECOMMENDED - Fast, Natural, CPU-Friendly):
+ Windows:
+ - Download: https://github.com/rhasspy/piper/releases
+ - Extract piper.exe to same folder as this script
+
+ Linux/Mac:
+ wget https://github.com/rhasspy/piper/releases/latest/download/piper_amd64.tar.gz
+ tar -xzf piper_amd64.tar.gz
+
+ Voices:
+ - Download from: https://huggingface.co/rhasspy/piper-voices/tree/main
+ - Place .onnx and .onnx.json files in: models/piper/
+ - Example: en_US-lessac-high.onnx + en_US-lessac-high.onnx.json
+
+ B) MELOTTS (Multilingual, High Quality):
+ pip install melotts
+ # Models auto-download on first use
+
+ C) COQUI XTTS (Best Quality, Voice Cloning, GPU Recommended):
+ pip install TTS
+ # Models auto-download on first use (~2GB)
+
+3. OPTIONAL AI FEATURES:
+
+ - Voice Transcription (Dictation):
+ pip install faster-whisper
+ # Or: pip install openai-whisper
+
+ - Local LLM (Summary/Chat):
+ Install Ollama: https://ollama.com/download
+ ollama pull phi3:mini # or llama3.2:3b, gemma2:9b
+
+ - OCR (Scan Documents):
+ pip install easyocr
+ # or: pip install pytesseract
+
+4. RUN THE APPLICATION:
+ python desktop_tts_app.py
+
+═══════════════════════════════════════════════════════════════════════════════
+⚡ SYSTEM REQUIREMENTS
+═══════════════════════════════════════════════════════════════════════════════
+- Python 3.8+
+- RAM: 8GB minimum, 16GB+ recommended for AI features
+- Disk: 2-5GB for models
+- GPU: Optional, significantly speeds up XTTS and Whisper
+- OS: Windows 10+, macOS 10.15+, Linux (Ubuntu 20.04+)
+
+═══════════════════════════════════════════════════════════════════════════════
+🎯 FEATURES INCLUDED
+═══════════════════════════════════════════════════════════════════════════════
+✅ Multi-format import (TXT, PDF, DOCX, EPUB, Web URLs)
+✅ High-quality neural TTS (Piper/MeloTTS/XTTS)
+✅ Real-time sentence highlighting with visual sync
+✅ Speed (0.5-5×), Pitch, Volume controls
+✅ Export to MP3/WAV
+✅ Keyboard shortcuts (Space, Esc, Ctrl+O, etc.)
+✅ Dark/Light theme with persistence
+✅ Voice transcription (dictation mode)
+✅ AI document summarization (requires Ollama)
+✅ Voice Q&A chat about documents
+✅ OCR support for scanned documents
+✅ Progress tracking for long texts
+✅ Multi-threaded (UI never freezes)
+
+━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
+"""
+
+import sys
+import os
+import re
+import json
+import time
+import wave
+import tempfile
+import subprocess
+from pathlib import Path
+from typing import List, Dict, Optional, Tuple
+from dataclasses import dataclass
+from threading import Thread, Event
+from queue import Queue
+import traceback
+
+# GUI Framework
+from PyQt6.QtWidgets import (
+ QApplication, QMainWindow, QWidget, QVBoxLayout, QHBoxLayout,
+ QTextEdit, QPushButton, QComboBox, QSlider, QLabel, QFileDialog,
+ QProgressBar, QGroupBox, QCheckBox, QSpinBox, QSplitter,
+ QTabWidget, QStatusBar, QMenuBar, QMenu, QMessageBox, QLineEdit,
+ QTextBrowser, QDockWidget, QListWidget, QFrame, QScrollArea
+)
+from PyQt6.QtCore import Qt, QThread, pyqtSignal, QTimer, QSettings, QUrl
+from PyQt6.QtGui import (
+ QFont, QTextCursor, QTextCharFormat, QColor, QPalette,
+ QKeySequence, QShortcut, QAction, QIcon, QTextDocument
+)
+
+# Document readers
+try:
+ import fitz # PyMuPDF
+ HAS_PDF = True
+except ImportError:
+ HAS_PDF = False
+ print("⚠️ PyMuPDF not installed. PDF support disabled.")
+
+try:
+ from docx import Document as DocxDocument
+ HAS_DOCX = True
+except ImportError:
+ HAS_DOCX = False
+ print("⚠️ python-docx not installed. DOCX support disabled.")
+
+try:
+ import ebooklib
+ from ebooklib import epub
+ HAS_EPUB = True
+except ImportError:
+ HAS_EPUB = False
+ print("⚠️ ebooklib not installed. EPUB support disabled.")
+
+try:
+ from bs4 import BeautifulSoup
+ HAS_BS4 = True
+except ImportError:
+ HAS_BS4 = False
+ print("⚠️ beautifulsoup4 not installed. Web import limited.")
+
+try:
+ import requests
+ HAS_REQUESTS = True
+except ImportError:
+ HAS_REQUESTS = False
+ print("⚠️ requests not installed. Web import disabled.")
+
+try:
+ import trafilatura
+ HAS_TRAFILATURA = True
+except ImportError:
+ HAS_TRAFILATURA = False
+
+# Audio processing
+try:
+ import numpy as np
+ import soundfile as sf
+ HAS_AUDIO_EXPORT = True
+except ImportError:
+ HAS_AUDIO_EXPORT = False
+ print("⚠️ soundfile/numpy not installed. Audio export limited.")
+
+try:
+ from pydub import AudioSegment
+ HAS_PYDUB = True
+except ImportError:
+ HAS_PYDUB = False
+
+# pygame is optional. Playback is disabled both when the package is missing
+# and when the mixer cannot be initialised: pygame.mixer.init() raises
+# pygame.error (not ImportError) in that case, which previously aborted the
+# whole module import.
+try:
+    import pygame
+except ImportError:
+    HAS_PYGAME = False
+    print("⚠️ pygame not installed. Audio playback disabled.")
+else:
+    try:
+        pygame.mixer.init(frequency=22050, size=-16, channels=1, buffer=512)
+        HAS_PYGAME = True
+    except pygame.error as exc:
+        HAS_PYGAME = False
+        print(f"⚠️ pygame mixer could not be initialised ({exc}). Audio playback disabled.")
+
+# AI Features (optional)
+try:
+ from faster_whisper import WhisperModel
+ HAS_WHISPER = True
+except ImportError:
+ HAS_WHISPER = False
+
+try:
+ import ollama
+ HAS_OLLAMA = True
+except ImportError:
+ HAS_OLLAMA = False
+
+
+# ═══════════════════════════════════════════════════════════════════════════
+# 📚 DATA STRUCTURES
+# ═══════════════════════════════════════════════════════════════════════════
+
+@dataclass
+class Sentence:
+ """One sentence of the source text plus the audio generated for it."""
+ # Sentence text; this is what gets passed to the TTS engine.
+ text: str
+ # Character offsets of the sentence inside the text it was split from.
+ # end_pos is exclusive (start_pos + len(text)).
+ start_pos: int
+ end_pos: int
+ # Path of the synthesized audio file; None until audio has been generated.
+ audio_file: Optional[str] = None
+ # Length of the audio in seconds; 0.0 until it is measured or estimated.
+ duration: float = 0.0
+
+
+@dataclass
+class Voice:
+ """Description of one selectable TTS voice model."""
+ # Identifier used to look the voice up; for Piper this is the .onnx file stem.
+ name: str
+ # Human-readable label shown in the voice selector.
+ display_name: str
+ # Language tag; for Piper it is the first dash-separated part of the file name.
+ language: str
+ quality: str # 'low', 'medium', 'high'
+ # Location of the model file on disk, when the voice is file based.
+ path: Optional[Path] = None
+
+
+# ═══════════════════════════════════════════════════════════════════════════
+# 🎙️ TTS ENGINE WRAPPER
+# ═══════════════════════════════════════════════════════════════════════════
+
+class TTSEngine:
+    """Unified interface for multiple TTS backends.
+
+    Only the "piper" backend is implemented. The "melotts" and "coqui"
+    initialisers are placeholders that register no voices, so
+    ``synthesize`` returns False for those engines.
+    """
+
+    def __init__(self, engine_type: str = "piper"):
+        """Select a backend and run its initialisation.
+
+        Args:
+            engine_type: "piper", "melotts" or "coqui". Any other value
+                leaves the engine without voices.
+        """
+        self.engine_type = engine_type
+        self.voices: List[Voice] = []
+        self.models_dir = Path("models")
+        # Always defined, so non-Piper engines never hit an AttributeError.
+        self.piper_exe: Optional[Path] = None
+
+        if engine_type == "piper":
+            self._init_piper()
+        elif engine_type == "melotts":
+            self._init_melotts()
+        elif engine_type == "coqui":
+            self._init_coqui()
+
+    def _init_piper(self):
+        """Locate the Piper executable and register every voice in models/piper.
+
+        A voice is registered for each ``*.onnx`` file that has a matching
+        ``*.onnx.json`` config next to it. Nothing is registered when the
+        executable cannot be found.
+        """
+        piper_dir = self.models_dir / "piper"
+        if not piper_dir.exists():
+            print(f"Creating models directory: {piper_dir}")
+            piper_dir.mkdir(parents=True, exist_ok=True)
+
+        self.piper_exe = self._find_piper_exe()
+        if not self.piper_exe:
+            print("⚠️ Piper executable not found!")
+            return
+
+        # Sorted so the voice list has a stable order between runs.
+        for onnx_file in sorted(piper_dir.glob("*.onnx")):
+            json_file = onnx_file.with_suffix(".onnx.json")
+            if not json_file.exists():
+                continue
+
+            # Language and quality are taken from the dash-separated file
+            # name: the first part is the language, the last the quality.
+            name = onnx_file.stem
+            parts = name.split('-')
+            lang = parts[0] or "en_US"
+            quality = parts[-1] if len(parts) > 1 else "medium"
+
+            self.voices.append(Voice(
+                name=name,
+                display_name=name.replace('_', ' ').replace('-', ' ').title(),
+                language=lang,
+                quality=quality,
+                path=onnx_file,
+            ))
+
+        print(f"✅ Piper: Found {len(self.voices)} voices")
+
+    def _find_piper_exe(self) -> Optional[Path]:
+        """Return the Piper executable, or None when it cannot be found.
+
+        The working directory is checked first, then PATH. A local hit is
+        returned as an absolute path: a bare relative name such as "piper"
+        would make the subprocess call search PATH instead of the working
+        directory on POSIX systems. Directories named "piper" are ignored.
+        """
+        for name in ("piper.exe", "piper"):
+            candidate = Path(name)
+            if candidate.is_file():
+                return candidate.resolve()
+
+        import shutil
+        piper_path = shutil.which("piper")
+        return Path(piper_path) if piper_path else None
+
+    def _init_melotts(self):
+        """Initialize MeloTTS (stub - implement if needed)"""
+        print("MeloTTS support coming soon")
+
+    def _init_coqui(self):
+        """Initialize Coqui XTTS (stub - implement if needed)"""
+        print("Coqui XTTS support coming soon")
+
+    def synthesize(self, text: str, voice_name: str, output_path: str,
+                   speed: float = 1.0, **kwargs) -> bool:
+        """Generate speech for ``text`` and write it to ``output_path``.
+
+        Args:
+            text: Text to speak.
+            voice_name: ``Voice.name`` of a registered voice.
+            output_path: File the audio is written to.
+            speed: Speaking-rate multiplier (1.0 is normal speed).
+            **kwargs: Accepted for forward compatibility; currently unused.
+
+        Returns:
+            True when the audio file was produced, False otherwise
+            (including for engines other than "piper").
+        """
+        if self.engine_type == "piper":
+            return self._synthesize_piper(text, voice_name, output_path, speed)
+        return False
+
+    def _synthesize_piper(self, text: str, voice_name: str,
+                          output_path: str, speed: float) -> bool:
+        """Run the Piper executable for one piece of text.
+
+        Returns:
+            True when Piper exits with status 0 and ``output_path`` exists.
+            Unknown voices, a missing executable, a non-positive speed,
+            timeouts and process errors all yield False.
+        """
+        voice = next((v for v in self.voices if v.name == voice_name), None)
+        if voice is None or self.piper_exe is None:
+            return False
+        if speed <= 0:
+            print(f"Piper synthesis error: invalid speed {speed}")
+            return False
+
+        cmd = [
+            str(self.piper_exe),
+            "--model", str(voice.path),
+            "--output_file", output_path,
+        ]
+        # Speed control (length_scale is inverse of speed)
+        if speed != 1.0:
+            cmd.extend(["--length_scale", str(1.0 / speed)])
+
+        try:
+            # subprocess.run kills and reaps the child when the timeout
+            # expires, so a hung Piper process is not left behind. The text
+            # is piped as UTF-8 instead of the locale encoding.
+            result = subprocess.run(
+                cmd,
+                input=text,
+                capture_output=True,
+                text=True,
+                encoding="utf-8",
+                errors="replace",
+                timeout=30,
+            )
+        except (OSError, subprocess.SubprocessError, ValueError) as e:
+            print(f"Piper synthesis error: {e}")
+            return False
+
+        if result.returncode != 0:
+            print(f"Piper synthesis error: {result.stderr.strip()}")
+            return False
+        return Path(output_path).exists()
+
+
+# ═══════════════════════════════════════════════════════════════════════════
+# 📄 DOCUMENT PROCESSORS
+# ═══════════════════════════════════════════════════════════════════════════
+
+class DocumentProcessor:
+    """Extract plain text from documents (TXT, PDF, DOCX, EPUB) and web pages."""
+
+    @staticmethod
+    def extract_text(file_path: str) -> str:
+        """Pick an extractor from the file extension and return the text.
+
+        Args:
+            file_path: Path of the document to read.
+
+        Raises:
+            ValueError: The extension is not supported, or the optional
+                library needed for that format is not installed.
+        """
+        ext = Path(file_path).suffix.lower()
+
+        if ext == ".txt":
+            return DocumentProcessor._extract_txt(file_path)
+        if ext == ".pdf":
+            # Report the missing library instead of claiming that the
+            # format itself is unsupported.
+            if not HAS_PDF:
+                raise ValueError("PDF support requires PyMuPDF (not installed)")
+            return DocumentProcessor._extract_pdf(file_path)
+        if ext == ".docx":
+            if not HAS_DOCX:
+                raise ValueError("DOCX support requires python-docx (not installed)")
+            return DocumentProcessor._extract_docx(file_path)
+        if ext == ".epub":
+            if not HAS_EPUB:
+                raise ValueError("EPUB support requires ebooklib (not installed)")
+            # The EPUB extractor strips HTML with BeautifulSoup.
+            if not HAS_BS4:
+                raise ValueError("EPUB support requires beautifulsoup4 (not installed)")
+            return DocumentProcessor._extract_epub(file_path)
+        raise ValueError(f"Unsupported file type: {ext}")
+
+    @staticmethod
+    def _extract_txt(file_path: str) -> str:
+        """Read a text file as UTF-8, dropping bytes that cannot be decoded."""
+        with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
+            return f.read()
+
+    @staticmethod
+    def _extract_pdf(file_path: str) -> str:
+        """Return the text of every page, each followed by a blank line."""
+        doc = fitz.open(file_path)
+        try:
+            return "".join(page.get_text() + "\n\n" for page in doc)
+        finally:
+            # Release the file handle even if a page fails to extract.
+            doc.close()
+
+    @staticmethod
+    def _extract_docx(file_path: str) -> str:
+        """Return the non-empty paragraphs joined by blank lines."""
+        doc = DocxDocument(file_path)
+        return "\n\n".join(para.text for para in doc.paragraphs if para.text.strip())
+
+    @staticmethod
+    def _extract_epub(file_path: str) -> str:
+        """Return the text of every document item, each followed by a blank line."""
+        book = epub.read_epub(file_path)
+        return "".join(
+            BeautifulSoup(item.get_content(), 'html.parser').get_text() + "\n\n"
+            for item in book.get_items()
+            if item.get_type() == ebooklib.ITEM_DOCUMENT
+        )
+
+    @staticmethod
+    def extract_from_url(url: str) -> str:
+        """Download a web page and return its readable text.
+
+        trafilatura is tried first when installed. Otherwise the page is
+        fetched with requests and, when BeautifulSoup is available,
+        stripped of markup; without it the raw response text is returned.
+
+        Raises:
+            RuntimeError: requests is not installed, or the download or
+                extraction failed (the original error is chained).
+        """
+        if not HAS_REQUESTS:
+            raise RuntimeError("requests library not installed")
+
+        try:
+            # Try trafilatura first (best for articles)
+            if HAS_TRAFILATURA:
+                downloaded = trafilatura.fetch_url(url)
+                # Skip extraction when the download produced nothing.
+                if downloaded:
+                    text = trafilatura.extract(downloaded)
+                    if text:
+                        return text
+
+            # Fallback to simple extraction
+            response = requests.get(url, timeout=10)
+            response.raise_for_status()
+
+            if not HAS_BS4:
+                return response.text
+
+            soup = BeautifulSoup(response.content, 'html.parser')
+            for tag in soup(["script", "style"]):
+                tag.decompose()
+
+            # Strip each line, break it only where two consecutive spaces
+            # separate phrases, and drop empty pieces. Splitting on a single
+            # space would put every word on its own line.
+            lines = (line.strip() for line in soup.get_text().splitlines())
+            chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
+            return '\n'.join(chunk for chunk in chunks if chunk)
+
+        except Exception as e:
+            raise RuntimeError(f"Failed to fetch URL: {e}") from e
+
+
+# ═══════════════════════════════════════════════════════════════════════════
+# 🎵 AUDIO PROCESSOR
+# ═══════════════════════════════════════════════════════════════════════════
+
+class AudioProcessor:
+    """Handle audio playback, export, and manipulation.
+
+    Playback goes through pygame's music channel and becomes a no-op when
+    pygame is unavailable. File operations use soundfile/numpy and pydub.
+    """
+
+    def __init__(self):
+        """Start in the stopped state at full volume."""
+        self.is_playing = False
+        self.is_paused = False
+        self.current_file: Optional[str] = None
+        self.volume = 1.0
+
+    def play(self, audio_file: str) -> bool:
+        """Load ``audio_file`` and start playing it from the beginning.
+
+        Returns:
+            True when playback started, False when pygame is unavailable
+            or the file could not be loaded/played.
+        """
+        if not HAS_PYGAME:
+            return False
+
+        try:
+            pygame.mixer.music.load(audio_file)
+            pygame.mixer.music.set_volume(self.volume)
+            pygame.mixer.music.play()
+            self.current_file = audio_file
+            self.is_playing = True
+            self.is_paused = False
+            return True
+        except Exception as e:
+            print(f"Playback error: {e}")
+            return False
+
+    def pause(self):
+        """Pause playback"""
+        if HAS_PYGAME and self.is_playing:
+            pygame.mixer.music.pause()
+            self.is_paused = True
+
+    def resume(self):
+        """Resume playback"""
+        if HAS_PYGAME and self.is_paused:
+            pygame.mixer.music.unpause()
+            self.is_paused = False
+
+    def stop(self):
+        """Stop playback and clear the playing/paused flags."""
+        if HAS_PYGAME:
+            pygame.mixer.music.stop()
+        self.is_playing = False
+        self.is_paused = False
+
+    def is_busy(self) -> bool:
+        """Return True while audio is playing and not paused."""
+        if not HAS_PYGAME:
+            return False
+        return pygame.mixer.music.get_busy() and not self.is_paused
+
+    def set_volume(self, volume: float):
+        """Set the volume, clamped to the range 0.0 to 1.0."""
+        self.volume = max(0.0, min(1.0, volume))
+        if HAS_PYGAME:
+            pygame.mixer.music.set_volume(self.volume)
+
+    @staticmethod
+    def combine_wav_files(files: List[str], output: str) -> bool:
+        """Concatenate audio files, in order, into ``output``.
+
+        All inputs must share one sample rate. A mismatch is reported as a
+        failure, because concatenating the samples anyway would play part
+        of the result at the wrong speed.
+
+        Returns:
+            True on success. False when ``files`` is empty, soundfile/numpy
+            are not installed, the sample rates differ, or reading/writing
+            fails.
+        """
+        if not files:
+            return False
+
+        try:
+            if not HAS_AUDIO_EXPORT:
+                print("soundfile/numpy not installed, cannot combine audio")
+                return False
+
+            audio_data = []
+            sample_rate = None
+            for file in files:
+                data, sr = sf.read(file)
+                if sample_rate is None:
+                    sample_rate = sr
+                elif sr != sample_rate:
+                    raise ValueError(
+                        f"sample rate mismatch: {file} is {sr} Hz, expected {sample_rate} Hz"
+                    )
+                audio_data.append(data)
+
+            sf.write(output, np.concatenate(audio_data), sample_rate)
+            return True
+
+        except Exception as e:
+            print(f"Audio combine error: {e}")
+            return False
+
+    @staticmethod
+    def export_to_mp3(wav_file: str, mp3_file: str) -> bool:
+        """Convert a WAV file to a 192 kbit/s MP3.
+
+        Returns:
+            True on success, False when pydub is missing or the
+            conversion fails.
+        """
+        if not HAS_PYDUB:
+            print("pydub not installed, cannot export MP3")
+            return False
+
+        try:
+            audio = AudioSegment.from_wav(wav_file)
+            audio.export(mp3_file, format="mp3", bitrate="192k")
+            return True
+        except Exception as e:
+            print(f"MP3 export error: {e}")
+            return False
+
+
+# ═══════════════════════════════════════════════════════════════════════════
+# 🧵 BACKGROUND WORKER THREADS
+# ═══════════════════════════════════════════════════════════════════════════
+
+class TTSWorker(QThread):
+    """Background thread that synthesizes every sentence and merges the audio.
+
+    NOTE(review): the ``finished`` signal declared here shadows QThread's
+    built-in ``finished`` signal. It is kept because callers connect to it
+    by name.
+    """
+
+    progress = pyqtSignal(int, str) # (percentage, status_message)
+    sentence_ready = pyqtSignal(int, str) # (index, audio_file)
+    finished = pyqtSignal(str) # Combined audio file
+    error = pyqtSignal(str)
+
+    def __init__(self, tts_engine: TTSEngine, sentences: List[Sentence],
+                 voice: str, speed: float):
+        """Store the job parameters; no work is done until the thread starts.
+
+        Args:
+            tts_engine: Engine used to synthesize each sentence.
+            sentences: Sentences to speak; ``audio_file`` and ``duration``
+                are filled in on each one as it is generated.
+            voice: Voice name passed to the engine.
+            speed: Speed multiplier passed to the engine.
+        """
+        super().__init__()
+        self.tts_engine = tts_engine
+        self.sentences = sentences
+        self.voice = voice
+        self.speed = speed
+        self._stop_flag = Event()
+
+    def stop(self):
+        """Ask the worker to stop; it is checked between sentences."""
+        self._stop_flag.set()
+
+    @staticmethod
+    def _new_temp_wav() -> str:
+        """Create an empty temporary .wav file and return its path.
+
+        mkstemp creates the file atomically, unlike mktemp, which only
+        returns a name that another process could claim first.
+        """
+        fd, path = tempfile.mkstemp(suffix=".wav")
+        os.close(fd)
+        return path
+
+    @staticmethod
+    def _remove_files(paths: List[str]):
+        """Delete the given files, ignoring ones that cannot be removed."""
+        for path in paths:
+            try:
+                Path(path).unlink()
+            except OSError:
+                pass
+
+    @staticmethod
+    def _measure_duration(audio_file: str, text: str) -> float:
+        """Return the audio length in seconds.
+
+        Falls back to an estimate of 15 characters per second when
+        soundfile is unavailable or the file cannot be read.
+        """
+        if HAS_AUDIO_EXPORT:
+            try:
+                data, sr = sf.read(audio_file)
+                return len(data) / sr
+            except Exception:
+                pass
+        return len(text) / 15.0
+
+    def run(self):
+        """Generate audio for all sentences, then emit the combined file.
+
+        Emits ``progress`` before each sentence, ``sentence_ready`` after
+        each one, and finally either ``finished`` (path of the combined
+        file) or ``error``. Nothing is emitted after a stop request. The
+        per-sentence temporary files are removed on every exit path.
+        """
+        temp_files: List[str] = []
+
+        try:
+            total = len(self.sentences)
+
+            for i, sentence in enumerate(self.sentences):
+                if self._stop_flag.is_set():
+                    return
+
+                self.progress.emit(
+                    int((i / total) * 100),
+                    f"Generating audio: {i+1}/{total}"
+                )
+
+                # Registered before synthesis so it is cleaned up on failure.
+                temp_file = self._new_temp_wav()
+                temp_files.append(temp_file)
+
+                if not self.tts_engine.synthesize(
+                    sentence.text, self.voice, temp_file, self.speed
+                ):
+                    self.error.emit(f"Failed to generate audio for sentence {i+1}")
+                    return
+
+                sentence.audio_file = temp_file
+                sentence.duration = self._measure_duration(temp_file, sentence.text)
+                self.sentence_ready.emit(i, temp_file)
+
+            if self._stop_flag.is_set():
+                return
+
+            self.progress.emit(95, "Combining audio files...")
+
+            output_file = self._new_temp_wav()
+            if AudioProcessor.combine_wav_files(temp_files, output_file):
+                self.finished.emit(output_file)
+            else:
+                self._remove_files([output_file])
+                self.error.emit("Failed to combine audio files")
+
+        except Exception as e:
+            self.error.emit(f"TTS generation error: {str(e)}\n{traceback.format_exc()}")
+        finally:
+            self._remove_files(temp_files)
+
+
+# ═══════════════════════════════════════════════════════════════════════════
+# 📝 TEXT UTILITIES
+# ═══════════════════════════════════════════════════════════════════════════
+
+class TextProcessor:
+    """Text processing utilities"""
+
+    # A sentence ends at terminal punctuation followed by whitespace or the
+    # end of the text, or at a blank line (newlines preceded by a newline).
+    _SENTENCE_END = re.compile(r'[.!?]+(?:\s+|$)|(?<=\n)\n+')
+
+    @staticmethod
+    def split_into_sentences(text: str) -> List[Sentence]:
+        """Split text into sentences with position tracking.
+
+        Every segment up to and including a sentence delimiter becomes one
+        sentence. Text after the last delimiter becomes a final sentence.
+        Segments of one character or less (after stripping) are dropped.
+
+        Unlike the earlier ``re.split`` loop, blank-line delimiters really
+        end a sentence, and trailing text followed by a blank line is no
+        longer lost.
+
+        Returns:
+            Sentences in document order. ``start_pos``/``end_pos`` are the
+            offsets of the stripped sentence text within ``text``.
+        """
+        sentences = []
+
+        def add_segment(seg_start: int, seg_end: int):
+            segment = text[seg_start:seg_end]
+            stripped = segment.strip()
+            if len(stripped) <= 1:
+                return
+            # Skip leading whitespace so the offsets point at the text itself.
+            start_pos = seg_start + len(segment) - len(segment.lstrip())
+            sentences.append(Sentence(
+                text=stripped,
+                start_pos=start_pos,
+                end_pos=start_pos + len(stripped)
+            ))
+
+        seg_start = 0
+        for match in TextProcessor._SENTENCE_END.finditer(text):
+            add_segment(seg_start, match.end())
+            seg_start = match.end()
+
+        # Remaining text that has no closing delimiter.
+        add_segment(seg_start, len(text))
+        return sentences
+
+    @staticmethod
+    def clean_text(text: str) -> str:
+        """Collapse whitespace runs to one space and drop control characters.
+
+        Newlines count as whitespace, so the result is a single line.
+        """
+        # Remove excessive whitespace
+        text = re.sub(r'\s+', ' ', text)
+        # Remove special characters that might break TTS
+        text = re.sub(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F-\x9F]', '', text)
+        return text.strip()
+
+
+# ═══════════════════════════════════════════════════════════════════════════
+# 🖥️ MAIN APPLICATION WINDOW
+# ═══════════════════════════════════════════════════════════════════════════
+
+class VoiceAIReader(QMainWindow):
+ """Main application window"""
+
+ def __init__(self):
+     """Create the window, its helper objects and UI, then report missing dependencies."""
+     super().__init__()
+
+     # Window title and initial geometry
+     self.setWindowTitle("Voice AI Reader - 2026 Local TTS")
+     self.setGeometry(100, 100, 1400, 900)
+
+     # Persistent user settings
+     self.settings = QSettings("VoiceAIReader", "2026")
+
+     # Speech engine, audio backend and the generation thread (created on demand)
+     self.tts_engine = TTSEngine("piper")
+     self.audio_processor = AudioProcessor()
+     self.tts_worker: Optional[TTSWorker] = None
+
+     # Playback state
+     self.sentences: List[Sentence] = []
+     self.current_sentence_idx = 0
+     self.combined_audio_file: Optional[str] = None
+
+     # Timer whose ticks drive _update_playback
+     timer = QTimer()
+     timer.timeout.connect(self._update_playback)
+     self.playback_timer = timer
+
+     # Build the interface, restore settings, register shortcuts
+     self._init_ui()
+     self._load_settings()
+     self._setup_shortcuts()
+
+     # Tell the user about optional features that are unavailable
+     self._check_dependencies()
+
+ def _init_ui(self):
+     """Assemble the menu bar, the three-pane splitter, the status bar and the theme."""
+     self._create_menu_bar()
+
+     # The central widget hosts a single horizontal splitter
+     central = QWidget()
+     self.setCentralWidget(central)
+     main_layout = QHBoxLayout(central)
+     splitter = QSplitter(Qt.Orientation.Horizontal)
+
+     # Panes from left to right: import sidebar, text area, settings
+     pane_builders = (
+         self._create_sidebar,
+         self._create_center_area,
+         self._create_settings_panel,
+     )
+     for build_pane in pane_builders:
+         splitter.addWidget(build_pane())
+
+     splitter.setSizes([250, 700, 350])
+     main_layout.addWidget(splitter)
+
+     # Status bar
+     status = QStatusBar()
+     self.status_bar = status
+     self.setStatusBar(status)
+     status.showMessage("Ready")
+
+     # Apply theme
+     self._apply_theme()
+
+ def _create_menu_bar(self):
+     """Populate the menu bar with the File, Edit, View and Help menus.
+
+     The menu bar is built before the text editor widget exists, so the
+     "Clear Text" action looks up ``self.text_edit`` when it is triggered
+     rather than when it is connected. Binding ``self.text_edit.clear``
+     directly raised AttributeError during start-up.
+     """
+     menubar = self.menuBar()
+
+     # File menu
+     file_menu = menubar.addMenu("&File")
+
+     open_action = QAction("&Open File...", self)
+     open_action.setShortcut(QKeySequence.StandardKey.Open)
+     open_action.triggered.connect(lambda: self._import_file("auto"))
+     file_menu.addAction(open_action)
+
+     url_action = QAction("Import from &URL...", self)
+     url_action.triggered.connect(self._import_from_url)
+     file_menu.addAction(url_action)
+
+     file_menu.addSeparator()
+
+     export_action = QAction("&Export Audio...", self)
+     export_action.setShortcut("Ctrl+E")
+     export_action.triggered.connect(self._export_audio)
+     file_menu.addAction(export_action)
+
+     file_menu.addSeparator()
+
+     quit_action = QAction("&Quit", self)
+     quit_action.setShortcut(QKeySequence.StandardKey.Quit)
+     quit_action.triggered.connect(self.close)
+     file_menu.addAction(quit_action)
+
+     # Edit menu
+     edit_menu = menubar.addMenu("&Edit")
+
+     clear_action = QAction("&Clear Text", self)
+     clear_action.setShortcut("Ctrl+L")
+     # Late lookup: text_edit is created after this method has run.
+     clear_action.triggered.connect(lambda: self.text_edit.clear())
+     edit_menu.addAction(clear_action)
+
+     # View menu
+     view_menu = menubar.addMenu("&View")
+
+     theme_action = QAction("Toggle &Dark Mode", self)
+     theme_action.setShortcut("Ctrl+D")
+     theme_action.triggered.connect(self._toggle_theme)
+     view_menu.addAction(theme_action)
+
+     # Help menu
+     help_menu = menubar.addMenu("&Help")
+
+     about_action = QAction("&About", self)
+     about_action.triggered.connect(self._show_about)
+     help_menu.addAction(about_action)
+
+ def _create_sidebar(self) -> QWidget:
+     """Build the left pane: import buttons, AI feature buttons and an info note."""
+     sidebar = QWidget()
+     column = QVBoxLayout(sidebar)
+
+     # Heading
+     heading = QLabel("📁 Import")
+     heading.setFont(QFont("Arial", 14, QFont.Weight.Bold))
+     heading.setAlignment(Qt.AlignmentFlag.AlignCenter)
+     column.addWidget(heading)
+
+     # One import button per source type; the type is bound as a lambda
+     # default so each button keeps its own value.
+     import_sources = (
+         ("📄 Text File", "txt"),
+         ("📕 PDF Document", "pdf"),
+         ("📘 Word Document", "docx"),
+         ("📚 EPUB Book", "epub"),
+         ("🌐 Web Article", "url"),
+     )
+     for caption, source_type in import_sources:
+         button = QPushButton(caption)
+         button.clicked.connect(lambda checked, ft=source_type: self._import_file(ft))
+         column.addWidget(button)
+
+     column.addSpacing(20)
+
+     # AI features; each button is enabled only when its libraries imported
+     ai_box = QGroupBox("🤖 AI Features")
+     ai_column = QVBoxLayout()
+
+     dictation = QPushButton("🎤 Dictation")
+     self.btn_dictation = dictation
+     dictation.setEnabled(HAS_WHISPER)
+     dictation.clicked.connect(self._start_dictation)
+     ai_column.addWidget(dictation)
+
+     summarize = QPushButton("📝 Summarize")
+     self.btn_summarize = summarize
+     summarize.setEnabled(HAS_OLLAMA)
+     summarize.clicked.connect(self._summarize_text)
+     ai_column.addWidget(summarize)
+
+     chat = QPushButton("💬 Q&A Chat")
+     self.btn_chat = chat
+     chat.setEnabled(HAS_OLLAMA and HAS_WHISPER)
+     chat.clicked.connect(self._start_chat)
+     ai_column.addWidget(chat)
+
+     ai_box.setLayout(ai_column)
+     column.addWidget(ai_box)
+
+     column.addStretch()
+
+     # Footer note
+     note = QLabel(
+         "💡 100% Local\n"
+         "No cloud APIs\n"
+         "Privacy-first"
+     )
+     note.setStyleSheet("color: #666; font-size: 9pt; padding: 10px;")
+     note.setAlignment(Qt.AlignmentFlag.AlignCenter)
+     column.addWidget(note)
+
+     return sidebar
+
+ def _create_center_area(self) -> QWidget:
+     """Build the centre pane: playback controls, the text editor and a hidden progress bar."""
+     pane = QWidget()
+     column = QVBoxLayout(pane)
+
+     # Playback buttons sit above the editor
+     column.addLayout(self._create_playback_controls())
+
+     # Editor whose text is read aloud
+     placeholder = (
+         "📖 Paste or type your text here...\n\n"
+         "Or import a file using the sidebar.\n\n"
+         "Keyboard shortcuts:\n"
+         " Space - Play/Pause\n"
+         " Esc - Stop\n"
+         " Ctrl+O - Open file\n"
+         " Ctrl+E - Export audio"
+     )
+     editor = QTextEdit()
+     self.text_edit = editor
+     editor.setPlaceholderText(placeholder)
+     editor.setFont(QFont("Georgia", 12))
+     column.addWidget(editor)
+
+     # Progress bar, hidden until audio generation starts
+     bar = QProgressBar()
+     self.progress_bar = bar
+     bar.setVisible(False)
+     column.addWidget(bar)
+
+     return pane
+
+ def _create_playback_controls(self) -> QHBoxLayout:
+     """Build the Play / Pause / Stop row with the sentence position label."""
+     row = QHBoxLayout()
+
+     # Play is the only button enabled initially
+     play = QPushButton("▶️ Play")
+     self.btn_play = play
+     play.setFont(QFont("Arial", 11, QFont.Weight.Bold))
+     play.clicked.connect(self._play)
+     row.addWidget(play)
+
+     pause = QPushButton("⏸️ Pause")
+     self.btn_pause = pause
+     pause.setEnabled(False)
+     pause.clicked.connect(self._pause)
+     row.addWidget(pause)
+
+     stop = QPushButton("⏹️ Stop")
+     self.btn_stop = stop
+     stop.setEnabled(False)
+     stop.clicked.connect(self._stop)
+     row.addWidget(stop)
+
+     row.addSpacing(20)
+
+     # "current / total" sentence counter
+     self.position_label = QLabel("0 / 0")
+     row.addWidget(self.position_label)
+
+     row.addStretch()
+     return row
+
+ def _create_settings_panel(self) -> QWidget:
+     """Build the right pane: a scrollable stack of the voice, playback and display groups."""
+     panel = QWidget()
+     panel_layout = QVBoxLayout(panel)
+
+     # All groups live inside one resizable scroll area
+     scroll = QScrollArea()
+     scroll.setWidgetResizable(True)
+     content = QWidget()
+     content_layout = QVBoxLayout(content)
+
+     group_builders = (
+         self._create_voice_settings,
+         self._create_playback_settings,
+         self._create_display_settings,
+     )
+     for build_group in group_builders:
+         content_layout.addWidget(build_group())
+
+     content_layout.addStretch()
+     scroll.setWidget(content)
+     panel_layout.addWidget(scroll)
+
+     return panel
+
+ def _create_voice_settings(self) -> QGroupBox:
+     """Build the voice selector group, filled from the engine's voice list."""
+     box = QGroupBox("🎙️ Voice Settings")
+     column = QVBoxLayout()
+     column.addWidget(QLabel("Voice:"))
+
+     # Each entry shows "<display name> (<quality>)" and carries the voice
+     # name as item data.
+     combo = QComboBox()
+     self.voice_combo = combo
+     voices = self.tts_engine.voices
+     for voice in voices:
+         combo.addItem(f"{voice.display_name} ({voice.quality})", voice.name)
+
+     # Placeholder entry whose data is None when nothing was found
+     if combo.count() == 0:
+         combo.addItem("No voices found", None)
+     column.addWidget(combo)
+
+     # Hint on where further voices can be installed
+     hint = QLabel(
+         f"Found {len(voices)} voices\n"
+         f"Add more voices to models/piper/"
+     )
+     hint.setStyleSheet("color: #666; font-size: 9pt;")
+     hint.setWordWrap(True)
+     column.addWidget(hint)
+
+     box.setLayout(column)
+     return box
+
+ def _create_playback_settings(self) -> QGroupBox:
+     """Build the group holding the speed and volume sliders."""
+     box = QGroupBox("⚙️ Playback Settings")
+     column = QVBoxLayout()
+
+     # Speed: slider units are tenths, so 5..50 means 0.5× to 5.0×
+     self.speed_label = QLabel("Speed: 1.0×")
+     column.addWidget(self.speed_label)
+
+     speed = QSlider(Qt.Orientation.Horizontal)
+     self.speed_slider = speed
+     speed.setMinimum(5) # 0.5×
+     speed.setMaximum(50) # 5.0×
+     speed.setValue(10) # 1.0×
+     speed.setTickPosition(QSlider.TickPosition.TicksBelow)
+     speed.setTickInterval(5)
+     # Connected after the initial value is set
+     speed.valueChanged.connect(self._update_speed_label)
+     column.addWidget(speed)
+
+     # Volume: percentage from 0 to 100
+     self.volume_label = QLabel("Volume: 100%")
+     column.addWidget(self.volume_label)
+
+     volume = QSlider(Qt.Orientation.Horizontal)
+     self.volume_slider = volume
+     volume.setMinimum(0)
+     volume.setMaximum(100)
+     volume.setValue(100)
+     volume.setTickPosition(QSlider.TickPosition.TicksBelow)
+     volume.setTickInterval(10)
+     volume.valueChanged.connect(self._update_volume)
+     column.addWidget(volume)
+
+     box.setLayout(column)
+     return box
+
+ def _create_display_settings(self) -> QGroupBox:
+     """Build the group with the highlighting, dark-mode and text-size options."""
+     box = QGroupBox("🎨 Display")
+     column = QVBoxLayout()
+
+     # Sentence highlighting is on by default
+     highlight = QCheckBox("Enable real-time highlighting")
+     self.highlight_checkbox = highlight
+     highlight.setChecked(True)
+     column.addWidget(highlight)
+
+     # Changing the dark-mode box re-applies the theme
+     dark_mode = QCheckBox("Dark mode")
+     self.dark_mode_checkbox = dark_mode
+     dark_mode.stateChanged.connect(self._apply_theme)
+     column.addWidget(dark_mode)
+
+     # Text size, 8 to 32 pt, starting at 12
+     size_row = QHBoxLayout()
+     size_row.addWidget(QLabel("Text size:"))
+     size_spin = QSpinBox()
+     self.font_size_spin = size_spin
+     size_spin.setMinimum(8)
+     size_spin.setMaximum(32)
+     size_spin.setValue(12)
+     size_spin.valueChanged.connect(self._update_font_size)
+     size_row.addWidget(size_spin)
+     column.addLayout(size_row)
+
+     box.setLayout(column)
+     return box
+
+ def _setup_shortcuts(self):
+     """Register the window-level shortcuts: Space toggles play/pause, Esc stops."""
+     bindings = (
+         (Qt.Key.Key_Space, self._toggle_play_pause),
+         (Qt.Key.Key_Escape, self._stop),
+     )
+     for key, handler in bindings:
+         # The shortcut is parented to the window
+         shortcut = QShortcut(QKeySequence(key), self)
+         shortcut.activated.connect(handler)
+
+ def _check_dependencies(self):
+     """Tell the user which optional packages are missing and whether any voice exists."""
+     # (availability flag, description shown to the user)
+     optional_features = (
+         (HAS_PYGAME, "pygame (audio playback)"),
+         (HAS_PDF, "PyMuPDF (PDF support)"),
+         (HAS_DOCX, "python-docx (DOCX support)"),
+         (HAS_EPUB, "ebooklib (EPUB support)"),
+         (HAS_AUDIO_EXPORT, "soundfile/numpy (audio export)"),
+     )
+     missing = [label for available, label in optional_features if not available]
+
+     if missing:
+         bullet_list = "\n".join(f"• {m}" for m in missing)
+         msg = "⚠️ Optional features disabled:\n\n" + bullet_list
+         msg += "\n\nInstall missing packages to enable all features."
+         QMessageBox.information(self, "Missing Dependencies", msg)
+
+     if self.tts_engine.voices:
+         return
+
+     # No voice was found, so point the user at the download location
+     QMessageBox.warning(
+         self,
+         "No Voices Found",
+         "No Piper voices found!\n\n"
+         "Download voices from:\n"
+         "https://huggingface.co/rhasspy/piper-voices\n\n"
+         "Place .onnx and .onnx.json files in:\n"
+         "models/piper/"
+     )
+
+ # ═══════════════════════════════════════════════════════════════════════
+ # 🎮 PLAYBACK CONTROL METHODS
+ # ═══════════════════════════════════════════════════════════════════════
+
+ def _play(self):
+     """Play the editor text, reusing generated audio only while it is still valid.
+
+     Behaviour, in order:
+       * While a generation run is in progress the call is ignored, so a
+         second worker is never started on top of a running one.
+       * If the text, voice and speed match the request the cached audio
+         was generated for, that audio is resumed (when paused) or
+         replayed from the start.
+       * Otherwise the cached audio is discarded and a new TTSWorker is
+         started; playback begins when the worker reports completion.
+     """
+     if self.tts_worker is not None and self.tts_worker.isRunning():
+         return
+
+     raw_text = self.text_edit.toPlainText().strip()
+     voice = self.voice_combo.currentData()
+     speed = self.speed_slider.value() / 10.0
+     request = (raw_text, voice, speed)
+
+     cached = self.combined_audio_file
+     # _audio_request is first set further down, hence the getattr default.
+     cache_valid = (
+         bool(cached)
+         and Path(cached).exists()
+         and getattr(self, "_audio_request", None) == request
+     )
+     if cache_valid:
+         if self.audio_processor.is_paused:
+             # Continue where playback was paused instead of restarting.
+             self.audio_processor.resume()
+         else:
+             self.audio_processor.play(cached)
+         self.btn_play.setEnabled(False)
+         self.btn_pause.setEnabled(True)
+         self.btn_stop.setEnabled(True)
+         self.playback_timer.start(100) # Update every 100ms
+         self.status_bar.showMessage("Playing...")
+         return
+
+     # Validate the new request before discarding anything
+     if not raw_text:
+         QMessageBox.warning(self, "No Text", "Please enter or import text first.")
+         return
+
+     if not self.tts_engine.voices:
+         QMessageBox.warning(self, "No Voices", "No TTS voices available.")
+         return
+
+     # Clean and split text
+     text = TextProcessor.clean_text(raw_text)
+     sentences = TextProcessor.split_into_sentences(text)
+     if not sentences:
+         QMessageBox.warning(self, "No Content", "Could not extract sentences from text.")
+         return
+
+     if not voice:
+         QMessageBox.warning(self, "No Voice", "Please select a voice.")
+         return
+
+     # The cached audio belongs to an older request: stop it and drop it.
+     self.audio_processor.stop()
+     self.playback_timer.stop()
+     if cached:
+         try:
+             Path(cached).unlink()
+         except OSError:
+             pass # best effort; the file may already be gone or still in use
+     self.combined_audio_file = None
+     self._audio_request = request
+     self.sentences = sentences
+     self.current_sentence_idx = 0
+
+     # Start TTS generation
+     self.tts_worker = TTSWorker(self.tts_engine, self.sentences, voice, speed)
+     self.tts_worker.progress.connect(self._on_tts_progress)
+     self.tts_worker.sentence_ready.connect(self._on_sentence_ready)
+     self.tts_worker.finished.connect(self._on_tts_finished)
+     self.tts_worker.error.connect(self._on_tts_error)
+     self.tts_worker.start()
+
+     # Update UI
+     self.progress_bar.setVisible(True)
+     self.progress_bar.setValue(0)
+     self.btn_play.setEnabled(False)
+     self.btn_pause.setEnabled(False)
+     self.btn_stop.setEnabled(True)
+     self.status_bar.showMessage("Generating audio...")
+
+ def _pause(self):
+     """Pause the audio, stop the polling timer and re-enable the Play button."""
+     self.audio_processor.pause()
+     self.playback_timer.stop()
+
+     # Only Play makes sense while paused
+     self.btn_pause.setEnabled(False)
+     self.btn_play.setEnabled(True)
+     self.status_bar.showMessage("Paused")
+
+ def _stop(self):
+     """Stop generation and playback, then return the UI to its idle state."""
+     # Ask a running worker to stop and block until its thread has ended
+     worker = self.tts_worker
+     if worker and worker.isRunning():
+         worker.stop()
+         worker.wait()
+
+     # Silence the audio and stop polling
+     self.audio_processor.stop()
+     self.playback_timer.stop()
+
+     # Remove highlighting and go back to the first sentence
+     self._clear_highlighting()
+     self.current_sentence_idx = 0
+
+     # Idle UI: only Play is enabled
+     self.progress_bar.setVisible(False)
+     for button, enabled in (
+         (self.btn_play, True),
+         (self.btn_pause, False),
+         (self.btn_stop, False),
+     ):
+         button.setEnabled(enabled)
+     self.status_bar.showMessage("Stopped")
+
+ def _toggle_play_pause(self):
+     """Resume when paused, pause when playing, otherwise start playback."""
+     processor = self.audio_processor
+
+     if not processor.is_paused:
+         if processor.is_playing:
+             self._pause()
+         else:
+             self._play()
+         return
+
+     # Paused: continue and restore the "playing" UI state
+     processor.resume()
+     self.btn_play.setEnabled(False)
+     self.btn_pause.setEnabled(True)
+     self.playback_timer.start(100)
+     self.status_bar.showMessage("Playing...")
+
+ def _update_playback(self):
+     """Timer tick: refresh highlighting while audio plays, wrap up when it ends."""
+     if self.audio_processor.is_busy():
+         # Still playing; highlight only if the user enabled it
+         if self.highlight_checkbox.isChecked():
+             self._update_highlighting()
+         return
+
+     # Audio is no longer busy: reset everything and report completion
+     self.playback_timer.stop()
+     self._stop()
+     self.status_bar.showMessage("Playback completed")
+
+ def _update_highlighting(self):
+     """Highlight the sentence that matches the current playback position.
+
+     The sentence is chosen by comparing the elapsed playback time with
+     the cumulative sentence durations. The previous approach scheduled a
+     new single-shot timer on every 100 ms tick, so the sentence index
+     was advanced many times per sentence and ran ahead of the audio.
+     """
+     if not self.sentences or not HAS_PYGAME:
+         return
+
+     # Milliseconds of music played so far; a negative value means that no
+     # position is available.
+     elapsed_ms = pygame.mixer.music.get_pos()
+     if elapsed_ms < 0:
+         return
+     elapsed = elapsed_ms / 1000.0
+
+     # First sentence whose end lies beyond the elapsed time. Stay on the
+     # last sentence if the durations under-estimate the audio length.
+     idx = len(self.sentences) - 1
+     boundary = 0.0
+     for i, candidate in enumerate(self.sentences):
+         boundary += candidate.duration
+         if elapsed < boundary:
+             idx = i
+             break
+     self.current_sentence_idx = idx
+
+     # Clear previous highlighting
+     cursor = self.text_edit.textCursor()
+     cursor.select(QTextCursor.SelectionType.Document)
+     cursor.setCharFormat(QTextCharFormat())
+
+     # Highlight current sentence
+     sentence = self.sentences[idx]
+     full_text = self.text_edit.toPlainText()
+
+     # Find sentence position in current text
+     start = full_text.find(sentence.text)
+     if start >= 0:
+         cursor = self.text_edit.textCursor()
+         cursor.setPosition(start)
+         cursor.setPosition(start + len(sentence.text), QTextCursor.MoveMode.KeepAnchor)
+
+         fmt = QTextCharFormat()
+         fmt.setBackground(QColor(255, 255, 0, 120)) # Yellow highlight
+         cursor.setCharFormat(fmt)
+
+         # Scroll to visible
+         self.text_edit.setTextCursor(cursor)
+         self.text_edit.ensureCursorVisible()
+
+     # Update position display
+     self.position_label.setText(f"{idx + 1} / {len(self.sentences)}")
+
+ def _clear_highlighting(self):
+     """Reset the character format of the whole document to the default."""
+     whole_document = self.text_edit.textCursor()
+     whole_document.select(QTextCursor.SelectionType.Document)
+     whole_document.setCharFormat(QTextCharFormat())
+
+ # ═══════════════════════════════════════════════════════════════════════
+ # 🔄 TTS WORKER CALLBACKS
+ # ═══════════════════════════════════════════════════════════════════════
+
+ def _on_tts_progress(self, percentage: int, message: str):
+     """Show the worker's progress in the progress bar and the status bar."""
+     bar, status = self.progress_bar, self.status_bar
+     bar.setValue(percentage)
+     status.showMessage(message)
+
+ def _on_sentence_ready(self, index: int, audio_file: str):
+     """Receive the audio file of one finished sentence; currently does nothing.
+
+     This is the place to add streaming playback.
+     """
+     return None
+
+ def _on_tts_finished(self, audio_file: str):
+     """Store the combined audio file and start playing it.
+
+     When playback cannot be started (``play`` returns False) the file is
+     still kept, and the status bar says so instead of showing
+     "Playing...".
+     """
+     self.combined_audio_file = audio_file
+     self.progress_bar.setVisible(False)
+
+     # Start from the first sentence
+     self.current_sentence_idx = 0
+
+     if not self.audio_processor.play(audio_file):
+         self.btn_play.setEnabled(True)
+         self.btn_pause.setEnabled(False)
+         self.btn_stop.setEnabled(False)
+         self.status_bar.showMessage("Audio generated, but playback could not be started")
+         return
+
+     self.btn_play.setEnabled(False)
+     self.btn_pause.setEnabled(True)
+     self.btn_stop.setEnabled(True)
+     self.playback_timer.start(100)
+     self.status_bar.showMessage("Playing...")
+
+ def _on_tts_error(self, error: str):
+     """Show the generation error in a dialog, then reset playback."""
+     details = f"Failed to generate audio:\n\n{error}"
+     QMessageBox.critical(self, "TTS Error", details)
+     self._stop()
+
+ # ═══════════════════════════════════════════════════════════════════════
+ # 📁 FILE IMPORT METHODS
+ # ═══════════════════════════════════════════════════════════════════════
+
+ def _import_file(self, file_type: str):
+     """Let the user pick a file of the given type and load its text into the editor.
+
+     ``file_type`` is "txt", "pdf", "docx", "epub", "auto" (all supported
+     types) or "url", which is forwarded to the URL import. Unknown values
+     use the "auto" filter.
+     """
+     if file_type == "url":
+         self._import_from_url()
+         return
+
+     # File dialog filter for each type
+     all_supported = "All Supported (*.txt *.pdf *.docx *.epub)"
+     filters = {
+         "auto": all_supported,
+         "txt": "Text Files (*.txt)",
+         "pdf": "PDF Documents (*.pdf)",
+         "docx": "Word Documents (*.docx)",
+         "epub": "EPUB Books (*.epub)"
+     }
+     chosen_filter = filters[file_type] if file_type in filters else all_supported
+
+     file_path, _ = QFileDialog.getOpenFileName(self, "Import File", "", chosen_filter)
+     if not file_path:
+         # Dialog was cancelled
+         return
+
+     try:
+         self.status_bar.showMessage(f"Loading {Path(file_path).name}...")
+         # Let the status message appear before the blocking extraction
+         QApplication.processEvents()
+
+         text = DocumentProcessor.extract_text(file_path)
+         self.text_edit.setPlainText(text)
+
+         loaded_message = f"Loaded: {Path(file_path).name} ({len(text)} characters)"
+         self.status_bar.showMessage(loaded_message)
+     except Exception as e:
+         error_message = f"Failed to import file:\n\n{str(e)}"
+         QMessageBox.critical(self, "Import Error", error_message)
+         self.status_bar.showMessage("Import failed")
+
+ def _import_from_url(self):
+     """Prompt for an article URL and load its extracted text into the editor.
+
+     Shows a warning and returns when the optional ``requests`` dependency is
+     unavailable (``HAS_REQUESTS`` is false). Fetch/extraction errors are
+     reported in a dialog rather than raised.
+     """
+     if not HAS_REQUESTS:
+         QMessageBox.warning(
+             self,
+             "Feature Unavailable",
+             "URL import requires 'requests' library.\n\n"
+             "Install with: pip install requests beautifulsoup4 trafilatura"
+         )
+         return
+
+     # QLineEdit has no getText(); the modal one-line prompt is
+     # QInputDialog.getText, which returns (text, accepted). Imported here so
+     # the method does not depend on the file-level import list.
+     # NOTE(review): assumes the PyQt6 binding named in the About text - confirm.
+     from PyQt6.QtWidgets import QInputDialog
+
+     url, ok = QInputDialog.getText(
+         self,
+         "Import from URL",
+         "Enter article URL:"
+     )
+     url = url.strip()
+
+     if not ok or not url:
+         return
+
+     try:
+         self.status_bar.showMessage(f"Fetching {url}...")
+         QApplication.processEvents()
+
+         text = DocumentProcessor.extract_from_url(url)
+         self.text_edit.setPlainText(text)
+
+         self.status_bar.showMessage(
+             f"Loaded from URL ({len(text)} characters)"
+         )
+
+     except Exception as e:
+         QMessageBox.critical(
+             self,
+             "Import Error",
+             f"Failed to fetch URL:\n\n{str(e)}"
+         )
+         self.status_bar.showMessage("URL import failed")
+
+ def _export_audio(self):
+     """Save the most recently generated audio to a user-chosen WAV or MP3 file.
+
+     Warns and returns when no generated audio file exists. The target format
+     is taken from the file extension (case-insensitive); when the user typed
+     no extension, it is derived from the filter selected in the dialog.
+     Unsupported extensions and export failures are reported in an error
+     dialog instead of being silently ignored.
+     """
+     if not self.combined_audio_file or not Path(self.combined_audio_file).exists():
+         QMessageBox.warning(
+             self,
+             "No Audio",
+             "Please generate audio first by clicking Play."
+         )
+         return
+
+     file_path, selected_filter = QFileDialog.getSaveFileName(
+         self,
+         "Export Audio",
+         "",
+         "WAV Audio (*.wav);;MP3 Audio (*.mp3)" if HAS_PYDUB else "WAV Audio (*.wav)"
+     )
+
+     if not file_path:
+         return
+
+     try:
+         suffix = Path(file_path).suffix.lower()
+         if not suffix:
+             # No extension typed: fall back to the format of the chosen filter.
+             suffix = ".mp3" if "*.mp3" in selected_filter else ".wav"
+             file_path += suffix
+
+         if suffix == ".wav":
+             import shutil
+             shutil.copy(self.combined_audio_file, file_path)
+         elif suffix == ".mp3":
+             if not AudioProcessor.export_to_mp3(self.combined_audio_file, file_path):
+                 raise RuntimeError("MP3 export failed")
+         else:
+             raise ValueError(f"Unsupported audio format: {suffix}")
+
+         QMessageBox.information(
+             self,
+             "Export Successful",
+             f"Audio saved to:\n{file_path}"
+         )
+
+     except Exception as e:
+         QMessageBox.critical(
+             self,
+             "Export Error",
+             f"Failed to export audio:\n\n{str(e)}"
+         )
+
+ # ═══════════════════════════════════════════════════════════════════════
+ # 🤖 AI FEATURE METHODS (Stubs for now)
+ # ═══════════════════════════════════════════════════════════════════════
+
+ def _start_dictation(self):
+     """Placeholder for voice dictation: only shows a 'coming soon' notice."""
+     notice = (
+         "Voice dictation with faster-whisper will be available soon.\n\n"
+         "Install: pip install faster-whisper"
+     )
+     QMessageBox.information(self, "Feature Coming Soon", notice)
+
+ def _summarize_text(self):
+     """Entry point for AI summarization; currently a placeholder.
+
+     Warns when Ollama support is unavailable (``HAS_OLLAMA`` is false) or the
+     editor holds no text; otherwise shows a 'coming soon' notice.
+     """
+     if not HAS_OLLAMA:
+         install_hint = (
+             "AI summarization requires Ollama.\n\n"
+             "Install from: https://ollama.com/download\n"
+             "Then: ollama pull phi3:mini"
+         )
+         QMessageBox.warning(self, "Feature Unavailable", install_hint)
+         return
+
+     has_text = bool(self.text_edit.toPlainText().strip())
+     if not has_text:
+         QMessageBox.warning(self, "No Text", "Please enter text to summarize.")
+         return
+
+     # Implement Ollama summarization
+     QMessageBox.information(
+         self,
+         "Feature Coming Soon",
+         "AI summarization with Ollama coming soon!"
+     )
+
+ def _start_chat(self):
+     """Placeholder for voice Q&A chat: only shows a 'coming soon' notice."""
+     notice = (
+         "Voice Q&A chat will be available soon.\n\n"
+         "Requires: faster-whisper + Ollama"
+     )
+     QMessageBox.information(self, "Feature Coming Soon", notice)
+
+ # ═══════════════════════════════════════════════════════════════════════
+ # ⚙️ SETTINGS AND UI METHODS
+ # ═══════════════════════════════════════════════════════════════════════
+
+ def _update_speed_label(self):
+     """Show the speed multiplier implied by the slider (slider value / 10)."""
+     multiplier = self.speed_slider.value() / 10.0
+     caption = f"Speed: {multiplier:.1f}×"
+     self.speed_label.setText(caption)
+
+ def _update_volume(self):
+     """Show the slider's volume percentage and apply it to the audio processor.
+
+     The label uses the slider's integer value directly. Converting to a
+     fraction and back (``int(value / 100.0 * 100)``) truncates for some
+     values, e.g. 29 would be displayed as 28%.
+     """
+     percent = self.volume_slider.value()
+     self.volume_label.setText(f"Volume: {percent}%")
+     # The audio processor receives the volume as a fraction (slider value / 100).
+     self.audio_processor.set_volume(percent / 100.0)
+
+ def _update_font_size(self):
+     """Apply the point size chosen in the spin box to the text editor's font."""
+     editor_font = self.text_edit.font()
+     editor_font.setPointSize(self.font_size_spin.value())
+     self.text_edit.setFont(editor_font)
+
+ def _apply_theme(self):
+     """Switch the window between the dark theme and the default light theme.
+
+     The dark-mode checkbox decides which one is applied: dark mode installs a
+     custom palette plus a style sheet; light mode restores the style's
+     standard palette and clears the style sheet.
+     """
+     if not self.dark_mode_checkbox.isChecked():
+         # Light theme (default)
+         self.setPalette(QApplication.style().standardPalette())
+         self.setStyleSheet("")
+         return
+
+     # Dark theme: palette role -> RGB components, applied in this order.
+     dark_roles = (
+         (QPalette.ColorRole.Window, (45, 45, 45)),
+         (QPalette.ColorRole.WindowText, (220, 220, 220)),
+         (QPalette.ColorRole.Base, (30, 30, 30)),
+         (QPalette.ColorRole.AlternateBase, (45, 45, 45)),
+         (QPalette.ColorRole.Text, (220, 220, 220)),
+         (QPalette.ColorRole.Button, (45, 45, 45)),
+         (QPalette.ColorRole.ButtonText, (220, 220, 220)),
+         (QPalette.ColorRole.BrightText, (255, 0, 0)),
+         (QPalette.ColorRole.Link, (42, 130, 218)),
+         (QPalette.ColorRole.Highlight, (42, 130, 218)),
+         (QPalette.ColorRole.HighlightedText, (0, 0, 0)),
+     )
+     palette = QPalette()
+     for role, rgb in dark_roles:
+         palette.setColor(role, QColor(*rgb))
+     self.setPalette(palette)
+
+     # Style sheet for better appearance
+     self.setStyleSheet("""
+ QGroupBox {
+ border: 1px solid #555;
+ border-radius: 5px;
+ margin-top: 10px;
+ padding-top: 10px;
+ }
+ QGroupBox::title {
+ subcontrol-origin: margin;
+ left: 10px;
+ padding: 0 5px;
+ }
+ QPushButton {
+ padding: 5px 15px;
+ border: 1px solid #555;
+ border-radius: 3px;
+ }
+ QPushButton:hover {
+ background-color: #555;
+ }
+ """)
+
+ def _toggle_theme(self):
+     """Invert the checked state of the dark-mode checkbox."""
+     checkbox = self.dark_mode_checkbox
+     checkbox.setChecked(not checkbox.isChecked())
+
+ def _show_about(self):
+     """Show the application's About box with a rich-text feature summary."""
+     # NOTE(review): the rich-text tags were reconstructed around the visible
+     # wording (the previous literals were split across lines and did not
+     # parse); confirm the markup against the intended layout.
+     about_html = (
+         "<h2>Voice AI Reader 2026</h2>"
+         "<p><b>100% Local/Offline TTS Application</b><br>"
+         "Modern text-to-speech reader with AI features</p>"
+         "<p><b>Features:</b></p>"
+         "<ul>"
+         "<li>Neural TTS (Piper/MeloTTS/XTTS)</li>"
+         "<li>Multi-format import (TXT/PDF/DOCX/EPUB/Web)</li>"
+         "<li>Real-time highlighting</li>"
+         "<li>Voice transcription (Whisper)</li>"
+         "<li>AI summarization (Ollama)</li>"
+         "<li>Privacy-first, no cloud APIs</li>"
+         "</ul>"
+         "<p>Made with ❤️ using PyQt6</p>"
+     )
+     QMessageBox.about(self, "About Voice AI Reader", about_html)
+
+ def _load_settings(self):
+     """Restore persisted UI settings; widgets keep their state for missing keys.
+
+     Values are read back in the textual form ``_save_settings`` writes:
+     speed and volume as fractions, dark mode as the ``str()`` of a bool,
+     font size as an integer string.
+     """
+     read = self.settings.value
+
+     # Voice
+     voice = read("voice")
+     if voice:
+         index = self.voice_combo.findData(voice)
+         if index >= 0:
+             self.voice_combo.setCurrentIndex(index)
+
+     # Speed / volume: round instead of truncating, otherwise a stored "0.29"
+     # becomes 28 (0.29 * 100 == 28.999999999999996).
+     speed = read("speed")
+     if speed:
+         self.speed_slider.setValue(round(float(speed) * 10))
+
+     volume = read("volume")
+     if volume:
+         self.volume_slider.setValue(round(float(volume) * 100))
+
+     # Dark mode is stored via str(bool), i.e. "True"/"False"; compare
+     # case-insensitively so a saved "True" actually re-enables the theme.
+     dark_mode = read("dark_mode")
+     if dark_mode is not None:
+         self.dark_mode_checkbox.setChecked(
+             str(dark_mode).strip().lower() == "true"
+         )
+
+     # Font size
+     font_size = read("font_size")
+     if font_size:
+         self.font_size_spin.setValue(int(font_size))
+
+ def _save_settings(self):
+     """Persist the current voice, speed, volume, dark-mode and font-size choices.
+
+     Everything except the voice is written as a string; speed and volume are
+     stored as fractions (slider value / 10 and / 100 respectively).
+     """
+     persisted = {
+         "voice": self.voice_combo.currentData(),
+         "speed": str(self.speed_slider.value() / 10.0),
+         "volume": str(self.volume_slider.value() / 100.0),
+         "dark_mode": str(self.dark_mode_checkbox.isChecked()),
+         "font_size": str(self.font_size_spin.value()),
+     }
+     for key, value in persisted.items():
+         self.settings.setValue(key, value)
+
+ def closeEvent(self, event):
+     """Persist settings, stop playback and remove the generated audio on close.
+
+     Args:
+         event: The close event; it is always accepted.
+     """
+     # Save settings
+     self._save_settings()
+
+     # Clean up
+     self._stop()
+
+     # Clean up temp audio file
+     if self.combined_audio_file and Path(self.combined_audio_file).exists():
+         try:
+             Path(self.combined_audio_file).unlink()
+         except OSError:
+             # Best effort: a file that cannot be removed must not block closing.
+             # Only filesystem errors are ignored, unlike a bare except, which
+             # would also swallow KeyboardInterrupt/SystemExit.
+             pass
+
+     event.accept()
+
+
+# ═══════════════════════════════════════════════════════════════════════════
+# 🚀 APPLICATION ENTRY POINT
+# ═══════════════════════════════════════════════════════════════════════════
+
+def main():
+    """Create the Qt application and main window, then run the event loop.
+
+    Prints a banner before and a short hint after the window is shown, and
+    exits the process with the event loop's return code.
+    """
+    print("""
+ ╔════════════════════════════════════════════════════════════════╗
+ ║ VOICE AI READER - 2026 LOCAL TTS APPLICATION ║
+ ╚════════════════════════════════════════════════════════════════╝
+ """)
+
+    # Application object and its identity metadata.
+    qt_app = QApplication(sys.argv)
+    qt_app.setApplicationName("Voice AI Reader")
+    qt_app.setOrganizationName("VoiceAI")
+    qt_app.setOrganizationDomain("voiceai.local")
+
+    # Application-wide default font.
+    default_font = QFont("Arial", 10)
+    qt_app.setFont(default_font)
+
+    # Main window.
+    reader_window = VoiceAIReader()
+    reader_window.show()
+
+    print("""
+ ✅ Application started successfully
+ 📖 Import text or paste directly to begin
+ 🎙️ Voice models: Check models/piper/ directory
+ """)
+
+    # Enter the event loop and propagate its exit code.
+    exit_code = qt_app.exec()
+    sys.exit(exit_code)
+
+
+# Start the application only when this file is run as a script, not on import.
+if __name__ == "__main__":
+ main()
diff --git a/functions/audio.py b/functions/audio.py
index c86d9e7..cabf5ed 100644
--- a/functions/audio.py
+++ b/functions/audio.py
@@ -1,7 +1,21 @@
from pydub import AudioSegment, effects
+import os
+import time
def normalize_audio(path):
+ # Longer delay to ensure file is fully written
+ time.sleep(0.3)
+
rawsound = AudioSegment.from_file(f'{path}', "wav")
normalizedsound = effects.compress_dynamic_range(rawsound)
normalizedsound = effects.normalize(rawsound)
- normalizedsound.export(f'{path}', format="wav")
\ No newline at end of file
+ normalizedsound.export(f'{path}', format="wav")
+
+ # Ensure file is flushed to disk (Windows-compatible)
+ try:
+ # Force flush file system buffers
+ with open(path, 'rb') as f:
+ os.fsync(f.fileno())
+ except Exception:
+ pass
+ time.sleep(0.1) # Additional safety delay
\ No newline at end of file
diff --git a/functions/kokoro.py b/functions/kokoro.py
index 9e938fb..5bdc321 100644
--- a/functions/kokoro.py
+++ b/functions/kokoro.py
@@ -1,3 +1,17 @@
+import warnings
+import os
+# Warning filters and environment variables are set here, ahead of the
+# `from kokoro import KPipeline` import further down in this file.
+# Suppress torch warnings
+warnings.filterwarnings('ignore', category=UserWarning, module='torch.nn.modules.rnn')
+warnings.filterwarnings('ignore', category=FutureWarning, module='torch.nn.utils.weight_norm')
+# Suppress HuggingFace Hub warnings whose message mentions HF_TOKEN
+warnings.filterwarnings('ignore', message='.*HF_TOKEN.*')
+
+# Set HF_HUB_DISABLE_TELEMETRY to suppress additional warnings
+os.environ['HF_HUB_DISABLE_TELEMETRY'] = '1'
+# No dummy token is set: when HF_TOKEN is absent or empty, offline mode is enabled instead.
+# NOTE(review): HF_HUB_OFFLINE=1 presumably also blocks downloading the model
+# on a machine that has not cached it yet - confirm first-run behavior.
+if not os.environ.get('HF_TOKEN'):
+ os.environ['HF_HUB_OFFLINE'] = '1'
+
from kokoro import KPipeline
import soundfile as sf
import torch
@@ -10,7 +24,8 @@ def kokoro_process_audio(voice, lang, text, output):
if lang == False:
lang = voice[0]
- pipeline = KPipeline(lang, device=DEVICE)
+ # Explicitly pass repo_id to suppress warning
+ pipeline = KPipeline(lang, device=DEVICE, repo_id='hexgrad/Kokoro-82M')
generator = pipeline(text, voice)
for i, (gs, ps, audio) in enumerate(generator):
diff --git a/functions/pdf_processor.py b/functions/pdf_processor.py
new file mode 100644
index 0000000..ea3940a
--- /dev/null
+++ b/functions/pdf_processor.py
@@ -0,0 +1,332 @@
+"""
+PDF Processor Module
+====================
+Comprehensive backend PDF processing that generates structured, executable data
+for efficient client-side rendering, clickability, and real-time word highlighting.
+
+This module extracts text with precise positioning, creates word-level chunks,
+and returns optimized data structures ready for client interaction.
+"""
+
+import fitz # PyMuPDF
+from typing import List, Dict, Any, Optional, Tuple
+import re
+import hashlib
+import json
+
+
+class PDFTextElement:
+    """A positioned run of text extracted from a PDF page.
+
+    The bounding box is unpacked into ``x0, y0, x1, y1`` in the order given.
+    ``word_index`` starts at 0 and is meant to be assigned after construction.
+    """
+
+    def __init__(self, text: str, bbox: Tuple[float, float, float, float],
+                 font: str, size: float, page_num: int, element_id: int):
+        left, top, right, bottom = bbox
+        self.text = text
+        self.x0 = left
+        self.y0 = top
+        self.x1 = right
+        self.y1 = bottom
+        self.font = font
+        self.size = size
+        self.page_num = page_num
+        self.element_id = element_id
+        self.word_index = 0  # Will be set later
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Return a JSON-serializable view, including derived width and height."""
+        left, top, right, bottom = self.x0, self.y0, self.x1, self.y1
+        payload = {
+            "id": self.element_id,
+            "text": self.text,
+            "bbox": [left, top, right, bottom],
+            "x": left,
+            "y": top,
+            "width": right - left,
+            "height": bottom - top,
+            "font": self.font,
+            "size": self.size,
+            "page": self.page_num,
+            "word_idx": self.word_index
+        }
+        return payload
+
+
+class PDFPage:
+    """One PDF page: its size, its text elements and a word-to-element map.
+
+    Attributes:
+        page_num: Page number as supplied by the caller.
+        width, height: Page dimensions as supplied by the caller.
+        text_elements: Elements in the order they were added.
+        full_text: Element texts joined by single spaces (set by build_word_map).
+        word_map: Page-local word index -> ids of the elements containing it.
+    """
+
+    def __init__(self, page_num: int, width: float, height: float):
+        self.page_num = page_num
+        self.width = width
+        self.height = height
+        self.text_elements: List["PDFTextElement"] = []
+        self.full_text = ""
+        self.word_map: Dict[int, List[int]] = {}  # word_index -> [element_ids]
+
+    def add_element(self, element: "PDFTextElement"):
+        """Append a text element to this page."""
+        self.text_elements.append(element)
+
+    def build_word_map(self):
+        """Rebuild ``full_text`` and the word-index -> element-id mapping.
+
+        Words are the whitespace-separated tokens of ``full_text`` and are
+        numbered from 0 within the page. Each element's ``word_index`` ends
+        up as the index of the last word it contains. The map is rebuilt from
+        scratch, so calling this method again does not duplicate entries.
+        """
+        self.full_text = " ".join(elem.text for elem in self.text_elements)
+        self.word_map = {}
+
+        # Exclusive end offset of every element inside full_text, counting
+        # the single joining space that follows it.
+        element_ends = []
+        offset = 0
+        for elem in self.text_elements:
+            offset += len(elem.text) + 1
+            element_ends.append(offset)
+
+        # Words and elements both advance left to right, so one forward-only
+        # cursor replaces a full element scan per word.
+        elem_pos = 0
+        for word_idx, match in enumerate(re.finditer(r'\S+', self.full_text)):
+            while element_ends[elem_pos] <= match.start():
+                elem_pos += 1
+            elem = self.text_elements[elem_pos]
+            self.word_map.setdefault(word_idx, []).append(elem.element_id)
+            elem.word_index = word_idx
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Convert to dictionary for JSON serialization."""
+        return {
+            "page_num": self.page_num,
+            "width": self.width,
+            "height": self.height,
+            "text": self.full_text,
+            "elements": [elem.to_dict() for elem in self.text_elements],
+            "word_map": self.word_map
+        }
+
+
+class PDFDocument:
+    """A whole PDF: its pages, concatenated text and global word indexing.
+
+    Global word indices number the whitespace-separated tokens of
+    ``full_text`` (page texts joined by newlines) from 0 across all pages.
+    """
+
+    def __init__(self, pdf_bytes: bytes):
+        self.pdf_bytes = pdf_bytes
+        self.doc_hash = hashlib.sha256(pdf_bytes).hexdigest()
+        self.pages: List[PDFPage] = []
+        self.total_pages = 0
+        self.full_text = ""
+        self.word_to_page_map: Dict[int, int] = {}  # global word index -> page number
+
+    def process(self):
+        """Extract every text span with its position and build all mappings.
+
+        Previously extracted pages are discarded first, so calling this twice
+        does not duplicate content. The underlying PyMuPDF document is closed
+        even when extraction raises.
+        """
+        self.pages = []
+        pdf_doc = fitz.open(stream=self.pdf_bytes, filetype="pdf")
+        try:
+            self.total_pages = len(pdf_doc)
+            element_counter = 0  # element ids are unique across the document
+
+            for page_index in range(self.total_pages):
+                mupdf_page = pdf_doc.load_page(page_index)
+                page = PDFPage(
+                    page_num=page_index + 1,  # 1-indexed for display
+                    width=mupdf_page.rect.width,
+                    height=mupdf_page.rect.height
+                )
+
+                for block in mupdf_page.get_text("dict")["blocks"]:
+                    if block.get("type") != 0:  # keep text blocks only
+                        continue
+                    for line in block.get("lines", []):
+                        for span in line.get("spans", []):
+                            text = span.get("text", "").strip()
+                            if not text:
+                                continue
+                            page.add_element(PDFTextElement(
+                                text=text,
+                                bbox=span.get("bbox", [0, 0, 0, 0]),
+                                font=span.get("font", ""),
+                                size=span.get("size", 12),
+                                page_num=page_index + 1,
+                                element_id=element_counter
+                            ))
+                            element_counter += 1
+
+                page.build_word_map()
+                self.pages.append(page)
+        finally:
+            pdf_doc.close()
+
+        self._build_global_mappings()
+
+    def _build_global_mappings(self):
+        """Rebuild ``full_text`` and the global word index -> page number map."""
+        self.word_to_page_map = {}
+        global_word_idx = 0
+
+        for page in self.pages:
+            for _ in re.finditer(r'\S+', page.full_text):
+                self.word_to_page_map[global_word_idx] = page.page_num
+                global_word_idx += 1
+
+        self.full_text = "\n".join(page.full_text for page in self.pages)
+
+    def get_text_chunks(self, chunk_size: int = 50) -> List[Dict[str, Any]]:
+        """Split the document into consecutive chunks of at most ``chunk_size`` words.
+
+        Each chunk carries its global word range (``word_end`` is exclusive)
+        and the pages of its first and last word.
+
+        Raises:
+            ValueError: If ``chunk_size`` is smaller than 1.
+        """
+        if chunk_size < 1:
+            raise ValueError("chunk_size must be a positive integer")
+
+        words = re.findall(r'\S+', self.full_text)
+        chunks = []
+
+        for start in range(0, len(words), chunk_size):
+            chunk_words = words[start:start + chunk_size]
+            end = start + len(chunk_words)
+            chunks.append({
+                "id": len(chunks),
+                "text": " ".join(chunk_words),
+                "word_start": start,
+                "word_end": end,
+                "page_start": self.word_to_page_map.get(start, 1),
+                "page_end": self.word_to_page_map.get(end - 1, 1)
+            })
+
+        return chunks
+
+    def get_highlight_data_for_words(self, word_start: int, word_end: int) -> Dict[int, List[Dict[str, Any]]]:
+        """Return the elements covering global words ``[word_start, word_end)``.
+
+        The result maps page number -> element dictionaries. An element that
+        contains several of the requested words is listed once per page, in
+        order of first appearance. Word indices with no page mapping are
+        skipped.
+        """
+        # Per-page lookups are built once per call instead of once per word.
+        pages_by_num = {}
+        first_word_of_page = {}
+        elements_by_page = {}
+        running_total = 0
+        for page in self.pages:
+            if page.page_num not in pages_by_num:
+                pages_by_num[page.page_num] = page
+                first_word_of_page[page.page_num] = running_total
+                by_id = {}
+                for elem in page.text_elements:
+                    by_id.setdefault(elem.element_id, elem)
+                elements_by_page[page.page_num] = by_id
+            running_total += len(re.findall(r'\S+', page.full_text))
+
+        highlight_data = {}
+        emitted = {}  # page number -> element ids already added
+
+        for word_idx in range(word_start, word_end):
+            page_num = self.word_to_page_map.get(word_idx)
+            if not page_num or page_num not in pages_by_num:
+                continue
+
+            page = pages_by_num[page_num]
+            # word_map is keyed by page-local indices.
+            local_idx = word_idx - first_word_of_page[page_num]
+            page_entries = highlight_data.setdefault(page_num, [])
+            seen_ids = emitted.setdefault(page_num, set())
+
+            for elem_id in page.word_map.get(local_idx, []):
+                elem = elements_by_page[page_num].get(elem_id)
+                if elem is not None and elem_id not in seen_ids:
+                    seen_ids.add(elem_id)
+                    page_entries.append(elem.to_dict())
+
+        return highlight_data
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Convert complete document to dictionary for JSON serialization."""
+        return {
+            "doc_hash": self.doc_hash,
+            "total_pages": self.total_pages,
+            "full_text": self.full_text,
+            "pages": [page.to_dict() for page in self.pages],
+            "word_count": len(re.findall(r'\S+', self.full_text)),
+            "word_to_page": self.word_to_page_map
+        }
+
+
+def process_pdf_for_interactive_reading(pdf_bytes: bytes, options: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
+    """
+    Process a PDF and return the structure used for interactive reading.
+
+    Args:
+        pdf_bytes: Raw PDF file bytes
+        options: Optional settings; only "chunk_size" (default 50) is read here
+
+    Returns:
+        Dict with "status", a "document" summary (hash, page count, word count,
+        full text), per-page data under "pages", the reading "chunks", and
+        "metadata" describing the chunking.
+    """
+    settings = options if options else {}
+    chunk_size = settings.get("chunk_size", 50)
+
+    # Extract text/positions, then cut the text into reading chunks.
+    doc = PDFDocument(pdf_bytes)
+    doc.process()
+    chunks = doc.get_text_chunks(chunk_size=chunk_size)
+
+    document_summary = {
+        "hash": doc.doc_hash,
+        "total_pages": doc.total_pages,
+        "word_count": len(re.findall(r'\S+', doc.full_text)),
+        "full_text": doc.full_text
+    }
+    metadata = {
+        "processing_version": "1.0",
+        "chunk_size": chunk_size,
+        "total_chunks": len(chunks)
+    }
+
+    return {
+        "status": "success",
+        "document": document_summary,
+        "pages": [page.to_dict() for page in doc.pages],
+        "chunks": chunks,
+        "metadata": metadata
+    }
+
+
+def get_highlight_data_for_chunk(pdf_data: Dict[str, Any], chunk_id: int) -> Dict[str, Any]:
+    """
+    Extract highlighting data for a specific chunk from processed PDF data.
+
+    Chunk word ranges are global (document-wide) word indices, while each
+    element's "word_idx" is local to its page. Every page's elements are
+    therefore shifted by the number of words on the preceding pages before
+    being compared with the chunk range.
+
+    Args:
+        pdf_data: The complete processed PDF data structure
+        chunk_id: ID (list position) of the chunk to highlight
+
+    Returns:
+        {"chunk_id", "chunk_text", "page_range", "elements"} where "elements"
+        maps page number -> element dicts whose word index falls inside the
+        chunk, or {"error": "Invalid chunk ID"} when chunk_id is out of range
+        (negative ids included).
+    """
+    chunks = pdf_data.get("chunks", [])
+    if not 0 <= chunk_id < len(chunks):
+        return {"error": "Invalid chunk ID"}
+
+    chunk = chunks[chunk_id]
+    word_start = chunk["word_start"]
+    word_end = chunk["word_end"]
+
+    highlight_elements = {}
+    page_offset = 0  # global index of the current page's first word
+
+    for page_data in pdf_data.get("pages", []):
+        page_num = page_data["page_num"]
+
+        for element in page_data["elements"]:
+            global_idx = page_offset + element.get("word_idx", 0)
+            if word_start <= global_idx < word_end:
+                highlight_elements.setdefault(page_num, []).append(element)
+
+        page_offset += len(re.findall(r'\S+', page_data.get("text", "")))
+
+    return {
+        "chunk_id": chunk_id,
+        "chunk_text": chunk["text"],
+        "page_range": [chunk["page_start"], chunk["page_end"]],
+        "elements": highlight_elements
+    }
diff --git a/functions/piper.py b/functions/piper.py
index 9739961..ada76c3 100644
--- a/functions/piper.py
+++ b/functions/piper.py
@@ -3,17 +3,35 @@
from config import DEVICE
def piper_process_audio(voice, lang, text, output):
-
- command = [
- "piper",
- "--model", voice,
- "--output_file", output
- ]
+ try:
+ command = [
+ "piper",
+ "--model", voice,
+ "--output_file", output
+ ]
- if DEVICE == 'cuda':
- command.append('--cuda')
+ if DEVICE == 'cuda':
+ command.append('--cuda')
- subprocess.run(command, input=text, text=True, check=True, encoding='utf-8')
+ result = subprocess.run(
+ command,
+ input=text,
+ text=True,
+ check=True,
+ encoding='utf-8',
+ capture_output=True
+ )
+ except subprocess.CalledProcessError as e:
+ error_msg = e.stderr or str(e)
+ if 'espeakbridge' in error_msg or 'espeak' in error_msg.lower():
+ raise RuntimeError(
+ "Piper TTS requires espeak-ng to be installed. "
+ "Please install espeak-ng or try using Kokoro engine instead. "
+ f"Original error: {error_msg}"
+ )
+ raise RuntimeError(f"Piper TTS failed: {error_msg}")
+ except Exception as e:
+ raise RuntimeError(f"Piper TTS error: {str(e)}")
# Normalize the audio
normalize_audio(output)
\ No newline at end of file
diff --git a/functions/routes.py b/functions/routes.py
index 0c6bcce..c40c7dd 100644
--- a/functions/routes.py
+++ b/functions/routes.py
@@ -1,9 +1,10 @@
import os
+import base64
import hashlib
import shutil
import tempfile
from io import BytesIO
-from typing import List, Dict, Optional
+from typing import List, Dict, Optional, Any
import ebooklib
import fitz
import docx
@@ -12,10 +13,27 @@
import pytesseract
from pdf2image import convert_from_bytes
from PIL import Image
+
+# Configure tesseract path for Windows
+import platform
+if platform.system() == 'Windows':
+ # Runs at import time: the first candidate path that exists on disk is
+ # assigned to pytesseract's tesseract_cmd.
+ # Try common Tesseract installation paths
+ possible_paths = [
+ r'C:\Program Files\Tesseract-OCR\tesseract.exe',
+ r'C:\Program Files (x86)\Tesseract-OCR\tesseract.exe',
+ ]
+ for path in possible_paths:
+ if os.path.exists(path):
+ pytesseract.pytesseract.tesseract_cmd = path
+ print(f"✅ Tesseract configured at: {path}")
+ break
+ else:
+ # NOTE(review): presumably the for-loop's else clause (reached only when
+ # no candidate matched); indentation is not recoverable here - confirm.
+ print("⚠️ Tesseract not found in standard paths. OCR may fail.")
+
from bs4 import BeautifulSoup
from ebooklib import epub
from fastapi import (APIRouter, BackgroundTasks, File, Form, HTTPException, Request, UploadFile)
-from fastapi.responses import HTMLResponse, JSONResponse, FileResponse
+from fastapi.responses import HTMLResponse, JSONResponse, FileResponse, Response
from pydantic import BaseModel, Field
from langdetect import detect
@@ -25,6 +43,7 @@
# Import other function modules
from functions.users import UserManager
from functions.webpage import extract_readable_content
+from functions.text_processor import process_text_for_tts, split_into_sentences_semantic
# Lazy imports for TTS engines - these will be imported only when needed
def lazy_import_piper():
@@ -293,10 +312,357 @@ async def read_root(request: Request):
async def read_config(request: Request):
return templates.TemplateResponse("config.html", {"request": request})
+@router.get("/favicon.ico")
+async def favicon():
+    """Serve the PNG favicon from the static directory for /favicon.ico."""
+    icon_response = FileResponse("static/favicon.png", media_type="image/png")
+    return icon_response
+
# -----------------------
# --- API Endpoints ---
# -----------------------
+@router.get("/api/health")
+async def health_check():
+    """Report a static 'healthy' payload for the browser extension and monitoring."""
+    payload = {
+        "status": "healthy",
+        "service": "OpenWebTTS",
+        "version": "1.0.0"
+    }
+    return JSONResponse(content=payload)
+
+# Request body accepted by the /api/generate_speech* endpoints.
+class GenerateSpeechRequest(BaseModel):
+ text: str = Field(..., description="Text to convert to speech")
+ # The endpoints lower-case this value and treat it as the engine name
+ # ("piper", "kokoro", "coqui", "openai"); other values fall back to Piper.
+ voice: Optional[str] = Field("piper", description="TTS engine/voice to use")
+ # NOTE(review): in /api/generate_speech this value only feeds the cache key;
+ # it is not passed to any engine call there - confirm intended.
+ speed: Optional[float] = Field(1.0, description="Playback speed multiplier")
+ # Read by /api/generate_speech_with_timing; a falsy value falls back to 50.
+ chunkSize: Optional[int] = Field(50, description="Words per chunk for splitting")
+
+# Timing of a single word inside a chunk.
+class WordTiming(BaseModel):
+ word: str
+ # Start/end are proportional to the word's character offsets and expressed
+ # in the same unit as the audio duration they are scaled by.
+ startTime: float
+ endTime: float
+ # Position of the word within its chunk (calculate_word_timings passes the
+ # enumerate() index over the chunk's words).
+ index: int
+
+# Timing of one chunk of words together with its per-word timings.
+class ChunkTiming(BaseModel):
+ text: str
+ startTime: float
+ endTime: float
+ # Character offsets of the chunk; calculate_word_timings fills them with
+ # positions in the whitespace-normalized text.
+ startOffset: int
+ endOffset: int
+ words: List[WordTiming]
+
+# Response schema pairing an audio URL with chunk/word timing data.
+# NOTE(review): no use of this model is visible in this part of the file -
+# confirm which endpoint returns it.
+class SpeechTimingResponse(BaseModel):
+ audioUrl: str
+ duration: float
+ chunks: List[ChunkTiming]
+ originalText: str
+ normalizedText: str
+
+def calculate_word_timings(text: str, duration: float, chunk_size: int = 50) -> List[ChunkTiming]:
+    """
+    Estimate start/end times for every word and chunk of ``text``.
+
+    Times are distributed in proportion to character offsets within the
+    normalized text (whitespace collapsed, spaces before punctuation removed)
+    and rounded to three decimals. Returns an empty list when the text holds
+    no words.
+    """
+    import re
+
+    # Normalize text: collapse whitespace, drop spaces before punctuation
+    collapsed = ' '.join(text.split())
+    normalized_text = re.sub(r'\s+([.,!?;:])', r'\1', collapsed)
+
+    # (word, start offset, end offset) for every whitespace-separated token
+    tokens = [
+        (found.group(), found.start(), found.end())
+        for found in re.finditer(r'\S+', normalized_text)
+    ]
+    if not tokens:
+        return []
+
+    total_chars = len(normalized_text)
+
+    def time_at(offset):
+        # Character position mapped linearly onto the audio duration.
+        return (offset / total_chars) * duration
+
+    timed_chunks = []
+    for first in range(0, len(tokens), chunk_size):
+        group = tokens[first:first + chunk_size]
+        chunk_begin = group[0][1]
+        chunk_finish = group[-1][2]
+
+        word_timings = [
+            WordTiming(
+                word=token,
+                startTime=round(time_at(begin), 3),
+                endTime=round(time_at(finish), 3),
+                index=position
+            )
+            for position, (token, begin, finish) in enumerate(group)
+        ]
+
+        timed_chunks.append(ChunkTiming(
+            text=normalized_text[chunk_begin:chunk_finish],
+            startTime=round(time_at(chunk_begin), 3),
+            endTime=round(time_at(chunk_finish), 3),
+            startOffset=chunk_begin,
+            endOffset=chunk_finish,
+            words=word_timings
+        ))
+
+    return timed_chunks
+
+@router.post("/api/generate_speech")
+async def generate_speech(request: GenerateSpeechRequest):
+    """
+    Generate speech audio from text for browser extension.
+    Returns the WAV file directly as the response.
+
+    The audio is cached under a hash of text, voice and speed, so a repeated
+    request is served from the cache directory without re-synthesis.
+
+    Raises:
+        HTTPException: 400 for empty text, a missing Coqui voice sample or the
+            unconfigured OpenAI engine; 500 when synthesis fails or produces
+            no output file.
+    """
+    if not request.text.strip():
+        raise HTTPException(status_code=400, detail="Text cannot be empty.")
+
+    # Create hash for caching
+    hash_input = f"{request.text}-{request.voice}-{request.speed}"
+    unique_hash = hashlib.sha256(hash_input.encode('utf-8')).hexdigest()
+    output_filename = f"{unique_hash}.wav"
+    output_path = os.path.join(AUDIO_CACHE_DIR, output_filename)
+
+    # Check if audio already exists in cache
+    if not os.path.exists(output_path):
+        try:
+            # `voice` is Optional: an explicit null falls back to Piper
+            # instead of failing on None.lower().
+            engine = (request.voice or "piper").lower()
+
+            if engine == "kokoro":
+                kokoro_process_audio = lazy_import_kokoro()
+                # Kokoro requires: voice, lang, text, output
+                # Default to American English Liam voice
+                kokoro_process_audio(voice="am_liam", lang="a", text=request.text, output=output_path)
+            elif engine == "coqui":
+                coqui_process_audio, _ = lazy_import_coqui()
+                # Coqui requires: voice_path, lang, text, output
+                # Use default voice sample if available
+                voice_path = os.path.join(COQUI_DIR, "default.wav")
+                if not os.path.exists(voice_path):
+                    raise HTTPException(status_code=400, detail="Coqui requires a voice sample. Please upload one first.")
+                coqui_process_audio(voice_path, "en", request.text, output_path)
+            elif engine == "openai":
+                # For OpenAI, we'd need API key - use default for now
+                raise HTTPException(status_code=400, detail="OpenAI TTS requires API key configuration")
+            else:
+                # "piper" and any unrecognized engine name use Piper.
+                piper_process_audio = lazy_import_piper()
+                # Piper requires: model_path, lang, text, output
+                model_path = os.path.join(PIPER_DIR, "en_US-lessac-high.onnx")
+                piper_process_audio(model_path, "en_US", request.text, output_path)
+
+        except HTTPException:
+            # Deliberate client errors raised above must reach the caller
+            # unchanged instead of being re-wrapped as a 500 below.
+            raise
+        except Exception as e:
+            print(f"❌ Error generating speech: {e}")
+            raise HTTPException(status_code=500, detail=f"Speech generation failed: {str(e)}")
+
+    # Return audio file
+    if not os.path.exists(output_path):
+        raise HTTPException(status_code=500, detail="Audio generation failed")
+
+    # For backward compatibility, return file directly
+    return FileResponse(
+        output_path,
+        media_type="audio/wav",
+        headers={
+            "Content-Disposition": f"attachment; filename={output_filename}",
+            "Cache-Control": "public, max-age=31536000",
+            "Access-Control-Allow-Origin": "*",
+            "Access-Control-Allow-Methods": "POST, OPTIONS",
+            "Access-Control-Allow-Headers": "Content-Type, Authorization",
+            "Access-Control-Allow-Private-Network": "true"
+        }
+    )
+
+@router.post("/api/generate_speech_with_timing")
+async def generate_speech_with_timing(request: GenerateSpeechRequest):
+ """
+ Generate speech audio with precise timing information for synchronized highlighting.
+ Generates separate audio for each chunk with timing data.
+ Detects and marks skip regions (like citation markers) for non-highlighting.
+ """
+ if not request.text.strip():
+ raise HTTPException(status_code=400, detail="Text cannot be empty.")
+
+ import re
+
+ # Normalize text
+ normalized_text = ' '.join(request.text.split())
+ normalized_text = re.sub(r'\s+([.,!?;:])', r'\1', normalized_text)
+
+ # Pattern to detect skip regions: citation markers like [1], [2][3], etc.
+ skip_pattern = re.compile(r'\[\d+\](?:\[\d+\])*')
+
+ # Split into chunks based on chunk size
+ words = normalized_text.split()
+ chunk_size = request.chunkSize or 50
+
+ chunks_data = []
+ current_pos = 0
+
+ for chunk_idx in range(0, len(words), chunk_size):
+ chunk_words = words[chunk_idx:chunk_idx + chunk_size]
+ chunk_text = ' '.join(chunk_words)
+
+ # Calculate character offsets
+ start_offset = normalized_text.find(chunk_text, current_pos)
+ end_offset = start_offset + len(chunk_text)
+ current_pos = end_offset
+
+ # Create hash for this chunk
+ hash_input = f"{chunk_text}-{request.voice}-{request.speed}"
+ unique_hash = hashlib.sha256(hash_input.encode('utf-8')).hexdigest()
+ output_filename = f"{unique_hash}.wav"
+ output_path = os.path.join(AUDIO_CACHE_DIR, output_filename)
+
+ # Generate audio if not cached (includes citation markers for TTS)
+ if not os.path.exists(output_path):
+ try:
+ engine = request.voice.lower()
+
+ if engine == "piper":
+ piper_process_audio = lazy_import_piper()
+ model_path = os.path.join(PIPER_DIR, "en_US-lessac-high.onnx")
+ piper_process_audio(model_path, "en_US", chunk_text, output_path)
+ elif engine == "kokoro":
+ kokoro_process_audio = lazy_import_kokoro()
+ kokoro_process_audio(voice="am_liam", lang="a", text=chunk_text, output=output_path)
+ elif engine == "coqui":
+ coqui_process_audio, _ = lazy_import_coqui()
+ voice_path = os.path.join(COQUI_DIR, "default.wav")
+ if not os.path.exists(voice_path):
+ raise HTTPException(status_code=400, detail="Coqui requires a voice sample.")
+ coqui_process_audio(voice_path, "en", chunk_text, output_path)
+ else:
+ # Default to Piper
+ piper_process_audio = lazy_import_piper()
+ model_path = os.path.join(PIPER_DIR, "en_US-lessac-high.onnx")
+ piper_process_audio(model_path, "en_US", chunk_text, output_path)
+
+ except Exception as e:
+ print(f"❌ Error generating speech for chunk {chunk_idx}: {e}")
+ raise HTTPException(status_code=500, detail=f"Speech generation failed: {str(e)}")
+
+ if not os.path.exists(output_path):
+ raise HTTPException(status_code=500, detail=f"Audio generation failed for chunk {chunk_idx}")
+
+ # Get audio duration for this chunk
+ import wave
+ try:
+ with wave.open(output_path, 'rb') as wav_file:
+ frames = wav_file.getnframes()
+ rate = wav_file.getframerate()
+ duration = frames / float(rate)
+ except Exception as e:
+ print(f"⚠️ Could not read audio duration: {e}")
+ duration = len(chunk_words) * 0.5 # Fallback
+
+ # Calculate word timings for this chunk with skip detection
+ word_timings = []
+ chunk_char_count = len(chunk_text)
+
+ for word_idx, word in enumerate(chunk_words):
+ # Find word position in chunk text
+ word_search_start = sum(len(chunk_words[i]) + 1 for i in range(word_idx))
+ word_start_pos = chunk_text.find(word, word_search_start)
+ if word_start_pos == -1:
+ word_start_pos = word_search_start
+ word_end_pos = word_start_pos + len(word)
+
+ # Calculate timing based on character position
+ word_start_time = (word_start_pos / chunk_char_count) * duration if chunk_char_count > 0 else 0
+ word_end_time = (word_end_pos / chunk_char_count) * duration if chunk_char_count > 0 else duration
+
+ # Check if this word matches skip pattern (citation marker)
+ is_skip = bool(skip_pattern.fullmatch(word))
+
+ word_timings.append({
+ 'word': word,
+ 'startTime': round(word_start_time, 3),
+ 'endTime': round(word_end_time, 3),
+ 'index': word_idx,
+ 'skip': is_skip # Mark citation markers to skip highlighting
+ })
+
+ # Create chunk timing data
+ chunks_data.append({
+ 'audioUrl': f"/audio_cache/{output_filename}",
+ 'duration': round(duration, 3),
+ 'text': chunk_text,
+ 'startTime': 0, # Each chunk's audio starts at 0
+ 'endTime': round(duration, 3),
+ 'startOffset': start_offset,
+ 'endOffset': end_offset,
+ 'words': word_timings
+ })
+
+ # Return timing data
+ return JSONResponse(
+ content={
+ 'chunks': chunks_data,
+ 'originalText': request.text,
+ 'normalizedText': normalized_text
+ },
+ headers={
+ "Access-Control-Allow-Origin": "*",
+ "Access-Control-Allow-Methods": "POST, OPTIONS",
+ "Access-Control-Allow-Headers": "Content-Type, Authorization",
+ "Access-Control-Allow-Private-Network": "true"
+ }
+ )
+
+@router.options("/api/generate_speech")
+async def generate_speech_options(request: Request):
+    """Answer the CORS preflight that the browser extension sends before POSTing.
+
+    The Private Network Access grant is only echoed back when the preflight
+    explicitly asks for it via ``Access-Control-Request-Private-Network: true``.
+    """
+    wants_private_network = (
+        request.headers.get("Access-Control-Request-Private-Network") == "true"
+    )
+
+    cors_headers = {
+        "Access-Control-Allow-Origin": "*",
+        "Access-Control-Allow-Methods": "POST, OPTIONS",
+        "Access-Control-Allow-Headers": "Content-Type, Authorization",
+        "Access-Control-Max-Age": "3600"
+    }
+    if wants_private_network:
+        cors_headers["Access-Control-Allow-Private-Network"] = "true"
+
+    return JSONResponse(content={}, headers=cors_headers)
+
+@router.options("/api/generate_speech_with_timing")
+async def generate_speech_with_timing_options(request: Request):
+    """Answer the CORS preflight for the timing-based speech endpoint.
+
+    Returns an empty JSON body carrying the allow-* headers; the Private
+    Network Access header is added only when the preflight requested it.
+    """
+    pna_requested = request.headers.get("Access-Control-Request-Private-Network") == "true"
+
+    response_headers = {
+        "Access-Control-Allow-Origin": "*",
+        "Access-Control-Allow-Methods": "POST, OPTIONS",
+        "Access-Control-Allow-Headers": "Content-Type, Authorization",
+        "Access-Control-Max-Age": "3600"
+    }
+    if pna_requested:
+        response_headers["Access-Control-Allow-Private-Network"] = "true"
+
+    return JSONResponse(content={}, headers=response_headers)
+# -----------------------
+
@router.get("/api/piper_voices")
async def get_piper_voices_from_hf():
try:
@@ -322,7 +688,7 @@ async def download_piper_voice(voice: PiperVoice):
config_response = requests.get(config_url)
config_response.raise_for_status()
config_path = os.path.join(PIPER_DIR, f"{voice.key}.onnx.json")
- with open(config_path, "w") as f:
+ with open(config_path, "w", encoding="utf-8") as f:
f.write(config_response.text)
return JSONResponse(content={"message": f"Successfully downloaded {voice.key}"})
except requests.exceptions.RequestException as e:
@@ -405,20 +771,91 @@ async def synthesize_speech(request: SynthesizeRequest, background_tasks: Backgr
def _perform_ocr(pdf_bytes: bytes, task_id: str):
"""Background task to perform OCR and save the result."""
try:
- images = convert_from_bytes(pdf_bytes)
+ # Convert PDF to images - requires Poppler
+ # On Windows, specify poppler path if installed via conda/scoop/manual
+ poppler_path = None
+ if platform.system() == 'Windows':
+ # Try common Poppler installation locations
+ possible_poppler = [
+ r'C:\Program Files\poppler\Library\bin',
+ r'C:\poppler\Library\bin',
+ os.path.expanduser('~\\scoop\\apps\\poppler\\current\\Library\\bin'),
+ ]
+ for path in possible_poppler:
+ if os.path.exists(path):
+ poppler_path = path
+ break
+
+ images = convert_from_bytes(pdf_bytes, poppler_path=poppler_path)
ocr_text = ""
- for image in images:
- ocr_text += pytesseract.image_to_string(image)
+
+ # Process images in batches to avoid memory issues
+ for i, image in enumerate(images):
+ try:
+ page_text = pytesseract.image_to_string(image, timeout=30)
+ ocr_text += f"\n--- Page {i+1} ---\n{page_text}"
+ except Exception as page_error:
+ print(f"Warning: Failed to OCR page {i+1}: {page_error}")
+ ocr_text += f"\n--- Page {i+1} ---\n[OCR failed for this page]\n"
result_path = os.path.join(OCR_CACHE_DIR, f"{task_id}.txt")
with open(result_path, "w", encoding="utf-8") as f:
f.write(ocr_text)
- print(f"OCR for task {task_id} completed. Result saved to {result_path}")
+ print(f"✅ OCR for task {task_id} completed. Result saved to {result_path}")
+
+ except ImportError as e:
+ error_msg = f"Missing dependency: {e}. Install with: pip install pdf2image pytesseract"
+ print(f"❌ {error_msg}")
+ result_path = os.path.join(OCR_CACHE_DIR, f"{task_id}.error")
+ with open(result_path, "w", encoding="utf-8") as f:
+ f.write(error_msg)
+
except Exception as e:
- print(f"Error during OCR for task {task_id}: {e}")
+ error_type = type(e).__name__
+ error_msg = str(e)
+
+ # Provide specific error messages based on the exception
+ if 'poppler' in error_msg.lower() or 'Unable to get page count' in error_msg:
+ error_msg = (
+ "Poppler not found. OCR requires Poppler to convert PDF pages to images. "
+ "Install with: 'choco install poppler' or download from https://github.com/oschwartz10612/poppler-windows/releases/"
+ )
+ elif 'tesseract' in error_msg.lower():
+ error_msg = (
+ "Tesseract error. Ensure Tesseract is properly installed. "
+ "Download from: https://github.com/UB-Mannheim/tesseract/wiki"
+ )
+ else:
+ error_msg = f"{error_type}: {error_msg}"
+
+ print(f"❌ Error during OCR for task {task_id}: {error_msg}")
result_path = os.path.join(OCR_CACHE_DIR, f"{task_id}.error")
with open(result_path, "w", encoding="utf-8") as f:
- f.write(str(e))
+ f.write(error_msg)
+
+def _merge_extracted_and_ocr_text(extracted_text: str, ocr_text: str) -> str:
+    """
+    Merge directly extracted PDF text with OCR output.
+
+    Selection rules, applied in order:
+      1. OCR text is empty/blank      -> return ``extracted_text`` unchanged.
+      2. Extracted text is empty/blank -> return ``ocr_text`` unchanged.
+      3. Stripped OCR text is more than 1.5x longer than the stripped
+         extracted text               -> return ``ocr_text`` only.
+      4. Otherwise                     -> extracted text, a marker line, then
+         the complete OCR text.
+
+    The merge works on whole documents; no per-page comparison is done.
+    """
+    if not ocr_text or not ocr_text.strip():
+        return extracted_text
+
+    if not extracted_text or not extracted_text.strip():
+        return ocr_text
+
+    # If OCR found significantly more content, prefer OCR
+    if len(ocr_text.strip()) > len(extracted_text.strip()) * 1.5:
+        print(f"OCR found more content ({len(ocr_text)} vs {len(extracted_text)} chars), using OCR")
+        return ocr_text
+
+    # Otherwise keep the direct extraction and append the OCR result after a marker
+    print(f"Combining extracted text ({len(extracted_text)} chars) with OCR ({len(ocr_text)} chars)")
+    return extracted_text + "\n\n--- Additional OCR Content ---\n" + ocr_text
@router.post("/api/read_pdf")
async def read_pdf(file: UploadFile = File(...), background_tasks: BackgroundTasks = None):
@@ -427,32 +864,44 @@ async def read_pdf(file: UploadFile = File(...), background_tasks: BackgroundTas
try:
pdf_bytes = await file.read()
pdf_document = fitz.open(stream=pdf_bytes, filetype="pdf")
- text = ""
+
+ # Always extract direct text first
+ extracted_text = ""
for page_num in range(len(pdf_document)):
page = pdf_document.load_page(page_num)
- text += page.get_text()
-
- if text.strip():
- print("Extracted text directly from PDF.")
- return JSONResponse(content={"status": "completed", "text": text})
-
- # If text is empty, perform OCR in the background
- print("No text in PDF, starting OCR in background.")
+ extracted_text += page.get_text()
- # Generate a unique ID for this task
+ # Generate a unique ID for caching
task_id = hashlib.sha256(pdf_bytes).hexdigest()
-
- # Check if result already exists (e.g. from a previous run)
result_path = os.path.join(OCR_CACHE_DIR, f"{task_id}.txt")
+
+ # Check if OCR already completed from previous run
if os.path.exists(result_path):
with open(result_path, "r", encoding="utf-8") as f:
ocr_text = f.read()
- return JSONResponse(content={"status": "completed", "text": ocr_text})
-
+ merged_text = _merge_extracted_and_ocr_text(extracted_text, ocr_text)
+ print(f"Using cached OCR + extracted text ({len(merged_text)} chars total)")
+ return JSONResponse(content={"status": "completed", "text": merged_text})
+
+ # Start OCR in background regardless of extracted text
+ print(f"Starting OCR in background (extracted {len(extracted_text)} chars directly)")
if background_tasks:
background_tasks.add_task(_perform_ocr, pdf_bytes, task_id)
- return JSONResponse(content={"status": "ocr_started", "task_id": task_id})
+ # Return extracted text immediately, OCR will enhance it later
+ if extracted_text.strip():
+ return JSONResponse(content={
+ "status": "ocr_started",
+ "task_id": task_id,
+ "text": extracted_text,
+ "partial": True
+ })
+ else:
+ # No extracted text, wait for OCR
+ return JSONResponse(content={
+ "status": "ocr_started",
+ "task_id": task_id
+ })
except Exception as e:
raise HTTPException(status_code=500, detail=f"Failed to read PDF. Reason: {str(e)}")
@@ -473,6 +922,146 @@ async def get_ocr_result(task_id: str):
else:
return JSONResponse(content={"status": "processing"})
+@router.post("/api/read_pdf_with_chunks")
+async def read_pdf_with_chunks(file: UploadFile = File(...)):
+    """
+    Extract PDF content with positional information for each text element.
+
+    Returns a JSON payload with ``status``, ``full_text`` (each page's text
+    followed by a newline), ``pages`` (page size plus one entry per text span
+    with its bounding box, font and size) and ``num_pages``.
+
+    Raises:
+        HTTPException: 400 when the upload is not named ``*.pdf``; 500 when
+            the document cannot be opened or parsed.
+    """
+    # The filename can be missing; treat that as "not a PDF" instead of crashing.
+    # The comparison is case-insensitive so "REPORT.PDF" is accepted too.
+    if not (file.filename or "").lower().endswith(".pdf"):
+        raise HTTPException(status_code=400, detail="Invalid file type. Please upload a PDF.")
+    try:
+        pdf_bytes = await file.read()
+
+        pages_data = []
+        full_text = ""
+
+        # The context manager closes the document even if a page fails to parse.
+        with fitz.open(stream=pdf_bytes, filetype="pdf") as pdf_document:
+            num_pages = len(pdf_document)
+
+            for page_num in range(num_pages):
+                page = pdf_document.load_page(page_num)
+                full_text += page.get_text() + "\n"
+
+                # Get text with position information
+                text_elements = []
+                for block in page.get_text("dict")["blocks"]:
+                    if block.get("type") != 0:  # Only type 0 blocks carry text
+                        continue
+                    for line in block.get("lines", []):
+                        for span in line.get("spans", []):
+                            text_elements.append({
+                                "text": span.get("text", ""),
+                                "bbox": span.get("bbox", [0, 0, 0, 0]),  # [x0, y0, x1, y1]
+                                "font": span.get("font", ""),
+                                "size": span.get("size", 12)
+                            })
+
+                pages_data.append({
+                    "page_num": page_num + 1,
+                    "width": page.rect.width,
+                    "height": page.rect.height,
+                    "text_elements": text_elements
+                })
+
+        return JSONResponse(content={
+            "status": "success",
+            "full_text": full_text,
+            "pages": pages_data,
+            "num_pages": num_pages
+        })
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Failed to read PDF with chunks. Reason: {str(e)}")
+
+@router.post("/api/process_pdf_interactive")
+async def process_pdf_interactive(file: UploadFile = File(...), chunk_size: int = 50):
+    """
+    Comprehensive PDF processing endpoint that returns complete structured data
+    for client-side rendering, clicking, and word-level highlighting.
+
+    The heavy lifting is delegated to
+    ``functions.pdf_processor.process_pdf_for_interactive_reading``; its return
+    value is sent back as the JSON response body.
+
+    Args:
+        file: PDF file upload
+        chunk_size: Number of words per reading chunk (default: 50)
+
+    Returns:
+        Complete structured PDF data with pages, elements, and chunks
+
+    Raises:
+        HTTPException: 400 when the upload is not named ``*.pdf``; 500 when
+            processing fails (the detail carries the error message only).
+    """
+    from functions.pdf_processor import process_pdf_for_interactive_reading
+
+    # The filename can be missing; treat that as "not a PDF" instead of crashing.
+    if not (file.filename or "").lower().endswith(".pdf"):
+        raise HTTPException(status_code=400, detail="Invalid file type. Please upload a PDF.")
+
+    try:
+        pdf_bytes = await file.read()
+
+        # Process PDF with comprehensive data extraction
+        result = process_pdf_for_interactive_reading(
+            pdf_bytes=pdf_bytes,
+            options={"chunk_size": chunk_size}
+        )
+
+        return JSONResponse(content=result)
+
+    except Exception as e:
+        import traceback
+        client_detail = f"Failed to process PDF: {str(e)}"
+        # The stack trace stays in the server log; sending it in the HTTP
+        # response would expose internal paths and code to any client.
+        print(f"❌ PDF Processing Error: {client_detail}\n{traceback.format_exc()}")
+        raise HTTPException(status_code=500, detail=client_detail)
+
+
+@router.post("/api/get_chunk_highlight")
+async def get_chunk_highlight(chunk_id: int, pdf_data: Dict[str, Any]):
+    """
+    Look up the highlighting data for one chunk of a processed PDF.
+
+    Intended to be called by the client while audio is playing so the words
+    of the current chunk can be highlighted.
+
+    Args:
+        chunk_id: ID of the chunk being played
+        pdf_data: The processed PDF data structure
+
+    Returns:
+        JSON response wrapping whatever ``get_highlight_data_for_chunk`` returns
+
+    Raises:
+        HTTPException: 500 when the lookup (or serialising its result) fails.
+    """
+    from functions.pdf_processor import get_highlight_data_for_chunk
+
+    try:
+        return JSONResponse(content=get_highlight_data_for_chunk(pdf_data, chunk_id))
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Failed to get highlight data: {str(e)}")
+
+
+class TextProcessRequest(BaseModel):
+ # Request body accepted by the POST /api/process_text endpoint.
+ # Raw text to be split into TTS chunks.
+ text: str
+ # Forwarded to process_text_for_tts as its chunk size (documented there as
+ # maximum characters per chunk); declared with default 200, ge=50, le=1000.
+ chunk_size: int = Field(default=200, ge=50, le=1000)
+ # Forwarded to process_text_for_tts; False selects the rule-based splitter
+ # instead of the LLM-backed one.
+ use_llm: bool = True
+
+@router.post("/api/process_text")
+async def process_text(request: TextProcessRequest):
+    """
+    Split the submitted text into TTS-ready chunks.
+
+    Delegates to ``process_text_for_tts`` with the request's text, chunk size
+    and LLM flag, and reports the chunks together with their count.
+
+    Raises:
+        HTTPException: 500 when processing or response construction fails.
+    """
+    try:
+        tts_chunks = process_text_for_tts(
+            text=request.text,
+            chunk_size=request.chunk_size,
+            use_llm=request.use_llm
+        )
+        payload = {
+            "status": "success",
+            "chunks": tts_chunks,
+            "chunk_count": len(tts_chunks)
+        }
+        return JSONResponse(content=payload)
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Failed to process text: {str(e)}")
+
@router.post("/api/read_epub", response_model=PdfText)
async def read_epub(file: UploadFile = File(...)):
if not file.filename.endswith((".epub", ".opf")):
@@ -693,7 +1282,115 @@ async def get_user_pdf(username: str, filename: str):
if not os.path.exists(user_pdf_path):
raise HTTPException(status_code=404, detail="PDF not found.")
- return FileResponse(user_pdf_path, media_type="application/pdf")
+ # Read the PDF file content and return it directly with proper headers
+ with open(user_pdf_path, 'rb') as f:
+ pdf_content = f.read()
+
+ return Response(
+ content=pdf_content,
+ media_type="application/pdf",
+ headers={
+ "Content-Disposition": f"inline; filename={sanitized_filename}",
+ "Accept-Ranges": "bytes",
+ "Cache-Control": "public, max-age=3600"
+ }
+ )
+
+def extract_pdf_text_positions(pdf_path: str):
+    """Extract text spans with their positions from a PDF using PyMuPDF.
+
+    Args:
+        pdf_path: Path of the PDF file to open.
+
+    Returns:
+        A list with one dict per page (``page_number`` is 1-based, plus
+        ``width``, ``height`` and ``text_items``). Each text item holds the
+        span text, its top-left ``x``/``y``, ``width``, ``height``, ``font``,
+        ``size`` and ``color``. If span extraction fails for a page, one item
+        covering the whole page rectangle with the page's plain text is added
+        instead. Returns ``[]`` when the document cannot be processed at all.
+    """
+    try:
+        pages_data = []
+
+        # The context manager closes the document even when processing a page
+        # raises, so the file handle is never leaked.
+        with fitz.open(pdf_path) as doc:
+            for page_num in range(len(doc)):
+                page = doc[page_num]
+                page_dict = {
+                    "page_number": page_num + 1,  # 1-based page numbering
+                    "width": page.rect.width,
+                    "height": page.rect.height,
+                    "text_items": []
+                }
+
+                # The "dict" extraction carries a bounding box for every span.
+                try:
+                    blocks = page.get_text("dict")["blocks"]
+
+                    for block in blocks:
+                        if block.get("type") != 0:  # Only type 0 blocks carry text
+                            continue
+                        for line in block.get("lines", []):
+                            for span in line.get("spans", []):
+                                # Only add non-empty text
+                                if not span["text"].strip():
+                                    continue
+                                x0, y0, x1, y1 = span["bbox"][:4]
+                                page_dict["text_items"].append({
+                                    "text": span["text"],
+                                    "x": x0,  # left
+                                    "y": y0,  # top
+                                    "width": x1 - x0,
+                                    "height": y1 - y0,
+                                    "font": span.get("font", "sans-serif"),
+                                    "size": span.get("size", 12),
+                                    "color": span.get("color", 0)
+                                })
+                except Exception as e:
+                    print(f"Error extracting text from page {page_num + 1}: {e}")
+                    # Fallback to simpler text extraction
+                    try:
+                        text_content = page.get_text("text")
+                        if text_content.strip():
+                            # Create a single text item for the whole page as fallback
+                            rect = page.rect
+                            page_dict["text_items"].append({
+                                "text": text_content,
+                                "x": rect.x0,
+                                "y": rect.y0,
+                                "width": rect.width,
+                                "height": rect.height,
+                                "font": "sans-serif",
+                                "size": 12,
+                                "color": 0
+                            })
+                    except Exception as fallback_error:
+                        print(f"Fallback text extraction also failed for page {page_num + 1}: {fallback_error}")
+
+                pages_data.append(page_dict)
+                print(f"✅ Page {page_num + 1}: Extracted {len(page_dict['text_items'])} text items")
+
+        print(f"📄 Total pages processed: {len(pages_data)}")
+        return pages_data
+    except Exception as e:
+        print(f"❌ Error extracting PDF text positions: {e}")
+        import traceback
+        traceback.print_exc()
+        return []
+
+@router.post("/api/users/{username}/pdfs/{filename}/data")
+async def get_user_pdf_data(username: str, filename: str):
+    """Return a user's PDF as base64 inside JSON, together with its text positions.
+
+    The response holds ``filename``, ``data`` (base64 of the file bytes),
+    ``size`` (byte count) and ``text_positions`` as produced by
+    ``extract_pdf_text_positions``.
+
+    Raises:
+        HTTPException: 404 when the file does not exist in the user's folder.
+    """
+    # Keep only the final path component so the name cannot escape the folder
+    safe_name = os.path.basename(filename)
+    pdf_path = os.path.join(user_manager._get_user_folder(username), safe_name)
+
+    if not os.path.exists(pdf_path):
+        raise HTTPException(status_code=404, detail="PDF not found.")
+
+    with open(pdf_path, 'rb') as pdf_file:
+        raw_bytes = pdf_file.read()
+
+    return JSONResponse(content={
+        "filename": safe_name,
+        "data": base64.b64encode(raw_bytes).decode('utf-8'),
+        "size": len(raw_bytes),
+        "text_positions": extract_pdf_text_positions(pdf_path)
+    })
# --------------------------------
# --- Users Podcast Management ---
diff --git a/functions/text_processor.py b/functions/text_processor.py
new file mode 100644
index 0000000..789a940
--- /dev/null
+++ b/functions/text_processor.py
@@ -0,0 +1,214 @@
+"""
+Text processing utilities for semantic sentence splitting and text extraction.
+"""
+import re
+import requests
+from typing import List, Dict, Optional
+import json
+
+
+def split_into_sentences_rule_based(text: str) -> List[str]:
+    """
+    Rule-based splitter used as the fallback when no LLM is available.
+
+    The text is first split at sentence boundaries (``.``, ``!`` or ``?``
+    followed by whitespace and an uppercase letter or an opening quote), then
+    each sentence is split again after every comma. Periods that belong to
+    common abbreviations ("Dr.", "e.g.", "et al.") never end a sentence.
+
+    Args:
+        text: Text to split.
+
+    Returns:
+        Non-empty, stripped chunks in document order; commas stay attached
+        to the chunk they terminate.
+    """
+    # Private-use code point that stands in for an abbreviation's period while
+    # splitting; it is turned back into "." before returning.
+    placeholder = '\uE000'
+
+    # Common abbreviations that shouldn't end sentences
+    abbreviations = r'(?:Mr|Mrs|Ms|Dr|Prof|Sr|Jr|vs|etc|Inc|Ltd|Corp|St|Ave|Rd|Blvd|approx|min|max|e\.g|i\.e|vol|pp|ca|cf|ed|al|seq|c\.f)'
+
+    # \b keeps short entries such as "ed" or "al" from matching the tail of
+    # ordinary words ("finished.", "final."), which would suppress real
+    # sentence breaks.
+    text = re.sub(
+        rf'\b({abbreviations})\.',
+        lambda match: match.group(1) + placeholder,
+        text,
+        flags=re.IGNORECASE,
+    )
+
+    # Split on . ! ? followed by whitespace and an uppercase letter or a
+    # straight/smart quote
+    sentences = re.split(r'(?<=[.!?])\s+(?=[A-Z"\'\u201C\u201D\u2018\u2019])', text)
+
+    semantic_chunks = []
+    for sentence in sentences:
+        # The capturing group keeps the separators in the result: even indexes
+        # are text, odd indexes are the ",<whitespace>" separators.
+        parts = re.split(r'(,\s+)', sentence)
+
+        current_chunk = ""
+        last_index = len(parts) - 1
+        for i, part in enumerate(parts):
+            current_chunk += part
+            # Close the chunk right after a comma separator or at the end
+            if i % 2 == 1 or i == last_index:
+                if current_chunk.strip():
+                    semantic_chunks.append(current_chunk.strip())
+                current_chunk = ""
+
+    # Restore abbreviation periods and drop anything that ended up empty
+    restored = (chunk.replace(placeholder, '.').strip() for chunk in semantic_chunks)
+    return [chunk for chunk in restored if chunk]
+
+
+def split_into_sentences_semantic(text: str, llm_url: str = "http://localhost:11434/api/generate") -> List[str]:
+    """
+    Use local Qwen2.5 LLM to semantically split text into sentences.
+    Falls back to rule-based splitting if LLM unavailable.
+
+    Only the first ~2000 characters fit into the prompt. Longer input is cut
+    at the last space before that limit; the LLM splits the head and the
+    remainder is split rule-based, so no text is dropped.
+
+    Args:
+        text: Text to split
+        llm_url: Ollama API endpoint (default: localhost:11434)
+
+    Returns:
+        List of semantically meaningful sentences
+    """
+    # If text is short, just use rule-based
+    if len(text) < 200:
+        return split_into_sentences_rule_based(text)
+
+    # Decide how much of the text goes to the LLM; prefer a cut on a space so
+    # no word is torn apart.
+    max_prompt_chars = 2000
+    if len(text) > max_prompt_chars:
+        cut = text.rfind(' ', 0, max_prompt_chars)
+        if cut <= 0:
+            cut = max_prompt_chars
+    else:
+        cut = len(text)
+    head, tail = text[:cut], text[cut:]
+
+    try:
+        # Prepare prompt for Qwen2.5
+        prompt = f"""Split the following text into semantically meaningful chunks for text-to-speech. IMPORTANT:
+- Keep ALL text content - do not skip or omit anything
+- Split on natural boundaries: periods (.), commas (,), semicolons (;), and other punctuation
+- Preserve names in quotes (like "John Smith") as part of the chunk
+- Keep dialogue and quotes as complete units
+- Create smaller chunks (15-30 words each) for better highlighting during speech
+- Each chunk should be a complete phrase or clause
+- Maintain complete thoughts within each chunk
+
+Return ONLY a JSON array of text chunks with ALL original text preserved.
+
+Text:
+{head}
+
+Format: ["chunk1", "chunk2", "chunk3", ...]"""
+
+        payload = {
+            "model": "qwen2.5:latest",
+            "prompt": prompt,
+            "stream": False,
+            "options": {
+                "temperature": 0.1,  # Low temperature for consistent output
+                "num_predict": 1000
+            }
+        }
+
+        response = requests.post(llm_url, json=payload, timeout=30)
+
+        if response.status_code == 200:
+            result = response.json()
+            llm_output = result.get('response', '').strip()
+
+            # Extract JSON array from response
+            json_match = re.search(r'\[.*\]', llm_output, re.DOTALL)
+            if json_match:
+                sentences = json.loads(json_match.group(0))
+                if sentences and isinstance(sentences, list):
+                    # Ignore non-string items instead of crashing on .strip()
+                    llm_chunks = [s.strip() for s in sentences if isinstance(s, str) and s.strip()]
+                    if llm_chunks:
+                        print(f"LLM semantic splitting succeeded: {len(llm_chunks)} sentences")
+                        # Text beyond the prompt window is split rule-based
+                        if tail.strip():
+                            llm_chunks.extend(split_into_sentences_rule_based(tail))
+                        return llm_chunks
+
+        print("LLM response invalid, falling back to rule-based splitting")
+
+    except requests.exceptions.ConnectionError:
+        print("LLM not available (connection error), using rule-based splitting")
+    except requests.exceptions.Timeout:
+        print("LLM timeout, using rule-based splitting")
+    except Exception as e:
+        print(f"LLM error: {e}, falling back to rule-based splitting")
+
+    # Fallback to rule-based
+    return split_into_sentences_rule_based(text)
+
+
+def chunk_sentences(sentences: List[str], max_chunk_size: int = 200) -> List[str]:
+    """
+    Group semantic text chunks into TTS chunks of at most max_chunk_size characters.
+
+    Pieces are joined with a single space and never split. A piece that is
+    longer than ``max_chunk_size`` on its own becomes a chunk by itself.
+    Blank pieces are ignored.
+
+    Args:
+        sentences: List of semantic text chunks (can be sentences, clauses, or phrases)
+        max_chunk_size: Maximum characters per chunk
+
+    Returns:
+        List of text chunks ready for TTS
+    """
+    chunks = []
+    current_chunk = []
+    current_size = 0  # Always equals len(' '.join(current_chunk))
+
+    for sentence in sentences:
+        sentence = sentence.strip()
+        if not sentence:
+            continue
+
+        sentence_len = len(sentence)
+
+        # If single sentence exceeds max, add it as its own chunk
+        if sentence_len > max_chunk_size:
+            # Save any current chunk first
+            if current_chunk:
+                chunks.append(' '.join(current_chunk))
+                current_chunk = []
+                current_size = 0
+            chunks.append(sentence)
+            continue
+
+        # Length of the joined chunk if this sentence were appended; the
+        # separating space only exists when the chunk already has content.
+        projected_size = current_size + sentence_len + (1 if current_chunk else 0)
+
+        # Start a new chunk when the sentence no longer fits
+        if current_chunk and projected_size > max_chunk_size:
+            chunks.append(' '.join(current_chunk))
+            current_chunk = []
+            projected_size = sentence_len
+
+        current_chunk.append(sentence)
+        current_size = projected_size
+
+    # Add remaining sentences
+    if current_chunk:
+        chunks.append(' '.join(current_chunk))
+
+    return chunks
+
+
+def process_text_for_tts(text: str, chunk_size: int = 200, use_llm: bool = True) -> List[str]:
+    """
+    Turn raw text into a list of chunks sized for TTS.
+
+    The text is first split into semantic pieces (LLM-backed or rule-based,
+    depending on ``use_llm``) and the pieces are then grouped by
+    ``chunk_sentences``. Progress and a text-loss warning are printed.
+
+    Args:
+        text: Input text to process
+        chunk_size: Maximum characters per chunk
+        use_llm: Whether to use LLM for semantic splitting (default True)
+
+    Returns:
+        List of text chunks ready for TTS; empty for empty or blank input
+    """
+    if not (text and text.strip()):
+        return []
+
+    print(f"Processing text (length: {len(text)}, use_llm: {use_llm})")
+
+    # Pick the splitter once, then run it
+    splitter = split_into_sentences_semantic if use_llm else split_into_sentences_rule_based
+    pieces = splitter(text)
+
+    # Debug: show the first (and, if different, last) piece
+    if pieces:
+        print(f"First semantic chunk: {pieces[0][:100]}...")
+        if len(pieces) > 1:
+            print(f"Last semantic chunk: {pieces[-1][:100]}...")
+
+    # Group into final chunks respecting max size
+    grouped = chunk_sentences(pieces, chunk_size)
+
+    # Sanity check: grouping should not lose characters (5% slack for whitespace)
+    piece_chars = sum(map(len, pieces))
+    grouped_chars = sum(map(len, grouped))
+    if grouped_chars < piece_chars * 0.95:
+        print(f"⚠️ WARNING: Text loss detected! Semantic: {piece_chars} chars → Chunks: {grouped_chars} chars")
+
+    print(f"Processed text: {len(pieces)} semantic chunks → {len(grouped)} TTS chunks")
+    return grouped
diff --git a/functions/users.py b/functions/users.py
index da51def..c51337f 100644
--- a/functions/users.py
+++ b/functions/users.py
@@ -43,7 +43,7 @@ def create_user(self, username, password):
"books": {},
"podcasts": {}
}
- with open(user_file, 'w') as f:
+ with open(user_file, 'w', encoding='utf-8') as f:
json.dump(user_data, f, indent=4)
return True, "User created successfully."
@@ -52,7 +52,7 @@ def authenticate_user(self, username, password):
if not os.path.exists(user_file):
return False, "Invalid username or password."
- with open(user_file, 'r') as f:
+ with open(user_file, 'r', encoding='utf-8') as f:
user_data = json.load(f)
if checkpw(password.encode('utf-8'), user_data['password'].encode('utf-8')):
@@ -63,7 +63,7 @@ def authenticate_user(self, username, password):
def get_user_data(self, username):
user_file = self._get_user_file_path(username)
if os.path.exists(user_file):
- with open(user_file, 'r') as f:
+ with open(user_file, 'r', encoding='utf-8') as f:
user_data = json.load(f)
# Migration: Convert books from list to dict if necessary.
@@ -78,7 +78,7 @@ def get_user_data(self, username):
def save_user_data(self, username, data):
user_file = self._get_user_file_path(username)
- with open(user_file, 'w') as f:
+ with open(user_file, 'w', encoding='utf-8') as f:
json.dump(data, f, indent=4)
def add_book(self, username, book_data):
@@ -160,3 +160,63 @@ def update_podcast(self, username, podcast_id, new_data):
self.save_user_data(username, user_data)
return True
return False
+
+ # -------------------------
+ # Highlights/Notes Methods
+ # -------------------------
+ def get_highlights(self, username, book_id):
+     """Return the highlights stored for ``book_id``; empty list if the user or book has none."""
+     user_data = self.get_user_data(username)
+     if not user_data:
+         return []
+     return user_data.get('highlights', {}).get(book_id, [])
+
+
+ def add_highlight(self, username, book_id, highlight_data):
+     """Store a new highlight for a book.
+
+     A fresh UUID string is written into ``highlight_data['id']`` before the
+     highlight is appended and the user data saved.
+
+     Returns:
+         ``(True, highlight_id)`` on success, ``(False, None)`` when no user
+         data is available.
+     """
+     user_data = self.get_user_data(username)
+     if not user_data:
+         return False, None
+
+     new_id = str(uuid.uuid4())
+     highlight_data['id'] = new_id
+
+     # Create the nested highlights structure on first use
+     per_book = user_data.setdefault('highlights', {})
+     per_book.setdefault(book_id, []).append(highlight_data)
+
+     self.save_user_data(username, user_data)
+     return True, new_id
+
+
+ def update_highlight(self, username, book_id, highlight_id, new_data):
+     """Merge ``new_data`` into the highlight with ``highlight_id`` (e.g. note or color).
+
+     Returns:
+         True when the highlight was found, updated and saved; False otherwise.
+     """
+     user_data = self.get_user_data(username)
+     if not user_data:
+         return False
+
+     for entry in user_data.get('highlights', {}).get(book_id, []):
+         if entry.get('id') == highlight_id:
+             # Only the first match is updated
+             entry.update(new_data)
+             self.save_user_data(username, user_data)
+             return True
+     return False
+
+
+ def delete_highlight(self, username, book_id, highlight_id):
+     """Remove the highlight with ``highlight_id`` from a book.
+
+     Returns:
+         True when the highlight was found, removed and saved; False otherwise.
+     """
+     user_data = self.get_user_data(username)
+     if not user_data:
+         return False
+
+     book_highlights = user_data.get('highlights', {}).get(book_id, [])
+     for position, entry in enumerate(book_highlights):
+         if entry.get('id') == highlight_id:
+             # Only the first match is removed
+             del book_highlights[position]
+             self.save_user_data(username, user_data)
+             return True
+     return False
+
+
+ def get_all_highlights(self, username):
+     """Return the user's highlights for every book as a ``{book_id: [highlight, ...]}`` mapping."""
+     user_data = self.get_user_data(username)
+     return user_data.get('highlights', {}) if user_data else {}
diff --git a/package.json b/package.json
index f3223cc..a48567f 100644
--- a/package.json
+++ b/package.json
@@ -9,9 +9,9 @@
},
"scripts": {
"build-css": "npx @tailwindcss/cli -i ./static/css/pre.css -o ./static/css/style.css",
- "build-pdf-js": "cp node_modules/pdfjs-dist/build/pdf.min.mjs static/js/; cp node_modules/pdfjs-dist/build/pdf.worker.min.mjs static/js/",
- "build-marked": "cp node_modules/marked/lib/marked.umd.js static/js/",
- "build-icons": "rm -rf static/icons; cp -r node_modules/@fortawesome/fontawesome-free static/icons/",
+ "build-pdf-js": "node -e \"require('fs').copyFileSync('node_modules/pdfjs-dist/build/pdf.min.mjs', 'static/js/pdf.min.mjs'); require('fs').copyFileSync('node_modules/pdfjs-dist/build/pdf.worker.min.mjs', 'static/js/pdf.worker.min.mjs');\"",
+ "build-marked": "node -e \"require('fs').copyFileSync('node_modules/marked/lib/marked.umd.js', 'static/js/marked.umd.js');\"",
+ "build-icons": "node -e \"const fs=require('fs');const path=require('path');if(fs.existsSync('static/icons'))fs.rmSync('static/icons',{recursive:true});fs.cpSync('node_modules/@fortawesome/fontawesome-free','static/icons',{recursive:true});\"",
"build": "concurrently \"npm run build-pdf-js\" \"npm run build-marked\" \"npm run build-icons\" \"npm run build-css\""
}
}
diff --git a/requirements_desktop.txt b/requirements_desktop.txt
new file mode 100644
index 0000000..c48e54c
--- /dev/null
+++ b/requirements_desktop.txt
@@ -0,0 +1,25 @@
+# Speechify Desktop Clone - Requirements
+# 100% Local/Offline TTS Reader
+
+# GUI Framework
+PyQt6>=6.6.0
+
+# Document Readers
+PyMuPDF>=1.23.0 # PDF support (fitz)
+python-docx>=1.1.0 # DOCX support
+ebooklib>=0.18 # EPUB support
+beautifulsoup4>=4.12.0 # HTML parsing for EPUB
+
+# Audio Playback
+pygame>=2.5.0 # Audio playback
+
+# TTS Engine (Piper)
+# Note: Piper executable must be installed separately
+# Download from: https://github.com/rhasspy/piper/releases
+# Or build from source
+
+# Optional: For MP3 export
+# ffmpeg (system dependency - install separately)
+# Windows: choco install ffmpeg
+# macOS: brew install ffmpeg
+# Linux: apt install ffmpeg
diff --git a/requirements_full_desktop.txt b/requirements_full_desktop.txt
new file mode 100644
index 0000000..7b4f2f6
--- /dev/null
+++ b/requirements_full_desktop.txt
@@ -0,0 +1,107 @@
+# Voice AI Reader - 2026 Local TTS Application
+# Complete requirements for production deployment
+
+# ═══════════════════════════════════════════════════════════════════════════
+# CORE DEPENDENCIES (Required)
+# ═══════════════════════════════════════════════════════════════════════════
+
+# GUI Framework
+PyQt6>=6.6.0
+PyQt6-WebEngine>=6.6.0
+
+# Audio Processing & Playback
+pygame>=2.5.0
+numpy>=1.24.0
+soundfile>=0.12.0
+pydub>=0.25.0
+
+# ═══════════════════════════════════════════════════════════════════════════
+# DOCUMENT READERS (All formats)
+# ═══════════════════════════════════════════════════════════════════════════
+
+# PDF Support
+PyMuPDF>=1.23.0
+
+# Word Documents
+python-docx>=1.1.0
+
+# EPUB Books
+ebooklib>=0.18
+
+# Web Articles & HTML
+beautifulsoup4>=4.12.0
+lxml>=4.9.0
+requests>=2.31.0
+trafilatura>=1.6.0
+readability-lxml>=0.8.1
+
+# ═══════════════════════════════════════════════════════════════════════════
+# TTS ENGINES (Choose at least one)
+# ═══════════════════════════════════════════════════════════════════════════
+
+# Option 1: Piper TTS (RECOMMENDED - Fast, Natural, CPU-Friendly)
+# Download separately from: https://github.com/rhasspy/piper/releases
+# Place the piper executable in the same directory as the script
+# Download voice models from: https://huggingface.co/rhasspy/piper-voices
+# Place .onnx and .onnx.json files in models/piper/
+
+# Option 2: MeloTTS (Multilingual, High Quality)
+# melotts>=0.1.0
+# Note: Auto-downloads models on first use
+
+# Option 3: Coqui TTS XTTS (Best Quality, GPU Recommended)
+# TTS>=0.22.0
+# Note: ~2GB models auto-download on first use
+
+# ═══════════════════════════════════════════════════════════════════════════
+# OPTIONAL AI FEATURES
+# ═══════════════════════════════════════════════════════════════════════════
+
+# Voice Transcription (Dictation Mode)
+# faster-whisper>=0.10.0
+# or: openai-whisper>=20231117
+
+# Local LLM for Summarization & Q&A
+# Install Ollama separately: https://ollama.com/download
+# Then run: ollama pull phi3:mini
+# Python client:
+# ollama>=0.1.0
+
+# OCR for Scanned Documents
+# easyocr>=1.7.0
+# or: pytesseract>=0.3.10
+
+# ═══════════════════════════════════════════════════════════════════════════
+# PACKAGING (For creating standalone executables)
+# ═══════════════════════════════════════════════════════════════════════════
+
+# PyInstaller for .exe/.app creation
+pyinstaller>=6.0.0
+
+# ═══════════════════════════════════════════════════════════════════════════
+# INSTALLATION COMMANDS
+# ═══════════════════════════════════════════════════════════════════════════
+
+# Minimal install (core features only):
+# pip install PyQt6 pygame numpy soundfile PyMuPDF python-docx ebooklib beautifulsoup4 lxml requests
+
+# Full install (every package enabled in this file; commented-out TTS engines and AI extras are NOT included):
+# pip install -r requirements_full_desktop.txt
+
+# With AI features:
+# pip install -r requirements_full_desktop.txt faster-whisper ollama easyocr
+
+# ═══════════════════════════════════════════════════════════════════════════
+# SYSTEM DEPENDENCIES
+# ═══════════════════════════════════════════════════════════════════════════
+
+# MP3 Export (Optional):
+# Windows: choco install ffmpeg
+# macOS: brew install ffmpeg
+# Linux: apt install ffmpeg
+
+# OCR Support (Optional):
+# Tesseract OCR if using pytesseract
+# Windows: choco install tesseract
+# macOS: brew install tesseract
+# Linux: apt install tesseract-ocr
diff --git a/response.json b/response.json
new file mode 100644
index 0000000..0d38ba0
--- /dev/null
+++ b/response.json
@@ -0,0 +1 @@
+{"status":"completed","text":"Game Hacking\nDeveloping Autonomous Bots \nfor Online Games\nNick Cano\nForeword by Dr. Jared DeMott\nGame Hacking\nGAME \nHACKING\nDeveloping Autonomous \nBots for Online Games\nby Nick Cano\nSan Francisco\nGame Hacking. Copyright © 2016 by Nick Cano.\nAll rights reserved. No part of this work may be reproduced or transmitted in any form or by any means, \nelectronic or mechanical, including photocopying, recording, or by any information storage or retrieval \nsystem, without the prior written permission of the copyright owner and the publisher.\nPrinted in USA\nFirst printing\n20 19 18 17 16 1 2 3 4 5 6 7 8 9\nISBN-10: 1-59327-669-9\nISBN-13: 978-1-59327-669-0\nPublisher: William Pollock\nProduction Editor: Laurel Chun\nCover Illustration: Ryan Milner \nInterior Design: Octopod Studios\nDevelopmental Editor: Jennifer Griffith-Delgado\nTechnical Reviewer: Stephen Lawler\nCopyeditor: Rachel Monaghan\nCompositor: Laurel Chun\nProofreader: Paula L. Fleming\nIndexer: BIM Creatives, LLC\nFor information on distribution, translations, or bulk sales, please contact No Starch Press, Inc. directly:\nNo Starch Press, Inc.\n245 8th Street, San Francisco, CA 94103\nphone: 415.863.9900; info@nostarch.com \nwww.nostarch.com\nLibrary of Congress Cataloging-in-Publication Data\nCano, Nick, author.\n Game hacking : developing autonomous bots for online games / by Nick Cano.\n pages cm\n Includes index.\n Summary: \"A hands-on guide to hacking computer games. Shows programmers how to dissect computer \ngames and create bots to alter their gaming environment. Covers the basics of game hacking, \nincluding reverse engineering, assembly code analysis, programmatic memory manipulation, persistent \nhacks, responsive hacks, and code injection.\"-- Provided by publisher.\n ISBN 978-1-59327-669-0 -- ISBN 1-59327-669-9\n 1. Intelligent agents (Computer software) 2. Internet programming. 3. Internet games--\nProgramming. 4. Hacking. I. 
Title.\n QA76.76.I58C36 2016\n 005.8--dc23\n 2015036294\nNo Starch Press and the No Starch Press logo are registered trademarks of No Starch Press, Inc. Other \nproduct and company names mentioned herein may be the trademarks of their respective owners. Rather \nthan use a trademark symbol with every occurrence of a trademarked name, we are using the names only \nin an editorial fashion and to the benefit of the trademark owner, with no intention of infringement of the \ntrademark.\nThe information in this book is distributed on an “As Is” basis, without warranty. While every precaution \nhas been taken in the preparation of this work, neither the author nor No Starch Press, Inc. shall have any \nliability to any person or entity with respect to any loss or damage caused or alleged to be caused directly or \nindirectly by the information contained in it.\nAbout the Author\nNick Cano wrote his first scripts for open source game servers when he \nwas 12 and started a business selling his bots when he was 16. He has \nbeen a part of the game-hacking community ever since and advises game \ndevelopers and designers on best practices to protect their games against \nbots. Nick also has years of experience in detecting and defending against \nmalware, and he has spoken at many conferences about his research and \ntools.\nAbout the Technical Reviewer\nStephen Lawler is the founder and president of a small computer software \nand security consulting firm. He has been actively working in information \nsecurity for over 10 years, primarily in reverse engineering, malware analy-\nsis, and vulnerability research. He was a member of the Mandiant malware \nanalysis team and assisted with high-profile computer intrusions affecting \nseveral Fortune 100 companies. Stephen also developed and teaches the \nPractical ARM Exploitation class, which has been offered at BlackHat and \nseveral other security conferences for the past five years.\nBr ie f Con t e n t s\nForeword by Dr. 
Jared DeMott . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . xv\nAcknowledgments . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . xvii\nIntroduction . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . xix\nPART 1: TOOLS OF THE TRADE\nChapter 1: Scanning Memory Using Cheat Engine . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 3\nChapter 2: Debugging Games with OllyDbg . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 23\nChapter 3: Reconnaissance with Process Monitor and Process Explorer . . . . . . . . . . . . . . . . 49\nPART 2: GAME DISSECTION\nChapter 4: From Code to Memory: A General Primer . . . . . . . . . . . . . . . . . . . . . . . . . . . . 65\nChapter 5: Advanced Memory Forensics . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 97\nChapter 6: Reading from and Writing to Game Memory . . . . . . . . . . . . . . . . . . . . . . . . . 119\nPART 3: PROCESS PUPPETEERING\nChapter 7: Code Injection . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 133\nChapter 8: Manipulating Control Flow in a Game . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 149\nPART 4: CREATING BOTS\nChapter 9: Using Extrasensory Perception to Ward Off Fog of War . . . . . . . . . . . . . . . . . 189\nChapter 10: Responsive Hacks . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 203\nChapter 11: Putting It All Together: Writing Autonomous Bots . . . . . . . . . . . . . . . . . . . . . . 221\nChapter 12: Staying Hidden . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 245\nIndex . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 265\nCo n t e n t s in De ta il\nForeword by Dr. 
Jared DeMott\t\nxv\nAcknowledgments\t\nxvii\nIntroduction\t\nxix\nPrerequisites for the Reader . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . xx\nA Brief Game Hacking History . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . xx\nWhy Hack Games? . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . xxi\nHow This Book Is Organized . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . xxii\nAbout the Online Resources . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . xxiv\nHow to Use This Book . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . xxiv\nPart 1\nTools of the Trade\n1\nScanning Memory Using Cheat Engine\t\n3\nWhy Memory Scanners Are Important . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 4\nBasic Memory Scanning . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 4\nCheat Engine’s Memory Scanner . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 5\nScan Types . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 6\nRunning Your First Scan . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 6\nNext Scans . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 7\nWhen You Can’t Get a Single Result . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 7\nCheat Tables . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 7\nMemory Modification in Games . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 8\nManual Modification with Cheat Engine . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 8\nTrainer Generator . . . . . . . . . . . . . . . . . . . . . 
. . . . . . . . . . . . . . . . . . . . . . . 9\nPointer Scanning . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 11\nPointer Chains . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 11\nPointer Scanning Basics . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 12\nPointer Scanning with Cheat Engine . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 14\nPointer Rescanning . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 17\nLua Scripting Environment . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 18\nSearching for Assembly Patterns . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 19\nSearching for Strings . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 21\nClosing Thoughts . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 22\nx Contents in Detail\n2\nDebugging Games with OllyDbg\t\n23\nA Brief Look at OllyDbg’s User Interface . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 24\nOllyDbg’s CPU Window . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 26\nViewing and Navigating a Game’s Assembly Code . . . . . . . . . . . . . . . . . . . . 27\nViewing and Editing Register Contents . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 29\nViewing and Searching a Game’s Memory . . . . . . . . . . . . . . . . . . . . . . . . . . 29\nViewing a Game’s Call Stack . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 30\nCreating Code Patches . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 31\nTracing Through Assembly Code . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 32\nOllyDbg’s Expression Engine . . . . . . . . . 
. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 33\nUsing Expressions in Breakpoints . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 34\nUsing Operators in the Expression Engine . . . . . . . . . . . . . . . . . . . . . . . . . . . 34\nWorking with Basic Expression Elements . . . . . . . . . . . . . . . . . . . . . . . . . . . . 35\nAccessing Memory Contents with Expressions . . . . . . . . . . . . . . . . . . . . . . . . 36\nOllyDbg Expressions in Action . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 36\nPausing Execution When a Specific Player’s Name Is Printed . . . . . . . . . . . . . 37\nPausing Execution When Your Character’s Health Drops . . . . . . . . . . . . . . . . 39\nOllyDbg Plug-ins for Game Hackers . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 42\nCopying Assembly Code with Asm2Clipboard . . . . . . . . . . . . . . . . . . . . . . . 42\nAdding Cheat Engine to OllyDbg with Cheat Utility . . . . . . . . . . . . . . . . . . . . 42\nControlling OllyDbg Through the Command Line . . . . . . . . . . . . . . . . . . . . . . 43\nVisualizing Control Flow with OllyFlow . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 45\nClosing Thoughts . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 47\n3\nReconnaissance with Process Monitor\nand Process Explorer\t\n49\nProcess Monitor . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 50\nLogging In-Game Events . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 50\nInspecting Events in the Process Monitor Log . . . . . . . . . . . . . . . . . . . . . . . . . 52\nDebugging a Game to Collect More Data . . . . . . . . . . . . . . . . . . . . . . . . . . . 53\nProcess Explorer . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 
55\nProcess Explorer’s User Interface and Controls . . . . . . . . . . . . . . . . . . . . . . . . 56\nExamining Process Properties . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 57\nHandle Manipulation Options . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 59\nClosing Thoughts . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 61\nPart 2\nGame Dissection\n4\nFrom Code to Memory: A General Primer\t\n65\nHow Variables and Other Data Manifest in Memory . . . . . . . . . . . . . . . . . . . . . . . . . . 66\nNumeric Data . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 67\nString Data . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 69\nData Structures . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 71\nContents in Detail xi\nUnions . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 73\nClasses and VF Tables . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 74\nx86 Assembly Crash Course . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 78\nCommand Syntax . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 79\nProcessor Registers . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 81\nThe Call Stack . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 86\nImportant x86 Instructions for Game Hacking . . . . . . . . . . . . . . . . . . . . . . . . 89\nClosing Thoughts . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 96\n5\nAdvanced Memory Forensics\t\n97\nAdvanced Memory Scanning . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 
98\nDeducing Purpose . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 98\nFinding the Player’s Health with OllyDbg . . . . . . . . . . . . . . . . . . . . . . . . . . . 99\nDetermining New Addresses After Game Updates . . . . . . . . . . . . . . . . . . . . 101\nIdentifying Complex Structures in Game Data . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 105\nThe std::string Class . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 105\nThe std::vector Class . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 108\nThe std::list Class . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 110\nThe std::map Class . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 114\nClosing Thoughts . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 118\n6\nReading from and Writing to Game Memory\t\n119\nObtaining the Game’s Process Identifier . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 120\nObtaining Process Handles . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 121\nWorking with OpenProcess() . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 121\nAccessing Memory . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 122\nWorking with ReadProcessMemory() and WriteProcessMemory() . . . . . . . . . 122\nAccessing a Value in Memory with ReadProcessMemory() \nand WriteProcessMemory() . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 123\nWriting Templated Memory Access Functions . . . . . . . . . . . . . . . . . . . . . . . 123\nMemory Protection . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 124\nDifferentiating x86 Windows Memory Protection Attributes . . . . . . . . . . . . . . 
125\nChanging Memory Protection . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 126\nAddress Space Layout Randomization . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 128\nDisabling ASLR to Simplify Bot Development . . . . . . . . . . . . . . . . . . . . . . . . 128\nBypassing ASLR in Production . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 128\nClosing Thoughts . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 130\nPart 3\nProcess Puppeteering\n7\nCode Injection\t\n133\nInjecting Code Caves with Thread Injection . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 134\nCreating an Assembly Code Cave . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 134\nTranslating the Assembly to Shellcode . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 135\nxii Contents in Detail\nWriting the Code Cave to Memory . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 136\nUsing Thread Injection to Execute the Code Cave . . . . . . . . . . . . . . . . . . . . 137\nHijacking a Game’s Main Thread to Execute Code Caves . . . . . . . . . . . . . . . . . . . . . 138\nBuilding the Assembly Code Cave . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 138\nGenerating Skeleton Shellcode and Allocating Memory . . . . . . . . . . . . . . . . 140\nFinding and Freezing the Main Thread . . . . . . . . . . . . . . . . . . . . . . . . . . . . 141\nInjecting DLLs for Full Control . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 142\nTricking a Process into Loading Your DLL . . . . . . . . . . . . . . . . . . . . . . . . . . . 143\nAccessing Memory in an Injected DLL . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 145\nBypassing ASLR in an Injected DLL . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 146\nClosing Thoughts . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 
. . . . . . . . . . . . . . . . 147\n8\nManipulating Control Flow in a Game\t\n149\nNOPing to Remove Unwanted Code . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 150\nWhen to NOP . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 150\nHow to NOP . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 151\nHooking to Redirect Game Execution . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 153\nCall Hooking . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 153\nVF Table Hooking . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 156\nIAT Hooking . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 160\nJump Hooking . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 165\nApplying Call Hooks to Adobe AIR . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 169\nAccessing the RTMP Goldmine . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 169\nHooking the RTMPS encode() Function . . . . . . . . . . . . . . . . . . . . . . . . . . . . 171\nHooking the RTMPS decode() Function . . . . . . . . . . . . . . . . . . . . . . . . . . . . 172\nPlacing the Hooks . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 173\nApplying Jump Hooks and VF Hooks to Direct3D . . . . . . . . . . . . . . . . . . . . . . . . . . . 175\nThe Drawing Loop . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 176\nFinding the Direct3D Device . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 177\nWriting a Hook for EndScene() . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 182\nWriting a Hook for Reset() . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 183\nWhat’s Next? 
. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 184\nClosing Thoughts . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 185\nPart 4\nCreating Bots\n9\nUsing Extrasensory Perception to Ward Off \nFog of War \t\n189\nBackground Knowledge . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 190\nRevealing Hidden Details with Lighthacks . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 190\nAdding a Central Ambient Light Source . . . . . . . . . . . . . . . . . . . . . . . . . . . 190\nIncreasing the Absolute Ambient Light . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 191\nCreating Other Types of Lighthacks . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 192\nContents in Detail xiii\nRevealing Sneaky Enemies with Wallhacks . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 192\nRendering with Z-Buffering . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 193\nCreating a Direct3D Wallhack . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 194\nFingerprinting the Model You Want to Reveal . . . . . . . . . . . . . . . . . . . . . . . 196\nGetting a Wider Field of Vision with Zoomhacks . . . . . . . . . . . . . . . . . . . . . . . . . . . 197\nUsing NOPing Zoomhacks . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 197\nScratching the Surface of Hooking Zoomhacks . . . . . . . . . . . . . . . . . . . . . . 198\nDisplaying Hidden Data with HUDs . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 198\nCreating an Experience HUD . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 199\nUsing Hooks to Locate Data . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 200\nAn Overview of Other ESP Hacks . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 
201\nClosing Thoughts . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 202\n10\nResponsive Hacks\t\n203\nObserving Game Events . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 204\nMonitoring Memory . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 204\nDetecting Visual Cues . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 205\nIntercepting Network Traffic . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 206\nPerforming In-Game Actions . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 211\nEmulating the Keyboard . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 211\nSending Packets . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 215\nTying the Pieces Together . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 218\nMaking the Perfect Healer . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 218\nResisting Enemy Crowd-Control Attacks . . . . . . . . . . . . . . . . . . . . . . . . . . . . 218\nAvoiding Wasted Mana . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 219\nClosing Thoughts . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 219\n11\nPutting It All Together: \nWriting Autonomous Bots\t\n221\nControl Theory and Game Hacking . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 222\nState Machines . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 223\nCombining Control Theory and State Machines . . . . . . . . . . . . . . . . . . . . . . . . . . . . 225\nA Basic Healer State Machine . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 
225\nA Complex Hypothetical State Machine . . . . . . . . . . . . . . . . . . . . . . . . . . . 228\nError Correction . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 230\nPathfinding with Search Algorithms . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 232\nTwo Common Search Techniques . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 233\nHow Obstacles Disrupt Searches . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 233\nAn A* Search Algorithm . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 234\nWhen A* Searches Are Particularly Useful . . . . . . . . . . . . . . . . . . . . . . . . . 240\nCommon and Cool Automated Hacks . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 241\nLooting with Cavebots . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 241\nAutomating Combat with Warbots . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 243\nClosing Thoughts . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 244\nxiv Contents in Detail\n12\nStaying Hidden\t\n245\nProminent Anti-Cheat Software . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 246\nThe PunkBuster Toolkit . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 246\nSignature-Based Detection . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 246\nScreenshots . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 247\nHash Validation . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 247\nThe ESEA Anti-Cheat Toolkit . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 247\nThe VAC Toolkit . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 
247\nDNS Cache Scans . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 248\nBinary Validation . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 248\nFalse Positives . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 248\nThe GameGuard Toolkit . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 248\nUser-Mode Rootkit . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 248\nKernel-Mode Rootkit . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 249\nThe Warden Toolkit . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 249\nCarefully Managing a Bot’s Footprint . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 250\nMinimizing a Bot’s Footprint . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 250\nMasking Your Footprint . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 251\nTeaching a Bot to Detect Debuggers . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 251\nAnti-Debugging Techniques . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 255\nDefeating Signature-Based Detection . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 256\nDefeating Screenshots . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 258\nDefeating Binary Validation . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 259\nDefeating an Anti-Cheat Rootkit . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 261\nDefeating Heuristics . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 262\nClosing Thoughts . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 
. . . . 263\nIndex\t\n265\nFor e wor d\nNick is great. We first hit it off in all the right and wrong ways, as you can \nimagine. I’ve been in the security field a while; he’s a little younger. I’ve had \nthe schooling, whereas he’s not much for college. I’m a faith guy, and he’s \nnot. The interesting thing is that none of that matters; we’ve had a blast \nanyway. Age, race, gender, degrees—when it comes to gaming, hacking, and \ncoding, no one cares! \nNick gets it done. He’s fun. He’s brilliant. He’s hard working. And prob-\nably most pertinent: he’s one of the rare few who understand the intersec-\ntion of gaming, hacking, and coding. He’s worked in this niche and created \nprofitable bots.\nIn this first-of-its-kind book, Nick walks you through what it means to \npull apart games. He teaches you the software investigation tools and tricks \nof the trade. You’ll learn about game internals, how to pull them apart, and \nhow to modify play. For example, Nick teaches how to avoid anti-cheat so \nthat you can automate play. Wouldn’t it be cool to have your own bot that \ncollects experience, gold, items, and more—all while you’re away?\nEver wonder how the cheaters cheat? Ever wanted to patch or protect \nyour game? Grab a coffee, crack open your laptop, and enjoy.\n \nBlessings to you and yours,\nDr. Jared DeMott\nSecurity Expert & Software Builder\nAck now l e dg m e n t s\nWriting this book was an amazing journey, and I couldn’t have done it \nalone. No Starch Press has been extremely supportive and worked closely \nwith me to take this book from concept to reality. In particular, I’d like to \nthank my developmental editor, Jennifer Griffith-Delgado, and my produc-\ntion editor, Laurel Chun. Bill Pollock, Tyler Ortman, Alison Law, and the \nrest of the team at No Starch are wonderful people, and I’m pleased to have \nworked with them.\nThanks to copyeditor Rachel Monaghan, proofreader Paula L. Fleming, \nand technical reviewer Stephen Lawler. 
Thanks also to my friends Cavitt \n“synt4x” Glover and Vadim Kotov, who took the time to skim some chapters \nbefore submission, and to Jared DeMott for writing the book’s foreword.\nI’d like to thank all of the people on TPForums who took me in when I \nwas just a naive kid and helped me learn how to hack games. In particular, \nI owe my thanks to Joseph “jo3bingham” Bingham, Ian Obermiller, and \njeremic, who all had a significant influence on my progression as a hacker, \nand to TPForums founder Josh “Zyphrus” Hartzell, who helped me find my \nconfidence and skills when my future looked its bleakest. \nThanks also to my entire forum staff and every customer who has ever \nused my bots. And finally, thanks to my family, friends, and colleagues, \nwho have been fun and supportive and helped shape me into the man I am \ntoday.\nIn t rodu c t ion\nA common misconception in the world \nof online gaming is the idea that the only \ngame you can play is the one in the title. \nIn fact, game hackers enjoy playing the game \nthat hides behind the curtain: a cat-and-mouse game \nof wits between them and the game developers. While\ngame hackers work to reverse engineer game binaries, automate aspects \nof game play, and modify gaming environments, game developers combat \nthe hacker-designed tools (normally referred to as bots) using anti-reversing \ntechniques, bot detection algorithms, and heuristic data mining.\nAs the battle between game hackers and developers has progressed, the \ntechnical methods implemented by both parties—many of which resemble \ntechniques utilized by malware developers and antivirus vendors—have \nevolved, becoming more complex. 
This book highlights the fight put up by \ngame hackers, and the advanced methods they have engineered to manipu-\nlate games while simultaneously eluding game developers in the dark cor-\nners of their own software.\nxx Introduction\nAlthough the book focuses on teaching you to develop tools that would \nlikely be considered a nuisance or even malicious by gaming companies, \nyou’ll find that many of the techniques are useful for development of tools \nthat are perfectly benign and neutral. Furthermore, the knowledge of how \nthese techniques are implemented is key for the game developers working \nto prevent their use.\nPrerequisites for the Reader\nThis book does not aim to teach you software development, and therefore \nassumes that you have, at minimum, a solid software development back-\nground. This background should include familiarity with native Windows-\nbased development, as well as light experience with game development and \nmemory management. While these skills will be enough for you to follow \nthis book, experience with x86 assembly and Windows internals will ensure \nthat details of more advanced implementations are not lost on you.\nFurthermore, since all the advanced hacks discussed in this book rely \non code injection, an ability to write code in a native language like C or \nC++ is a must. All of the example code in this book is written in C++ and \ncan be compiled with Microsoft Visual C++ Express Edition. (You can \ndownload MSVC++ Express Edition from http://www.visualstudio.com/en-US/\nproducts/visual-studio-express-vs.)\nN o t e \t\nOther languages that compile to native code, such as Delphi, are also capable of injec-\ntion, but I will not discuss them in this book.\nA Brief Game Hacking History\nSince the dawn of online PC gaming in the early 1980s, an ongoing war of \nwits between game hackers and game developers has been taking place. 
\nThis seemingly endless struggle has prompted game developers to devote \ncountless hours toward preventing hackers from taking their games apart \nand greasing between the gears. These hackers, who fight back with their \nsophisticated stealth implementations, have many motivations: customized \ngraphics, better performance, ease of use, autonomous play, in-game asset \nacquisition, and, of course, real-life profit.\nThe late 1990s and early 2000s were the golden age of game hacking, \nwhen online PC games became advanced enough to draw large crowds \nbut were still simple enough to easily reverse engineer and manipulate. \nOnline games that came out during this time, such as Tibia (January 1997), \nRunescape (January 2001), and Ultima Online (September 1997), were heavily \ntargeted by bot developers. The developers of these games and others like \nthem still struggle today to control the massive communities of bot devel-\nopers and bot users. The game developers’ lack of action and the hackers’ \nIntroduction xxi\ntenacity have not only completely shattered the economies within the games, \nbut have also produced a thriving for-profit industry focused around bot \ndevelopment and bot defense.\nIn the years since the golden age, more mature game companies \nstarted taking bot defense very seriously. These companies now have dedi-\ncated teams focused on developing bot prevention systems, and many also \nview bots as a legal matter and will not hesitate to banish players who use \nbots and sue the bot developers who provided them. As a result, many game \nhackers have been forced to develop advanced stealth techniques to keep \ntheir users safe.\nThis war wages on, and the numbers on both sides of the fight will con-\ntinue to grow as online gaming becomes more prevalent over the coming \nyears. Major game developers are pursuing hackers with endless determina-\ntion, even slamming some game hacking giants with multimillion-dollar \nlawsuits. 
This means that game hackers who are serious about their business \nmust either target smaller gaming companies, or anonymously market their \nproducts from the shadows in order to escape prosecution. For the foresee-\nable future, game hacking and bot development will continue to grow into \na larger and more lucrative industry for those game hackers bold enough to \ntake the risks.\nWhy Hack Games?\nAside from its obvious allure and challenging nature, game hacking has \nsome practical and profitable purposes. Every day, thousands of novice pro-\ngrammers experiment with small-scale game hacking as a way to automate \nmonotonous tasks or perform menial actions. These script kiddies will use \nautomation tools like AutoIt for their small, relatively harmless hacks. On \nthe other hand, professional game hackers, backed by their large toolkits \nand years of programming experience, will devote hundreds of hours to the \ndevelopment of advanced game hacks. These types of game hacks, which \nare the focus of this book, are often created with the intent of making large \namounts of money.\nGaming is a huge industry that generated $22.4 billion in sales in \n2014, according to the Entertainment Software Association. Of the tens \nof millions of players who play games daily, 20 percent play massively multi\nplayer online role-playing games (MMORPGs). These MMORPGs often \nhave thousands of players who trade virtual goods within thriving in-game \neconomies. Players often have a need for in-game assets and are willing to \nbuy these assets with real-world money. Consequently, MMORPG players \nend up developing large communities that provide gold-for-cash services. \nThese services often go as far as enforcing exchange rates from in-game \ngold to real-world currencies.\nTo take advantage of this, game hackers will create bots that are capable \nof automatically farming gold and leveling characters. 
Then, depending \non their goal, hackers will either set up massive gold farms and sell their \nxxii Introduction\nin-game profits, or perfect and sell their software to players who wish to \nseamlessly obtain levels and gold with minimal interference. Due to the \nmassive communities surrounding popular MMORPGs, these game hackers \ncan make between six and seven figures annually.\nWhile MMORPGs provide the largest attack surface for hackers, they \nhave a relatively small audience overall. About 38 percent of gamers favor \nreal-time strategy (RTS) and massive online battle arena (MOBA) games, \nand another 6 percent play primarily first-person shooter (FPS) games. \nThese competitive player versus player (PvP) games collectively represent \n44 percent of the gaming market and provide great rewards to determined \ngame hackers.\nPvP games are often episodic in nature; each match is an isolated game, \nand there’s typically not much profitable progression for botting away from \nkeyboard (AFK). This means that, instead of running gold farms or creat-\ning autonomous bots to level up characters, hackers will create reactive bots \nthat assist players in combat.\nThese highly competitive games are about skill and tactics, and most \nplayers participate to prove their ability to themselves and others. As a \nconsequence, the number of people seeking bots for PvP-type games is \nsubstantially lower than you’d find in the grind-heavy world of MMORPGs. \nNevertheless, hackers can still make a pretty penny selling their PvP bots, \nwhich are often much easier to develop than full-fledged autonomous bots.\nHow This Book Is Organized\nThis book is split into four parts, each of which focuses on a different core \naspect of game hacking. In Part 1: Tools of the Trade, you’ll get a box full \nof tools to help you hack games. \n• \nChapter 1: Scanning Memory Using Cheat Engine will teach you how \nto scan a game’s memory for important values using Cheat Engine. 
\n• \nIn Chapter 2: Debugging Games with OllyDbg, you’ll get a crash \ncourse in debugging and reverse engineering with OllyDbg. The \nskills you learn here will be extremely useful when you start making \nadvanced bots and injecting code. \n• \nTo wrap up, Chapter 3: Reconnaissance with Process Monitor and \nProcess Explorer, will teach you how to use two reconnaissance tools to \ninspect how games interact with files, other processes, the network, and \nthe operating system. \nThe online resources for each chapter in Part 1 include custom binaries \nI created to give you a safe place to test and hone your newly discovered \nskills. \nOnce you’re comfortable with every wrench and hammer, Part 2: Game \nDissection, will teach you how to get under the hood and figure out how \ngames work. \nIntroduction xxiii\n• \nIn Chapter 4: From Code to Memory: A General Primer, you’ll learn \nwhat a game’s source code and data look like once compiled into a \ngame binary. \n• \nChapter 5: Advanced Memory Forensics builds on the knowledge \nyou’ll gain from Chapter 4. You’ll learn how to scan memory and use \ndebugging to seamlessly locate tricky memory values and dissect com-\nplex classes and structures. \n• \nFinally, Chapter 6: Reading from and Writing to Game Memory shows \nyou how to read and modify data within a running game. \nThese chapters provide lots of in-depth proof-of-concept example code \nthat you can use to verify everything you read.\nIn Part 3: Process Puppeteering, you’ll become a puppeteer as you \nlearn how to turn any game into a marionette. \n• \nBuilding on the skills from Parts 1 and 2, Chapter 7: Code Injection \ndescribes how to inject and execute your own code in the address space \nof a game. 
\n• \nOnce you’ve mastered injection, Chapter 8: Manipulating Control Flow \nin a Game will teach you how to use injection to intercept, modify, or \ndisable any function call made by a game, and will wrap up with some \nuseful real-world examples for the common libraries Adobe AIR and \nDirect 3D. \nTo complement your puppeteering classes, these chapters are accompa-\nnied by thousands of lines of production-ready code that you can use as a \nboilerplate library for a future bot.\nIn Part 4: Creating Bots, you’ll see how to combine your toolbox, dis-\nsection abilities, puppeteering skills, and software engineering background \nto create powerful bots. \n• \nChapter 9: Using Extrasensory Perception to Ward Off Fog of War \nexplores ways to make a game display useful information that isn’t \nexposed by default, such as the locations of hidden enemies and the \namount of experience you earn per hour.\n• \nChapter 10: Responsive Hacks shows code patterns you can use to \ndetect in-game events, like decreases in health, and to make bots that \nreact to those events faster than human players.\n• \nChapter 11: Putting It All Together: Writing Autonomous Bots reveals \nhow bots that play games without human interaction work. Automated \nbots combine control theory, state machines, search algorithms, and \nmathematical models, and this chapter is a crash course in those topics.\n• \nIn Chapter 12: Staying Hidden, you’ll learn about some of the high-\nlevel techniques you can use to escape and evade any system that would \ninterfere with your bots. \nxxiv Introduction\nAs you’ve probably come to expect, these chapters have lots of example \ncode. Some of the hacks shown in this part are built on example code from \nprevious chapters. Others explore succinct, straightforward design pat-\nterns you can use to create your own bots. 
Once you’ve finished all four \nparts of this book, you’ll be sent off into the virtual world with your new \nsuperpower.\nAbout the Online Resources\nYou’ll find many additional resources for this book at https://www.nostarch\n.com/gamehacking/. These resources include compiled binaries to test your \nskills, a considerable amount of example code, and quite a few snippets of \nproduction-ready game hacking code. These resources go hand-in-hand \nwith the book, and it really isn’t complete without them, so make sure to \ndownload them before you continue.\nHow to Use This Book\nThis book should be used first and foremost as a guide to get you started \nin game hacking. The progression is such that the content of each chap-\nter introduces new skills and abilities that build on all previous chapters. \nAs you complete chapters, I encourage you to play with the example code \nand test your skills on a real game before continuing your reading. This is \nimportant, as some covered topics will have use cases that don’t become evi-\ndent until you’re 10 feet deep in the mud.\nOnce you’ve finished the book, I hope it can still be useful to you as a \nfield manual. If you come across some data structure you’re unsure of, maybe \nthe details in Chapter 5 can help. If you reverse engineer a game’s map for-\nmat and are ready to create a pathfinder, you can always flip to Chapter 11, \nstudy the content, and use some of the example code as a starting point. \nAlthough it’s impossible to anticipate all the problems you might face when \nyou’re hacking away, I’ve tried to ensure you’ll find some answers within \nthese pages.\nA Note from the Publisher\nThis book does not condone piracy, violating the DMCA, infringing copyright, \nor breaking in-game Terms of Service. 
Game hackers have been banned from \ngames for life, sued for millions of dollars, and even jailed for their work.\nPart 1\nTools of the Trade\n1\nScanning Memory \nUsing Cheat Engine\nThe best game hackers in the world spend \nyears personalizing expansive arsenals with \ncustom-built tools. Such potent toolkits \nenable these hackers to seamlessly analyze \ngames, effortlessly prototype hacks, and effectively \ndevelop bots. At the core, however, each unique kit is \nbuilt from the same four-piece powerhouse: a memory \nscanner, an assembler-level debugger, a process moni-\ntor, and a hex editor. \nMemory scanning is the gateway to game hacking, and this chapter will \nteach you about Cheat Engine, a powerful memory scanner that searches a \ngame’s operating memory (which lives in RAM) for values like the player’s \nlevel, health, or in-game money. First, I’ll focus on basic memory scanning, \nmemory modification, and pointer scanning. Following that, we’ll dive into \nCheat Engine’s powerful embedded Lua scripting engine.\n4 Chapter 1\nN o t e \t\nYou can grab Cheat Engine from http://www.cheatengine.org/. Pay attention \nwhen running the installer because it will try to install some toolbars and other bloat-\nware. You can disable those options if you wish.\nWhy Memory Scanners Are Important\nKnowing a game’s state is paramount to interacting with the game intel-\nligently, but unlike humans, software can’t determine the state of a game \nsimply by looking at what’s on the screen. Fortunately, underneath all of \nthe stimuli produced by a game, a computer’s memory contains a purely \nnumeric representation of that game’s state—and programs can under-\nstand numbers easily. Hackers use memory scanners to find those values in \nmemory, and then in their programs, they read the memory in these loca-\ntions to understand the game’s state. 
\nFor example, a program that heals players when they fall below 500 \nhealth needs to know how to do two things: track a player’s current health \nand cast a healing spell. The former requires access to the game’s state, \nwhile the latter might only require a button to be pressed. Given the loca-\ntion where a player’s health is stored and the way to read a game’s memory, \nthe program would look something like this pseudocode:\n// do this in some loop\nhealth = readMemory(game, HEALTH_LOCATION)\nif (health < 500)\n pressButton(HEAL_BUTTON)\nA memory scanner allows you to find HEALTH_LOCATION so that your soft-\nware can query it for you later.\nBasic Memory Scanning\nThe memory scanner is the most basic, yet most important, tool for the \naspiring game hacker. As in any program, all data in the memory of a game \nresides at an absolute location called a memory address. If you think of the \nmemory as a very large byte array, a memory address is an index pointing \nto a value in that array. When a memory scanner is told to find some value \nx (called a scan value, because it’s the value you’re scanning for) in a game’s \nmemory, the scanner loops through the byte array looking for any value \nequal to x. Every time it finds a matching value, it adds the index of the \nmatch to a result list.\nDue to the sheer size of a game’s memory, however, the value of x can \nappear in hundreds of locations. Imagine that x is the player’s health, which \nis currently 500. Our x uniquely holds 500, but 500 is not uniquely held by \nx, so a scan for x returns all variables with a value of 500. Any addresses not \nrelated to x are ultimately clutter; they share a value of 500 with x only by \nScanning Memory Using Cheat Engine 5\nchance. 
To filter out these unwanted values, the memory scanner allows you \nto rescan the result list, removing addresses that no longer hold the same \nvalue as x, whether x is still 500 or has changed.\nFor these rescans to be effective, the overall state of the game must \nhave significant entropy—a measure of disorder. You increase entropy by \nchanging the in-game environment, often by moving around, killing crea-\ntures, or switching characters. As entropy increases, unrelated addresses are \nless likely to continue to arbitrarily hold the same value, and given enough \nentropy, a few rescans should filter out all false positives and leave you with \nthe true address of x.\nCheat Engine’s Memory Scanner\nThis section gives you a tour of Cheat Engine’s memory-scanning options, \nwhich will help you track down the addresses of game state values in mem-\nory. I’ll give you a chance to try the scanner out in “Basic Memory Editing” \non page 11; for now, open Cheat Engine and have a look around. The \nmemory scanner is tightly encapsulated in its main window, as shown in \nFigure 1-1.\n\u001f\n\u001e\n\u001d\n\u001c\n\u001b\n\u001a\nFigure 1-1: Cheat Engine main screen\nTo begin scanning a game’s memory, click the Attach icon u to attach \nto a process and then enter the scan value (referred to as x in our concep-\ntual scanner) you want to locate w. By attaching to a process, we’re telling \n6 Chapter 1\nCheat Engine to prepare to operate on it; in this case, that operation is a \nscan. It helps to also tell Cheat Engine what kind of scan to run, as I’ll dis-\ncuss next.\nScan Types\nCheat Engine allows you to select two different scan directives, called Scan \nType and Value Type x. Scan Type tells the scanner how to compare your \nscan value with the memory being scanned using one of the following scan \ntypes:\nExact Value Returns addresses pointing to values equal to the \nscan value. 
Choose this option if the value you are looking for won’t \nchange during the scan; health, mana, and level typically fall into this \ncategory.\nBigger Than Returns addresses pointing to values greater than the \nscan value. This option is useful when the value you’re searching for is \nsteadily increasing, which often happens with timers.\nSmaller Than Returns addresses pointing to values smaller than the \nscan value. Like Bigger Than, this option is useful for finding timers \n(in this case, ones that count down rather than up).\nValue Between Returns addresses pointing to values within a scan \nvalue range. This option combines Bigger Than and Smaller Than, \ndisplaying a secondary scan value box that allows you to input a much \nsmaller range of values.\nUnknown Initial Value Returns all addresses in a program’s memory, \nallowing rescans to examine the entire address range relative to their \ninitial values. This option is useful for finding item or creature types, \nsince you won’t always know the internal values the game developers \nused to represent these objects.\nThe Value Type directive tells the Cheat Engine scanner what type of \nvariable it’s searching for. \nRunning Your First Scan\nOnce the two scan directives are set, click First Scan v to run an initial \nscan for values, and the scanner will populate the results list y. Any green \naddresses in this list are static, meaning that they should remain persistent \nacross program restarts. Addresses listed in black reside in dynamically allo-\ncated memory, memory that is allocated at runtime. \nWhen the results list is first populated, it shows the address and real-\ntime value of each result. Each rescan will also show the value of each result \nduring the previous scan. 
(Any real-time values displayed are updated at \nan interval that you can set in Edit ▸ Settings ▸ General Settings ▸ Update \ninterval.)\nScanning Memory Using Cheat Engine 7\nNext Scans\nOnce the results list is populated, the scanner enables the Next Scan v but-\nton, which offers six new scan types. These additional scan types allow you \nto compare the addresses in the results list to their values in the previous \nscan, which will help you narrow down which address holds the game state \nvalue you’re scanning for. They are as follows:\nIncreased Value Returns addresses pointing to values that have \nincreased. This complements the Bigger Than scan type by keeping \nthe same minimum value and removing any address whose value has \ndecreased.\nIncreased Value By Returns addresses pointing to values that have \nincreased by a defined amount. This scan type usually returns far fewer \nfalse positives, but you can use it only when you know exactly how much \na value has increased.\nDecreased Value This option is the opposite of Increased Value.\nDecreased Value By This option is the opposite of Increased Value By.\nChanged Value Returns addresses pointing to values that have \nchanged. This type is useful when you know a value will mutate, but \nyou’re unsure how.\nUnchanged Value Returns addresses pointing to values that haven’t \nchanged. This can help you eliminate false positives, since you can eas-\nily create a large amount of entropy while ensuring the desired value \nstays the same.\nYou’ll usually need to use multiple scan types in order to narrow down \na large result list and find the correct address. 
Eliminating false positives is \noften a matter of properly creating entropy (as described in “Basic Memory \nScanning” on page 4), tactically changing your scan directives, bravely \npressing Next Scan, and then repeating the process until you have a single \nremaining address.\nWhen You Can’t Get a Single Result\nSometimes it is impossible to pinpoint a single result in Cheat Engine, in \nwhich case you must determine the correct address through experimenta-\ntion. For example, if you’re looking for your character’s health and can’t \nnarrow it down to fewer than five addresses, you could try modifying the \nvalue of each address (as discussed in “Manual Modification with Cheat \nEngine” on page 8) until you see the health display change or the other \nvalues automatically change to the one you set.\nCheat Tables\nOnce you’ve found the correct address, you can double-click it to add it to \nthe cheat table pane z; addresses in the cheat table pane can be modified, \nwatched, and saved to cheat table files for future use.\n8 Chapter 1\nFor each address in the cheat table pane, you can add a description by \ndouble-clicking the Description column, and you can add a color by right-\nclicking and selecting Change Color. You can also display the values of each \naddress in hexadecimal or decimal format by right-clicking and selecting \nShow as hexadecimal or Show as decimal, respectively. Lastly, you can \nchange the data type of each value by double-clicking the Type column, \nor you can change the value itself by double-clicking the Value column.\nSince the main purpose of the cheat table pane is to allow a game \nhacker to neatly track addresses, it can be dynamically saved and loaded. \nGo to File ▸ Save or File ▸ Save As to save the current cheat table pane to \na .ct document file containing each address with its value type, description, \ndisplay color, and display format. To load the saved .ct documents, go to \nFile ▸ Load. 
(You’ll find many ready-made cheat tables for popular games \nat http://cheatengine.org/tables.php.)\nNow that I’ve described how to scan for a game state value, I’ll discuss \nhow you can change that value when you know where it lives in memory.\nMemory Modification in Games\nBots cheat a game system by modifying memory values in the game’s state \nin order to give you lots of in-game money, modify your character’s health, \nchange your character’s position, and so on. In most online games, a char-\nacter’s vitals (such as health, mana, skills, and position) are held in memory \nbut are controlled by the game server and relayed to your local game client \nover the Internet, so modifying such values during online play is merely \ncosmetic and doesn’t affect the actual values. (Any useful memory modifica-\ntion to an online game requires a much more advanced hack that’s beyond \nCheat Engine’s capabilities.) In local games with no remote server, however, \nyou can manipulate all of these values at will. \nManual Modification with Cheat Engine\nWe’ll use Cheat Engine to understand how the memory modification magic \nworks.\nTo modify memory manually, do the following: \n1.\t Attach Cheat Engine to a game.\n2.\t Either scan for the address you wish to modify or load a cheat table that \ncontains it. 
\n3.\t Double-click on the Value column for the address to open an input \nprompt where you can enter a new value.\n4.\t If you want to make sure the new value can’t be overwritten, select the \nbox under the Active column to freeze the address, which will make \nCheat Engine keep writing the same value back to it every time it \nchanges.\nScanning Memory Using Cheat Engine 9\nThis method works wonders for quick-and-dirty hacks, but constantly \nchanging values by hand is cumbersome; an automated solution would be \nmuch more appealing.\nTrainer Generator\nCheat Engine’s trainer generator allows you to automate the whole memory \nmodification process without writing any code. \nTo create a trainer (a simple bot that binds memory modification actions \nto keyboard hotkeys), go to File ▸ Create generic trainer Lua script from \ntable. This opens a Trainer generator dialog similar to the one shown in \nFigure 1-2.\nFigure 1-2: Cheat Engine Trainer generator dialog\nThere are a number of fields to modify here:\nProcessname The name of the executable the trainer should attach \nto. This is the name shown in the process list when you attach with \nCheat Engine, and it should be autofilled with the name of the pro-\ncess Cheat Engine is attached to.\nPopup trainer on keypress Optionally enables a hotkey—which you \nset by entering a key combination in the box below the checkbox—to \ndisplay the trainer’s main window.\nTitle The name of your trainer, which will be displayed on its inter-\nface. This is optional.\nAbout text The description of your trainer, to be displayed on the \ninterface; this is also optional.\nFreeze interval (in milliseconds) The interval during which a freeze \noperation overwrites the value. You should generally leave this at 250, as \nlower intervals can sap resources and higher values may be too slow.\n10 Chapter 1\nOnce these values are configured, click Add Hotkey to set up a key \nsequence to activate your trainer. 
You will be prompted to select a value \nfrom your cheat table. Enter a value, and you will be taken to a Set/Change \nhotkey screen similar to Figure 1-3.\n\u001f\n\u001e\n\u001d\n\u001c\nFigure 1-3: Cheat Engine Set/Change hotkey screen\nOn this screen, place your cursor in the box labeled Type the keys you \nwant to set the hotkey to u and enter the desired key combination. Next, \nchoose the desired action from the drop-down menu v; your options \nshould appear in the following order:\nToggle freeze Toggles the freeze state of the address.\nToggle freeze and allow increase Toggles the freeze state of the \naddress but allows the value to increase. Any time the value decreases, \nthe trainer overwrites it with its previous value. Increased values will \nnot be overwritten.\nToggle freeze and allow decrease Does the opposite of Toggle freeze \nand allow increase.\nFreeze Sets the address to frozen if it’s not frozen already.\nUnfreeze Unfreezes the address if it’s frozen.\nSet value to Sets the value to whatever you specify in the value box w.\nDecrease value with Decreases the value by the amount you specify in \nthe value box w.\nIncrease value with Does the opposite of Decrease value with.\nFinally, you can set a description for the action x. Click Apply, then \nOK, and your action will appear in the list on the Trainer generator \nscreen. At this point, Cheat Engine runs the trainer in the background, \nand you can simply press the hotkeys you configured to execute the mem-\nory actions. \nScanning Memory Using Cheat Engine 11\nTo save your trainer to a portable executable, click Generate trainer. 
\nRunning this executable after the game is launched will attach your trainer \nto the game so you can use it without starting Cheat Engine.\nNow that you know your way around Cheat Engine’s memory scanner \nand trainer generator, try modifying some memory yourself.\nPointer Scanning\nAs I’ve mentioned, online games often store values in dynamically allocated \nmemory. While addresses that reference dynamic memory are useless to us in \nand of themselves, some static address will always point to another address, \nwhich in turn points to another, and so on, until the tail of the chain points \nto the dynamic memory we’re interested in. Cheat Engine can locate these \nchains using a method called pointer scanning. \nIn this section, I’ll introduce you to pointer chains and then describe \nhow pointer scanning works in Cheat Engine. When you have a good grasp \nof the user interface, you can get some hands-on experience in “Pointer \nScanning” on page 18.\nPointer Chains\nThe chain of offsets I’ve just described is called a pointer chain and looks \nlike this:\nlist chain = {start, offset1, offset2[, ...]}\nThe first value in this pointer chain (start) is called a memory pointer. It’s \nan address that starts the chain. The remaining values (offset1, offset2, and \nso on) make up the route to the desired value, called a pointer path. \nBasic Memory Editing\nDownload the files for this book from https://www.nostarch.com/gamehacking/, \nand run the file BasicMemory.exe. Next, start up Cheat Engine and attach to \nthe binary. Then, using only Cheat Engine, find the addresses for the x- and \ny-coordinates of the gray ball. (Hint: Use the 4 Bytes value type.) \nOnce you’ve found the values, modify them to place the ball on top of the \nblack square. The game will let you know once you’ve succeeded by display-\ning the text “Good job!” (Hint: Each time the ball is moved, its position—stored \nas a 4-byte integer—in that plane is changed by 1. 
Also, try to look only for \nstatic [green] results.)\n12 Chapter 1\nThis pseudocode shows how a pointer chain might be read:\nint readPointerPath(chain) {\nu ret = read(chain[0])\n for i = 1, chain.len - 1, 1 {\n offset = chain[i]\n ret = read(ret + offset)\n }\n return ret\n}\nThis code creates the function readPointerPath(), which takes a pointer \nchain called chain as a parameter. The function readPointerPath() treats the \npointer path in chain as a list of memory offsets from the address ret, which \nis initially set to the memory pointer at u. It then loops through these off-\nsets, updating the value of ret to the result of read(ret + offset) on each \niteration and returning ret once it’s finished. This pseudocode shows what \nreadPointerPath() looks like when the loop is unrolled:\nlist chain = {0xDEADBEEF, 0xAB, 0x10, 0xCC}\nvalue = readPointerPath(chain)\n// the function call unrolls to this\nret = read(0xDEADBEEF) //chain[0]\nret = read(ret + 0xAB)\nret = read(ret + 0x10)\nret = read(ret + 0xCC)\nint value = ret\nThe function ultimately calls read four times, on four different \naddresses—one for each element in chain. \nN o t e \t\nMany game hackers prefer to code their chain reads in place, instead of encapsulating \nthem in functions like readPointerPath().\nPointer Scanning Basics\nPointer chains exist because every chunk of dynamically allocated memory \nmust have a corresponding static address that the game’s code can use to \nreference it. Game hackers can access these chunks by locating the pointer \nchains that reference them. Because of their multitier structure, however, \npointer chains cannot be located through the linear approach that memory \nscanners use, so game hackers have devised new ways to find them.\nFrom a reverse engineering perspective, you could locate and analyze \nthe assembly code in order to deduce what pointer path it used to access the \nvalue, but doing so is very time-consuming and requires advanced tools. 
\nPointer scanners solve this problem by using brute-force to recursively iterate \nover every possible pointer chain until they find one that resolves to the tar-\nget memory address. \nScanning Memory Using Cheat Engine 13\nThe Listing 1-1 pseudocode should give you a general idea of how a \npointer scanner works.\nlist pointerScan(target, maxAdd, maxDepth) {\nu for address = BASE, 0x7FFFFFF, 4 {\n ret = rScan(address, target, maxAdd, maxDepth, 1)\n if (ret.len > 0) {\n ret.pushFront(address)\n return ret\n }\n }\n return {}\n}\nlist rScan(address, target, maxAdd, maxDepth, curDepth) {\nv for offset = 0, maxAdd, 4 {\n value = read(address + offset)\nw if (value == target)\n return list(offset)\n }\nx if (curDepth < maxDepth) {\n curDepth++\ny for offset = 0, maxAdd, 4 {\n ret = rScan(address + offset, target, maxAdd, maxDepth, curDepth)\nz if (ret.len > 0) {\n ret.pushFront(offset)\n{ return ret\n }\n }\n }\n return {}\n}\nListing 1-1: Pseudocode for a pointer scanner\nThis code creates the functions pointerScan() and rScan(). \npointerScan()\nThe pointerScan() function is the entry point to the scan. It takes the param-\neters target (the dynamic memory address to find), maxAdd (the maximum \nvalue of any offset), and maxDepth (the maximum length of the pointer path). \nIt then loops through every 4-byte aligned address u in the game, calling \nrScan() with the parameters address (the address in the current iteration), \ntarget, maxAdd, maxDepth, and curDepth (the depth of the path, which is always 1 \nin this case).\nrScan()\nThe rScan() function reads memory from every 4-byte aligned offset between \n0 and maxAdd v, and returns if a result is equal to target w. If rScan() doesn’t \nreturn in the first loop and the recursion is not too deep x, it increments \ncurDepth and again loops over each offset y, calling itself for each iteration. 
\n14 Chapter 1\nIf a self call returns a partial pointer path z, rScan() will prepend the \ncurrent offset to the path and return up the recursion chain { until it \nreaches pointerScan(). When a call to rScan() from pointerScan() returns a \npointer path, pointerScan() pushes the current address to the front of the \npath and returns it as a complete chain.\nPointer Scanning with Cheat Engine\nThe previous example showed the basic process of pointer scanning, but \nthe implementation I’ve shown is primitive. Aside from being insanely \nslow to execute, it would generate countless false positives. Cheat Engine’s \npointer scanner uses a number of advanced interpolations to speed up the \nscan and make it more accurate, and in this section, I’ll introduce you to \nthe smorgasbord of available scanning options.\nTo initiate a pointer scan in Cheat Engine, right-click on a dynamic \nmemory address in your cheat table and click Pointer scan for this address. \nWhen you initiate a pointer scan, Cheat Engine will ask you where to store \nthe scan results as a .ptr file. Once you enter a location, a Pointerscanner \nscanoptions dialog similar to the one shown in Figure 1-4 will appear.\nFigure 1-4: Cheat Engine Pointerscanner scanoptions dialog\nScanning Memory Using Cheat Engine 15\nThe Address to find input field at the top displays your dynamic mem-\nory address. Now carefully select from among Cheat Engine’s many scan \noptions.\nKey Options\nSeveral of Cheat Engine’s scan options typically retain their default values. \nThose options are as follows:\nAddresses must be 32-bits aligned Tells Cheat Engine to scan only \naddresses that are multiples of 4, which greatly increases the scan \nspeed. As you’ll learn in Chapter 4, compilers align data so that most \naddresses will be multiples of 4 anyway by default. 
You’ll rarely need to \ndisable this option.\nOnly find paths with a static address Speeds up the scan by prevent-\ning Cheat Engine from searching paths with a dynamic start pointer. \nThis option should always be enabled because scanning for a path start-\ning at another dynamic address can be counterproductive.\nDon’t include pointers with read-only nodes Should also always be \nenabled. Dynamically allocated memory that stores volatile data should \nnever be read-only.\nStop traversing a path when a static has been found Terminates the \nscan when it finds a pointer path with a static start address. This should \nbe enabled to reduce false positives and speed up the scan.\nPointer path may only be inside this region Can typically be left as is. \nThe other options available to you compensate for this large range by \nintelligently narrowing the scope of the scan.\nFirst element of pointerstruct must point to module Tells Cheat \nEngine not to search heap chunks in which virtual function tables are \nnot found, under the assumption that the game was coded using object \norientation. While this setting can immensely speed up scans, it’s highly \nunreliable and you should almost always leave it disabled.\nNo looping pointers Invalidates any paths that point to themselves, \nweeding out inefficient paths but slightly slowing down the scan. This \nshould usually be enabled.\nMax level Determines the maximum length of the pointer path. \n(Remember the maxDepth variable in the example code in Listing 1-1?) \nThis should be kept around 6 or 7.\nOf course, there will be times when you’ll need to change these options \nfrom the settings described. For example, failing to obtain reliable results \nwith the No looping pointers or Max level settings typically means that the \nvalue you’re looking for exists in a dynamic data structure, like a linked list, \nbinary tree, or vector. 
Another example is the Stop traversing a path when a \nstatic has been found option, which in rare cases can prevent you from get-\nting reliable results.\n16 Chapter 1\nSituational Options\nUnlike the previous options, your settings for the remaining ones will depend \non your situation. Here’s how to determine the best configuration for each:\nImprove pointerscan with gathered heap data Allows Cheat Engine \nto use the heap allocation record to determine offset limits, effectively \nspeeding up the scan by weeding out many false positives. If you run \ninto a game using a custom memory allocator (which is becoming \nincreasingly common), this option can actually do the exact opposite \nof what it’s meant to do. You can leave this setting enabled in initial \nscans, but it should be the first to go when you’re unable to find reli-\nable paths.\nOnly allow static and heap addresses in the path Invalidates all paths \nthat can’t be optimized with heap data, making this approach even \nmore aggressive.\nMax different offsets per node Limits the number of same-value \npointers the scanner checks. That is, if n different addresses point to \n0x0BADF00D, this option tells Cheat Engine to consider only the first m \naddresses. This can be extremely helpful when you’re unable to narrow \ndown your result set. In other cases, you may want to disable it, as it will \nmiss many valid paths.\nAllow stack addresses of the first thread(s) to be handled as static \nScans the call stacks of oldest m threads in the game, considering the \nfirst n bytes in each one. This allows Cheat Engine to scan the param-\neters and local variables of functions in the game’s call chain (the goal \nbeing to find variables used by the game’s main loop). 
The paths found \nwith this option can be both highly volatile and extremely useful; I use \nit only when I fail to find heap addresses.\nStack addresses as only static address Takes the previous option even \nfurther by allowing only stack addresses in pointer paths.\nPointers must end with specific offsets Can be useful if you know the \noffset(s) at the end of a valid path. This option will allow you to specify \nthose offsets (starting with the last offset at the top), greatly reducing \nthe scope of the scan.\nNr of threads scanning Determines how many threads the scanner \nwill use. A number equal to the number of cores in your processor \noften works best. A drop-down menu with options allows you to specify \nthe priority for each thread. Idle is best if you want your scan to go very \nslowly, Normal is what you should use for most scans, and Time critical \nis useful for lengthy scans but will render your computer useless for the \nscan duration.\nMaximum offset value Determines the maximum value of each offset \nin the path. (Remember the maxAdd variable in Listing 1-1?) I typically \nstart with a low value, increasing it only if my scan fails; 128 is a good \nstarting value. Keep in mind that this value is mostly ignored if you’re \nusing the heap optimization options.\nScanning Memory Using Cheat Engine 17\nN o t e \t\nWhat if both Only allow static and heap addresses in the path and Stack addresses as \nonly static address are enabled? Will the scan come up empty? Seems like a fun, albeit \nuseless, experiment.\nOnce you have defined your scan options, click OK to start a pointer \nscan. When the scan completes, a results window will appear with the list of \npointer chains found. This list often has thousands of results, containing \nboth real chains and false positives.\nPointer Rescanning\nThe pointer scanner has a rescan feature that can help you eliminate false \npositives. 
To begin, press ctrl-R from the results window to open the \nRescan pointerlist dialog, as shown in Figure 1-5.\n➊\n➋\nFigure 1-5: Cheat Engine Rescan pointerlist \ndialog\nThere are two main options to consider when you tell Cheat Engine to \nrescan:\nOnly filter out invalid pointers If you check this box ➊, the rescan \nwill discard only pointer chains that point to invalid memory, which \nhelps if your initial result set is very large. Disable this to filter out paths \nthat don’t resolve to a specific address or value (as shown in the figure).\nRepeat rescan until stopped If you check this box ➋, the rescan will \nexecute in a continuous loop. Ideally, you should enable this setting and \nlet rescan run while you create a large amount of memory entropy.\nFor the initial rescan, enable both Only filter out invalid pointers and \nRepeat rescan until stopped, and then press OK to initiate the rescan. The \nrescan window will go away, and a Stop rescan loop button will appear in \nthe results window. The result list will be constantly rescanned until you click \nStop rescan loop, but spend a few minutes creating memory entropy before \ndoing so.\n18 Chapter 1\nIn rare cases, rescanning using a rescan loop may still leave you with \na large list of possible paths. When this happens, you may need to restart \nthe game, find the address that holds your value (it may have changed!), \nand use the rescan feature on this address to further narrow results. In this \nscan, leave Only filter out invalid pointers unchecked and enter the new \naddress in the Address to find field.\nNote \t\nIf you had to close the results window, you can reopen it and load the result list by \ngoing to the main Cheat Engine window and pressing the Memory View button below \nthe results pane. This should bring up a memory dump window. When the window \nappears, press ctrl-P to open the pointer scan results list. 
Then press ctrl-O to open \nthe .ptr file where you saved the pointer scan.\nIf your results still aren’t narrow enough, try running the same scan \nacross system restarts or even on different systems. If this still yields a large \nresult set, each result can safely be considered static because more than one \npointer chain can resolve to the same address.\nOnce you’ve narrowed down your result set, double-click on a usable \npointer chain to add it to your cheat table. If you have a handful of seem-\ningly usable chains, grab the one with the fewest offsets. If you find multiple \nchains with identical offsets that start with the same pointer but diverge \nafter a certain point, your data may be stored in a dynamic data structure.\nThat’s all there is to pointer scanning in Cheat Engine. Try it yourself!\nLua Scripting Environment\nHistorically, bot developers rarely used Cheat Engine to update their \naddresses when a game released a patch because it was much easier to \ndo so in OllyDbg. This made Cheat Engine useless to game hackers other \nPointer Scanning\nGo to https://www.nostarch.com/gamehacking/ and download MemoryPointers\n.exe. Unlike the last task, which required you to win only once, this one requires \nthat you win 50 times in 10 seconds. Upon each win, the memory addresses \nfor the x- and y-coordinates will change, meaning you will be able to freeze \nthe value only if you have found a proper pointer path. Start this exercise the \nsame way as the previous one, but once you’ve found the addresses, use the \nPointer scan feature to locate pointer paths to them. Then, place the ball on \ntop of the black square, freeze the value in place, and press tab to begin the \ntest. Just as before, the game will let you know once you’ve won. (Hint: Try set-\nting the maximum level to 5 and the maximum offset value to 512. 
Also, play \nwith the options to allow stack addresses, terminate the scan when a static is \nfound, and improve the pointer scan with heap data. See which combination \nof options gives the best results.)\nScanning Memory Using Cheat Engine 19\nthan for initial research and development—that is, until a powerful Lua-\nbased embedded scripting engine was implemented around Cheat Engine’s \nrobust scanning environment. While this engine was created to enable the \ndevelopment of simple bots within Cheat Engine, professional game hack-\ners found they could also use it to easily write complex scripts to automati-\ncally locate addresses across different versions of a game’s binary—a task \nthat might otherwise take hours.\nN o t e \t\nYou’ll find more detail about the Cheat Engine Lua scripting engine on the wiki at \nhttp://wiki.cheatengine.org/. \nTo start using the Lua engine, press ctrl-alt-L from the main Cheat \nEngine window. Once the window opens, write your script in the text area \nand click Execute script to run it. Save a script with ctrl-S and open a \nsaved script with ctrl-O.\nThe scripting engine has hundreds of functions and infinite use cases, \nso I’ll give you just a glimpse of its abilities by breaking down two scripts. \nEvery game is different and every game hacker writes scripts to accomplish \nunique goals, so these scripts are only useful for demonstrating concepts.\nSearching for Assembly Patterns\nThis first script locates functions that compose outgoing packets and sends \nthem to the game server. 
It works by searching a game’s assembly code for \nfunctions that contain a certain code sequence.\nu BASEADDRESS = getAddress(\"Game.exe\")\nv function LocatePacketCreation(packetType)\nw for address = BASEADDRESS, (BASEADDRESS + 0x2ffffff) do\n local push = readBytes(address, 1, false)\n local type = readInteger(address + 1)\n local call = readInteger(address + 5)\nx if (push == 0x68 and type == packetType and call == 0xE8) then\n return address\n end\n end\n return 0\nend\nFUNCTIONHEADER = { 0xCC, 0x55, 0x8B, 0xEC, 0x6A }\ny function LocateFunctionHead(checkAddress)\n if (checkAddress == 0) then return 0 end\nz for address = checkAddress, (checkAddress - 0x1fff), -1 do\n local match = true\n local checkheader = readBytes(address, #FUNCTIONHEADER, true)\n{ for i, v in ipairs(FUNCTIONHEADER) do\n if (v ~= checkheader[i]) then\n match = false\n break\n end\n end\n| if (match) then return address + 1 end\n end\n20 Chapter 1\n return 0\nend\n} local funcAddress = LocateFunctionHead(LocatePacketCreation(0x64))\nif (funcAddress ~= 0) then\n print(string.format(\"0x%x\",funcAddress))\nelse\n print(\"Not found!\")\nend\nThe code begins by getting the base address of the module that \nCheat Engine is attached to u. Once it has the base address, the function \nLocatePacketCreation() is defined v. This function loops through the first \n0x2FFFFFF bytes of memory in the game w, searching for a sequence that \nrepresents this x86 assembler code:\nPUSH type ; Data is: 0x68 [4byte type]\nCALL offset ; Data is: 0xE8 [4byte offset]\nThe function checks that the type is equal to packetType, but it doesn’t \ncare what the function offset is x. Once this sequence is found, the func-\ntion returns.\nNext, the LocateFunctionHead() function is defined y. 
The function back-\ntracks up to 0x1FFF bytes from a given address z, and at each address, it \nchecks for a stub of assembler code { that looks something like this:\nINT3 ; 0xCC\nPUSH EBP ; 0x55\nMOV EBP, ESP ; 0x8B 0xEC\nPUSH [-1] ; 0x6A 0xFF\nThis stub will be present at the beginning of every function, because it’s \npart of the function prologue that sets up the function’s stack frame. Once \nit finds the code, the function will return the address of the stub plus 1 | \n(the first byte, 0xCC, is padding).\nTo tie these steps together, the LocatePacketCreation() function is called \nwith the packetType that I’m looking for (arbitrarily 0x64) and the resulting \naddress is passed into the LocateFunctionHead() function }. This effectively \nlocates the first function that pushes packetType into a function call and \nstores its address in funcAddress. This stub shows the result:\nINT3 ; LocateFunctionHead back-tracked to here\nPUSH EBP ; and returned this address\nMOV EBP, ESP\nPUSH [-1]\n--snip-- \nPUSH [0x64] ; LocatePacketCreation returned this address\nCALL [something]\nScanning Memory Using Cheat Engine 21\nThis 35-line script can automatically locate 15 different functions in \nunder a minute.\nSearching for Strings\nThis next Lua script scans a game’s memory for text strings. It works much as \nthe Cheat Engine’s memory scanner does when you use the string value type.\nBASEADDRESS = getAddress(\"Game.exe\")\nu function findString(str)\n local len = string.len(str)\nv local chunkSize = 4096\nw local chunkStep = chunkSize - len\n print(\"Found '\" .. str .. 
\"' at:\")\nx for address = BASEADDRESS, (BASEADDRESS + 0x2ffffff), chunkStep do\n local chunk = readBytes(address, chunkSize, true)\n if (not chunk) then break end\ny for c = 0, chunkSize-len do\nz checkForString(address , chunk, c, str, len)\n end\n end\nend\nfunction checkForString(address, chunk, start, str, len)\n for i = 1, len do\n if (chunk[start+i] ~= string.byte(str, i)) then\n return false\n end\n end\n{ print(string.format(\"\\t0x%x\", address + start))\nend\n| findString(\"hello\")\n} findString(\"world\")\nAfter getting the base address, the findString() function is defined u, \nwhich takes a string, str, as a parameter. This function loops through the \ngame’s memory x in 4,096-byte-long chunks v. The chunks are scanned \nsequentially, each one starting len (the length of str) bytes before the end \nof the previous one w to prevent missing a string that begins on one chunk \nand ends on another.\nAs findString() reads each chunk, it iterates over every byte until the over-\nlap point in the chunk y, passing each subchunk into the checkForString() \nfunction z. If checkForString() matches the subchunk to str, it prints the \naddress of that subchunk to the console {.\nLastly, to find all addresses that reference the strings \"hello\" and \n\"world\", the functions findString(\"hello\") | and findString(\"world\") } are \ncalled. By using this code to search for embedded debug strings and pairing \nit with the previous code to locate function headers, I’m able to find a large \nnumber of internal functions within a game in mere seconds.\n22 Chapter 1\nClosing Thoughts\nBy this point, you should have a basic understanding of Cheat Engine \nand how it works. 
Cheat Engine is a very important tool in your kit, and I \nencourage you to get some hands-on experience with it by following “Basic \nMemory Editing” on page 11 and “Pointer Scanning” on page 18 and \nplaying around with it on your own.\nOptimizing Memory Code\nDue to the high overhead of memory reading, optimization is extremely impor-\ntant when you’re writing code that performs memory reads. In the previous code \nsnippet, notice that the function findString() does not use the Lua engine’s built-\nin readString() function. Instead, it reads big chunks of memory and searches \nthem for the desired string. Let’s break down the numbers.\nA scan using readString() would try to read a string of len bytes at every \npossible memory address. This means it would read, at most, (0x2FFFFFF * len \n+ len) bytes. However, findString() reads chunks of 4,096 bytes and scans \nthem locally for matching strings. This means it would read, at most, (0x2FFFFFF \n+ 4096 + (0x2FFFFFF / (4096 - 10)) * len) bytes. When searching for a \nstring with a length of 10, the number of bytes that each method would read is \n503,316,480 and 50,458,923, respectively.\nNot only does findString() read an order of magnitude less data, it also \ninvokes far fewer memory reads. Reading in chunks of 4,096 bytes would \nrequire a total of (0x2FFFFFF / (4096 - len)) reads. Compare that to a scan \nusing readString(), which would need 0x2FFFFFF reads. The scan that uses \nfindString() is a huge improvement because invoking a read is much more \nexpensive than increasing the size of data being read. (Note that I chose \n4,096 arbitrarily. 
I keep the chunk relatively small because reading memory \ncan be time-consuming, and it might be wasteful to read four pages at a time \njust to find the string in the first.)\n2\nDebugging Games \nwith OllyDbg\nYou can scratch the surface of what hap-\npens as a game runs with Cheat Engine, \nbut with a good debugger, you can dig \ndeeper until you understand the game’s \nstructure and execution flow. That makes OllyDbg \nessential to your game-hacking arsenal. It’s packed \nwith a myriad of powerful tools like conditional breakpoints, referenced \nstring search, assembly pattern search, and execution tracing, making it a \nrobust assembler-level debugger for 32-bit Windows applications.\nI’ll cover low-level code structure in detail in Chapter 4, but for this \nchapter, I assume you’re at least familiar with modern code-level debug-\ngers, such as the one packaged with Microsoft Visual Studio. OllyDbg is \nfunctionally similar to those, with one major difference: it interfaces with \n24 Chapter 2\nthe assembly code of an application, working even in the absence of source \ncode and/or debug symbols, making it ideal when you need to dig into the \ninternals of a game. After all, game companies are rarely nice (or dumb) \nenough to ship their games with debug symbols!\nIn this chapter, I’ll go over OllyDbg’s user interface, show you how to \nuse its most common debugging features, break down its expression engine, \nand provide some real-world examples of how you can tie it in to your game \nhacking endeavors. As a wrap-up, I’ll teach you about some useful plug-ins \nand send you off with a test game designed to get you started in OllyDbg.\nNote \t\nThis chapter focuses on OllyDbg 1.10 and may not be entirely accurate for later \nversions. 
I use this version because, at the time of writing, the plug-in interface for \nOllyDbg 2 is still far less robust than the one for OllyDbg 1.\nWhen you feel like you have a handle on OllyDbg’s interface and fea-\ntures, you can try it on a game yourself with “Patching an if() Statement” \non page 46. \nA Brief Look at OllyDbg’s User Interface\nGo to the OllyDbg website (http://www.ollydbg.de/), download and install \nOllyDbg, and open the program. You should see the toolbar shown in \nFigure 2-1 above a multiple window interface area. \n➊\n➋\n➌\n➍\n➎\nFigure 2-1: OllyDbg main window\nThis toolbar contains the program controls ➊, the debug buttons ➋, \nthe Go to button ➌, the control window buttons ➍, and the Settings \nbutton ➎.\nThe three program controls allow you to open an executable and \nattach to the process it creates, restart the current process, or terminate \nexecution of the current process, respectively. You can also complete these \nfunctions with the hotkeys F3, ctrl-F2, and alt-F2, respectively. To attach \nto a process that is already running, click File ▸ Attach.\nThe debug buttons control the debugger actions. Table 2-1 describes \nwhat these buttons do, along with their hotkeys and functions. 
This table \nalso lists three useful debugger actions that don’t have buttons on the \ndebug toolbar.\nDebugging Games with OllyDbg 25\nTable 2-1: Debug Buttons and Other Debugger Functions\nButton\nHotkey\nFunction\nPlay\nF9\nResumes normal execution of the process.\nPause\nF12\nPauses execution of all threads within the \nprocess and brings up the CPU window at the \ninstruction currently being executed.\nStep into\nF7\nSingle-steps to the next operation to be executed \n(will dive down into function calls).\nStep over\nF8\nSteps to the next operation to be executed \nwithin current scope (will skip over function \ncalls).\nTrace into\nctrl-F11\nRuns a deep trace, tracing every operation that \nis executed.\nTrace over\nctrl-F12\nRuns a passive trace that traces only operations \nwithin the current scope.\nExecute until return\nctrl-F9\nExecutes until a return operation is hit within the \ncurrent scope.\nctrl-F7\nAutomatically single-steps on every operation, \nfollowing execution in the disassembly window. \nThis makes execution appear to be animated.\nctrl-F8\nAlso animates execution, but steps over \nfunctions instead of stepping into them.\nesc\nStops animation, pausing execution on the \ncurrent operation.\nThe Go to button opens a dialog asking for a hexadecimal address. Once \nyou enter the address, OllyDbg opens the CPU window and shows the disas-\nsembly at the specified address. When the CPU window is in focus, you can \nalso show that information with the hotkey ctrl-G. \nThe control window buttons open different control windows, which display \nuseful information about the process you’re debugging and expose more \ndebugging functions, like the ability to set breakpoints. OllyDbg has a total of \n13 control windows, which can all be open simultaneously within the multiple \nwindow interface. 
Table 2-2 describes these windows, listed in the order in \nwhich they appear on the window buttons toolbar.\nTable 2-2: OllyDbg’s Control Windows\nWindow\nHotkey\nFunction\nLog\nalt-L\nDisplays a list of log messages, including debug prints, \nthread events, debugger events, module loads, and \nmuch more.\nModules\nalt-E\nDisplays a list of all executable modules loaded into \nthe process. Double-click a module to open it in the \nCPU window.\n(continued)\n26 Chapter 2\nTable 2-2 (continued)\nWindow\nHotkey\nFunction\nMemory map\nalt-M\nDisplays a list of all blocks of memory allocated by the \nprocess. Double-click a block in the list to bring up a \ndump window of that memory block.\nThreads\nDisplays a list of threads running in the process. For each \nthread in this list, the process has a structure called a \nThread Information Block (TIB). OllyDbg allows you to \nview each thread’s TIB; simply right-click a thread and \nselect Dump thread data block.\nWindows\nDisplays a list of window handles held by the process. \nRight-click a window in this list to jump to or set a \nbreakpoint on its class procedure (the function that \ngets called when a message is sent to the window).\nHandles\nDisplays a list of handles held by the process. (Note \nthat Process Explorer has a much better handle list than \nOllyDbg, as I will discuss in Chapter 3.)\nCPU\nalt-C\nDisplays the main disassembler interface and controls \na majority of the debugger functionality.\nPatches\nctrl-P\nDisplays a list of any assembly code modifications you \nhave made to modules within the process. \nCall stack\nalt-K\nDisplays the call stack for the active thread. The \nwindow updates when the process halts.\nBreakpoints\nalt-B\nDisplays a list of active debugger breakpoints and \nallows you to toggle them on and off.\nReferences\nDisplays the reference list, which typically holds the \nsearch results for many different types of searches. 
It \npops up on its own when you run a search.\nRun trace\nDisplays a list of operations logged by a debugger \ntrace.\nSource\nDisplays the source code of the disassembled module if \na program debug database is present.\nFinally, the Settings button opens the OllyDbg settings window. Keep \nthe default settings for now.\nNow that you’ve had a tour of the main OllyDbg window, let’s explore the \nCPU, Patches, and Run trace windows more closely. You’ll use those windows \nextensively as a game hacker, and knowing your way around them is key.\nOllyDbg’s CPU Window\nThe CPU window in Figure 2-2 is where game hackers spend most of their \ntime in OllyDbg because it is the main control window for the debugging \nfeatures.\nDebugging Games with OllyDbg 27\n➊\n➋\n➌\n➍\nFigure 2-2: OllyDbg CPU window \nThis window houses four distinct control panes: the disassembler \npane ➊, the registers pane ➋, the dump pane ➌, and the stack pane ➍. \nThese four panes encapsulate OllyDbg’s main debugger functions, so it’s \nimportant to know them inside and out.\nViewing and Navigating a Game’s Assembly Code\nYou’ll navigate game code and control most aspects of debugging from \nOllyDbg’s disassembler pane. This pane displays the assembly code for the \ncurrent module, and its data is neatly displayed in a table composed of four \ndistinct columns: Address, Hex dump, Disassembly, and Comment.\nThe Address column displays the memory addresses of each operation \nin the game process you’re attached to. You can double-click an address in \nthis column to toggle whether it’s the display base. When an address is set as \nthe display base, the Address column displays all other addresses as offsets \nrelative to it.\nThe Hex dump column displays the byte code for each operation, \ngrouping operation codes and parameters accordingly. Black braces span-\nning multiple lines on the left side of this column mark known function \nboundaries. 
Operations that have jumps going to them are shown with a \n28 Chapter 2\nright-facing arrow on the inside of these braces. Operations that perform \njumps are shown with either up-facing or down-facing arrows, depending \non the direction in which they jump, on the inside of these braces. For \nexample, in Figure 2-2, the instruction at address 0x779916B1 (highlighted \nin gray) has an up-facing arrow, indicating it’s an upward jump. You can \nthink of a jump as a goto operator.\nThe Disassembly column displays the assembly code of each operation \nthe game performs. So, for example, you can confirm that the instruction \nat 0x779916B1 in Figure 2-2 is a jump by looking at the assembly, which \nshows a JNZ (jump if nonzero) instruction. Black braces in this column \nmark the boundaries of loops. Right-facing arrows attached to these braces \npoint to the conditional statements that control whether the loops continue \nor exit. The three right-facing arrows in this column in Figure 2-2 point to \nCMP (compare) and TEST instructions, which are used by assembly code to \ncompare values.\nThe Comment column displays human-readable comments about each \noperation the game performs. If OllyDbg encounters known API function \nnames, it will automatically insert a comment with the name of the function. \nSimilarly, if it successfully detects arguments being passed to a function, \nit will label them (for example, Arg1, Arg2, . . . , ArgN). You can double-click \nin this column to add a customized comment. Black braces in this column \nmark the assumed boundaries of function call parameters.\nN o t e \t\nOllyDbg infers function boundaries, jump directions, loop structures, and function \nparameters during code analysis, so if these columns lack boundary lines or jump \narrows, just press ctrl-A to run a code analysis on the binary. \nWhen the disassembler pane is in focus, there are a few hotkeys you can \nuse to quickly navigate code and control the debugger. 
Use F2 for Toggle \nbreakpoint, shift-F12 for Place conditional breakpoint, - (hyphen) for Go \nback and + (plus) for Go forward (these two work as you’d expect in a web \nbrowser), * (asterisk) for Go to EIP (which is the execution pointer in the \nx86 architecture), ctrl-- (hyphen) for Go to previous function, and ctrl-+ \nfor Go to next function. \nThe disassembler can also populate the References window with differ-\nent types of search results. When you want to change the References win-\ndow’s contents, right-click in the disassembler pane, mouse over the Search \nfor menu to expand it, and select one of the following options:\nAll intermodular calls Searches for all calls to functions in remote \nmodules. This can, for example, allow you to see everywhere that a \ngame calls Sleep(), PeekMessage(), or any other Windows API function, \nenabling you to inspect or set breakpoints on the calls.\nAll commands Searches for all occurrences of a given operation writ-\nten in assembly, where the added operators CONST and R32 will match \na constant value or a register value, respectively. One use for this \noption might be searching for commands like MOV [0xDEADBEEF], CONST; \nDebugging Games with OllyDbg 29\nMOV [0xDEADBEEF], R32; and MOV [0xDEADBEEF], [R32+CONST] to list all opera-\ntions that modify memory at the address 0xDEADBEEF, which could be any-\nthing, including the address of your player’s health.\nAll sequences Searches for all occurrences of a given sequence of \noperations. This is similar to the previous options, but it allows you to \nspecify multiple commands.\nAll constants Searches for all instances of a given hexadecimal con-\nstant. For instance, if you enter the address of your character’s health, \nthis will list all of the commands that directly access it.\nAll switches Searches for all switch-case blocks.\nAll referenced text strings Searches for all strings referenced in code. 
\nYou can use this option to search through all referenced strings and see \nwhat code accesses them, which can be useful for correlating in-game \ntext displays with the code that displays them. This option is also very \nuseful for locating any debug assertion or logging strings, which can be \na tremendous help in determining the purpose of code parts.\nThe disassembler can also populate the Names window with all labels \nin the current module (ctrl-N) or all known labels in all modules (Search \nfor▸Name in all modules). Known API functions will be automatically \nlabeled with their names, and you can add a label to a command by high-\nlighting it, pressing shift-; and entering the label when prompted. When \na labeled command is referenced in code, the label will be shown in place of \nthe address. One way to use this feature is to name functions that you’ve \nanalyzed (just set a label on the first command in a function) so you can \nsee their names when other functions call them.\nViewing and Editing Register Contents\nThe registers pane displays the contents of the eight processor registers, all \neight flag bits, the six segment registers, the last Windows error code, and \nEIP. Underneath these values, this pane can display either Floating-Point \nUnit (FPU) registers or debug registers; click on the pane’s header to change \nwhich registers are displayed. The values in this pane are populated only if \nyou freeze your process. Values that are displayed in red have been changed \nsince the previous pause. Double-click on values in this pane to edit them.\nViewing and Searching a Game’s Memory\nThe dump pane displays a dump of the memory at a specific address. To \njump to an address and display the memory contents, press ctrl-G and \nenter the address in the box that appears. 
You can also jump to the address \nof an entry in the other CPU window panes by right-clicking on the Address \ncolumn and selecting Follow in dump.\nWhile there are always three columns in the dump pane, the only one \nyou should always see is the Address column, which behaves much like its \ncousin within the disassembler pane. The data display type you choose \n30 Chapter 2\ndetermines the other two columns shown. Right-click the dump pane to \nchange the display type; for the one shown in Figure 2-2, you’d right-click \nand select Hex4Hex/ASCII (8 bytes).\nYou can set a memory breakpoint on an address shown in the dump \npane by right-clicking that address and expanding the Breakpoint submenu. \nSelect Memory4On access from this menu to break on any code that uses \nthe address at all, or select Memory4On write to break only on code that \nwrites to that space in memory. To remove a memory breakpoint, select \nRemove memory breakpoint in the same menu; this option appears only \nwhen the address you right-click has a breakpoint.\nWith one or more values selected in the dump, you can press ctrl-R to \nsearch the current module’s code for references to addresses of the selected \nvalues; results of this search appear in the References window. You can also \nsearch for values in this pane using ctrl-B for binary strings and ctrl-N for \nlabels. After you initiate a search, press ctrl-L to jump to the next match. \nctrl-E allows you to edit any values you have selected.\nN o t e \t\nThe dump windows that you can open from the Memory window work in the same \nway as the dump pane.\nViewing a Game’s Call Stack\nThe final CPU pane is the stack pane, and as the name suggests, it shows \nthe call stack. Like the dump and disassembler panes, the stack pane has an \nAddress column. 
The stack pane also has a Value column, which shows the \nstack as an array of 32-bit integers, and a Comment column, which shows \nreturn addresses, known function names, and other informative labels. The \nstack pane supports all the same hotkeys as the dump pane, with the excep-\ntion of ctrl-N.\nMulticlient Patching\nOne type of hack, called a multiclient patch, overwrites the single-instance \nlimitation code within a game’s binary with no-operation code, allowing the \nuser to run multiple game clients, even when doing so is normally forbidden. \nBecause the code that performs instance limitation must be executed very early \nafter a game client is launched, it can be nearly impossible for a bot to inject \nits patch on time. The easiest workaround for this is to make multiclient patches \npersist by applying them within OllyDbg and saving them directly to the game \nbinary.\nDebugging Games with OllyDbg 31\nCreating Code Patches\nOllyDbg’s code patches let you make assembly code modifications for a game \nyou want to hack, removing the need to engineer a tool tailored to that \nspecific game. This makes prototyping control flow hacks—which manipulate \ngame behavior through a mix of game design flaws, x86 assembly protocols, \nand common binary constructs—much easier.\nGame hackers typically include perfected patches as optional features \nin a bot’s tool suite, but in some cases, making those features persistent is \nactually more convenient for your end user. 
Luckily, OllyDbg patches pro-\nvide the complete functionality you need to design, test, and permanently \nsave code modifications to an executable binary using only OllyDbg.\nTo place a patch, navigate to the line of assembly code you want to patch \nin the CPU window, double-click the instruction you wish to modify, place \na new assembly instruction in the pop-up prompt, and click Assemble, as \nshown in Figure 2-3.\nFigure 2-3: Placing a patch with OllyDbg\nAlways pay attention to the size of your patch—you can’t just resize and \nmove around assembled code however you’d like. Patches larger than the \ncode you intend to replace will overflow into subsequent operations, poten-\ntially removing critical functionality. Patches smaller than the operations \nyou intend to replace are safe, as long as Fill with NOPs is checked. This \noption fills any abandoned bytes with no-operation (NOP) commands, which \nare single-byte operations that do nothing when executed.\nAll patches you place are listed, along with the address, size, state, old \ncode, new code, and comment, in the Patches window. Select a patch in this \nlist to access a small but powerful set of hotkeys, shown in Table 2-3.\n32 Chapter 2\nTable 2-3: Patches Window Hotkeys\nOperator\nFunction\nenter\nJumps to the patch in the disassembler.\nspacebar\nToggles the patch on or off.\nF2\nPlaces a breakpoint on the patch.\nshift-F2\nPlaces a conditional breakpoint on the patch.\nshift-F4\nPlaces a conditional log breakpoint on the patch.\ndel\nRemoves the patch entry from the list only.\nIn OllyDbg, you can also save your patches directly to the binary. First, \nright-click in the disassembler and click Copy to executable4All modifica-\ntions. 
If you want to copy only certain patches, highlight them in the disas-\nsembly pane and press Copy to executable4Selection instead.\nTracing Through Assembly Code \nWhen you run a trace on any program, OllyDbg single-steps over every \nexecuted operation and stores data about each one. When the trace is \ncomplete, the logged data is displayed in the Run trace window, shown \nin Figure 2-4. \nDetermining Patch Size\nThere are a few ways to determine whether your patch will be a different size \nthan the original code. For example, in Figure 2-3, you can see the command \nat 0x7790ED2E being changed from SHR AL, 6 to SHR AL, 7. If you look at the \nbytes to the left of the command, you see 3 bytes that represent the memory of \nthe command. This means our new command must either be 3 bytes or padded \nwith NOPs if it’s less than 3 bytes. Furthermore, these bytes are arranged in \ntwo columns. The first column contains 0xC0 and 0x08, which represent the com-\nmand SHR and the first operand, AL. The second column contains 0x06, which \nrepresents the original operand. Because the second column shows a single \nbyte, any replacement operand must also be 1 byte (between 0x00 and 0xFF). 
\nIf this column had shown 0x00000006 instead, a replacement operand could be \nup to 4 bytes in length.\nTypical code patches will either use all NOPs to completely remove a \ncommand (by leaving the box empty and letting it fill the entire command with \nNOPs) or just replace a single operand, so this method of checking patch size \nis almost always effective.\nDebugging Games with OllyDbg 33\nFigure 2-4: The Run trace window\nThe Run trace window is organized into the following six columns:\nBack The number of operations logged between an operation and the \ncurrent execution state\nThread The thread that executed the operation\nModule The module where the operation resides\nAddress The address of the operation\nCommand The operation that was executed\nModified registers The registers changed by the operation and their \nnew values\nWhen hacking games, I find OllyDbg’s trace feature very effective at \nhelping me find pointer paths to dynamic memory when Cheat Engine \nscans prove inconclusive. This works because you can follow the log in the \nRun trace window backward from the point when the memory is used to \nthe point where it is resolved from a static address.\nThis potent feature’s usefulness is limited only by the creativity of the \nhacker using it. Though I typically use it only to find pointer paths, I’ve \ncome across a few other situations where it has proven invaluable. The anec-\ndotes in “OllyDbg Expressions in Action” on page 36 will help to illumi-\nnate the functionality and power of tracing.\nOllyDbg’s Expression Engine\nOllyDbg is home to a custom expression engine that can compile and evalu-\nate advanced expressions with a simple syntax. The expression engine is \nsurprisingly powerful and, when utilized properly, can be the difference \nbetween an average OllyDbg user and an OllyDbg wizard. 
You can use this \nengine to specify expressions for many features, such as conditional break-\npoints, conditional traces, and the command line plug-in. This section \nintroduces the expression engine and the options it provides.\n34 Chapter 2\nN o t e \t\nParts of this section are based on the official expressions documentation (http://\nwww.ollydbg.de/Help/i_Expressions.htm). I have found, however, that a few \nof the components defined in the documentation don’t seem to work, at least not in \nOllyDbg v1.10. Two examples are the INT and ASCII data types, which must be sub-\nstituted with the aliases LONG and STRING. For this reason, here I include only compo-\nnents that I’ve personally tested and fully understand.\nUsing Expressions in Breakpoints\nWhen a conditional breakpoint is toggled on, OllyDbg prompts you to enter an \nexpression for the condition; this is where most expressions are used. When \nthat breakpoint is executed, OllyDbg silently pauses execution and evalu-\nates the expression. If the result of the evaluation is nonzero, execution \nremains paused and you will see the breakpoint get triggered. But if the \nresult of the evaluation is 0, OllyDbg silently resumes execution as if noth-\ning happened.\nWith the huge number of executions that happen within a game every \nsecond, you’ll often find that a piece of code is executed in far too many \ncontexts for a breakpoint to be an effective way of getting the data you are \nlooking for. A conditional breakpoint paired with a good understanding of \nthe code surrounding it is a foolproof way to avoid these situations.\nUsing Operators in the Expression Engine\nFor numeric data types, OllyDbg expressions support general C-style opera-\ntors, as seen in Table 2-4. 
While there is no clear documentation on the \noperator precedence, OllyDbg seems to follow C-style precedence and can \nuse parenthesized scoping.\nTable 2-4: OllyDbg Numeric Operators\nOperator\nFunction\na == b\nReturns 1 if a is equal to b, else returns 0.\na != b\nReturns 1 if a is not equal to b, else returns 0.\na > b\nReturns 1 if a is greater than b, else returns 0.\na < b\nReturns 1 if a is less than b, else returns 0.\na >= b\nReturns 1 if a is greater than or equal to b, else returns 0.\na <= b\nReturns 1 if a is less than or equal to b, else returns 0.\na && b\nReturns 1 if a and b are both nonzero, else returns 0.\na || b\nReturns 1 if either a or b are nonzero, else returns 0.\na ^ b\nReturns the result of XOR(a, b).\na % b\nReturns the result of MODULUS(a, b).\na & b\nReturn the result of AND(a, b).\na | b\nReturn the result of OR(a, b).\na << b\nReturns the result of a shifted b bits to the left.\na >> b\nReturns the result of a shifted b bits to the right.\nDebugging Games with OllyDbg 35\nOperator\nFunction\na + b\nReturns the sum of a plus b.\na - b\nReturns the difference of a minus b.\na / b\nReturns the quotient of a divided by b.\na * b\nReturns the product of a times b.\n+a\nReturns the signed representation of a.\n-a\nReturns a*-1.\n!a\nReturns 1 if a is 0, else returns 0.\nFor strings, on the other hand, the only available operators are == and \n!=, which both adhere to the following set of rules:\n• \nString comparisons are case insensitive.\n• \nIf only one of the operands is a string literal, the comparison will termi-\nnate after it reaches the length of the literal. 
As a result, the expression \n[STRING EAX]==\"ABC123\", where EAX is a pointer to the string ABC123XYZ, will \nevaluate to 1 instead of 0.\n• \nIf no type is specified for an operand in a string comparison and the \nother operand is a string literal (for example, \"MyString\"!=EAX), the com-\nparison will first assume the nonliteral operand is an ASCII string, and, \nif that compare would return 0, it will try a second compare assuming \nthe operand is a Unicode string.\nOf course, operators aren’t much use without operands. Let’s look at \nsome of the data you can evaluate in expressions.\nWorking with Basic Expression Elements\nExpressions are able to evaluate many different elements, including:\nCPU registers \nEAX, EBX, ECX, EDX, ESP, EBP, ESI, and EDI. You can also use \nthe 1-byte and 2-byte registers (for example, AL for the low byte and AX \nfor the low word of EAX). EIP can also be used.\nSegment registers \nCS, DS, ES, SS, FS, and GS.\nFPU registers \nST0, ST1, ST2, ST3, ST4, ST5, ST6, and ST7.\nSimple labels Can be API function names, such as GetModuleHandle, or \nuser-defined labels.\nWindows constants Such as ERROR_SUCCESS.\nIntegers Are written in hexadecimal format or decimal format if fol-\nlowed by a trailing decimal point (for example, FFFF or 65535.).\nFloating-point numbers Allow exponents in decimal format (for \nexample, 654.123e-5).\nString literals Are wrapped in quotation marks (for example, \"my \nstring\").\n36 Chapter 2\nThe expressions engine looks for these elements in the order they’re \nlisted here. For example, if you have a label that matches the name of a \nWindows constant, the engine uses the address of the label instead of the \nconstant’s value. 
But if you have a label named after a register, such as EAX, \nthe engine uses the register value, not the label value.\nAccessing Memory Contents with Expressions \nOllyDbg expressions are also powerful enough to incorporate memory read-\ning, which you can do by wrapping a memory address, or an expression that \nevaluates to one, in square brackets. For example, [EAX+C] and [401000] rep-\nresent the contents at the addresses EAX+C and 401000. To read the mem-\nory as a type other than DWORD, you can specify the desired type either before \nthe brackets, as in BYTE [EAX], or as the first token within them, as in [STRING \nESP+C]. Supported types are listed in Table 2-5.\nTable 2-5: OllyDbg Data Types\nData type\nInterpretation\nBYTE\n8-bit integer (unsigned)\nCHAR\n8-bit integer (signed)\nWORD\n16-bit integer (unsigned)\nSHORT\n16-bit integer (signed)\nDWORD\n32-bit integer (unsigned)\nLONG\n32-bit integer (signed)\nFLOAT\n32-bit floating-point number\nDOUBLE\n64-bit floating-point number\nSTRING\nPointer to an ASCII string (null-terminated)\nUNICODE\nPointer to a Unicode string (null-terminated)\nPlugging memory contents directly into your OllyDbg expressions is \nincredibly useful in game hacking, in part because you can tell the debug-\nger to check a character’s health, name, gold, and so on in memory before \nbreaking. You’ll see an example of this in “Pausing Execution When a \nSpecific Player’s Name Is Printed” on page 37.\nOllyDbg Expressions in Action\nExpressions in OllyDbg use a syntax similar to that of most programming \nlanguages; you can even combine multiple expressions and nest one expres-\nsion within another. Game hackers (really, all hackers) commonly use them \nto create conditional breakpoints, as I described in “Using Expressions in \nBreakpoints” on page 34, but you can use them in many different places \nin OllyDbg. 
For instance, OllyDbg’s command line plug-in can evaluate \nDebugging Games with OllyDbg 37\nexpressions in place and display their results, allowing you to easily read \narbitrary memory, inspect values that are being calculated by assembly code, \nor quickly get the results of mathematical equations. Furthermore, hackers \ncan even create intelligent, position-agnostic breakpoints by coupling expres-\nsions with the trace feature.\nIn this section, I’ll share some anecdotes where the expression engine \nhas come in handy during my work. I will explain my thought process, walk \nthrough my entire debugging session, and break each expression down into \nits component parts so you can see some ways to use OllyDbg expressions in \ngame hacking.\nN o t e \t\nThese examples contain some assembly code, but if you don’t have much experience \nwith assembly, don’t worry. Just ignore the fine details and know that values like ECX, \nEAX, and ESP are process registers like the ones discussed in “Viewing and Editing \nRegister Contents” on page 29. From there, I’ll explain everything else.\nIf you get confused about an operator, element, or data type in an \nexpression as I walk through these anecdotes, just refer to “OllyDbg’s \nExpression Engine” on page 33.\nPausing Execution When a Specific Player’s Name Is Printed\nDuring one particular debugging session, I needed to figure out exactly \nwhat was happening when a game was drawing the names of players on \nscreen. Specifically, I needed to invoke a breakpoint before the game drew \nthe name “Player 1,” ignoring all other names that were drawn.\nFiguring Out Where to Pause\nAs a starting point, I used Cheat Engine to find the address of Player 1’s \nname in memory. Once I had the address, I used OllyDbg to set a memory \nbreakpoint on the first byte of the string. Every time this breakpoint got \nhit, I quickly inspected the assembly code to determine how it was using \nPlayer 1’s name. 
Eventually, I found the name being accessed directly above \na call to a function that I had previously given the name printText(). I had \nfound the code that was drawing the name.\nI removed my memory breakpoint and replaced it with a code breakpoint \non the call to printText(). There was a problem, however: because the call to \nprintText() was inside a loop that iterated over every player in the game, my \nnew breakpoint was getting hit every time a name was drawn—and that was \nmuch too often. I needed to fix it to hit only on a specific player.\nInspecting the assembly code at my previous memory breakpoint told \nme that each player’s name was accessed using the following assembly code:\nPUSH DWORD PTR DS:[EAX+ECX*90+50]\nThe EAX register contained the address of an array of player data; I’ll \ncall it playerStruct. The size of playerStruct was 0x90 bytes, the ECX register \ncontained the iteration index (the famous variable i), and each player’s \n38 Chapter 2\nname was stored 0x50 bytes after the start of its respective playerStruct. This \nmeant that this PUSH instruction essentially put EAX[ECX].name (the name of the \nplayer at index i) on the stack to be passed as an argument to the printText() \nfunction call. The loop, then, broke down to something like the following \npseudocode:\nplayerStruct EAX[MAX_PLAYERS]; // this is filled elsewhere\nfor (int ➊ECX = 0; ECX < MAX_PLAYERS; ECX++) {\n char* name = ➋EAX[ECX].name;\n breakpoint(); // my code breakpoint was basically right here\n printText(name);\n}\nPurely through analysis, I determined that the playerStruct() function \ncontained data for all players, and the loop iterated over the total number \nof players (counting up with ECX ➊), fetched the character name ➋ for each \nindex, and printed the name. \nCrafting the Conditional Breakpoint\nKnowing that, to pause execution only when printing “Player 1” all I had to \ndo was check the current player name before executing my breakpoint. 
In \npseudocode, the new breakpoint would look like this:\nif (EAX[ECX].name == \"Player 1\") breakpoint();\nOnce I figured out the form of my new breakpoint, I needed to access \nEAX[ECX].name from within the loop. That’s where OllyDbg’s expression engine \ncame in: I could achieve my goal by making slight modifications to the \nexpression that the assembly code used, leaving me with this expression:\n[STRING EAX + ECX*0x90 + 0x50] == \"Player 1\"\nI removed the code breakpoint on printText() and replaced it with \na conditional breakpoint that used this expression, which told OllyDbg \nto break only if the string value stored at EAX + ECX*0x90 + 0x50 matched \nPlayer 1’s name. This breakpoint hit only when \"Player 1\" was being drawn, \nallowing me to continue my analysis. \nThe amount of work it took to engineer this breakpoint might seem \nextensive, but with practice, the entire process becomes as intuitive as writ-\ning code. Experienced hackers can do this in a matter of seconds.\nIn practice, this breakpoint enabled me to inspect certain values in \nthe playerStruct() function for \"Player 1\" as soon as he appeared on screen. \nDoing it this way was important, as the states of these values were relevant \nto my analysis only in the first few frames after the player entered the screen. \nCreatively using breakpoints like this can enable you to analyze all sorts of \ncomplex game behavior.\nDebugging Games with OllyDbg 39\nPausing Execution When Your Character’s Health Drops \nDuring another debugging session, I needed to find the first function called \nafter my character’s health dropped below the maximum. I knew two ways \nto approach this problem:\n• \nFind every piece of code that accesses the health value and place a con-\nditional breakpoint that checks the health on each one. 
Then, once one \nof these breakpoints is hit, single-step through the code until the next \nfunction call.\n• \nUse OllyDbg’s trace function to create a dynamic breakpoint that can \nstop exactly where I need.\nThe first method required more setup and was not easily repeatable, \nmostly due to the sheer number of breakpoints needed and the fact that \nI’d have to single-step by hand. In contrast, the latter method had a quick \nsetup, and since it did everything automatically, it was easily repeatable. \nThough using the trace function would slow the game down consider-\nably (every single operation was captured by the trace), I chose the latter \nmethod.\nWriting an Expression to Check Health\nOnce again, I started by using Cheat Engine to find the address that stored \nmy health. Using the method described in “Cheat Engine’s Memory Scanner” \non page 5, I determined the address to be 0x40A000. \nNext, I needed an expression that told OllyDbg to return 1 when my \nhealth was below maximum and return 0 otherwise. Knowing that my health \nwas stored at 0x40A000 and that the maximum value was 500, I initially \ndevised this expression:\n[0x40A000] < 500.\nThis expression would invoke a break when my health was below 500 \n(remember, decimal numbers must be suffixed with a period in the expres-\nsion engine), but instead of waiting for a function to be called, the break \nwould happen immediately. To ensure that it waited until a function was \ncalled, I appended another expression with the && operator: \n[0x40A000] < 500. && [➊BYTE EIP] == 0xE8\nOn x86 processors, the EIP register stores the address of the operation \nbeing executed, so I decided to check the first byte at EIP ➊ to see if it was \nequal to 0xE8. This value tells the processor to execute a near function call, \nwhich is the type of call I was looking for. \nBefore starting my trace, I had to do one last thing. 
Because the trace \nfeature repeatedly single-steps (Trace into uses step into and Trace over \n40 Chapter 2\nuses step over, as described in “A Brief Look at OllyDbg’s User Interface” \non page 24), I needed to start the trace at a location scoped at or above \nthe level of any code that could possibly update the health value.\nFiguring Out Where to Start the Trace\nTo find a good location, I opened the game’s main module in OllyDbg’s \nCPU window, right-clicked in the disassembler pane, and selected Search \nfor4All intermodular calls. The References window popped up and dis-\nplayed a list of external API functions that were called by the game. \nNearly all gaming software polls for new messages using the Windows \nUSER32.PeekMessage() function, so I sorted the list using the Destination col-\numn and typed PEEK (you can search the list by simply typing a name with \nthe window in focus) to locate the first call to USER32.PeekMessage(). \nThanks to the Destination sorting, every call to this function was listed \nin a contiguous chunk following the first, as shown in Figure 2-5. I set a \nbreakpoint on each by selecting it and pressing F2.\nFigure 2-5: OllyDbg’s Found intermodular calls window\nThough there were around a dozen calls to USER32.PeekMessage(), only \ntwo of them were setting off my breakpoints. Even better, the active calls \nwere beside one another in an unconditional loop. At the bottom of this \nloop were a number of internal function calls. This looked exactly like a \nmain game loop.\nActivating the Trace\nTo finally set my trace, I removed all of my previous breakpoints and placed \none at the top of the suspected main loop. I removed the breakpoint as soon \nas it was hit. I then pressed ctrl-T from the CPU window, which brought up a \nDebugging Games with OllyDbg 41\ndialog called Condition to pause run trace, shown in Figure 2-6. 
Within this \nnew dialog, I enabled the Condition is TRUE option, placed my expression in \nthe box beside it, and pressed OK. Then, I went back to the CPU window and \npressed ctrl-F11 to begin a Trace Into session.\nFigure 2-6: Condition to pause run trace dialog\nOnce the trace began, the game ran so slowly it was nearly unplayable. \nTo decrease my test character’s health, I opened a second instance of the \ngame, logged into a different character, and attacked my test character. \nWhen the execution of the trace caught up to real time, OllyDbg saw my \nhealth change and triggered the breakpoint on the following function \ncall—just as expected.\nIn this game, the main pieces of code that would modify the health \nvalue were directly invoked from the network code. Using this trace, I was \nable to find the function that the network module called directly after a \nnetwork packet told the game to change the player’s health. Here’s the \npseudocode of what the game was doing:\nvoid network::check() {\n while (this->hasPacket()) {\n packet = this->getPacket();\n if (packet.type == UPDATE_HEALTH) {\n oldHealth = player->health;\n player->health = packet.getInteger();\n➊ observe(HEALTH_CHANGE, oldHealth, player->health);\n }\n }\n}\nI knew the game had code that needed to execute only when the player’s \nhealth was changed, and I needed to add code that could also respond to \nsuch changes. Without knowing the overall code structure, I guessed that \n42 Chapter 2\nthe health-dependent code would be executed from some function call \ndirectly after health was updated. My trace conditional breakpoint con-\nfirmed this hunch, as it broke directly on the observe() function ➊. 
From \nthere, I was able to place a hook on the function (hooking, a way to intercept \nfunction calls, is described in “Hooking to Redirect Game Execution” on \npage 153) and execute my own code when the player’s health changed.\nOllyDbg Plug-ins for Game Hackers\nOllyDbg’s highly versatile plug-in system is perhaps one of its most power-\nful features. Experienced game hackers often configure their OllyDbg \nenvironments with dozens of useful plug-ins, both publicly available and \ncustom-made.\nYou can download popular plug-ins from the OpenRCE (http://www\n.openrce.org/downloads/browse/OllyDbg_Plugins) and tuts4you (http://www\n.tuts4you.com/download.php?list.9/) plug-in repositories. Installing them is \neasy: just unzip the plug-in files and place them inside OllyDbg’s installa-\ntion folder.\nOnce installed, some plug-ins can be accessed from the OllyDbg’s Plugin \nmenu item. Other plug-ins, however, might be found only in specific places \nthroughout the OllyDbg interface.\nYou can find hundreds of potent plug-ins using these online reposito-\nries, but you should be careful when constructing your arsenal. Working \nin an environment bloated by unused plug-ins can actually impede pro-\nductivity. In this section, I’ve carefully selected four plug-ins that I believe \nare not only integral to a game hacker’s toolkit but also noninvasive to the \nenvironment.\nCopying Assembly Code with Asm2Clipboard\nAsm2Clipboard is a minimalistic plug-in from the OpenRCE repository that \nallows you to copy chunks of assembly code from the disassembler pane to \nthe clipboard. This can be useful for updating address offsets and devising \ncode caves, two game-hacking essentials I cover deeply in Chapters 5 and 7.\nWith Asm2Clipboard installed, you can highlight a block of assem-\nbly code in the disassembler, right-click the highlighted code, expand \nthe Asm2Clipboard submenu, and select either Copy fixed Asm code to \nclipboard or Copy Asm code to clipboard. 
The latter prepends the code \naddress of each instruction as a comment, while the former copies only \nthe pure code. \nAdding Cheat Engine to OllyDbg with Cheat Utility\nThe Cheat Utility plug-in from tuts4you provides a highly slimmed-down \nversion of Cheat Engine within OllyDbg. While Cheat Utility only allows \nyou to do exact-value scans with a very limited number of data types, it can \nDebugging Games with OllyDbg 43\nmake simple scans much easier when you don’t need the full functional-\nity of Cheat Engine to find what you’re looking for. After installing Cheat \nUtility, to open its interface (shown in Figure 2-7), select Plugins▸Cheat \nutility▸Start. \nFigure 2-7: Cheat Utility interface\nCheat Utility’s user interface and operation mimic Cheat Engine \nclosely, so review Chapter 1 if you need a refresher. \nNote\nGames Invader, an updated version of Cheat Utility also from tuts4you, was created \nto provide more functionality. I’ve found it buggy, however, and I prefer Cheat Utility \nsince I can always use Cheat Engine for advanced scans. \nControlling OllyDbg Through the Command Line\nThe command line plug-in enables you to control OllyDbg through a small \ncommand line interface. To access the plug-in, either press alt-F1 or select \nPlugins▸Command line▸Command line. You should then see a window, \nshown in Figure 2-8, which acts as the command line interface. \n\u001f\n\u001e\n\u001d\nFigure 2-8: Command line interface\nTo execute a command, type it into the input box ➊ and press enter. \nYou will see a session-level command history in the center list ➋, and the \nbottom label displays the command’s return value ➌ (if any). \n44 Chapter 2\nThough there are many commands available, I find a majority of \nthem useless. I primarily use this tool as a way to test that expressions are \nparsing as expected and as a handy calculator, but there are a few addi-\ntional use cases that are also worth mentioning. 
I’ve described these in \nTable 2-6.\nTable 2-6: Command Line Plug-in Commands\nCommand\nFunction\nBC identifier\nRemoves any breakpoints present on identifier, \nwhich can be a code address or API function name.\nBP identifier [,condition]\nPlaces a debugger breakpoint on identifier, \nwhich can be a code address or API function \nname. When identifier is an API function name, the \nbreakpoint will be placed on the function entry point. \nThe condition parameter is an optional expression \nthat, if present, will be set as the breakpoint \ncondition.\nBPX label\nPlaces a debugger breakpoint on every instance \nof label within the module currently being \ndisassembled. This label will typically be an \nAPI function name.\nCALC expression\n? expression\nEvaluates expression and displays the result.\nHD address\nRemoves any hardware breakpoints present on \naddress.\nHE address\nPlaces a hardware on-execute breakpoint on \naddress.\nHR address\nPlaces a hardware on-access breakpoint on \naddress. Only four hardware breakpoints can \nexist at a time.\nHW address\nPlaces a hardware on-write breakpoint on address.\nMD\nRemoves any existing memory breakpoint, if \npresent.\nMR address1, address2\nPlaces a memory on-access breakpoint starting at \naddress1 and spanning until address2. Will replace \nany existing memory breakpoint.\nMW address1, address2\nPlaces a memory on-write breakpoint starting at \naddress1 and spanning until address2. Will replace \nany existing memory breakpoint.\nWATCH expression\nW expression\nOpens the Watches window and adds expression \nto the watch list. 
Expressions in this list will be \nreevaluated every time the process receives \na message and the evaluation results will be \ndisplayed beside them.\nThe command line plug-in was made by the OllyDbg developer and \nshould come preinstalled with OllyDbg.\nDebugging Games with OllyDbg 45\nVisualizing Control Flow with OllyFlow\nOllyFlow, which can be found in the OpenRCE plug-in directory, is a purely \nvisual plug-in that can generate code graphs like the one in Figure 2-9 and \ndisplay them using Wingraph32.\nFigure 2-9: An OllyFlow function flowchart\nNote\nWingraph32 is not provided with OllyFlow, but it is available with the free version of \nIDA here: https://www.hex-rays.com/products/ida/. Download it and drop the \n.exe in your OllyDbg installation folder.\nThough not interactive, these graphs allow you to easily identify con-\nstructs such as loops and nested if() statements in game code, which can be \nparamount in control flow analysis. With OllyFlow installed, you can gener-\nate a graph by going to Plugins ▸ OllyFlow (alternatively, right-click in the \ndisassembler pane and expand the OllyFlow graph submenu) and selecting \none of the following options:\nGenerate function flowchart Generates a graph of the function cur-\nrently in scope, breaking apart different code blocks and showing jump \npaths. Figure 2-9 shows a function flowchart. 
Without a doubt, this is \nOllyFlow’s most useful feature.\nGenerate xrefs from graph Generates a graph of all functions called \nby the function that is currently in scope.\nGenerate xrefs to graph Generates a graph of all functions that call \nthe function currently in scope.\n46 Chapter 2\nGenerate call stack graph Generates a graph of the assumed call path \nfrom the process entry point to the function currently in scope.\nGenerate module graph Theoretically generates a complete graph of \nall function calls in the entire module, but rarely actually works.\nTo get an idea of the usefulness of OllyFlow, take a look at the graph \nin Figure 2-9 and compare it to the relatively simple assembly function that \ngenerated it:\n76f86878:\n➊ MOV EAX,DWORD PTR DS:[76FE7E54] \n TEST AL,1\n JE ntdll.76F8689B\n76f86881:\n➋ MOV EAX,DWORD PTR FS:[18] \n MOV EAX,DWORD PTR DS:[EAX+30]\n OR DWORD PTR DS:[EAX+68],2000000\n MOV EAX,DWORD PTR DS:[76FE66E0]\n OR DWORD PTR DS:[EAX],1\n JMP ntdll.76F868B2\n76f8689b:\n➌ TEST EAX,8000 \n JE ntdll.76F868B2\n76f868a2:\n➍ MOV EAX,DWORD PTR FS:[18] \n MOV EAX,DWORD PTR DS:[EAX+30]\n OR DWORD PTR DS:[EAX+68],2000000\n76f868b2:\n➎ MOV AL,1 \n RETN\nThere are five boxes in Figure 2-9, and they map to the five pieces of \nthis function. The function starts with ➊, and it falls through to ➋ if the \nbranch fails or jumps to ➌ if it succeeds. After ➋ executes, it jumps directly \nto piece ➎, which then returns out of the function. After ➌ executes, it \neither falls through to ➍ or branches to ➎ to return directly. After ➍ \nexecutes, it unconditionally falls through to ➎. What this function does is \nirrelevant to understanding OllyFlow; for now, just focus on seeing how the \ncode maps to the graph.\nPatching an if() Statement\nIf you think you’re ready to get your hands dirty with OllyDbg, keep read-\ning. Go to https://www.nostarch.com/gamehacking/, download the book’s \nresource files, grab BasicDebugging.exe, and execute it. 
At first glance, you’ll \nsee that it looks like the classic game Pong. In this version of Pong, the ball is \ninvisible to you when it is on your opponent’s screen. Your task is to disable \nthis feature so that you can always see the ball. To make it easier for you, I’ve \nmade the game autonomous. You don’t have to play, only hack.\nDebugging Games with OllyDbg 47\nClosing Thoughts\nOllyDbg is a much more complex beast than Cheat Engine, but you’ll learn \nbest by using it, so dive in and get your hands dirty! You can start by pairing \nthe controls taught in this chapter with your debugging skills and going to \nwork on some real games. If you are not yet ready to tamper with your vir-\ntual fate, however, try tackling the example in “Patching an if() Statement” \nfor a practice environment. When you’re done, read on to Chapter 3, where \nI’ll introduce you to Process Monitor and Process Explorer, two tools you’ll \nfind invaluable in game-hacking reconnaissance.\nTo start, attach OllyDbg to the game. Then focus the CPU window on the \nmain module (find the .exe in the module list and double-click it) and use the \nReferenced text strings feature to locate the string that is displayed when the \nball is hidden. Next, double-click the string to bring it up in the code and \nanalyze the surrounding code until you find the if() statement that determines \nwhether to hide the ball. Lastly, using the code-patching feature, patch the \nif() statement so the ball is always drawn. As an added bonus, you might try \nusing OllyFlow to graph this function so you can get a better understanding \nof what exactly it is doing. (Hint: The if() statement checks whether the ball’s \nx-coordinate is less than 0x140. If so, it jumps to code that draws the ball. \nIf not, it draws the scene without the ball. 
If you can change 0x140 to, say, \n0xFFFF, the ball will never get hidden.)\n3\nReconnaissance \nwith Process Monitor \nand Process Explorer\nCheat Engine and OllyDbg can help you \ntear apart a game’s memory and code, but \nyou also need to understand how the game \ninteracts with files, registry values, network \nconnections, and other processes. To learn how those \ninteractions work, you must use two tools that excel at \nmonitoring the external actions of processes: Process Monitor and Process \nExplorer. With these tools, you can track down the complete game map, \nlocate save files, identify registry keys used to store settings, and enumerate \nthe Internet Protocol (IP) addresses of remote game servers.\nIn this chapter, I’ll teach you how to use both Process Monitor and \nProcess Explorer to log system events and inspect them to see how a game \nwas involved. Useful mainly for initial reconnaissance, these tools are \n50 Chapter 3\namazing at giving a clear, verbose picture of exactly how a game interacts \nwith your system. You can download both programs from the Windows \nSysinternals website (https://technet.microsoft.com/en-us/sysinternals/).\nProcess Monitor\nYou can learn a lot about a game simply by exploring how it interacts with \nthe registry, filesystem, and network. Process Monitor is a powerful system-\nmonitoring tool that logs such events in real time and lets you seamlessly \nintegrate the data into a debugging session. This tool provides extensive \namounts of useful data regarding a game’s interaction with the external \nenvironment. 
With calculated review (and sometimes, spontaneous intu-\nition) on your part, this data can reveal details about data files, network \nconnections, and registry events that are helpful to your ability to see and \nmanipulate how the game functions.\nIn this section, I’ll show you how to use Process Monitor to log data, \nnavigate it, and make educated guesses about the files a game interacts with. \nAfter this interface tour, you’ll have a chance to try out Process Monitor for \nyourself in “Finding a High Score File” on page 55.\nLogging In-Game Events\nProcess Monitor’s logs can hold all sorts of potentially useful informa-\ntion, but their most practical use is to help you figure out where data files, \nsuch as in-game item definitions, might be stored. When you start Process \nMonitor, the first dialog you see is the Process Monitor Filter, shown in \nFigure 3-1.\nFigure 3-1: Process Monitor Filter dialog\nThis dialog allows you to show or suppress events based on a number \nof dynamic properties they possess. To start monitoring processes, select \nProcess Name ▸ Is ▸ YourGameFilename.exe ▸ Include and then press Add, \nReconnaissance with Process Monitor and Process Explorer 51\nApply, and OK. This tells Process Monitor to show events invoked by \nYourGameFilename.exe. With the proper filters set, you will be taken to the \nmain window shown in Figure 3-2.\nFigure 3-2: Process Monitor main window\nTo configure the columns displayed in Process Monitor’s log area, right-\nclick on the header and choose Select Columns. 
There’s an impressive \nnumber of options, but I recommend seven.\nTime of Day Lets you see when actions are happening.\nProcess Name Is useful if you’re monitoring multiple processes, but \nwith the single-process filter that is typically used for games; disabling \nthis option can save precious space.\nProcess ID Is like Process Name, but it shows the ID rather than \nthe name.\nOperation Shows what action was performed; thus, this option is \ncompulsory.\nPath Shows the path of the action’s target; also compulsory.\nDetail Is useful only in some cases, but enabling it won’t hurt.\nResult Shows when actions, such as loading files, fail.\nAs you show more columns, the log can get very crowded, but sticking \nwith these options should help keep the output succinct.\nOnce the monitor is running and you’ve defined the columns you wish \nto see, there are five event class filters, outlined in black in Figure 3-2, that \nyou can toggle to clean up your logs even further. Event class filters let you \nchoose which events to show in the log, based on type. From left to right, \nthese filters are as follows:\nRegistry Shows all registry activity. There will be a lot of white noise \nin the registry upon process creation, as games rarely use the registry \nand Windows libraries always use it. Leaving this filter disabled can save \na lot of space in the log.\n52 Chapter 3\nFilesystem Shows all filesystem activity. This is the most important \nevent class filter, since knowing where data files are stored and how they \nare accessed is integral to writing an effective bot.\nNetwork Shows all network activity. The call stack on network events \ncan be useful in finding network-related code within a game.\nProcess and thread activity Shows all process and thread actions. 
The \ncall stack on these events can give you insight into how a game’s code \nhandles threads.\nProcess profiling Periodically shows information about the memory \nand CPU usage of each running process; a game hacker will rarely use it.\nIf class-level event filtering is still not precise enough to filter out \nunwanted pollution in your logs, right-click on specific events for event-\nlevel filtering options. Once you have your event filtering configured to log \nonly what you need, you can begin navigating the log. Table 3-1 lists some \nuseful hotkeys for controlling the log’s behavior.\nTable 3-1: Process Monitor Hotkeys\nHotkey\nAction\nctrl-E\nToggles logging.\nctrl-A\nToggles automatic scrolling of the log.\nctrl-X\nClears the log.\nctrl-L\nDisplays the Filter dialog.\nctrl-H\nDisplays the Highlight dialog. This dialog looks very similar to \nthe Filter dialog, but it is used to indicate which events should \nbe highlighted.\nctrl-F\nDisplays the Search dialog.\nctrl-P\nDisplays the Event Properties dialog for the selected event.\nAs you navigate the log, you can examine the operations recorded to \nsee the fine-grained details of an event.\nInspecting Events in the Process Monitor Log\nProcess Monitor logs every data point it possibly can about an event, \nenabling you to learn more about these events than just the files they act \nupon. Carefully inspecting data-rich columns, such as Result and Detail, \ncan yield some very interesting information.\nFor example, I’ve found that games sometimes read data structures, \nelement by element, directly from files. This behavior is apparent when a \nlog contains a large number of reads to the same file, where each read has \nsequential offsets but differing lengths. 
Consider the hypothetical event log \nshown in Table 3-2.\nReconnaissance with Process Monitor and Process Explorer 53\nTable 3-2: Example Event Log\nOperation\nPath\nDetail\nCreate File\nC:\\file.dat\nDesired Access: Read\nRead File\nC:\\file.dat\nOffset: 0 Size: 4\nRead File\nC:\\file.dat\nOffset: 4 Size: 2\nRead File\nC:\\file.dat\nOffset: 6 Size: 2\nRead File\nC:\\file.dat\nOffset: 8 Size: 4\nRead File\nC:\\file.dat\nOffset: 12 Size: 4\n...\n...\n...Continues to read chunks of 4 bytes for a while\nThis log reveals that the game is reading a structure from the file piece \nby piece, disclosing some hints about what the structure looks like. For \nexample, let’s say that these reads reflect the following data file:\nstruct myDataFile\n{\n int header; // 4 bytes (offset 0)\n short effectCount; // 2 bytes (offset 4)\n short itemCount; // 2 bytes (offset 6)\n int* effects;\n int* items;\n};\nCompare the log in Table 3-2 with this structure. First, the game reads \nthe 4 header bytes. Then, it reads two 2-byte values: effectCount and itemCount. \nIt then creates two integer arrays, effects and items, of respective lengths \neffectCount and itemCount. The game then fills these arrays with data from \nthe file, reading 4 bytes effectCount + itemCount times.\nN o t e \t\nDevelopers definitely shouldn’t use a process like this to read data from a file, but \nyou’d be amazed at how often it happens. Fortunately for you, naïveté like this just \nmakes your analysis easier.\nIn this case, the event log can identify small pieces of information \nwithin a file. But keep in mind that, while correlating the reads with the \nknown structure is easy, it’s much harder to reverse engineer an unknown \nstructure from nothing but an event log. 
Typically, game hackers will use \na debugger to get more context about each interesting event, and the \ndata from Process Monitor can be seamlessly integrated into a debugging \nsession, effectively tying together the two powerful reverse engineering \nparadigms.\nDebugging a Game to Collect More Data\nLet’s step away from this hypothetical file read and look at how Process \nMonitor lets you transition from event logging to debugging. Process Monitor \nstores a complete stack trace for each event, showing the full execution \n54 Chapter 3\nchain that led to the event being triggered. You can view these stack traces \nin the Stack tab of the Event Properties window (double-click the event or \npress ctrl-P), as shown in Figure 3-3.\n➊\n➋\n➌\n➍\n➎\nFigure 3-3: Process Monitor event call stack\nThe stack trace is displayed in a table starting with a Frame column ➊, \nwhich shows the execution mode and stack frame index. A pink K in this \ncolumn means the call happened in kernel mode, while a blue U means it \nhappened in user mode. Since game hackers typically work in user mode, \nkernel mode operations are usually meaningless.\nThe Module column ➋ shows the executable module where the calling \ncode was located. Each module is just the name of the binary that made \nthe call; this makes it easy to identify which calls were actually made from \nwithin a game binary.\nThe Location column ➌ shows the name of the function that made \neach call, as well as the call offset. These function names are deduced from \nthe export table of the module and will generally not be present for the \nfunctions within a game binary. 
When no function names are present, the \nLocation column instead shows the module name and the call’s offset (how \nmany bytes past the origin address the call is in memory) from the mod-\nule’s base address.\nReconnaissance with Process Monitor and Process Explorer 55\nNote\nIn the context of code, the offset is how many bytes of assembly code are between an \nitem and its origin.\nThe Address column ➍ shows the code address of the call, which is very \nuseful because you can jump to the address in the OllyDbg disassembler. \nFinally, the Path column ➎ shows the path to the module that made the call.\nIn my opinion, the stack trace is, by far, the most powerful feature in \nProcess Monitor. It reveals the entire context that led to an event, which can \nbe immensely useful when you are debugging a game. You can use it to find \nthe exact code that triggered an event, crawl up the call chain to see how it \ngot there, and even determine exactly what libraries were used to complete \neach action.\nProcess Monitor’s sister application, Process Explorer, doesn’t have \nmany capabilities beyond those in Process Monitor or OllyDbg. But it does \nexpose some of those capabilities much more effectively, making it an ideal \npick in certain situations.\nProcess Explorer\nProcess Explorer is an advanced task manager (it even has a button you \ncan press to make it your default task manager), and it’s very handy when \nyou’re starting to understand how a game operates. It provides complex \ndata about running processes, such as parent and child processes, CPU \nusage, memory usage, loaded modules, open handles, and command line \narguments, and it can manipulate those processes. It excels at showing \nyou high-level information, such as process trees, memory consumption, \nfile access, and process IDs, all of which can be very useful.\nOf course, none of this data is specifically useful in isolation. 
But with \na keen eye, you can make correlations and draw some useful conclusions \nabout what global objects—including files, mutexes, and shared memory \nFinding a High Score File\nIf you’re ready to test your Process Monitor skills, you’ve come to the right \nplace. Open the GameHackingExamples/Chapter3_FindingFiles directory \nand execute FindingFiles.exe. You’ll see that it is a game of Pong, like the one \nin “Patching an if() Statement” on page 46. Unlike in Chapter 2, though, \nnow the game is actually playable. It also displays your current score and \nyour all-time-high score.\nNow restart the game, firing up Process Monitor before executing it for the \nsecond time. Filtering for filesystem activity and creating any other filters you \nsee fit, try to locate where the game stores the high-score file. For bonus points, \ntry to modify this file to make the game show the highest possible score.\n56 Chapter 3\nsegments—a game has access to. Additionally, the data shown in Process \nExplorer can be even more valuable when cross-referenced with data gath-\nered in a debugging session.\nThis section introduces the Process Explorer interface, discusses the \nproperties it shows, and describes how you can use this tool to manipu-\nlate handles (references to system resources). After this introduction, use \n“Finding and Closing a Mutex” on page 60 to hone your skills.\nProcess Explorer’s User Interface and Controls\nWhen you open Process Explorer, you see a window that is split into three \ndistinct sections, as in Figure 3-4.\n➊\n➋\n➌\nFigure 3-4: Process Explorer main window\nThose three sections are the toolbar ➊, an upper pane ➋, and a lower \npane ➌. The upper pane shows a list of processes, utilizing a tree struc-\nture to display their parent/child relationships. 
Different processes are \nhighlighted with different colors; if you don’t like the current colors, click \nOptions ▸ Configure Colors to display a dialog that allows you to view and \nchange them.\nJust as in Process Monitor, the display for this table is highly versatile, \nand you can customize it by right-clicking on the table header and choosing \nSelect Columns. There are probably more than 100 customization options, \nbut I find that the defaults with the addition of the ASLR Enabled column \nwork just fine.\nReconnaissance with Process Monitor and Process Explorer 57\nNote\nAddress Space Layout Randomization (ASLR) is a Windows security feature \nthat allocates executable images at unpredictable locations, and knowing whether it’s \non is invaluable when you’re trying to alter game state values in memory.\nThe lower pane has three possible states: Hidden, DLLs, and Handles. \nThe Hidden option hides the pane from view, DLLs displays a list of \nDynamic Link Libraries loaded within the current process, and Handles \nshows a list of handles held by the process (visible in Figure 3-4). You can \nhide or unhide the entire lower pane by toggling View ▸ Show Lower \nPane. When it is visible, you can change the information display by \nselecting either View ▸ Lower Pane View ▸ DLLs or View ▸ Lower Pane \nView ▸ Handles.\nYou can also use hotkeys to quickly change between lower pane modes \nwithout affecting processes in the upper pane. 
These hotkeys are listed in \nTable 3-3.\nTable 3-3: Process Explorer Hotkeys\nHotkey\nAction\nctrl-F\nSearch through lower pane data sets for a value.\nctrl-L\nToggle the lower pane between hidden and visible.\nctrl-D\nToggle the lower pane to display DLLs.\nctrl-H\nToggle the lower pane to display handles.\nspacebar\nToggle process list autorefresh.\nenter\nDisplay the Properties dialog for the selected process.\ndel\nKill the selected process.\nshift-del\nKill the selected process and all child processes.\nUse the GUI or hotkeys to practice changing modes. When you’re \nacquainted with the main window, we’ll look at another important Process \nExplorer dialog, called Properties.\nExamining Process Properties\nMuch like Process Monitor, Process Explorer has a very kinetic approach to \ndata gathering; the end result is a broad and verbose spectrum of informa-\ntion. In fact, if you open the Properties dialog (shown in Figure 3-5) for a \nprocess, you’ll see a massive tab bar containing 10 tabs.\nThe Image tab, selected by default and shown in Figure 3-5, displays the \nexecutable name, version, build date, and complete path. It also displays the \ncurrent working directory and the Address Space Layout Randomization sta-\ntus of the executable. ASLR status is the most important piece of informa-\ntion here, because it has a direct effect on how a bot can read the memory \nfrom a game. I’ll talk about this more in Chapter 6.\n58 Chapter 3\nFigure 3-5: Process Explorer Properties dialog\nThe Performance, Performance Graph, Disk and Network, and GPU \nGraph tabs display a myriad of metrics about the CPU, memory, disk, net-\nwork, and GPU usage of the process. If you create a bot that injects into a \ngame, this information can be very useful to determine how much of a per-\nformance impact your bot has on the game.\nThe TCP/IP tab displays a list of active TCP connections, which you \ncan use to find any game server IP addresses that a game connects to. 
If \nyou’re trying to test connection speed, terminate connections, or research \na game’s network protocol, this information is critical.\nThe Strings tab displays a list of strings found in either the binary or \nthe memory of the process. Unlike the string list in OllyDbg, which shows \nonly strings referenced by assembly code, the list includes any occurrences \nof three or more consecutive readable characters, followed by a null termi-\nnator. When a game binary is updated, you can use a diffing tool on this \nlist from each game version to determine whether there are any new strings \nthat you want to investigate.\nThe Threads tab shows you a list of threads running within the pro-\ncess and allows you to pause, resume, or kill each thread; the Security tab \ndisplays the security privileges of the process; and the Environment tab dis-\nplays any environment variables known to or set by the process.\nReconnaissance with Process Monitor and Process Explorer 59\nN o t e \t\nIf you open the Properties dialog for a .NET process, you’ll notice two additional \ntabs: .NET Assemblies and .NET Performance. The data in these tabs is pretty self-\nexplanatory. Please keep in mind that a majority of the techniques in this book won’t \nwork with games written in .NET.\nHandle Manipulation Options\nAs you’ve seen, Process Explorer can provide you with a wealth of informa-\ntion about a process. That’s not all it’s good for, though: it can also manipu-\nlate certain parts of a process. For example, you can view and manipulate \nopen handles from the comfort of Process Explorer’s lower pane (see \nFigure 3-4). This alone makes a strong argument for adding Process \nExplorer to your toolbox. Closing a handle is as simple as right-clicking on \nit and selecting Close Handle. 
This can come in handy when you want, for \ninstance, to close mutexes, which is essential to certain types of hacks.\nN o t e \t\nYou can right-click on the lower pane header and click Select Columns to customize \nthe display. One column you might find particularly useful is Handle Value, which \ncan help when you see a handle being passed around in OllyDbg and want to know \nwhat it does.\nClosing Mutexes\nGames often allow only one client to run at a time; this is called single-\ninstance limitation. You can implement single-instance limitation in a \nnumber of ways, but using a system mutex is common because mutexes \nare sessionwide and can be accessed by a simple name. It’s trivial to limit \ninstances with mutexes, and thanks to Process Explorer, it’s just as trivial to \nremove that limit, allowing you to run multiple instances of a game at the \nsame time.\nFirst, here’s how a game might tackle single-instance limitation with a \nmutex:\nint main(int argc, char *argv[]) {\n // create the mutex\n HANDLE mutex = CreateMutex(NULL, FALSE, \"onlyoneplease\");\n if (GetLastError() == ERROR_ALREADY_EXISTS) {\n // the mutex already exists, so exit\n ErrorBox(\"An instance is already running.\");\n return 0;\n }\n // the mutex didn't exist; it was just created, so\n // let the game run\n RunGame();\n // the game is over; close the mutex to free it up\n // for future instances\n if (mutex)\n CloseHandle(mutex);\n return 0;\n}\n60 Chapter 3\nThis example code creates a mutex named onlyoneplease. Next, the func-\ntion checks GetLastError() to see whether the mutex was already created, and \nif so, it closes the game. If the mutex doesn’t already exist, the game creates \nthe first instance, thereby blocking any future game clients from running. 
In \nthis example, the game runs normally, and once it finishes, CloseHandle() is \ncalled to close the mutex and allow future game instances to run.\nYou can use Process Explorer to close instance-limiting mutexes and \nrun many game instances simultaneously. To do so, choose the Handles \nview of the lower pane, look for all handles with a type of Mutant, determine \nwhich one is limiting instances of the game, and close that mutex.\nW a r n i n g \t\nMutexes are also used to synchronize data across threads and processes. Close one \nonly if you’re sure that its sole purpose is the one you’re trying to subvert!\nMulticlient hacks are generally in high demand, so being able to quickly \ndevelop them for emerging games is crucial to your overall success as a bot \ndeveloper within that market. Since mutexes are one of the most common \nways to achieve single-instance limitation, Process Explorer is an integral \ntool for prototyping these kinds of hacks.\nInspecting File Accesses\nUnlike Process Monitor, Process Explorer can’t show a list of filesystem calls. \nOn the other hand, the Handles view of Process Explorer’s lower pane can \nshow all file handles that a game currently has open, revealing exactly what \nfiles are in continuous use without the need to set up advanced filtering cri-\nteria in Process Monitor. Just look for handles with a type of File to see all \nfiles the game is currently using.\nThis functionality can come in handy if you’re trying to locate logfiles \nor save files. Moreover, you can locate named pipes that are used for inter\nprocess communication (IPC); these are files prefixed with \\Device\\\nNamedPipe\\. Seeing one of these pipes is often a hint that the game is talk-\ning to another process.\nFinding and Closing a Mutex\nTo put your Process Explorer skills to use, go to the GameHackingExamples/\nChapter3_CloseMutex directory and execute CloseMutex.exe. 
This game plays \nexactly like the one in “Finding a High Score File” on page 55, but it pre-\nvents you from simultaneously running multiple instances. As you might have \nguessed, it does this using a single-instance-limitation mutex. Using Process \nExplorer’s Handles view in the lower pane, find the mutex responsible for this \nlimitation and close it. If you succeed, you’ll be able to open a second instance \nof the game.\nReconnaissance with Process Monitor and Process Explorer 61\nClosing Thoughts\nTo be effective when using Process Monitor and Process Explorer, you need, \nabove all else, a deep familiarity with the data that these applications dis-\nplay as well as the interfaces they use to display it. While this chapter’s over-\nview is a good baseline, the intricacies of these applications can be learned \nonly through experience, so I encourage you to play around with them on \nyour system.\nYou won’t use these tools on a regular basis, but at some point, they’ll \nsave the day: as you struggle to figure out how some code works, you’ll recall \nan obscure piece of information that caught your eye during a previous \nProcess Explorer or Process Monitor session. That’s why I consider them \nuseful reconnaissance tools.\nPart 2\nGame Dissection\n4\nFrom Code to Memory: \nA General Primer\nAt the lowest level, a game’s code, data, \ninput, and output are complex abstrac-\ntions of erratically changing bytes. Many \nof these bytes represent variables or machine \ncode generated by a compiler that was fed the game’s \nsource code. Some represent images, models, and \nsounds. Others exist only for an instant, posted by the \ncomputer’s hardware as input and destroyed when the game finishes pro-\ncessing them. The bytes that remain inform the player of the game’s inter-\nnal state. 
But humans can’t think in bytes, so the computer must translate \nthem in a way we can understand.\nThere’s a huge disconnect in the opposite direction as well. A computer \ndoesn’t actually understand high-level code and visceral game content, so \nthese must be translated from the abstract into bytes. Some content—such \nas images, sounds, and text—is stored losslessly, ready to be presented to the \n66 Chapter 4\nplayer at a microsecond’s notice. A game’s code, logic, and variables, on the \nother hand, are stripped of all human readability and compiled down to \nmachine data.\nBy manipulating a game’s data, game hackers obtain humanly improb-\nable advantages within the game. To do this, however, they must understand \nhow a developer’s code manifests once it has been compiled and executed. \nEssentially, they must think like computers.\nTo get you thinking like a computer, this chapter will begin by teach-\ning you how numbers, text, simple structures, and unions are represented \nin memory at the byte level. Then you’ll dive deeper to explore how class \ninstances are stored in memory and how abstract instances know which vir-\ntual functions to call at runtime. In the last half of the chapter, you’ll take \nan x86 assembly language crash course that covers syntax, registers, oper-\nands, the call stack, arithmetic operations, branching operations, function \ncalls, and calling conventions.\nThis chapter focuses very heavily on general technical details. There \nisn’t a lot of juicy information that immediately relates to hacking games, \nbut the knowledge you gain here will be central in the coming chapters, \nwhen we talk about topics like programmatically reading and writing mem-\nory, injecting code, and manipulating control flow.\nSince C++ is the de facto standard for both game and bot development, \nthis chapter explains the relationships between C++ code and the memory \nthat represents it. 
Most native languages have very similar (sometimes iden-\ntical) low-level structure and behavior, however, so you should be able to \napply what you learn here to just about any piece of software.\nAll of the example code in this chapter is in the GameHackingExamples/\nChapter4_CodeToMemory directory of this book’s source files. The included \nprojects can be compiled with Visual Studio 2010 but should also work \nwith any other C++ compiler. Download them at https://www.nostarch.com/\ngamehacking/ and compile them if you want to follow along. \nHow Variables and Other Data Manifest in Memory\nProperly manipulating a game’s state can be very hard, and finding the \ndata that controls it is not always as easy as clicking Next Scan and hoping \nCheat Engine won’t fail you. In fact, many hacks must manipulate dozens \nof related values at once. Finding these values and their relationships often \nrequires you to analytically identify structures and patterns. Moreover, devel-\noping game hacks typically means re-creating the original structures within \nyour bot’s code. \nTo do these things, you need an in-depth understanding of exactly how \nvariables and data are laid out in the game’s memory. Through example \ncode, OllyDbg memory dumps, and some tables to tie everything together, \nthis section will teach you everything there is to know about how different \ntypes of data manifest in memory.\nFrom Code to Memory: A General Primer 67\nNumeric Data\nMost of the values game hackers need (like the player’s health, mana, loca-\ntion, and level) are represented by numeric data types. Because numeric \ndata types are also a building block for all other data types, understanding \nthem is extremely important. Luckily, they have relatively straightforward \nrepresentations in memory: they are predictably aligned and have a fixed \nbit width. Table 4-1 shows the five main numeric data types you’ll find in \nWindows games, along with their sizes and ranges. 
\nTable 4-1: Numeric Data Types\nType name(s)\nSize\nSigned range\nUnsigned range\nchar, BYTE\n8 bits\n−128 to 127\n0 to 255\nshort, WORD, \nwchar_t\n16 bits\n−32,768 to 32,767\n0 to 65,535\nint, long, DWORD\n32 bits\n−2,147,483,648 to 2,147,483,647\n0 to 4,294,967,295\nlong long\n64 bits\n−9,223,372,036,854,775,808 to \n9,223,372,036,854,775,807\n0 to 18,446,744,073,709,551,615\nfloat\n32 bits\n+/−1.17549*10^−38 to \n+/−3.40282*10^38\nN/A\nThe sizes of numeric data types can differ between architectures and \neven compilers. Since this book focuses on hacking x86 games on Windows, \nI’m using type names and sizes made standard by Microsoft. With the excep-\ntion of float, the data types in Table 4-1 are stored with little-endian ordering, \nmeaning the least significant bytes of an integer are stored in the lowest \naddresses occupied by that integer. For example, Figure 4-1 shows that DWORD \n0x0A0B0C0D is represented by the bytes 0x0D 0x0C 0x0B 0x0A.\n0D\n0C\n0B\n0A\n. . .\n. . .\nRegister\nMemory\na:\na+1:\na+2:\na+3:\n0A0B0C0D\nFigure 4-1: Little-endian ordering diagram\nThe float data type can hold mixed numbers, so its representation in \nmemory isn’t as simple as that of other data types. For example, if you see \n68 Chapter 4\n0x0D 0x0C 0x0B 0x0A in memory and that value is a float, you can’t simply \nconvert it to 0x0A0B0C0D. Instead, float values have three components: the \nsign (bit 0), exponent (bits 1–8), and mantissa (bits 9–31). \nThe sign determines whether the number is negative or positive, the \nexponent determines how many places to move the decimal point (start-\ning before the mantissa), and the mantissa holds an approximation of \nthe value. You can retrieve the stored value by evaluating the expression \nmantissa × 10^n (where n is the exponent) and multiplying the result by –1 \nif the sign is set.\nNow let’s look at some numeric data types in memory. 
Listing 4-1 ini-\ntializes nine variables.\nunsigned char ubyteValue = 0xFF;\nchar byteValue = 0xFE;\nunsigned short uwordValue = 0x4142;\nshort wordValue = 0x4344;\nunsigned int udwordValue = 0xDEADBEEF;\nint dwordValue = 0xDEADBEEF;\nunsigned long long ulongLongValue = 0xEFCDAB8967452301;\nlong long longLongValue = 0xEFCDAB8967452301;\nfloat floatValue = 1337.7331;\nListing 4-1: Creating variables of numeric data types in C++\nStarting from the top, this example includes variables of types char, short, \nint, long long, and float. Four of these are unsigned, and five are signed. (In \nC++, a float can’t be unsigned.) Taking into account what you’ve learned so \nfar, carefully study the relationship between the code in Listing 4-1 and the \nmemory dump in Figure 4-2. Assume that the variables are declared in global \nscope.\nFigure 4-2: OllyDbg memory dump of our numeric data\nYou might notice that some values seem arbitrarily spaced out. Since \nit’s much faster for processors to access values residing at addresses that are \nmultiples of the address size (which is 32 bits in x86), compilers pad values \nwith zeros in order to align them on such addresses—hence, padding is also \ncalled alignment. Single-byte values are not padded, since operations that \naccess them perform the same regardless of alignment.\nKeeping this in mind, take a look at Table 4-2, which provides a sort of \nmemory-to-code crosswalk between the memory dump in Figure 4-2 and \nthe variables declared in Listing 4-1. 
\nFrom Code to Memory: A General Primer 69\nTable 4-2: Memory-to-Code Crosswalk for Listing 4-1 and Figure 4-2\nAddress\nSize\nData\nObject\n0x00BB3018\n1 byte\n0xFF\nubyteValue\n0x00BB3019\n1 byte\n0xFE\nbyteValue\n0x00BB301A\n2 bytes\n0x00 0x00\nPadding before uwordValue\n0x00BB301C\n2 bytes\n0x42 0x41\nuwordValue\n0x00BB301E\n2 bytes\n0x00 0x00\nPadding before wordValue\n0x00BB3020\n2 bytes\n0x44 0x43\nwordValue\n0x00BB3022\n2 bytes\n0x00 0x00\nPadding before udwordValue\n0x00BB3024\n4 bytes\n0xEF 0xBE 0xAD 0xDE\nudwordValue\n0x00BB3028\n4 bytes\n0xEF 0xBE 0xAD 0xDE\ndwordValue\n0x00BB302C\n4 bytes\n0x76 0x37 0xA7 0x44\nfloatValue\n0x00BB3030\n8 bytes\n0x01 0x23 0x45 0x67 \n0x89 0xAB 0xCD 0xEF\nulongLongValue\n0x00BB3038\n8 bytes\n0x01 0x23 0x45 0x67 \n0x89 0xAB 0xCD 0xEF\nlongLongValue\nThe Address column lists locations in memory, and the Data column \ntells you exactly what’s stored there. The Object column tells you which \nvariable from Listing 4-1 each piece of data relates to. Notice that floatValue \nis placed before ulongLongValue in memory, even though it’s the last variable \ndeclared in Listing 4-1. Because these variables are declared in global scope, \nthe compiler can place them wherever it wants. This particular move is \nlikely a result of either alignment or optimization.\nString Data\nMost developers use the term string as if it’s synonymous with text, but text is \nonly the most common use for strings. At a low level, strings are just arrays \nof arbitrary numeric objects that appear linear and unaligned in memory. 
\nListing 4-2 shows four text string declarations.\n// char will be 1 byte per character\nchar* thinStringP = \"my_thin_terminated_value_pointer\";\nchar thinStringA[40] = \"my_thin_terminated_value_array\";\n// wchar_t will be 2 bytes per character\nwchar_t* wideStringP = L\"my_wide_terminated_value_pointer\";\nwchar_t wideStringA[40] = L\"my_wide_terminated_value_array\";\nListing 4-2: Declaring several strings in C++\n70 Chapter 4\nIn the context of text, strings hold character objects (char for 8-bit encod-\ning or wchar_t for 16-bit encoding), and the end of each string is specified by a \nnull terminator, a character equal to 0x0. Let’s look at the memory where these \nvariables are stored, as shown in the two memory dumps in Figure 4-3.\n\u001f\n\u001e\nFigure 4-3: In this OllyDbg memory dump of string data, the \nhuman-readable text in the ASCII column is the text we stored \nin Listing 4-2.\nIf you’re not used to reading memory, the OllyDbg dump might be a bit \ndifficult to follow at this point. Table 4-3 shows a deeper look at the correla-\ntion between the code in Listing 4-2 and the memory in Figure 4-3. 
\nTable 4-3: Memory-to-Code Crosswalk for Listing 4-2 and Figure 4-3\nAddress\nSize\nData\nObject\nPane 1\n0x012420F8\n32 bytes\n0x6D 0x79 0x5F {…} 0x74 0x65 0x72\nthinStringP characters\n0x01242118\n4 bytes\n0x00 0x00 0x00 0x00\nthinStringP terminator and \npadding\n0x0124211C\n4 bytes\n0x00 0x00 0x00 0x00\nUnrelated data\n0x01242120\n64 bytes\n0x6D 0x00 0x79 {…} 0x00 0x72 0x00\nwideStringP characters\n0x01242160\n4 bytes\n0x00 0x00 0x00 0x00\nwideStringP terminator and \npadding\n{...}\nUnrelated data\nPane 2\n0x01243040\n4 bytes\n0xF8 0x20 0x24 0x01\nPointer to thinStringP at \n0x012420F8\n0x01243044\n30 bytes\n0x6D 0x79 0x5F {…} 0x72 0x61 0x79\nthinStringA characters\n0x01243062\n10 bytes\n0x00 repeated 10 times\nthinStringA terminator and \narray fill\n0x0124306C\n4 bytes\n0x20 0x21 0x24 0x01\nPointer to wideStringP at \n0x01242120\n0x01243070\n60 bytes\n0x6D 0x00 0x79 {…} 0x00 0x79 0x00\nwideStringA characters\n0x012430AC\n20 bytes\n0x00 repeated 20 times\nwideStringA terminator and \narray fill\nFrom Code to Memory: A General Primer 71\nIn Figure 4-3, pane 2 shows that the values stored where thinStringP \n(address 0x01243040) and wideStringP (address 0x0124306C) belong in \nmemory are only 4 bytes long and contain no string data. That’s because \nthese variables are actually pointers to the first characters of their respec-\ntive arrays. For example, thinStringP contains 0x012420F8, and in pane 1 in \nFigure 4-3, you can see \"my_thin_terminated_value_pointer\" located at address \n0x012420F8. \nLook at the data between these pointers in pane 2, and you can see the \ntext being stored by thinStringA and wideStringA. 
Furthermore, notice that \nthinStringA and wideStringA are padded beyond their null terminators; this \nis because these variables were declared as arrays with length 40, so they are \nfilled up to 40 characters.\nData Structures\nUnlike the data types we have previously discussed, structures are containers \nthat hold multiple pieces of simple, related data. Game hackers who know \nhow to identify structures in memory can mimic those structures in their \nown code. This can greatly reduce the number of addresses they must find, \nas they need to find only the address to the start of the structure, not the \naddress of every individual item.\nN o t e \t\nThis section talks about structures as simple containers that lack member functions \nand contain only simple data. Objects that exceed these limitations will be discussed \nin “Classes and VF Tables” on page 74.\nStructure Element Order and Alignment\nSince structures simply represent an assortment of objects, they don’t visibly \nmanifest in memory dumps. Instead, a memory dump of a structure shows \nthe objects that are contained within that structure. The dump would look \nmuch like the others I’ve shown in this chapter, but with important differ-\nences in both order and alignment. 
\nTo see these differences, start by taking a look at Listing 4-3.\nstruct MyStruct {\n unsigned char ubyteValue;\n char byteValue;\n unsigned short uwordValue;\n short wordValue;\n unsigned int udwordValue;\n int dwordValue;\n unsigned long long ulongLongValue;\n long long longLongValue;\n float floatValue;\n};\nMyStruct* m = 0;\n72 Chapter 4\nprintf(\"Offsets: %d,%d,%d,%d,%d,%d,%d,%d,%d\\n\",\n &m->ubyteValue, &m->byteValue,\n &m->uwordValue, &m->wordValue,\n &m->udwordValue, &m->dwordValue,\n &m->ulongLongValue, &m->longLongValue,\n &m->floatValue);\nListing 4-3: A C++ structure and some code that uses it\nThis code declares a structure named MyStruct and creates a variable \nnamed m that supposedly points to an instance of the structure at address \n0. There’s not actually an instance of the structure at address 0, but this \ntrick lets me use the ampersand operator (&) in the printf() call to get the \naddress of each member of the structure. Since the structure is located \nat address 0, the address printed for each member is equivalent to its offset \nfrom the start of the structure. \nThe ultimate purpose of this example is to see exactly how each mem-\nber is laid out in memory, relative to the start of the structure. If you were \nto run the code, you’d see the following output:\nOffsets: 0,1,2,4,8,12,16,24,32\nAs you can see, the variables in MyStruct are ordered exactly as they \nwere defined in code. This sequential member layout is a mandatory prop-\nerty of structures. Compare this to the example from Listing 4-1, when we \ndeclared an identical set of variables; in the memory dump from Figure 4-2, \nthe compiler clearly placed some values out of order in memory.\nFurthermore, you may have noticed that the members are not aligned \nlike the globally scoped variables in Listing 4-1; if they were, for example, \nthere would be 2 padding bytes before uwordValue. 
This is because structure \nmembers are aligned on addresses divisible by either the struct member align-\nment (a compiler option that accepts 1, 2, 4, 8, or 16 bytes; in this example, \nit’s set to 4) or the size of the member—whichever is smaller. I arranged the \nmembers of MyStruct so that the compiler didn’t need to pad the values. \nIf, however, we put a char immediately after ulongLongValue, the printf() \ncall would give the following output: \nOffsets: 0,1,2,4,8,12,16,28,36\nNow, take a look at the original and the modified outputs together:\nOriginal: Offsets: 0,1,2,4,8,12,16,24,32\nModified: Offsets: 0,1,2,4,8,12,16,28,36\nIn the modified version, the last two values, which are the offsets for \nlongLongValue and floatValue from the start of the structure, have changed. \nThanks to the struct member alignment, the variable longLongValue moves \nby 4 bytes (1 for the char value and 3 following it) to ensure it gets placed on \nan address divisible by 4.\nFrom Code to Memory: A General Primer 73\nHow Structures Work \nUnderstanding structures—how they are aligned and how to mimic them—\ncan be very useful. For instance, if you replicate a game’s structures in your \nown code, you can read or write those entire structures from memory in \na single operation. Consider a game that declares the player’s current and \nmax health like so:\nstruct {\n int current;\n int max;\n} vital;\nvital health;\nIf an inexperienced game hacker wants to read this information from \nmemory, they might write something like this to fetch the health values:\nint currentHealth = readIntegerFromMemory(currentHealthAddress);\nint maxHealth = readIntegerFromMemory(maxHealthAddress);\nThis game hacker doesn’t realize that seeing these values right next to \neach other in memory could be more than a lucky happenstance, so they’ve \nused two separate variables. 
But if you came along with your knowledge of \nstructures, you might conclude that, since these values are closely related and \nare adjacent in memory, our hacker could have used a structure instead:\nstruct {\n int current;\n int max;\n} _vital;\n _vital health = readTypeFromMemory<_vital>(healthStructureAddress);\nSince this code assumes a structure is being used and correctly mimics it, \nit can fetch both health and max health in just one line. We’ll dive deeper \ninto how to write your own code to read from memory in Chapter 6.\nUnions\nUnlike structures, which encapsulate multiple pieces of related data, unions \ncontain a single piece of data that is exposed through multiple variables. \nUnions follow three rules:\n• \nThe size of a union in memory is equal to that of its largest member.\n• \nMembers of a union all reference the same memory.\n• \nA union inherits the alignment of its largest member.\nThe printf() call in the following code helps illustrate the first two rules:\nunion {\n BYTE byteValue;\n struct {\n74 Chapter 4\n WORD first;\n WORD second;\n } words;\n DWORD value;\n} dwValue;\ndwValue.value = 0xDEADBEEF;\nprintf(\"Size %d\\nAddresses 0x%x,0x%x\\nValues 0x%x,0x%x\\n\",\n sizeof(dwValue), &dwValue.value, &dwValue.words,\n dwValue.words.first, dwValue.words.second);\nThis call to printf() outputs the following:\nSize 4\nAddresses 0x2efda8,0x2efda8\nValues 0xbeef,0xdead\nThe first rule is illustrated by the Size value, which is printed first. Even \nthough dwValue has three members that occupy a total of 9 bytes, it has a \nsize of only 4 bytes. The Addresses result validates the second rule, because \ndwValue.value and dwValue.words both point to address 0x2efda8, as shown by \nthe values printed after the word Addresses. The second rule is also vali-\ndated by the fact that dwValue.words.first and dwValue.words.second contain \n0xbeef and 0xdead, printed after Values, which makes sense considering that \ndwValue.value is 0xdeadbeef. 
The third rule isn’t demonstrated in this example \nbecause we don’t have enough memory context, but if you were to put this \nunion inside a structure and surround it with whatever types you like, it \nwould in fact always align like a DWORD.\nClasses and VF Tables\nMuch like structures, classes are containers that hold and isolate multiple \npieces of data, but classes can also contain function definitions. \nA Simple Class\nClasses with normal functions, such as bar in Listing 4-4, conform to the \nsame memory layouts as structures.\nclass bar {\npublic:\n bar() : bar1(0x898989), bar2(0x10203040) {}\n void myfunction() { bar1++; }\n int bar1, bar2;\n};\nbar _bar = bar();\nprintf(\"Size %d; Address 0x%x : _bar\\n\", sizeof(_bar), &_bar);\nListing 4-4: A C++ class\nFrom Code to Memory: A General Primer 75\nThe printf() call in Listing 4-4 would output the following:\nSize 8; Address 0x2efd80 : _bar\nEven though bar has two member functions, this output shows that it \nspans only the 8 bytes needed to hold bar1 and bar2. This is because the bar \nclass doesn’t include abstractions of those member functions, so the pro-\ngram can call them directly. \nN o t e \t\nAccess levels such as public, private, and protected do not manifest in memory. \nRegardless of these modifiers, members of classes are still ordered as they are defined.\nA Class with Virtual Functions\nIn classes that do include abstract functions (often called virtual functions), \nthe program must know which function to call. 
Consider the class defini-\ntions in Listing 4-5:\nclass foo {\npublic:\nfoo() : myValue1(0xDEADBEEF), myValue2(0xBABABABA) {}\n int myValue1;\n static int myStaticValue;\n virtual void bar() { printf(\"call foo::bar()\\n\"); }\n virtual void baz() { printf(\"call foo::baz()\\n\"); }\n virtual void barbaz() {}\n int myValue2;\n};\nint foo::myStaticValue = 0x12121212;\nclass fooa : public foo {\npublic:\n fooa() : foo() {}\n virtual void bar() { printf(\"call fooa::bar()\\n\"); }\n virtual void baz() { printf(\"call fooa::baz()\\n\"); }\n};\nclass foob : public foo {\npublic:\n foob() : foo() {}\n virtual void bar() { printf(\"call foob::bar()\\n\"); }\n virtual void baz() { printf(\"call foob::baz()\\n\"); }\n};\nListing 4-5: The foo, fooa, and foob classes\nThe class foo has three virtual functions: bar, baz, and barbaz. Classes \nfooa and foob inherit from class foo and overload both bar and baz. Since fooa \n76 Chapter 4\nand foob have a public base class of foo, a foo pointer can point to them, but \nthe program must still call the correct versions of bar and baz. You can see \nthis by executing the following code:\nfoo* _testfoo = (foo*)new fooa();\n_testfoo->bar(); // calls fooa::bar()\nAnd here is the output:\ncall fooa::bar()\nThe output shows that _testfoo->bar() invoked fooa::bar() even though \n_testfoo is a foo pointer. The program knew which version of the function \nto call, because the compiler included a VF (virtual function) table in the \nmemory of _testfoo. VF tables are arrays of function addresses that abstract \nclass instances use to tell a program where their overloaded functions are \nlocated. \nClass Instances and Virtual Function Tables\nTo understand the relationship between class instances and VF tables, let’s \ninspect a memory dump of the three objects declared in this listing:\nfoo _foo = foo();\nfooa _fooa = fooa();\nfoob _foob = foob();\nThese objects are of the types defined in Listing 4-5. 
You can see them \nin memory in Figure 4-4.\n\u001f\n\u001e\nFigure 4-4: OllyDbg memory dump of class data\nPane 1 shows that each class instance stores its members just like a \nstructure, but it precedes them with a DWORD value that points to the class \ninstance’s VF table. Pane 2 shows the VF tables for each of our three class \ninstances. The memory-to-code crosswalk in Table 4-4 shows how these \npanes and the code tie together.\nFrom Code to Memory: A General Primer 77\nTable 4-4: Memory-to-Code Crosswalk for Listing 4-5 and Figure 4-4\nAddress\nSize\nData\nObject\nPane 1\n0x0018FF20\n4 bytes\n0x004022B0\nStart of _foo and pointer to foo VF \ntable\n0x0018FF24\n8 bytes\n0xDEADBEEF \n0xBABABABA\n_foo.myValue1 and _foo.myValue2\n0x0018FF2C\n4 bytes\n0x004022C0\nStart of _fooa and pointer to fooa VF \ntable\n0x0018FF30\n8 bytes\n0xDEADBEEF \n0xBABABABA\n_fooa.myValue1 and _fooa.myValue2\n0x0018FF38\n4 bytes\n0x004022D0\nStart of _foob and pointer to foob VF \ntable\n0x0018FF3C\n8 bytes\n0xDEADBEEF \n0xBABABABA\n_foob.myValue1 and _foob.myValue2\n{...}\nUnrelated data\nPane 2\n0x004022B0\n4 bytes\n0x00401060\nStart of foo VF table; address of \nfoo::bar\n0x004022B4\n4 bytes\n0x00401080\nAddress of foo::baz\n0x004022B8\n4 bytes\n0x004010A0\nAddress of foo::barbaz\n0x004022BC\n4 bytes\n0x0040243C\nUnrelated data\n0x004022C0 \n4 bytes\n0x004010D0\nStart of fooa VF table; address of \nfooa::bar\n0x004022C4\n4 bytes\n0x004010F0\nAddress of fooa::baz\n0x004022C8\n4 bytes\n0x004010A0\nAddress of foo::barbaz\n0x004022CC\n4 bytes\n0x004023F0\nUnrelated data\n0x004022D0 \n4 bytes\n0x00401130\nStart of foob VF table; address of \nfoob::bar\n0x004022D4\n4 bytes\n0x00401150\nAddress of foob::baz\n0x004022D8\n4 bytes\n0x004010A0\nAddress of foo::barbaz\nThis crosswalk shows how the VF tables for the code in Listing 4-5 are \nlaid out in memory. Each VF table is generated by the compiler when the \nbinary is made, and the tables remain constant. 
To save space, instances \nof the same class all point to the same VF table, which is why the VF tables \naren’t placed inline with the class. \nSince we have three VF tables, you might wonder how a class instance \nknows which VF table to use. The compiler places code similar to the fol-\nlowing bit of assembly in each virtual class constructor:\nMOV DWORD PTR DS:[EAX], VFADDR \n78 Chapter 4\nThis example takes the static address of a VF table (VFADDR) and places it \nin memory as the first member of the class. \nNow look at addresses 0x004022B0, 0x004022C0, and 0x004022D0 in \nTable 4-4. These addresses contain the beginning of the foo, fooa, and foob \nVF tables. Notice that foo::barbaz exists in all three VF tables; this is because \nthe function is not overloaded by either subclass, meaning instances of each \nsubclass will call the original implementation directly. \nNotice, too, that foo::myStaticValue does not appear in this crosswalk. \nSince the value is static, it doesn’t actually need to exist as a part of the foo \nclass; it’s placed inside this class only for better code organization. In real-\nity, it gets treated like a global variable and is placed elsewhere.\nThe memory tour ends here, but if you have trouble identifying a chunk \nof data in the future, come back to this section for reference. Next, we’ll \nlook at how a computer can understand a game’s high-level source code in \nthe first place. \nx86 Assembly Crash Course\nWhen a program’s source code is compiled into a binary, it is stripped of all \nunnecessary artifacts and translated into machine code. This machine code, \nmade up of only bytes (command bytes are called opcodes, but there are also \nbytes representing operands), gets fed directly to the processor and tells it \nexactly how to behave. Those 1s and 0s flip transistors to control computa-\ntion, and they can be extremely difficult to understand. 
To make computers \na little easier to talk to, engineers working with such code use assembly lan-\nguage, a shorthand that represents raw machine opcodes with abbreviated \nnames (called mnemonics) and a simplistic syntax.\nAssembly language is important for game hackers to know because \nmany powerful hacks can be achieved only through direct manipulation \nof a game’s assembly code, via methods such as NOPing or hooking. In this \nsection, you’ll learn the basics of x86 assembly language, a specific flavor of \nVF Tables and Cheat Engine\nRemember Cheat Engine’s First element of pointerstruct must point to module \noption for pointer scans from Figure 1-4 on page 14? Now that you’ve read \na bit about VF tables, that knowledge should help you understand how this \noption works: it makes Cheat Engine ignore all heap chunks where the first \nmember is not a pointer to a valid VF table. It speeds up scans, but it works \nonly if every step in a pointer path is part of an abstract class instance.\nFrom Code to Memory: A General Primer 79\nassembly made for speaking to 32-bit processors. Assembly language is very \nextensive, so for the sake of brevity this section talks only about the small \nsubset of assembly concepts that are most useful to game hackers.1\nN o t e \t\nThroughout this section, many small snippets of assembly code include comments set \noff by a semicolon ( ;) to describe each instruction in greater detail.\nCommand Syntax\nAssembly language is used to describe machine code, so its syntax is pretty \nsimplistic. While this syntax makes it very easy for someone to understand \nindividual commands (also called operations), it also makes understanding \ncomplex blocks of code very hard. Even algorithms that are easily readable \nin high-level code seem obfuscated when written in assembly. 
For example, \nthe following snippet of pseudocode:\nif (EBX > EAX)\n ECX = EDX\nelse\n ECX = 0\nwould look like Listing 4-6 in x86 assembly.\n CMP EBX, EAX\n JG label1\n MOV ECX, 0\n JMP label2\nlabel1:\n MOV ECX, EDX\nlabel2:\nListing 4-6: Some x86 assembly commands\nTherefore, it takes extensive practice to understand even the most triv-\nial functions in assembly. Understanding individual commands, however, \nis very simple, and by the end of this section, you’ll know how to parse the \ncommands I just showed you.\nInstructions\nThe first part of an assembly command is called an instruction. If you equate \nan assembly command to a terminal command, the instruction is the pro-\ngram to run. At the machine code level, instructions are typically the first \nbyte of a command;2 there are also some 2-byte instructions, where the \nfirst byte is 0x0F. Regardless, an instruction tells the processor exactly what \nto do. In Listing 4-6, CMP, JG, MOV, and JMP are all instructions.\n1. Randall Hyde’s The Art of Assembly Language, 2nd edition (No Starch Press, 2010) is a wonder-\nful book that can teach you everything there is to know about assembly.\n2. Each command must fit within 15 bytes. Most commands are 6 or fewer.\n80 Chapter 4\nOperand Syntax\nWhile some instructions are complete commands, the vast majority are \nincomplete unless followed by operands, or parameters. Every command in \nListing 4-6 has at least one operand, like EBX, EAX, and label1. \nAssembly operands come in three forms:\nImmediate value An integer value that is declared inline (hexadeci-\nmal values have a trailing h).\nRegister A name that refers to a processor register.\nMemory offset An expression, placed in brackets, that represents the \nmemory location of a value. The expression can be an immediate value \nor a register. 
Alternatively, it can be either the sum or difference of a \nregister and immediate value (something like [REG+Ah] or [REG-10h]).\nEach instruction in x86 assembly can have between zero and three \noperands, and commas are used to separate multiple operands. In most \ncases, instructions that require two operands have a source operand and a \ndestination operand. The ordering of these operands is dependent on the \nassembly syntax. For example, Listing 4-7 shows a group of pseudocom-\nmands written in the Intel syntax, which is used by Windows (and, thus, \nby Windows game hackers):\nMOV R1, 1 ; set R1 (register) to 1 (immediate)\n MOV R1, [BADF00Dh] ; set R1 to value at [BADF00Dh] (memory offset)\nMOV R1, [R2+10h] ; set R1 to value at [R2+10h] (memory offset)\nMOV R1, [R2-20h] ; set R1 to value at [R2-20h] (memory offset)\nListing 4-7: Demonstrating Intel syntax\nIn the Intel syntax, the destination operand comes first, followed by the \nsource, so in the second command, R1 is the destination and [BADF00Dh] is the source. On the other \nhand, compilers like GCC (which can be used to write bots on Windows) use \na syntax known as AT&T, or UNIX, syntax. This syntax does things a little \ndifferently, as you can see in the following example:\nMOV $1, %R1 ; set R1 (register) to 1 (immediate)\nMOV 0xBADF00D, %R1 ; set R1 to value at 0xBADF00D (memory offset)\nMOV 0x10(%R2), %R1 ; set R1 to value at 0x10(%R2) (memory offset)\nMOV -0x20(%R2), %R1 ; set R1 to value at -0x20(%R2) (memory offset)\nThis code is the AT&T version of Listing 4-7. AT&T syntax not only \nreverses the operand order but also requires operand prefixing and has \na different format for memory offset operands. \nAssembly Commands\nOnce you understand assembly instructions and how to format their oper-\nands, you can start writing commands. The following code shows an assem-\nbly function, consisting of some very basic commands, that essentially does \nnothing. 
\nFrom Code to Memory: A General Primer 81\nPUSH EBP ; put EBP (register) on the stack\nMOV EBP, ESP ; set EBP to value of ESP (register, top of stack)\nPUSH -1 ; put -1 (immediate) on the stack\nADD ESP, 4 ; negate the 'PUSH -1' to put ESP back where it was (a PUSH\n ; subtracts 4 from ESP, since it grows the stack)\nMOV ESP, EBP ; set ESP to the value of EBP (they will be the same anyway,\n ; since we have kept ESP in the same place)\nPOP EBP ; set EBP to the value on top of the stack (it will be what\n ; EBP started with, put on the stack by PUSH EBP)\nXOR EAX, EAX ; exclusive-or EAX (register) with itself (same effect as\n ; 'MOV EAX, 0' but much faster)\nRETN ; return from the function with a value of 0 (EAX typically\n ; holds the return value)\nThe first two lines, a PUSH command and a MOV command, set up a stack \nframe. The next line pushes –1 to the stack, which is undone when the stack \nis set back to its original position by the ADD ESP, 4 command. Following that, \nthe stack frame is removed, the return value (stored in EAX) is set to 0 with \nan XOR instruction, and the function returns. \nYou’ll learn more about stack frames and functions in “The Call Stack” \non page 86 and “Function Calls” on page 94. For now, turn your atten-\ntion to the constants in the code—namely EBP, ESP, and EAX, which are used \nfrequently in the code as operands. These values, among others, are called \nprocessor registers, and understanding them is essential to understanding the \nstack, function calls, and other low-level aspects of assembly code. \nProcessor Registers\nUnlike high-level programming languages, assembly language does not \nhave user-defined variable names. Instead, it accesses data by referenc-\ning its memory address. During intensive computation, however, it can be \nextremely costly for the processor to constantly deal with the overhead of \nreading and writing data to RAM. 
To mitigate this high cost, x86 proces-\nsors provide a small set of temporary variables, called processor registers, \nwhich are small storage spaces within the processor itself. Since accessing \nthese registers requires far less overhead than accessing RAM, assembly \nuses them to describe its internal state, pass volatile data around, and store \ncontext-sensitive variables.\nGeneral Registers\nWhen assembly code needs to store or operate on arbitrary data, it uses a \nsubset of processor registers called general registers. These registers are used \nexclusively to store process-specific data, such as a function’s local variables. \nEach general register is 32 bits and thus can be thought of as a DWORD vari-\nable. General registers are also optimized for specific purposes:\nEAX, the accumulator This register is optimized for mathematical \ncomputations. Some operations, such as multiplication and division, \ncan only occur in EAX.\n82 Chapter 4\nEBX, the base register This register is used arbitrarily for extra \nstorage. Since its 16-bit predecessor, BX, was the only register that \noperations could use to reference memory addresses, EBX was used \nas a reference to RAM. In x86 assembly, however, all registers can be \naddress references, leaving EBX without a true purpose.\nECX, the counter This register is optimized to act as the counter vari-\nable (often called i in high-level code) in a loop.\nEDX, the data register This register is optimized to act as a helper to \nEAX. In 64-bit computations, for instance, EAX acts as bits 0–31 and \nEDX acts as bits 32–63.\nThese registers also have a set of 8- and 16-bit subregisters that you can \nuse to access partial data. Think of every general register as a union, where \na register name describes the 32-bit member and the subregisters are alter-\nnate members that allow access to smaller pieces of the register. 
The follow-\ning code shows what this union might look like for EAX:\nunion {\n DWORD EAX;\n WORD AX;\n struct {\n BYTE L;\n BYTE H;\n } A;\n} EAX;\nIn this example, AX allows access to the lower WORD of EAX, while AL allows \naccess to the lower BYTE of AX and AH to its higher BYTE. Every general reg-\nister has this structure, and I outline the other registers’ subregisters in \nFigure 4-5.\nEAX\nEBX\nECX\nEDX\nAX\nBX\nCX\nDX\nAH\nBH\nCH\nDH\nAL\nBL\nCL\nDL\n16 bits\n8 bits\n8 bits\n32 bits\nFigure 4-5: x86 registers and subregisters\nEAX, EBX, ECX, and EDX have higher words, too, but the compiler \nwill almost never access them on its own, as it can just use the lower word \nwhen it needs word-only storage.\nFrom Code to Memory: A General Primer 83\nIndex Registers\nx86 assembly also has four index registers, which are used to access data \nstreams, reference the call stack, and keep track of local information. Like \nthe general registers, index registers are 32 bits, but index registers have \nmore strictly defined purposes:\nEDI, the destination index This register is used to index memory tar-\ngeted by write operations. If there are no write operations in a piece of \ncode, the compiler can use EDI for arbitrary storage if needed.\nESI, the source index This register is used to index memory targeted \nby read operations. It can also be used arbitrarily.\nESP, the stack pointer This register is used to reference the top of the \ncall stack. All stack operations directly access this register. You must use \nESP only when working with the stack, and it must always point to the \ntop of the stack.\nEBP, the stack base pointer This register marks the bottom of the \nstack frame. Functions use it as a reference to their parameters and local \nvariables. 
Some code may be compiled with an option to omit this behav-\nior, in which case EBP can be used arbitrarily.\nLike the general registers, each index register has a 16-bit counterpart: \nDI, SI, SP, and BP, respectively. However, the index registers have no 8-bit \nsubregisters.\nThe Execution Index Register\nThe Execution Index register, referred to as EIP, has a very concrete pur-\npose: it points to the address of the code currently being executed by the \nprocessor. Because it controls the flow of execution, it is directly incre-\nmented by the processor and is off-limits to assembly code. To modify EIP, \nassembly code must indirectly access it using operations such as CALL, JMP, \nand RETN.\nWhy Do Some x86 Registers Have Subregisters?\nThere is a historical reason why both general and index registers have \n16-bit counterparts. The x86 architecture was based on a 16-bit architec-\nture, from which it extended the registers AX, BX, CX, DX, DI, SI, SP, and BP. \nAppropriately, the extensions retain the same names but are prefixed with an \nE, for “extended.” The 16-bit versions remain for backward compatibility. This \nalso explains why index registers have no 8-bit abstractions: they are intended \nto be used as memory-address offsets, and there is no practical need to know \npartial bytes of such values.\n84 Chapter 4\nThe EFLAGS Register\nUnlike high-level code, assembly language doesn’t have binary comparison \noperators like ==, >, and <. Instead, it uses the CMP command to compare two \nvalues, storing the resulting information in the EFLAGS register. Then, the \ncode changes its control flow using special operations that depend on the \nvalue stored in EFLAGS.\nWhile comparison commands are the only user-mode operations that \ncan access EFLAGS, they use only this register’s status bits: 0, 2, 4, 6, 7, and \n11. Bits 8–10 act as control flags, bits 12–14 and 16–21 act as system flags, \nand the remaining bits are reserved for the processor. 
Table 4-5 shows the \ntype, name, and description of each EFLAGS bit.\nTable 4-5: EFLAGS bits\nBit(s)\nType\nName\nDescription\n0\nStatus\nCarry\nSet if a carry or borrow was generated from the most \nsignificant bit during the previous instruction.\n2\nStatus\nParity\nSet if the least significant byte resulting from the previ-\nous instruction has an even number of bits set.\n4\nStatus\nAdjust\nSame as the carry flag, but considers the 4 least sig-\nnificant bits.\n6\nStatus\nZero\nSet if the resulting value from the previous instruction \nis equal to 0.\n7\nStatus\nSign\nSet if the resulting value from the previous instruction \nhas its sign bit (most significant bit) set.\n8\nControl\nTrap\nWhen set, the processor sends an interrupt to the \noperating system kernel after executing the next \noperation.\n9\nControl\nInterrupt\nWhen not set, the system ignores maskable interrupts.\n10\nControl\nDirection\nWhen set, ESI and EDI are decremented by operations \nthat automatically modify them. When not set, they \nare incremented.\n11\nStatus\nOverflow\nSet when a value is overflowed by the previous instruc-\ntion, such as when ADD is performed on a positive \nvalue and the result is a negative value.\nThe EFLAGS register also contains a system bit and a reserved bit, but \nthose are irrelevant in user-mode assembly and game hacking, so I’ve omit-\nted them from this table. Keep EFLAGS in mind when you’re debugging \ngame code to figure out how it works. For example, if you set a breakpoint \non a JE (jump if equal) instruction, you can look at the EFLAGS zero bit (bit 6) to see \nwhether the jump will be taken. \nSegment Registers\nFinally, assembly language has a set of 16-bit registers called segment registers. \nUnlike other registers, segment registers are not used to store data; they \nare used to locate it. 
In theory, they point to isolated segments of memory, \nFrom Code to Memory: A General Primer 85\nallowing different types of data to be stored in completely separate memory \nsegments. The implementation of such segmentation is left up to the operat-\ning system. These are the x86 segment registers and their intended purposes:\nCS, the code segment This register points to the memory that holds \nan application’s code.\nDS, the data segment This register points to the memory that holds \nan application’s data.\nES, FS, and GS, the extra segments These registers point to any pro-\nprietary memory segments used by the operating system.\nSS, the stack segment This register points to memory that acts as a \ndedicated call stack.\nIn assembly code, segment registers are used as prefixes to memory off-\nset operands. When a segment register isn’t specified, DS is used by default. \nThis means that the command PUSH [EBP] is effectively the same as PUSH \nDS:[EBP]. But the command PUSH FS:[EBP] is different: it reads memory from \nthe FS segment, not the DS segment.\nIf you look closely at the Windows x86 implementation of memory seg-\nmentation, you might notice that these segment registers were not exactly \nused as intended. To see this in action, you can run the following commands \nwith the OllyDbg command line plug-in while OllyDbg is attached to a \npaused process:\n? CALC (DS==SS && SS==GS && GS==ES) \n? 1 \n? CALC DS-CS \n? 8\n? CALC FS-DS \n; returns nonzero (and changes between threads)\nThis output tells us three distinct things. First, it shows that there are \nonly three segments being used by Windows: FS, CS, and everything else. \nThis is demonstrated by DS, SS, GS, and ES being equal. For the same \nreason, this output shows that DS, SS, GS, and ES can all be used inter-\nchangeably, as they all point to the same memory segments. Lastly, since FS \nchanges depending on the thread, this output shows that it is thread depen-\ndent. 
FS is an interesting segment register, and it points to certain thread-\nspecific data. In “Bypassing ASLR in Production” on page 128, we’ll explore \nhow the data in FS can be used to bypass ASLR—something most bots will \nneed to do.\nIn fact, in assembly code generated for Windows by a compiler, you’d \nonly ever see three segments used: DS, FS, and SS. Interestingly enough, \neven though CS seems to show a constant offset from DS, it has no real \npurpose in user-mode code. Knowing all of these things, you can further \nconclude that there are only two segments being used by Windows: FS and \neverything else.\nThese two segments actually point to different locations in the same \nmemory (there’s no simple way to verify this, but it is true), which shows \n86 Chapter 4\nthat Windows actually doesn’t use memory segments at all. Instead, it uses \na flat memory model in which segment registers are nearly irrelevant. While \nall segment registers point to the same memory, only FS and CS point to \ndifferent locations, and CS is not used.\nIn conclusion, there are only three things you need to know about seg-\nment registers when working with x86 assembly in Windows. First, DS, SS, \nGS, and ES are interchangeable, but for clarity DS should be used to access \ndata and SS should be used to access the call stack. Second, CS can be safely \nforgotten. Third, FS is the only segment register with a special purpose; it \nshould be left alone for now.\nThe Call Stack\nRegisters are powerful, but unfortunately they come in very limited sup-\nply. In order for assembly code to effectively store all of its local data, \nit must also use the call stack. The stack is used to store many different \nvalues, including function parameters, return addresses, and some local \nvariables.\nUnderstanding the ins and outs of the call stack will come in handy \nwhen you’re reverse engineering a game. 
Moreover, you’ll rely on this \nknowledge heavily when we jump into control flow manipulation in \nChapter 8.\nStructure\nYou can think of the call stack as a FILO (first-in-last-out) list of DWORD values \nthat can be directly accessed and manipulated by assembly code. The \nterm stack is used because the structure resembles a stack of paper: objects \nare both added to and removed from the top. Data is added to the stack \nthrough the PUSH operand command, and it is removed (and placed in a reg-\nister) through the POP register command. Figure 4-6 shows how this process \nmight look.\nPUSH\nPOP\nFigure 4-6: The structure of a stack\nFrom Code to Memory: A General Primer 87\nIn Windows, the stack grows from higher memory addresses to lower \nones. It occupies a finite block of memory, piling up to address 0x00000000 \n(the absolute top) from address n (the absolute bottom). This means that \nESP (the pointer to the top of the stack) decreases as items are added and \nincreases as items are removed.\nThe Stack Frame\nWhen an assembly function uses the stack \nto store data, it references the data by creat-\ning a stack frame. It does so by storing ESP \nin EBP and then subtracting n bytes from \nESP, effectively opening an n-byte gap that \nis framed between the registers EBP and ESP. \nTo better understand this, first imagine that \nthe stack in Figure 4-7 is passed to a func-\ntion that requires 0x0C bytes of local storage \nspace.\nIn this example, address 0x0000 is the \nabsolute top of the stack. We have unused \nmemory from addresses 0x0000 to 0xFF00 \n– 4, and at the time of the function call, 0xFF00 is the top of the stack. ESP \npoints to this address. The stack memory after 0xFF00 is used by preceding \nfunctions in the call chain (from 0xFF04 to 0xFFFF). 
When the function is \ncalled, the first thing it does is execute the following assembly code, which \ncreates a stack frame of 0x0C (12 in decimal) bytes:\nPUSH EBP ; saves the bottom of the lower stack frame\nMOV EBP, ESP ; stores the bottom of the current stack frame, in EBP\n ; (also 4 bytes above the lower stack frame)\nSUB ESP, 0x0C ; subtracts 0x0C bytes from ESP, moving it up the stack\n ; to mark the top of the stack frame\nAfter this code executes, the stack looks more like the one shown in \nFigure 4-8. After creating this stack, the function can work with the 0x0C \nbytes it allocated on the stack.\n0x0000 is still the absolute top of the stack. We have unused stack \nmemory from addresses 0x0000 to 0xFF00 – 20, and the memory at \naddress 0xFF00 – 16 contains the final 4 bytes of local storage (referenced \nby [EBP-Ch]). This is also the top of the current stack frame, so ESP points \nhere. 0xFF00 – 12 contains the middle 4 bytes of local storage (referenced \nby [EBP-8h]), and 0xFF00 – 8 contains the first 4 bytes of local storage (ref-\nerenced by [EBP-4h]). EBP points to 0xFF00 – 4, which is the bottom of the \ncurrent stack frame; this address holds the original value of EBP. 0xFF00 is \nthe top of the lower stack frame, and the original ESP in Figure 4-7 pointed \nhere. Finally, you can still see the stack memory from preceding functions \nin the call chain from 0xFF04 to 0xFFFF. \nFigure 4-7: Initial example stack \n(read from bottom to top)\n0x0000\n0xFF00\n. . . \n. . . \nESP\n88 Chapter 4\n0x0000\nESP\nEBP\n0xFF00 – 16\n. . . \n0xFF00 – 12\n0xFF00 – 8\n. . . \n0xFF00 – 4\n0xFF00\nFigure 4-8: Example stack with stack \nframe set up (read from bottom to top)\nWith the stack in this state, the function is free to use its local data as \nit pleases. If this function called another function, the new function would \nbuild its own stack frame using the same technique (the stack frames really \nstack up). 
Once a function finishes using a stack frame, however, it must \nrestore the stack to its previous state. In our case, that means making the \nstack look like it did in Figure 4-7. When the second function finishes, our \nfirst function cleans the stack using the following two commands:\nMOV ESP, EBP ; demolishes the stack frame, bringing ESP to 4 bytes above\n ; its original value (0xFF00-4)\nPOP EBP ; restores the bottom of the old stack frame that was saved by\n ; 'PUSH EBP'. Also adds 4 bytes to ESP, putting it back at\n ; its original value\nBut if you want to change the parameters passed to a function in a game, \ndon’t look for them in that function’s stack frame. A function’s parameters \nare stored in the stack frame of the function that called it, and they’re ref-\nerenced through [EBP+8h], [EBP+Ch], and so on. They start at [EBP+8h] because \n[EBP+4h] stores the function’s return address. (“Function Calls” on page 94 \nexplains this topic further.)\nN o t e \t\nCode can be compiled with stack frames disabled. When this is the case, you’ll notice \nthat functions don’t open with PUSH EBP and instead reference everything relative to \nESP. More often than not, though, stack frames are enabled in compiled game code.\nNow that you have a grasp on the fundamentals of assembly code, let’s \nexplore some specifics that will come in handy when hacking games.\nFrom Code to Memory: A General Primer 89\nImportant x86 Instructions for Game Hacking\nWhile assembly language has hundreds of instructions, many well-equipped \ngame hackers understand only a small subset of them, which I cover in detail \nhere. This subset typically encapsulates all instructions that are used to mod-\nify data, call functions, compare values, or jump around within code.\nData Modification\nData modification often happens over several assembly operations, but \nthe end result has to be stored either in memory or in a register, typically \nwith the MOV instruction. 
The MOV operation takes two operands: a destina-\ntion and a source. Table 4-6 shows all possible sets of MOV operands and the \nresults you can expect from those calls.\nTable 4-6: Operands to the MOV Instruction\nInstruction syntax\nResult\nMOV R1, R2\nCopies R2’s value to R1.\nMOV R1, [R2]\nCopies the value from the memory referenced \nby R2 to R1.\nMOV R1, [R2+Ah]\nCopies the value from the memory referenced \nby R2+0xA to R1.\nMOV R1, [DEADBEEFh]\nCopies the value from the memory at \n0xDEADBEEF to R1.\nMOV R1, BADF00Dh\nCopies the value 0xBADF00D to R1.\nMOV [R1], R2\nCopies R2’s value to the memory referenced \nby R1.\nMOV [R1], BADF00Dh\nCopies the value 0xBADF00D to the memory \nreferenced by R1.\nMOV [R1+4h], R2\nCopies R2’s value to the memory referenced \nby R1+0x4.\nMOV [R1+4h], BADF00Dh\nCopies the value 0xBADF00D to the memory \nreferenced by R1+0x4.\nMOV [DEADBEEFh], R1\nCopies R1’s value to the memory at \n0xDEADBEEF.\nMOV [DEADBEEFh], BADF00Dh\nCopies the value 0xBADF00D to the memory \nat 0xDEADBEEF.\nThe MOV instruction can take a lot of operand combinations, but some \naren’t allowed. First, the destination operand can’t be an immediate value; \nit must be a register or memory address, because immediate values can’t be \nmodified. Second, values can’t be directly copied from one memory address \nto another. Copying a value requires two separate operations, like so:\nMOV EAX, [EBP+10h] ; copy memory from EBP+0x10 to EAX\nMOV [DEADBEEFh], EAX ; MOV the copied memory to memory at 0xDEADBEEF\n90 Chapter 4\nThese instructions copy whatever is stored at EBP+0x10 to the memory at \n0xDEADBEEF.\nArithmetic\nLike many high-level languages, assembly language has two types of arith-\nmetic: unary and binary. Unary instructions take a single operand that \nacts as both a destination and a source. This operand can be a register or \na memory address. Table 4-7 shows the common unary arithmetic instruc-\ntions in x86. 
\nTable 4-7: Unary Arithmetic Instructions\nInstruction syntax\nResult\nINC operand\nAdds 1 to the operand value.\nDEC operand\nSubtracts 1 from the operand value.\nNOT operand\nLogically negates the operand value (flips all bits).\nNEG operand\nPerforms two’s-complement negation (flips all bits and adds 1; \nessentially multiplies by −1).\nBinary instructions (which make up the majority of x86 arithmetic), on \nthe other hand, are syntactically similar to the MOV instruction. They require \ntwo operands and have similar operand limitations. Unlike MOV, however, \ntheir destination operand serves a second purpose: it is also the left-hand \nvalue in the calculation. For example, the assembly operation ADD EAX,EBX \nequates to EAX = EAX + EBX or EAX += EBX in C++. Table 4-8 shows the common \nx86 binary arithmetic instructions.\nTable 4-8: Binary Arithmetic Instructions\nInstruction syntax\nFunction\nOperand notes\nADD destination, source\ndestination += source\nSUB destination, source\ndestination -= source\nAND destination, source\ndestination &= source\nOR destination, source\ndestination |= source\nXOR destination, source\ndestination ^= source\nSHL destination, source\ndestination = destination << source\nsource must be CL or an 8-bit \nimmediate value.\nSHR destination, source\ndestination = destination >> source\nsource must be CL or an 8-bit \nimmediate value.\nIMUL destination, source\ndestination *= source\ndestination must be a register; \nsource cannot be an immediate \nvalue.\nOf these arithmetic instructions, IMUL is special because you can pass it \na third operand, in the form of an immediate value. With this prototype, \nthe destination operand is no longer involved in the calculation, which \nFrom Code to Memory: A General Primer 91\ninstead takes place between the remaining operands. For example, the \nassembly command IMUL EAX,EBX,4h equates to EAX = EBX * 0x4 in C++. 
\nYou can also pass a single operand to IMUL.3 In this case, the oper-\nand acts as the source and can be either a memory address or a register. \nDepending on the size of the source operand, the instruction will use \ndifferent parts of the EAX register for inputs and output, as shown in \nTable 4-9.\nTable 4-9: Possible IMUL Register Operands\nSource size\nInput\nOutput\n8 bits\nAL\n16 bit, stored in AH:AL (which is AX)\n16 bits\nAX\n32 bit, stored in DX:AX (bits 0–15 in AX and bits \n16–31 in DX)\n32 bits\nEAX\n64 bit, stored in EDX:EAX (bits 0–31 in EAX and \nbits 32–64 in EDX)\nNotice that even though the input is only one register, each output uses \ntwo registers. That’s because in multiplication, the result generally is larger \nthan the inputs.\nLet’s look at an example calculation using IMUL with a single 32-bit \noperand:\nIMUL [BADFOODh] ; 32-bit operand is at address 0xBADFOOD\nThis command behaves like the following pseudocode:\nEDX:EAX = EAX * [BADFOODh]\nSimilarly, here’s an operation that uses IMUL with a single 16-bit \noperand:\nIMUL CX ; 16-bit operand is stored in CX\nAnd its corresponding pseudocode:\nDX:AX = AX * CX\nFinally, this is an IMUL command with a single 8-bit operand:\nIMUL CL ; 8-bit operand is stored in CL\nAnd its corresponding pseudocode:\nAX = AL * CL\n3. There is also an unsigned multiplication instruction, MUL, which only works with a single \noperand.\n92 Chapter 4\nx86 assembly language has division as well, through the IDIV instruction.4 \nThe IDIV instruction accepts a single source operand and follows register \nrules similar to those for IMUL. 
As Table 4-10 shows, IDIV operations require \ntwo inputs and two outputs.\nTable 4-10: Possible IDIV Register Operands\nSource size\nInput\nOutput\n8 bit\n16 bit, stored in AH:AL (which is AX)\nRemainder in AH; quotient in AL\n16 bit\n32 bit, stored in DX:AX\nRemainder in DX; quotient in AX\n32 bit\n64 bit, stored in EDX:EAX\nRemainder in EDX; quotient in EAX\nIn division, the inputs are generally larger than the output, so here \nthe inputs take two registers. Moreover, division operations must store a \nremainder, which gets stored in the first input register. For example, here’s \nhow a 32-bit IDIV calculation would look:\nMOV EDX, 0 ; there's no high-order DWORD in the input, so EDX is 0\nMOV EAX, inputValue ; 32-bit input value\nIDIV ECX ; divide EDX:EAX by ECX\nAnd here’s some pseudocode that expresses what happens under \nthe hood:\nEAX = EDX:EAX / ECX ; quotient\nEDX = EDX:EAX % ECX ; remainder\nThese details of IDIV and IMUL are important to remember, as the behav-\nior can otherwise be quite obfuscated when you’re simply looking at the \ncommands.\nBranching\nAfter evaluating an expression, programs can decide what to execute next \nbased on the result, typically using constructs such as if() statements or \nswitch() statements. These control flow statements don’t exist at the assem-\nbly level, however. Instead, assembly code uses the EFLAGS register to make \ndecisions and jump operations to execute different blocks; this process is \ncalled branching.\nTo get the proper value in EFLAGS, assembly code uses one of two \ninstructions: TEST or CMP. Both compare two operands, set the status bits of \nEFLAGS, and then discard any results. TEST compares the operands using \na logical AND, while CMP uses signed subtraction to subtract the latter oper-\nand from the former.\n4. 
Just as MUL is to IMUL, DIV is the unsigned counterpart to IDIV.\nFrom Code to Memory: A General Primer 93\nIn order to branch properly, the code has a jump command immedi-\nately following the comparison. Each type of jump instruction accepts a \nsingle operand that specifies the address of the code to jump to. How a \nparticular jump instruction behaves depends on the status bits of EFLAGS. \nTable 4-11 describes some x86 jump instructions.\nTable 4-11: Common x86 Jump Instructions\nInstruction\nName\nBehavior\nJMP dest\nUnconditional jump\nJumps to dest (sets EIP to dest).\nJE dest\nJump if equal\nJumps if ZF (zero flag) is 1.\nJNE dest\nJump if not equal\nJumps if ZF is 0.\nJG dest\nJump if greater\nJumps if ZF is 0 and SF (sign flag) is equal \nto OF (overflow flag).\nJGE dest\nJump if greater or equal\nJumps if SF is equal to OF.\nJA dest\nUnsigned JG\nJumps if CF (carry flag) is 0 and ZF is 0.\nJAE dest\nUnsigned JGE\nJumps if CF is 0.\nJL dest\nJump if less\nJumps if SF is not equal to OF.\nJLE dest\nJump if less or equal\nJumps if ZF is 1 or SF is not equal to OF.\nJB dest\nUnsigned JL\nJumps if CF is 1.\nJBE dest\nUnsigned JLE\nJumps if CF is 1 or ZF is 1.\nJO dest\nJump if overflow\nJumps if OF is 1.\nJNO dest\nJump if not overflow\nJumps if OF is 0.\nJZ dest\nJump if zero\nJumps if ZF is 1 (identical to JE).\nJNZ dest\nJump if not zero\nJumps if ZF is 0 (identical to JNE).\nRemembering which flags control which jump instructions can be a \npain, but their purpose is clearly expressed in their name. A good rule of \nthumb is that a jump preceded by a CMP is the same as its corresponding \noperator. For example, Table 4-11 lists JE as “jump if equal,” so when JE fol-\nlows a CMP operation, it’s the same as the == operator. Similarly, JGE would be \n>=, JLE would be <=, and so on. 
\nAs an example, consider the high-level code shown in Listing 4-8.\n--snip--\nif (EBX > EAX)\n ECX = EDX;\nelse\n ECX = 0;\n--snip--\nListing 4-8: A simple conditional statement\n94 Chapter 4\nThis if() statement just checks whether EBX is greater than EAX and sets \nECX based on the result. In assembly, the same statement may look some-\nthing like this:\n --snip--\n CMP EBX, EAX ; if (EBX > EAX)\n JG label1 ; jump to label1 if EBX > EAX \n MOV ECX, 0 ; ECX = 0 (else block)\n JMP label2 ; jump over the if block\nlabel1:\n MOV ECX, EDX ; ECX = EDX (if block)\nlabel2:\n --snip--\nThe assembly for the if() statement in Listing 4-8 begins with a CMP \ninstruction and branches if EBX is greater than EAX. If the branch is taken, \nEIP is set to the if block at courtesy of the JG instruction. If the branch \nis not taken, the code continues executing linearly and hits the else block \nimmediately after the JG instruction. When the else block finishes execut-\ning, an unconditional JMP sets EIP to 0x7, skipping over the if block.\nFunction Calls\nIn assembly code, functions are isolated blocks of commands executed \nthrough the CALL instruction. The CALL instruction, which takes a function \naddress as the only operand, pushes a return address onto the stack and \nsets EIP to its operand value. The following pseudocode shows a CALL in \naction, with memory addresses on the left in hex:\n0x1: CALL EAX\n0x2: ...\nWhen CALL EAX is executed, the next address is pushed to the stack and \nEIP is set to EAX, showing that CALL is essentially a PUSH and JMP. The following \npseudocode underscores this point:\n0x1: PUSH 3h\n0x2: JMP EAX\n0x3: ...\nWhile there’s an extra address between the PUSH instruction and the \ncode to execute, the result is the same: before the block of code at EAX is \nexecuted, the address of the code that follows the branch is pushed to the \nstack. 
This happens so the callee (the function being called) knows where to \njump to in the caller (the function doing the call) when it returns.\nIf a function without parameters is called, a CALL command is all that’s \nnecessary. If the callee takes parameters, however, the parameters must first \nFrom Code to Memory: A General Primer 95\nbe pushed onto the stack in reverse order. The following pseudocode shows \nhow a function call with three parameters might look:\nPUSH 300h ; arg3\nPUSH 200h ; arg2\nPUSH 100h ; arg1\nCALL ECX ; call\nWhen the callee is executed, the top of the stack contains a return \naddress that points to the code after the call. The first parameter, 0x100, \nis below the return address on the stack. The second parameter, 0x200, is \nbelow that, followed by the third parameter, 0x300. The callee sets up its \nstack frame, using memory offsets from EBP to reference each parameter. \nOnce the callee has finished executing, it restores the caller’s stack frame \nand executes the RET instruction, which pops the return address off the \nstack and jumps to it.\nSince the parameters are not a part of the callee’s stack frame, they \nremain on the stack after RET is executed. If the caller is responsible for \ncleaning the stack, it adds 12 (3 parameters, at 4 bytes each) to ESP imme-\ndiately after CALL ECX completes. If the callee is responsible, it cleans up by \nexecuting RET 12 instead of RET. This responsibility is determined by the cal-\nlee’s calling convention.\nA function’s calling convention tells the compiler how the assembly code \nshould pass parameters, store instance pointers, communicate the return \nvalue, and clean the stack. 
Different compilers have different calling con-\nventions, but the ones listed in Table 4-12 are the only four that a game \nhacker is likely to encounter.\nTable 4-12: Calling Conventions to Know for Game Hacking\nDirective\nCleaner\nNotes\n__cdecl\ncaller\nDefault convention in Visual Studio.\n__stdcall\ncallee\nConvention used by Win32 API functions.\n__fastcall\ncallee\nFirst two DWORD (or smaller) parameters are \npassed in ECX and EDX.\n__thiscall\ncallee\nUsed for member functions. The pointer to \nthe class instance is passed in ECX.\nThe Directive column in Table 4-12 gives the name of the calling con-\nvention, and the Cleaner column tells you whether the caller or callee is \nresponsible for cleaning the stack given that directive. In the case of these \nfour calling conventions, parameters are always pushed right to left, and \nreturn values are always stored in EAX. This is a standard, but not a rule; \nit can differ across other calling conventions.\n96 Chapter 4\nClosing Thoughts\nMy goal in writing this chapter was to help you understand memory and \nassembly in a general sense, before we dig into game-hacking specifics. \nWith your newfound ability to think like a computer, you should be ade-\nquately armed to start tackling more advanced memory forensics tasks. If \nyou’re itching for a peek at how you’ll apply all of this to something real, flip \nto “Applying Call Hooks to Adobe AIR” on page 169 or “Applying Jump \nHooks and VF Hooks to Direct3D” on page 175. \nIf you want some hands-on time with memory, compile this chapter’s \nexample code and use Cheat Engine or OllyDbg to inspect, tweak, and poke \nat the memory until you’ve got the hang of it. This is important, as the next \nchapter will build on these skills by teaching you advanced memory forensic \ntechniques.\n5\nAdvanced Memory Forensics\nWhether you hack games as a hobby or \na business, you’ll eventually find yourself \nbetween a rock and . . . 
an unintelligible \nmemory dump. Be it a race with a rival bot \ndeveloper to release a highly requested feature, a \nbattle against a game company’s constant barrage of \nupdates, or a struggle to locate some complex data \nstructure in memory, you’ll need top-notch memory \nforensics skills to prevail.\nSuccessful bot development is precariously balanced atop speed and \nskill, and tenacious hackers must rise to the challenge by swiftly releasing \ningenious features, promptly responding to game updates, and readily \nsearching for even the most elusive pieces of data. Doing this, however, \nrequires a comprehensive understanding of common memory patterns, \nadvanced data structures, and the purpose of different pieces of data. \n98 Chapter 5\nThose three aspects of memory forensics are perhaps the most effective \nweapons in your arsenal, and this chapter will teach you how to use them. \nFirst, I’ll discuss advanced memory-scanning techniques that focus on \nsearching for data by understanding its purpose and usage. Next, I’ll teach \nyou how to use memory patterns to tackle game updates and tweak your \nbots without having to relocate all of your addresses from scratch. To wrap \nup, I’ll dissect the four most common complex data structures in the C++ \nstandard library (std::string, std::vector, std::list, and std::map) so you can \nrecognize them in memory and enumerate their contents. By the end of the \nchapter, my hope is that you’ll have a deep understanding of memory foren-\nsics and be able to take on any challenge related to memory scanning.\nAdvanced Memory Scanning\nWithin a game’s source code, each piece of data has a cold, calculated defi-\nnition. When the game is being played, however, all of that data comes \ntogether to create something new. Players only experience the beautiful \nscenery, visceral sounds, and intense adventures; the data that drives these \nexperiences is irrelevant. 
\nWith that in mind, imagine Hacker A has just started tearing into his \nfavorite game, wanting to automate some of the boring bits with a bot. He \ndoesn’t have a complete understanding of memory yet, and to him, the \ndata is nothing but assumptions. He thinks, “I have 500 health, so I can \nfind the health address by telling Cheat Engine to look for a 4-byte integer \nwith a value of 500.” Hacker A has an accurate understanding of data: it’s \njust information (values) stored at particular locations (addresses) using \ndefined structures (types). \nNow imagine Hacker B, who already understands the game both inside \nand out; she knows how playing the game alters its state in memory, and the \ndata no longer has any secrets. She knows that every defined property of \nthe data can be determined given its purpose. Unlike Hacker A, Hacker B \nhas an understanding of data that transcends the confines of a single vari-\nable declaration: she considers the data’s purpose and usage. In this section, \nwe’ll discuss both.\nEach piece of data in a game has a purpose, and the assembly code of \nthe game must, at some point, reference the data to fulfill that purpose. \nFinding the unique code that uses a piece of data means finding a version-\nagnostic marker that persists across game updates until the data is either \nremoved or its purpose is changed. Let me show you why this is important.\nDeducing Purpose\nSo far, I’ve only shown you how to blindly search memory for a given piece \nof data without considering how it’s being used. This method can be effec-\ntive, but it is not always efficient. 
In many cases, it’s much quicker to deduce \nthe purpose of data, determine what code might use that data, and then \nlocate that code to ultimately find the address of the data.\nAdvanced Memory Forensics 99\nThis might not sound easy, but neither does “scan the game’s memory \nfor a specific value of a specific data type, and then continuously filter the \nresult list based on changing criteria,” which is what you’ve learned to do \nthus far. So let’s look at how we might locate the address for health given its \npurpose. Consider the code in Listing 5-1.\nstruct PlayerVital {\n int current, maximum;\n};\nPlayerVital health;\n--snip--\nprintString(\"Health: %d of %d\\n\", health.current, health.maximum);\nListing 5-1: A structure containing the player’s vitals, and a function that displays them\nIf you pretend that printString() is a fancy function to draw text on an \nin-game interface, then this code is pretty close to what you might find in a \ngame. The PlayerVital structure has two properties: the current value and a \nmaximum value. The value health is a PlayerVital structure, so it has these prop-\nerties, too. Based on the name alone, you can deduce that health exists to \ndisplay information about the player’s health, and you can see this purpose \nfulfilled when printString() uses the data.\nEven without the code, you can intuitively draw similar conclusions by \njust looking at the health text displayed in the game’s interface; a computer \ncan’t do anything without code, after all. Aside from the actual health vari-\nable, there are a few code elements that need to exist to show a player this \ntext. First, there needs to be some function to display text. Second, the \nstrings Health and of must be nearby.\nN o t e \t\nWhy do I assume the text is split into two separate strings instead of one? 
The game \ninterface shows that the current health value is between these two strings, but there are \nmany ways that could happen, including format strings, strcat(), or text aligned \nwith multiple display text calls. When you’re analyzing data, it’s best to keep your \nassumptions broad to account for all possibilities.\nTo find health without using a memory scanner, we could utilize these \ntwo distinct strings. We probably wouldn’t have a clue what the function \nto display text looks like, where it is, or how many times it’s called, though. \nRealistically, the strings are all we would know to look for, and that’s \nenough. Let’s walk through it.\nFinding the Player’s Health with OllyDbg\nI’ll walk you through how to track down the health structure in this sec-\ntion, but I’ve also included the binary I analyze in the book’s resource files. \nTo follow along and get some hands-on practice, use the file Chapter5_\nAdvancedMemoryForensics_Scanning.exe.\nFirst, open OllyDbg and attach it to the executable. Then, open \nOllyDbg’s Executable modules window and double-click the main module; \nin my example, the main module is the only .exe in the module’s window. \n100 Chapter 5\nThe CPU window should pop up. Now, right-click in the Disassembler pane \nand select Search for ▶ All referenced text strings. This should open the \nReferences window, shown in Figure 5-1.\nFigure 5-1: OllyDbg’s References window, showing only \na list of strings. There would be a lot more than four in a \nreal game.\nFrom this window, right-click and select Search for text. A search dia-\nlog appears. Enter the string you’re looking for, as shown in Figure 5-2, \nand make the search as broad as possible by disabling Case sensitive and \nenabling Entire scope. \nFigure 5-2: Searching for strings in OllyDbg\nClick OK to execute the search. The References window comes back into \nfocus with the first match highlighted. 
Double-click the match to see the \nassembly code that uses the string inside the CPU window. The Disassembler \npane focuses on the line of code at 0x401030, which pushes the format string \nparameter to printString(). You can see this line in Figure 5-3, where I’ve \nhighlighted the entire function call block.\n➊\n➋\n➌\nFigure 5-3: Viewing the printString() call in the CPU window’s Disassembler pane\nBy reading the assembly code, you can get a very accurate understand-\ning of exactly what the game is doing. The black bracket on the left shows \nthat the string Health is inside a function call. Notice the arguments to that \nAdvanced Memory Forensics 101\nfunction. In order, these are EAX ➊, ECX ➋, and the format string at \n0x4020D0 ➌. EAX is the value at 0x40301C, ECX is the value at 0x403018, \nand the format string contains Health. Since the string contains two format \nplaceholders, you can assume that the remaining two parameters are the \narguments for those placeholders.\nKnowing what the arguments are and that they are pushed in reverse \norder, you can work backward and conclude that the original code looked \nsomething like Listing 5-2.\nint currentHealth; // value at 0x403018 \nint maxHealth; // value at 0x40301C \n--snip--\nsomeFunction(\"Health: %d of %d\\n\",\n currentHealth, maxHealth);\nListing 5-2: How a game hacker might interpret the assembly that Figure 5-3 compiles to\nThe values stored in EAX and ECX are adjacent in memory, which \nmeans they may be part of a structure. To keep it simple, though, this \nexample just shows them as variable definitions. Either way, these are the \ntwo numbers used to display the player’s health. Because both of these impor-\ntant values were displayed in the game’s UI, it was easy to make assumptions \nabout the underlying code that displays them. 
When you know the purpose \nof a piece of data, you can quickly find the code responsible for fulfilling it; \nin this case, that knowledge helped us quickly find both addresses. \nIn many cases, finding addresses can be this easy, but some pieces of \ndata have such complex purposes that it’s harder to guess what to look for. \nFiguring out how to search for map data or character locations in OllyDbg, \nfor instance, can be pretty tricky.\nStrings are far from the only markers that you can use to find the \ndata you want to change in a game, but they are definitely the easiest to \nteach without giving contrived examples. Moreover, some games have log-\nging or error strings embedded in their code, and poking around in the \nReferenced text strings window of OllyDbg can be a quick way to determine \nwhether these strings are present. If you become familiar with a game’s log-\nging practices, you’ll be able to find values even more easily.\nDetermining New Addresses After Game Updates\nWhen application code is modified and recompiled, a brand-new binary \nthat reflects the changes is produced. This binary might be very similar to \nthe previous one, or the binaries might be nothing alike; the difference \nbetween the two versions has a direct correlation to the complexity of the \nhigh-level changes. Small changes, like modified strings or updated con-\nstants, can leave binaries nearly identical and often have no effect on the \naddresses of code or data. But more complex changes—like added features, \na new user interface, refactored internals, or new in-game content—often \ncause shifts in the location of crucial memory.\n102 Chapter 5\nDue to constant bug fixes, content improvements, and feature addi-\ntions, online games are among the most rapidly evolving types of software. 
\nSome games release updates as often as once a week, and game hackers often \nspend a majority of their time reverse engineering the new binaries in order \nto accordingly update their bots.\nIf you create advanced bots, they will become increasingly supported \nby a foundation of memory addresses. When an update comes, determin-\ning the new addresses for a large number of values and functions is the \nmost time-consuming inevitability you will face. Relying on the “Tips for \nWinning the Update Race” can be very beneficial, but the tips won’t help \nyou locate the updated addresses. You can automatically locate some \naddresses using Cheat Engine scripts, but that won’t always work either. \nSometimes you’ll have to do the dirty work by hand.\nIf you try to reinvent the wheel and find these addresses the same way \nyou did initially, you’ll be wasting your time. You actually have a big advan-\ntage, though: the old binary and the addresses themselves. Using these two \nthings, it is possible to find every single address you need to update in a \nfraction of the time.\nFigure 5-4 shows two different disassemblies: a new game binary on the \nleft and the previous version on the right. I have taken this image from an \nactual game (which will remain nameless) in order to give you a realistic \nexample.\nAutomatically Find currentHealth and maxHealth\nIn “Searching for Assembly Patterns” on page 19 and “Searching for Strings” \non page 21, I showed a few Cheat Engine Lua scripts and explained how \nthey worked. Using the findString() function in these examples, you can \nmake Cheat Engine automatically locate the address of the format string that \nwe just found manually in OllyDbg. Next, you can write a small function to \nscan for this address following byte 0x68 (the byte for the PUSH command, \nas you can see beside it at 0x401030 in Figure 5-3) to locate the address \nof the code that pushes it to the stack. 
Then, you can read 4 bytes from \npushAddress - 5 and pushAddress - 12 to locate currentHealth and maxHealth, \nrespectively.\nThis may not seem useful since we’ve already found the addresses, but \nif this were a real game, these addresses would change when an update is \nreleased. Using this knowledge to automate finding them can be very helpful. \nIf you’re up to the challenge, give it a whirl!\nAdvanced Memory Forensics 103\nFigure 5-4: Side-by-side disassemblies of two versions of one game\nMy bot modified the code at 0x047B542 (right), and I needed to find \nthe corresponding code in the new version, which I discovered at 0x047B672 \n(left). This function call invokes a packet-parsing function when a packet \nhas been received. In order to find this address originally (and by “origi-\nnally,” I mean about 100 updates previous), I figured out how the game’s \nnetwork protocol worked, set breakpoints on many network-related API \ncalls, stepped through execution, and inspected data on the stack until I \nfound something that looked similar to what I expected given my knowl-\nedge of the protocol.\nTips for Winning the Update Race\nIn saturated markets, being the first bot developer to release a stable update is \ncritical to success. The race starts the second the game updates, and hackers \ndetermined to be the fastest will spend hundreds of hours preparing. These are \nthe most common ways to stay on top:\nCreate update alarms: By writing software that alerts you as soon as the game \npatches, you can begin working on your updates as soon as possible.\nAutomate bot installs: Games often schedule expected updates at times when \nthe fewest players are online. Botters hate waking up and downloading new \nsoftware before they bot, but they love waking up to find it silently installed \nwhile the game is patching.\nUse fewer addresses: The less there is to update, the better. 
Consolidating \nrelated data into structures and eliminating unnecessary memory address \nusage can save a bunch of time.\nHave great test cases: Data changes, and hackers make mistakes. Having \nways to quickly test every feature can be the difference between a stable bot \nand one that randomly crashes, gets users killed, or even leads to their charac-\nters being banned from the game.\nAttacking updates with these practices will give you a sizable head start, \nbut they might not always be enough to lead you to victory. Above all else, \nstrive to understand reverse engineering as much as possible and use that \nunderstanding to your advantage.\n104 Chapter 5\nI could have followed the same steps for each of the 100+ updates since \nthen, but that would have been unnecessary. The code stayed relatively the \nsame throughout the years, which let me use patterns from the old code to \nfind that function call’s address in the new code. \nNow, consider this chunk of assembly code:\nPUSH EDI\nPUSH EAX\nLEA EAX,DWORD PTR SS:[EBP-C]\nMOV DWORD PTR FS:[0],EAX\nMOV DWORD PTR SS:[EBP-10],ESP\nMOV DWORD PTR SS:[EBP-220],-1\nMOV DWORD PTR SS:[EBP-4],0\nDoes it look familiar? Compare it to Figure 5-4, and you’ll see that this \nexact code exists right above the highlighted function call in both versions \nof the game. Regardless of what it does, the combination of operations looks \npretty distinctive; because of the number of different offsets the code is \nusing relative to EBP, it’s unlikely that an identical chunk of code exists in \nany other part of the binary. \nEvery time I have to update this address, I open the old binary \nin OllyDbg, highlight this chunk of operations, right-click, and select \nAsm2Clipboard ▶ Copy fixed asm to clipboard. Then, I open the new binary \nin OllyDbg, navigate to the CPU Window, press ctrl-S, paste the assembly \ncode, and hit Find. 
In 9.5 cases out of 10, this places me directly above the \nfunction call I need to find in the new version.\nWhen an update comes, you can use the same method to find nearly all \nof your known addresses. It should work for every address you can find eas-\nily in assembly code. There are a few caveats, though:\n• \nOllyDbg limits search to eight operations, so you must find code mark-\ners of that size or smaller.\n• \nThe operations you use cannot contain any other addresses, as those \naddresses have likely changed.\n• \nIf parts of the game have changed that use the address you’re looking \nfor, the code might be different.\n• \nIf the game changes compilers or switches optimization settings, almost \nall code will be entirely different.\nAs discussed in “Automatically Find currentHealth and maxHealth” on \npage 102, you can benefit from writing scripts that carry out these tasks for \nyou. Serious game hackers work very hard to automatically locate as many \naddresses as possible, and some of the best bots are engineered to automati-\ncally detect their addresses at runtime, every time. It can be a lot of work \ninitially, but the investment can definitely pay off.\nAdvanced Memory Forensics 105\nIdentifying Complex Structures in Game Data\nChapter 4 described how a game might store data in static structures. This \nknowledge will suffice when you’re trying to find simple data, but it falls \nshort for data that is stored through dynamic structures. This is because \ndynamic structures might be scattered across different memory locations, \nfollow long pointer chains, or require complex algorithms to actually extract \nthe data from them.\nThis section explores common dynamic structures you’ll find in video \ngame code, and how to read data from them once they’re found. To begin, \nI’ll talk about the underlying composition of each dynamic structure. \nNext, I’ll outline the algorithms needed to read the data from these struc-\ntures. 
(For simplicity, each algorithm discussion assumes you have a pointer \nto an instance of the structure as well as some way to read from memory.) \nLastly, I’ll cover tips and tricks that can help you determine when a value \nyou’re searching for in memory is actually encapsulated in one of these \nstructures, so you’ll know when to apply this knowledge. I’ll focus on C++, \nas its object-oriented nature and heavily used standard library are typically \nresponsible for such structures.\nN o t e \t\nSome of these structures might differ slightly from machine to machine based on com-\npilers, optimization settings, or standard library implementations, but the basic con-\ncepts will remain the same. Also, in the interest of brevity, I will be omitting irrelevant \nparts of these structures, such as custom allocators or comparison functions. Working \nexample code can be found at https://www.nostarch.com/gamehacking/ in the \nresource files for Chapter 5.\nThe std::string Class\nInstances of std::string are among the most common culprits of dynamic \nstorage. This class from the C++ Standard Template Library (STL) abstracts \nstring operations away from the developer while preserving efficiency, \nmaking it widely used in all types of software. A video game might use \nstd::string structure for any string data, such as creature names.\nExamining the Structure of a std::string\nWhen you strip away the member functions and other nondata components \nof the std::string class, this is the structure that remains:\nclass string {\n union {\n char* dataP;\n char dataA[16];\n };\n int length;\n};\n// point to a string in memory\nstring* _str = (string*)stringAddress;\n106 Chapter 5\nThe class reserves 16 characters that are presumably used to store \nthe string in place. It also, however, declares that the first 4 bytes can be a \npointer to a character. This might seem odd, but it’s a result of optimiza-\ntion. 
At some point, the developers of this class decided that 15 charac-\nters (plus a null terminator) was a suitable length for many strings, and \nthey chose to save on memory allocations and de-allocations by reserving \n16 bytes of memory in advance. To accommodate longer strings, they allowed \nthe first 4 bytes of this reserved memory to be used as a pointer to the char-\nacters of these longer strings. \nN o t e \t\nIf the code were compiled to 64 bits, then it would actually be the first 8 (not 4) bytes \nthat point to a character. Throughout this example, however, you can assume 32-bit \naddresses and that int is the size of an address.\nAccessing string data this way takes some overhead. The function to \nlocate the right buffer looks something like this:\nconst char* c_str() {\n if (_str->length <= 15)\n return (const char*)&_str->dataA[0];\n else\n return (const char*)_str->dataP;\n}\nThe fact that a std::string can be either a complete string or a pointer \nto a longer string makes this particular structure quite tricky from a game-\nhacking perspective. Some games may use std::string to store strings that \nonly rarely exceed 15 characters. When this is the case, you might imple-\nment bots that rely on these strings, never knowing that the underlying \nstructure is in fact more complicated than a simple string. \nOverlooking a std::string Can Ruin Your Fun \nNot knowing the true nature of the structure containing the data you need \ncan lead you to write a bot that works only some of the time and fails when \nit counts. Imagine, for example, that you’re trying to figure out how a game \nstores creature data. 
In your hypothetical search, you find that all the \ncreatures in the game are stored in an array of structures that look some-\nthing like Listing 5-3.\nstruct creatureInfo {\n int uniqueID;\n char name[16];\n int nameLength;\n int healthPercent;\n int xPosition;\n int yPosition;\n int modelID;\nAdvanced Memory Forensics 107\n int creatureType;\n};\nListing 5-3: How you might interpret creature data found in memory\nAfter scanning the creature data in memory, say you notice that the \nfirst 4 bytes of each structure are unique for each creature, so you call those \nbytes the uniqueID and assume they form a single int property. Looking fur-\nther in the memory, you find that the creature’s name is stored right after \nuniqueID, and after some deduction, you figure out the name is 16 bytes \nlong. The next value you see in memory turns out to be the nameLength; it’s a \nbit strange that a null-terminated string has an associated length, but you \nignore that oddity and continue analyzing the data in memory. After fur-\nther analysis, you determine what the remaining values are for, define the \nstructure shown in Listing 5-3, and write a bot that automatically attacks \ncreatures with certain names.\nAfter weeks of testing your bot while hunting creatures with names like \nDragon, Cyclops, Giant, and Hound, you decide it’s time to give your bot to \nyour friends. For the inaugural use, you gather everyone together to kill a \nboss named Super Bossman Supreme. The entire team sets the bot to attack \nthe boss first and target lesser creatures like a Demon or Grim Reaper when the \nboss goes out of range.\nOnce your team arrives at the boss’s dungeon . . . you’re all slowly \nobliterated. \nWhat went wrong in this scenario? Your game must be storing creature \nnames with std::string, not just a simple character array. 
The name and \nnameLength fields in creatureInfo are, in fact, part of a std::string field, and \nthe name character array is a union of dataA and dataP members. Super Bossman \nSupreme is longer than 15 characters, and because the bot was not aware of \nthe std::string implementation, it didn’t recognize the boss. Instead, it con-\nstantly retargeted summoned Demon creatures, effectively keeping you from \ntargeting the boss while he slowly drained your health and supplies.\nDetermining Whether Data Is Stored in a std::string\nWithout knowing how the std::string class is structured, you’d have trouble \ntracking down bugs like the hypothetical one I just described. But pair what \nyou’ve learned here with experience, and you can avoid these kinds of bugs \nentirely. When you find a string like name in memory, don’t just assume it’s \nstored in a simple array. To figure out whether a string is in fact a std::string, \nask yourself these questions:\n• \nWhy is the string length present for a null-terminated string? If you \ncan’t think of a good reason, then you may have a std::string on your \nhands.\n• \nDo some creatures (or other game elements, depending on what you’re \nlooking for) have names longer than 16 letters, but you find room for \nonly 16 characters in memory? If so, the data is almost definitely stored \nin a std::string.\n108 Chapter 5\n• \nIs the name stored in place, requiring the developer to use strcpy() \nto modify it? It’s probably a std::string, because working with raw C \nstrings in this way is considered bad practice.\nFinally, keep in mind that there is also a class called std::wstring that is \nused to store wide strings. The implementation is very similar, but wchar_t \nis used in place of every char.\nThe std::vector Class\nGames must keep track of many dynamic arrays of data, but manag-\ning dynamically sized arrays can be very tricky. 
For speed and flexibility, \ngame developers often store such data using a templated STL class called \nstd::vector instead of a simple array. \nExamining the Structure of a std::vector \nA declaration of this class looks something like Listing 5-4.\ntemplate<typename T>\nclass vector {\n T* begin;\n T* end;\n T* reservationEnd;\n};\nListing 5-4: An abstracted std::vector object\nThis template adds an extra layer of abstraction, so I’ll continue this \ndescription using a std::vector declared with the DWORD type. Here’s how a \ngame might declare that vector:\nstd::vector<DWORD> _vec;\nNow, let’s dissect what a std::vector of DWORD objects would look like in \nmemory. If you had the address of _vec and shared the same memory space, \nyou could re-create the underlying structure of the class and access _vec as \nshown in Listing 5-5.\nclass vector {\n DWORD* begin;\n DWORD* end;\n DWORD* tail;\n};\n// point to a vector in memory\nvector* _vec = (vector*)vectorAddress;\nListing 5-5: A DWORD std::vector object\nYou can treat the member begin like a raw array, as it points to the first \nelement in the std::vector object. There is no array length member, though, \nAdvanced Memory Forensics 109\nso you must calculate the vector’s length based on begin and end, which is an \nempty object following the final object in the array. The length calculation \ncode looks like this:\nint length() {\n return ((DWORD)_vec->end - (DWORD)_vec->begin) / sizeof(DWORD);\n}\nThis function simply subtracts the address stored in begin from the \naddress stored in end to find the number of bytes between them. Then, to \ncalculate the number of objects, it divides the number of bytes by the num-\nber of bytes per object. \nUsing begin and this length() function, you can safely access elements in \n_vec. 
That code would look something like this:\nDWORD at(int index) {\n if (index >= _vec->length())\n throw new std::out_of_range();\n return _vec->begin[index];\n}\nGiven an index, this code will fetch an item from the vector. But if \nthe index is greater than the vector’s length, a std::out_of_range exception \nwill be thrown. Adding values to a std::vector would be very expensive if \nthe class couldn’t reserve or reuse memory, though. To remedy this, the \nclass implements a function called reserve() that tells the vector how many \nobjects to leave room for. \nThe absolute size of a std::vector (its capacity) is determined through an \nadditional pointer, which is called tail in the vector class we’ve re-created. \nThe calculation for the capacity resembles the length calculation:\nint capacity() {\n return ((DWORD)_vec->tail - (DWORD)_vec->begin) / sizeof(DWORD);\n}\nTo find the capacity of a std::vector, instead of subtracting the begin \naddress from the end address, as you would to calculate length, this function \nsubtracts the begin address from tail. Additionally, you can use this calcula-\ntion a third time to determine the number of free elements in the vector by \nusing tail and end instead:\nint freeSpace() {\n return ((DWORD)_vec->tail - (DWORD)_vec->end) / sizeof(DWORD);\n}\nGiven proper memory reading and writing functions, you can use the \ndeclaration in Listing 5-4 and the calculations that follow to access and \nmanipulate vectors in the memory of a game. Chapter 6 discusses reading \nmemory in detail, but for now, let’s look at ways you can determine whether \ndata you’re interested in is stored in a std::vector.\n110 Chapter 5\nDetermining Whether Data Is Stored in a std::vector\nOnce you’ve found an array of data in a game’s memory, there are a few \nsteps you can follow to determine whether it is stored in a std::vector. 
First, \nyou can be sure that the array is not stored in a std::vector if it has a static \naddress, because std::vector objects require pointer paths to access the \nunderlying array. If the array does require a pointer path, having a final off-\nset of 0 would indicate a std::vector. To confirm, you can change the final \noffset to 4 and check if it points to the final object in the array instead of \nthe first one. If so, you’re almost definitely looking at a vector, as you’ve just \nconfirmed the begin and end pointers.\nThe std::list Class\nSimilar to std::vector, std::list is a class that you can use to store a collec-\ntion of items in a linked list. The main differences are that std::list doesn’t \nrequire a contiguous storage space for elements, cannot directly access ele-\nments by their index, and can grow in size without affecting any previous \nelements. Due to the overhead required to access items, it is rare to see this \nclass used in games, but it shows up in some special cases, which I’ll discuss \nin this section. \nExamining the Structure of a std::list \nThe std::list class looks something like Listing 5-6.\ntemplate<typename T>\nclass listItem {\n listItem<T>* next;\n listItem<T>* prev;\n T value;\n};\ntemplate<typename T>\nclass list {\n listItem<T>* root;\n int size;\n};\nListing 5-6: An abstracted std::list object\nThere are two classes here: listItem and list. To avoid extra abstraction \nwhile explaining how std::list works, I’ll describe this object as it would \nlook when the type is DWORD. 
Here’s how a game would declare a std::list of \nthe DWORD type: \nstd::list<DWORD> _lst;\nGiven that declaration, the std::list is structured like the code in \nListing 5-7.\nAdvanced Memory Forensics 111\nclass listItem {\n listItem* next;\n listItem* prev;\n DWORD value;\n};\nclass list {\n listItem* root;\n int size;\n};\n// point to a list\nlist* _lst = (list*)listAddress;\nListing 5-7: A DWORD std::list object\nThe class list represents the list header, while listItem represents a \nvalue stored in the list. Instead of being stored contiguously, the items \nin the list are stored independently. Each item contains a pointer to the \nitem that comes after it (next) and the \none that comes before it (prev), and \nthese pointers are used to locate \nitems in the list. The root item acts \nas a marker for the end of the list; the \nnext pointer of the last item points to \nroot, as does the prev pointer of the \nfirst item. The root item’s next and \nprev pointers also point to the first \nitem and the last item, respectively. \nFigure 5-5 shows what this looks like.\nGiven this structure, you can use the following code to iterate over a \nstd::list object:\n// iterate forward\nlistItem* it = _lst->root->next;\nfor (; it != _lst->root; it = it->next)\n printf(\"Value is %d\\n\", it->value);\n// iterate backward\nlistItem* it = _lst->root->prev;\nfor (; it != _lst->root; it = it->prev)\n printf(\"Value is %d\\n\", it->value);\nThe first loop starts at the first item (root->next) and iterates forward \n(it = it->next) until it hits the end marker (root). The second loop starts \nat the last item (root->prev) and iterates backward (it = it->prev) until it \nhits the end marker (root). This iteration relies on next and prev because \nunlike objects in an array, objects in a std::list are not contiguous. Since \nthe memory of each object in a std::list is not contiguous, there’s no \nquick-and-dirty way to calculate the size. 
Instead, the class just defines a \nFigure 5-5: A std::list flowchart\nRoot\nItem\nItem\nItem\n112 Chapter 5\nsize member. Additionally, the concept of reserving space for new objects \nis irrelevant for lists, so there’s no variable or calculation to determine a \nlist’s capacity.\nDetermining Whether Game Data Is Stored in a std::list\nIdentifying objects stored in the std::list class can be tricky, but there are \na few hints you can watch for. First, items in a std::list cannot have static \naddresses, so if the data you seek has a static address, then you’re in the clear. \nItems that are obviously part of a collection may, however, be part of a \nstd::list if they’re not contiguous in memory. \nAlso consider that objects in a std::list can have infinitely long pointer \nchains (think it->prev->next->prev->next->prev . . .), and pointer scanning \nfor them in Cheat Engine will show many more results when No Looping \nPointers is turned off. \nYou can also use a script to detect when a value is stored in a linked list. \nListing 5-8 shows a Cheat Engine script that does just this.\nfunction _verifyLinkedList(address)\n local nextItem = readInteger(address) or 0\n local previousItem = readInteger(address + 4) or 0\n local nextItemBack = readInteger(nextItem + 4)\n local previousItemForward = readInteger(previousItem)\n return (address == nextItemBack\n and address == previousItemForward)\nend\nfunction isValueInLinkedList(valueAddress)\n for address = valueAddress - 8, valueAddress - 48, -4 do\n if (_verifyLinkedList(address)) then\n return address\n end\n end\n return 0\nend\nlocal node = isValueInLinkedList(addressOfSomeValue)\nif (node > 0) then\n print(string.format(\"Value in LL, top of node at 0x0%x\", node))\nend\nListing 5-8: Determining whether data is in a std::list using a Cheat Engine Lua script\nThere’s quite a bit of code here, but what it’s doing is actually pretty \nsimple. 
The isValueInLinkedList() function takes an address of some value \nand then looks backward for up to 40 bytes (10 integer objects, in case the \nAdvanced Memory Forensics 113\nvalue is in some larger structure), starting 8 bytes above the address (two \npointers must be present, and they are 4 bytes each). Because of memory \nalignment, this loop iterates in steps of 4 bytes. \nOn each iteration, the address is passed to the _verifyLinkedList() func-\ntion, which is where the magic happens. If we look at it in terms of linked \nlist structure as defined in this chapter, the function simply does this:\nreturn (node->next->prev == node && node->prev->next == node)\nThat is, the function basically assumes the memory address it’s given \npoints to a linked list, and it makes sure the supposed node has valid next \nand previous nodes. If the nodes are valid, the assumption was correct and \nthe address is that of a linked list node. If the nodes don’t exist or don’t \npoint to the right locations, the assumption was wrong and the address is \nnot part of a linked list.\nKeep in mind that this script won’t give you the address of the list’s root \nnode but simply the address of the node containing the value you’ve given \nit. To properly traverse a linked list, you’ll need to scan for a valid pointer \npath to the root node, so you’ll need its address. \nFinding that address can require some searching of memory dumps, \na lot of trial and error, and a ton of head scratching, but it’s definitely pos-\nsible. The best way to start is to follow the chain of prev and next nodes until \nyou find a node with data that is either blank, nonsensical, or filled with the \nvalue 0xBAADF00D (some, but not all, standard library implementations use \nthis value to mark root nodes). \nThis investigation can also be made easier if you know exactly how \nmany nodes are in the list. 
Even without the list header, you can determine \nthe number of nodes by continuously following the next pointer until you \nend up back at your starting node, as in Listing 5-9.\nfunction countLinkedListNodes(nodeAddress)\n local counter = 0\n local next = readInteger(nodeAddress)\n while (next ~= nodeAddress) do\n counter = counter + 1\n next = readInteger(next)\n end\n return counter\nend\nListing 5-9: Determining the size of an arbitrary std::list using a Cheat Engine Lua script\nFirst, this function creates a counter to store the number of nodes and \na variable to store the next node’s address. The while loop then iterates over \nthe nodes until it ends up back at the initial node. Finally, it returns the \ncounter variable, which was incremented on every iteration of the loop.\n114 Chapter 5\nThe std::map Class\nLike a std::list, a std::map uses links between elements to form its structure. \nUnique to std::map, however, is the fact that each element stores two pieces \nof data (a key and a value), and sorting the elements is an inherent prop-\nerty of the underlying data structure: a red-black tree. The following code \nshows the structures that compose a std::map. \ntemplate<typename keyT, typename valT>\nstruct mapItem {\n mapItem<keyT, valT>* left;\n mapItem<keyT, valT>* parent;\n mapItem<keyT, valT>* right;\n keyT key;\n valT value;\n};\ntemplate<typename keyT, typename valT>\nstruct map {\n DWORD irrelevant;\n mapItem<keyT, valT>* rootNode;\n int size;\n}\nA red-black tree is a self-balancing binary search tree, so a std::map is, \ntoo. In the STL’s std::map implementation, each element (or node) in the \ntree has three pointers: left, parent, and right. In addition to the point-\ners, each node also has a key and a value. The nodes are arranged in the \ntree based on a comparison between their keys. The left pointer of a node \npoints to a node with a smaller key, and the right pointer points to a node \nwith a larger key. The parent points to the upper node. The first node in the \ntree is called the rootNode, and nodes that lack children point to it. 
\nVisualizing a std::map\nFigure 5-6 shows a std::map that has the keys 1, 6, 8, 11, 13, 15, 17, 22, 25, \nand 27.\nFind the Root Node with a Script\nIt’s actually possible to write a script that can find the root node, but I’ll leave \nit as an optional exercise for you. How does it work? Well, the root node must \nbe in the chain of nodes, the list header points to the root, and the size of the \nlist will immediately follow the root in memory. Given this information, you can \nwrite a script that will search for any memory containing a pointer to one of the \nlist’s nodes, followed by the size of the list. More often than not, this piece of \nmemory is the list header, and the node it points to is the root node.\nAdvanced Memory Forensics 115\nRoot\n13\n8\n17\n1\n11\n15\n25\n22\n27\nRoot\nRoot\nRoot\nRoot\nRoot\nRoot\nRoot\nRoot\nRoot\nRoot\n6\nRoot\nFigure 5-6: A red-black tree\nThe top node (holding the value 13) is pointed to by the parent of \nrootNode. Everything to the left of it has a smaller key, and everything to the \nright has a greater key. This is true for any node in the tree, and this truth \nenables efficient key-based search. While not represented in the image, the \nleft pointer of the root node will point to the leftmost node (1), and the \nright pointer will point to the rightmost node (27). \nAccessing Data in a std::map\nOnce again, I’ll use a static std::map definition when discussing how to extract \ndata from the structure. Since the template takes two types, I’ll also use \nsome pseudotypes to keep things obvious. 
Here’s the declaration for the \nstd::map object I’ll reference for the rest of the section:\ntypedef int keyInt;\ntypedef int valInt;\nstd::map myMap;\nWith this declaration, the structure of myMap becomes:\nstruct mapItem {\n mapItem* left;\n mapItem* parent;\n mapItem* right;\n keyInt key;\n valInt value;\n};\nstruct map {\n DWORD irrelevant;\n mapItem* rootNode;\n116 Chapter 5\n int size;\n}\nmap* _map = (map*)mapAddress;\nThere are some important algorithms that you might need to access the \ndata in a std::map structure in a game. First, blindly iterating over every item \nin the map can be useful if you just want to see all of the data. To do this \nsequentially, you could write an iteration function like this: \nvoid iterateMap(mapItem* node) {\n if (node == _map->rootNode) return;\n iterateMap(node->left);\n printNode(node);\n iterateMap(node->right);\n}\nA function to iterate over an entire map would first read the current \nnode and check whether it’s the rootNode. If not, it would recurse left, print \nthe node, and recurse right. \nTo call this function, you’d have to pass a pointer to the rootNode as \nfollows:\niterateMap(_map->rootNode->parent);\nThe purpose of a std::map, however, is to store keyed data in a quickly \nsearchable way. When you need to locate a node given a specific key, mim-\nicking the internal search algorithm is preferable to scanning the entire \ntree. The code for searching a std::map looks something like this:\nmapItem* findItem(keyInt key, mapItem* node) {\n if (node != _map->rootNode) {\n if (key == node->key)\n return node;\n else if (key < node->key)\n return findItem(key, node->left);\n else\n return findItem(key, node->right);\n } else return NULL;\n}\nStarting at the top of the tree, you simply recurse left if the current key \nis greater than the search key and recurse right if it is smaller. If the keys are \nequal, you return the current node. 
If you reach the bottom of the tree and \ndon’t find the key, you return NULL because the key isn’t stored in the map.\nHere’s one way you might use this findItem() function:\nmapItem* ret = findItem(someKey, _map->rootNode->parent);\nif (ret)\n printNode(ret); \nAdvanced Memory Forensics 117\nAs long as findItem() doesn’t return NULL, this code should print a node \nfrom _map.\nDetermining Whether Game Data Is Stored in a std::map \nTypically, I don’t even consider whether data could be in a std::map until I \nknow the collection is not an array, a std::vector, or a std::list. If you rule \nout all three options, then as with a std::list, you can look at the three inte-\nger values before the value and check if they point to memory that could \npossibly be other map nodes. \nOnce again, this can be done with a Lua script in Cheat Engine. The \nscript is similar to the one I showed for lists, looping backward over memory \nto see if a valid node structure is found before the value. Unlike the list code, \nthough, the function that verifies a node is much trickier. Take a look at the \ncode in Listing 5-10, and then I’ll dissect it.\nfunction _verifyMap(address)\n local parentItem = readInteger(address + 4) or 0\n local parentLeftItem = readInteger(parentItem + 0) or 0\n local parentRightItem = readInteger(parentItem + 8) or 0\n local validParent =\n parentLeftItem == address\n or parentRightItem == address\n if (not validParent) then return false end\n local tries = 0\n local lastChecked = parentItem\n local parentsParent = readInteger(parentItem + 4) or 0\n while (readInteger(parentsParent + 4) ~= lastChecked and tries < 200) do\n tries = tries + 1\n lastChecked = parentsParent\n parentsParent = readInteger(parentsParent + 4) or 0\n end\n return readInteger(parentsParent + 4) == lastChecked\nend\nListing 5-10: Determining whether data is in a std::map using a Cheat Engine Lua script\nGiven address, this function checks if address is in a map structure. 
It \nfirst checks if there’s a valid parent node and, if so, checks whether that \nparent node points to address on either side. But this check isn’t enough. \nIf the check passes, the function will also climb up the line of parent nodes \nuntil it reaches a node that is the parent of its own parent, trying 200 \ntimes before calling it quits. If the climb succeeds in finding a node that \nis its own grandparent, then address definitely points to a map node. This \nworks because, as I outlined in “Visualizing a std::map” on page 114, at the \ntop of every map is a root node whose parent points to the first node in the \ntree, and that node’s parent points back to the root node.\n118 Chapter 5\nNote\nI bet you didn’t expect to run into the grandfather paradox from time travel when \nreading a game-hacking book!\nUsing this function and a slightly modified backtracking loop from \nListing 5-8, you can automatically detect when a value is inside a map:\nfunction isValueInMap(valueAddress)\n for address = valueAddress - 12, valueAddress - 52, -4 do\n if (_verifyMap(address)) then\n return address\n end\n end\n return 0\nend\nlocal node = isValueInMap(addressOfSomeValue)\nif (node > 0) then\n print(string.format(\"Value in map, top of node at 0x0%x\", node))\nend\nAside from function names, the only change in this code from \nListing 5-8 is that it starts looping 12 bytes before the value instead of 8, \nbecause a map has three pointers instead of the two in a list. One good \nconsequence of a map’s structure is that it’s easy to obtain the root node. \nWhen the _verifyMap function returns true, the parentsParent variable will \ncontain the address of the root node. With some simple modifications, \nyou could return this to the main call and have everything you need to \nread the data from a std::map in one place.\nClosing Thoughts\nMemory forensics is the most time-consuming part of hacking games, and \nits obstacles can appear in all shapes and sizes. 
Using purpose, patterns, \nand a deep understanding of complex data structures, however, you can \nquickly overcome these obstacles. If you’re still a bit confused about what’s \ngoing on, make sure to download and play with the example code provided, \nas it contains proofs of concept for all of the algorithms covered in this \nchapter.\nIn Chapter 6, we’ll start diving into the code you need to read from \nand write to a game’s memory from your own programs so you can take \nthe first step in putting to work all of this information about memory struc-\ntures, addresses, and data. \n6\nReading from and \nWriting to Game Memory\nEarlier chapters discussed how memory \nis structured as well as how to scan and \nmodify memory using Cheat Engine and \nOllyDbg. Working with memory will be essen-\ntial when you begin to write bots, and your code will \nneed to know how to do so.\nThis chapter digs into the code-level details of memory manipulation. \nFirst, you’ll learn how to use code to locate and obtain handles to game \nprocesses. Next, you’ll learn how to use those handles to read from and \nwrite to memory both from remote processes and from injected code. To \nwrap up, you’ll learn bypasses for a certain memory protection technique, \ncomplete with a small example of code injection. You’ll find the example \ncode for this chapter in the GameHackingExamples/Chapter6_AccessingMemory \ndirectory in this book’s source files.\nNote\nWhen I talk about API functions in this chapter (and in later ones), I’m referring to \nthe Windows API unless otherwise specified. If I don’t mention a header file for the \nlibrary, you can assume it is Windows.h.\n120 Chapter 6\nObtaining the Game’s Process Identifier\nTo read from or write to a game’s memory, you need its process identifier (PID), \na number that uniquely identifies an active process. 
If the game has a visible \nwindow, you can obtain the PID of the process that created that window by \ncalling GetWindowThreadProcessId(). This function takes the window’s handle \nas the first parameter and outputs the PID to the second parameter. You \ncan find the window’s handle by passing its title (the text on the taskbar) as \nthe second parameter to FindWindow(), as shown in Listing 6-1.\nHWND myWindow =\n FindWindow(NULL, \"Title of the game window here\");\nDWORD PID;\nGetWindowThreadProcessId(myWindow, &PID);\nListing 6-1: Fetching a window’s handle to obtain a PID\nWith the window handle secured, all you have to do is create a place to \nstore the PID and call GetWindowThreadProcessId(), as shown in this example.\nIf a game isn’t windowed or the window name isn’t predictable, you \ncan find the game’s PID by enumerating all processes and looking for \nthe name of the game binary. Listing 6-2 does this using the API func-\ntions CreateToolhelp32Snapshot(), Process32First(), and Process32Next() from \ntlhelp32.h.\n#include \nPROCESSENTRY32 entry;\nentry.dwSize = sizeof(PROCESSENTRY32);\nHANDLE snapshot =\n CreateToolhelp32Snapshot(TH32CS_SNAPPROCESS, NULL);\nif (Process32First(snapshot, &entry) == TRUE) {\n while (Process32Next(snapshot, &entry) == TRUE) {\n wstring binPath = entry.szExeFile;\n if (binPath.find(L\"game.exe\") != wstring::npos) {\n printf(\"game pid is %d\\n\", entry.th32ProcessID);\n break;\n }\n }\n}\nCloseHandle(snapshot);\nListing 6-2: Fetching a game’s PID without the window name\nListing 6-2 might look a bit more complex than Listing 6-1, but \nunderneath all that code, the function is actually like a canonical for \n(iterator; comparator; increment) loop. The CreateToolhelp32Snapshot() func-\ntion obtains a list of processes named snapshot, and entry is an iterator over \nthat list. The value returned by Process32First() initializes the iterator, \nwhile Process32Next() increments it. 
Finally, the Boolean return value of \nReading from and Writing to Game Memory 121\nProcess32Next() is the comparator. This code just iterates over a snapshot of \nevery running process, looks for one whose binary path contains the text \ngame.exe, and prints its PID.\nObtaining Process Handles\nOnce you know a game’s PID, you can obtain a handle to the process itself \nusing an API function called OpenProcess(). This function allows you to fetch \nhandles with the access levels you need to read from and write to memory. \nThis is crucial to game hacking, as any function that operates on a process \nwill require a handle with proper access. \nLet’s take a look at the prototype of OpenProcess():\nHANDLE OpenProcess(DWORD DesiredAccess, BOOL InheritHandle, DWORD ProcessId);\nThe first parameter, DesiredAccess, expects one or a mixture of process \naccess flags to set on the handle that OpenProcess() returns. There are many \nflags you can use, but these are the most common in game hacking:\nPROCESS_VM_OPERATION The returned handle can be used with \nVirtualAllocEx(), VirtualFreeEx(), and VirtualProtectEx() to allocate, \nfree, and protect chunks of memory, respectively.\nPROCESS_VM_READ The returned handle can be used with \nReadProcessMemory().\nPROCESS_VM_WRITE The returned handle can be used with \nWriteProcessMemory(), but it must also have PROCESS_VM_OPERATION rights. \nYou can set both flags by passing PROCESS_VM_OPERATION | PROCESS_VM_WRITE \nas the DesiredAccess parameter.\nPROCESS_CREATE_THREAD The returned handle can be used with \nCreateRemoteThread().\nPROCESS_ALL_ACCESS The returned handle can be used to do anything. \nAvoid using this flag, as it can only be used by processes with debug \nprivileges enabled and has compatibility issues with older versions of \nWindows.\nWhen fetching a handle to a game, you can typically just set the \nOpenProcess() function’s second parameter, InheritHandle, to false. 
The \nthird parameter, ProcessId, expects the PID of the process to be opened. \nWorking with OpenProcess() \nNow let’s walk through an example call to OpenProcess() that uses a handle \nwith access permissions allowing it to read from and write to memory:\nDWORD PID = getGamePID();\nHANDLE process = OpenProcess(\n PROCESS_VM_OPERATION |\n PROCESS_VM_READ |\n PROCESS_VM_WRITE,\n122 Chapter 6\n FALSE,\n PID\n);\n if (process == INVALID_HANDLE_VALUE) {\n printf(\"Failed to open PID %d, error code %d\",\n PID, GetLastError());\n}\nFirst, the call to getGamePID() fetches the PID you’re looking for. (The \nfunction is something you’ll have to write yourself, though it could just be \none of the snippets I showed in Listings 6-1 and 6-2, fleshed out into a full-\nblown function.) Then, the code calls OpenProcess() with three flags: the \nPROCESS_VM_OPERATION flag gives this handle memory access permissions, and \nthe other two combined give it read and write permissions. This example \nalso contains an error-handling case , but as long as you have the correct \nPID, you have valid access flags, and your code is running under the same \nor higher permissions as the game (for example, if you start your bot using \nRun As Admin), the call should never fail. \nOnce you’re done using a handle, clean it up using CloseHandle() as \nfollows:\nCloseHandle(process);\nYou can reuse handles as much as you want, so you can leave one open \nuntil you’re completely done using it or until your bot is exited. \nNow that you’ve seen how to open a process handle in preparation for \nmanipulating game memory, let’s dig into how to actually access the mem-\nory of that process.\nAccessing Memory\nThe Windows API exposes two functions that are crucial to memory access: \nReadProcessMemory() and WriteProcessMemory(). You can use these functions to \nexternally manipulate a game’s memory. 
\nWorking with ReadProcessMemory() and WriteProcessMemory() \nThe prototypes for these two functions (shown in Listing 6-3) resemble \neach other closely, and you’ll follow almost exactly the same steps to \nuse them. \nBOOL ReadProcessMemory(\n HANDLE Process, LPVOID Address,\n LPVOID Buffer, DWORD Size,\n DWORD *NumberOfBytesRead\n);\nBOOL WriteProcessMemory(\n HANDLE Process, LPVOID Address,\n LPCVOID Buffer, DWORD Size,\nReading from and Writing to Game Memory 123\n DWORD *NumberOfBytesWritten\n);\nListing 6-3: ReadProcessMemory() and WriteProcessMemory() prototypes\nBoth functions expect Process to be a process handle and Address to be \nthe target memory address. When the function is reading from memory, \nBuffer is expected to point to an object that will hold the read data. When \nthe function is writing to memory, Buffer is expected to point to the data \nto write. In both cases, Size defines the size of Buffer, in bytes. The final \nparameter to both functions is used to optionally return the number of \nbytes that were accessed; you can safely set it to NULL. Unless the function \nfails, the value returned in the final parameter should be equal to Size. \nAccessing a Value in Memory with ReadProcessMemory() and \nWriteProcessMemory() \nThe code in Listing 6-4 shows how you might use these functions to access a \nvalue in memory.\nDWORD val;\nReadProcessMemory(proc, adr, &val, sizeof(DWORD), 0);\nprintf(\"Current mem value is %d\\n\", val);\nval++;\nWriteProcessMemory(proc, adr, &val, sizeof(DWORD), 0);\nReadProcessMemory(proc, adr, &val, sizeof(DWORD), 0);\nprintf(\"New mem value is confirmed as %d\\n\", val);\nListing 6-4: Reading from and writing to process memory using the Windows API\nBefore code like this appears in a program, you need to find the \nPID (proc) as described in “Obtaining the Game’s Process Identifier” on \npage 120, as well as the memory address (adr) you want to read from or \nwrite to. 
With those values in place, the ReadProcessMemory() function stores \na fetched value from memory in val. Then, the code increments val and \nreplaces the original value by calling WriteProcessMemory(). After the write \ntakes place, ReadProcessMemory() is called on the same address to confirm the \nnew memory value. Notice that val isn’t actually a buffer. Passing &val as the \nBuffer parameter works because it can be a pointer to any static memory \nstructure, as long as Size matches.\nWriting Templated Memory Access Functions\nOf course, the example in Listing 6-4 assumes you already know what type \nof memory you’re dealing with, and it hardcodes the type as DWORD. To be a \nversatile game hacker, it’s better to have some generic code in your toolbox \nto avoid duplicating code for different types. Generic memory reading and \nwriting functions that support different types might look like Listing 6-5.\n124 Chapter 6\ntemplate\nT readMemory(HANDLE proc, LPVOID adr) {\n T val;\n ReadProcessMemory(proc, adr, &val, sizeof(T), NULL);\n return val;\n}\ntemplate\nvoid writeMemory(HANDLE proc, LPVOID adr, T val) {\n WriteProcessMemory(proc, adr, &val, sizeof(T), NULL);\n}\nListing 6-5: Generic memory functions\nThese functions use C++ templates to accept arbitrary types as argu-\nments. They allow you to access memory with whatever types you like in a \nvery clean way. For example, given these readMemory() and writeMemory() tem-\nplates I just showed, you could make the calls in Listing 6-6.\nDWORD value = readMemory(proc, adr); // read\nwriteMemory(proc, adr, value++); // increment and write\nListing 6-6: Calling templated memory access functions\nCompare this to the calls to WriteProcessMemory() and ReadProcessMemory() \nin Listing 6-4. This code still reads a value, increments it, and writes the \nnew value to memory. 
But since the templated functions let you specify the \ntype when you call them, you don’t need a new readMemory() and writeMemory() \nfunction for every data type you might need to work with. That’s much \ncleaner, since you’ll often want to work with all kinds of data.\nMemory Protection\nWhen memory is allocated by a game (or any program), it is placed in \na page. In x86 Windows, pages are chunks of 4,096 bytes that store data. \nBecause all memory must be within a page, the minimal allocation unit \nis 4,096 bytes. The operating system can place memory chunks smaller \nthan 4,096 bytes as a subset of an existing page that has enough uncommit-\nted space, in a newly allocated page, or across two contiguous pages that \nhave the same attributes. \nMemory chunks 4,096 bytes or larger span n pages, where n is \n$n = \\frac{\\text{memory size}}{4{,}096}$.\nThe operating system typically looks for room in existing pages when \nallocating memory, but it allocates new pages on demand if necessary.\nNote\nIt’s also possible for large chunks to span n + 1 pages, as there’s no guarantee that a \nchunk begins at the start of a page.\nReading from and Writing to Game Memory 125\nThe important thing to understand about memory pages is that each \npage has a set of specific attributes. Most of these attributes are transparent \nin user mode, but there’s one you should be extra conscious of when work-\ning with memory: protection.\nDifferentiating x86 Windows Memory Protection Attributes\nThe memory-reading techniques you’ve learned so far are very basic. They \nassume that the memory you’re accessing is protected with the PAGE_READWRITE \nattribute. While this assumption is correct for variable data, other types of \ndata exist on pages with different types of protection. 
Table 6-1 describes \nthe different types of memory protection in x86 Windows.\nTable 6-1: Memory Protection Types\nProtection type\nValue\nRead \npermission?\nWrite \npermission?\nExecute \npermission?\nSpecial \npermissions?\nPAGE_NOACCESS\n0x01\nNo\nNo\nNo\nPAGE_READONLY\n0x02\nYes\nNo\nNo\nPAGE_READWRITE\n0x04\nYes\nYes\nNo\nPAGE_WRITECOPY\n0x08\nYes\nYes\nNo\nYes, copy on write\nPAGE_EXECUTE\n0x10\nNo\nNo\nYes\nPAGE_EXECUTE_READ\n0x20\nYes\nNo\nYes\nPAGE_EXECUTE_READWRITE\n0x40\nYes\nYes\nYes\nPAGE_EXECUTE_WRITECOPY\n0x80\nYes\nYes\nYes\nYes, copy on write\nPAGE_GUARD\n0x100\nNo\nNo\nNo\nYes, guard page\nIf a protection type in Table 6-1 has a Yes in any permission column, it \nmeans the action in question can be performed on that page of memory. \nFor example, if a page is PAGE_READONLY, then a program can read the mem-\nory on that page, but the program cannot write to that memory. \nConstant strings, for example, are usually stored with PAGE_READONLY pro-\ntection. Other constant data, such as virtual function tables and a module’s \nentire Portable Executable (PE) header (which contains information about a \nprogram, such as the kind of application it is, library functions it uses, its \nsize, and so on), are also stored on read-only pages. Assembly code, on the \nother hand, is stored on pages protected with PAGE_EXECUTE_READ.\nMost protection types involve only some combination of read, write, \nand execute protection. For now, you can safely ignore special protection \ntypes; I cover them in “Special Protection Types” on page 126 if you’re \ncurious, but only very advanced hacks will ever require knowledge of them. \nThe basic protection types, though, will be prevalent in your game-hacking \nadventures. \n126 Chapter 6\nChanging Memory Protection\nWhen you want to hack a game, you’ll sometimes need to access memory in \na way that is forbidden by the memory page’s protection, making it impor-\ntant to be able to change memory protection at will. 
Luckily, the Windows \nAPI provides the VirtualProtectEx() function for this purpose. This is the \nfunction’s prototype:\nBOOL VirtualProtectEx(\n HANDLE Process, LPVOID Address,\n DWORD Size, DWORD NewProtect,\n PDWORD OldProtect\n);\nThe parameters Process, Address, and Size take the same input as they \ndo in the ReadProcessMemory() and WriteProcessMemory() functions. NewProtect \nshould specify the new protection flags for the memory, and OldProtect can \noptionally point to a DWORD where the old protection flags will be stored. \nSpecial Protection Types\nTwo protection types in Table 6-1 include copy-on-write protection. When \nmultiple processes have pages of memory that are identical (such as pages \nwith mapped system DLLs), copy-on-write protection is used to conserve memory. \nThe actual data is stored in only one physical place, and the operating system \nvirtually maps all memory pages containing that data to the physical location. If \na process sharing the memory makes a change to it, a copy of the data will be \nmade in physical memory, the change will be applied, and the memory page(s) \nfor that process will be remapped to the new physical memory. When a copy \non write happens, the protection for all affected pages changes accordingly; \nPAGE_WRITECOPY will become PAGE_READWRITE, and PAGE_EXECUTE_WRITECOPY will \nbecome PAGE_EXECUTE_READWRITE. I’ve found no game hacking–specific uses for \ncopy-on-write pages, but it’s useful to understand them.\nPages can also be created with guard protection. Guarded pages must \nhave a secondary protection, defined like PAGE_GUARD | PAGE_READONLY. When \nthe program tries to access a guarded page, the operating system will throw \na STATUS_GUARD_PAGE_VIOLATION exception. Once the exception is handled, the \nguard protection is removed from the page, leaving only the secondary protec-\ntion. 
One way in which the operating system uses this type of protection is to \ndynamically expand the call stack by placing a guarded page at the top and \nallocating more memory when that guarded page is hit. Some memory analy-\nsis tools place guarded pages after heap memory to detect heap corruption \nbugs. In the context of game hacking, a guarded page can be used as a trip \nwire that tells you when a game might be attempting to detect your code within \nits memory.\nReading from and Writing to Game Memory 127\nThe most granular scale for memory protection is per page, which means \nVirtualProtectEx() will set the new protection to every page that is on or \nbetween Address and Address + Size - 1. \nN o t e \t\nThe VirtualProtectEx() function has a sister called VirtualProtect(). They work \nthe same way, but VirtualProtect() operates only on the process calling it and, thus, \ndoes not have a process handle parameter.\nWhen you’re writing your own code to change memory protections, I \nsuggest making it flexible by creating a template. A generic wrapped func-\ntion for VirtualProtectEx() should look something like Listing 6-7.\ntemplate\nDWORD protectMemory(HANDLE proc, LPVOID adr, DWORD prot) {\n DWORD oldProt;\n VirtualProtectEx(proc, adr, sizeof(T), prot, &oldProt);\n return oldProt;\n}\nListing 6-7: A generic function to change memory protection\nWith this template in place, if you wanted to, say, write a DWORD to a mem-\nory page without write permission, you might do something like this:\nprotectMemory(process, address, PAGE_READWRITE)\nwriteMemory(process, address, newValue)\nFirst, this sets the protection on the memory to change to PAGE_READWRITE. \nWith write permission granted, the door is open to call writeMemory() and \nchange the data at address.\nWhen you’re changing memory protection, it’s best practice to let the \nchange persist only as long as needed and restore the original protection \nas soon as possible. 
This is less efficient, but it ensures that a game doesn’t \ndetect your bot (for example, by noticing that some of its assembly code \npages have become writable). \nA typical write operation on read-only memory should look like this:\nDWORD oldProt =\n protectMemory(process, address, PAGE_READWRITE);\nwriteMemory(process, address, newValue);\nprotectMemory(process, address, oldProt);\nThis code calls the protectMemory() function from Listing 6-7 to change \nthe protection to PAGE_READWRITE. It then writes newValue to the memory before \nchanging the protection back to oldProt, which was set to the page’s original \nprotection by the initial call to protectMemory(). The writeMemory() function \nused here is the same one defined in Listing 6-5.\nA final important point is that when you’re manipulating a game’s \nmemory, it’s entirely possible that the game will access the memory at \nthe same time as you. If the new protection that you set is not compatible \n128 Chapter 6\nwith the original protection, the game process will get an ACCESS_VIOLATION \nexception and crash. For instance, if you change memory protection from \nPAGE_EXECUTE to PAGE_READWRITE, the game might try to execute the code on \nthe page(s) when the memory is not marked as executable. In this case, \nyou’d want to instead set the memory protection to PAGE_EXECUTE_READWRITE to \nensure that you can operate on the memory while still allowing the game to \nexecute it.\nAddress Space Layout Randomization\nSo far, I’ve described memory addresses as static integers that change only \nas the binary changes. This model is correct on Windows XP and earlier. \nOn later Windows systems, however, memory addresses are only static rela-\ntive to the base address of the game binary, because these systems enable \na feature called address space layout randomization (ASLR) for supported bina-\nries. 
When a binary is compiled with ASLR support (enabled by default on \nMSVC++ 2010 and many other compilers), its base address can be different \nevery time it is run. Conversely, non-ASLR binaries will always have a base \naddress of 0x400000. \nNote\nSince ASLR doesn’t work on XP, I’ll call 0x400000 the XP-base.\nDisabling ASLR to Simplify Bot Development\nTo keep development simple, you can disable ASLR and use addresses with \nthe transparent XP-base. To do so, enter a single command in the Visual \nStudio Command Prompt:\n> editbin /DYNAMICBASE:NO \"C:\\path\\to\\game.exe\"\nTo re-enable it, enter:\n> editbin /DYNAMICBASE \"C:\\path\\to\\game.exe\" \nBypassing ASLR in Production\nDisabling ASLR is suitable for bot development, but it is a no-no for produc-\ntion; end users cannot be expected to turn off ASLR. Instead, you can write \na function to dynamically rebase addresses at runtime. If you use addresses \nwith the XP-base, the code to do a rebase would look like this:\nDWORD rebase(DWORD address, DWORD newBase) {\n DWORD diff = address - 0x400000;\n return diff + newBase;\n}\nReading from and Writing to Game Memory 129\nWhen you know the base address of the game (newBase), this function \nallows you to essentially ignore ASLR by rebasing address.\nTo find newBase, however, you need to use the GetModuleHandle() function. \nWhen the parameter to GetModuleHandle() is NULL, it always returns a handle \nto the main binary in a process. The function’s returned type is HMODULE, but \nthe value is actually just the address where the binary is mapped. This is the \nbase address, so you can directly cast it to a DWORD to get newBase. Since you’re \nlooking for the base address in another process, though, you need a way to \nexecute the function in the context of that process. \nTo do this, call GetModuleHandle() using the CreateRemoteThread() API func-\ntion, which can be used to spawn threads and execute code in a remote pro-\ncess. 
It has the prototype shown in Listing 6-8.\nHANDLE CreateRemoteThread(\n HANDLE Process,\n LPSECURITY_ATTRIBUTES ThreadAttributes,\n DWORD StackSize,\n LPTHREAD_START_ROUTINE StartAddress,\n LPVOID Param,\n DWORD CreationFlags,\n LPDWORD ThreadId\n);\nListing 6-8: A function that spawns a thread \nThe spawned thread will start execution on StartAddress, treating it \nas a single-parameter function with Param as input and setting the value \nreturned as the thread exit code. This is ideal, as the thread can be started \nwith StartAddress pointing to the address of GetModuleHandle() and Param set to \nNULL. You can then use the API function WaitForSingleObject() to wait until \nthe thread is done executing and get the returned base address using the \nAPI function GetExitCodeThread().\nOnce all of these things are tied together, the code to get newBase from \nan external bot should look like Listing 6-9.\nDWORD newBase;\n// get the address of kernel32.dll\nHMODULE k32 = GetModuleHandle(\"kernel32.dll\");\n// get the address of GetModuleHandle()\nLPVOID funcAdr = GetProcAddress(k32, \"GetModuleHandleA\");\nif (!funcAdr)\n funcAdr = GetProcAddress(k32, \"GetModuleHandleW\");\n// create the thread\nHANDLE thread =\n CreateRemoteThread(process, NULL, NULL,\n (LPTHREAD_START_ROUTINE)funcAdr,\n NULL, NULL, NULL);\n130 Chapter 6\n// let the thread finish\nWaitForSingleObject(thread, INFINITE);\n// get the exit code\nGetExitCodeThread(thread, &newBase);\n// clean up the thread handle\nCloseHandle(thread);\nListing 6-9: Finding the base address of a game with API functions\nThe GetModuleHandle() function is part of kernel32.dll, which has the \nsame base address in every process, so first this code gets the address for \nkernel32.dll. Since the base address of kernel32.dll is the same in every pro-\ncess, the address of GetModuleHandle() will be the same in the game as it is in \nthe external bot. 
Given the base address of kernel32.dll, this code finds the \naddress of GetModuleHandle() easily with the API function GetProcAddress(). \nFrom there, it calls the CreateRemoteThread() function from Listing 6-8, lets \nthe thread do its job, and fetches the exit code to obtain newBase.\nClosing Thoughts\nNow that you’ve seen how to manipulate memory from your own code, I’ll \nshow you how to apply the skills from Parts I and II to games. These skills \nare paramount to the concepts you’ll explore in the coming chapters, so \nmake sure you have a firm grasp on what’s happening. If you’re having \ntrouble, play with the example code as you review concepts, as it provides a \nsafe sandbox for testing and tweaking how the methods in this and earlier \nchapters behave.\nThe way Listing 6-9 tricks the game into executing GetModuleHandle() is a \nform of code injection. But that’s just a glimpse into what injection can do. \nIf you’re excited to learn more about it, dive into Chapter 7, which explores \nthis topic in detail.\nPart 3\nProcess Puppeteering\n7\nCode Injection\nImagine being able to walk into a game \ncompany’s office, sit down, and start add-\ning code to their game client. Imagine \nthat you can do this for any game you want, \nwhenever you want, and for any functionality you want. \nAlmost any gamer you talk to will have ideas on how \nto improve a game, but, as far as they know, it’s just a \npipe dream. But you know that dreams are meant to be fulfilled, and now \nthat you’ve learned a bit about how memory works, you’re ready to start \nthrowing the rules out the window. Using code injection, you can, for all \nintents and purposes, become as powerful as any game’s developers.\nCode injection is a means of forcing any process to execute foreign code \nwithin its own memory space and execution context. 
I touched on this \ntopic previously in “Bypassing ASLR in Production” on page 128, where \nI showed you how to remotely subvert ASLR using CreateRemoteThread(), but \nthat example only scratched the surface. In the first part of this chapter, \nyou’ll learn how to create code caves, inject new threads, and hijack thread \n134 Chapter 7\nexecution to force games to execute small snippets of assembly code. In the \nsecond part, you’ll learn how to inject foreign binaries directly into games, \nforcing those games to execute entire programs that you’ve created.\nInjecting Code Caves with Thread Injection\nThe first step to injecting code into another process is writing position-\nagnostic assembly code, known as shellcode, in the form of a byte array. You \ncan write shellcode to remote processes to form code caves, which act as the \nentry point for a new thread that you want a game to execute. Once a code \ncave is created, you can execute it using either thread injection or thread hijack-\ning. I’ll show you an example of thread injection in this section, and I’ll \nillustrate thread hijacking in “Hijacking a Game’s Main Thread to Execute \nCode Caves” on page 138.\nYou’ll find example code for this chapter in this book’s resource \nfiles in the directory GameHackingExamples/Chapter7_CodeInjection. Open \nmain-codeInjection.cpp to follow along as I explain how to build a simplified \nversion of the function injectCodeUsingThreadInjection() from that file. \nCreating an Assembly Code Cave\nIn “Bypassing ASLR in Production” on page 128, I used thread injection \nto call the function GetModuleHandle() by way of CreateRemoteThread() and \nobtain a process handle. In that case, GetModuleHandle() acted as the code \ncave; it had the proper code structure to act as the entry point for a new \nthread. Thread injection isn’t always that easy, though. 
\nFor example, say you want your external bot to remotely call a function \nwithin a game, and the function has this prototype:\nDWORD __cdecl someFunction(int times, const char* string);\nA few things make remotely calling this function tricky. First, it has two \nparameters, meaning you need to create a code cave that will both set up \nthe stack and properly make the call. CreateRemoteThread() allows you to pass \none argument to the code cave, and you can access that argument relative \nto ESP, but the other one would still need to be hardcoded into the code \ncave. Hardcoding the first argument, times, is easiest. Additionally, you’d \nneed to make sure that the cave properly cleans the stack. \nN o t e \t\nRecall that when bypassing ASLR in Chapter 6, I used CreateRemoteThread() to start \nnew threads by executing any arbitrary code at a given address and passing that \ncode a single parameter. That’s why these examples can pass one parameter using the \nstack.\nCode Injection 135\nUltimately, the code cave to inject that call to someFunction into a run-\nning game process would look something like this pseudocode: \nPUSH DWORD PTR:[ESP+0x4] // get second arg from stack\nPUSH times\nCALL someFunction\nADD ESP, 0x8\nRETN\nThis code cave is almost perfect, but it could be less complex. The CALL \noperation expects one of two operands: either a register with an absolute \nfunction address or an immediate integer that holds an offset to a function, \nrelative to the return address. This means you’d have to do a bunch of off-\nset calculations, which can be tedious. \nTo keep the cave position agnostic, modify it to use a register instead, \nas in Listing 7-1. 
\nPUSH DWORD PTR:[ESP+0x4] // get second arg from stack\nPUSH times\nMOV EAX, someFunction\nCALL EAX\nADD ESP, 0x8\nRETN\nListing 7-1: A code cave to call someFunction\nSince a caller knows that a function it calls will overwrite EAX with its \nreturn value, the caller should ensure that EAX doesn’t hold any critical data. \nKnowing this, you can use EAX to hold the absolute address of someFunction.\nTranslating the Assembly to Shellcode\nBecause code caves need to be written to another process’s memory, they \ncannot be written directly in assembly. Instead, you’ll need to write them \nbyte by byte. There’s no standard way to determine which bytes represent \nwhich assembly code, but there are a few hacky approaches. My personal \nfavorite is to compile an empty C++ application with the assembly code in a \nfunction and use OllyDbg to inspect that function. Alternatively, you could \nopen OllyDbg on any arbitrary process and scan through the disassembly \nuntil you find the bytes for all of the operations you need. This method is \nactually really good, as your code caves should be written as simply as pos-\nsible, meaning all of the operations should be very common. You can also \nfind charts of assembly opcodes online, but I find that they’re all pretty \nhard to read; the methods I just described are easier overall.\nWhen you know what your bytes should be, you can use C++ to easily \ngenerate the proper shellcode. Listing 7-2 shows the finished shellcode skel-\neton for the assembly in Listing 7-1.\n136 Chapter 7\nBYTE codeCave[20] = {\n 0xFF, 0x74, 0x24, 0x04, // PUSH DWORD PTR:[ESP+0x4]\n 0x68, 0x00, 0x00, 0x00, 0x00, // PUSH 0 \n 0xB8, 0x00, 0x00, 0x00, 0x00, // MOV EAX, 0x0 \n 0xFF, 0xD0, // CALL EAX\n 0x83, 0xC4, 0x08, // ADD ESP, 0x08\n 0xC3 // RETN\n};\nListing 7-2: Shellcode skeleton\nThis example creates a BYTE array containing the needed bytes of shell-\ncode. 
But the times argument needs to be dynamic, and it’s impossible to \nknow the address of someFunction at compile time, which is why this shell-\ncode is written as a skeleton. The two groups of four sequential 0x00 bytes \nare placeholders for times and the address of someFunction, and you can \ninsert the real values into your code cave at runtime by calling memcpy(), as \nin the snippet in Listing 7-3.\nmemcpy(&codeCave[5], &times, 4);\nmemcpy(&codeCave[10], &addressOfSomeFunc, 4);\nListing 7-3: Inserting times and the location of someFunction into the code cave\nBoth times and the address of someFunction are 4 bytes each (recall \nthat times is an int and addresses are 32-bit values), and they belong at \ncodeCave[5-8] and codeCave[10-13], respectively. The two calls to memcpy() pass \nthis information as parameters to fill the blanks in the codeCave array.\nWriting the Code Cave to Memory\nWith the proper shellcode created, you can place it inside the target process \nusing VirtualAllocEx() and WriteProcessMemory(). Listing 7-4 shows one way to \ndo this. \nint stringlen = strlen(string) + 1; // +1 to include null terminator\nint cavelen = sizeof(codeCave);\n int fulllen = stringlen + cavelen;\nauto remoteString = // allocate the memory with EXECUTE rights\n VirtualAllocEx(process, 0, fulllen, MEM_COMMIT, PAGE_EXECUTE);\nauto remoteCave = // keep a note of where the code cave will go\n (LPVOID)((DWORD)remoteString + stringlen);\n// write the string first\n WriteProcessMemory(process, remoteString, string, stringlen, NULL);\n// write the code cave next\n WriteProcessMemory(process, remoteCave, codeCave, cavelen, NULL);\nListing 7-4: Writing the final shellcode to a code cave memory\nCode Injection 137\nFirst, this code determines exactly how many bytes of memory it will \nneed to write the string argument and the code cave into the game’s mem-\nory, and it stores that value in fulllen. 
Then, it calls the API function \nVirtualAllocEx() to allocate fulllen bytes inside of process with PAGE_EXECUTE \nprotection (you can always use 0 and MEM_COMMIT, respectively, for the sec-\nond and fourth parameters), and it stores the address of the memory in \nremoteString. It also increments remoteString by stringlen bytes and stores \nthe result in remoteCave, as the shellcode should be written directly to the \nmemory following the string argument. Finally, it uses WriteProcessMemory() \nto fill the allocated buffer with string and the assembly bytes stored in \ncodeCave.\nTable 7-1 shows how a memory dump of the code cave might look, \nassuming that it is allocated at 0x030000, someFunction is at 0xDEADBEEF, \ntimes is set to 5, and string is pointing to the injected! text.\nTable 7-1: Code Cave Memory Dump\nAddress\nCode representation\nRaw data\nData meaning\n0x030000 \nremoteString[0-4]\n0x69 0x6E 0x6A \n0x65 0x63\ninjec\n0x030005 \nremoteString[5-9]\n0x74 0x65 0x64 \n0x21 0x00\nted!\\0\n0x03000A \nremoteCave[0-3]\n0xFF 0x74 0x24 \n0x04\nPUSH DWORD \nPTR[ESP+0x4]\n0x03000E \nremoteCave[4-8]\n0x68 0x05 0x00 \n0x00 0x00\nPUSH 0x05\n0x030013 \nremoteCave[9-13]\n0xB8 0xEF 0xBE \n0xAD 0xDE\nMOV EAX, 0xDEADBEEF\n0x030018 \nremoteCave[14-15]\n0xFF 0xD0\nCALL EAX\n0x03001A \nremoteCave[16-18]\n0x83 0xC4 0x08\nADD ESP, 0x08\n0x03001D \nremoteCave[19]\n0xC3\nRETN\nThe Address column shows where each piece of the cave is located \nin memory; the Code representation column tells you which indexes of \nremoteString and remoteCave correspond to the bytes in the Raw data col-\numn; and the Data meaning column shows what the bytes represent, in \nhuman-readable format. You can see the injected! string at 0x030000, the \nvalue of times at 0x03000F, and the address of someFunction at 0x030014.\nUsing Thread Injection to Execute the Code Cave\nWith a complete code cave written to memory, the only thing left to \ndo is execute it. 
In this example, you could execute the cave using the \nfollowing code:\nHANDLE thread = CreateRemoteThread(process, NULL, NULL,\n (LPTHREAD_START_ROUTINE)remoteCave,\n remoteString, NULL, NULL);\n138 Chapter 7\nWaitForSingleObject(thread, INFINITE);\nCloseHandle(thread);\nVirtualFreeEx(process, remoteString, fulllen, MEM_RELEASE);\nThe calls to CreateRemoteThread(), WaitForSingleObject(), and CloseHandle() \nwork to inject and execute the code cave, and VirtualFreeEx() covers the \nbot’s tracks by freeing the memory allocated in code like Listing 7-4. In \nthe simplest form, that’s all there is to executing a code cave injected into \na game. In practice, you should also check return values after calling \nVirtualAllocEx(), WriteProcessMemory(), and CreateRemoteThread() to make sure \nthat everything was successful.\nFor instance, if VirtualAllocEx() returns 0x00000000, it means that the \nmemory allocation failed. If you don’t handle the failure, WriteProcessMemory() \nwill also fail and CreateRemoteThread() will begin executing with an entry point \nof 0x00000000, ultimately crashing the game. The same is true for the return \nvalues of WriteProcessMemory() and CreateRemoteThread(). Typically, these func-\ntions will only fail when the process handle is opened without the required \naccess flags.\nHijacking a Game’s Main Thread to Execute Code Caves\nIn some cases, injected code caves need to be in sync with the main thread \nof the game process. Solving this problem can be very tricky because it \nmeans that you must control the existing threads in an external process.\nYou could simply suspend the main thread until the code cave finishes \nexecuting, which might work, but that would prove very slow. The overhead \nrequired to wait for a code cave and then resume a thread is pretty heavy. 
\nA faster alternative is to force the thread to execute the code for you, a pro-\ncess called thread hijacking.\nN o t e \t\nOpen the main-codeInjection.cpp file in this book’s source code files to follow \nalong with building this thread-hijacking example, which is a simplified version of \ninjectCodeUsingThreadHijacking(). \nBuilding the Assembly Code Cave\nAs with thread injection, the first step to thread hijacking is knowing what \nyou want to happen in your code cave. This time, however, you don’t know \nwhat the thread will be executing when you hijack it, so you’ll need to make \nsure to save the thread’s state when the code cave starts and restore the \nstate when you’re done hijacking it. This means your shellcode needs to be \nwrapped in some assembly, as in Listing 7-5.\nPUSHAD // push general registers to the stack\nPUSHFD // push EFLAGS to the stack\nCode Injection 139\n// shellcode should be here\nPOPFD // pop EFLAGS from the stack\nPOPAD // pop general registers to the stack\n// resume the thread without using registers here\nListing 7-5: A framework for the thread-hijacking code cave\nIf you were to call the same someFunction that you did with thread injec-\ntion, you could use shellcode similar to that in Listing 7-2. The only dif-\nference is that you couldn’t pass the second parameter to your bot using \nthe stack because you wouldn’t be using CreateRemoteThread(). But that’s no \nproblem; you could just push it the same way you’d push the first parameter. \nThe part of the code cave that executes the function you want to call would \nneed to look like Listing 7-6.\nPUSH string\nPUSH times\nMOV EAX, someFunction\nCALL EAX\nADD ESP, 0x8\nListing 7-6: Assembly skeleton for calling someFunction\nAll that’s changed here from Listing 7-1 is that this example pushes \nstring explicitly and there’s no RETN. You don’t call RETN in this case because \nyou want the game thread to go back to whatever it was doing before you \nhijacked it. 
\nTo resume the execution of the thread normally, the cave needs to \njump back to the thread’s original EIP without using registers. Fortunately, \nyou can use the GetThreadContext() function to fetch EIP, filling the shellcode \nskeleton in C++. Then you can push it to the stack inside your code cave \nand do a return. Listing 7-7 shows how your code cave would need to end.\nPUSH originalEIP\nRETN\nListing 7-7: Jumping to EIP indirectly\nA return jumps to the value on the top of the stack, so doing this imme-\ndiately after pushing EIP will do the trick. You should use this method \ninstead of a jump, because jumps require offset calculation and make the \nshellcode a bit more complex to generate. If you tie Listings 7-5 through 7-7 \ntogether, you come up with the following code cave:\n//save state\nPUSHAD // push general registers to the stack\nPUSHFD // push EFLAGS to the stack\n140 Chapter 7\n// do work with shellcode\nPUSH string \nPUSH times\nMOV EAX, someFunction\nCALL EAX\nADD ESP, 0x8\n// restore state\nPOPFD // pop EFLAGS from the stack\nPOPAD // pop general registers to the stack\n// un-hijack: resume the thread without using registers\nPUSH originalEIP\nRETN\nNext, follow the instructions in “Translating the Assembly to Shellcode” \non page 135 and plug those bytes into an array representing your code cave. 
\nGenerating Skeleton Shellcode and Allocating Memory\nUsing the same method shown in Listing 7-2, you could generate the shell-\ncode for this cave, as shown in Listing 7-8.\nBYTE codeCave[31] = {\n 0x60, // PUSHAD\n 0x9C, // PUSHFD\n 0x68, 0x00, 0x00, 0x00, 0x00, // PUSH 0\n 0x68, 0x00, 0x00, 0x00, 0x00, // PUSH 0\n 0xB8, 0x00, 0x00, 0x00, 0x00, // MOV EAX, 0x0\n 0xFF, 0xD0, // CALL EAX\n 0x83, 0xC4, 0x08, // ADD ESP, 0x08\n 0x9D, // POPFD\n 0x61, // POPAD\n 0x68, 0x00, 0x00, 0x00, 0x00, // PUSH 0\n 0xC3 // RETN\n};\n// we'll need to add some code here to place \n// the thread's EIP into threadContext.Eip\nmemcpy(&codeCave[3], &remoteString, 4);\nmemcpy(&codeCave[8], ×, 4);\nmemcpy(&codeCave[13], &func, 4);\nmemcpy(&codeCave[25], &threadContext.Eip, 4);\nListing 7-8: Creating the thread-hijacking shellcode array\nAs in Listing 7-3, memcpy() is used to put the variables into the skeleton. \nUnlike in that listing, though, there are two variables that cannot be copied \nright away; times and func are known immediately, but remoteString is a result \nof allocation and threadContext.Eip will be known only once the thread is \nfrozen. It also makes sense to allocate memory before freezing the thread, \nCode Injection 141\nbecause you don’t want the thread to be frozen any longer than it has to be. \nHere’s how this might look:\nint stringlen = strlen(string) + 1;\nint cavelen = sizeof(codeCave);\nint fulllen = stringlen + cavelen;\nauto remoteString =\n VirtualAllocEx(process, 0, fulllen, MEM_COMMIT, PAGE_EXECUTE);\nauto remoteCave =\n (LPVOID)((DWORD)remoteString + stringlen);\nThe allocation code is the same as it was for thread injection, so you \ncan reuse the same snippet.\nFinding and Freezing the Main Thread\nThe code to freeze the main thread is a bit trickier. First, you get the thread’s \nunique identifier. This works much like getting a PID, and you can do it \nusing CreateToolhelp32Snapshot(), Thread32First(), and Thread32Next() from \nTlHelp32.h. 
As discussed in “Obtaining the Game’s Process Identifier” on \npage 120, these functions are used to essentially iterate over a list. A pro-\ncess can have many threads, but the following example assumes that the \nfirst thread the game process created is the one that needs to be hijacked:\nDWORD GetProcessThreadID(HANDLE Process) {\n THREADENTRY32 entry;\n entry.dwSize = sizeof(THREADENTRY32);\n HANDLE snapshot = CreateToolhelp32Snapshot(TH32CS_SNAPTHREAD, 0);\n if (Thread32First(snapshot, &entry) == TRUE) {\n DWORD PID = GetProcessId(Process);\n while (Thread32Next(snapshot, &entry) == TRUE) {\n if (entry.th32OwnerProcessID == PID) {\n CloseHandle(snapshot);\n return entry.th32ThreadID;\n }\n }\n }\n CloseHandle(snapshot);\n return NULL;\n}\nThis code simply iterates over the list of all threads on the system and \nfinds the first one that matches the game’s PID. Then it gets the thread \nidentifier from the snapshot entry. Once you know the thread identifier, \nfetch the thread’s current register state like this:\nHANDLE thread = OpenThread(\n (THREAD_GET_CONTEXT | THREAD_SUSPEND_RESUME | THREAD_SET_CONTEXT),\n false, threadID);\nSuspendThread(thread);\n142 Chapter 7\nCONTEXT threadContext;\nthreadContext.ContextFlags = CONTEXT_CONTROL;\nGetThreadContext(thread, &threadContext);\nThis code uses OpenThread() to get a thread handle. It then suspends \nthe thread using SuspendThread() and obtains the values of its registers using \nGetThreadContext(). After this, the memcpy() code in Listing 7-8 should have all \nof the variables it needs to finish generating the shellcode. 
\nWith the shellcode generated, the code cave can be written to the allo-\ncated memory in the same fashion as in Listing 7-4:\nWriteProcessMemory(process, remoteString, string, stringlen, NULL);\nWriteProcessMemory(process, remoteCave, codeCave, cavelen, NULL);\nOnce the cave is ready and waiting in memory, all you need to do is set \nthe thread’s EIP to the address of the code cave and let the thread resume \nexecution, as follows:\nthreadContext.Eip = (DWORD)remoteCave;\nthreadContext.ContextFlags = CONTEXT_CONTROL;\nSetThreadContext(thread, &threadContext);\nResumeThread(thread);\nThis code causes the thread to resume execution at the address of the \ncode cave. Because of the way the code cave is written, the thread has no \nclue that anything has changed. The cave stores the thread’s original state, \nexecutes the payload, restores the thread’s original state, and then returns \nto the original code with everything intact.\nWhen you’re using any form of code injection, it is also important to \nunderstand what data your code caves touch. For example, if you were to \ncreate a code cave that calls a game’s internal functions to create and send \na network packet, you’d need to make sure that any global variables that \nthe functions touch (like a packet buffer, packet position marker, and so \non) are safely restored once you’re done. You never know what the game is \ndoing when your code cave is executed—it could be calling the same func-\ntion as you!\nInjecting DLLs for Full Control\nCode caves are very powerful (you can make a game do anything using \nassembly shellcode), but handcrafting shellcode isn’t practical. It would be \nmuch more convenient to inject C++ code, wouldn’t it? 
That’s possible, but \nthe process is far more complex: the code must be compiled to assembly, \npackaged in a position-agnostic format, made aware of any external depen-\ndencies, entirely mapped into memory, and then executed on some entry \npoint.\nCode Injection 143\nLuckily, all of these things are already taken care of in Windows. By \nchanging a C++ project to compile as a dynamic library, you can create a self-\ncontained, position-agnostic binary called a dynamic link library (DLL). Then \nyou can use a mix of thread injection or hijacking and the LoadLibrary() API \nfunction to map your DLL file into a game’s memory. \nOpen main-codeInjection.cpp in the GameHackingExamples/Chapter7_\nCodeInjection directory and dllmain.cpp under GameHackingExamples/\nChapter7_CodeInjection_DLL to follow along with some example code as \nyou read this section. In main-codeInjection.cpp, look at the LoadDLL() function \nspecifically.\nTricking a Process into Loading Your DLL\nUsing a code cave, you can trick a remote process into invoking LoadLibrary() \non a DLL, effectively loading foreign code into its memory space. 
Because \nLoadLibrary() takes only a single parameter, you could create a code cave to \ncall it as follows:\n// write the dll name to memory\nwchar_t* dllName = \"c:\\\\something.dll\";\nint namelen = wcslen(dllName) + 1;\nLPVOID remoteString =\n VirtualAllocEx(process, NULL, namelen * 2, MEM_COMMIT, PAGE_EXECUTE);\nWriteProcessMemory(process, remoteString, dllName, namelen * 2, NULL);\n// get the address of LoadLibraryW()\nHMODULE k32 = GetModuleHandleA(\"kernel32.dll\");\nLPVOID funcAdr = GetProcAddress(k32, \"LoadLibraryW\");\n// create a thread to call LoadLibraryW(dllName)\nHANDLE thread =\n CreateRemoteThread(process, NULL, NULL,\n (LPTHREAD_START_ROUTINE)funcAdr,\n remoteString, NULL, NULL);\n// let the thread finish and clean up\nWaitForSingleObject(thread, INFINITE);\nCloseHandle(thread);\nThis code is somewhat a mix of the thread injection code from \n“Bypassing ASLR in Production” on page 128 and the code cave created \nto call someFunction in Listings 7-2 and 7-3. Like the former, this example \nuses the body of a single-parameter API function, namely LoadLibrary, as the \nbody of the code cave. Like the latter, though, it has to inject a string into \nmemory, since LoadLibrary expects a string pointer as its first argument. Once \nthe thread is injected, it forces LoadLibrary to load the DLL whose name was \ninjected into memory, effectively putting foreign code into a game.\n144 Chapter 7\nN o t e \t\nGive any DLL you plan to inject a unique name, like MySuperBotV2Hook.dll. \nSimpler names, such as Hook.dll or Injected.dll, are dangerously generic. If the \nname conflicts with a DLL that is already loaded, LoadLibrary() will assume that it \nis the same DLL and not load it!\nOnce the LoadLibrary() code cave loads your DLL into a game, the DLL’s \nentry point—known as DllMain()—will be executed with DLL_PROCESS_ATTACH as \nthe reason. 
When the process is killed or FreeLibrary() is called on the DLL, \nits entry point will be called with the DLL_PROCESS_DETACH reason. Handling \nthese events from the entry point might look like this:\nBOOL APIENTRY DllMain(HMODULE hModule,\n DWORD ul_reason_for_call,\n LPVOID lpReserved) {\n switch (ul_reason_for_call) {\n case DLL_PROCESS_ATTACH:\n printf(\"DLL attached!\\n\");\n break;\n case DLL_PROCESS_DETACH:\n printf(\"DLL detached!\\n\");\n break;\n }\n return TRUE;\n}\nThis example function starts by checking why DllMain() was called. It \nthen outputs text indicating whether it was called because the DLL was \nattached or detached, returning TRUE either way.\nKeep in mind that the entry point of a DLL is executed inside a loader \nlock, which is a global synchronization lock used by all functions that read \nor modify the list of modules loaded in a process. This loader lock gets used \nby functions like GetModuleHandle(), GetModuleFileName(), Module32First(), and \nModule32Next(), which means that running nontrivial code from a DLL entry \npoint can lead to deadlocks and should be avoided. \nIf you need to run code from a DLL entry point, do so from a new \nthread, as follows:\nDWORD WINAPI runBot(LPVOID lpParam) {\n // run your bot\n return 1;\n}\n// do this from DllMain() for case DLL_PROCESS_ATTACH\nauto thread = CreateThread(NULL, 0, &runBot, NULL, 0, NULL); \nCloseHandle(thread);\nFrom DllMain(), this code creates a new thread starting on the function \nrunBot(). It then immediately closes its handle to the thread, as doing any \nfurther operations from DllMain() can lead to serious problems. From inside \nCode Injection 145\nthis runBot(), you can begin executing your bot’s code. The code runs inside \nthe game, meaning you can directly manipulate memory using the type-\ncasting methods. You can also do a lot more, as you’ll see in Chapter 8.\nWhen injecting DLLs, make sure you have no dependency issues. 
If \nyour DLL relies on some nonstandard DLLs, for example, you have to \neither inject those DLLs into the game first or put them in a folder that \nLoadLibrary() will search, such as any folder in the PATH environment vari-\nable. The former will work only if the DLLs have no dependencies of their \nown, whereas the latter is a bit tricky to implement and subject to name col-\nlisions. The best option is to link all external libraries statically so that they \nare compiled directly into your DLL.\nAccessing Memory in an Injected DLL\nWhen you’re trying to access a game’s memory from an injected DLL, pro-\ncess handles and API functions are a hindrance. Because a game shares \nthe same memory space as all code injected into it, you can access a game’s \nmemory directly from injected code. For example, to access a DWORD value \nfrom injected code, you could write the following:\nDWORD value = *((DWORD*)adr); // read a DWORD from adr\n*((DWORD*)adr) = 1234; // write 1234 to DWORD adr\nThis simply typecasts the memory address adr to a DWORD* and \ndereferences that pointer to a DWORD. Doing typecasts in place like that \nis fine, but your memory access code will look cleaner if the functions \nare abstracted and made generic, just like the Windows API wrappers. \nThe generic functions for accessing memory from inside injected code \nlook something like this:\ntemplate<typename T>\nT readMemory(LPVOID adr) {\n return *((T*)adr);\n}\ntemplate<typename T>\nvoid writeMemory(LPVOID adr, T val) {\n *((T*)adr) = val;\n}\nUsing these templates is just like using the functions under “Writing \nTemplated Memory Access Functions” on page 123. Here’s an example:\nDWORD value = readMemory<DWORD>(adr); // read\nwriteMemory(adr, value++); // increment and write\nThese calls are nearly identical to the calls in Listing 6-6 on page 124; \nthey just don’t need to take the process handle as an argument because \n146 Chapter 7\nthey’re being called from inside the process itself. 
You can make this \nmethod even more flexible by creating a third templated function called \npointMemory(), as follows:\ntemplate<typename T>\nT* pointMemory(LPVOID adr) {\n return ((T*)adr);\n}\nThis function skips the dereferencing step of a memory read and simply \ngives you the pointer to the data. From there, you’re free to both read from \nand write to the memory by dereferencing that pointer yourself, like this:\nDWORD* pValue = pointMemory<DWORD>(adr); // point\nDWORD value = *pValue; // 'read'\n(*pValue)++; // increment and 'write'\nWith a function like pointMemory() in place, you could eliminate the calls \nto readMemory() and writeMemory(). You’d still need to find adr ahead of time, \nbut from there, the code to read a value, change it, and write it back would \nbe much simpler to follow.\nBypassing ASLR in an Injected DLL\nSimilarly, since the code is injected, there’s no need to inject a thread into \nthe game to get the base address. Instead, you can just call GetModuleHandle() \ndirectly, like so:\nDWORD newBase = (DWORD)GetModuleHandle(NULL);\nA faster way to get the base address is to utilize the game’s FS memory \nsegment, which is another superpower you get from injected code. This \nmemory segment points to a structure called the thread environment block \n(TEB), and 0x30 bytes into the TEB is a pointer to the process environment \nblock (PEB) structure. These structures are used by the operating system \nand contain a ton of data about the current thread and the current process, \nbut we’re interested only in the base address of the main module, which is \nstored 0x8 bytes into the PEB. Using inline assembly, you can traverse these \nstructures to get newBase, like this:\nDWORD newBase;\n__asm {\n MOV EAX, DWORD PTR FS:[0x30]\n MOV EAX, DWORD PTR DS:[EAX+0x8]\n MOV newBase, EAX\n}\nThe first command stores the PEB address in EAX, and the second com-\nmand reads the main module’s base address and stores it in EAX. 
The final \ncommand then copies EAX to newBase.\nCode Injection 147\nClosing Thoughts\nIn Chapter 6, I showed you how to read from memory remotely and how \nan injected DLL can directly access a game’s memory using pointers. This \nchapter demonstrated how to inject all types of code, from pure assembly \nbyte code to entire C++ binaries. In the next chapter, you’ll learn just \nhow much power being in a game’s memory space actually gives you. If you \nthought assembly code injection was cool, you’ll love what you can do when \nyou mix injected C++ with control flow manipulation.\nThe example code for this chapter contains proofs of concept for every-\nthing we’ve discussed. If you’re still unclear about any of the topics, you can \npoke at the code to learn exactly what’s going on and see all of the tricks in \naction.\n8\nManipulating Control Flow \nin a Game\nForcing a game to execute foreign code is \ndefinitely powerful, but what if you could \nalter the way a game executes its own code? \nWhat if you could force the game to bypass \nthe code that draws the fog of war, trick it into mak-\ning enemies visible through walls, or manipulate the \narguments it passes to functions? Control flow manipu-\nlation lets you do exactly that, allowing you to change \nwhat a process does by intercepting code execution \nand monitoring, modifying, or preventing it.\nThere are many ways to manipulate the control flow of a process, but \nalmost all require you to modify the process’s assembly code. Depending on \nyour goals, you’ll need to either completely remove code from the process \n(called NOPing) or force the process to redirect execution to injected func-\ntions (called hooking). In the beginning of this chapter, you’ll learn about \n150 Chapter 8\nNOPing, several types of hooking, and other control flow manipulation tech-\nniques. 
Once I’ve explained the basics, I’ll show you how I’ve applied these \nprinciples to common game libraries like Adobe AIR and Direct3D.\nOpen the directory GameHackingExamples/Chapter8_ControlFlow in this \nbook’s resource files to see the complete sample code for the next section \nand “Hooking to Redirect Game Execution” on page 153.\nNOPing to Remove Unwanted Code\nChapter 7 described how to inject new code into a game, but the opposite—\nremoving code from a game—can also be useful. Some hacks require you \nto stop some of a game’s original code from being executed, and to do that, \nyou’ll have to get rid of it. One way to eliminate code from a game process \nis NOPing, which involves overwriting the original x86 assembly code with \nNOP instructions.\nWhen to NOP\nConsider a game that won’t show the health bars of cloaked enemies. It’s \npretty hard to see cloaked enemies coming, and you’d have a huge advan-\ntage in combat if you could at least see their health bars. The code to draw \nhealth bars often looks like Listing 8-1.\nfor (int i = 0; i < creatures.size(); i++) {\n auto c = creatures[i];\n if (c.isEnemy && c.isCloaked) continue;\n drawHealthBar(c.healthBar);\n}\nListing 8-1: The loop from the drawCreatureHealthBarExample() function\nWhen drawing health bars, a game with cloaked creatures might \nuse a for loop to check whether the creatures within the screen’s bounds \nare cloaked. If an enemy isn’t cloaked, the loop calls some function \n(drawHealthBar() in this example) to display the enemy’s health bar.\nGiven the source code, you could force the game to draw even cloaked \nenemies’ health bars by simply removing if (c.isEnemy && c.isCloaked) \ncontinue; from the code. But as a game hacker, you have only the assembly \ncode, not the source code. 
When simplified, the assembly that Listing 8-1 \ntranslates into looks something like this pseudocode:\nstartOfLoop: ; for\n MOV i, 0 ; int i = 0\n JMP condition ; first loop, skip increment\nincrement:\n ADD i, 1 ; i++\ncondition:\n CMP i, creatures.Size() ; i < creatures.size()\n JNB endOfLoop ; exit loop if i >= creatures.size()\nManipulating Control Flow in a Game 151\nbody:\n MOV c, creatures[i] ; auto c = creatures[i]\n TEST c.isEnemy, c.isEnemy ; if c.isEnemy\n JZ drawHealthBar ; draw bar if c.isEnemy == false\n TEST c.isCloaked, c.isCloaked ; && c.isCloaked\n JZ drawHealthBar ; draw bar if c.isCloaked == false\n JMP increment ; continue\ndrawHealthBar:\n CALL drawHealthBar(c.healthBar) ; drawHealthBar(c.healthBar)\n JMP increment ; continue\nendOfLoop:\nTo trick the game into drawing all enemy health bars, regardless of \ncloaking, you’d need to remove the JMP increment command that executes \nwhen c.isEnemy && c.isCloaked evaluates to true. In assembly, though, replac-\ning unwanted code with instructions that do nothing is easier than deleting \ncode. That’s where the NOP command comes in. Since NOP is a single byte \n(0x90), you can overwrite the 2-byte JMP increment command with two NOP \ncommands. When the processor reaches those NOP commands, it rolls over \nthem and falls into drawHealthBar() even when c.isEnemy && c.isCloaked evalu-\nates to true.\nHow to NOP\nThe first step to NOPing a chunk of assembly code is making the memory \nchunk where the code lives writable. It’s possible for the code on the same \nmemory page to be executed while you’re writing the NOP commands, \nthough, so you also want to make sure the memory is still executable. You \ncan accomplish both of these tasks by setting the memory’s protection to \nPAGE_EXECUTE_READWRITE. Once the memory is properly protected, you can \nwrite the NOP commands and be done. 
It technically doesn’t hurt to leave \nthe memory writable, but it’s good practice to also restore the original pro-\ntection when you’re finished.\nProvided you have facilities in place for writing and protecting memory \n(as described in Chapter 6), you can write a function like the one shown \nin Listing 8-2 to write NOP commands to game memory. (Follow along by \nopening the project’s NOPExample.cpp file.)\ntemplate<int SIZE>\nvoid writeNop(DWORD address)\n{\n auto oldProtection =\n protectMemory<BYTE[SIZE]>(address, PAGE_EXECUTE_READWRITE);\n for (int i = 0; i < SIZE; i++)\n writeMemory<BYTE>(address + i, 0x90);\n protectMemory<BYTE[SIZE]>(address, oldProtection);\n}\nListing 8-2: Proper NOPing, complete with memory protection\n152 Chapter 8\nIn this example, the writeNop() function sets the appropriate memory \nprotection, writes a number of NOP commands equal to SIZE, and reapplies \nthe original memory protection level.\nThe writeNop() function takes the number of NOP instructions to \nplace as a template parameter, since the memory functions require a \ncorrectly sized type at compile time. Passing an integer SIZE tells the \nmemory functions to operate on a type of BYTE[SIZE] at compile time. \nTo specify a dynamic size at runtime, simply drop the loop and instead \ncall protectMemory and pass address and address + SIZE as arguments. \nAs long as the size isn’t larger than a page (and really, you shouldn’t be \nNOPing a full page), this will ensure that the memory gets properly pro-\ntected even if it’s on a page boundary.\nCall this function with the address where you want to place your NOPs \nand the number of NOP commands to place:\nwriteNop<2>(0xDEADBEEF);\nKeep in mind that the number of NOP commands should match the \nsize in bytes of the command being removed. This call to writeNop() writes \ntwo NOP commands to the address 0xDEADBEEF.\nPractice NOPing\nIf you haven’t already, open NOPExample.cpp in this chapter’s example code \nnow and play around with it for a bit. 
You’ll find a working implementation of \nthe writeNop() function and an interesting function called getAddressforNOP() \nthat scans the example program’s memory to find where the NOP command \nshould be placed.\nTo see the NOP command in action, run the compiled NOPapplication in \nVisual Studio’s debugger with breakpoints at the start and end of the writeNop() \nfunction. When the first breakpoint is hit, press alt-8 to open the disassembly \nwindow, enter address in the input box, and press enter. This brings you to the \nNOP’s target address, where you’ll see the assembly code fully intact. Press F5 \nto continue execution, which triggers the second breakpoint after allowing the \napplication to place the NOPs. Finally, jump back to address in the disassembly \ntab to see that the code was replaced by NOPs.\nYou can rework this code to do other cool stuff. For example, you might try \nplacing NOPs on the comparisons instead of the jump or even modifying the \njump’s type or destination.\nThese and other alternative approaches may work, but note that they \nintroduce more room for error than overwriting the single JMP with NOP com-\nmands. When modifying foreign code, make as few changes as possible to \nminimize the potential for errors.\nManipulating Control Flow in a Game 153\nHooking to Redirect Game Execution\nSo far, I’ve shown you how to manipulate games by adding code to them, \nhijacking their threads, creating new threads, and even removing existing \ncode from their execution flow. These methods are very powerful on their \nown, but when combined, they form an even more potent method of manip-\nulation called hooking. Hooking allows you to intercept precise branches of \nexecution and redirect them to injected code that you’ve written to dictate \nwhat the game should do next, and it comes in a variety of flavors. 
In this \nsection, I’ll teach you about four of the most powerful hooking methods for \ngame hacking: call hooking, virtual function table hooking, import address \ntable hooking, and jump hooking.\nCall Hooking\nA call hook directly modifies the target of a CALL operation to point to a new \npiece of code. There are a few variations of the CALL operation in x86 assem-\nbly, but call hooks are generally used on only one: the near call, which takes \nan immediate address as an operand.\nWorking with Near Calls in Memory\nIn an assembly program, a near call looks like this:\nCALL 0x0BADF00D\nThis near call is represented by the byte 0xE8, so you might assume it is \nstored in memory like this:\n0xE8 0x0BADF00D\nOr, when split into single bytes and swapped for endianness, like this:\n0xE8 0x0D 0xF0 0xAD 0x0B\nBut the anatomy of a near call in memory is not that simple. Instead \nof storing the callee’s absolute address, a near call stores an offset to the \ncallee relative to the address immediately after the call. Since a near call is \n5 bytes, the address immediately after the call is 5 bytes later in memory. 
\nGiven that, the address stored can be computed as follows:\ncalleeAddress – (callAddress + 5)\nIf CALL 0x0BADF00D lives at 0xDEADBEEF in memory, then the value after \n0xE8 is this:\n0x0BADF00D – (0xDEADBEEF + 5) = 0x2D003119\n154 Chapter 8\nIn memory, then, that CALL instruction looks like this:\n0xE8 0x19 0x31 0x00 0x2D\nTo hook a near call, you first need to change the offset following 0xE8 \n(that is, the little-endian 0x19 0x31 0x00 0x2D) to point to your new callee.\nHooking a Near Call\nFollowing the same memory protection rules shown in Listing 8-2, you hook \na near call like so (follow along by opening CallHookExample.cpp):\nDWORD callHook(DWORD hookAt, DWORD newFunc)\n{\n DWORD newOffset = newFunc - hookAt - 5;\n auto oldProtection = \n protectMemory<DWORD>(hookAt + 1, PAGE_EXECUTE_READWRITE);\n DWORD originalOffset = readMemory<DWORD>(hookAt + 1);\n writeMemory<DWORD>(hookAt + 1, newOffset);\n protectMemory<DWORD>(hookAt + 1, oldProtection);\n return originalOffset + hookAt + 5;\n}\nThis function takes as arguments the address of the CALL to hook \n(hookAt) and the address to redirect execution to (newFunc), and it uses \nthem to calculate the offset required to call the code at the address newFunc \ncontains. After you apply the correct memory protections, the callHook() \nfunction writes the new offset to the memory at hookAt + 1, applies the \nold memory protections, calculates the address of the original call, and \nreturns that value to the caller.\nHere’s how you might actually use a function like this in a game hack:\nDWORD origFunc = callHook(0xDEADBEEF, (DWORD)&someNewFunction);\nThis hooks the near call to 0x0BADF00D at 0xDEADBEEF and redirects \nit to the address of someNewFunction, which is the code your hack will execute. \nAfter this is called, the origFunc value will hold 0x0BADF00D.\nCleaning Up the Stack\nThe new callee must also properly handle the stack, preserve registers, and \npass proper return values. 
At the least, this means your replacement func-\ntion must match the game’s original function in both calling convention \nand argument count.\nLet’s say this is the original full function call, in assembly:\nPUSH 1\nPUSH 456\nManipulating Control Flow in a Game 155\nPUSH 321\nCALL 0x0BADF00D\nADD ESP, 0x0C\nYou can tell the function has the C++ __cdecl convention because the \nstack is being reset by the caller. Additionally, the 0x0C bytes being cleaned \nfrom the stack show that there are three arguments, which you can calcu-\nlate as follows:\n0x0C / sizeof(DWORD) = 3\nOf course, you can also obtain the number of arguments by checking \nhow many things are pushed to the stack: there are three PUSH commands, \none for each argument.\nWriting a Call Hook\nIn any case, the new callee, someNewFunction, must follow the __cdecl conven-\ntion and have three arguments. Here’s an example skeleton for the new \ncallee:\nDWORD __cdecl someNewFunction(DWORD arg1, DWORD arg2, DWORD arg3)\n{\n \n}\nIn Visual Studio, C++ programs use the __cdecl convention by default, \nso technically you could omit it from your function definition; however, I’ve \nfound it’s better to be verbose so you get into the habit of being specific. \nAlso keep in mind that if the caller expects a value to be returned, the \nreturn type of your function should match as well. This example assumes \nthe return type is always a DWORD or smaller. Since return types in this size \nrange will all be passed back on EAX, further examples will also use a \nreturn type of DWORD.\nIn most cases, a hook finishes by calling the original function and \npassing its return value back to the caller. 
Here’s how all of that might fit \ntogether:\ntypedef DWORD (__cdecl _origFunc)(DWORD arg1, DWORD arg2, DWORD arg3);\n_origFunc* originalFunction =\n (_origFunc*)callHook(0xDEADBEEF, (DWORD)&someNewFunction);\nDWORD __cdecl someNewFunction(DWORD arg1, DWORD arg2, DWORD arg3)\n{\n return originalFunction(arg1, arg2, arg3);\n}\nThis example uses typedef to declare a type representing the original \nfunction’s prototype and creates a pointer with this type to the original \n156 Chapter 8\nfunction. Then someNewFunction() uses this pointer to call the original func-\ntion with the original arguments and pass the returned value back to the \ncaller.\nRight now, all someNewFunction() does is return to the original function. \nBut you can do whatever you want from inside the someNewFunction() call \nfrom here. You can modify the parameters being passed to the original \nfunction or intercept and store interesting parameters for later use. If you \nknow the caller isn’t expecting a return value (or if you know how to spoof \nthe return value), you can even forget about the original function and com-\npletely replace, replicate, or improve its functionality inside the new callee. \nOnce you’ve perfected this skill, you can add your own native C or C++ code \nto any part of a game that you wish.\nVF Table Hooking\nUnlike call hooks, virtual function (VF) table hooks don’t modify assembly \ncode. Instead, they modify the function addresses stored in the VF tables \nof classes. (If you need a refresher on VF tables, see “A Class with Virtual \nFunctions” on page 75.) All instances of the same class type share a static \nVF table, so VF table hooks will intercept all calls made to a member func-\ntion, regardless of which class instance the game is calling the function \nfrom. This can be both powerful and tricky.\nWriting a VF Table Hook\nBefore we go any deeper into how to place a VF table hook, we need to \ntalk about those pesky calling conventions again. 
VF tables are used by \nclass instances to call virtual member functions, and all member functions \nwill have the __thiscall convention. The name __thiscall is derived from \nThe Truth About VF Tables\nTo simplify the explanation, I lied a little when I said that VF table hooks could \nintercept all calls made to a function. In reality, the VF table is traversed only \nwhen a virtual function is called in a way that leaves the compiler with some \nplausible type ambiguity. For example, a VF table will be traversed when a \nfunction is called through the inst->function() call format. A VF table won’t be \ntraversed when a virtual function is invoked in such a way that the compiler is \nsure about the type, as in inst.function() or similar calls, since the compiler \nwill know the function’s address. Conversely, calling inst.function() from a \nscope where inst is passed in as a reference would trigger a VF table tra-\nversal. Before you try to deploy VF table hooking, make sure the function calls \nyou want to hook have type ambiguity.\nManipulating Control Flow in a Game 157\nthe this pointer that member functions use to reference the active class \ninstance. Thus, member functions are given this as a pseudoparameter \non ECX.\nIt’s possible to match the prototype of a __thiscall by declaring a class \nthat acts as a container for all __thiscall hook callbacks, but I don’t prefer \nthis method. Instead, I find it easier to control the data using inline assem-\nbly. Let’s explore how you control the data when placing a VF hook on a \nclass that looks like this:\nclass someBaseClass {\n public:\n virtual DWORD someFunction(DWORD arg1) {}\n};\nclass someClass : public someBaseClass {\n public:\n virtual DWORD someFunction(DWORD arg1) {}\n};\nThe someBaseClass class just has one member (a public virtual func-\ntion), and the someClass class inherits from someBaseClass and overrides the \nsomeBaseClass::someFunction member. 
To hook someClass::someFunction, you \nreplicate the prototype in your VF table hook, as shown in Listing 8-3 (fol-\nlow along in the VFHookExample.cpp file of the project).\nDWORD __stdcall someNewVFFunction(DWORD arg1)\n{\n static DWORD _this;\n __asm MOV _this, ECX\n}\nListing 8-3: The start of a VF table hook\nThis function works as a hook because __thiscall only differs from \n__stdcall in that the former is given this on ECX. To reconcile this small \ndifference, the callback function uses inline assembly (denoted by __asm) \nto copy this from ECX to a static variable . Since the static variable is actu-\nally initialized as a global, the only code that executes before MOV _this, ECX \nis the code that sets up the stack frame—and that code never touches \nECX. That ensures that the proper value is in ECX when the assembly is \nexecuted.\nN o t e \t\nIf multiple threads start calling the same VF function, the someNewVFFunction() \nhook will break because _this might be modified by one call while still being used by \nanother call. I’ve never personally run into this problem, as games don’t typically \nthrow around multiple instances of critical classes between threads, but an efficient \nremedy would be to store _this in thread local storage, ensuring each thread would \nhave its own copy.\n158 Chapter 8\nBefore returning, a VF table callback must also restore ECX, to keep \nwith the __thiscall convention. Here’s how that process looks:\nDWORD __stdcall someNewVFFunction(DWORD arg1)\n{\n static DWORD _this;\n __asm MOV _this, ECX\n // do game modifying stuff here\n __asm MOV ECX, _this\n}\nAfter executing some game-hacking code, this version of the function \nsomeNewVFFunction() restores ECX with a reversed version of the first MOV \ninstruction from Listing 8-3.\nUnlike with __cdecl functions, however, you shouldn’t call functions that \nuse the __thiscall convention from pure C++ using only a function pointer \nand typedef (as you would for a call hook). 
When calling the original func-\ntion from a VF table hook, you must use inline assembly—that’s the only \nway to be sure you’re passing data (specifically _this) around properly. For \nexample, this is how you continue to build the someNewVFFunction() hook:\nDWORD __stdcall someNewVFFunction(DWORD arg1)\n{\n static DWORD _this, _ret;\n __asm MOV _this, ECX\n // do pre-call stuff here\n __asm {\n PUSH arg1\n MOV ECX, _this\n CALL [originalVFFunction]\n MOV _ret, EAX\n }\n // do post-call stuff here\n __asm MOV ECX, _this\n return _ret;\n}\nNow, someNewVFFunction() stores this in the _this variable, allows some \ncode to execute, calls the original game function that’s being hooked, \nstores that function’s return value in _ret , allows some more code to \nexecute, restores this to ECX , and returns the value stored in _ret. The \ncallee cleans the stack for __thiscall calls, so unlike a call hook, the pushed \nargument doesn’t need to be removed.\nManipulating Control Flow in a Game 159\nN o t e \t\nIf you want to remove a single pushed argument at any point, use the assembly \ninstruction ADD ESP, 0x4 because a single argument is 4 bytes.\nUsing a VF Table Hook\nWith the calling convention established and a skeleton callback in place, it’s \ntime to move on to the fun part: actually using a VF table hook. A pointer \nto a class’s VF table is the first member of every class instance, so placing a \nVF table hook requires only a class instance address and the index of the \nfunction to be hooked. Using these two pieces of information, you need \nonly a modest amount of code to place a hook. 
Here’s an example:\nDWORD hookVF(DWORD classInst, DWORD funcIndex, DWORD newFunc)\n{\n DWORD VFTable = readMemory<DWORD>(classInst);\n DWORD hookAt = VFTable + funcIndex * sizeof(DWORD);\n auto oldProtection =\n protectMemory<DWORD>(hookAt, PAGE_READWRITE);\n DWORD originalFunc = readMemory<DWORD>(hookAt);\n writeMemory<DWORD>(hookAt, newFunc);\n protectMemory<DWORD>(hookAt, oldProtection);\n return originalFunc;\n}\nThe hookVF() function finds the VF table by reading the first member \nof the class instance and storing it in VFTable. Since the VF table is just \nan array of DWORD-sized addresses, this code finds the function address by \nmultiplying the function’s index in the VF table (funcIndex in this example) \nby the size of a DWORD, which is 4, and adding the result to the VF table’s \naddress. From there, hookVF() acts similar to a call hook: it makes sure the \nmemory is properly accessible by setting appropriate protections, stores the \noriginal function address for later, writes the new function address, and \nfinally, restores the original memory protection.\nYou’ll typically hook the VF table of a class instantiated by the game, \nand calling a function like hookVF() for a VF table hook looks like this:\nDWORD origVFFunction =\n hookVF(classInstAddr, 0, (DWORD)&someNewVFFunction);\nAs usual, you need to find classInstAddr and the funcIndex argument \nahead of time.\nThere are some very niche cases in which VF table hooks are useful, and \nit can be really hard to find the right class pointers and functions. Given that, \ninstead of showing contrived use cases, I’ll come back to VF table hooks in \n“Applying Jump Hooks and VF Hooks to Direct3D” on page 175, once I’ve \ndiscussed other types of hooking.\n160 Chapter 8\nIf you want to play with VF hooks before reading more, add new vir-\ntual functions to the example classes in this book’s resource files and prac-\ntice hooking them. 
You might even create a second class that derives from \nsomeBaseClass and place a hook on its virtual table to demonstrate how you \ncan have two completely separate VF hooks on two classes that inherit the \nsame base class.\nIAT Hooking\nIAT hooks actually replace function addresses in a specific type of VF table, \ncalled the import address table (IAT). Each loaded module in a process con-\ntains an IAT in its PE header. A module’s IAT holds a list of all the other \nmodules on which the module depends, as well as a list of functions that the \nmodule uses from each dependency. Think of an IAT as a lookup table for \nAPIs to call one another.\nWhen a module is loaded, its dependencies are also loaded. Dependency \nloading is a recursive process that continues until all dependencies for \nall modules are loaded. As each dependency is loaded, the operating sys-\ntem finds all functions used by the dependent module and fills any blank \nspaces in its IAT with the function addresses. Then, when a module calls a \nfunction from a dependency, it makes that call by resolving the function’s \naddress from the IAT.\nPaying for Portability\nFunction addresses are always resolved from the IAT in real time, so hook-\ning the IAT is similar to hooking VF tables. Since function pointers are \nstored in the IAT beside their actual names, there’s no need to do any \nreverse engineering or memory scanning; as long as you know the name \nof the API you want to hook, you can hook it! Moreover, IAT hooking lets \nyou easily hook Windows API calls on a module-specific basis, allowing your \nhooks to intercept only API calls from a game’s main module.\nThis portability has a cost, though; the code to place an IAT hook is \nmuch more complex than what you’ve seen so far. First, you need to locate \nthe PE header of the game’s main module. 
Since the PE header is the first \nstructure in any binary, you can find it at the base address of each module, \nas shown in Listing 8-4 (follow along in the IATHookExample.cpp file of the \nproject).\nDWORD baseAddr = (DWORD)GetModuleHandle(NULL);\nListing 8-4: Fetching the module’s base address\nOnce you’ve found the base address, you must verify that the PE header \nis valid. This validation can be very important, as some games try to prevent \nthese types of hooks by scrambling nonessential parts of their PE header after \nthey load. A valid PE header is prefixed by a DOS header, which indicates \nManipulating Control Flow in a Game 161\nthe file is a DOS MZ executable; the DOS header is identified by the magic \nvalue 0x5A4D. A member of the DOS header called e_lfanew then points to \nthe optional header, which contains values like the size of the code, a version \nnumber, and so on and is identified by the magic value 0x10B.\nThe Windows API has PE structures called IMAGE_DOS_HEADER and \nIMAGE_OPTIONAL_HEADER that correspond to the DOS header and optional \nheader, respectively. You can use them to validate the PE header with \ncode like Listing 8-5.\nauto dosHeader = pointMemory<IMAGE_DOS_HEADER>(baseAddr);\nif (dosHeader->e_magic != 0x5A4D)\n return 0;\nauto optHeader =\n pointMemory<IMAGE_OPTIONAL_HEADER>(baseAddr + dosHeader->e_lfanew + 24);\nif (optHeader->Magic != 0x10B)\n return 0;\nListing 8-5: Confirming the DOS and optional headers are valid\nThe calls to pointMemory() create pointers to the two headers that need \nto be checked. If either if() statement returns 0, then the corresponding \nheader has the wrong magic number, meaning the PE header isn’t valid.\nReferences to the IAT from assembly are hardcoded, meaning assem-\nbly references don’t traverse the PE header to locate the IAT. Instead, each \nfunction call has a static location indicating where to find the function \naddress. 
That means overwriting the PE header to say that there are no \nimports is a viable way to protect against IAT hooks, and some games have \nthis protection.\nTo account for that, you also need to make sure the game’s IAT still \nexists. Listing 8-6 shows how to add such a check to the code in Listing 8-5.\nauto IAT = optHeader->DataDirectory[IMAGE_DIRECTORY_ENTRY_IMPORT];\nif (IAT.Size == 0 || IAT.VirtualAddress == 0)\n return 0;\nListing 8-6: Checking that the IAT actually exists\nThe PE header contains many sections that store information about \nthe application’s code, embedded resources, relocations, and so on. The \npiece of code in Listing 8-6 is particularly interested in the data section, \nwhich—as you might guess—stores many different types of data. Each \ntype of data is stored in its own directory, and the DataDirectory member of \nIMAGE_OPTIONAL_HEADER is an array of directory headers that describes the size \nand virtual address of each directory in the data section. The Windows API \ndefines a constant called IMAGE_DIRECTORY_ENTRY_IMPORT, which happens to be \nthe index of the IAT header within the DataDirectory array.\n162 Chapter 8\nThus, this code uses optHeader->DataDirectory[IMAGE_DIRECTORY_ENTRY_IMPORT] \nto resolve the header of the IAT and check that the header’s Size and \nVirtualAddress are nonzero, essentially confirming its existence.\nTraversing the IAT\nOnce you know the IAT is still intact, you can start traversing it, and this \nis where IAT hooking starts to get ugly. The IAT is an array of structures \ncalled import descriptors. There is one import descriptor for each depen-\ndency, each import descriptor points to an array of structures called thunks, \nand each thunk represents a function imported from the dependency.\nLuckily, the Windows API exposes both the import descriptors and \nthunks through the IMAGE_IMPORT_DESCRIPTOR and IMAGE_THUNK_DATA structures, \nrespectively. 
Having the structures predefined saves you from creating \nyour own, but it doesn’t make the code to traverse the IAT any prettier. \nTo see what I mean, look at Listing 8-7, which builds on Listings 8-4 \nthrough 8-6.\nauto impDesc =\n pointMemory<IMAGE_IMPORT_DESCRIPTOR>(baseAddr + IAT.VirtualAddress);\n while (impDesc->FirstThunk) {\n auto thunkData =\n pointMemory<IMAGE_THUNK_DATA>(baseAddr + impDesc->OriginalFirstThunk);\n int n = 0;\n while (thunkData->u1.Function) {\n // the hook happens in here\n n++;\n thunkData++;\n }\n impDesc++;\n}\nListing 8-7: Iterating over the IAT to find a function\nKeeping in mind that the import descriptors are stored relative to the \nstart of the PE header, this code adds the module’s base address to the \nvirtual address found in the IAT’s directory header, creating a pointer, \nimpDesc, that points to the module’s first import descriptor.\nImport descriptors are stored in a sequential array, and a descriptor \nwith a FirstThunk member set to NULL signifies the end of the array. Knowing \nthis, the code uses a while loop that continues until impDesc->FirstThunk is \nNULL, incrementing the descriptor by executing impDesc++ each iteration.\nFor each import descriptor, the code creates a pointer called thunkData \nthat points to the first thunk inside the descriptor. Using a familiar loop, \nthe code iterates over thunks until one is found with a Function member \nset to NULL. The loop also uses an integer, n, to keep track of the current \nthunk index, as the index is important when placing the hook.\nManipulating Control Flow in a Game 163\nPlacing the IAT Hook\nFrom here, placing the hook is just a matter of finding the proper function \nname and replacing the function address. 
You can find the name inside the \nnested while loop, as shown in Listing 8-8.\nchar* importFuncName =\n pointMemory<char>(baseAddr + (DWORD)thunkData->u1.AddressOfData + 2);\nListing 8-8: Finding the function name\nThe function name for each thunk is stored at thunkData->u1.AddressOfData \n+ 2 bytes into the module, so you can add that value to the module’s base \naddress to locate the function name in memory.\nAfter obtaining a pointer to the function name, use strcmp() to check \nwhether it’s the target function, like so:\nif (strcmp(importFuncName, funcName) == 0) {\n // the final step happens in here\n}\nOnce you’ve located the target function using its name, you simply need \nto overwrite the function address with the address of your own function. \nUnlike function names, function addresses are stored in an array at the \nstart of each import descriptor. Using n from the thunk loop, you can finally \nset the hook, as shown in Listing 8-9.\nauto vfTable = pointMemory<DWORD>(baseAddr + impDesc->FirstThunk);\nDWORD original = vfTable[n];\n auto oldProtection = protectMemory<DWORD>((DWORD)&vfTable[n], PAGE_READWRITE);\n vfTable[n] = newFunc;\nprotectMemory<DWORD>((DWORD)&vfTable[n], oldProtection);\nListing 8-9: Finding the function address\nThis code locates the VF table for the current descriptor by adding the \naddress of the first thunk to the module base address. 
The VF table is an \narray of function addresses, so the code uses the n variable as an index to \nlocate the target function address.\nOnce the address is found, the code in Listing 8-9 works just like a typi-\ncal VF hook: it stores the original function address, sets the protection of \nindex n in the VF table to PAGE_READWRITE , inserts the new function address \ninto the VF table , and finally restores the old protection.\nIf you stitch together the code from Listings 8-4 through 8-9, the final \nIAT hooking function looks like Listing 8-10.\nDWORD hookIAT(const char* funcName, DWORD newFunc)\n{\n DWORD baseAddr = (DWORD)GetModuleHandle(NULL);\n164 Chapter 8\n auto dosHeader = pointMemory(baseAddr);\n if (dosHeader->e_magic != 0x5A4D)\n return 0;\n auto optHeader =\n pointMemory(baseAddr + dosHeader->e_lfanew + 24);\n if (optHeader->Magic != 0x10B)\n return 0;\n auto IAT =\n optHeader->DataDirectory[IMAGE_DIRECTORY_ENTRY_IMPORT];\n if (IAT.Size == 0 || IAT.VirtualAddress == 0)\n return 0;\n auto impDesc =\n pointMemory(baseAddr + IAT.VirtualAddress);\n while (impDesc->FirstThunk) {\n auto thunkData =\n pointMemory(baseAddr + impDesc->OriginalFirstThunk);\n int n = 0;\n while (thunkData->u1.Function) {\n char* importFuncName = pointMemory\n (baseAddr + (DWORD)thunkData->u1.AddressOfData + 2);\n if (strcmp(importFuncName, funcName) == 0) {\n auto vfTable = pointMemory(baseAddr + impDesc->FirstThunk);\n DWORD original = vfTable[n];\n auto oldProtection =\n protectMemory((DWORD)&vfTable[n], PAGE_READWRITE);\n vfTable[n] = newFunc;\n protectMemory((DWORD)&vfTable[n], oldProtection);\n return original;\n }\n n++;\n thunkData++;\n }\n impDesc++;\n }\n}\nListing 8-10: The complete IAT hooking function\nThis is the most complex code that we’ve put together so far, and \nit’s pretty hard to read when squished to fit on a page. 
If you haven’t yet \nwrapped your head around what it’s doing, you might want to study the \nexample code from this book’s resource files before continuing.\nUsing an IAT Hook to Sync with a Game Thread\nWith the code in Listing 8-10, hooking any Windows API function is as simple \nas knowing the function name and the proper prototype. The Sleep() API is \na common API to hook when game hacking, as bots can use a Sleep() hook \nto thread-sync with a game’s main loop.\nManipulating Control Flow in a Game 165\nHere’s one way to use hookIAT() to hook the Sleep() API:\nVOID WINAPI newSleepFunction(DWORD ms)\n{\n // do thread-sensitive things\n originalSleep(ms);\n}\ntypedef VOID (WINAPI _origSleep)(DWORD ms);\n_origSleep* originalSleep =\n (_origSleep*)hookIAT(\"Sleep\", (DWORD)&newSleepFunction);\nHere’s why this works. At the end of a game’s main loop, it might call \nSleep() to rest until it’s ready to draw the next frame. Since it’s sleeping, it’s \nsafe for you to do anything you want without worrying about synchroniza-\ntion issues. Some games might not do this, or they might call Sleep() from \nmultiple threads, and those games will require a different method.\nA more portable alternative is to hook the PeekMessageA() API function, \nbecause games often call that function from the main loop while waiting \nfor input. Then, your bot can do thread-sensitive operations from within \nthe PeekMessageA() hook, ensuring that they’re done from the game’s main \nthread. You may also want your bot to use this method to hook the send() \nand recv() API functions, as intercepting these allows you to create a packet \nsniffer relatively simply.\nJump Hooking\nJump hooking allows you to hook code in places where there is no branch-\ning code to manipulate. A jump hook replaces the code being hooked \nwith an unconditional jump to a trampoline function. 
When the jump is hit, \nGetting in Sync with Thread Sync\nYour injected code will inevitably have to sync with a game’s main loop, or \nit won’t work. When you’re reading or writing data larger than 4 bytes, for \nexample, being out of sync allows the game to read or write that data at the \nsame time as you. You’ll be stepping on the game’s toes, and vice versa, lead-\ning to all sorts of race conditions and data corruption issues. Similarly, if you \ntry to call a game’s function from your own thread, you run the risk of crashing \nthe game if the function is not thread safe.\nSince IAT hooks are thread-safe modifications to the PE header, they can \nbe placed from any thread. By placing one on a function that’s called before \nor after the game’s main loop, you can effectively sync with the game’s main \nthread. All you need to do is place the hook and execute any thread-sensitive \ncode from your hook callback.\n166 Chapter 8\nthe trampoline function stores all current register and flag values, calls a \ncallback function of your choice, restores the registers, restores the flags, \nexecutes the code that was replaced by the hook, and finally jumps back to \nthe code just below the hook. This process is shown in Figure 8-1.\nSUB EAX, 1\nADD EBX, EAX\nPUSH EBX\nMOV ECX ESI\nCALL function\nPOP EAX\nMOV AL, 1\nPOP EDI\nPOP ESI\nRETN\nOriginal Code\nPUSHFD\nPUSHAD\nCALL callback\nPOPAD\nPOPFD\nPOP EAX\nMOV AL, 1\nPOP EDI\nPOP ESI\nJMP restore\nTrampoline\nSUB EAX, 1\nADD EBX, EAX\nPUSH EBX\nMOV ECX, ESI\nCALL function\nJMP trampoline\nRETN\nHooked Code\n// do stuff\nRETN\nCallback\nFigure 8-1: A jump hook\nThe original code shows an example of some unmodified assembly you \nmight find in a game, and the hooked code shows how that assembly might \nlook after being hooked by a jump hook. The trampoline box shows an \nexample trampoline function, in assembly, and the callback represents the \ncode you’re trying to execute through hooking. 
In the original code, the \nassembly executes from top to bottom. In the hooked code, to get from the \nSUB EAX,1 instruction to the RETN instruction, execution must follow the path \nshown by the dashed arrows.\nN o t e \t\nIf your callback code is simple, it can be integrated into the trampoline instead. It’s \nalso not always necessary to store and restore the registers and flags, but doing so is \ngood practice.\nPlacing a Jump\nThe byte code of an unconditional jump resembles that of a near call, but \nthe first byte is 0xE9 instead of 0xE8. (See “Working with Near Calls in \nMemory” on page 153 for a refresher.) In Figure 8-1, the unconditional \njump JMP trampoline replaces the following four operations:\nPOP EAX\nMOV AL, 1\nPOP EDI\nPOP ESI\nIn this case, you need to replace multiple sequential operations to \naccommodate the 5-byte size of the unconditional jump. You may come \nacross cases where the size of the operation (or operations) being replaced \nis larger than 5 bytes. When this happens, replace the remaining bytes with \nNOP instructions.\nManipulating Control Flow in a Game 167\nNow, let’s look at how to replace those operations. Listing 8-11 shows \nthe code to place a jump hook.\nDWORD hookWithJump(DWORD hookAt, DWORD newFunc, int size)\n{\n if (size > 12) // shouldn't ever have to replace 12+ bytes\n return 0;\n DWORD newOffset = newFunc - hookAt - 5;\n auto oldProtection =\n protectMemory(hookAt + 1,PAGE_EXECUTE_READWRITE);\n writeMemory(hookAt, 0xE9);\n writeMemory(hookAt + 1, newOffset);\n for (unsigned int i = 5; i < size; i++)\n writeMemory(hookAt + i, 0x90);\n protectMemory(hookAt + 1, oldProtection);\n return hookAt + 5;\n}\nListing 8-11: How to place a jump hook\nThis function takes the address to hook at, the address of the callback \nfunction, and the size of the memory to overwrite (in bytes) as arguments. \nFirst, it calculates the offset between the hook site and the trampoline and \nstores the result in newOffset . 
Next, PAGE_EXECUTE_READWRITE permissions are \napplied to the memory to be changed. The unconditional jump (0xE9) \nand the address of the callback function are then written to memory, \nand a for loop writes NOP instructions (0x90) to any abandoned bytes. \nAfter the old protections are reapplied, hookWithJump() returns to the origi-\nnal address.\nNotice that the hookWithJump() function ensures that size is not above 12 \nbefore placing the jump. This check is important because a jump takes up 5 \nbytes, meaning it can replace up to five commands if the first four are each \na single byte. If the first four commands are each a single byte, the fifth \ncommand would need to be more than 8 bytes to trigger the if (size > 12) \nclause. Because 9-byte operations are very, very rare, 12 is a safe but flexible \nlimit. Having this limit can stop all sorts of bugs from happening, especially \nif your bot is dynamically detecting the size parameter. If the bot messes up \nand passes a size of 500,000,000, for instance, the check will stop you from \nNOPing the whole universe.\nWriting the Trampoline Function\nUsing the function in Listing 8-11, you can replicate the hook shown in \nFigure 8-1, but first you’ll have to create the trampoline function as follows:\nDWORD restoreJumpHook = 0;\nvoid __declspec(naked) myTrampoline()\n{\n168 Chapter 8\n __asm {\n PUSHFD\n PUSHAD\n CALL jumpHookCallback\n POPAD\n POPFD\n POP EAX\n MOV AL, 1\n POP EDI\n POP ESI\n JMP [restoreJumpHook]\n }\n}\nJust like the trampoline described alongside Figure 8-1, this trampoline \nstores all current flag and register values , calls a callback function , \nrestores the registers , restores the flags , executes the code that was \nreplaced by the hook at and , and finally jumps back to the original \ncode just below the jump and NOPs .\nN o t e \t\nTo ensure that the compiler doesn’t autogenerate any extra code within the trampoline, \nalways declare the trampoline using the __declspec(naked) 
convention.\nFinishing the Jump Hook\nOnce you create the trampoline, define the callback and set the hook \nlike so:\nvoid jumpHookCallback() {\n // do stuff\n}\nrestoreJumpHook = hookWithJump(0xDEADBEEF, &myTrampoline, 5);\nFinally, inside the jumpHookCallback() function, execute the code that \nrelies on the hook. If your code needs to read or write the values of the \nregisters as they were when the hook executed, you’re in luck. The PUSHAD \ncommand pushes them to the stack in the order EAX, ECX, EDX, EBX, \noriginal ESP, EBP, ESI, and EDI. The trampoline calls PUSHAD directly before \nthe jumpHookCallback() call, so you can reference the register values as argu-\nments, like this:\nvoid jumpHookCallback(DWORD EDI, DWORD ESI, DWORD EBP, DWORD ESP,\n DWORD EBX, DWORD EDX, DWORD ECX, DWORD EAX) {\n // do stuff\n}\nrestoreJumpHook = hookWithJump(0xDEADBEEF, &myTrampoline, 5);\nSince the trampoline uses POPAD to directly restore the registers from \nthese values on the stack, any modifications you make to the parameters \nwill be applied to the actual registers when they are restored from the \nstack.\nManipulating Control Flow in a Game 169\nLike VF table hooks, jump hooks are rarely needed, and they can \nbe tricky to simulate with a simple example. To help you wrap your head \naround them, I’ll explore a real-world, practical use case in “Applying Jump \nHooks and VF Hooks to Direct3D” on page 175.\nApplying Call Hooks to Adobe AIR\nAdobe AIR is a development framework that can be used to make cross-\nplatform games in an environment similar to Adobe Flash. AIR is a \ncommon framework for online games, as it allows developers to write \ncross-platform code in a versatile, high-level language called ActionScript. \nActionScript is an interpreted language, and AIR runs the code inside a \nvirtual machine, which makes it infeasible to hook game-specific code with \nAIR. 
Instead, it is easier to hook AIR itself.\nThe example code for this section can be found in GameHackingExamples/\nChapter8_AdobeAirHook in this book’s source files. The code comes from an \nold project of mine, and it works on any game running Adobe AIR.dll version \n3.7.0.1530. I’ve gotten it working on other versions as well, but I can’t guar-\nantee it will work with much newer or much older versions, so treat this as a \ncase study.\nAccessing the RTMP Goldmine\nThe Real Time Messaging Protocol (RTMP) is a text-based network protocol \nthat ActionScript uses to serialize and send entire objects over the network. \nRTMP sits on top of the HyperText Transfer Protocol (HTTP), and a secure \nversion, RTMPS, sits on top of HTTP Secure (HTTPS). RTMPS allows game \ndevelopers to easily send and receive entire object instances over a secure \nconnection with little complication, making it the network protocol of \nchoice for any games running on AIR.\nN o t e \t\nData sent over RTMP/RTMPS is serialized through Action Message Format \n(AMF), and parsing AMF packets is beyond the scope of this book. Search online for \n“AMF3 Parser,” and you’ll find a lot of code that does it.\nProfessional API Hooking Libraries\nThere are prewritten hooking libraries, like Microsoft’s Detours and MadCHook, \nthat use only jump hooks. These libraries can automatically detect and follow \nother hooks, they know how many instructions to replace, and they generate \ntrampoline functions for you. The libraries are able to do this because they \nunderstand how to disassemble and walk through assembly instructions to deter-\nmine lengths, jump destinations, and so on. If you need to use hooks with that \nmuch power, it is arguably better to use one of those libraries than to create \nyour own.\n170 Chapter 8\nData sent over RTMP and RTMPS is very rich. The packets contain \ninformation about object types, names, and values. This is a gold mine. 
If \nyou can intercept this data in real time, you can instantaneously respond \nto changes in game state, see a wealth of critical information without ever \nreading it from memory, and find useful pieces of data that you might not \neven know exist.\nA while back, I was working on a tool that required a ton of insight into \nthe state of a game. Obtaining such a large amount of data directly from \nmemory would have been extremely hard, if not impossible. After some \nresearch, I realized that the game was using RTMPS to communicate with \nthe server, and that prompted me to start digging into this gold mine.\nSince RTMPS is encrypted, I knew I had to somehow hook the cryp-\ntographic functions used by AIR before I could get any usable data. After \nsearching online, I found source code for a small tool called airlog, created \nby another game hacker who, like me, was trying to log packets sent over \nRTMPS. Although the tool hooked the exact functions I needed, the code \nwas outdated, messy, and, worst of all, didn’t work on the version of AIR \nI was trying to hook.\nBut that didn’t mean it was useless. Not only did airlog hook the two \nfunctions I needed, but it also located them by scanning for certain byte \npatterns within the Adobe AIR library. These byte patterns were three years \nold, though, so they weren’t working anymore. The newer versions of Adobe \nAIR had changed enough that the assembly bytes were no longer the same. \nThe difference in bytes was a problem for the code in airlog, but not for me.\nInside an inline assembly block, you can specify raw bytes with the fol-\nlowing function call:\n_emit BYTE\nIf you replace BYTE with, say, 0x03, the code will be compiled in a way \nthat treats 0x03 as a byte in the assembly code, regardless of whether that \nmakes sense. Using this trick, I compiled the byte arrays back to assembly \ncode. 
The code didn’t do anything, and it wasn’t meant to; using this trick \nsimply allowed me to attach to my dummy application with OllyDBG and \ninspect bytes, which were conveniently presented as a clean disassembly.\nSince these bytes represented the code surrounding the functions I \nneeded, so, too, did their disassembly. The code was pretty standard and \ndidn’t seem likely to change, so I turned my attention to the constants. The \ncode had a few immediate values passed as offsets in commands. Knowing \nhow commonly these can change, I rewired airlog’s pattern-matching algo-\nrithm to support wildcards, updated the patterns to treat any constants as \nwildcards, and then ran the match. After some tweaks to the patterns and \na bit of digging through duplicate search results, I tracked down the func-\ntions I wanted to hook. I appropriately named them encode() and decode() \nand began working on a tool similar to airlog—but better.\nManipulating Control Flow in a Game 171\nHooking the RTMPS encode() Function\nI discovered that the encode() function, which is used to encrypt the data for \noutgoing packets, is a nonvirtual __thiscall, meaning it’s called by a near \ncall. Moreover, the call happens inside a loop. The entire loop looks like \nListing 8-12, taken directly from the OllyDBG disassembly pane.\nloop:\n MOV EAX, [ESI+3C58]\n SUB EAX,EDI\n PUSH EAX\n LEA EAX, [ESI+EDI+1C58]\n PUSH EAX\n MOV ECX,ESI\n CALL encode\n CMP EAX,-1\n JE SHORT endLoop\n ADD EDI,EAX\n CMP EDI, [ESI+3C58]\n JL loop\nendLoop:\nListing 8-12: The encode() loop\nWith a bit of analysis and some guidance from airlog, I determined \nthat the encode() function called at takes a byte array and buffer length \n(let’s call them buffer and size, respectively) as parameters. The function \nreturns -1 when it fails and returns size otherwise. 
The function operates \non chunks of 4,096 bytes, which is why this happens in a loop.\nTurned into more readable pseudocode, the loop calling encode() \nlooks like this (each statement corresponds to the relevant assembly instructions \nin Listing 8-12):\nfor (EDI = 0; EDI < [ESI+3C58]; ) {\n EAX = encode(&[ESI+EDI+1C58], [ESI+3C58] - EDI);\n if (EAX == -1) break;\n EDI += EAX;\n}\nI wasn’t interested in what encode() did, but I needed the entire buffer \nit was looping over, and hooking encode() was my means of getting that buf-\nfer. Looking at the real loop in Listing 8-12, I knew that the calling object \ninstance’s full buffer was stored at ESI+0x1C58, that the full size was stored \nat ESI+0x3C58, and that EDI contained the loop counter. I devised the \nhook with these things in mind, ultimately creating a two-part hook.\nThe first part of my hook was a reportEncode() function that logs the \nentire buffer on the first loop iteration. Here’s the reportEncode() function \nin full:\nDWORD __stdcall reportEncode(\n const unsigned char* buffer,\n unsigned int size,\n172 Chapter 8\n unsigned int loopCounter)\n{\n if (loopCounter == 0)\n printBuffer(buffer, size);\n return origEncodeFunc;\n}\nThis function takes buffer, size, and loopCounter as parameters and \nreturns the address of the function I dubbed encode(). 
Before fetching that \naddress, however, the second part of my hook, a myEncode() function, does all \nof the dirty work to obtain buffer, size, and loopCounter, as follows:\nvoid __declspec(naked) myEncode()\n{\n __asm {\n MOV EAX, DWORD PTR SS:[ESP + 0x4] // get buffer\n MOV EDX, DWORD PTR DS:[ESI + 0x3C58] // get full size\n PUSH ECX // store ecx\n PUSH EDI // push current pos (loop counter)\n PUSH EDX // push size\n PUSH EAX // push buffer\n CALL reportEncode // report the encode call\n POP ECX // restore ecx\n JMP EAX // jump to encode\n }\n}\nThe myEncode() function is a pure assembly function that replaces the \noriginal encode() function call using a near call hook. After storing ECX on \nthe stack, myEncode() obtains buffer, size, and loopCounter and passes them \nto the reportEncode() function. After calling the reportEncode() function, the \nmyEncode() function restores ECX and jumps directly into encode(), causing \nthe original function to execute and return gracefully to the loop.\nSince myEncode() cleans everything it uses from the stack, the stack still \ncontains the original parameters and return address in the correct spot \nafter myEncode() runs. That’s why myEncode() jumps directly into encode() \ninstead of using a function call: that stack is already set up with the proper \nreturn address and parameters, so the encode() function will think every-\nthing happened as normal.\nHooking the RTMPS decode() Function\nThe function I named decode(), which is used to decrypt incoming data, \nwas also a __thiscall that was called in a loop. It worked on chunks of \n4,096 bytes and took a buffer and size as parameters. The loop was quite \na bit more complex, containing multiple function calls, nested loops, and \nloop escapes, but hooking worked much the same as hooking the so-called \nencode() function. 
The reason for the added complexity is not relevant to \nhooking the function, but it makes the code difficult to summarize, so I won’t \nshow the original function here. The bottom line is this: once all the com-\nplexity was rubbed away, the decode() loop was the encode() loop in reverse.\nManipulating Control Flow in a Game 173\nOnce again, I devised a two-part near call hook. The first part, \nreportDecode(), is shown here:\nvoid __stdcall reportDecode(const unsigned char* buffer, unsigned int size)\n{\n printBuffer(buffer, size);\n}\nThe function logs each packet that comes through. I didn’t have a loop \nindex at the time, so I decided that it was okay to log every single partial \npacket.\nThe second part of the hook, the myDecode() function, acts as the new \ncallee and does all of the dirty work, as follows:\nvoid __declspec(naked) myDecode()\n{\n __asm {\n MOV EAX, DWORD PTR SS:[ESP + 0x4] // get buffer\n MOV EDX, DWORD PTR SS:[ESP + 0x8] // get size\n PUSH EDX // push size\n PUSH EAX // push buffer\n CALL [origDecodeFunc]\n MOV EDX, DWORD PTR SS:[ESP + 0x4] // get the buffer\n PUSH EAX // store eax (return value)\n PUSH ECX // store ecx\n PUSH EAX // push size\n PUSH EDX // push buffer\n CALL reportDecode // report the results now\n POP ECX // restore ecx\n POP EAX // restore eax (return value)\n RETN 8 // return and clean stack\n }\n}\nI knew the buffer was decrypted in place, meaning the encrypted chunk \nwould be overwritten with the decrypted one once the call to decode() was \ncomplete. This meant that myDecode() had to call the original decode() func-\ntion before calling the reportDecode() function, which would give the \nresults of the decoding. 
Ultimately, myDecode() also needed to return with \nthe same value that the original decode() function would and clean up the \nstack, and the final POP and RETN instructions took care of that.\nPlacing the Hooks\nThe next problem I ran into was that the hooks were for code inside the \nmodule Adobe AIR.dll, which was not the main module of the game. Because \nof the code’s location, I needed to find the base addresses for the hooks \na bit differently. Additionally, since I needed these hooks to work across a \nfew different versions of Adobe AIR, I also had to find the right addresses \n174 Chapter 8\nfor each version. Instead of trying to get my hands on all of the different \nversions of Adobe AIR, I took another page out of airlog’s playbook and \ndecided to programmatically locate the addresses by writing a small mem-\nory scanner. Before I could write the memory scanner, I needed both the \nbase address and size of Adobe AIR.dll so I could limit my memory search to \nonly that area.\nI found these values using Module32First() and Module32Next() as follows:\nMODULEENTRY32 entry;\nentry.dwSize = sizeof(MODULEENTRY32);\nHANDLE snapshot = CreateToolhelp32Snapshot(TH32CS_SNAPMODULE, NULL);\nDWORD base, size;\nif (Module32First(snapshot, &entry) == TRUE) {\n while (Module32Next(snapshot, &entry) == TRUE) {\n std::wstring binaryPath = entry.szModule;\n if (binaryPath.find(\"Adobe AIR.dll\") != std::wstring::npos) {\n size = (DWORD)entry.modBaseSize;\n base = (DWORD)entry.modBaseAddr;\n break;\n }\n }\n}\nCloseHandle(snapshot);\nThis code loops through all modules in the process until it finds \nAdobe AIR.dll . When it finds the correct module entry , it fetches the \nmodBaseSize and modBaseAddr properties from it and breaks out immediately.\nThe next step was finding a sequence of bytes I could use to identify the \nfunctions. I decided to use the byte code surrounding each call. 
I also had \nto make sure that each sequence was unique while avoiding the use of any \nconstants in the patterns to ensure the code’s portability. Listing 8-13 shows \nthe byte sequences I ended up with.\nconst char encodeSeq[16] = {\n 0x8B, 0xCE, // MOV ECX, ESI\n 0xE8, 0xA6, 0xFF, 0xFF, 0xFF, // CALL encode\n 0x83, 0xF8, 0xFF, // CMP EAX, -1\n 0x74, 0x16, // JE SHORT endLoop\n 0x03, 0xF8, // ADD EDI, EAX\n 0x3B, 0xBE}; // part of CMP EDI, [ESI+0x3C58]\nconst char decodeSeq[12] = {\n 0x8B, 0xCE, // MOV ECX, ESI\n 0xE8, 0x7F, 0xF7, 0xFF, 0xFF, // CALL decode\n 0x83, 0xF8, 0xFF, // CMP EAX, -1\n 0x89, 0x86}; // part of MOV [ESI+0x1C54], EAX \nListing 8-13: The encode() and decode() byte sequences\nManipulating Control Flow in a Game 175\nNotice the CALL instruction in each pattern; these are the calls to the \nAdobe AIR functions I named encode() and decode(). I scanned for these \nsequences with the following function:\nDWORD findSequence(\n DWORD base, DWORD size, \n const char* sequence, \n unsigned int seqLen){\n for (DWORD adr = base; adr <= base + size - seqLen; adr++) {\n if (memcmp((LPVOID)sequence, (LPVOID)adr, seqLen) == 0)\n return adr;\n }\n return 0;\n}\nTreating the memory of Adobe AIR.dll as a byte array, the findSequence() \nfunction looks for a sequence of bytes as a subset of that byte array and \nreturns the address of the first match it finds. With the findSequence() func-\ntion written, finding the addresses I needed to hook encode() and decode() \nwas simple. Here’s how those calls looked:\nDWORD encodeHookAt =\n findSequence(base, size, encodeSeq, 16) + 2;\nDWORD decodeHookAt =\n findSequence(base, size, decodeSeq, 12) + 2;\nSince each target call was 2 bytes into its respective search sequence, \nall I had to do was locate each sequence and add 2. 
After that, the final \nstep was to place the hooks using the method described in “Call Hooking” \non page 153.\nWith my hook finished, I could see every single piece of data that went \nbetween the game’s client and server. Moreover, since the RTMPS protocol \nsends serialized ActionScript objects, the data was basically self-document-\ning. Every single piece of information was accompanied by a variable name. \nEvery variable existed as a member of a well-described object. Every object \nhad a consistent name. Like I said—it was a gold mine.\nApplying Jump Hooks and VF Hooks to Direct3D\nUnlike the Adobe AIR hook I just described, hooks for Direct3D (the 3D \ngraphics component of Microsoft’s DirectX API) are very common and \nhighly documented. Direct3D is ubiquitous in the world of gaming: a major-\nity of PC games use the library, which means that hooking it gives you a \nvery powerful method for intercepting data and manipulating the graphics \nlayers of many different games. You can use a Direct3D hook for a number \n176 Chapter 8\nof tasks, such as detecting locations of hidden enemy players, increasing \nthe lighting of dark in-game environments, and seamlessly displaying \nadditional graphical information. Making effective use of a Direct3D hook \nrequires you to learn about the API, but there’s more than enough informa-\ntion in this book to get you started.\nIn this section, I’ll give you a high-level introduction to a game loop that \nuses Direct3D before diving right into the implementation of a Direct3D \nhook. Rather than detailing the internals and giving you the analytical \nbackstory as I did with the Adobe AIR hook, I’ll go over the most popular \nDirect3D hook method, as it is well documented and used by the majority \nof game hackers.\nThe online resources for this book include two pieces of example code \nfor this section; find those files now if you want to follow along. 
The first \npart, an example Direct3D 9 application for you to hack on, can be found \nunder GameHackingExamples/Chapter8_Direct3DApplication. The second part, \nthe actual hook, is under Chapter8_Direct3DHook.\nThere are multiple versions of Direct3D in use at any given time, \nand there are ways to hook each one. For this book, I’ll focus on hooking \nDirect3D 9, because it is the only commonly used version that is supported \nby Windows XP.\nN o t e \t\nEven though XP has reached end of life, many people in less developed countries still \nuse it as a primary gaming system. Direct3D 9 works on all versions of Windows and \nis nearly as powerful as its successors, so many game companies still prefer to use it \nover newer versions that don’t have as much backward compatibility.\nThe Drawing Loop\nLet’s jump right in with a crash course on how Direct3D works. Inside a \nDirect3D game’s source code, you’ll find an infinite loop that processes \ninput and renders graphics. Each iteration in this drawing loop is called a \nframe. If we cut out all the extraneous code and focus simply on a bare \nskeleton, we can visualize a game’s main loop with the following code:\nint WINAPI WinMain(args)\n{\n /* Some code here would be called\n to set up Direct3D and initialize\n the game. Leaving it out for brevity. */\n MSG msg;\n while(TRUE) {\n /* Some code would be here to handle incoming\n mouse and keyboard messages. */\n drawFrame(); // this is the function we care about\n }\n /* Some code here would be called to\n clean up everything before exiting. */\n}\nManipulating Control Flow in a Game 177\nThis function is the entry point of the game. Simply put, it initial-\nizes the game and then enters the game’s main loop. Inside the main \nloop, it executes code responsible for processing user input before calling \ndrawFrame() to redraw the screen using Direct3D. 
(Check out the code in \nGameHackingExamples/Chapter8_Direct3DApplication to see a fully functional \ngame loop.)\nEach time it is called, the drawFrame() function redraws the entire \nscreen. The code looks something like this:\nvoid drawFrame()\n{\n device->Clear(0, NULL, D3DCLEAR_TARGET, D3DCOLOR_XRGB(0, 0, 0), 1.0f, 0);\n device->BeginScene();\n // drawing will happen here\n device->EndScene();\n device->Present(NULL, NULL, NULL, NULL);\n}\nAfter clearing the screen with device->Clear , the drawFrame() function \ncalls device->BeginScene() to unlock the scene for drawing. It then executes \nsome drawing code (what that drawing code actually does isn’t important \nright now) and locks the scene with a device->EndScene() call. To finish up, it \nrenders the scene to the screen by calling the device->Present() function.\nNotice that all of these functions are called as members of some \ninstance called device. This is simply an object instance representing the \nDirect3D device, which is used to invoke all sorts of drawing calls. Also, \nnotice that this function is devoid of any actual drawing code, but that’s \nokay. It’s really only important for you to understand the high-level con-\ncepts of drawing loops, frames, and the Direct3D device. To recap, games \nhave a main loop with two responsibilities:\n• \nHandling incoming messages\n• \nDrawing the game to the screen\nEach iteration in this loop is called a frame, and each frame is drawn by \na device. Taking control of the device gives you access to the most sensitive \nand descriptive details of the game’s state; that is, you’ll be able to peek into \nthe game’s state after the data has been parsed, processed, and rendered \nto the screen. Moreover, you’ll be able to modify the output of this state. \nThese two superpowers enable you to pull off all kinds of awesome hacks.\nFinding the Direct3D Device\nTo take control of a Direct3D device, you hook the member functions in the \ndevice’s VF table. 
Unfortunately, however, using the Direct3D API to instan-\ntiate your own instance of the same device class from injected code doesn’t \nmean you’ll share a VF table with the game’s instance. Direct3D devices use \n178 Chapter 8\na customized runtime implementation of VF tables, and each device gets its \nown unique VF table. Additionally, devices sometimes rewrite their own VF \ntables, removing any hooks and restoring the original function addresses. \nBoth of these Direct3D quirks leave you with one inevitable option: you \nmust find the address of the game’s device and modify its VF table directly. \nHere’s how:\n1.\t Create a Direct3D device and traverse its VF table to locate the true \naddress of EndScene().\n2.\t Place a temporary jump hook on EndScene().\n3.\t When the jump hook callback is executed, store the address of the \ndevice that was used to call the function, remove the hook, and restore \nexecution normally.\n4.\t From there, use VF hooks to hook any member function of the \nDirect3D device.\nJump Hooking EndScene()\nSince every device will call EndScene() at the end of each frame, you can \nhook EndScene() using a jump hook and intercept the game’s device from \nyour hook callback. Unique devices may have their own unique VF tables, \nbut the different tables still point to the same functions, so you can find the \naddress of EndScene() in the VF table of any arbitrary device. 
Using standard \nDirect3D API calls, you can create your own device like this:\nLPDIRECT3D9 pD3D = Direct3DCreate9(D3D_SDK_VERSION);\nif (!pD3D) return 0;\nD3DPRESENT_PARAMETERS d3dpp;\nZeroMemory( &d3dpp, sizeof(d3dpp) );\nd3dpp.Windowed = TRUE;\nd3dpp.SwapEffect = D3DSWAPEFFECT_DISCARD;\nd3dpp.hDeviceWindow = hWnd;\nLPDIRECT3DDEVICE9 device;\nHRESULT res = pD3D->CreateDevice(\n D3DADAPTER_DEFAULT,\n D3DDEVTYPE_HAL,\n hWnd,\n D3DCREATE_SOFTWARE_VERTEXPROCESSING,\n &d3dpp, &device);\nif (FAILED(res)) return 0;\nExplaining how everything in Direct3D works is outside the scope of \nthis book, so just know that you can copy this code to create a Direct3D \ndevice that contains the EndScene() function as a member. The EndScene() \naddress is at index 42 in the VF table of device (see “The Meaning of Device, \nManipulating Control Flow in a Game 179\nDirect3D, and VF Hooks” box to learn how to find that index), and you can \nread it using a subset of the VF table hooking code from “Using a VF Table \nHook” on page 159, as follows:\nDWORD getVF(DWORD classInst, DWORD funcIndex)\n{\n DWORD VFTable = readMemory(classInst);\n DWORD hookAddress = VFTable + funcIndex * sizeof(DWORD);\n return readMemory(hookAddress);\n}\nDWORD EndSceneAddress = getVF((DWORD)device, 42);\nOnce you’ve obtained the address, your discovery device has served its \npurpose, and it can be destroyed with a call to the Release() function:\npD3D->Release();\ndevice->Release();\nWith the address of EndScene() in hand, you’d be ready to start thinking \nabout how to place your hook in memory. But since you just have a function \naddress, your only option is to place a jump hook at the top of the function.\nThe Meaning of Device, Direct3D, and VF Hooks\nIf you’re wondering how I know that the index of the EndScene() function is 42, \nyou’ve come to the right box. Since Direct3D 9 is a freely available library, you \ncan actually see quite a bit of what goes on under the hood. 
The main header \nfile for the library is d3d9.h. If you open this file in your editor and search for \n“EndScene,” you’ll end up in the middle of a large class definition that speci-\nfies several functions using C macros. This is the base class for all Direct3D 9 \ndevice implementations, and it defines the virtual functions used by the class.\nThe VF table is constructed in the same order as the functions are defined \nin code, so you can determine the index of any member function by simply \ncounting the lines. You can scroll to the top of the class definition (at line 426 in \nmy version of the library, and probably yours too), note the line where the first \nfunction is declared (line 429), and then scroll to the EndScene() definition and \nnote that line (line 473). Finally, count the number of blank or commented lines \n(two for me) and do some math: 473 – 429 – 2 = 42.\nPresto! The EndScene() function is the 43rd function declared, so it sits \nat the 42nd spot in the VF table. Another advantage to having this header is \nthat you can see the name, argument types, argument names, and return type \nof every single function in the device class. So when you’re writing your own \nhooks in the future, you’ll know exactly where to look.\n180 Chapter 8\nPlacing and Removing the Jump Hook\nSince you’re just using the hook to find the device, you need to call it only \nonce. After obtaining the device, you’ll remove the jump hook and restore \nexecution back to the start of EndScene() so that the drawing loop can carry \non its work. Believe it or not, this makes your life much easier. Since the \ncode will be restored immediately, there’s no need for your trampoline to \nexecute the commands that are replaced by the jump, and there’s no need \nto pad the jump with NOPs. All you need to do is store the original bytes \nand place the hook. 
To do so, you use a slightly tweaked version of the \njump-hooking code from Listing 8-11:\nunsigned char* hookWithJump(DWORD hookAt, DWORD newFunc)\n{\n DWORD newOffset = newFunc - hookAt - 5;\n auto oldProtection = protectMemory(hookAt, PAGE_EXECUTE_READWRITE);\n unsigned char* originals = new unsigned char[5];\n for (int i = 0; i < 5; i++)\n originals[i] = readMemory(hookAt + i);\n writeMemory(hookAt, 0xE9);\n writeMemory(hookAt + 1, newOffset);\n protectMemory(hookAt, oldProtection);\n return originals;\n}\nLike the function in Listing 8-11, this function makes the memory writ-\nable , places the hook , and restores the memory protection. Before \nplacing the hook, it allocates a 5-byte buffer called originals and fills it \nwith the original bytes. After the hook is placed, it returns originals to the \ncalling function.\nWhen it’s time to remove the hook, pass originals to the following \nfunction:\nvoid unhookWithJump(DWORD hookAt, unsigned char* originals)\n{\n auto oldProtection = protectMemory(hookAt, PAGE_EXECUTE_READWRITE);\n for (int i = 0; i < 5; i++)\n writeMemory(hookAt + i, originals[i]);\n protectMemory(hookAt, oldProtection);\n delete [] originals;\n}\nThis code simply iterates over originals and quietly places those 5 bytes \nback where they were found so that everything is as expected when execu-\ntion returns to the EndScene() function. When the time comes, you can place \nand remove your actual hook using two lines of code, like this:\nauto originals = hookWithJump(EndSceneAddress, (DWORD)&endSceneTrampoline);\nunhookWithJump(EndSceneAddress, originals);\nManipulating Control Flow in a Game 181\nOnce you have the hookWithJump() and unhookWithJump() functions, it’s \ntime to prepare the callback and find the device.\nWriting the Callback and Trampoline\nEven though you can obtain the EndScene() address from a VF table, the \nEndScene() function doesn’t actually follow the __thiscall convention. 
Direct3D \nclasses are simple wrappers around a C API, and all of the member function \ncalls are forwarded to __stdcall functions that take a class instance as a first \nparameter. This means that your trampoline only needs to grab the device \nfrom the stack, pass it to the callback, and then jump back to EndScene(). \nThe callback only has to remove the jump hook before returning to the \ntrampoline.\nThe final code for the callback and trampoline to this jump hook looks \nsomething like this:\nLPDIRECT3DDEVICE9 discoveredDevice;\nDWORD __stdcall reportInitEndScene(LPDIRECT3DDEVICE9 device)\n{\n discoveredDevice = device;\n unhookWithJump(EndSceneAddress, originals);\n return EndSceneAddress;\n}\n__declspec(naked) void endSceneTrampoline()\n{\n __asm {\n MOV EAX, DWORD PTR SS:[ESP + 0x4]\n PUSH EAX // give the device to the callback\n CALL reportInitEndScene\n JMP EAX // jump to the start of EndScene\n }\n}\nUsing the hookWithJump() function, you can place a jump hook on \nEndScene() that calls the endSceneTrampoline() function. When the game’s \ndevice calls the EndScene() function, the trampoline function calls the \nreportInitEndScene() function. The reportInitEndScene() function stores the \ncaptured device pointer to a global variable called discoveredDevice, removes \nthe hook by calling unhookWithJump(), and returns the address of EndScene() \nto the trampoline. To finish up, the trampoline jumps directly to EAX, which \nwill be holding the address that was returned from the reporting function.\nNote \t\nYou can use jump hooks to completely skip the VF table hooking that I’ll show you, \nbut it’s very unreliable to use “dumb” jump hooks on commonly hooked API func-\ntions. Consistently obtaining good results with only jump hooks requires professional \nhooking libraries, and I’d rather teach you how to do it completely on your own.\nAt this point, all that’s left to do is hook the VF table of discoveredDevice \nto hack the game. 
The next two sections will walk you through hooks on the \nEndScene() and Reset() functions, which are required if you want a stable hook.\n182 Chapter 8\nWriting a Hook for EndScene()\nA hook on EndScene() is useful because it allows you to intercept a com-\npleted frame just before it is rendered; you can effectively execute your own \nrendering code inside the game loop. As you saw when locating this func-\ntion’s address in “Jump Hooking EndScene()” on page 178, this function \nis at index 42 in the VF table. You can hook EndScene() using a VF hook as \nfollows:\ntypedef HRESULT (WINAPI* _endScene)(LPDIRECT3DDEVICE9 pDevice);\n_endScene origEndScene =\n (_endScene)hookVF((DWORD)discoveredDevice, 42,(DWORD)&myEndScene);\nHRESULT WINAPI myEndScene(LPDIRECT3DDEVICE9 pDevice)\n{\n // draw your own stuff here\n return origEndScene(pDevice);\n}\nThis code uses the hookVF() function from “Using a VF Table Hook” on \npage 159 to hook EndScene() at index 42 of discoveredDevice, using myEndScene() \nas the callback function. A direct Direct3D device will occasionally repatch \nits own VF table and restore the original function addresses. This typically \nhappens from within the EndScene() function, meaning you also have to \nrepatch the VF table after calling the original EndScene() function. There \nare a few changes you can make to this hook to handle that, as shown in \nListing 8-14.\n_endScene origEndScene = NULL;\nvoid placeHooks()\n{\n auto ret = hookVF((DWORD)discoveredDevice, 42, (DWORD)&myEndScene);\n if (ret != (DWORD)&myEndScene) // don't point to your hook\n origEndScene = (_endScene)ret;\n}\nplaceHooks();\nHRESULT WINAPI myEndScene(LPDIRECT3DDEVICE9 pDevice)\n{\n // draw your own stuff here\n auto ret = origEndScene(pDevice);\n placeHooks(); // update hooks\n return ret;\n}\nListing 8-14: Final code to hook EndScene()\nThe code to place the hook has been moved into a function called \nplaceHooks() so it can be called multiple times with ease. 
The callback func-\ntion still forwards the call to the original function, but it makes sure to call \nplaceHooks() before returning. This ensures that the hook is always active, \neven if the original EndScene() function removes it.\nManipulating Control Flow in a Game 183\nAnother point to notice is that placeHooks() updates the address of \norigEndScene() every time the hook is replaced, as long as the address \nreturned from hookVF() isn’t the address of the myEndScene() function. This \ndoes two distinct things. First, it allows other applications to hook EndScene() \nwithout stepping on their toes, since it will update origEndScene() to whatever \nis seen in the VF table. Second, it makes sure that the value of origEndScene() \ncan never be the address of our callback, preventing a potential infinite \nloop. An infinite loop is possible otherwise, because origEndScene() doesn’t \nalways fix the device’s VF table, meaning placeHooks() can be called when \nthe VF table still contains the myEndScene() function.\nWriting a Hook for Reset()\nWhen you’re using a Direct3D hook in production, you’ll be doing all kinds \nof tasks like drawing custom text, displaying images related to your bot, and \ninteracting with function calls from the game. These tasks will require you \nto create your own Direct3D objects that are tied to the game’s device, and \nthat can be a problem. From time to time, the game may completely reset \nits device through a Reset() function. When a device is reset, you’ll need to \nupdate any objects (most commonly fonts and sprites) that you’ve created \nfor the device, using their OnLostDevice() member functions.\nSince Reset() is called from the VF table of the device, you can use \na hook on it to tell you when the device has been reset. Reset() takes two \nparameters and is at index 16 in the VF table. 
You can add this code to \nplaceHooks() in Listing 8-14 to hook the Reset() function:\nauto ret = hookVF((DWORD)discoveredDevice, 16, (DWORD)&myReset);\nif (ret != (DWORD)&myReset)\n origReset = (_reset)ret;\nAnd this is the declaration to use for origReset:\ntypedef HRESULT (WINAPI* _reset)(\n LPDIRECT3DDEVICE9 pDevice,\n D3DPRESENT_PARAMETERS* pPresentationParameters);\n_reset origReset = NULL;\nWhen a reset is successful, the original function returns D3D_OK. Your \nhook function recognizes this and calls OnLostDevice() accordingly:\nHRESULT WINAPI myReset(\n LPDIRECT3DDEVICE9 pDevice,\n D3DPRESENT_PARAMETERS* pPresentationParameters)\n{\n auto result = origReset(pDevice, pPresentationParameters);\n if (result == D3D_OK) {\n // call onLostDevice() for all of your objects\n }\n return result;\n}\n184 Chapter 8\nOnce you fill in the contents of the if() statement, all of your objects \nare ready to use again.\nWhat’s Next?\nNow that I’ve shown you how to take control of a game’s Direct3D device, \nyou’re probably wondering what you can do with it. Unlike the other \nexamples in the book, the code in this section and the example code don’t \nhave a one-to-one correlation, but the functionality is still the same. Here’s \na high-level view of the correlation between this chapter and the code in the \nChapter8_Direct3DHook example project.\nThe file DirectXHookCallbacks.h contains the callbacks for the EndScene() \nand Reset() functions, two callbacks for other common functions, and the \ntrampoline and reporter functions for the temporary jump hook. These func-\ntions are all pretty much as described in this chapter, except they call into a \nsingleton class defined in DirectXHook.h and DirectXHook.cpp. 
This singleton \nclass is responsible for forwarding the calls to the original functions.\nThe class is also responsible for all of the heavy lifting, and it contains \nthe code to create the discovery device, place the hooks, draw text, handle \ndevice resets, and display images. Furthermore, it allows external code to \nadd custom callbacks for each hook, as you can see in main.cpp. Here, you’ll \nsee a number of different callbacks that are drawing custom text, adding \nnew images to the screen, and changing the textures of models that are \ndrawn by the game. I recommend poking around in the code to get a better \nunderstanding of what’s going on, but don’t get too carried away. We’ll dive \ninto this code in Chapter 9 to talk about all the cool hacks it can do.\nOptional Fixes for Stability\nThe Reset() and EndScene() hooks described in this chapter should work well \nfor any game running Direct3D 9, but they are slightly unstable. If the game tries \nto execute EndScene() while the jump hook is being placed, it will crash because the \nbytes are being modified. There are two ways to fix this. First, you can place \nthe jump hook from within an IAT hook on PeekMessage(). This will work \nbecause placing an IAT hook is a thread-safe operation, but it assumes that \nPeekMessage() is called only from the same thread that does the Direct3D \ndrawing.\nA safer, but more complex, alternative is to iterate over every thread in the \ngame (similar to how it worked for thread hijacking) and use SuspendThread() \nto pause all threads in the game (except for the one placing the hook, of \ncourse). Before pausing a thread, you must make sure its EIP is not execut-\ning the first 5 bytes of EndScene(). After the hook is placed, you must use \nResumeThread() to restore execution with your hook in place.\nManipulating Control Flow in a Game 185\nClosing Thoughts\nControl flow manipulation is a very important skill in game hacking, and \na lot of the hacks in this book rely on it. 
Throughout the next two chap-\nters you’ll learn how to create common hacks using the Direct3D hook, \nand you’ll get a better idea of the general use cases of hooking. Even if you \nfeel a little shaky, continue to Chapter 9. The code examples there center \non the Direct3D hook and will get you even more familiar with hooking \ntechniques.\nPart 4\nCreating Bots\n9\nUsing Extrasensory \nPerception to Ward Off \nFog of War \nFog of war (often shortened to just fog) is \na mechanism that game developers com-\nmonly use to limit a player’s situational \nawareness and hide information about the \ngame environment. Fog is often a literal lack of sight \nin multiplayer online battle arena (MOBA) games, but \nthe concept also includes any lack or obscurity of \npertinent gameplay information. Cloaked figures, \ndark rooms, and enemies hiding behind walls are all \nforms of fog.\nGame hackers can reduce or even completely remove fog using an extra-\nsensory perception (ESP) hack. An ESP hack uses hooking, memory manipula-\ntion, or both to force a game to display hidden information. These hacks \ntake advantage of the fact that some types of fog are often implemented on \nthe client side, as opposed to the server side, meaning that the game clients \nstill contain information (partial or complete) about what is being hidden.\n190 Chapter 9\nIn this chapter, you will learn how to implement different types of ESP \nhacks. First, you’ll learn to light up dark environments. Next, you’ll use \nx-ray vision to see through walls. Finally, you’ll learn about zoom hacking, \ntweaking heads-up displays, and other simple ESP hacks that can reveal all \nsorts of useful (but otherwise hidden) information about the game you’re \nplaying.\nBackground Knowledge\nThis chapter starts the transition from hacking, puppeteering, and reverse \nengineering to coding. From here on out, you’ll be learning how to actually \ncode your own hacks. 
To keep on topic, everything I’ve talked about thus \nfar will be treated as background knowledge. If you see a technique used \nthat you don’t quite remember, such as memory scanning, setting memory \nbreakpoints, hooking, or writing memory, flip back to the relevant chapters \nand study them a bit more before continuing. Throughout the text, you’ll \nfind notes to remind you where you can brush up on certain topics.\nSpecifically, this chapter will talk a lot about Direct3D. In “Applying \nJump Hooks and VF Hooks to Direct3D” on page 175, I explained \nhow to hook into a game’s Direct3D drawing loop. The example code \nfor that chapter includes a fully featured Direct3D hooking engine in \nGameHackingExamples/Chapter8_Direct3DHook. A lot of the hacks in this \nchapter build on that hook, and their example code can be found in the \nmain.cpp file of the Direct3D hook code. You can run the compiled appli-\ncation from GameHackingExamples/Chapter8_Direct3DApplication to see the \nhacks in action on a test application.\nRevealing Hidden Details with Lighthacks\nLighthacks increase lighting in dark environments, allowing you to clearly \nsee enemies, treasure chests, pathways, and anything else that is normally \nobscured by darkness. Lighting is often a cosmetic change that’s added at a \ngame’s graphical layer, and it can usually be directly modified with a hook \non the graphics layer.\nOptimal lighting depends on camera orientation, environment layout, \nand even specific traits of a game’s engine, and you can manipulate any of \nthese factors to create lighthacks. But the easiest way is simply to add more \nlight to a room.\nAdding a Central Ambient Light Source\nThe online resources for this book include two small lighthack examples. 
\nThe first is the enableLightHackDirectional() function in main.cpp, which is \nshown in Listing 9-1.\nvoid enableLightHackDirectional(LPDIRECT3DDEVICE9 pDevice)\n{\n D3DLIGHT9 light;\nUsing Extrasensory Perception to Ward Off Fog of War 191\n ZeroMemory(&light, sizeof(light));\n light.Type = D3DLIGHT_DIRECTIONAL;\n light.Diffuse = D3DXCOLOR(0.5f, 0.5f, 0.5f, 1.0f);\n light.Direction = D3DXVECTOR3(-1.0f, -0.5f, -1.0f);\n pDevice->SetLight(0, &light);\n pDevice->LightEnable(0, TRUE);\n}\nListing 9-1: A directional lighthack\nThis code is called from the EndScene() hook, and it adds light to the \nscene by creating a light source called light. The code sets light.Type to \ndirectional, which means the light source will act like a spotlight and proj-\nect light in a specific direction. The code then sets the red, green, and blue \nvalues of light.Diffuse to 0.5, 0.5, and 0.5, giving the light an off-white shine \nwhen reflected from a surface. Next, it sets light.Direction to an arbitrary \npoint in the three-dimensional space. Finally, the code uses the game’s \nDirect3D device to set up the light at index 0 and enable lighting effects.\nN O T E \t\nIn the example application, the light shines up and to the right from the bottom left of \nthe scene. You may need to change this location depending on how your target game is \nrendered.\nNote that inserting the light at index 0 works for this proof of con-\ncept, but it won’t always work. Games typically have multiple light sources \ndefined, and setting your light at an index the game uses might override \ncritical lighting effects. In practice, you might try setting the index to an arbi-\ntrarily high number. There’s an issue with this type of lighthack, though: \ndirectional lights will be blocked by objects such as walls, creatures, and \nterrain, meaning shadows can still be cast. 
Directional lights work great \nfor wide-open spaces, but not so well for tightly wound corridors or under-\nground caves.\nIncreasing the Absolute Ambient Light\nThe other lighthack method, seen in the enableLightHackAmbient() function, \nis far more aggressive than the one in Listing 9-1. It affects the light level \nglobally, rather than adding an extra light source. Here’s what the code \nlooks like:\nvoid enableLightHackAmbient(LPDIRECT3DDEVICE9 pDevice)\n{\n pDevice->SetRenderState(D3DRS_AMBIENT, D3DCOLOR_XRGB(100, 100, 100));\n}\nThis lighthack sets the absolute ambient light (which you indicate \nby passing D3DRS_AMBIENT to the SetRenderState() function) to a medium-\nstrength white. The D3DCOLOR_XRGB macro sets that strength, taking 100 as \n192 Chapter 9\nits parameters for the red, green, and blue levels. This lights up objects \nusing an omnidirectional white light, effectively revealing everything at \nthe cost of shadows and other lighting-based details.\nCreating Other Types of Lighthacks\nThere are many other ways to create lighthacks, but they differ from game \nto game. One creative way to affect the light in a game is to NOP the code \nthat the game uses to call the device->SetRenderState() function. Since this \nfunction is used to set up the global ambient light strength, disabling calls \nto it leaves Direct3D at the default light settings and makes everything vis-\nible. This is perhaps the most powerful type of lighthack, but it requires \nyour bot to know the address of the lighting code to NOP.\nThere are also memory-based lighthacks. In some games, players and \ncreatures emit light of different colors and strengths, often depending on \nattributes like their equipment, mount, or active spells. 
If you understand \nthe structure of the game’s creature list, you can directly modify the values \nthat determine a creature’s light level.\nFor instance, imagine a game in which characters emit a bluish ball \nof light when under a healing or strengthening spell. Somewhere in the \ngame’s memory are values associated with each creature that tell the game \nthe color and intensity of light the creature should emit. If you can locate \nthese values in memory, you can change them so that the creatures effec-\ntively emit orbs of light. This type of lighthack is commonly used in games \nwith a 2D top-down style, since the orbs around individual creatures pro-\nduce a cool artistic effect while shedding light on important parts of the \nscreen. In 3D games, however, this sort of hack just turns creatures into \nblobs of light that run around.\nYou can also hook the SetLight() member function at index 51 in the \nVF table of the game’s Direct3D device. Then, whenever your hook callback \nis invoked, you can modify the properties of the intercepted D3DLIGHT9 light \nstructure before passing it to the original function. You might, for instance, \nchange all lights to the D3DLIGHT_POINT type, causing any existing light sources \nin the game to radiate light in every direction like a light bulb. This type of \nlighthack is very powerful and accurate, but it can produce some disturbing \nvisuals. It also tends to break in any environment that has no lighting, and \nopaque obstacles still block point light sources.\nLighthacks are very powerful, but they don’t reveal anything. If infor-\nmation is hidden behind an obstacle, rather than by darkness, you’ll need a \nwallhack to reveal it.\nRevealing Sneaky Enemies with Wallhacks\nYou can use wallhacks to show enemies that are hidden by walls, floors, and \nother obstacles. 
There are a few ways to create these hacks, but the most com-\nmon method takes advantage of a type of rendering known as z-buffering.\nUsing Extrasensory Perception to Ward Off Fog of War 193\nRendering with Z-Buffering\nMost graphics engines, including Direct3D, support z-buffering, which is a \nway to make sure that when there are overlapping objects in a scene, only \nthe top object is drawn. Z-buffering works by “drawing” the scene to a two-\ndimensional array that describes how close the object at each pixel on the \nscreen is to the viewer. Think of the array’s indices as axes: they correspond \nto the x-axis (right and left) and y-axis (up and down) for each pixel on the \nscreen. Each value stored in the array is the z-axis value for a pixel.\nWhen a new object appears, whether it is actually drawn on the \nscreen is decided by the z-buffer array. If the spot at the object’s x- and \ny-position is already filled in the array, that means there’s another object \nat that pixel on the screen. The new object will appear only if it has a lower \nz-axis value (that is, if it’s closer to the viewer) than the pixel already there. \nWhen the scene is finished being drawn to the array, it is flushed to the \nscreen.\nTo illustrate this, imagine a three-dimensional space that needs to be \ndrawn to a two-dimensional canvas by some game with 4×4-pixel viewport. \nThe z-buffer for this scenario would look like Figure 9-1.\nz = 0\nNo color\nz = 0\nNo color\nz = 0\nNo color\nz = 0\nNo color\nz = 0\nNo color\nz = 0\nNo color\nz = 0\nNo color\nz = 0\nNo color\nz = 0\nNo color\nz = 0\nNo color\nz = 0\nNo color\nz = 0\nNo color\nz = 0\nNo color\nz = 0\nNo color\nz = 0\nNo color\nz = 0\nNo color\n(0,0)\n(0,3)\n(3,3)\n(3,0)\nFigure 9-1: An empty z-buffer\nTo start, the game draws a blue background that completely fills the \nviewport and is located as far away on the z-axis as possible; let’s say the \nhighest z-value is 100. 
Next, the game draws a 2×2-pixel red rectangle at \n194 Chapter 9\nposition (0,0) with a z-position of 5. Finally, the game draws a 2×2-pixel \ngreen rectangle at position (1,1) with a z-position of 3. The z-buffer would \nnow look like Figure 9-2.\nz = 5\nRed\nz = 5\nRed\nz = 100\nBlue\nz = 100\nBlue\nz = 5\nRed\nz = 100\nBlue\nz = 100\nBlue\nz = 3\nGreen\nz = 3\nGreen\nz = 100\nBlue\nz = 3\nGreen\nz = 3\nGreen\nz = 100\nBlue\nz = 100\nBlue\nz = 100\nBlue\nz = 100\nBlue\n(0,0)\n(0,3)\n(3,3)\n(3,0)\nFigure 9-2: A filled z-buffer\nThe z-buffer neatly handled overlapping objects based on their z-posi-\ntions. The green square that’s closest on the z-axis overlaps the red square \nthat’s a bit farther away, and both squares overlap the blue background, \nwhich is very far away.\nThis behavior allows a game to draw its map, players, creatures, details, \nand particles without worrying about what is actually visible to the player. \nThis is a huge optimization for game developers, but it exposes a large area \nof attack. Since all game models are always given to the graphics engine, \nyou can use hooks to detect objects that the player can’t actually see.\nCreating a Direct3D Wallhack\nYou can create wallhacks that manipulate z-buffering in Direct3D using a \nhook on the DrawIndexedPrimitive() function, which is called when a game \ndraws a 3D model to the screen. When an enemy player model is drawn, \na wallhack of this type disables z-buffering, calls the original function to \ndraw the model, and then reenables z-buffering. This causes the enemy \nmodel to be drawn on top of everything else in the scene, regardless of \nwhat’s in front of it. 
Some wallhacks can also render specific models in a \nsolid color, such as red for enemies and green for allies.\nUsing Extrasensory Perception to Ward Off Fog of War 195\nToggling Z-Buffering\nThe Direct3D hook in main.cpp from GameHackingExamples/Chapter8_\nDirect3DHook has this example wallhack in the onDrawIndexedPrimitive() \nfunction:\nvoid onDrawIndexedPrimitive(\n DirectXHook* hook,\n LPDIRECT3DDEVICE9 device,\n D3DPRIMITIVETYPE primType,\n INT baseVertexIndex, UINT minVertexIndex,\n UINT numVertices, UINT startIndex, UINT primCount)\n{\n if (numVertices == 24 && primCount == 12) {\n // it's an enemy, do the wallhack\n }\n}\nThis function is used as a callback for a hook on DrawIndexedPrimitive() \nat VF index 82 of the game’s Direct3D device. Every model the game draws \npasses through this function, accompanied by some model-specific proper-\nties. By inspecting a subset of the properties, namely the numVertices and \nprimCount values, the hook detects when an enemy model is drawn and com-\nmences the wallhack. In this example, the values representing an enemy \nmodel are 24 and 12.\nThe magic happens inside the if() statement. Using just a few lines \nof code, the wallhack draws the model in a way that ignores z-buffering, \nlike so:\ndevice->SetRenderState(D3DRS_ZENABLE, false); // disable z-buffering\nDirectXHook::origDrawIndexedPrimitive( // draw model\n device, primType, baseVertexIndex,\n minVertexIndex, numVertices, startIndex, primCount);\ndevice->SetRenderState(D3DRS_ZENABLE, true); // enable z-buffering\nSimply put, this code disables z-buffering when drawing the enemy \nmodel and reenables it afterward. With z-buffering off, the enemy is drawn \nin front of everything.\nChanging an Enemy Texture\nWhen a model is rendered onscreen, a texture is used to skin the model. \nTextures are 2D images that are stretched around 3D models to apply the \ncolors and patterns that make up the model’s 3D artwork. 
To change the \nway an enemy looks when it’s drawn in your wallhack, you can set it to be \ndrawn with a different texture, as in this example:\n// when hook initializes\nLPDIRECT3DTEXTURE9 red;\nD3DXCreateTextureFromFile(device, \"red.png\", &red);\n196 Chapter 9\n// just before drawing the primitive\ndevice->SetTexture(0, red);\nThe first block of this code loads the texture from a file and is executed \nonly once—when the hook is initialized. The full example code does this in \nan initialize() function, which gets called the first time the EndScene() hook \ncallback is invoked. The second block of this code happens right before the \ncall to the original DrawIndexedPrimitive() function in the wallhack, and it \ncauses the model to be drawn with the custom texture.\nFingerprinting the Model You Want to Reveal\nThe trickiest part to creating a good wallhack is finding the right values for \nnumVertices and primCount. To do this, you can create a tool that logs every \nunique combination of the two variables and allows you to iterate over the \nlist using your keyboard. Working example code for this tool won’t be use-\nful in the example application provided with this chapter, but I’ll give you \nsome high-level implementation details.\nFirst, in the global scope, you’d declare the following:\n• \nA structure with members to store numVertices and primCount\n• \nA std::set of this structure (let’s call it seenParams)\n• \nAn instance of that structure (let’s call it currentParams)\nThe std::set requires a comparator for this structure, so you’d also \ndeclare a comparison functor that uses memcmp() to compare two of the \nstructures and reports whether the first orders before the second. 
Each time the DrawIndexedPrimitive() callback is \ninvoked, your hack could create a structure instance with the intercepted \nvalues and pass it to a seenParams.insert() function, which should insert the \nparameter pair into the set only if the pair isn’t already there.\nUsing the GetAsyncKeyState() Windows API function, you could then \ndetect when the spacebar is pressed and execute something similar to this \npseudocode:\nauto current = seenParams.find(currentParams);\nif (current == seenParams.end())\n current = seenParams.begin();\nelse\n current++;\ncurrentParams = *current;\nThis would set currentParams to the next pair in seenParams when the space-\nbar is pressed. With this code in place, you could use code similar to a wall-\nhack to change the texture of models matching currentParams.numVertices \nand currentParams.primCount. The tool could also draw those values on the \nscreen so you could see them and write them down.\nWith a tool like this, finding the proper models is as easy as starting \nup a game in a mode where your character won’t die (against a friend, in \nUsing Extrasensory Perception to Ward Off Fog of War 197\na customization mode, and so on), running the bot, and pressing the space-\nbar until each model you need is highlighted. Once you have the values \nfor your target models, you’ll modify the numVertices and primCount check in \nyour wallhack so it knows which models to highlight.\nNote \t\nCharacter models are commonly made up of smaller models for individual body seg-\nments, and games often show different models of a character at different distances. \nThat means a game may have 20 or more models for one type of character. Even in \nthat case, selecting only one model (say, the enemy’s torso) to show in your wallhack \nmay be enough.\nGetting a Wider Field of Vision with Zoomhacks\nMany games in the MOBA and real-time strategy (RTS) genres use a 3D \ntop-down style that makes them immune to wallhacks. 
They also use dark-\nness on the map as a type of fog, but showing the dark areas using a light-\nhack doesn’t give any extra information; models hidden inside the fog are \nknown only to the game server, not to the client.\nThis style makes most types of ESP hacks useless: there’s little unknown \ninformation to reveal, so these hacks only augment your view of the infor-\nmation you can already see. One type of ESP hack, however, can still be \nhelpful. Zoomhacks let you zoom out much farther than a game normally \nallows, effectively revealing large portions of the map that you couldn’t see \notherwise—and thus getting around the game’s wallhack and lighthack \nimmunity.\nUsing NOPing Zoomhacks \nMOBA and RTS games typically allow players a variable but limited amount \nof zoom. The simplest type of zoomhack finds the value of the zoom factor \n(a multiplier that changes as the zoom level changes, typically a float or \ndouble) and overwrites it with a larger value.\nTo find the zoom factor, fire up Cheat Engine and search for a float \nwith an unknown initial value. (To brush up on Cheat Engine, head over \nto “Cheat Engine’s Memory Scanner” on page 5.) For rescans, repeat \nthe following process until there are only a few values left to find the zoom \nfactor:\n1.\t Go to the game window and zoom in.\n2.\t Search for an increased value in Cheat Engine.\n3.\t Go to the game window and zoom out.\n4.\t Search for a decreased value in Cheat Engine.\nTry to get the value list down to one option. To confirm that the \nremaining value is the zoom factor, freeze it in Cheat Engine and see how \nzoom behaves in-game; freezing the proper value will disable zooming. If \nyou fail to find the zoom factor using a float search, retry the search using \n198 Chapter 9\na double. If both searches fail, try them again but correspond zooming in \nwith decreased values and zooming out with increased values instead. 
Once \nyou’ve found the zoom factor in memory, you can write a small bot to over-\nwrite it to the zoom factor that best suits you.\nMore advanced zoomhacks NOP the game code responsible for mak-\ning sure the zoom factor is within a set range. You should be able to find \nthis code with OllyDbg. Set a memory on-write breakpoint on the zoom \nfactor, zoom in-game to trigger the breakpoint, and inspect the code at \nthe breakpoint. (To hone your OllyDbg memory breakpoint skills, flip to \n“Controlling OllyDbg Through the Command Line” on page 43.) You \nshould see the code that modified the zoom factor. Zoom limitation code \nis typically easy to spot: constants that match the minimum and maximum \nzoom values are a dead giveaway.\nIf you can’t find the limitation code using this method, then the limita-\ntion may be applied when the graphics are redrawn at a new zoom level, \nrather than when the zoom factor changes. In this case, switch your break-\npoint to memory on-read and look for the same clues.\nScratching the Surface of Hooking Zoomhacks\nYou can also create zoomhacks by using a Direct3D hook on the function \ndevice->SetTransform(type, matrix), but this requires a deep understanding of \nhow a game sets up the player’s perspective. There are a few different ways \nto manage perspective, but you control zoom level using either view (trans-\nform type D3DTS_VIEW) or projection (transform type D3DTS_PROJECTION).\nProperly manipulating transform matrices that control view and projec-\ntion requires some pretty extensive knowledge of the mathematics behind \n3D graphics, though, so I stay away from this method at all costs—and I’ve \nnever had trouble simply manipulating the zoom factor. If you’re interested \nin this kind of hack, though, I recommend reading a 3D game program-\nming book to learn more about 3D mathematics first.\nBut sometimes, even a zoomhack isn’t enough. 
Some useful informa-\ntion may remain hidden as a part of a game’s internal state or may simply \nbe hard for a player to determine at a moment’s glance. For these situations, \na heads-up display is the tool for the job.\nDisplaying Hidden Data with HUDs\nA heads-up display (HUD) is a type of ESP hack that displays critical game \ninformation in an overlay. HUDs often resemble a game’s existing interface \nfor displaying information like your remaining ammunition, a mini-map, \nyour current health level, any active ability cooldowns, and so on. HUDs \ntypically display either historical or aggregated information, and they’re \nmostly used on MMORPGs. They are often text based, but some also con-\ntain sprites, shapes, and other small visual effects.\nUsing Extrasensory Perception to Ward Off Fog of War 199\nThe HUDs you can create depend on what data is available in the \ngame. Common data points are these:\n• \nExperience gain per hour (exp/h)\n• \nCreature kills per hour (KPH)\n• \nDamage per second (DPS)\n• \nGold looted per hour (GPH)\n• \nHealing per minute\n• \nEstimated time until next level\n• \nAmount of gold spent on supplies\n• \nOverall gold value of items looted\nMore advanced custom HUDs may display large tables containing items \nlooted, supplies used, the number of kills for each type of creature, and the \nnames of players that have recently been seen.\nBeyond what you’ve already learned about reading memory, hooking \ngraphics engines, and displaying customized data, there’s not much else \nI can teach you about how to create a HUD. Most games have a simple \nenough architecture that you can easily obtain most of the information you \nneed from memory. Then, you can run some basic hourly, percentage, or \nsummation calculations to get the data into a usable format.\nCreating an Experience HUD\nImagine you want a HUD that displays your current level, hourly experi-\nence, and how long you’ll have to play before your character levels up. 
First, \nyou could use Cheat Engine to find the variables that contain your level and \nexperience. When you know those values, you can use either a game-specific \nalgorithm or a hardcoded experience table to calculate the experience \nrequired to reach the next level.\nWhen you know how much experience you need to level up, you can \ncalculate your hourly experience. Put into pseudocode, that process might \nlook like this:\n// this example assumes the time is stored in milliseconds\n// for seconds, remove the \"1000 * \"\ntimeUnitsPerHour = 1000 * 60 * 60\ntimePassed = (currentTime - startTime)\n(1) timePassedToHourRatio = timeUnitsPerHour / timePassed\n(2) expGained = (currentExp - startExp)\nhourlyExp = expGained * timePassedToHourRatio\n(3) remainingExp = nextExp - currentExp\n(4) hoursToGo = remainingExp / hourlyExp\n200 Chapter 9\nTo find your hourly experience, hourlyExp, you’d store your experience \nand the time when your HUD first starts; these are startExp and startTime, \nrespectively. This example also assumes currentLevel and currentExp are pre-\nviously defined, where currentLevel is the character’s level and currentExp is \nthe current amount of experience.\nWith these values, hourlyExp can be calculated by multiplying a ratio (1) \nof the time units in an hour to the time that has passed by the experience \ngained since startTime (2). In this case, the time unit is a millisecond, so the \ntime units get multiplied by 1,000.\nNext, currentExp is subtracted from nextExp to determine the remaining \nexperience (3) to level up. To calculate how many hours are left to level up, \nyour remaining experience is divided by your hourly experience (4).\nWhen you have all this information, you can finally display it onscreen. 
\nUsing the Direct3D hooking engine provided in this book’s example code, \nyou’d draw the text using this call inside the EndScene() hook callback:\nhook->drawText(\n 10, 10,\n D3DCOLOR_ARGB(255, 255, 0, 0),\n \"Will reach level %d in %0.20f hours (%d exp per hour)\",\n currentLevel, hoursToGo, hourlyExp);\nThat’s all you need for a working, experience-tracking HUD. Variations \nof these same equations can be used to calculate KPH, DPS, GPH, and other \nuseful time-based measures. Furthermore, you can use the drawText() func-\ntion of the Direct3D hook to display any information you can locate and \nnormalize. The hook also contains addSpriteImage() and drawSpriteImage() \nfunctions that you can use to draw your own custom images, allowing you \nto make your HUDs as fancy as you want.\nUsing Hooks to Locate Data\nMemory reading isn’t the only way to get data for a custom HUD. You can \nalso gather information by counting the number of times a specific model \nis drawn by the DrawIndexedPrimitive() function, hooking the game’s internal \nfunctions responsible for drawing certain types of text, or even intercept-\ning function calls responsible for processing data packets from the game \nserver. The methods you use to do this will be drastically different for \nevery game, and finding those methods will require you to pair everything \nyou’ve learned from this book with your own ingenuity and programming \ninstincts.\nFor instance, to create a HUD that displays how many enemies are on \nthe map, you could use the model-fingerprinting methods used by wallhacks \nto count the number of enemies and output that number to the screen. 
This \nmethod is better than creating a way to read the list of enemies from mem-\nory, since it doesn’t require new memory addresses every time the game \npatches.\nUsing Extrasensory Perception to Ward Off Fog of War 201\nAnother example is displaying a list of enemy cooldowns, which would \nrequire you to intercept incoming packets that tell the client which spell \neffects to display. You could then correlate certain spells with certain \nenemies based on spell and enemy location, spell type, and so on, and use \nthat information to track spells each enemy has used. If you correlate the \ndata with a database of cooldown times, you can display exactly when each \nenemy spell can be used again. This is especially powerful because most \ngames don’t store enemy cooldowns in memory.\nAn Overview of Other ESP Hacks\nIn addition to the hacks discussed in this chapter, there are a number of \nESP hacks that don’t have common names and are specific to certain genres \nor even certain games. I’ll quickly take you through the theory, background, \nand architecture of some of these hacks.\nRange Hacks\nRange hacks use a method similar to wallhacks to detect when the \nmodels for different types of champions or heroes are drawn. Then \nthey draw circles on the ground around each hero model. The radius \nof each circle corresponds to the maximum attack range of the cham-\npion or hero it surrounds, effectively showing you areas where you can \nbe damaged by each enemy.\nLoading-Screen HUDs\nLoading-screen HUDs are common in MOBA and RTS games that \nrequire all players to sit through a loading screen while everyone’s game \nis starting up. These hacks take advantage of the fact that such games \noften have websites where historical player statistics can be queried. 
You \ncan write a bot that automatically queries the statistics of each player in \nthe game and seamlessly displays the information as an overlay on your \nloading screen, allowing you to study your enemies before launching \ninto battle.\nPick-Phase HUDs\nPick-phase HUDs are similar to their loading-screen cousins, but they \nare displayed during the pregame phase when each player is picking \na champion or hero to play. Instead of showing enemy statistics, pick-\nphase HUDs show statistics about allies. This allows you to quickly assess \nthe strengths and weaknesses of your allies so you can make better deci-\nsions about which character to play.\nFloor Spy Hacks\nFloor spy hacks are common in older 2D top-down games that have dif-\nferent distinct floors or platforms. If you’re on the top floor, you might \nwant to know what’s going on downstairs before you go charging in. \nYou can write floor spy hacks that modify the current floor value (typi-\ncally an unsigned int) to a different floor above or below you, allowing \nyou to spy on other floors.\n202 Chapter 9\nGames often recalculate the current floor value every frame based on \nplayer position, so NOPs are sometimes required to keep the value from \nbeing reset every time a frame is redrawn. Finding the current floor value \nand the code to NOP would be similar to finding the zoom factor, as dis-\ncussed in “Using NOPing Zoomhacks” on page 197.\nClosing Thoughts\nESP hacks are powerful ways to obtain extra information about a game. \nSome of them can be done pretty easily through Direct3D hooks or simple \nmemory editing. Others require you to learn about a game’s internal data \nstructures and hook proprietary functions, giving you a reason to employ \nyour reverse engineering skills.\nIf you want to experiment with ESP hacks, study and tweak the example \ncode for this chapter. 
For practice with more specific ESP hacks, I encour-\nage you to go out and find some games to play around with.\n10\nResponsive Hacks\nThe average gamer has a reaction time of \n250 milliseconds, or a quarter of a second. \nProfessional gamers average a fifth of a \nsecond, but some can react in a sixth of a sec-\nond. These figures are based on online tests that mea-\nsure players’ reaction times to singular, predictable \nevents. In actual games, though, players must react to dozens of differ-\nent events, like health loss, incoming skill shots, abilities coming off of \ncooldown, enemy attacks, and many others. Only very skilled gamers can \nmaintain a fourth- or fifth-of-a-second reaction time in such dynamic envi-\nronments; the only way to be faster is to be a computer.\nIn this chapter, you’ll learn how to make bots that react faster than any \nplayer. First, I’ll show you some code patterns you can incorporate into a \nbot to detect when certain events happen within a game. Next, you’ll learn \nhow to make a bot that moves your character, heals, or casts spells all on its \nown. Once you’ve explored those fundamental techniques, I’ll help you tie \nthem together to implement some of the most common, and most powerful, \nresponsive hacks.\n204 Chapter 10\nObserving Game Events\nWithin just a few seconds of playing a game, most people can make essen-\ntial observations about the game environment. You can clearly see when \nmissiles are flying toward your character, when your health is too low, and \nwhen abilities come off of cooldown. For a bot, though, these seemingly \nintuitive observations are not as easy to make. 
The bot must detect each \nevent by looking for changes in memory, detecting visual cues, or intercept-\ning network traffic.\nMonitoring Memory\nTo detect simple events, such as your health bar dropping low, you can pro-\ngram a bot to periodically read your health from memory and compare it to \nsome minimum acceptable value, as in Listing 10-1.\n// do this every 10 milliseconds (100 times a second)\nauto health = readMemory(HEALTH_ADDRESS);\nif (health <= 500) {\n // some code to tell the bot how to react\n}\nListing 10-1: An if statement that checks health\nGiven the address of your character’s health, you can check the value \nthere as often as you need; every 10 milliseconds is typically a good rate. \n(Flip back to Chapter 1 if you need a refresher on locating values in mem-\nory.) Once health drops below a certain value, you’ll want to run some reac-\ntion code to cast a healing spell or drink a potion. I’ll talk about how you \ncan do this later in the chapter.\nIf you want your bot to have more granular information and the chance \nfor a greater variety of responses, you can program it to react to any change \nin health, instead of only after a set threshold. To do so, change the code in \nListing 10-1 to compare your current health to the amount you had during \nthe previous execution, as follows:\n// still do this every 10 milliseconds\nstatic int previousHealth = 0;\nauto health = readMemory(HEALTH_ADDRESS);\nif (health != previousHealth) {\n if (health > previousHealth) {\n // react to increase\n } else {\n // react to decrease\n }\n previousHealth = health;\n}\nNow, this code uses a static variable called previousHealth to track the \nvalue of health on the previous iteration. If previousHealth and health differ, \nthe bot not only reacts to the change in health but also reacts differently \nResponsive Hacks 205\nto health increases and decreases. This technique is the simplest, and most \ncommon, way to react to changes in a game state. 
With the proper memory \naddresses, you can use this code pattern to observe changes in health, \nmana, ability cooldowns, and other critical information.\nDetecting Visual Cues\nHealth is relatively simple for a bot to check because it’s just a number, but \nsome game elements have to be relayed to the bot differently. For example, \nwhen status ailments or buffs are affecting a character, the easiest way for \nyou to tell is to simply look for an onscreen status indicator, and the same is \ntrue for bots. \nWhen reading memory isn’t enough, you can detect certain events by \nhooking a game’s graphics engine and waiting for the game to render a \nspecific model. (Refer back to “Applying Jump Hooks and VF Hooks to \nDirect3D” on page 175 and “Creating a Direct3D Wallhack” on page 194 \nto get refreshed on Direct3D hooks. ) When the model is drawn, you can \nqueue up a reaction to be executed after the frame is drawn, like this:\n// below is the drawIndexedPrimitive hook\nvoid onDrawIndexedPrimitive(...) {\n if (numVertices == EVENT_VERT && primCount == EVENT_PRIM) {\n // react, preferably after drawing is done\n }\n}\nUsing the same model-fingerprinting trick as the wallhack code in \nChapter 9, this code detects when a specific model is drawn to the screen \nand reacts accordingly. This code reacts to the event every single frame, \nthough, and that can make your game unplayable. You’ll probably want \nsome internal cooldown to avoid spamming a reaction. In cases where the \nindicator model is persistently drawn (that is, not blinking), you can actu-\nally track it across frames to determine when it appears and disappears. \nHere’s a code snippet that also handles tracking:\nbool eventActive = false;\nbool eventActiveLastFrame = false;\n// below is the drawIndexedPrimitive hook\nvoid onDrawIndexedPrimitive(...) {\n if (numVertices == EVENT_VERT && primCount == EVENT_PRIM)\n eventActive = true;\n}\n// below is the endScene hook\nvoid onDrawFrame(...) 
{\n if (eventActive) {\n if (!eventActiveLastFrame) {\n // react to event model appear\n }\n eventActiveLastFrame = true;\n206 Chapter 10\n } else {\n if (eventActiveLastFrame) {\n // react to event model disappear\n }\n eventActiveLastFrame = false;\n }\n eventActive = false;\n}\nThe onDrawIndexedPrimitive() function still checks whether a certain \nmodel was drawn, but now, two Booleans track whether the model was \ndrawn this frame or the previous frame. Then, when the frame is com-\npletely drawn, the bot can check these variables and react to the model \neither appearing or disappearing.\nThis method works great for detecting visual status indicators that \nappear only when your character is affected by stuns, movement slows, \nsnares, poisons, and so on. You can also use it to detect when enemies \nappear and disappear in MOBA and RTS games, as these games draw only \nenemies that are explicitly in the sight range of an allied unit or player.\nIntercepting Network Traffic\nOne of the most reliable ways to observe events is the same way the game \nclient does: by waiting for the game server to tell you that they occurred. \nIn this type of communication, the game server sends byte arrays called \npackets over the network to the client, using sockets. The packets are typi-\ncally encrypted and contain blobs of data serialized through a proprietary \nformat. 
\nA Typical Packet-Parsing Function\nTo receive and process packets, a game client does something like \nListing 10-2 before it draws a frame.\nvoid parseNextPacket() {\n if (!network->packetReady()) return;\n auto packet = network->getPacket();\n auto data = packet->decrypt();\n switch (data->getType()) {\n case PACKET_HEALTH_CHANGE:\n onHealthChange(data->getMessage());\n break;\n case PACKET_MANA_CHANGE:\n onManaChange(data->getMessage());\n break;\n // more cases for more packet types\n }\n}\nListing 10-2: A simplified look at how a game parses packets\nResponsive Hacks 207\nThe exact code for any particular game might look different, but the \ncontrol flow is always the same: receive a packet, decrypt it, decide what \nkind of message it contains, and call a function that knows what to do with \nit. Some game hackers intercept raw network packets and replicate this \nfunctionality in their bots. This technique works, but it requires extensive \nknowledge of encryption, a complete understanding of how the game stores \ndata inside a packet, the ability to man-in-the-middle the network connec-\ntion, and a way to locate the decryption keys being used by the game client.\nHooking the functions responsible for handling the packets after they \nare decrypted and parsed is a much better approach; in Listing 10-2, those \nfunctions are the onHealthChange() and onManaChange() functions. This method \nleverages the game’s inherent ability to process packets, allowing a bot to \nremain ignorant of the various network facilities the game uses. It also gives \nyou discretion over which network data you intercept, as you need to hook \nonly the handlers that meet your needs.\nNote:\nIntercepting entire packets can sometimes be advantageous—for example, in any \ngame that uses Adobe AIR and communicates using RTMPS. Since RTMPS is so \nheavily documented, there’s no need to reverse engineer the format or encryption. 
\nChapter 8 explains how to hook RTMPS in detail.\nThere are a few tricks you can use to easily find the parser function \nand, ultimately, the switch() statement that dispatches packets to their han-\ndlers. The most useful method I’ve found is to place a breakpoint on the \nfunction the game uses to receive data from the network, and then analyze \nthe flow of the application when the breakpoint is hit.\nLet’s walk through how you might do this with OllyDbg attached to \nyour target game. In Windows, recv() is the API function to receive data \nfrom a socket. From the OllyDbg command line, you can set a breakpoint \non recv() by entering the bp recv command. When the breakpoint is hit, \nyou can climb the call stack using ctrl-F9, the shortcut for execute until \nreturn, and F8, the shortcut for step over. This combination essentially lets \nthe program execute until the callee has returned to the caller, allowing \nyou to climb the call stack in tandem with the game. At each stack level, you \ncan inspect the code of each caller until you find one that has a big switch() \nstatement; this should be the packet parser.\nA Trickier Parser\nDepending on the game’s architecture, though, finding the parser function \nmay not be that simple. Consider a game with a parser function that looks \nlike this:\npacketHandlers[PACKET_HEALTH_CHANGE] = onHealthChange;\npacketHandlers[PACKET_MANA_CHANGE] = onManaChange;\nvoid parseNextPacket()\n{\n if (!network->packetReady()) return;\n208 Chapter 10\n auto packet = network->getPacket();\n auto data = packet->decrypt();\n auto handler = packetHandlers[data->getType()];\n handler->invoke(data->getMessage());\n}\nSince the parseNextPacket() function doesn’t have a switch() statement, \nthere’s no obvious way to identify it in memory. Unless you pay very close \nattention, you’ll likely climb right past it on the call stack. 
When a game \nhas a parser function like this, trying to figure out what the parser func-\ntion looks like might be pointless. If you don’t see a switch() statement when \nclimbing the recv() call stack, you’ll have to note every callee on the call \nstack instead.\nInstead of climbing up the call stack from the breakpoint, you’d go \nto every address marked as a RETURN below ESP in the OllyDbg stack pane. \nThese are the return addresses into each caller for each callee. At each \nreturn address, you’d need to find the top of the caller in OllyDbg’s \ndisassembly pane and note the address. As a result, you’d have a list of every \nfunction call leading up to the recv() call.\nNext, you’d repeat the same list-making process from breakpoints \nplaced on a few of the game’s handler functions. You can find a handler \nfunction by monitoring memory that it will inevitably use. The handler for \na health change packet, for instance, will update your health in memory. \nUsing OllyDbg, you can set a memory on-write breakpoint to the health \naddress. When the breakpoint gets triggered, it means the game updated \nthe health value from a handler function. This should work the same way \nfor most values that are controlled by the server. The server will control any \ngame-critical values, such as health, mana, level, items, and so on.\nOnce you’ve recorded the call stack from recv() and a few handler func-\ntions, you can correlate them to locate the parser function. 
For example, \nconsider the three pseudo–call stacks in Table 10-1.\nTable 10-1: Pseudo–Call Stacks for Three Packet-Related Functions\nrecv() stack\nonHealthChange() stack\nonManaChange() stack\n0x0BADF00D\n0x101E1337\n0x14141414\n0x40404040\n0x50505050\n0x60606060\n0xDEADBEEF\n0xDEADBEEF\n0xDEADBEEF\n0x30303030\n0x30303030\n0x30303030\n0x20202020\n0x20202020\n0x20202020\n0x10101010\n0x10101010\n0x10101010\nThese stacks show what memory might look like during a call to recv() \nand to a game’s hypothetical onHealthChange() and onManaChange() functions. \nNotice that each function originates from a chain of four common function \nResponsive Hacks 209\ncalls (shown in boldface). The deepest common address, 0xDEADBEEF, is \nthe address of the parser. For a better understanding of this structure, look \nat the call stacks laid out in a tree view, as in Figure 10-1.\n0x0BADF00D -> recv()\n0x40404040 -> network->getPacket()\n0x14141414 -> onManaChange()\n0x60606060 -> handler->invoke()\n0x101E1337 -> onHealthChange()\n0x50505050 -> handler->invoke()\n0xDEADBEEF -> parseNextPacket()\n0x30303030 -> processInput()\n0x20202020 -> executeFrame()\n0x10101010 -> main()\nFigure 10-1: Tree view of our three call stacks\nEach function’s call stack branches out from the function at \n0xDEADBEEF, meaning that function is a common point of origin for \nall three calls. The example parseNextPacket() function is responsible for \ncalling these functions, so it must be the most recent common ancestor at \n0xDEADBEEF.\nNote:\nThese call stacks are hypothetical, and they’re simplified beyond what you’ll typically \nencounter. Real call stacks will probably have quite a few more function calls, and \ncomparing them won’t be as easy. \nA Hybrid Parsing System\nA third variation of the parsing loop might be a hybrid of the previous two \nthat uses a switch() statement after a function call. 
Here’s another hypo-\nthetical function:\nvoid processNextPacket()\n{\n if (!network->packetReady()) return;\n auto packet = network->getPacket();\n auto data = packet->decrypt();\n dispatchPacket(data);\n}\nvoid dispatchPacket(data)\n{\n switch (data->getType()) {\n case PACKET_HEALTH_CHANGE:\n processHealthChangePacket(data->getMessage());\n break;\n210 Chapter 10\n case PACKET_MANA_CHANGE:\n processManaChangePacket(data->getMessage());\n break;\n // more cases for more data types\n }\n}\nThe processNextPacket() function fetches a new packet and calls \ndispatchPacket() to handle the data. In this case, the dispatchPacket() func-\ntion exists in the call stack of each handler, but not in the one for the \nrecv() function. Look at the hypothetical stacks in Table 10-2, for \nexample. \nTable 10-2: Pseudo–Call Stacks for Three Packet-Related Functions\nrecv() stack\nonHealthChange() stack\nonManaChange() stack\n0x0BADF00D\n0x101E1337\n0x14141414\n0x40404040\n0x00ABCDEF\n0x00ABCDEF\n0xDEADBEEF\n0xDEADBEEF\n0xDEADBEEF\n0x30303030\n0x30303030\n0x30303030\n0x20202020\n0x20202020\n0x20202020\n0x10101010\n0x10101010\n0x10101010\nAlthough these three functions have the same first four addresses in \ntheir call stacks, only the two handlers have one more address in common \n(again shown in boldface). That’s 0x00ABCDEF, and it’s the address of the \ndispatchPacket() function. Once again, you can imagine these laid out in a \ntree view, as in Figure 10-2.\n0x0BADF00D -> recv()\n0x40404040 -> network->getPacket()\n0xDEADBEEF -> parseNextPacket()\n0x30303030 -> processInput()\n0x20202020 -> executeFrame()\n0x10101010 -> main()\n0x14141414 -> onManaChange()\n0x101E1337 -> onHealthChange()\n0x00ABCDEF -> dispatchPacket()\nFigure 10-2: Tree view of our three call stacks\nA Parser Hack\nOnce you’ve located the function responsible for dispatching packets to \ntheir handlers, you’ll be able to spot every handler that can be called. 
You \nResponsive Hacks 211\ncan deduce a handler’s purpose by placing a breakpoint on it and watching \nwhat values change in memory when it executes. Then, you can hook any \nhandlers that your bot needs to react to. (Flip back to Chapter 8 if you need \na refresher on how you might hook these functions. ) \nOf course, there are endless ways to implement network behavior. I \ncan’t cover them all, but seeing these three common techniques should \nhelp you understand the methodology. No matter what game you’re deal-\ning with, a breakpoint on recv() should be a step in the right direction.\nPerforming In-Game Actions\nBefore a bot can react to events, you have to teach it to play the game. It \nneeds to be able to cast spells, move around, and activate items. On this \nfront, bots aren’t much different from people: they can just be told which \nbuttons to press. Pressing buttons is simple and suffices in many cases, but \nin more intricate situations, a bot may have to communicate on the network \nand tell the server what it’s trying to do.\nTo follow along with the examples in this section and explore on your \nown afterward, open the files in the GameHackingExamples/Chapter10_\nResponsiveHacks/ folder in this book’s resource files.\nEmulating the Keyboard\nThe most common buttons you’ll press in a game are keyboard keys, and \nthere are a couple of ways you can teach your bot to type.\nThe SendInput() Function\nOne common way to emulate the keyboard is with the SendInput() Windows \nAPI function. This function, which sends keyboard and mouse input to the \ntopmost window, has the following prototype:\nUINT SendInput(UINT inputCount, LPINPUT inputs, int size);\nThe first parameter, inputCount, is the number of inputs being sent. For \nthe examples in this book, I’ll always use a value of 1. The second parame-\nter, inputs, is a pointer to a structure (or an array of structures whose length \nmatches the inputCount value) with the predefined type INPUT. 
The final \nparameter is the size of inputs in memory, as calculated with the formula \nsize = inputCount × sizeof(INPUT). \nThe INPUT structure tells the SendInput() function what type of input to \nsend, and the following code shows how you might initialize an instance of \nINPUT to press the F1 key:\nINPUT input = {0};\ninput.type = INPUT_KEYBOARD;\ninput.ki.wVk = VK_F1;\n212 Chapter 10\nTo have your bot actually press F1, you’d need to send this input twice, \nlike so:\nSendInput(1, &input, sizeof(input));\n// change input to key up\ninput.ki.dwFlags |= KEYEVENTF_KEYUP;\nSendInput(1, &input, sizeof(input));\nThe first call to SendInput() presses F1, and the second releases it. The \nrelease happens not because the input was sent twice, but because the sec-\nond call was made with the KEYEVENTF_KEYUP flag enabled in the input param-\neter’s keyboard flags field. Since setting up input for even a single key is a bit \nmessy, it’s best to wrap everything inside a function. The result looks some-\nthing like Listing 10-3.\nvoid sendKeyWithSendInput(WORD key, bool up)\n{\n INPUT input = {0};\n input.type = INPUT_KEYBOARD;\n input.ki.wVk = key;\n input.ki.dwFlags = 0;\n if (up)\n input.ki.dwFlags |= KEYEVENTF_KEYUP;\n SendInput(1, &input, sizeof(input));\n}\nsendKeyWithSendInput(VK_F1, false); // press\nsendKeyWithSendInput(VK_F1, true); // release\nListing 10-3: A wrapper for emulating keystrokes with SendInput()\nThis function initializes input with the given key, enables the flag \nKEYEVENTF_KEYUP if up is set, and calls the SendInput() function. This means \nsendKeyWithSendInput() must be called a second time to send the key release, \neven though the release is always required. The function is written this way \nbecause key combinations that involve modifiers like shift, alt, or ctrl \nmust be sent a bit differently; the modifier’s press must come before the \nkey’s press, but its release must come after the key’s release. 
\nThe following code shows how you’d use the sendKeyWithSendInput() func-\ntion to tell a bot to press shift-F1:\nsendKeyWithSendInput(VK_LSHIFT, false); // press shift\nsendKeyWithSendInput(VK_F1, false); // press F1\nsendKeyWithSendInput(VK_F1, true); // release F1\nsendKeyWithSendInput(VK_LSHIFT, true); // release shift\nYou’d have to call sendKeyWithSendInput() four times, but that’s still easier \nthan using the code without a wrapper function.\nResponsive Hacks 213\nThe SendMessage() Function\nAn alternative method for sending keystrokes relies on the SendMessage() \nWindows API function. This function allows you to send input to any win-\ndow, even if it’s minimized or hidden, by posting data directly to the target \nwindow’s message queue. This advantage makes it the method of choice \nfor game hackers, because it enables users to do other things while their \nbot plays the game in the background. SendMessage() has the following \nprototype:\nLRESULT SendMessage(\n HWND window,\n UINT message,\n WPARAM wparam,\n LPARAM lparam);\nThe first parameter, window, is a handle to the window that the input \nis being sent to. The second parameter, message, is the type of input being \nsent; for keyboard input, this parameter is WM_KEYUP, WM_KEYDOWN, or WM_CHAR. \nThe third parameter, wparam, should be the key code. The final parameter, \nlparam, should be 0 when the message is WM_KEYDOWN and 1 otherwise.\nBefore you can use the SendMessage() function, you must obtain a handle \nto the target process’s main window. Given the title of the window, you can \nobtain a handle using the FindWindow() Windows API function, as follows:\nauto window = FindWindowA(NULL, \"Title Of Game Window\");\nWith a valid window handle, making a call to SendMessage() looks some-\nthing like this:\nSendMessageA(window, WM_KEYDOWN, VK_F1, 0);\nSendMessageA(window, WM_KEYUP, VK_F1, 1);\nThe first call presses the F1 key, and the second call releases it. 
Keep in \nmind, however, that this series of calls works only for keys that don’t input \ntext, like F1, insert, or tab. To have your bot press keys that input text, you \nmust also send a WM_CHAR message between the down and up messages. To \ntype W, for instance, you’d do something like this:\nDWORD key = (DWORD)'W';\nSendMessageA(window, WM_KEYDOWN, key, 0);\nSendMessageA(window, WM_CHAR, key, 1);\nSendMessageA(window, WM_KEYUP, key, 1);\nThis creates a key variable so the letter key to press can be changed eas-\nily. Then it follows the same steps the F1 example used, just with a WM_CHAR \nmessage in between.\n214 Chapter 10\nN o t e \t\nYou can actually send nothing but the WM_CHAR message and get the same result, but \nit’s best practice to send all three messages. Game developers can easily shut down bots \nby patching the game to ignore WM_CHAR messages that don’t follow WM_KEYDOWN, and \nthey can even use it as a way to detect your bot and ban you.\nAs I showed with the SendInput() technique, you can create a wrapper \naround this functionality to make your bot code easier to work with. The \nwrapper looks something like this:\nvoid sendKeyWithSendMessage(HWND window, WORD key, char letter)\n{\n SendMessageA(window, WM_KEYDOWN, key, 0);\n if (letter != 0)\n SendMessageA(window, WM_CHAR, letter, 1);\n SendMessageA(window, WM_KEYUP, key, 1);\n}\nUnlike Listing 10-3, this wrapper actually sends both the press and \nrelease. This is because SendMessage() can’t be used to send keystrokes with \nmodifiers, so there’s never any need to insert code between the two calls. \nN o t e \t\nThere are multiple ways a game might check whether a modifier key is pressed, \nthough. You might be able to send modifier keys to certain games by calling the \nSendMessage() function, but it depends on how those games detect modifiers.\nYou can use this wrapper in a similar way as the one in Listing 10-3. 
For \nexample, this code sends F1 followed by W:\nsendKeyWithSendMessage(window, VK_F1, 0);\nsendKeyWithSendMessage(window, 'W', 'W');\nThis example, like all of the SendMessage() code I’ve shown so far, sim-\nply gets the job done. It can input text, but it doesn’t exactly send proper \nmessages. \nThere are a lot of small details you have to get right if you want to send \n100 percent valid messages with the SendMessage() function. For instance, \nthe first 16 bits of lparam should store the number of times the key has been \nautomatically repeated as a result of being held down. The next 8 bits \nshould store the scan code, a key identifier that is specific to each keyboard \nmanufacturer. The next bit, number 24, should be set only if the button is \non an extended part of the keyboard, such as the number pad. The follow-\ning 4 bits are undocumented, and the next bit should be set only if the alt \nkey was down when the message originated. The last 2 bits are the previous \nstate flag and the transition state flag. The previous state flag is set only if \nthe key was previously down, and the transition state is set only if the key \nwas previously in the state opposite its current position (that is, if the key is \nnow up and was previously down, or vice versa).\nResponsive Hacks 215\nThankfully, the average game doesn’t consider most of these values. For \nthat matter, the average piece of software doesn’t care about them either. \nIf you have to fill all of these values with proper data to make your bot \nwork, you’re moving in the wrong direction. There are many other ways to \nperform actions, the majority of which are simpler than trying to emulate \nthe exact behavior of the operating system’s kernel-level keyboard input \nhandler/dispatcher. 
In fact, there’s already a function that does that, and \nI’ve already talked about it: the SendInput() function.\nYou can also control the mouse with the SendInput() and SendMessage() \nfunctions, but I highly recommend avoiding it. Any mouse commands \nyou send will affect, and be affected by, any legitimate mouse movements, \nmouse clicks, or keystrokes sent by the player. The same is true for keyboard \ninput, but the complications are much rarer.\nSending Packets\nBefore a game draws a frame, it checks for keyboard and mouse input. \nWhen it receives input that results in an action, such as moving around or \ncasting a spell, it checks to make sure the action is possible and, if so, tells \nthe game server that the action has been performed. The game code to \ncheck for events and alert the server often looks something like this:\nvoid processInput() {\n do {\n auto input = getNextInput();\n if (input.isKeyboard())\n processKeyboardInput(input);\n // handle other input types (e.g., mouse)\n } while (!input.isEmpty());\n}\nvoid processKeyboardInput(input) {\n if (input.isKeyPress()) {\n if (input.getKey() == 'W')\n step(FORWARD);\n else if (input.getKey() == 'S')\n step(BACKWARD);\n // handle other keystrokes (e.g., 'A' and 'D')\n }\n} \nvoid step(int direction) {\n if (!map->canWalkOn(player->position))\n return;\n playerMovePacket packet(direction);\n network->send(packet);\n}\nThe processInput() function is called every frame. The function iterates \nover all pending inputs and dispatches different types of inputs to their rel-\nevant handlers. In this case, when keyboard input is received, it’s dispatched \n216 Chapter 10\nto the processKeyboardInput() function. This handler then checks whether \nthe key is either W or S, and, if so, calls step() to move the player in the cor-\nresponding direction.\nSince step() is used to perform an action, it is called an actor function. \nThe invocation of an actor function is called actuation. 
You can directly call \na game’s actor functions from your bot to perform an action while com-\npletely bypassing the input layer. \nBefore you can call an actor, though, you must find its address. To do \nthis, you can attach OllyDbg to the game, open the command line, and \nenter bp send. This will place a breakpoint on the send() function, which is \nused to send data over the network. When you play the game, every time \nyou take a step, cast a spell, pick up loot, or do anything else, your break-\npoint should trigger, and you can note each function in the call stack.\nNote\nThe game should call send() every time you do anything while playing. Pay attention \nto what you did before each send() breakpoint is hit, as that will give you a rough \nidea of what action each call is communicating to the server, and, ultimately, what \nthe actor you find is responsible for. \nOnce you have a few different call stacks, you can compare them to \nlocate the actor functions. To see how to spot the actor functions, let’s com-\npare the two annotated call stacks in Figure 10-3.\n0x0BADF00D → castSpell()\n0x40404040 → processKeyboardInput()\n0x30303030 → processInput()\n0x20202020 → doFrame()\n0x10101010 → main()\n0xDEADBEEF → step()\n0x70707070 → send()\n0x60606060 → connection->send()\n0x50505050 → network->send()\nFigure 10-3: Tree view of call stacks to two actor functions\nLike these two stacks, the call stacks you find should be identical at the \ntop, sharing a couple of common functions responsible for generic network \ntransmission. They should also be identical on the bottom, since each call \nto send() should have originated from the processInput() function. Each \nstack should have some unique functions between these identical regions, \nResponsive Hacks 217\nthough, and those are the actor functions you’re looking for. Typically, the \nfunction of interest is immediately beneath the common network calls. 
In \nthis case, the two actors are the step() and castSpell() functions.\nAfter hacking the same game for a while, you’ll learn how far up \nthe stack the actor functions are from the send() call. In Figure 10-3, for \nexample, the actors happen three calls before the send() call. Knowing this, \nyou could just climb the stack in OllyDbg (ctrl-F9 followed by F8) three \ntimes when your send() breakpoint is hit and be inside the actor function \nthat sent the data.\nOnce you’ve found an actor function, you can call it from an injected \nDLL. Here’s how you might call step() if you found it at 0xDEADBEEF:\ntypedef void _step(int direction);\nauto stepActor = (_step*)0xDEADBEEF;\nstepActor(FORWARD);\nSince the bot won’t know the actual name for this game function, the \ncode casts the address 0xDEADBEEF to a function pointer and stores it in a conveniently \nnamed variable: stepActor. Then, the code just calls stepActor() like any \nother function.\nIf you’ve got the right address, function prototype, and parameters, this \nshould work beautifully; you’ll be able to automate actions as if you have \naccess to the game’s source code. Just make sure to call the actor functions \nfrom inside the same thread as the game, or you can run into threading \nissues. The best way to do this is to call the actors from a hook on a major \nfunction like Direct3D’s EndScene() or the Windows API’s PeekMessage() func-\ntion, as these functions will usually be called only from the game’s main \nthread.\nUsing this to Call __thiscall\nIf you try to call an actor function that’s a nonstatic member of a class, the \nfunction will have a __thiscall calling convention, which means you’ll need to \npass the instance of the class on the ECX register. (You can brush up on calling \nconventions in “Function Calls” on page 94.) Passing the instance is straight-\nforward, but you’ll have to locate a pointer chain to the class instance first. 
\nTo find the pointer chain, you can drop a breakpoint on the actor function, \ngrab the class instance value from ECX when the breakpoint kicks, and throw \nthat value into a Cheat Engine pointer scan. Then, to call the function, you’d \nwalk the pointer chain, obtain the current instance address, and use inline \nassembly to set up ECX and make the actual function call. This process works \nsimilarly to the way VF hook callbacks call their original counterparts, as shown \nin “Writing a VF Table Hook” on page 156.\n218 Chapter 10\nTying the Pieces Together\nAfter you’ve created frameworks for observing events and performing \nactions, you can tie them together to create responsive hacks. Responsive \nhacks come in many flavors, but there are a few common ones. \nMaking the Perfect Healer\nA favorite bot among gamers is autohealing, a hack that automatically uses a \nhealing spell when the player’s health decreases drastically or drops below \na certain threshold. Given a way to detect changes in health and an actor \nfunction to cast spells, an autohealer might look something like this:\nvoid onHealthDecrease(int health, int delta) {\n if (health <= 500) // health below 500\n castHealing();\n else if (delta >= 400) // large drop in health\n castHealing();\n}\nThis autohealing function is pretty simple, but it works well. More \nadvanced autohealers might have many more levels of healing and be able \nto learn as they go. You’ll get working example code and an in-depth expla-\nnation of advanced autohealers in “Control Theory and Game Hacking” on \npage 222.\nResisting Enemy Crowd-Control Attacks\nAnti-crowd-control hacks detect incoming crowd-control attacks and automati-\ncally cast spells that reduce their effects or completely negate them. Crowd-\ncontrol attacks disable players in some way, so having enemies cast them on \nyou can be a pain. 
\nGiven a way to detect incoming or active crowd-control effects, such as \nby detecting a Direct3D model or by intercepting an incoming packet, and \nan actor function to cast spells, you could have a bot react instantly to such \nattacks like so:\nvoid onIncomingCrowdControl() {\n // cast a shield to block the crowd control\n castSpellShield();\n}\nvoid onReceiveCrowdControl() {\n // cleanse crowd control that has already taken effect\n castCleanse();\n}\nAn onIncomingCrowdControl() function might try to stop the crowd-\ncontrol spell from ever hitting you. Failing that, the bot could call an \nonReceiveCrowdControl() spell to remove the effects.\nResponsive Hacks 219\nAvoiding Wasted Mana\nSpell trainers are also quite common among botters. Spell trainers wait until \nthe player has full mana and then cast spells to increase the player’s magic \nlevel or stats. This allows players to quickly increase their magic skills, as \nthey will never waste mana regeneration just because they have full mana. \nGiven a way to detect changes in mana and an actor function to cast \nspells, a bot might include the following pseudocode for a spell trainer:\nvoid onManaIncrease(int mana, int delta) {\n if (delta >= 100) // player is using mana potions,\n return; // they must need the mana, abort\n if (mana >= MAX_MANA - 10) // mana is nearly full, waste some\n castManaWasteSpell();\n}\nThis function takes the player’s mana and the increase in that player’s \nmana (delta) as parameters. If the increase in mana is above a certain \namount, it assumes the player is using potions or other items to replenish \nmana, and it won’t cast any extra spells. Otherwise, if the player has plenty \nof mana, the function fires off any old spell to get the player some experi-\nence points.\nOther common responsive hacks are autoreload to instantly reload \nammo, autododge to evade incoming projectiles, and autocombo to instantly \nattack the same target as a nearby ally. 
Really, the only limit to the number \nof responsive hacks you can add to a bot is the number of events your bot \ncan observe in the game, multiplied by the number of valid and helpful \nresponses it can send for each event.\nClosing Thoughts\nUsing hooks, memory manipulation, and keyboard simulation, you can \nbegin creating your first responsive hacks. These hacks are your entry \npoint into gaming autonomy, but they’re only a glimpse of what’s possible. \nChapter 11 will be the pinnacle of your game-hacking adventure. Using \neverything you’ve learned so far, and building on the principles of respon-\nsive hacks, you’ll learn how to automate advanced actions and create a truly \nautonomous bot.\nIf you’re not feeling quite ready to go deeper, I strongly recommend \nreviewing the earlier material and then getting some practice in an isolated \nenvironment on your own machine. Implementing bots like this is a lot eas-\nier than you might think, and it’s an amazingly satisfying experience. Once \nyou’re comfortable making autohealers and other basic responsive hacks, \nyou’ll be ready to start completely automating gameplay.\n11\nPutting It All Together: \nWriting Autonomous Bots\nThe end goal of game hacking is to make \na full-fledged automated bot capable of \nplaying a game for hours on end. Such \nbots can heal, drink potions, farm monsters, \nloot corpses, walk around, sell loot, buy supplies, and \nmore. Making bots this powerful requires you to com-\nbine your hooks and memory reads with concepts like \ncontrol theory, state machines, and search algorithms, \nwhich are all covered in this chapter.\nThroughout the lessons here, you’ll also learn about common auto-\nmated hacks and how they should behave at a high level. 
After covering \nthe theory and code behind automated hacks, I’ll give you a high-level \nlook at two types of bots that rely on such code: cavebots, which can explore \n222 Chapter 11\ncaves and bring home the loot, and warbots, which can fight enemies for \nyou. By the end of the chapter, you should be ready to bust out your tools, \nfire up your development environment, and start making some really awe-\nsome bots.\nControl Theory and Game Hacking\nControl theory is a branch of engineering that provides a way to control the \nbehavior of dynamic systems. Control theory determines the state of a system \nusing sensors, after which a controller determines the set of actions needed to \nbring the system’s current state to some other desired state. After the con-\ntroller executes the first action in the set, the entire process—known as a \nfeedback loop—repeats (see Figure 11-1).\nAction\nSystem output\nController\nSystem\nSensors\nFigure 11-1: A control theory feedback loop\nLet’s apply this feedback loop to game hacking. To automate play \nwithin a game (the system), a bot implements some algorithms (the con-\ntroller) that understand how to play the game in any state observed by \nthe memory reads, network hooks, and so on (the sensors). The control-\nler typically has some human inputs, like the path to walk, creatures to \nattack, and loot to pick up. Thus, to reach the desired state, the controller \nmust perform some subset of these inputs that are possible given the cur-\nrent state.\nFor instance, if there are no creatures onscreen and no corpses to \nloot, the desired state may be for the player to reach the next location \n(called a waypoint) in the predefined path. In this case, the controller \nmoves the player one step closer to the waypoint on each iteration. 
If the \nplayer encounters a creature, the controller might decide to attack the \ncreature in the first frame and, in the following frames, switch between \nrunning from the creature (known as kiting) and shooting spells at it. \nOnce the creature dies, the controller executes a set of actions to loot \nthe body and continue to the next waypoint.\nGiven this example of how a feedback loop might operate, it might \nseem overwhelming to code such a system. Luckily, there are a few design \npatterns that make the task much easier than it sounds.\nPutting It All Together: Writing Autonomous Bots 223\nState Machines\nState machines are mathematical models \nof computation that describe how a sys-\ntem behaves based on input. Figure 11-2 \nshows a simple state machine that reads \na list of binary digits. The machine \nstarts with an initial state of S1. As it \niterates over the digits in the input, it \nchanges its state accordingly. In this \ncase, states S1 and S2 repeat themselves \nwhen the machine encounters a 1 and \nactivate one another when it encounters \na 0. For example, for the binary digits \n11000111, the state transitions would be \nS1, S1, S2, S1, S2, S2, S2, and finally S2.\nWith a small spin on the classical state machine theory, a state machine \ncan be the controller in a control theory feedback loop. This tweaked ver-\nsion of a state machine comprises a list of states, the conditions signifying \neach state, and the actions that must happen to reach each state.\nS1\nS2\nInput\n1\n1\n0\n0\nFigure 11-2: A simple state machine\nState Machines and Game Hacking\nA game-hacking state machine not only must keep an internal state but also \nmust respond to (or actuate) the game environment based on that state. The \noverall game state can change based on your bot’s actuation, the behavior \nof other players, and other unpredictable occurrences in the game environ-\nment. 
For this reason, trying to persistently walk a state machine based on the \nobserved game environment is futile; it’s nearly impossible to create a set of \ntransitions for each state to account for every possible observation that can \nbe made between iterations. It makes more sense for the state machine to \nreevaluate the game environment as a fresh slate each time it considers the \ninput. To do this, the state machine must use the game environment itself as \nthe mechanism for transitioning between states—that is, the machine’s actua-\ntion on the environment should have enough of an effect on the next iterations \nthat it activates a new state. Classical state machines can be devised that are \ncapable of working like this, but we’re going to flatten them out and use them in \na much simpler, yet still very powerful, way.\nIf you’re familiar with classical state machines, this may not seem intuitive, \nbut in the coming sections you’ll see how state machines can be mutated and \npaired with control theory to achieve what we want. \n224 Chapter 11\nThe major difference is that instead of one state merely activating \nanother, for each state in a game automation state machine, a bot will per-\nform in-game actions that change the overall state of the game and, thus, \nthe state that is detected on the next iteration of the feedback loop. In \ncode, an object to represent a state in this machine might look like this:\nclass StateDefinition {\npublic:\n StateDefinition(){}\n ~StateDefinition(){}\n bool condition();\n void reach();\n};\nYou can assemble StateDefinition objects into a state machine with a \nsimple std::vector definition, like this:\nstd::vector<StateDefinition> stateMachine;\nAnd presto, you have the skeleton of a state machine, ready to receive \nany StateDefinition objects you create. 
In conjunction with a feedback loop, \nthis state machine can be used to define the flow of automation.\nFirst, you can create a list of definitions that model your bot’s desired \nbehavior, ordered in the vector by importance. Each StateDefinition object \ncan use information from your sensors as input, passing that data to the \ncondition() function to determine whether or not the state should be acti-\nvated. Then, you can create a controller that loops over the list of states, \ncalling the reach() function of the first state whose condition() function \nreturns false. Finally, you can wrap the controller in a feedback loop. If you \ndon’t see how this feedback loop would work yet, don’t worry; I’ll show you \nhow to code it now.\nNote\nYou can think of the statement in your condition() function as a requirement for the \nmachine to transition to the next state. If the statement is true, it means no actuation \nmust happen before the next state in the list can be evaluated and the loop can con-\ntinue iterating. If the statement is false, it means some actuation must occur before the \ntransition can happen.\nYou’ll find all of the example code for the following section and \n“Error Correction” on page 230 in the GameHackingExamples/Chapter11_\nStateMachines directory of this book’s source files. The included projects \ncan be compiled with Visual Studio 2010, but they should also work with any \nother C++ compiler. Download them at https://www.nostarch.com/gamehacking/ \nand compile them if you want to follow along.\nPutting It All Together: Writing Autonomous Bots 225\nCombining Control Theory and State Machines\nTo tie states together with a feedback loop, first you have to provide each \nStateDefinition object with a generic way to access the sensors and actua-\ntors that you’ve implemented. 
The StateDefinition class then becomes the \nfollowing:\nclass StateDefinition {\npublic:\n StateDefinition(){}\n ~StateDefinition(){}\n bool condition(GameSensors* sensors);\n void reach(GameSensors* sensors, GameActuators* actuators);\n};\nThis change simply modifies the condition() and reach() functions to \naccept instances of the classes GameSensors and GameActuators as arguments. \nGameSensors and GameActuators are classes you need to define; GameSensors will \ncontain the results of memory reads, network hooks, and other data sources \nyour bot intercepts from the game, while GameActuators will be a collection of \nactor functions capable of performing actions inside the game.\nNext, you need a generic way to define each individual state. You \ncould abstract the definition of each state to its own class that inherits \nStateDefinition and implements condition() and reach() as virtual functions. \nAlternatively, if the source code needs to fit in a small space (like a book, \nwink wink), you could keep a single class to represent each definition and \nuse std::function to implement the condition() and reach() functions outside \nthe class definition.\nFollowing that alternative method, the final version of StateDefinition \nwould look like this:\nclass StateDefinition {\npublic:\n StateDefinition(){}\n ~StateDefinition(){}\n std::function<bool(GameSensors*)> condition;\n std::function<void(GameSensors*, GameActuators*)> reach;\n};\nWith this version of the StateDefinition class, you could define a new \nstate by creating an instance of the class and assigning condition() and \nreach() to functions that correspond with the intended behavior.\nA Basic Healer State Machine\nThe next step is defining the bot’s actual behavior. To keep the example \ncode simple, let’s say you’re implementing an automatic healer. 
This healer \n226 Chapter 11\nhas two healing methods: it uses strong healing if the player is at or below \n50 percent health and weak healing if the player is between 51 and 70 per-\ncent health.\nA state machine representing this behavior needs two states, one for \nstrong healing and one for weak healing. To start, you need to define the \nstate machine as a vector with two StateDefinition objects:\nstd::vector<StateDefinition> stateMachine(2);\nThis code creates a state machine called stateMachine and initializes \nit with two empty StateDefinition objects. Next, you define the condition() \nand reach() functions for these state definitions. The strong healing state is \nthe most important because it keeps the character from dying, so it should \ncome first in the vector, as shown in Listing 11-1.\nauto curDef = stateMachine.begin();\ncurDef->condition = [](GameSensors* sensors) {\n❶ return sensors->getHealthPercent() > 50;\n};\ncurDef->reach = [](GameSensors* sensors, GameActuators* actuators) {\n❷ actuators->strongHeal();\n};\nListing 11-1: Code for a strong healing state\nThis code first creates an iterator called curDef that points to the first \nStateDefinition object in the stateMachine vector. 
The object’s condition() \nfunction is then defined ❶; in English, this definition says, “The state is met \nif the player’s health percent is greater than 50.” If the state isn’t met, then \nthe object’s reach() function calls the strongHeal() actor function ❷ so that \nstrong healing can be performed.\nWith the strong healing state defined, next you define the weak healing \nstate, as shown in Listing 11-2.\ncurDef++;\ncurDef->condition = [](GameSensors* sensors) {\n❶ return sensors->getHealthPercent() > 70;\n};\ncurDef->reach = [](GameSensors* sensors, GameActuators* actuators) {\n❷ actuators->weakHeal();\n};\nListing 11-2: Code for weak healing\nAfter incrementing curDef so it points to the second StateDefinition \nobject in the stateMachine vector, this code defines the object’s condition() \nPutting It All Together: Writing Autonomous Bots 227\nfunction ❶ as, “The state is met if the player’s health percent is greater than \n70.” It also defines the object’s reach() function as an actuators->weakHeal() \ncall ❷.\nOnce you’ve finished defining the state machine, you must imple-\nment the controller. Since the actual behavior of the controller is con-\ntained in the state machine, you only need to add a simple loop to \ncomplete it:\nfor (auto state = stateMachine.begin(); state != stateMachine.end(); state++) {\n if (❶!state->condition(&sensors)) {\n state->reach(&sensors, &actuators);\n break;\n }\n}\nThis controller loop iterates over the state machine, executes the \nreach() function of the first state whose condition() function returns false ❶, \nand breaks out if any reach() function is called. 
The final step is to imple-\nment the feedback loop and plop the controller loop inside it, as shown in \nListing 11-3.\nwhile (true) {\n for (auto state = stateMachine.begin();\n state != stateMachine.end();\n state++) {\n if (!state->condition(&sensors)) {\n state->reach(&sensors, &actuators);\n break;\n }\n }\n Sleep(FEEDBACK_LOOP_TIMEOUT);\n}\nListing 11-3: Final healing state machine and feedback loop\nThis loop continuously executes the controller loop and sleeps for \nFEEDBACK_LOOP_TIMEOUT milliseconds between each execution. The Sleep() \ncall allows the game server to receive and process any actuation from the \nprevious iteration and allows the game client to receive any results of the \nactuation from the server before executing the next controller loop.\nIf you’re still a bit confused about what I just showed you, check out \nFigure 11-3, which shows how the infinitely looping code in Listing 11-3 \nworks. First, it checks whether the strong healing condition is true, and if it \nis, the weak healing condition is checked. If the strong healing condition \nis false, then the player’s health must be at or below 50 percent, so a strong \nhealing method gets called. If the weak healing condition check is false, \nthen the player’s health must be between 51 and 70 percent, so the weak \nhealing method is executed.\n228 Chapter 11\nWeak healing condition:\nPlayer has more than\n70 percent health\nStrong healing condition:\nPlayer has more than\n50 percent health\nWeak healing\nreach()\nStrong healing\nreach()\nSleep()\nwhile (true)\nBegin\nTrue\nTrue\nFalse\nFalse\nFigure 11-3: Flowchart of the healing state machine and feedback loop\nAfter either method, the machine sleeps. If both condition checks are \ntrue, then the player needs no healing. 
The machine does nothing to change \nthe state and sleeps before starting again at the top of the while loop.\nA Complex Hypothetical State Machine\nThe behavior implemented in the healing state machine is simple, so roll-\ning it into this kind of control structure may seem like overkill, but it’s \nuseful if you want to expand the controller. If, for example, you wanted to \ncombine the healing state machine with the “walk, attack, loot” behavior \nthat I discussed in “Control Theory and Game Hacking” on page 222, the \ncontrol structure would be much more complex. Let’s take a high-level look \nat the states you’d need:\nStrong healing Condition met if health is over 50 percent. Reach by \ncasting strong healing spell.\nWeak healing Condition met if health is over 70 percent. Reach by \ncasting weak healing spell.\nAttack spell Condition met if no target is available or if attack spell is \non cooldown. Reach by casting attack spell on target.\nPutting It All Together: Writing Autonomous Bots 229\nKite monster Condition met if no target is available or if distance \nfrom target is adequate. (The definition of “adequate” depends on how \nfar away you want to be from enemies when kiting.) Reach by taking a \nstep away from target.\nTarget monster Condition met if there’s no creature to attack. Reach \nby attacking a creature.\nLoot item Condition met if there’s no corpse open or if open corpse \nhas nothing to loot. Reach by taking an item from open corpse.\nApproach corpse Condition met if there are no corpses to open or if \nadjacent to a corpse. Reach by taking a step toward a corpse that will be \nopened.\nOpen corpse Condition met if the character is not adjacent to a \ncorpse that can be opened. Reach by opening adjacent corpse.\nFollow path Condition met if the character is unable to move to cur-\nrent waypoint or if standing on current waypoint. 
Reach by taking a \nstep toward current waypoint.\nAdvance waypoint Condition met if there are no waypoints left to \nfollow. Reach by updating the current waypoint to the next waypoint \nin the list. If the character can’t reach the current waypoint for some \nreason (say, if the character is stuck), then the Advance Waypoint state \nkeeps it from being stuck. If the character has reached the current way-\npoint, Advance Waypoint selects the next waypoint to keep things mov-\ning along.\nThis state machine is quite a bit more complex than the healing-\nonly state machine. If I diagrammed this state machine, there would \nbe 23 objects in the diagram, with arrows going over 33 control paths. \nCompare that to Figure 11-3, which has only 7 objects and 9 control paths.\nYou could code the healer behavior without using a state machine or \nfeedback loop, but I can’t imagine how to easily do the same for this full-\nfledged bot. Each of these 10 states relies on not only its own condition \nbut also the condition of every state preceding it. Moreover, hardcoding \nthe logic would either require a ton of nested if() statements or a bunch \nof stacked if()/return() statements—and, either way, it would just behave \nexactly like the state machine but with no runtime flexibility.\nRuntime flexibility refers to a state machine’s ability to mutate. Unlike \nhardcoded condition checks, state definitions in a state machine can be \nmoved, removed, and added dynamically. The state machine method allows \nyou to plug and play different behaviors and features depending on user \ninput.\nTo take this concept a step further, you could expose your sensors and \nactuators to a Lua environment, create Lua functions capable of adding \nand removing states from the state machine, and modify the StateDefinition \nso that its condition() and reach() functions can call Lua functions exposed \nby the Lua environment. 
Writing a control system this way would allow you \n230 Chapter 11\nto code the core of your bot (hooks, memory reading, actuation) in C++ \nwhile making Lua (a high-level, dynamic language) available to you for \nautomation.\nN o t e \t\nYou can embed Lua in your own programs by including a few headers and link-\ning against the Lua library. This process is not difficult, but it’s outside the scope of \nthis book, so I encourage you to check out Chapter 24 of Programming in Lua by \nRoberto Ierusalimschy (http://www.lua.org/pil/24.html) for more information.\nError Correction\nAnother piece of control theory that’s useful for game hacking is error correc-\ntion. An error correction mechanism in a controller observes the outcome \nof actuation, compares the outcome to an expected result, and adjusts future \ncalculations to bring later outcomes closer to the expected one. Error correc-\ntion can come in handy when you’re working with stochastic systems, where \nthe output generated from a given input is not fully predictable.\nGames as a whole are stochastic, but, luckily for game hackers, the results \nof actions are mostly deterministic. Take the healing controller, for example. \nIn most games, you can calculate exactly how much health you can heal \nwith a given spell, and, thus, you know exactly when to heal. But imagine \nyou’re writing a healer for the small spectrum of situations where your heal-\ning is impossible to calculate; for instance, maybe the bot is supposed to \nwork on a variety of characters spanning many levels without user input.\nError correction could enable your bot to learn how to best heal the \nplayers. In this scenario, there are two ways you can implement error cor-\nrection, each of which depends on how the healing system works.\nAdjusting for a Constant Ratio\nIf you heal for a constant ratio of health, you’ll only need to adjust your con-\ntroller after the first heal. 
Assuming that your sensors can detect how much \nyou’ve healed, this adds only a few lines of code. You could easily modify the \nweak healing state in Listing 11-2 to something like this:\ncurDef->condition = [](GameSensors* sensors) -> bool {\n static float healAt = 70;\n static bool hasLearned = false;\n if (!hasLearned && sensors->detectedWeakHeal()) {\n hasLearned = true; \n healAt = 100 - sensors->getWeakHealIncrease();\n }\n return sensors->getHealthPercent() > healAt;\n};\nInstead of hardcoding 70 as the threshold for weak healing, this code \nmoves the threshold to a static variable called healAt. It also adds another \nstatic variable called hasLearned so that the code knows when learning is \ncomplete.\nPutting It All Together: Writing Autonomous Bots 231\nOn each invocation of this condition() function, the code checks two \nconditions: whether hasLearned is false and whether the sensors detected a \nweak healing event. When this check passes, the code sets hasLearned to true \nand updates healAt to heal at or below the perfect percentage; that is, if your \nweak healing mustered up a 20 percent increase in health, healAt would be \nset to 80 percent health instead of 70 percent, so each heal would bring the \nplayer back up to 100 percent health.\nImplementing Adaptable Error Correction\nBut what if your healing power increases? If a character can gain levels, \napply skill points, or increase maximum health, the amount of health it can \nheal may change accordingly. For example, if you start a bot on a level-10 \ncharacter and let it run until the character is level 40, your healing code \nwill need to adapt. A level-40 character healing like it did at level 10 would \neither immensely overheal or die quickly against on-level game enemies.\nTo handle this scenario, a bot needs to constantly update its healing \nthreshold to reflect the observed healing amount. 
Listing 11-4 shows how you \ncan modify the strong healing condition function in Listing 11-1 to do this.\ncurDef->condition = [](GameSensors* sensors) -> bool {\n static float healAt = 50;\nu if (sensors->detectedStrongHeal()) {\n auto newHealAt = 100 - sensors->getStrongHealIncrease();\nv healAt = (healAt + newHealAt) / 2.00f;\nw sensors->clearStrongHealInfo();\n }\n return sensors->getHealthPercent() > healAt;\n};\nListing 11-4: Tweaking the strong healing condition code\nAs in the modified weak healing function, the healing threshold has \nbeen moved to a static variable called healAt, but this time, the logic is a bit \ndifferent. Since learning must happen continually, there’s no variable to \ntrack whether the bot has already learned its true healing capacity. Instead, \nthe code just checks whether the sensors have seen a strong healing event \nsince its last invocation u. If so, the code replaces healAt with the average of \nhealAt and newHealAt and calls a function to clear the sensors of information \nrelated to strong healing w.\nClearing the sensors is actually very important, because it keeps the \ncode from constantly updating healAt against feedback from the same \nstrong healing cast. Notice, too, that this function doesn’t update healAt \nto a perfect value but instead slides it toward the observed optimal value. \nThis behavior makes the new function ideal for situations where there is \nsome amount of randomness in how much you can actually heal. If your bot \nneeds to slide toward the new value faster, you might change the line at v \nto something like this:\nhealAt = (healAt + newHealAt * 2) / 3.00f;\n232 Chapter 11\nThis code to update healAt uses an average weighted toward the newHealAt \nvalue. There are a few points to consider when using this approach, however. \nFirst, what happens when you overheal? In some games, when you heal to \nfull health, your sensors might be able to detect only how much you actu-\nally healed. 
In other games, your sensors may be able to detect the full \namount the spell is capable of healing. Put another way, if you cast a 30 percent strong heal from \n85 percent health, do your sensors see a heal of 30 percent or 15 percent? \nIf the answer is 30 percent, you’re set. If the answer is 15 percent, your code \nneeds a way to adjust downward.\nOne way to adjust accordingly is to decrement healAt when your sensors \nsee a heal that brings you to full health, like this:\ncurDef->condition = [](GameSensors* sensors) -> bool {\n static float healAt = 50;\n if (sensors->detectedStrongHeal()) {\nu if (sensors->getStrongHealMaxed()) {\n healAt--;\n } else {\n auto newHealAt = 100 - sensors->getStrongHealIncrease();\n healAt = (healAt + newHealAt) / 2.00f;\n }\n sensors->clearStrongHealInfo();\n }\n return sensors->getHealthPercent() > healAt;\n};\nThis code is almost the same as Listing 11-4, but it adds an if() clause \nto decrement healAt if a max heal is detected u. Otherwise, the function \nshould behave like Listing 11-4.\nHealing is a simple case, but this code shows a great example of how \nyou can use error correction to dynamically improve your bots’ behavior. \nOne more advanced use case is adjusting skillshots to account for enemy \nmovement patterns. Every player has patterns in how they avoid skillshots, \nso if your sensors are able to measure the direction and distance an enemy \nmoves when dodging a skillshot, your controller code can adjust the loca-\ntion where the bot initially shoots the skillshot. In this same scenario, learn-\ning would also help the bot account for differences in game server latency, \ncharacter movement speed, and so on.\nWhen using error correction, note that your code will be cleaner and \nmore portable if your state definitions have some form of internal book-\nkeeping other than static variables. 
Moreover, to avoid cluttering your state \ndefinitions, I suggest encapsulating the error correction logic in some \nexternal modules that are easily invoked when needed.\nPathfinding with Search Algorithms\nOne common challenge you’ll face when writing an autonomous bot is cal-\nculating a path for a character to follow from one location to another. Aside \nfrom the sheer reverse engineering challenge of creating sensors to read \nPutting It All Together: Writing Autonomous Bots 233\nwhich coordinates on the game map are blocking forward movement or \nnot, there’s also the algorithmic challenge of calculating a path within that \nmap. Calculating a path is called pathfinding, and game hackers often use a \nsearch algorithm to tackle it.\nTwo Common Search Techniques\nGiven a grid of tiles, a starting location a, and an ending location b, a \nsearch algorithm calculates a path from a to b. The algorithm does this \nby creating a node at a, adding nodes adjacent to a to a list of tiles to be \nexplored (called the frontier), updating the node to the best tile in the fron-\ntier, and repeating the process until the node reaches b. Different search \nalgorithms select the best node differently, using either a cost, a heuristic, \nor both.\nDijkstra’s algorithm, for example, calculates the cost of a tile based on its \ndistance from the a node and selects the tile with the lowest cost. Imagine \nan empty two-dimensional grid with a in the middle. In a search following \nDijkstra’s algorithm, the frontier will expand in a circular pattern around a \nuntil b lies on the edge of the circle, as seen in Figure 11-4.\nThe greedy best-first search algorithm, instead of prioritizing nodes by \ntheir distance from the starting point, uses a heuristic to estimate the dis-\ntance from a node in the frontier to b. The algorithm then selects the node \nwith the shortest estimated distance. 
Imagine this algorithm in the same \ngrid as before; the frontier would be a line going almost directly from a to \nb, as seen in Figure 11-5.\nHow Obstacles Disrupt Searches\nThe difference in how these algorithms behave becomes clearer once obsta-\ncles are added to the grid. If, for instance, a wall separates a and b, Dijkstra’s \nFigure 11-4: The frontier of Dijkstra’s \nalgorithm. Lighter tiles are higher cost.\nFigure 11-5: The frontier of the \ngreedy best-first search algorithm. \nLighter tiles are higher cost.\na\nb\na\nb\n234 Chapter 11\nalgorithm will always find the quickest path, but with a huge consequence. \nThe radius of the circular frontier around a will be equal to the length of \nthe final path; let’s call that radius r. If no grid boundaries clip the frontier, \nyou can roughly calculate the number of nodes opened by taking the area \nof a circle with radius r. If the path around the wall is 50 tiles, the algorithm \nwill open roughly 7,854 tiles, as shown in this equation:\nπ × 50² ≈ 7,854\nIn the same scenario, greedy best-first search will calculate a less-than-\noptimal path but open substantially fewer tiles. It’s not as easy to visualize \nhow the frontier will expand, and it’s not important right now, so I won’t go \ninto it here. At the end of the day, neither of these algorithms really fits the \npathfinding problem well. The optimal path is slow, and the fast path isn’t \noptimal.\nTo quickly calculate an optimal path, you need to fuse Dijkstra’s algo-\nrithm with greedy best-first search. Luckily, someone has already done this, \nand the resulting algorithm is a monster known as A-star search (often just \ncalled A*).\nA* uses the sum of a cost, called g, and a heuristic, called h, to select \nnodes. This resulting sum is called the score. Put simply, score = g + h. 
Like \nDijkstra’s algorithm, A* can calculate the most optimal path from a to b, \nand like greedy best-first search, it can do so relatively quickly.\nAn A* Search Algorithm\nNow that you know the fundamentals, let’s write code to implement the \nA* algorithm. This implementation will work in a two-dimensional grid. It \nwon’t allow diagonal movement at first, but I’ll discuss in a bit how you can \nmodify the code to work with diagonal movement, too.\nAll of the example code for this section is in the GameHackingExamples/\nChapter11_SearchAlgorithms directory of this book’s source files. The included \nprojects can be compiled with Visual Studio 2010, but they should also work \nwith any other C++ compiler. Download them at https://www.nostarch.com/\ngamehacking/ and compile them to follow along. If you execute Chapter11_\nSearchAlgorithms.exe, you’ll be able to define your own 20×20 grid and watch \nthe algorithm calculate a search path.\nCreating an A* Node\nTo start, define an empty AStarNode class as follows:\ntypedef std::shared_ptr AStarNodePtr;\nclass AStarNode\n{\npublic:\n};\nThis code defines the AStarNode class and a std::shared_ptr type defini-\ntion called AStarNodePtr to make it easier to create safe pointers to the class. 
\nPutting It All Together: Writing Autonomous Bots 235\nNext, within the public scope of this class, declare member variables for the \nnode’s x-position, y-position, cost, and node’s score:\nint x, y;\nint g, score;\nAdditionally, you need a public member of type AStarNodePtr that refer-\nences the parent node:\nAStarNodePtr parent;\nAfter declaring all member variables, declare a public constructor that \ninitializes them upon instance creation, as follows:\nAStarNode(int x, int y, int cost, AStarNodePtr p, int score = 0)\n : x(x), y(y), g(cost), score(score), parent(p)\n{}\nNow, to make creating safe pointers easier, add a static helper function \nlike this:\nstatic AStarNodePtr makePtr(\n int x, int y, int cost, \n AStarNodePtr p, \n int score = 0)\n{\n return AStarNodePtr(new AStarNode(x, y, cost, p, score));\n}\nThis makePtr() function creates a new instance of AStarNode and returns \nthe instance wrapped inside of an AStarNodePtr.\nLet’s recap. The AStarNode class has member variables x, y, g, score, and \nparent. When the class is constructed, all of these members are initialized \nfrom values passed to the constructor, with the exception of score, which \nis optional (because you use it only when making copies of an AStarNode \ninstance) and set to 0 if not provided. \nNext, define a public member function to calculate the heuristic when \ngiven the destination coordinates:\nint heuristic(const int destx, int desty) const\n{\n int xd = destx - x;\n int yd = desty - y;\nu return abs(xd) + abs(yd);\n}\nThis function returns the Manhattan distance heuristic u, a distance cal-\nculation designed for grids where diagonal movement is not possible: \n|∆x| + |∆y|\n236 Chapter 11\nTo calculate a path that allows diagonal movement, you’d need to \nmodify this function to use the Euclidean distance heuristic, which looks \nlike this:\n√((∆x × ∆x) + (∆y × ∆y))\nThe class also needs a function to update score. 
You add that function \nto the public scope as follows:\n#define TILE_COST 1\nvoid updateScore(int endx, int endy)\n{\n auto h = this->heuristic(endx, endy) * TILE_COST;\n this->score = g + h;\n}\nNow, score should change to g + h when given destination coordinates \nto calculate h.\nTo wrap up, the node class also needs a function that can calculate all \nof its child nodes. The function could do this by creating new nodes for \neach tile adjacent to the current node. Each new node refers to the current \nnode as its parent, so the class needs to be able to create an AStarNodePtr to a \ncopy of the current node as well. Here’s how all that works:\nAStarNodePtr getCopy()\n{\n return AStarNode::makePtr(x, y, g, parent, score);\n}\nstd::vector getChildren(int width, int height)\n{\n std::vector ret;\n auto copy = getCopy();\n if (x > 0)\nu ret.push_back(AStarNode::makePtr(x - 1, y, g + TILE_COST, copy));\n if (y > 0)\nv ret.push_back(AStarNode::makePtr(x, y - 1, g + TILE_COST, copy));\n if (x < width - 1)\nw ret.push_back(AStarNode::makePtr(x + 1, y, g + TILE_COST, copy));\n if (y < height - 1)\nx ret.push_back(AStarNode::makePtr(x, y + 1, g + TILE_COST, copy));\n return ret;\n}\nThis function creates child nodes at (x – 1, y) u, (x, y – 1) v, (x + 1, \ny) w, and (x, y + 1) x. Their parent is the node that called getChildren, and \ntheir g is the parent’s g plus TILE_COST.\nTo allow for diagonal movement, this function needs to add children \nat (x – 1, y – 1), (x + 1, y – 1), (x + 1, y + 1), and (x – 1, y + 1). Additionally, if \nPutting It All Together: Writing Autonomous Bots 237\nmoving diagonally would cost more—that is, if the character requires more \ntime to do it—you’d also need to do the following:\n1.\t Change TILE_COST to 10.\n2.\t Define a constant DIAG_TILE_COST as TILE_COST multiplied by the time \nincrease. 
If a diagonal step takes 1.5 times as long, DIAG_TILE_COST would \nbe 15.\n3.\t Give diagonal children a g of the parent’s g plus DIAG_TILE_COST.\nTo finish off AStarNode, declare operators for comparing the priority and \nequality of two nodes. You could place these declarations outside the class \nin global scope like this:\nu bool operator<(const AStarNodePtr &a, const AStarNodePtr &b)\n{\n return a.score > b.score;\n}\nv bool operator==(const AStarNodePtr &a, const AStarNodePtr &b)\n{\n return a.x == b.x && a.y == b.y;\n}\nThese operators allow std::priority_queue to sort nodes by score u and \nstd::find to determine node equality by location v.\nWriting the A* Search Function\nNow that you’ve completed the AStarNode class, you can code the actual \nsearch function. Start by defining the function prototype:\ntemplate\nbool doAStarSearch(\n int map[WIDTH][HEIGHT],\n int startx, int starty,\n int endx, int endy,\n int path[WIDTH][HEIGHT])\n{ }\nThe prototype accepts the game map’s width and height, as well as the \nvalue that signifies a blocking tile on the map, as template parameters. The \ndoAStarSearch() function also takes the map itself (map), the starting coordi-\nnates (startx and starty), the destination coordinates (endx and endy), and a \nblank map (path) where it can fill the calculated path when it finishes.\nN o t e \t\nThe first three parameters are template parameters, so you can pass them as compile-\ntime constants. I’ve done this for the example code to allow explicit array size declara-\ntions for the map and path parameters and to allow a definite value to signify blocking \ntiles on the map. 
In practice, the map you read from a game will have a dynamic size, \nand you’ll probably need a more robust way to pass this data.\n238 Chapter 11\nNext, the doAStarSearch() function needs a sorted list to hold the fron-\ntier and a container to track all created nodes so you can update the score \nand parent of an existing node if it’s opened as a child of a different parent. \nYou can create these as follows: \nstd::vector allNodes;\nstd::priority_queue frontier;\nThe frontier is defined with std::priority_queue since it can automati-\ncally sort the nodes based on their score. The node container, allNodes, is \ndefined as a std::vector.\nNow, let’s create the first node:\nauto node = AStarNode::makePtr(startx, starty, 0, nullptr);\nnode->updateScore(endx, endy);\nallNodes.push_back(node);\nThe first node is a no-cost orphan node at the position (startx, starty). \nThe node is given a score based on what the updateScore() function returns, \nand then it’s added to the allNodes container.\nWith a node in the container, it’s time to write the meat of the A* algo-\nrithm, starting with a simple loop:\nwhile (true) {\n}\nUntil otherwise specified, the rest of the code in this section will appear \ninside of this loop, in the order shown.\nFrom here, the first step is to check the goal state. In this case, the goal is \nto find a path for the player to follow to the next waypoint, which happens \nwhen the node object’s position is (endx, endy). Thus, to check the goal state, \nthe program needs to check whether node has reached those coordinates or \nnot. Here’s how that check should look:\nif (node->x == endx && node->y == endy) {\n makeList(node, allNodes, path);\n return true;\n}\nWhen the goal state is met, the program reports true back to the caller \nand fills path with the final path. For now, assume a function called makeList() \ncan fill in path for you; I’ll show you this function shortly. 
If the goal state isn’t \nmet, you need to expand the children of node, which is actually a pretty com-\nplicated process:\nauto children = node->getChildren(WIDTH, HEIGHT);\nfor (auto c = children.begin(); c != children.end(); c++) {\nu if (map[(*c)->x][(*c)->y] == BLOCKING) continue;\nPutting It All Together: Writing Autonomous Bots 239\n auto found = std::find(allNodes.rbegin(), allNodes.rend(), *c);\nv if (found != allNodes.rend()) {\nw if (*found > *c) {\n (*found)->g = (*c)->g;\n (*found)->parent = (*c)->parent;\n (*found)->updateScore(endx, endy);\n }\n } else {\n (*c)->updateScore(endx, endy);\nx frontier.push(*c);\ny allNodes.push_back(*c);\n }\n}\nAfter calling node->getChildren to generate a list of nodes that can be \nadded to the frontier, the code iterates over each child and ignores any that \nare on blocking tiles u. Next, for each child, the code checks whether a node \nhas already been opened at the same coordinates v. If so, and if the score \nof the existing node is greater than the score of the new child, the exist-\ning node is updated to the parent, cost, and score of the new child by the \nif() statement at w. If the new child doesn’t have a brother-from-another-\nmother, it will be added as is to the frontier x and the node list y.\nAlso notice that std::find uses the reverse begin and reverse end itera-\ntors of allNodes instead of the regular iterators u. The example does this \nbecause new nodes are appended to the end of the vector and duplicate \nnodes will be close together, so duplicates will usually be closer to the end \nof the vector. 
(This step could also be done directly against the frontier, but \nstd::priority_queue doesn’t allow iteration over nodes and writing the sort in \nplace would make the code too large for print.)\nEventually, the function will run out of new children to add to the fron-\ntier; the following if() statement handles that situation:\nif (frontier.size() == 0) return false;\nu node = frontier.top();\nv frontier.pop();\nThis code points node to the cheapest node from the frontier u, removes \nit from the frontier v, and lets the loop repeat. If the frontier ends up empty, \nthe function reports false back to the caller, since there’s nothing left to \nsearch.\nCreating the Path List\nFinally, it’s time to implement the makeList() function:\ntemplate\nvoid makeList(\n AStarNodePtr end,\n std::vector nodes,\n int path[WIDTH][HEIGHT])\n240 Chapter 11\n{\n for (auto n = nodes.begin(); n != nodes.end(); n++)\nu path[(*n)->x][(*n)->y] = 2;\n auto node = end;\n while (node.get() != nullptr) {\nv path[node->x][node->y] = 1;\n node = node->parent;\n }\n}\nThis function updates path with both a list of closed nodes u and the \ncalculated path v. For this example, the value 2 represents the closed nodes \nand 1 represents the path nodes. The program calculates nodes in the path \nby following parent nodes from the goal node until it reaches the starting \nnode, which is an orphan with nullptr as a parent.\nWhen A* Searches Are Particularly Useful\nMake sure to play with the example code and executable for the previous \nsection, because that’s the only way you’ll really get acquainted with the \nbehavior of A* searches. 
In most newer games, you should be able to just \nsend a packet with the destination or even emulate a click on the map at the \ndesired spot, but when you come across a situation where you need to calcu-\nlate a path, you’ll be glad you learned A*.\nThere are actually many situations where calculating a path can be \nuseful:\nSelecting targets\nWhen your bot is selecting targets to attack, you may want to check \nwhether your character can actually reach them. Otherwise, if an \nenemy is isolated in an unreachable room, you might get stuck in \nplace trying to target them forever!\nSelecting corpses\nAs your looting state(s) determine which corpses to open, you can opti-\nmize by always trying to loot the closest corpse first.\nEmulating mouse movements\nVery rarely, some heavily protected games actually correlate in-game \nactions with mouse movements to ensure that there’s no bot running. \nIn this case, you might need to emulate the mouse. Using a modified \nversion of A* where the screen is the map, there are no blocking tiles, \nand node costs are slightly randomized, you can calculate human-like \npaths for your mouse to follow when you simulate movement.\nKiting monsters\nIf you ever need to write code to kite monsters, you can implement A* \nwith a goal state of being N units away from all creatures. Using the \nsame cost mechanism shown in this chapter, play with the heuristic \nPutting It All Together: Writing Autonomous Bots 241\nto give a higher cost to nodes that are closer to creatures. Kiting \nisn’t exactly a conventional use case, and the heuristic will require a \nbunch of tweaking, but it works amazingly once you’ve got it going. \nSome implementations can kite any number of monsters better than \na human!\nPredicting enemy movements\nIf you’re writing a bot that fights other players, you can use A* to pre-\ndict their movements and act accordingly. 
For instance, if your enemy \nstarts running away, your bot can assume they are running to their \nbase, calculate their route, and use a spell to block their path or even \nteleport to a location where it expects them to be.\nThese are just a few use cases for A* searches, and you’ll definitely \nfind many more as you improve your bots. For the rest of the chapter, I’ll \ndescribe some popular automated hacks that you can implement using the \ntechniques described in this book.\nCommon and Cool Automated Hacks\nNow that you’ve seen the design patterns and algorithms needed to create \nefficient, self-teaching bots, it’s time to learn about some popular auto-\nmated hacks that go beyond simple healing and pathfinding. Let’s fly up to \n10,000 feet to explore two types of bots at a high level.\nLooting with Cavebots\nWhile discussing control theory, state machines, and search algorithms, I \ntouched on the idea of a cavebot that kills creatures, grabs loot, and walks \naround caves. The abilities of cavebots can vary greatly.\nOther Uses for A* Search\nA* isn’t just for calculating paths. With abstractions on top of the AStarNode \nclass, you can adapt the same algorithm to any search problem. Realistically, \nA* is just a weighted iteration over a multidimensional data set that iterates \nuntil some goal object is found, and, thus, it can solve any problem that can be \nrepresented as a multidimensional data set. More advanced applications for \nA* include playing chess and checkers, and—when it’s paired with a three-\ndimensional Manhattan distance heuristic and a depth-first search implementa-\ntion—even solving a Rubik’s cube. Sadly, I’m not going to go into these use \ncases; if you want to get really good with search algorithms, I encourage you \nto research more online.\n242 Chapter 11\nDepositing Gold and Restocking Supplies\nIf you want to leave a character botting for days on end, you’ll need a deposi-\ntor and a refiller. 
A depositor can deposit loot in your bank or vault, while a \nrefiller refills your potions, runes, and other supplies. These features can be \ndescribed with six basic states:\nLeave spawn Condition met if the character is in the spawn area or \ncave, if it has nothing to deposit, and if it has enough supplies. Reach \nthis state by exiting the spawn area or cave.\nWalk to town Condition met if the character is in the spawn area or \ncave. Reach this state by walking from the spawn or cave to town.\nDeposit Condition met if the character is in the spawn area or cave, \nor if the character is in town and has nothing to deposit. Reach this \nstate by putting loot in the bank or vault.\nWithdraw cash Condition met if the character is in the spawn area \nor cave, is in town with no supplies to purchase, or has enough gold to \npurchase supplies. Reach this state by withdrawing gold from the bank \nor vault.\nPurchase supplies Condition met if the character is in the spawn area \nor cave or if the character has enough supplies to start hunting. Reach \nby buying supplies.\nEnter spawn Condition met if the character is in the spawn area or \ncave. Reach this state by walking to the spawn area or cave.\nThese states would come before the states related to following waypoints \n(I describe a couple of those states in “A Complex Hypothetical State \nMachine” on page 228) in the vector of StateDefinition objects. Placing \nthem first gives them priority over remaining in the cave, while still allow-\ning the character to target, kill, and loot monsters on the way back to town. \nDepending on where you’re hunting and how you want the bot to behave, \nyou may also tell your targeting states not to attack creatures if the charac-\nter isn’t in the spawn area or cave, and you might add an extra state before \nwalk to town that attacks only creatures that block the character’s path to \ntown. 
Specifying that extra state increases the bot’s efficiency, since trips to \nand from town will be much quicker if the monsters on the way aren’t worth \nkilling.\nUsing the Character as Bait\nTwo other cavebot features that can make your bot awesome are lure mode \nand dynamic lure. You wouldn’t implement these two features as actual states \nin a complex bot; rather, you’d have them inform the bot’s targeting and \nwalking states to help the bot make decisions.\nYou can control lure mode with special waypoints in your path, and its \ncode will tell your targeting states to attack creatures only if the bot is stuck, \nsimilar to the mechanism discussed for walking to or from town. The differ-\nence is that lure mode can be switched on and off at different areas in the \nPutting It All Together: Writing Autonomous Bots 243\ncave, allowing you to lure multiple mobs of monsters to certain locations \nbefore attacking them. This can make your bot much more efficient, as cer-\ntain types of characters may excel at killing many monsters at once.\nDynamic lure is similar, but instead of turning it on and off at definite \nlocations via waypoints, you can automatically turn lure mode on when \nthere aren’t enough monsters. For example, a bot with the dynamic lure \nfeature might tell the targeting states not to attack any creature until five \nmonsters are on screen. The targeting states would resume attacking and \nkiting until all five monsters are dead, and the bot would snap back into \nlure mode until a suitably sized mob appears again.\nIf your character is quick enough to outrun monsters, though, you’ll \nneed to modify your bot’s walking states to walk slowly when lure mode is \non and creatures are present. Otherwise, your character will leave mobs \nbehind without killing them. 
You can slow down a character by adding \na state before the follow path state in your state machine definition that \ndelays movement slightly when lure mode is on and any creatures are too \nfar away.\nAllowing Players to Script Custom Behaviors\nNearly every cavebot includes a scripting interface that allows players to add \ntheir own behaviors. You could implement this interface as a way to specify \ncustom waypoints to follow, spells to use, or items to loot. In more advanced \nbots, you might make your targeting, looting, walking, and luring systems \nas dynamic as possible so players can add unique features. If you implement \nyour automation in Lua, third parties could easily improve and expand your \nbot’s abilities.\nMaking your bot easy to write scripts for takes a lot of work off your \nshoulders, since other programmers who play the game might release \nscripts to add support for new hunting spots and improve your automation. \nSuch scripting services are common in botting communities, and players \noften create and sell professional-grade scripts that integrate with bots.\nAutomating Combat with Warbots\nAnother class of automated bots is used for player versus player (PvP) combat. \nThese warbots, or PvP bots, have many features categorized as responsive \nor ESP hacks, since the bots focus on responding to incoming damage or \nspells, revealing hidden enemies, and giving the player an information \nadvantage.\nFully automated warbots are rare, but I’ve already lightly discussed how \nyou can use some automation techniques to make smarter healers, teach \nbots to land more accurate skillshots, and predict players’ paths to stop \nthem in their tracks. 
Let’s explore a few other cool hacks that fall on the \nfringe of responsive, ESP, and automated.\nNote\nIn games that are completely PvP based, such as battlegrounds or real-time strategy \ngames, some players might also just call these bots, since war or PvP is the bot’s only \npurpose.\n244 Chapter 11\nAutowall Bots\nIf your character has a spell to create a temporary wall, you can code \na bot that automatically blocks enemy players when they enter small \ncorridors. Using error correction, the bot could learn how far ahead of \nthe enemy to place the wall. With some really creative engineering, the \nbot could even learn which enemies can jump over walls by checking \nwhether each enemy manages to get past the wall before it disappears.\nAutosnipe Bots\nFor characters with a long-range skillshot or global execution spell, you \ncan use automation to detect when an enemy across the map has low \nhealth and cast your spell to kill them. You can also use error correc-\ntion to more accurately guess where to shoot a long-range skillshot. If \nyou’re unable to calculate exact damage amounts, error correction can \nalso help a bot determine how much damage a spell does and tweak the \ncasting threshold accordingly.\nAutokite Bots\nIf you’re playing a carry character that does most of its damage by \nattacking at a short distance, you might implement a bot to automati-\ncally kite enemies. Using a set of states similar to the ones a cavebot \nmight use to kite monsters, you can make a bot that automatically kites \nenemy characters when you attack them. When you stop targeting the \nenemy, the bot can stop kiting. Using A* search, you can improve the \nkiting mechanism to avoid multiple enemies, or, if you want to escape \nwhile attacking, guide the kiting mechanism back to a safe place, such \nas your team’s base or a neutral location.\nClosing Thoughts\nBy this point, you should be ready to go out and make some pretty awesome \nbots. 
Don’t worry if you’re still not completely comfortable with the tech-\nniques in this chapter; the best way to learn is to just dive in and start hack-\ning. Use the thousands of lines of example code provided for this book to \nget started without working from scratch, and most of all, have fun!\nIn the next chapter, I’ll discuss ways that bots can hide from anti-cheat \nmechanisms, which are pieces of software that games use to detect and stop \nbotters.\n12\nStaying Hidden\nGame hacking is an ever-evolving prac-\ntice, a game of cat and mouse between \nhackers and game developers where each \nparty works to subvert the other. As long as \npeople make bots, game companies will find ways to \nhinder bot advances and ban players who use bots. \nRather than making their games inherently harder to \nhack, though, game companies focus on detection.\nThe largest game companies have very sophisticated detection suites \ncalled anti-cheat software. In the beginning of this chapter, I’ll discuss the \ncapabilities of the most common anti-cheat suites. After revealing how these \nsuites detect bots, I’ll teach you some powerful ways to evade them.\n246 Chapter 12\nProminent Anti-Cheat Software\nThe best-known anti-cheat suites use the same methods as most antivirus soft-\nware to scan for bots and flag them as threats. Some anti-cheat suites are also \ndynamic, meaning their inner workings and capabilities can change based \non the game they’re protecting. Anti-cheat software developers also track \ndown and patch their suites against bypass software, so always do your own \nin-depth research of any anti-cheat software that you might face.\nWhen these suites detect a botter, they flag the botter’s account for ban-\nishment. Every few weeks, game company administrators ban the flagged \nplayers in a ban wave. Game companies use ban waves instead of instan-\ntaneous bans because banning in waves is more profitable. 
If botters are \nbanned after a few weeks of playing, their familiarity with the game will \nmake them more likely to buy a new account than if they were banned the \nmoment their bot started running.\nThere are dozens of anti-cheat suites, but I’ll focus on the five packages \nthat are the most common and thoroughly understood: PunkBuster, ESEA \nAnti-Cheat, Valve Anti-Cheat (VAC), GameGuard, and Warden.\nThe PunkBuster Toolkit\nPunkBuster, made by Even Balance Inc., is the original anti-cheat tool-\nkit. Many games use PunkBuster, but it’s most common in first-person \nshooter games like Medal of Honor, Far Cry 3, and several installments of \nthe Battlefield series.\nThe toolkit uses a myriad of detection methods, the most formidable of \nwhich are signature-based detection (SBD), screenshots, and hash valida-\ntion. PunkBuster is also known for imposing hardware bans that perma-\nnently ban a cheater’s computer, rather than just their game account, by \nsaving a fingerprint of the hardware’s serial numbers and blocking logins \nfrom a machine that matches it.\nSignature-Based Detection\nPunkBuster scans the memory of all processes on a system running a game \nthat employs it, searching for byte patterns unique to known cheat software, \ncalled signatures. If PunkBuster detects a signature, the player is flagged \nfor a ban. PunkBuster carries out memory scans from user mode using the \nNtQueryVirtualMemory() Windows API function, and it sometimes runs scans \nfrom multiple hidden processes.\nSignature-based detection is blind to context by design, and it ulti-\nmately suffers from a fatal flaw: false positives. On March 23, 2008, a team \nof hackers set out to prove the existence of this flaw by spamming public \nchatrooms with a text string that PunkBuster would identify as a bot signa-\nture. 
Since SBD blindly scans process memory for matching patterns, any \nand all legitimate players inside these public chatrooms were flagged as \nbotters.\nStaying Hidden 247\nThis caused thousands of fair players to be banned with no justification. \nA similar situation happened again in November 2013: PunkBuster falsely \nbanned thousands of players on Battlefield 4. That time, no one was trying to \nprove a point; the company had just added a bad signature to its software.\nPunkBuster resolved both of these issues by restoring the players’ \naccounts, but these incidents show just how aggressive its flavor of SBD is. \nIn the time since these attacks, though, PunkBuster’s SBD has reduced \nthe number of false positives by checking only for signatures at predefined \nbinary offsets.\nScreenshots\nAs another method of bot detection, PunkBuster also periodically takes \nscreenshots of a player’s screen and sends them to the central game server. \nThis form of detection is a nuisance, and it’s weak compared to SBD. Game-\nhacking communities speculate that PunkBuster implemented this feature \nto give game admins proof against botters who dispute bans.\nHash Validation\nIn addition to employing SBD and screenshots, PunkBuster detects bots by \ncreating cryptographic hashes of a game’s executable binaries on a player’s \nsystem and comparing them to hashes stored on a central server. If the \nhashes do not match, the player is flagged for a ban. This check is carried \nout only on the binaries on the filesystem, not on in-memory binaries.\nThe ESEA Anti-Cheat Toolkit\nThe ESEA Anti-Cheat toolkit is used by the E-Sports Entertainment Association \n(ESEA), primarily for its Counter-Strike: Global Offensive league. Unlike \nPunkBuster, this suite is known for generating very few false positives and \nbeing highly effective at catching cheaters.\nESEA Anti-Cheat’s detection capabilities resemble those of PunkBuster, \nwith one noteworthy difference. 
ESEA Anti-Cheat’s SBD algorithm is car-\nried out from a kernel-mode driver using three different Windows Kernel \nfunctions: the MmGetPhysicalMemoryRanges() function, the ZwOpenSection() func-\ntion, and the ZwMapViewOfSection() function. This implementation makes \nthe anti-cheat system nearly immune to memory spoofing (a common way \nto defeat SBD), as the functions used by the scan are much harder to hook \nwhen they’re called from a driver.\nThe VAC Toolkit \nVAC is the toolkit Valve Corporation applies to its own games and many of \nthe third-party games available on its Steam gaming platform. VAC uses \nSBD and hash validation methods that resemble PunkBuster’s detection \ntechniques, and it also uses Domain Name System (DNS) cache scans and \nbinary validation.\n248 Chapter 12\nDNS Cache Scans\nDNS is a protocol that converts between domain names and IP addresses \nsmoothly, and the DNS cache is where that information gets stored on a \ncomputer. When VAC’s SBD algorithm detects cheat software, VAC scans \nthe player’s DNS cache for any domain names associated with cheating \nwebsites. It’s not certain whether a positive DNS cache scan is required for \nVAC’s SBD algorithm to flag a player for banishment, or if the DNS cache \nscan simply acts as another nail in the coffin for players who are already \nflagged by SBD.\nNote\nTo see your DNS cache, enter ipconfig /displaydns at a command prompt. Yes, VAC \nlooks at all of that.\nBinary Validation\nVAC also uses binary validation to prevent in-memory tampering of execut-\nable binaries. It scans for modifications like IAT, jump, and code hooking \nby comparing hashes of in-memory binary code to hashes of the same code \nin the binaries on the filesystem. If it finds a mismatch, VAC flags the player \nfor a ban.\nThis detection method is formidable, but Valve’s initial implementation \nof the algorithm was flawed. In July 2010, VAC’s binary validation falsely \nbanned 12,000 Call of Duty players. 
The binary validation module failed \nto account for a Steam update, and it banned the players when their in-\nmemory code did not match the updated binaries on the filesystem.\nFalse Positives\nVAC has had other issues with false positives. Its initial release routinely \nbanned fair players for “faulty memory.” This same early version banned \nplayers for using Cedega, a platform that ran Windows games on Linux. And \non April 1, 2004, Valve falsely banned a couple thousand players due to a \nserver-side glitch. On two separate occasions, one in June 2011 and one in \nFebruary 2014, VAC also falsely banned thousands of Team Fortress 2 and \nCounter-Strike players due to bugs that the company refuses to disclose. As \nwith PunkBuster, these incidents show that VAC is very aggressive.\nThe GameGuard Toolkit\nGameGuard is an anti-cheat toolkit made by INCA Internet Co. Ltd. and \nused by many MMORPGs, including Lineage II, Cabal Online, and Ragnarok \nOnline. In addition to some mildly aggressive SBD, GameGuard uses root-\nkits to proactively prevent cheat software from running.\nUser-Mode Rootkit\nGameGuard utilizes a user-mode rootkit to deny bots access to the Windows \nAPI functions they use to operate. The rootkit hooks the functions at their \nStaying Hidden 249\nlowest-level entry point, often inside undocumented functions in ntdll.dll, \nuser32.dll, and kernel32.dll. 
These are the most notable API functions \nGameGuard hooks, and here’s what GameGuard does from inside each \nhooked function:\nNtOpenProcess() Blocks any OpenProcess() attempts on the game being \nprotected.\nNtProtectVirtualMemory() Blocks any VirtualProtect() or VirtualProtectEx() \nattempts on the game.\nNtReadVirtualMemory() and NtWriteVirtualMemory() Block any \nReadProcessMemory() and WriteProcessMemory() attempts on the game.\nNtSuspendProcess() and NtSuspendThread() Block any attempts to suspend \nGameGuard.\nNtTerminateProcess() and NtTerminateThread() Block any attempts to ter-\nminate GameGuard.\nPostMessage(), SendMessage(), and SendInput() Block any attempts to send \nprogrammatic input to the game.\nSetWindowsHookEx() Prevents bots from globally intercepting mouse and \nkeyboard input.\nCreateProcessInternal() Automatically detects and hooks into new \nprocesses.\nGetProcAddress(), LoadLibraryEx(), and MapViewOfFileEx() Prevent any \nattempt to inject libraries into the game or GameGuard.\nKernel-Mode Rootkit\nGameGuard also uses a driver-based rootkit to prevent bots that work in \nthe kernel. This rootkit has the same abilities as its user-mode counterpart, \nand it works by hooking ZwProtectVirtualMemory(), ZwReadVirtualMemory(), \nZwWriteVirtualMemory(), SendInput(), and similar functions.\nThe Warden Toolkit\nWarden, made exclusively for Blizzard’s games, is by far the most advanced \nanti-bot toolkit I’ve encountered. It’s hard to say what exactly Warden does, \nbecause it downloads dynamic code at runtime. This code, delivered as \ncompiled shellcode, typically has two responsibilities:\n• \nDetect bots.\n• \nPeriodically send a heartbeat signal to the game server. The value sent \nis not predefined but instead is generated by some subset of the detec-\ntion code.\nIf Warden fails to complete the second task or sends the wrong value, the \ngame server will know that it’s been disabled or tampered with. 
Furthermore, \na bot can’t disable the detection code and leave the heartbeat code running.\n250 Chapter 12\nWarden is formidable because you not only have no way to know what \nyou’re hiding from but also have no way to disable the toolkit. Even if you \nmanage to avoid detection today, a new detection method might be used \ntomorrow.\nIf you plan on publicly distributing bots, you will eventually meet one \nof the anti-cheat solutions described in the previous sections—and you’ll \nhave to beat it. Depending on your bot’s footprint, the type of detection in \nthe game you’re botting, and your implementation, the difficulty of evading \none of these toolkits can range from trivial to extremely hard.\nCarefully Managing a Bot’s Footprint\nA bot’s footprint is how many unique, detectable characteristics it has. For \nexample, a bot that hooks 100 functions will typically be easier to detect \nthan a bot that hooks only 10 functions because the former makes an order \nof magnitude more changes to a game’s code than the latter. Since a tar-\ngeted detection system needs to detect only one hook, the developer of the \nformer bot needs to spend much more time making sure all of the bot’s \nhooks are as stealthy as possible.\nAnother footprint characteristic is how detailed a bot’s user interface \nis. If a known bot has many dialog boxes that all have specific titles, a game \ncompany can just have its anti-cheat software detect the bot by searching for \nwindows that have those titles. This same basic reasoning can be used with \nprocess names and filenames.\nMinimizing a Bot’s Footprint\nDepending on how your bot works, there are many ways to minimize its \nfootprint. If your bot relies heavily on hooks, for instance, you can avoid \ndirectly hooking a game’s code and instead focus on hooking Windows API \nfunctions. 
Windows API hooking is surprisingly common, so developers \ncan’t assume a program that hooks the Windows API is a bot.\nThe Halting Problem\nA bot that could disable Warden’s detection code and still send the heart-\nbeat signal would solve the halting problem, which Alan Turing proved to be \nimpossible in 1936. The halting problem is the problem of determining, with a \ngeneric algorithm, whether a program will finish running or continue forever. \nBecause Warden does two tasks using the same shellcode, writing a generic \nalgorithm that can disable just one task is a variation of the halting problem: \nthe algorithm can’t be sure which parts of the code will definitely execute, \nwhich parts won’t, and which parts are responsible for each task.\nStaying Hidden 251\nIf your bot has a well-defined user interface, you can mask the interface \nby removing all strings from window bars, buttons, and so on. Instead, dis-\nplay images that show text. If you’re worried about specific process names \nor filenames being detected by the anti-cheat software, use generic file-\nnames and make your bot copy itself to a new, randomized directory every \ntime it launches.\nMasking Your Footprint\nMinimizing your footprint is a preferred way to avoid detection, but it’s not \nnecessary. You can also obfuscate your bot, making it harder for anyone to \nfigure out how it works. Obfuscation can prevent both anti-bot developers \nfrom trying to detect your bot and other bot developers from analyzing \nyour bot to steal proprietary functionality. If you sell your bot, obfuscation \nprevents people from cracking it to bypass your purchase verification, too.\nOne common type of obfuscation is called packing. Packing an execut-\nable encrypts it and hides it inside another executable. When the container \nexecutable is launched, the packed executable is decrypted and executed \nin-memory. 
When a bot is packed, analyzing the binary to learn what the \nbot does is impossible, and debugging the bot process is much harder. \nSome common packer programs are UPX, Armadillo, Themida, and ASPack.\nTeaching a Bot to Detect Debuggers\nWhen anti-bot developers (or other bot creators) can debug a bot, they can \nfigure out how it works and thus how to stop it. If someone is actively try-\ning to pick apart a bot, packing the executable may not be enough to evade \nthem. To protect against this, bots often employ anti-debugging techniques, \nwhich obfuscate control flow by changing the bot’s behavior when a debug-\nger is detected. In this section, I’ll quickly cover some well-known methods \nfor detecting when a debugger is attached to your bot, and in the next, I’ll \nshow you some tricks for obfuscation.\nCalling CheckRemoteDebuggerPresent()\nCheckRemoteDebuggerPresent() is a Windows API function that can tell you if a \ndebugger is attached to the current process. Code to check for a debugger \nmight look like this:\nbool IsRemoteDebuggerPresent() {\n BOOL dbg = false;\n CheckRemoteDebuggerPresent(GetCurrentProcess(), &dbg);\n return dbg;\n}\nThis check is pretty straightforward—it calls CheckRemoteDebuggerPresent() \nwith the current process and a pointer to the dbg Boolean. Calling this \nfunction is the easiest way to detect a debugger, but it’s also very easy for a \ndebugger to evade.\n252 Chapter 12\nChecking for Interrupt Handlers\nInterrupts are signals the processor sends to trigger a corresponding han-\ndler in the Windows kernel. Interrupts are typically generated by hardware \nevents, but they can also be generated in software using the INT assem-\nbly instruction. The kernel allows some interrupts—namely, interrupts \n0x2D and 0x03—to trigger user-mode interrupt handlers in the form of \nexception handlers. 
You can take advantage of these interrupts to detect \ndebuggers.\nWhen a debugger sets a breakpoint on an instruction, it replaces that \ninstruction with a breakpoint instruction, such as INT 0x03. When the \ninterrupt is executed, the debugger is notified via an exception handler, \nwhere it handles the breakpoint, replaces the original code, and allows the \napplication to resume execution seamlessly. When faced with an unrecog-\nnized interrupt, some debuggers even silently step over that interrupt and \nallow execution to continue normally, without triggering any other excep-\ntion handlers.\nYou can detect this behavior by purposely generating interrupts within \nexception handlers in your code, as shown in Listing 12-1.\ninline bool Has2DBreakpointHandler() {\n __try { __asm INT 0x2D }\n __except (EXCEPTION_EXECUTE_HANDLER){ return false; }\n return true;\n}\ninline bool Has03BreakpointHandler() {\n __try { __asm INT 0x03 }\n __except (EXCEPTION_EXECUTE_HANDLER){ return false; }\n return true;\n}\nListing 12-1: Detecting interrupt handlers\nDuring normal execution, these interrupts trigger the exception han-\ndlers surrounding them in the code. During a debugging session, some \ndebuggers might intercept the exceptions generated by these interrupts and \nsilently ignore them, preventing the surrounding exception handlers from \nexecuting. Thus, if the interrupts don’t trigger your exception handler, then \na debugger is present.\nChecking for Hardware Breakpoints\nDebuggers can also set breakpoints using the processor’s debug registers; \nthese are called hardware breakpoints. A debugger can set a hardware break-\npoint on an instruction by writing the address of the instruction to one of \nthe four debug registers.\nStaying Hidden 253\nWhen an address present on a debug register is executed, the debug-\nger is notified. 
To detect hardware breakpoints (and thus, the presence of \na debugger), you can check for nonzero values on any of the four debug \nregisters like this:\nbool HasHardwareBreakpoints() {\n CONTEXT ctx = {0};\n ctx.ContextFlags = CONTEXT_DEBUG_REGISTERS;\n auto hThread = GetCurrentThread();\n if(GetThreadContext(hThread, &ctx) == 0)\n return false;\n return (ctx.Dr0 != 0 || ctx.Dr1 != 0 || ctx.Dr2 != 0 || ctx.Dr3 != 0);\n}\nPrinting Debug Strings\nOutputDebugString() is a Windows API function that can be used to print log \nmessages to a debugger console. If no debugger is present, the function will \nreturn with an error code. If a debugger is present, however, the function \nwill return with no error code. Here’s how you can use this function as a \ntrivial debugger check:\ninline bool CanCallOutputDebugString() {\n SetLastError(0);\n OutputDebugStringA(\"test\");\n return (GetLastError() == 0);\n}\nLike the CheckRemoteDebuggerPresent() method, this method is very \nstraightforward but also very easy for a debugger to evade.\nChecking for DBG_RIPEXCEPTION Handlers\nDebuggers typically have exception handlers that blindly catch exceptions \nwith Windows’ DBG_RIPEXCEPTION exception code, making that code a clear \nway to spot a debugger. You can detect these exception handlers in much \nthe same way Listing 12-1 detects interrupt handlers:\n#define DBG_RIPEXCEPTION 0x40010007\ninline bool hasRIPExceptionHandler() {\n __try { RaiseException(DBG_RIPEXCEPTION, 0, 0, 0); }\n __except(EXCEPTION_EXECUTE_HANDLER){ return false; }\n return true;\n}\n254 Chapter 12\nTiming Control-Critical Routines\nIf an anti-bot developer is debugging your bot, the developer will likely \nplace breakpoints on and single-step through parts of your code that are \ncritical to the bot’s behavior. 
You can detect this activity by measuring code \nexecution times; when someone steps through code, execution takes a lot \nlonger than usual.\nFor example, if a function only places some hooks, you can be sure that \nthe code shouldn’t take more than a tenth of a second to do the memory \nprotection. You could check the execution time for memory protection with \nhelp from the GetTickCount() Windows API function, as follows:\n--snip--\nauto startTime = GetTickCount();\nprotectMemory<>(...);\nif (GetTickCount() - startTime >= 100)\n debuggerDetectedGoConfuseIt();\n--snip--\nChecking for Debug Drivers\nSome debuggers load kernel-mode drivers to assist their operation. You can \ndetect these debuggers by attempting to get a handle to their kernel-mode \ndrivers, like this:\nbool DebuggerDriversPresent() {\n // an array of common debugger driver device names\n const char drivers[9][20] = {\n \"\\\\\\\\.\\\\EXTREM\", \"\\\\\\\\.\\\\ICEEXT\",\n \"\\\\\\\\.\\\\NDBGMSG.VXD\", \"\\\\\\\\.\\\\RING0\",\n \"\\\\\\\\.\\\\SIWVID\", \"\\\\\\\\.\\\\SYSER\",\n \"\\\\\\\\.\\\\TRW\", \"\\\\\\\\.\\\\SYSERBOOT\",\n \"\\0\"\n };\n for (int i = 0; drivers[i][0] != '\\0'; i++) {\n auto h = CreateFileA(drivers[i], 0, 0, 0, OPEN_EXISTING, 0, 0);\n if (h != INVALID_HANDLE_VALUE) {\n CloseHandle(h);\n return true;\n }\n }\n return false;\n}\nThere are a few common kernel-mode driver device names to check for, \nlike \\\\\\\\.\\\\EXTREM and the others shown in the drivers array. If this handle-\nfetching code succeeds, then there’s a debugger running on the system. \nUnlike with the previous methods, though, obtaining a handle to one of \nthose drivers doesn’t always mean the debugger is attached to your bot.\nStaying Hidden 255\nAnti-Debugging Techniques\nOnce you detect a debugger, there are multiple ways to obfuscate your con-\ntrol flow. For instance, you might try to crash the debugger. 
The following \ncode crashes OllyDbg v1.10:\nOutputDebugString(\"%s%s%s%s\");\nThe string \"%s%s%s%s\" contains format specifiers, and OllyDbg passes \nit to printf() without any extra parameters, which is why the debugger \ncrashes. You could place this code in a function that gets called in response \nto detecting a debugger, but this option works only against OllyDbg.\nCausing an Unavoidable Infinite Loop\nAnother obfuscation method to try is overloading the system until the per-\nson debugging your bot is forced to close the bot and debugger. This func-\ntion does the trick:\nvoid SelfDestruct() {\n std::vector<char*> explosion;\n while (true)\n explosion.push_back(new char[10000]);\n}\nThe infinite while loop just keeps adding elements to explosion until the \nprocess runs out of memory or someone pulls the plug.\nOverflowing the Stack\nIf you want to really confuse the analyst, you can make a chain of functions \nthat eventually cause a stack overflow, but in an indirect way:\n#include <cstdlib>\ntypedef void (* _recurse)();\nvoid recurse1(); void recurse2();\nvoid recurse3(); void recurse4();\nvoid recurse5();\n_recurse recfuncs[5] = {\n &recurse1, &recurse2, &recurse3,\n &recurse4, &recurse5\n};\nvoid recurse1() { recfuncs[rand() % 5](); }\nvoid recurse2() { recfuncs[(rand() % 3) + 2](); }\nvoid recurse3() {\n if (rand() % 100 < 50) recurse1();\n else recfuncs[(rand() % 3) + 1]();\n}\nvoid recurse4() { recfuncs[rand() % 2](); }\nvoid recurse5() {\n for (int i = 0; i < 100; i++)\n if (rand() % 50 == 1)\n256 Chapter 12\n recfuncs[i % 5]();\n recurse5();\n}\n// call any of the above functions to trigger a stack overflow\nIn a nutshell, these functions randomly and infinitely recurse until \nthere’s no room left on the call stack. 
Causing the overflow indirectly makes \nit hard for the analyst to pause and examine previous calls before they real-\nize what’s happened.\nCausing a BSOD\nIf you’re serious about obfuscation, you can even trigger a Blue Screen of \nDeath (BSOD) when you detect a debugger. One way to do that is to set \nyour bot’s process as critical using the SetProcessIsCritical() Windows API \nfunction and then call exit(), since Windows will trigger a BSOD when a \ncritical process is killed. Here’s how you might do that:\nvoid BSODBaby() {\n typedef long (WINAPI *RtlSetProcessIsCritical)\n (BOOLEAN New, BOOLEAN *Old, BOOLEAN NeedScb);\n auto ntdll = LoadLibraryA(\"ntdll.dll\");\n if (ntdll) {\n auto SetProcessIsCritical = (RtlSetProcessIsCritical)\n GetProcAddress(ntdll, \"RtlSetProcessIsCritical\");\n if (SetProcessIsCritical)\n SetProcessIsCritical(1, 0, 0);\n }\n}\nBSODBaby();\nexit(1);\nOr maybe you’re evil, in which case you can do this:\nBSODBaby();\nOutputDebugString(\"%s%s%s%s\");\nrecurse1();\nexit(1);\nAssuming you’ve implemented all of the techniques described in this \nsection, this code would cause a BSOD, crash the debugger (if it’s OllyDbg \nv1.10), overflow the stack, and exit the running program. If any one of the \nmethods fails or gets patched, the analyst still has to deal with the remain-\ning ones before they can continue debugging.\nDefeating Signature-Based Detection\nEven with amazing obfuscation, you won’t easily beat signature detection. \nEngineers who analyze bots and write signatures are very skilled, and obfus-\ncation is, at best, a nuisance that makes their job marginally harder.\nStaying Hidden 257\nTo completely evade SBD, you need to subvert the detection code. This \nrequires knowing exactly how the SBD works. PunkBuster, for instance, \nuses NtQueryVirtualMemory() to scan the memory of all running processes \nfor any signatures. 
If you want to bypass this, you can inject code into all \nPunkBuster processes with a hook on the NtQueryVirtualMemory() function.\nWhen the function tries to query memory from your bot process, you \ncan give it whatever data you want, like this:\nNTSTATUS onNtQueryVirtualMemory(\n HANDLE process, PVOID baseAddress,\n MEMORY_INFORMATION_CLASS memoryInformationClass,\n PVOID buffer, ULONG numberOfBytes, PULONG numberOfBytesRead) {\n // if the scan is on this process, make sure it can't see the hook DLL\n if ((process == INVALID_HANDLE_VALUE ||\n process == GetCurrentProcess()) &&\n baseAddress >= MY_HOOK_DLL_BASE &&\n baseAddress <= MY_HOOK_DLL_BASE_PLUS_SIZE)\n➊ return STATUS_ACCESS_DENIED;\n // if the scan is on the bot, zero the returned memory\n auto ret = origNtQueryVirtualMemory(\n process, baseAddress,\n memoryInformationClass,\n buffer, numberOfBytes, numberOfBytesRead);\n if(GetProcessId(process) == MY_BOT_PROCESS)\n➋ ZeroMemory(buffer, numberOfBytesRead);\n return ret;\n}\nThis onNtQueryVirtualMemory() hook returns STATUS_ACCESS_DENIED ➊ \nwhen NtQueryVirtualMemory() tries to query the hook DLL’s memory, but it \ngives zeroed memory ➋ when NtQueryVirtualMemory() tries to query the bot’s \nmemory. The difference isn’t for any specific reason; I’m just showing two \nways you can hide from the NtQueryVirtualMemory() function call. If you’re \nreally paranoid, you can even replace the entire buffer with a random byte \nsequence.\nOf course, this method works only for SBD that happens from user \nmode, like the SBD in PunkBuster or VAC. SBD that happens from the \ndriver, like ESEA’s, or that isn’t predictable, like Warden’s, isn’t as easy to \nbypass.\nIn those cases, you can take precautions to eliminate unique signa-\ntures in your bot. If you’re distributing the bot to more than a dozen or so \npeople, however, removing all distinguishing properties is tricky. 
To throw \nanalysts off the scent, each time you give somebody a copy of the bot, you \ncould try some combination of the following:\n• \nCompiling the bot using a different compiler\n• \nChanging the compiler optimization settings\n• \nToggling between using __fastcall and __cdecl\n258 Chapter 12\n• \nPacking the binaries using a different packer\n• \nSwitching between static and dynamic linking of runtime libraries\nVarying these elements creates a different assembly for each user, but \nthere’s a limit on how many unique versions of the bot you can produce that \nway. Past some point, this method doesn’t scale to demand, and eventually, \ngame companies will have signatures for every incarnation of your bot.\nApart from obfuscation and code mutation, there aren’t many ways \nto defeat advanced SBD mechanisms. You could implement your bot in \na driver or create a kernel-mode rootkit to hide your bot, but even those \nmethods aren’t foolproof.\nNote\nThis book doesn’t cover implementing a bot in a driver or creating a rootkit to hide \na bot, as both topics are pretty complex. Rootkit development alone is a subject that \ndozens of books have covered already. I’d recommend Bill Blunden’s The Rootkit \nArsenal: Escape and Evasion in The Dark Corners of The System (Jones & \nBartlett Learning, 2009).\nSome game hackers try to cover every single base, hooking every \nmemory-reading function and the entire filesystem API, but still get \ncaught by determined systems like Warden. In fact, I recommend stay-\ning away from Warden and Blizzard at all costs.\nDefeating Screenshots\nIf you encounter a detection mechanism that uses screenshots as additional \nproof to nail botters, you’re in luck. Bypassing screenshot mechanisms is \neasy: don’t let your bot be seen.\nYou can subvert this type of detection by keeping a minimal UI and \nmaking no visibly distinguishable changes to the game client. 
If your bot \nrequires a HUD or other distinctive UI displays, though, don’t fret—you \ncan have your cake and eat it, too. As long as you can intercept the screen-\nshot code, you can hide your fingerprints while a screenshot is taken.\nIn some versions of PunkBuster, for example, the Windows API func-\ntion GetSystemTimeAsFileTime() is called just before a screenshot is taken. You \ncan use a hook on this function to quickly hide your UI for a few seconds to \nensure it’s not seen:\nvoid onGetSystemTimeAsFileTime(LPFILETIME systemTimeAsFileTime) {\n myBot->hideUI(2000); // hide UI for 2 seconds\n origGetSystemTimeAsFileTime(systemTimeAsFileTime);\n}\nJust hook GetSystemTimeAsFileTime() using the techniques described in \n“Hooking to Redirect Game Execution” on page 153, write a hideUI() func-\ntion, and call the hideUI() function before execution resumes.\nStaying Hidden 259\nDefeating Binary Validation\nDefeating binary validation is as simple as not placing hooks inside game-\nspecific binaries. Jump hooks and IAT hooks on Windows API functions are \nextremely common, so wherever you can, try to get away with using those \nmethods instead of using jump or near-call hooks in a game binary. In cases \nwhere you must directly hook a game’s code, you can trick the anti-cheat \nsoftware’s binary validation routines by intercepting the binary scan and \nspoofing the data to match what the software expects to see.\nLike SBD, binary validation often uses NtQueryVirtualMemory() to \nscan memory. To trick the validation code, start with a hook on that \nfunction. 
Then, write a function like this one to spoof the data when \nNtQueryVirtualMemory() is called:\nNTSTATUS onNtQueryVirtualMemory(\n HANDLE process, PVOID baseAddress,\n MEMORY_INFORMATION_CLASS memoryInformationClass,\n PVOID buffer, ULONG numberOfBytes, PULONG numberOfBytesRead) {\n auto ret = origNtQueryVirtualMemory(\n process, baseAddress,\n memoryInformationClass,\n buffer, numberOfBytes, numberOfBytesRead);\n // place tricky code somewhere in here\n return ret;\n}\nInside this hook, you’ll need to watch for any memory scans over mem-\nory that has been modified by one of your hooks.\nNote \t\nThis example assumes the bot has only one hook and that variables prefixed with \nHOOK_ already exist and describe the code the hook replaces.\nListing 12-2 shows some scan-monitoring code.\n// is the scan on the current process?\nbool currentProcess =\n process == INVALID_HANDLE_VALUE ||\n process == GetCurrentProcess();\n// is the hook in the memory range being scanned?\nauto endAddress = baseAddress + numberOfBytesRead - 1;\nbool containsHook =\n (HOOK_START_ADDRESS >= baseAddress &&\n HOOK_START_ADDRESS <= endAddress) ||\n (HOOK_END_ADDRESS >= baseAddress &&\n HOOK_END_ADDRESS <= endAddress);\nu if (currentProcess && containsHook) {\n // hide the hook\n}\nListing 12-2: Checking whether hooked memory is being scanned\n260 Chapter 12\nWhen a memory scan over the hooked code happens (which makes \ncurrentProcess and containsHook become true at the same time), code inside \nthe if() statement u updates the output buffer to reflect the original code. 
\nThis means you must know where the hooked code is within the scanned \nblock, taking into account the fact that the block may span only a subset of \nthe hooked code.\nSo if baseAddress marks the address where the scan starts, HOOK_START_\nADDRESS marks the spot where the modified code starts, endAddress marks the \naddress where the scan ends, and HOOK_END_ADDRESS marks the address where \nthe modified code ends, you can use some simple math to calculate which \nparts of the modified code are present in which parts of the buffer. You do \nso as follows, using writeStart to store the offset of the modified code in the \nscan buffer and readStart to store the offset of the scan buffer relative to \nthe modified code, in case the scan buffer starts in the middle of the modi-\nfied code:\nint readStart, writeStart;\nif (HOOK_START_ADDRESS >= baseAddress) {\n readStart = 0;\n writeStart = HOOK_START_ADDRESS - baseAddress;\n} else {\n readStart = baseAddress - HOOK_START_ADDRESS;\n writeStart = baseAddress;\n}\n \nint readEnd;\nif (HOOK_END_ADDRESS <= endAddress)\n readEnd = HOOK_LENGTH - readStart - 1;\nelse\n readEnd = endAddress - HOOK_START_ADDRESS;\nOnce you know how many bytes you need to replace, where to put them, \nand where to get them, you can do the spoof with three lines of code:\nchar* replaceBuffer = (char*)buffer;\nfor ( ; readStart <= readEnd; readStart++, writeStart++)\n replaceBuffer[writeStart] = HOOK_ORIG_DATA[readStart];\nCompletely assembled, the code looks like this:\nNTSTATUS onNtQueryVirtualMemory(\n HANDLE process, PVOID baseAddress,\n MEMORY_INFORMATION_CLASS memoryInformationClass,\n PVOID buffer, ULONG numberOfBytes, PULONG numberOfBytesRead) {\n auto ret = origNtQueryVirtualMemory(\n process, baseAddress,\n memoryInformationClass,\n buffer, numberOfBytes, numberOfBytesRead);\n bool currentProcess =\n process == INVALID_HANDLE_VALUE ||\n process == GetCurrentProcess();\nStaying Hidden 261\n auto endAddress = baseAddress + 
numberOfBytesRead - 1;\n bool containsHook =\n (HOOK_START_ADDRESS >= baseAddress &&\n HOOK_START_ADDRESS <= endAddress) ||\n (HOOK_END_ADDRESS >= baseAddress &&\n HOOK_END_ADDRESS <= endAddress);\n if (currentProcess && containsHook) {\n int readStart, writeStart;\n if (HOOK_START_ADDRESS >= baseAddress) {\n readStart = 0;\n writeStart = HOOK_START_ADDRESS - baseAddress;\n } else {\n readStart = baseAddress - HOOK_START_ADDRESS;\n writeStart = baseAddress;\n }\n \n int readEnd;\n if (HOOK_END_ADDRESS <= endAddress)\n readEnd = HOOK_LENGTH - readStart - 1;\n else\n readEnd = endAddress - HOOK_START_ADDRESS;\n char* replaceBuffer = (char*)buffer;\n for ( ; readStart <= readEnd; readStart++, writeStart++)\n replaceBuffer[writeStart] = HOOK_ORIG_DATA[readStart];\n }\n return ret;\n}\nOf course, if you had multiple hooks that you needed to hide from \nbinary validation scans, you would need to implement this functionality \nin a more robust way that would allow it to track multiple modified code \nregions accordingly.\nDefeating an Anti-Cheat Rootkit\nGameGuard and some other anti-cheat suites come with user-mode rootkits \nthat not only detect bots but also proactively prevent them from running. \nTo defeat this type of protection, rather than think outside the box, you can \ncompletely copy the box and work inside that copy.\nFor example, if you want to write memory to a game, you must call \nthe WriteProcessMemory() function, which is exported by kernel32.dll. When \nyou call this function, it directly calls NtWriteVirtualMemory() from ntdll.dll. \nGameGuard hooks ntdll.NtWriteVirtualMemory() to prevent you from writing \nmemory. 
But if NtWriteVirtualMemory() is exported from, say, ntdll_copy.dll, \nGameGuard won’t hook that function.\nThat means you can copy ntdll.dll and dynamically import all of the \nfunctions you need, as follows:\n// copy and load ntdll\ncopyFile(\"ntdll.dll\", \"ntdll_copy.dll\");\n262 Chapter 12\nauto module = LoadLibrary(\"ntdll_copy.dll\");\n// dynamically import NtWriteVirtualMemory\ntypedef NTSTATUS (WINAPI* _NtWriteVirtualMemory)\n (HANDLE, PVOID, PVOID, ULONG, PULONG);\nauto myWriteVirtualMemory = (_NtWriteVirtualMemory)\n GetProcAddress(module, \"NtWriteVirtualMemory\");\n// call NtWriteVirtualMemory\nmyWriteVirtualMemory(process, address, data, length, &writtenlength);\nAfter copying ntdll.dll, this code imports the NtWriteVirtualMemory() from \nthe copy with the name myWriteVirtualMemory(). From there, the bot can use \nthis function in place of the NtWriteVirtualMemory() function. They’re effec-\ntively the same code in the same library, just loaded under different names. \nCopying a function that anti-cheat software hooks works only if you \ncall that function at its lowest-level entry point, though. If this code copied \nkernel32.dll and dynamically imported the WriteProcessMemory() function, an \nanti-cheat rootkit would still stop the bot, because kernel32_copy.dll would \nstill rely on ntdll.NtWriteVirtualMemory() when calling the WriteProcessMemory() \nfunction.\nDefeating Heuristics\nIn addition to all of the advanced client-side detection mechanisms we’ve \njust discussed, game companies will employ server-side heuristics that can \ndetect bots simply by monitoring a player’s behavior. These systems learn \nto distinguish between human and autonomous player behavior through \nmachine-learning algorithms. 
Their decision-making process is often inter-\nnal and incomprehensible to humans, so it’s difficult to pinpoint exactly \nwhat features of gameplay lead to detection.\nYou don’t need to know how such algorithms work to trick them; your \nbot just needs to act human. Here are some common behaviors that are dis-\ntinguishably different between humans and bots:\nIntervals between actions\nMany bots perform actions unreasonably fast or at consistent intervals. \nBots will seem more human-like if they have a reasonable cooldown \nperiod between actions. They should also have some form of random-\nization to prevent them from repeating an action at a constant rate.\nPath repetition\nBots that farm enemies automatically visit a preprogrammed list of \nlocations to kill creatures. These waypoint lists are often extremely accu-\nrate, indicating each location as an exact pixel. Humans, conversely, \nmove in less predictable ways and visit more unique locations along the \nway to a familiar area. To replicate this behavior, a bot might walk to a \nStaying Hidden 263\nrandom location within a certain range of a target location, rather than \nto the target location itself. Also, if the bot randomizes the order in \nwhich it visits target locations, the variety of paths it takes will increase \nfurther.\nUnrealistic play\nSome botters run their bots in the same location for hundreds of con-\nsecutive hours, but humans can’t play a game that long. Encourage your \nusers to refrain from botting for more than eight hours at a time and \nwarn them that doing the same thing for seven straight days will defi-\nnitely trigger alarms in a heuristic system.\nPerfect accuracy\nBots can hit a thousand head shots in a row without firing a single extra \nbullet, and they can hit every skill shot with consistent precision. 
But it’s \nvirtually impossible for a human to do the same, so a smart bot should \nbe intentionally inaccurate at times.\nThese are just a few examples, but in general, you can sneak past \nheuristic checks if you just use common sense. Don’t try to have a bot do \nsomething a human can’t, and don’t have the bot do any single thing for \ntoo long.\nClosing Thoughts\nGame hackers and game developers are engaged in a constant battle of \nwits. Hackers will keep finding ways to subvert detection, and developers \nwill keep finding better ways to detect them. If you’re determined, however, \nthe knowledge in this chapter should help you defeat any anti-cheat soft-\nware you encounter.\nA\nAbout text field, Trainer generator \ndialog, 9\naccessing memory\nin injected DLL, 145–146\nfor writing and reading, 122–124\nAction Message Format (AMF), 169\nactor functions, 216\nactuation, 216, 223\nAddress column\nEvent Properties dialog, 55\nOllyDbg disassembler pane, 27\naddresses, memory. See memory \naddresses\nAddress Space Layout Randomization \n(ASLR), 128\nbypassing in injected DLL, \n146–147\nbypassing in production, \n128–130\ndisabling for bot \ndevelopment, 128\nin Process Explorer, 56, 57\nAdobe AIR hooking, 169\ndecode() function, 172–173, \n174–175\nencode() function, 171–172, \n174–175\nplacing hooks, 173–175\nRTMP, assessing, 169–170\nAdobe AIR.dll, 173–175\nairlog tool, 170\nalignment\nin numeric data, 68\nof variables, in data structures, \n70–71\nambient light, adding, 190–192\nAMF (Action Message Format), 169\nanti-cheat software, 245–246\nanti-cheat rootkit, defeating, \n261–262\nbinary validation, defeating, \n259–261\nbot footprints, managing, \n250–256\nESEA Anti-Cheat toolkit, 247\nGameGuard toolkit, 248–249\nheuristics, defeating, 262–263\nPunkBuster toolkit, 246–247\nscreenshots, defeating, 258\nsignature-based detection, \nevading, 256–257\nVAC toolkit, 247–248\nWarden toolkit, 249–250\nanti-crowd-control hacks, 218\nanti-debugging techniques, 251, 
\n255–256\narithmetic instructions, 90–92\nA* search algorithm, 234\ncost, 233\ncreating node, 234–237\ncreating path list, 239–240\nscore, 234\nuses for, 240–241\nwriting search function, 237–239\nASLR. See Address Space Layout \nRandomization (ASLR)\nAsm2Clipboard plug-in, 42\nassembly code\ncopying, 42\ntracing, 32–33\nviewing and navigating in \nOllyDbg, 27–29\nassembly language, 78. See also \nx86 assembly language\nassembly patterns, searching for, \n19–21\nInde x\n266 Index\nAStarNode class, 234–236\nAT&T syntax, 80\nautocombo, 219\nautododge, 219\nautokite bots, 244\nautomatic healer, 218, 225–228, \n230–232\nautonomous bots, 221–222. See also \ncontrol theory; state \nmachines\ncavebots, 241–243\ncomplex hypothetical state \nmachine, 228–230\nerror correction, 230–232\nhealer state machine, 225–228\npathfinding with search \nalgorithms, 232–234\nwarbots, 243–244\nautoreload, 219\nautosnipe bots, 244\nautowall bots, 244\nB\nban waves, 246\nBigger Than scan type, Cheat \nEngine, 6\nbinary arithmetic instructions, 90\nbinary validation, 248, 259–261\nbits, EFLAGS register, 84\nBlue Screen of Death (BSOD), 256\nbots. See also autonomous bots; \nextrasensory perception \n(ESP) hacks\nanti-crowd-control hacks, 218\nanti-debugging techniques, 251, \n255–256\nautomatic healer, 218, 225–228, \n230–232\ndetecting debuggers, 251–254\ndetecting visual cues, 205–206\ndisabling ASLR for \ndevelopment, 128\nemulating keyboard, 211–215\nfootprints, managing, 250–256\ngame updates, dealing with, \n101–104\nintercepting network traffic, \n206–211\nmonitoring memory, 204–205\nobfuscation, 251, 255–256\nsending packets, 215–217\nspell trainers, 219\nbranching, 92–94\nbreakpoints, 30, 34, 38\nBreakpoints window, OllyDbg, 26\nBSOD (Blue Screen of Death), 256\nBYTE data type, 67\nbytes, machine code, 78\nC\nC++, 66\ncallee, 94–95\ncaller, 94–95\ncallHook() function, 154\ncall hooking, 153–156. 
See also Adobe \nAIR hooking\ncalling conventions, 95\nfor call hooks, 155\n__cdecl, 95, 155\n__fastcall, 95\n__stdcall, 95\n__thiscall, 95, 217\nfor trampoline functions, 168\nfor VF table hooks, 156–158\nCALL instruction, 94–95\ncall stack\noverflow, 255–256\nviewing, 30\nx86 assembly language, 86–88\nCall stack window, OllyDbg, 26\ncapacity of std::vector, 109\ncasting spells. See spells\ncavebots, 241–243\n__cdecl convention, 95, 155\nChanged Value scan type, Cheat \nEngine, 7\ncharacters. See also enemies\nhealth bars, monitoring with \nbots, 204–205\npausing execution when health \ndrops, 39–42\nplayer health, finding with \nOllyDbg, 99–101\nIndex 267\nchar data type, 67\nCheat Engine, 3, 5–6\nautomatically locating string \naddresses with, 102\ncheat tables, 7–8\ncorrect address, determining, 7\nfirst scan, running, 6\ninstalling, 4\nLua scripting environment, \n18–22\nmemory modification, 8–11\nnext scan, running, 7\npointer scanning with, 14–18\nscan types, 6\nstd::list, determining whether \ndata is stored in, 112–113\nstd::map, determining whether \ndata is stored in, 117\ntrainer generator, 9–11\nVF tables, 78\nzoom factor, finding, 197\ncheat tables, Cheat Engine, 7–8\nCheat Utility plug-in, 42–43\nCheckRemoteDebuggerPresent() \nfunction, 251\nclasses, 74–78\nclass instances, 76\nCloseHandle() function, 122, 138\nclosing mutexes, 59–60\nCMP instruction, 92\ncode caves, 134\nloading DLLs, 143–146\nthread hijacking, 138–142\nthread injection, 134–138\ncode injection, 133–134\nbypassing ASLR in production, \n128–130\nDLLs, 142–146\nwith thread hijacking, 138–142\nwith thread injection, 134–138\ncode patches, creating, 31–32\ncolumn configurations, Process \nMonitor, 51\ncombat, automating, 243–244\ncommand line plug-in, OllyDbg, \n43–44\ncommand syntax, x86 assembly \nlanguage, 79–81\nComment column, OllyDbg \ndisassembler pane, 28\ncomplex hypothetical state machine, \n228–230\nconditional breakpoints, 34, 38\nconditional statements, 93\nconstant ratio 
of health, adjusting \nfor, 230–231\ncontrol-critical routines, timing, 254\ncontrol flow hacks, 31\ncontrol flow manipulation, 149–150. \nSee also Adobe AIR \nhooking; Direct3D \nhooking\ncall hooking, 153–156\nIAT hooking, 160–165\njump hooking, 165–169\nNOPing, 150–152\nVF table hooking, 156–160\ncontrol theory, 222\ncombining with state \nmachines, 225\ncomplex hypothetical state \nmachine, 228–230\nerror correction, 230–232\nhealer state machine, 225–228\ncontrol windows, OllyDbg, 25–26\ncooldowns, displaying enemy, \n200–201\ncopying assembly code, 42\ncopy-on-write protection, 126\ncorpses, bot behavior toward, \n229, 240\ncorrect address, determining in \nCheat Engine, 7\nCPU window, OllyDbg, 26–30, 40\ncrashing debuggers, 255\nCreateRemoteThread() function, 129, \n130, 134, 138\nCreateToolhelp32Snapshot() function, \n120, 141\ncreature data, knowing structure \nbehind, 106–107\ncritical game information, displaying, \n198–201\ncrowd-control attacks, 218\ncryptographic functions, \nhooking, 170\n268 Index\nCS register, 85\nC-style operators, OllyDbg, 34–35\ncustom behaviors for cavebots, \nscripting, 243\nD\ndark environments, lighting up, \n190–192\ndata modification instructions, 89\ndata structures, 71–73\ndata types, 66\nclasses and VF tables, 74–78\nnumeric data, 67–69\nOllyDbg, 36\nstring data, 69–71\nunions, 73–74\nDBG_RIPEXCEPTION handlers, \nchecking for, 253\ndebugging. See also OllyDbg\nanti-debugging techniques, \n255–256\ndebug drivers, checking for, 254\ndebug strings, printing, 253\ndetecting debuggers, 251–254\nProcess Monitor, 52–53\n__declspec(naked) convention, 168\ndecode() function, hooking, 172–173, \n174–175\nDecreased Value By scan type, Cheat \nEngine, 7\nDecreased Value scan type, Cheat \nEngine, 7\ndependencies, DLL, 145\ndependency loading, 160\ndepositor, 242\ndestination operand, 80\ndetection, avoiding. 
See anti-cheat \nsoftware\ndevice->SetRenderState() function, 192\nDijkstra’s algorithm, 233–234\nDirect3D 9, 176\nDirect3D hooking, 175–176. See also \nextrasensory perception \n(ESP) hacks\ndetecting visual cues in games, \n205–206\ndrawing loop, 176–177\nfinding devices, 177–181\noptional fixes for stability, 184\nwriting hook for EndScene(), \n182–183\nwriting hook for Reset(), 183–184\ndirectional lighthacks, 190–191\ndisabling ASLR, 128\ndisassembler pane, OllyDbg, \n27–29, 42\nDisassembly column, OllyDbg \ndisassembler pane, 28\ndispatchPacket() function, 210\ndisplay base, 27\nDLL (dynamic link library), \ninjecting, 142–146\nDllMain() entry point, 144–145\nDLLs option, Process Explorer \npane, 57\nDomain Name System (DNS) cache \nscans, 248\nDOS header, 160–161\nDrawIndexedPrimitive() function, 194, \n195, 196, 200\ndrawing loop, Direct3D, 176–177\nDS register, 85\ndump pane, OllyDbg, 29–30\nDWORD data type, 67, 145–146\ndynamically allocated memory, 6, \n11, 12\ndynamic link library (DLL), \ninjecting, 142–146\ndynamic lure, 242–243\ndynamic structures, 105\nstd::list class, 110–113\nstd::map class, 114–118\nstd::string class, 105–108\nstd::vector class, 108–110\nE\nEAX register, 81\nEBP register, 83\nEBX register, 82\nECX register, 82, 157\nEDI register, 83\nEDX register, 82\nEFLAGS register, 84, 92\nEIP register, 83, 139\nIndex 269\nemulating keyboard, 211–215\nenableLightHackDirectional() function, \n190–191\nencode() function, hooking, 171–172, \n174–175\nEndScene() function\njump hooking, 178–181\nstability of, 184\nwriting hook for, 182–183\nendSceneTrampoline() function, 181\nenemies. 
See also extrasensory \nperception (ESP) hacks\ncooldowns, displaying, 200–201\ncritical game information, \ndisplaying, 198–201\npredicting movements of, 241\ntexture, changing, 195–196\nentropy, 5, 7\nEnvironment tab, Process Explorer \nProperties dialog, 58\nerror correction, 230–232\nESEA (E-Sports Entertainment \nAssociation), 247\nESEA Anti-Cheat toolkit, 247\nESI register, 83\nESP hacks. See extrasensory \nperception (ESP) hacks\nESP register, 83\nES register, 85\nEuclidean distance heuristic, 236\nevent class filters, Process Monitor, \n51–52\nevent log, Process Monitor, 52–53\nEvent Properties dialog, 54–55\nExact Value scan type, Cheat \nEngine, 6\nexception handlers, checking for, 253\nexecute protection, 125–128\nExecute until return button, \nOllyDbg, 25\nexperience-tracking HUD, 200\nexponent, float data type, 68\nexpressions, OllyDbg, 36–37\naccessing memory \ncontents with, 36\nelements evaluated by, 35–36\nexpression engine, 33–36\npausing execution when health \nof character drops, \n39–42\npausing execution when name of \nplayer is printed, 37–38\nsupported data types, 36\nextrasensory perception (ESP) hacks, \n189–190\nbackground knowledge, 190\nfloor spy hacks, 201–202\nHUDs, 198–201\nlighthacks, 190–192\nloading-screen HUDs, 201\npick-phase HUDs, 201\nrange hacks, 201\nwallhacks, 192–197\nzoomhacks, 197–198\nF\nfalse positives, VAC toolkit, 248\n__fastcall convention, 95\nfeedback loop, 222\nfile accesses, inspecting in Process \nExplorer, 60\nFilesystem event class filter, 52\nFILO (first-in-last-out), 86\nfilters, event class, 51–52\nfindItem() function, 116–117\nfindSequence() function, 175\nfirst-in-last-out (FILO), 86\nfirst-person shooter (FPS), xxii, 246\nfirst scan, running in Cheat \nEngine, 6\nflags, process access, 121\nfloat data type, 67–68\nfloor spy hacks, 201–202\nfog of war, 189. 
See also extrasensory \nperception (ESP) hacks\nfootprints, managing, 250–256\nFound intermodular calls window, \nOllyDbg, 40\nFPS (first-person shooter), xxii, 246\nFPU registers, 29\nFrame column, Event Properties \nwindow, 54\nframes, in Direct3D drawing loop, 176\n270 Index\nFreeze interval, Trainer generator \ndialog, 9\nfreezing\naddresses, 8\nmain thread, 141\nfrontier, 233\nFS register, 85\nfunction calls, x86 assembly \nlanguage, 94–95\nfunction flowchart, OllyFlow, 45\nfunction names, finding for IAT \nhooking, 163\nG\nGameActuators class, 225\ngame automation state machine, \n223–224\nGameGuard toolkit, 248–249\ngame updates, determining new \naddresses after, 101–104\ngeneral registers, 81–82\ngeneric memory functions, 123–124\ngetAddressforNOP() function, 152\nGetAsyncKeyState() function, 196\nGetExitCodeThread() function, 129\nGetModuleFileName() function, 144\nGetModuleHandle() function, 129–130, \n134, 144, 146–147\nGetSystemTimeAsFileTime() function, 258\nGetThreadContext() function, 139, 142\nGetTickCount() function, 254\nGetWindowThreadProcessId() function, 120\ngoal state, 238\nGo To button, OllyDbg, 25\ngreedy best-first search algorithm, \n233–234\nGS register, 85\nguard protection, 126\nH\nhalting problem, 250\nhandle manipulation options, \nProcess Explorer, 59–60\nhandler functions, 208\nhandles, 56, 121, 210–211, 252\nHandles option, Process Explorer \npane, 57\nHandles window, OllyDbg, 26\nhardware breakpoints, checking for, \n252–253\nhash validation, 247\nheads-up display (HUD), 198–201\nhealer state machine, 225–228, \n230–232\nhealth of characters\nhealth bars, monitoring with \nbots, 204–205\nhealth bars of enemies, \ndisplaying, 150–152\npausing execution upon \ndrop in, 39–42\nheap data, 16\nheuristics, 233\ndefeating, 262–263\nEuclidean distance, 236\nManhattan distance, 235\nHex dump column, OllyDbg \ndisassembler pane, 27–28\nhidden data, displaying, 198–201\nHidden option, Process Explorer \npane, 57\nhooking, 42, 149, 153. 
See also Adobe \nAIR hooking; Direct3D \nhooking; extrasensory \nperception (ESP) hacks\ncall, 153–156\ndetecting visual cues in games, \n205–206\nIAT, 160–165\nintercepting network traffic, \n206–211\njump, 165–169\nprewritten libraries, 169\nsignature-based detection, \nevading, 257\nVF table, 156–160\nzoomhacks, 198\nhotkeys\nPatches window, OllyDbg, 32\nProcess Explorer, 57\nProcess Monitor, 52\nfor trainer, setting up, 10\nIndex 271\nhourly experience, finding, 200\nHTTP (HyperText Transfer \nProtocol), 169\nHTTPS (HTTP Secure), 169\nHUD (heads-up display), 198–201\nI\nIAT (import address table) hooking, \n160–165\nIDIV instruction, 92\nIMAGE_DOS_HEADER structure, 161\nIMAGE_IMPORT_DESCRIPTOR structure, 162\nIMAGE_OPTIONAL_HEADER structure, 161\nImage tab, Process Explorer \nProperties dialog, 57–58\nIMAGE_THUNK_DATA structure, 162\nimmediate value, 80\nimport address table (IAT) hooking, \n160–165\nimport descriptors, 162\nIMUL arithmetic instruction, 90–91\nIncreased Value By scan type, Cheat \nEngine, 7\nIncreased Value scan type, Cheat \nEngine, 7\nindex registers, 83\ninfinite loops, causing \nunavoidable, 255\nin-game actions, bots for\nanti-crowd-control hacks, 218\nautomatic healer, 218, 225–228, \n230–232\nemulating keyboard, 211–215\nsending packets, 215–217\nspell trainers, 219\nin-game events, logging, 50–52\ninstructions, 79\narithmetic, 90–92\nbranching, 92–94\ndata modification, 89\nfunction calls, 94–95\njump, 92–94\nint data type, 67\nIntel syntax, 80\ninterrupt handlers, checking for, 252\niterator, 120\nJ\njumpHookCallback() function, 168\njump hooking, 165–169, 178–181\njump instructions, x86 assembly \nlanguage, 92–94\nK\nkernel-mode rootkit, GameGuard \ntoolkit, 249\nkeyboard, emulating, 211–215\nKEYEVENTF_KEYUP flag, 212\nkiting, 222, 240–241\nL\nlibraries, hooking, 169\nlighthacks, 190–192\nlist class, 110–111\nlistItem class, 110–111\nlittle-endian ordering, 67\nloader lock, 144\nloading-screen HUDs, 201\nLoadLibrary() function, 
143–144\nLocation column, Event Properties \nwindow, 54\nlogging events, Process Monitor, \n50–52\nLog window, OllyDbg, 25\nlong data type, 67\nlong long data type, 67\nlooting, 229, 241–243\nLua scripting environment, Cheat \nEngine, 18–22\nlure mode, 242\nM\nmachine code, 78\nmain loop\nDirect3D drawing loop, \n176–177\nsyncing with, 164–165\nmana, avoiding wasted, 219\nManhattan distance heuristic, 235\nmantissa, float data type, 68\n272 Index\nmassively multiplayer online \nrole-playing games \n(MMORPGs), xxi–xxii, \n198, 248\nmassive online battle arena (MOBA), \nxxii, 189, 197, 201, 206\nmemcpy() function, 136\nmemory, 65–66\nclasses and VF tables, 74–78\ndata structures, 71–73\nnumeric data, 67–69\nstring data, 69–71\nunions, 73–74\nmemory access\nin injected DLL, 145–146\nfor writing and reading, \n122–124\nmemory addresses, 4\naccessing with OllyDbg \nexpressions, 36\ncorrect, determining in Cheat \nEngine, 7\nfreezing, 8\nnew, determining after game \nupdates, 101–104\nrebasing at runtime, 128–129\nstatic, 6\nmemory-based lighthacks, 192\nmemory dump\nof class data, 76\nof code cave, 137\nof data structures, inspecting, \n70–71\nof numeric data, inspecting, \n68–69\nof string data, inspecting, 70\nmemory forensics, 97–98\nnew addresses, determining \nafter game updates, \n101–104\nplayer health, finding with \nOllyDbg, 99–101\npurpose of data, deducing, \n98–99\nstd::list class, 110–113\nstd::map class, 114–118\nstd::string class, 105–108\nstd::vector class, 108–110\nmemory manipulation, 119\naccessing memory, 122–124\naddress space layout \nrandomization, 128–130\nmemory protection, 124–128\nprocess identifier, obtaining, \n120–122\nMemory map window, OllyDbg, 26\nmemory modification, 8–11\nmemory monitoring with bots, \n204–205\nmemory offset, 80\nmemory on write breakpoint, 208\nmemory pointer, 11\nmemory protection, 124–128, 151\nmemory scanning, 3, 98. 
See also \nCheat Engine; pointer \nscanning\nbasic, 4–5\nimportance of, 4\nmemory modification, 8–11\nnew addresses, determining after \ngame updates, 101–104\noptimization of code, 22\nplayer health, finding with \nOllyDbg, 99–101\npurpose of data, deducing, \n98–99\nMMORPGs (massively multiplayer \nonline role-playing \ngames), xxi–xxii, 198, 248\nmnemonics, 78\nMOBA (massive online battle arena), \nxxii, 189, 197, 201, 206\nmodifying memory values, 8–11\nModule32First() function, 144, 174\nModule32Next() function, 144, 174\nModule column, Event Properties \nwindow, 54\nModules window, OllyDbg, 25\nmonitoring memory with bots, \n204–205\nmonsters, kiting, 240–241\nmouse movements, emulating, \n215, 240\nMOV instruction, 89\nmulticlient patching, 30\nmutexes, closing, 59–60\nIndex 273\nN\nnamed pipes, locating, 60\nname of specific player, pausing \nexecution when printed, \n37–38\nNames window, OllyDbg, 29\nnear calls, 153–154\nnear function call, 39\n.NET processes, 59\nNetwork event class filter, 52\nnew addresses, determining after \ngame updates, 101–104\nnext scan, running in Cheat \nEngine, 7\nnodes, 233, 234–238\nno-operation (NOP) commands, \n31, 32\nNOPing, 150–152\nlighthacks, 192\nzoomhacks, 197–198\nNtQueryVirtualMemory() function, 246, \n257, 259\nNtWriteVirtualMemory() function, \n261–262\nnull terminator, 70\nnumeric data types, 67–69\nnumeric operators, OllyDbg, 34–35\nO\nobfuscation, 251, 255–256\nobserving game events\ndetecting visual cues, 205–206\nintercepting network traffic, \n206–211\nmonitoring memory, 204–205\nobstacles, searches disrupted by, \n233–234\noffset, 54\nOllyDbg, 23–24\nassembly code, 27–29, 32–33\ncall stack, viewing, 30\ncode patches, creating, 31–32\ncommand line for, 43–44\ncontrol windows, 25–26\nCPU window, 26–30\ncrashing debuggers, 255\ndealing with game updates, 104\ndebugger buttons and \nfunctions, 25\nexpression engine, 33–37\nmemory, viewing and searching, \n29–30\nmemory dump of numeric data, \n68–69\nmemory 
dump of string data, 70\npacket parser, finding, 207–208\nPatches window, 31–32\npatching if() statements, 46–47\npausing execution when health \nof character drops, \n39–42\npausing execution when name \nof player is printed, \n37–38\nplug-ins, 42–46\nregister contents, viewing and \nediting, 29\nRun trace window, 32–33\nsupported data types, 36\ntranslating code cave assembly \nto shellcode, 135–136\nuser interface, 24–26\nzoom limitation code, \nfinding, 198\nOllyFlow plug-in, 45–46\nopcodes, 78\nOpenProcess() function, 121–122\nOpenThread() function, 142\noperands\nbinary arithmetic \ninstructions, 90\nIDIV instruction, 92\nMOV instruction, 89\nsyntax, 80–81\nunary arithmetic instructions, 90\noperations, 79\noperators, using in OllyDbg \nexpression engine, 34–35\noptimizing memory code, 22\nordering, little-endian, 67\norder of variables, in data structures, \n70–71\nOutputDebugString() function, 253\n274 Index\nP\npackets\nintercepting, 206–211\nsending, 215–217\npacking, 251\npadding, 68\npage protection, 125–126\npages, 124\nparsing packets, 206–211\nPatches window, OllyDbg, 26, 31–32\npatching, multiclient, 30\npatching if() statements, 46–47\nPath column, Event Properties \ndialog, 55\npathfinding with search algorithms, \n232–234. 
See also A* search \nalgorithm\npath list, A* search algorithm, \n239–240\nPause button, OllyDbg, 25\npausing execution, 37–38, 39–42\npausing threads, 184\nPEB (process environment block) \nstructure, 146\nPeekMessage() function, 184\nPE header, 160–161\npick-phase HUDs, 201\nPID (process identifier), 120–122\npipes, locating named, 60\nPlay button, OllyDbg, 25\nplayer health, finding with OllyDbg, \n99–101\nplayer versus player (PvP) combat, \n243–244\nplug-ins, OllyDbg, 42–46\npointer chains, 11–12\npointer path, 11\nPointerscanner Scanoptions dialog, \nCheat Engine, 14–16\npointer scanning, 11\nbasics of, 12–14\nwith Cheat Engine, 14–18\npointer chains, 11–12\nrescanning, 17–18\nPong, 46–47\nPopup trainer on keypress field, \nTrainer generator \ndialog, 9\npredicting enemy movements, 241\nprewritten hooking libraries, 169\nprintf() call, 72, 73–74, 75\nprinting debug strings, 253\nProcess32First() function, 120\nProcess32Next() function, 120–121\nprocess access flags, 121\nPROCESS_ALL_ACCESS flag, 121\nProcess and thread activity event \nclass filter, 52\nPROCESS_CREATE_THREAD flag, 121\nprocess environment block (PEB) \nstructure, 146\nProcess Explorer, 49–50, 55–56\nconfiguring colors, 56\nhandle manipulation options, \n59–60\nhotkeys, 57\nProperties dialog, 57–59\nuser interface and controls, \n56–57\nprocess handles, obtaining, 121\nprocess identifier (PID), 120–122\nprocessInput() function, 215–216\nprocessKeyboardInput() function, 216\nProcess Monitor, 49–50\nconfiguring columns in, 51\ndebugging, 53–55\nevent class filters, 51–52\nhigh-score file, finding, 55\nhotkeys, 52\ninspecting events in event log, \n52–53\nlogging in-game events, 50–52\nProcess Monitor Filter dialog, 50\nProcessname field, Trainer generator \ndialog, 9\nprocessNextPacket() function, 210\nprocessor registers, 81–86\nProcess profiling event class \nfilter, 52\nPROCESS_VM_OPERATION flag, 121, 122\nPROCESS_VM_READ flag, 121\nPROCESS_VM_WRITE flag, 121\nIndex 275\nProperties dialog, 
Process Explorer, \n57–59\nprotection, memory, 124–128, 151\nPunkBuster toolkit, 246–247, 257\npurpose of data, deducing, 98–99\nPvP (player versus player) combat, \n243–244\nR\nrange hacks, 201\nreading from game memory, 119\naccessing memory, 122–124\naddress space layout \nrandomization, 128–130\nmemory protection, 124–128\nprocess identifier, obtaining, \n120–122\nReadProcessMemory() function, 122–124\nread protection, 125–128\nReal Time Messaging Protocol \n(RTMP)\nassessing, 169–170\ndecode() function, hooking, \n172–173, 174–175\nencode() function, hooking, \n171–172, 174–175\nintercepting packets, 207\nreal-time strategy (RTS), xxii, 197, \n201, 206, 243\nrebasing addresses at runtime, 128–129\nreconnaissance, 49–50\nProcess Explorer, 55–60\nProcess Monitor, 50–55\nrecv() function, 207–208\nred-black tree, 114–115\nReferences window, OllyDbg, 26, \n28–29, 40, 100\nrefiller, 242\nregisters, processor, 81–86\nregisters pane, OllyDbg, 29\nRegistry event class filter, 51\nRescan pointerlist window, Cheat \nEngine, 17–18\nresponsive hacks, 203\nanti-crowd-control hacks, 218\nautomatic healer, 218, 225–228, \n230–232\ndetecting visual cues, 205–206\nemulating keyboard, 211–215\nintercepting network traffic, \n206–211\nmonitoring memory, 204–205\nsending packets, 215–217\nspell trainers, 219\nrootkits\ndefeating anti-cheat, 261–262\nGameGuard toolkit, 248–249\nroot node, 113–114\nRTMP. See Real Time Messaging \nProtocol\nRTS (real-time strategy), xxii, 197, \n201, 206, 243\nruntime flexibility, 229\nRun trace window, OllyDbg, 26, 32–33\nS\nSBD. See signature-based \ndetection (SBD)\nscan code, 214\nscan types, Cheat Engine, 6\nscan value, 4\nscore, 234\nscreenshots, 247, 258\nscripting custom behaviors for \ncavebots, 243\nscripting engine, Cheat Engine, 18–22\nsearch algorithms, 232–234. 
See also \nA* search algorithm\nSecurity tab, Process Explorer \nProperties dialog, 58\nsegment registers, 84–86\nsend() function, 216–217\nsending packets, 215–217\nSendInput() function, 211–212, 215\nSendMessage() function, 213–215\nsensors, of a system, 222\nSet/Change hotkey screen, Cheat \nEngine, 10\nSetLight() member function, 192\nSetProcessIsCritical() function, 256\nshellcode, 134, 135–136, 138–141\nshort data type, 67\nsign, float data type, 68\n276 Index\nsignature-based detection (SBD)\nESEA Anti-Cheat toolkit, 247\nevading, 256–257\nPunkBuster toolkit, 246–247\nsignatures, 246\nsingle-instance limitation, 59–60\nskillshots, 232\nSleep() function, 164–165, 227\nSmaller Than scan type, Cheat \nEngine, 6\nsource operand, 80\nSource window, OllyDbg, 26\nspawning threads, 129\nspells\nanti-crowd-control hacks, 218\ncomplex hypothetical state \nmachine, 228–230\nspell trainers, 219\nSS register, 85\nstack frame, 87–89\nstack overflow, 255–256\nstack pane, OllyDbg, 30\nstack trace, Process Monitor, 54–55\nstate machines, 223–224\nautomated healer, 225–228\ncombining with control \ntheory, 225\ncomplex hypothetical, 228–230\nerror correction, 230–232\nLua functions, adding, 229–230\nruntime flexibility, 229\nstatic addresses, 6\n__stdcall convention, 95\nstd::list class, 110–113\nstd::map class, 114–118\nstd::string class, 105–108\nstd::vector class, 108–110\nStep into button, OllyDbg, 25\nStep over button, OllyDbg, 25\nstochastic systems, 230\nstring data, 21, 69–71, 100–101\nstring operators, OllyDbg, 35\nStrings tab, Process Explorer \nProperties dialog, 58\nstruct member alignment, 71\nstructures, data, 71–73\nsubregisters, 83\nSuspendThread() function, 142, 184\nsyncing with game threads, 164–165\nsystems, controlling behavior of, 222\nT\ntargets, selecting, 240\nTCP/IP tab, Process Explorer \nProperties dialog, 58\nTEB (thread environment block), 146\ntemplates\nfor changing memory \nprotection, 127\nmemory access functions, \n123–124, 145–146\nTEST 
instruction, 92\ntext strings, 21, 69–71, 100–101\ntexture of enemies, changing, \n195–196\n__thiscall convention, 95, \n156–158, 217\nThread32First() function, 141\nThread32Next() function, 141\nthread environment block \n(TEB), 146\nthreads\nhijacking, 138–142\ninjection, 134–138\nspawning, 129\nThreads tab, Process Explorer \nProperties dialog, 58\nThreads window, OllyDbg, 26\nthunks, 162–163\ntiming control-critical routines, 254\nTitle field, Trainer generator \ndialog, 9\ntoggling z-buffering, 195\nTrace into button, OllyDbg, 25\nTrace over button, OllyDbg, 25\ntracing with OllyDbg, 32–33, 39–42\ntrainer generator, Cheat Engine, \n9–11\ntrampoline functions, 165–168, 181\ntraversals\nIAT hooking, 162\nVF tables, 156\nIndex 277\nU\nunary arithmetic instructions, 90\nunavoidable infinite loops, \ncausing, 255\nUnchanged Value scan type, Cheat \nEngine, 7\nunions, 73–74\nUnix syntax, 80\nUnknown Initial Value scan type, \nCheat Engine, 6\nupdates, determining new addresses \nafter, 101–104\nuser interface, Process Explorer, \n56–57\nuser-mode rootkit, GameGuard \ntoolkit, 248–249\nV\nVAC toolkit, 247–248\nValue Between scan type, Cheat \nEngine, 6\nValue Type directive, Cheat Engine, 6\nVF (virtual function) tables\nclass instances and, 76–78\nfinding Direct3D devices, \n177–181\nhooking, 156–160, 182–183\ntraversals, 156\nVirtualAllocEx() function, \n136–137, 138\nvirtual functions, classes with, 75–76\nVirtualProtectEx() function, 126–128\nVirtualProtect() function, 127\nW\nWaitForSingleObject() function, 129, 138\nwallhacks, 192\ncreating for Direct3D, 194–197\nrendering with z-buffering, \n193–194\nwarbots, 243–244\nWarden toolkit, 249–250\nwaypoints, 222, 229\nwchar_t data type, 67\nwindow handle, fetching, 120\nWindows window, OllyDbg, 26\nWM_CHAR messages, 213–214\nWORD data type, 67\nWriteProcessMemory() function, \n122–124, 136–137, 138\nwrite protection, 125–128\nwriting to game memory, 119\naccessing memory, 122–124\naddress space layout 
\nrandomization, 128–130\ncode caves, 136–137\nmemory protection, 124–128\nprocess identifier, obtaining, \n120–122\nX\nx86 assembly language, 78–79\narithmetic instructions, 90–92\nbranching instructions, 92–94\ncall stack, 86–88\ncommand syntax, 79–81\ndata modification \ninstructions, 89\nfunction calls, 94–95\njump instructions, 92–94\nNOPing, 150–152\nprocessor registers, 81–86\nx86 Windows memory protection \nattributes, 125–126\nZ\nz-buffering, 192–195\nzoom factor, 197\nzoomhacks, 197–198\nPractical Forensic Imaging\nSecuring Digital Evidence with Linux Tools\nby bruce nikkel\nfall 2016, 256 pp., $49.95\nisbn 978-1-59327-793-2 \nPractical Malware Analysis\nThe Hands-On Guide to \nDissecting Malicious Software\nby michael sikorski and \nandrew honig\nfeb 2012, 800 pp., $59.95\nisbn 978-1-59327-290-6\nIOS Application Security\nThe Definitive Guide for Hackers \nand Developers\nby david thiel\nfeb 2016, 296 pp., $49.95\nisbn 978-1-59327-601-0\nBlack Hat Python\nPython Programming for \nHackers and Pentesters\nby justin seitz\ndec 2014, 192 pp., $34.95\nisbn 978-1-59327-590-7\nThe Car Hacker’s Handbook\nA Guide for the Penetration Tester\nby craig smith\nmar 2016, 304 pp., $49.95\nisbn 978-1-59327-703-1\nThe IDA Pro Book, 2nd Edition\nThe Unofficial Guide to the \nWorld’s Most Popular Disassembler\nby chris eagle\njul 2011, 672 pp., $69.95\nisbn 978-1-59327-289-0\nMore no-nonsense books from \nNo Starch Press\n800.420.7240 or 415.863.9900 | sales@nostarch.com | www.nostarch.com\nresources\nVisit https://www.nostarch.com/gamehacking/ for resources, errata, and other \ninformation.\nYou don’t need to be a wizard to transform a \ngame you like into a game you love. 
Imagine \nif you could give your favorite PC game a more \ninformative heads-up display or instantly col-\nlect all that loot from your latest epic battle.\nBring your knowledge of Windows-based \ndevelopment and memory management, \nand Game Hacking will teach you what you \nneed to become a true game hacker. Learn \nthe basics, like reverse engineering, assembly \ncode analysis, programmatic memory manip\nulation, and code injection, and hone your \nnew skills with hands-on example code and \npractice binaries.\nLevel up as you learn how to:\n💎\t Scan and modify memory with Cheat \nEngine\n💎\t Explore program structure and execution \nflow with OllyDbg\n💎\t Log processes and pinpoint useful data files \nwith Process Monitor\n💎\t Manipulate control flow through NOPing, \nhooking, and more\n💎\t Locate and dissect common game memory \nstructures\nYou’ll even discover the secrets behind common \ngame bots, including:\n💎\t Extrasensory perception hacks, such as \nwallhacks and heads-up displays\n💎\t Responsive hacks, such as autohealers and \ncombo bots\n💎\t Bots with artificial intelligence, such as \ncave walkers and automatic looters\nGame hacking might seem like black magic, \nbut it doesn’t have to be. Once you understand \nhow bots are made, you’ll be better positioned \nto defend against them in your own games. \nJourney through the inner workings of PC \ngames with Game Hacking, and leave with a \ndeeper understanding of both game design \nand computer security.\nAbout the Author\nNick Cano wrote his first scripts for open source \ngame servers when he was 12 and has been \na part of the game-hacking community ever \nsince. He has years of experience in detecting \nand defending against malware, and advises \ndevelopers and designers on best practices \nto protect their games against bots. 
Nick has \nspoken about his research and tools at many \nconferences.\nGet Inside the Game\n$44.95 ($51.95 CDN)\t\nShelve In: Computers/Security\nTHE FINEST IN GEEK ENTERTAINMENT™\nwww.nostarch.com\nWARNING! This book does not condone piracy, violating the DMCA, infringing copyright, or breaking in-game Terms of \nService. Game hackers have been banned from games for life, sued for millions of dollars, and even jailed for their work. \n"}
\ No newline at end of file
diff --git a/speechify_desktop.py b/speechify_desktop.py
new file mode 100644
index 0000000..40b354a
--- /dev/null
+++ b/speechify_desktop.py
@@ -0,0 +1,871 @@
+"""
+Speechify Desktop Clone - 100% Local/Offline TTS Reader
+Modern cross-platform desktop application using PyQt6 and Piper TTS
+"""
+
+import sys
+import json
+import threading
+import wave
+from pathlib import Path
+from typing import Optional, List, Dict
+import re
+
+from PyQt6.QtWidgets import (
+ QApplication, QMainWindow, QWidget, QVBoxLayout, QHBoxLayout,
+ QTextEdit, QPushButton, QComboBox, QSlider, QLabel, QFileDialog,
+ QProgressBar, QTabWidget, QSpinBox, QCheckBox, QGroupBox, QMessageBox,
+ QSplitter, QStatusBar
+)
+from PyQt6.QtCore import Qt, QThread, pyqtSignal, QTimer, QSettings
+from PyQt6.QtGui import QFont, QTextCursor, QTextCharFormat, QColor, QPalette, QIcon
+
+# Document readers
+import fitz # PyMuPDF
+from docx import Document
+import ebooklib
+from ebooklib import epub
+from bs4 import BeautifulSoup
+
+# TTS Engine
+import subprocess
+import tempfile
+import os
+
+# Audio playback
+# AUDIO_AVAILABLE is True only when pygame imported AND the mixer initialised.
+# pygame.mixer.init() raises pygame.error (not ImportError) when no audio
+# device is available, so the two failure modes are handled separately.
+AUDIO_AVAILABLE = False
+try:
+    import pygame
+except ImportError:
+    print("Warning: pygame not available. Install with: pip install pygame")
+else:
+    try:
+        pygame.mixer.init()
+        AUDIO_AVAILABLE = True
+    except pygame.error as exc:
+        print(f"Warning: audio device could not be initialised ({exc}). Playback disabled.")
+
+
+class PiperTTS:
+    """Wrapper around a locally installed Piper TTS command-line executable.
+
+    A voice is an ``*.onnx`` model file in ``models_dir`` that has a sibling
+    ``<model>.onnx.json`` file next to it. ``self.voices`` maps the model's
+    file stem to the model path.
+    """
+
+    def __init__(self, models_dir: str = "models/piper"):
+        self.models_dir = Path(models_dir)
+        self.voices: Dict[str, Path] = {}
+        self.discover_voices()
+
+    def discover_voices(self):
+        """Scan ``models_dir`` and register every model that has a JSON sidecar."""
+        if not self.models_dir.exists():
+            print(f"Models directory not found: {self.models_dir}")
+            return
+
+        # Sorted so the voice order (and therefore the default voice shown
+        # in the UI) does not depend on directory listing order.
+        for onnx_file in sorted(self.models_dir.glob("*.onnx")):
+            json_file = onnx_file.with_suffix(".onnx.json")
+            if json_file.exists():
+                self.voices[onnx_file.stem] = onnx_file
+
+        print(f"Found {len(self.voices)} Piper voices: {list(self.voices.keys())}")
+
+    def synthesize(self, text: str, voice: str, output_path: str, speed: float = 1.0) -> bool:
+        """Run Piper to speak ``text`` into ``output_path``.
+
+        Args:
+            text: Text fed to Piper on standard input.
+            voice: Key of ``self.voices`` selecting the model.
+            output_path: Destination WAV file.
+            speed: Playback speed multiplier; passed to Piper as
+                ``--length_scale 1/speed`` when it is not 1.0.
+
+        Returns:
+            True when Piper exited with status 0 and ``output_path`` exists,
+            False otherwise (the reason is printed). Never raises for
+            process or encoding failures.
+        """
+        if voice not in self.voices:
+            print(f"Voice {voice} not found")
+            return False
+
+        if speed <= 0:
+            print(f"Invalid speed {speed}: must be greater than 0")
+            return False
+
+        piper_exe = self._find_piper_executable()
+        if not piper_exe:
+            print("Piper executable not found. Please install Piper.")
+            return False
+
+        # Equivalent shell command: echo "text" | piper -m model.onnx -f output.wav
+        cmd = [
+            str(piper_exe),
+            "--model", str(self.voices[voice]),
+            "--output_file", output_path,
+        ]
+        if speed != 1.0:
+            cmd.extend(["--length_scale", str(1.0 / speed)])
+
+        try:
+            # Explicit UTF-8 so non-ASCII text does not depend on the
+            # platform's locale encoding (e.g. cp1252 on Windows).
+            completed = subprocess.run(
+                cmd,
+                input=text,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                text=True,
+                encoding="utf-8",
+                errors="replace",
+            )
+        except (OSError, ValueError, subprocess.SubprocessError) as e:
+            print(f"TTS synthesis error: {e}")
+            return False
+
+        if completed.returncode == 0 and Path(output_path).exists():
+            return True
+
+        print(f"Piper error: {completed.stderr}")
+        return False
+
+    def _find_piper_executable(self) -> Optional[Path]:
+        """Return the Piper executable from the working directory or PATH.
+
+        Only regular files are accepted from the working directory, and they
+        are returned as absolute paths: a bare relative name such as
+        ``piper`` would otherwise be looked up on PATH by the OS instead of
+        in the working directory, and a *directory* called ``piper`` must
+        not be mistaken for the binary.
+        """
+        for name in ("piper.exe", "piper"):
+            candidate = Path(name)
+            if candidate.is_file():
+                return candidate.resolve()
+
+        import shutil
+        found = shutil.which("piper")
+        return Path(found) if found else None
+
+
+class TTSWorker(QThread):
+    """Background thread that synthesizes text one sentence at a time.
+
+    Signals:
+        progress(int): percentage of sentences synthesized so far.
+        finished(str): path of the combined WAV file. The receiver owns the
+            file and is responsible for deleting it.
+        error(str): human-readable failure message.
+        sentence_started(int): index of the sentence about to be synthesized.
+    """
+
+    progress = pyqtSignal(int)  # Progress percentage
+    # NOTE(review): this shadows QThread's built-in ``finished`` signal;
+    # confirm nothing relies on the built-in one before renaming.
+    finished = pyqtSignal(str)  # Output file path
+    error = pyqtSignal(str)  # Error message
+    sentence_started = pyqtSignal(int)  # Sentence index
+
+    def __init__(self, tts_engine: PiperTTS, text: str, voice: str, speed: float):
+        super().__init__()
+        self.tts_engine = tts_engine
+        self.text = text
+        self.voice = voice
+        self.speed = speed
+        self.sentences = self._split_sentences(text)
+        self._stop_requested = False
+
+    def _split_sentences(self, text: str) -> List[str]:
+        """Split ``text`` after runs of ``.``/``!``/``?`` followed by whitespace.
+
+        Returns the stripped, non-empty sentences, or ``[text]`` when the
+        split produced nothing.
+        """
+        # The capturing group keeps each delimiter as its own list element,
+        # so odd indices are delimiters and even indices are sentence bodies.
+        parts = re.split(r'([.!?]+\s+)', text)
+        result = []
+        current = ""
+
+        for i, part in enumerate(parts):
+            current += part
+            if i % 2 == 1:
+                if current.strip():
+                    result.append(current.strip())
+                current = ""
+
+        if current.strip():
+            result.append(current.strip())
+
+        return result if result else [text]
+
+    def stop(self):
+        """Ask ``run`` to abandon work before the next sentence."""
+        self._stop_requested = True
+
+    def run(self):
+        """Synthesize every sentence, combine the results and emit ``finished``.
+
+        Per-sentence WAV files live in a private temporary directory that is
+        removed on every exit path (success, stop request, failure or
+        exception). Only the combined file survives, and only when
+        ``finished`` is emitted.
+        """
+        import shutil
+
+        try:
+            work_dir = tempfile.mkdtemp(prefix="tts_sentences_")
+            try:
+                sentence_files = self._synthesize_sentences(work_dir)
+                if sentence_files is None:
+                    return
+
+                # mkstemp creates the file atomically, unlike the race-prone
+                # and deprecated mktemp.
+                fd, output_path = tempfile.mkstemp(suffix=".wav")
+                os.close(fd)
+
+                if self._combine_wav_files(sentence_files, output_path):
+                    self.finished.emit(output_path)
+                else:
+                    try:
+                        os.unlink(output_path)
+                    except OSError:
+                        pass  # best-effort cleanup of the unusable output
+                    self.error.emit("Failed to combine audio files")
+            finally:
+                shutil.rmtree(work_dir, ignore_errors=True)
+        except Exception as e:
+            # Thread boundary: report instead of letting the thread die silently.
+            self.error.emit(str(e))
+
+    def _synthesize_sentences(self, work_dir: str) -> Optional[List[str]]:
+        """Write one WAV per sentence into ``work_dir``.
+
+        Returns the file paths in sentence order, or None when a stop was
+        requested or a sentence failed (``error`` has then been emitted).
+        """
+        total = len(self.sentences)
+        sentence_files: List[str] = []
+
+        for i, sentence in enumerate(self.sentences):
+            if self._stop_requested:
+                return None
+
+            self.sentence_started.emit(i)
+
+            wav_path = os.path.join(work_dir, f"sentence_{i:06d}.wav")
+            if not self.tts_engine.synthesize(sentence, self.voice, wav_path, self.speed):
+                self.error.emit(f"Failed to synthesize sentence {i}")
+                return None
+            sentence_files.append(wav_path)
+
+            self.progress.emit(int((i + 1) / total * 100))
+
+        if self._stop_requested:
+            return None
+        return sentence_files
+
+    def _combine_wav_files(self, input_files: List[str], output_path: str) -> bool:
+        """Concatenate ``input_files`` into ``output_path``.
+
+        The output uses the parameters of the first input file. Returns
+        False when there is nothing to combine or a file cannot be
+        read or written.
+        """
+        if not input_files:
+            return False
+
+        try:
+            with wave.open(input_files[0], 'rb') as first:
+                params = first.getparams()
+
+            with wave.open(output_path, 'wb') as output:
+                output.setparams(params)
+                for input_file in input_files:
+                    with wave.open(input_file, 'rb') as infile:
+                        output.writeframes(infile.readframes(infile.getnframes()))
+
+            return True
+
+        except (wave.Error, OSError, EOFError) as e:
+            print(f"Error combining WAV files: {e}")
+            return False
+
+
+class SpeechifyDesktop(QMainWindow):
+    """Main application window: text editor, import/export and playback controls.
+
+    Speech is generated by a background ``TTSWorker`` and played through
+    ``pygame.mixer.music`` when ``AUDIO_AVAILABLE`` is true.
+    """
+
+    # Interval (ms) at which playback is polled for completion and for
+    # moving the sentence highlight.
+    _PLAYBACK_POLL_MS = 100
+    # Fallback reading rate when the audio length cannot be read from the
+    # WAV file: roughly 150 words per minute at 1x speed.
+    _CHARS_PER_MINUTE = 750
+
+    # file_type -> (dialog title, dialog name filter, extractor method name)
+    _IMPORT_FORMATS = {
+        "txt": ("Import TXT File", "Text Files (*.txt)", "_extract_txt_text"),
+        "pdf": ("Import PDF File", "PDF Files (*.pdf)", "_extract_pdf_text"),
+        "docx": ("Import DOCX File", "Word Files (*.docx)", "_extract_docx_text"),
+        "epub": ("Import EPUB File", "EPUB Files (*.epub)", "_extract_epub_text"),
+    }
+
+    def __init__(self):
+        super().__init__()
+
+        self.setWindowTitle("Speechify Desktop Clone - Local TTS Reader")
+        self.setGeometry(100, 100, 1200, 800)
+
+        # Persistent preferences (voice, speed, dark mode)
+        self.settings = QSettings("SpeechifyLocal", "Desktop")
+
+        # TTS engine and playback state
+        self.tts_engine = PiperTTS()
+        self.tts_worker: Optional[TTSWorker] = None
+        self.current_audio_file: Optional[str] = None
+        self.is_playing = False
+        self.is_paused = False
+
+        # Sentences for highlighting
+        self.sentences: List[str] = []
+        self.current_sentence_idx = 0
+
+        # Playback polling state. Defined up front so _stop() is safe to
+        # call before any playback has happened (e.g. from closeEvent).
+        self._highlight_timer: Optional[QTimer] = None
+        self._sentence_spans: List[Optional[tuple]] = []  # (start, length) in UTF-16 units
+        self._sentence_ends_ms: List[float] = []
+        self._played_ms = 0.0
+
+        self._init_ui()
+        self._load_settings()
+
+        if not AUDIO_AVAILABLE:
+            QMessageBox.warning(
+                self,
+                "Audio Not Available",
+                "pygame not installed. Audio playback disabled.\nInstall with: pip install pygame"
+            )
+
+    # ------------------------------------------------------------------ UI
+
+    def _init_ui(self):
+        """Build the window: control row, editor/settings splitter, progress bar."""
+        central = QWidget()
+        self.setCentralWidget(central)
+        layout = QVBoxLayout(central)
+
+        layout.addLayout(self._create_controls())
+
+        # Editor on the left, settings on the right
+        splitter = QSplitter(Qt.Orientation.Horizontal)
+        splitter.addWidget(self._create_text_area())
+        splitter.addWidget(self._create_settings_panel())
+        splitter.setSizes([800, 400])
+        layout.addWidget(splitter)
+
+        # Only shown while speech is being generated
+        self.progress_bar = QProgressBar()
+        self.progress_bar.setVisible(False)
+        layout.addWidget(self.progress_bar)
+
+        self.statusBar().showMessage("Ready")
+
+        self._apply_theme()
+
+    def _create_controls(self) -> QHBoxLayout:
+        """Create the top row of import, playback and export buttons."""
+        layout = QHBoxLayout()
+
+        # File import buttons
+        self.btn_import_txt = QPushButton("📄 Import TXT")
+        self.btn_import_txt.clicked.connect(lambda: self._import_file("txt"))
+        layout.addWidget(self.btn_import_txt)
+
+        self.btn_import_pdf = QPushButton("📕 Import PDF")
+        self.btn_import_pdf.clicked.connect(lambda: self._import_file("pdf"))
+        layout.addWidget(self.btn_import_pdf)
+
+        self.btn_import_docx = QPushButton("📘 Import DOCX")
+        self.btn_import_docx.clicked.connect(lambda: self._import_file("docx"))
+        layout.addWidget(self.btn_import_docx)
+
+        self.btn_import_epub = QPushButton("📚 Import EPUB")
+        self.btn_import_epub.clicked.connect(lambda: self._import_file("epub"))
+        layout.addWidget(self.btn_import_epub)
+
+        layout.addStretch()
+
+        # Playback controls; Pause and Stop start disabled
+        self.btn_play = QPushButton("▶️ Play")
+        self.btn_play.clicked.connect(self._play)
+        layout.addWidget(self.btn_play)
+
+        self.btn_pause = QPushButton("⏸️ Pause")
+        self.btn_pause.clicked.connect(self._pause)
+        self.btn_pause.setEnabled(False)
+        layout.addWidget(self.btn_pause)
+
+        self.btn_stop = QPushButton("⏹️ Stop")
+        self.btn_stop.clicked.connect(self._stop)
+        self.btn_stop.setEnabled(False)
+        layout.addWidget(self.btn_stop)
+
+        layout.addStretch()
+
+        self.btn_export = QPushButton("💾 Export MP3/WAV")
+        self.btn_export.clicked.connect(self._export_audio)
+        layout.addWidget(self.btn_export)
+
+        return layout
+
+    def _create_text_area(self) -> QWidget:
+        """Create the labelled text editor."""
+        widget = QWidget()
+        layout = QVBoxLayout(widget)
+
+        label = QLabel("Text to Read:")
+        label.setFont(QFont("Arial", 12, QFont.Weight.Bold))
+        layout.addWidget(label)
+
+        self.text_edit = QTextEdit()
+        self.text_edit.setPlaceholderText(
+            "Paste or type your text here...\n\n"
+            "Or import a file using the buttons above.\n"
+            "Supports: TXT, PDF, DOCX, EPUB"
+        )
+        self.text_edit.setFont(QFont("Arial", 11))
+        layout.addWidget(self.text_edit)
+
+        return widget
+
+    def _create_settings_panel(self) -> QWidget:
+        """Create the voice, speed, highlighting and appearance settings."""
+        widget = QWidget()
+        layout = QVBoxLayout(widget)
+
+        # Voice selection
+        voice_group = QGroupBox("Voice Settings")
+        voice_layout = QVBoxLayout()
+        voice_layout.addWidget(QLabel("Voice:"))
+        self.voice_combo = QComboBox()
+        self.voice_combo.addItems(list(self.tts_engine.voices.keys()))
+        if self.tts_engine.voices:
+            self.voice_combo.setCurrentIndex(0)
+        voice_layout.addWidget(self.voice_combo)
+        voice_group.setLayout(voice_layout)
+        layout.addWidget(voice_group)
+
+        # Speed control; the slider stores speed * 10
+        speed_group = QGroupBox("Speed Control")
+        speed_layout = QVBoxLayout()
+        self.speed_label = QLabel("Speed: 1.0×")
+        speed_layout.addWidget(self.speed_label)
+
+        self.speed_slider = QSlider(Qt.Orientation.Horizontal)
+        self.speed_slider.setMinimum(5)  # 0.5×
+        self.speed_slider.setMaximum(40)  # 4.0×
+        self.speed_slider.setValue(10)  # 1.0×
+        self.speed_slider.setTickPosition(QSlider.TickPosition.TicksBelow)
+        self.speed_slider.setTickInterval(5)
+        self.speed_slider.valueChanged.connect(self._update_speed_label)
+        speed_layout.addWidget(self.speed_slider)
+        speed_group.setLayout(speed_layout)
+        layout.addWidget(speed_group)
+
+        # Highlighting options
+        highlight_group = QGroupBox("Highlighting")
+        highlight_layout = QVBoxLayout()
+        self.highlight_checkbox = QCheckBox("Enable real-time highlighting")
+        self.highlight_checkbox.setChecked(True)
+        highlight_layout.addWidget(self.highlight_checkbox)
+        highlight_group.setLayout(highlight_layout)
+        layout.addWidget(highlight_group)
+
+        # Theme
+        theme_group = QGroupBox("Appearance")
+        theme_layout = QVBoxLayout()
+        self.dark_mode_checkbox = QCheckBox("Dark Mode")
+        self.dark_mode_checkbox.setChecked(False)
+        self.dark_mode_checkbox.stateChanged.connect(self._apply_theme)
+        theme_layout.addWidget(self.dark_mode_checkbox)
+        theme_group.setLayout(theme_layout)
+        layout.addWidget(theme_group)
+
+        layout.addStretch()
+
+        info_label = QLabel(
+            "💡 100% Local/Offline\n"
+            "Uses Piper TTS\n"
+            "No cloud API calls"
+        )
+        info_label.setAlignment(Qt.AlignmentFlag.AlignCenter)
+        info_label.setStyleSheet("color: #666; font-size: 10px;")
+        layout.addWidget(info_label)
+
+        return widget
+
+    def _update_speed_label(self):
+        """Show the slider value (tenths) as a speed multiplier."""
+        speed = self.speed_slider.value() / 10.0
+        self.speed_label.setText(f"Speed: {speed:.1f}×")
+
+    def _apply_theme(self):
+        """Apply the dark palette when Dark Mode is checked, else the style default."""
+        if not self.dark_mode_checkbox.isChecked():
+            self.setPalette(QApplication.style().standardPalette())
+            return
+
+        palette = QPalette()
+        palette.setColor(QPalette.ColorRole.Window, QColor(53, 53, 53))
+        palette.setColor(QPalette.ColorRole.WindowText, Qt.GlobalColor.white)
+        palette.setColor(QPalette.ColorRole.Base, QColor(25, 25, 25))
+        palette.setColor(QPalette.ColorRole.AlternateBase, QColor(53, 53, 53))
+        palette.setColor(QPalette.ColorRole.Text, Qt.GlobalColor.white)
+        palette.setColor(QPalette.ColorRole.Button, QColor(53, 53, 53))
+        palette.setColor(QPalette.ColorRole.ButtonText, Qt.GlobalColor.white)
+        palette.setColor(QPalette.ColorRole.BrightText, Qt.GlobalColor.red)
+        palette.setColor(QPalette.ColorRole.Link, QColor(42, 130, 218))
+        palette.setColor(QPalette.ColorRole.Highlight, QColor(42, 130, 218))
+        palette.setColor(QPalette.ColorRole.HighlightedText, Qt.GlobalColor.black)
+        self.setPalette(palette)
+
+    # ------------------------------------------------------------- Import
+
+    def _import_file(self, file_type: str):
+        """Ask for a file of ``file_type`` and load its text into the editor.
+
+        ``file_type`` is one of the ``_IMPORT_FORMATS`` keys; other values
+        are ignored. The editor is left untouched when the dialog is
+        cancelled or no text could be extracted.
+        """
+        spec = self._IMPORT_FORMATS.get(file_type)
+        if spec is None:
+            return
+        title, name_filter, extractor_name = spec
+
+        file_path, _ = QFileDialog.getOpenFileName(self, title, "", name_filter)
+        if not file_path:
+            return
+
+        text = getattr(self, extractor_name)(file_path)
+        if not text:
+            # Extractors return "" on failure (after showing the error);
+            # do not wipe what the user already has in the editor.
+            self.statusBar().showMessage(f"No text loaded from: {Path(file_path).name}")
+            return
+
+        self.text_edit.setPlainText(text)
+        self.statusBar().showMessage(f"Loaded: {Path(file_path).name}")
+
+    def _extract_txt_text(self, file_path: str) -> str:
+        """Read a UTF-8 text file; show an error and return "" on failure."""
+        try:
+            # utf-8-sig also accepts plain UTF-8 and drops a leading BOM.
+            with open(file_path, 'r', encoding='utf-8-sig') as f:
+                return f.read()
+        except (OSError, UnicodeError) as e:
+            QMessageBox.critical(self, "Error", f"Failed to read TXT: {e}")
+            return ""
+
+    def _extract_pdf_text(self, file_path: str) -> str:
+        """Concatenate the text of every PDF page; "" on failure."""
+        try:
+            doc = fitz.open(file_path)
+            try:
+                return "".join(page.get_text() for page in doc)
+            finally:
+                doc.close()
+        except Exception as e:
+            QMessageBox.critical(self, "Error", f"Failed to read PDF: {e}")
+            return ""
+
+    def _extract_docx_text(self, file_path: str) -> str:
+        """Join the DOCX paragraphs with newlines; "" on failure."""
+        try:
+            doc = Document(file_path)
+            return "\n".join(para.text for para in doc.paragraphs)
+        except Exception as e:
+            QMessageBox.critical(self, "Error", f"Failed to read DOCX: {e}")
+            return ""
+
+    def _extract_epub_text(self, file_path: str) -> str:
+        """Extract the text of every document item of an EPUB; "" on failure."""
+        try:
+            book = epub.read_epub(file_path)
+            chunks = []
+            for item in book.get_items():
+                if item.get_type() == ebooklib.ITEM_DOCUMENT:
+                    soup = BeautifulSoup(item.get_content(), 'html.parser')
+                    chunks.append(soup.get_text() + "\n\n")
+            return "".join(chunks)
+        except Exception as e:
+            QMessageBox.critical(self, "Error", f"Failed to read EPUB: {e}")
+            return ""
+
+    # ----------------------------------------------------------- Playback
+
+    def _play(self):
+        """Resume paused playback, or start generating speech for the editor text."""
+        if self.is_paused:
+            # The is_paused flag is the source of truth. get_busy() is not
+            # consulted because its value while paused differs between
+            # pygame versions.
+            if AUDIO_AVAILABLE:
+                pygame.mixer.music.unpause()
+            self.is_paused = False
+            self.btn_play.setEnabled(False)
+            self.btn_pause.setEnabled(True)
+            self.btn_stop.setEnabled(True)
+            self.statusBar().showMessage("Resumed")
+            return
+
+        text = self.text_edit.toPlainText().strip()
+        if not text:
+            QMessageBox.warning(self, "No Text", "Please enter or import text first.")
+            return
+
+        if not self.tts_engine.voices:
+            QMessageBox.warning(self, "No Voices", "No Piper voices found. Please install voice models.")
+            return
+
+        voice = self.voice_combo.currentText()
+        speed = self.speed_slider.value() / 10.0
+
+        # Split into sentences for highlighting
+        self.sentences = self._split_text_into_sentences(text)
+        self.current_sentence_idx = 0
+
+        # Generate speech in the background
+        self.tts_worker = TTSWorker(self.tts_engine, text, voice, speed)
+        self.tts_worker.progress.connect(self._on_tts_progress)
+        self.tts_worker.finished.connect(self._on_tts_finished)
+        self.tts_worker.error.connect(self._on_tts_error)
+        self.tts_worker.sentence_started.connect(self._on_sentence_started)
+        self.tts_worker.start()
+
+        self.progress_bar.setVisible(True)
+        self.progress_bar.setValue(0)
+        self.btn_play.setEnabled(False)
+        self.btn_pause.setEnabled(False)
+        self.btn_stop.setEnabled(True)
+        self.statusBar().showMessage("Generating speech...")
+        self.is_playing = True
+
+    def _pause(self):
+        """Pause audio playback; the poll timer idles while ``is_paused`` is set."""
+        if AUDIO_AVAILABLE:
+            pygame.mixer.music.pause()
+        self.is_paused = True
+        self.btn_play.setEnabled(True)
+        self.btn_pause.setEnabled(False)
+        self.statusBar().showMessage("Paused")
+
+    def _stop(self):
+        """Stop generation and playback, delete the generated audio, reset the UI.
+
+        NOTE(review): because the audio file is deleted here and this also
+        runs when playback ends, Export only works while audio is playing
+        or paused. Confirm that is intended.
+        """
+        self._finish_playback(keep_audio=False)
+
+    def _finish_playback(self, keep_audio: bool):
+        """Shared teardown for ``_stop``; optionally keeps the generated file."""
+        # Stop polling first so a stale tick cannot tear down a later session.
+        self._stop_highlight_timer()
+
+        if self.tts_worker and self.tts_worker.isRunning():
+            self.tts_worker.stop()
+            self.tts_worker.wait()
+
+        if AUDIO_AVAILABLE:
+            pygame.mixer.music.stop()
+            # Release the file handle (pygame >= 2.0) so the file can be
+            # deleted on platforms that lock open files.
+            unload = getattr(pygame.mixer.music, "unload", None)
+            if unload is not None:
+                unload()
+
+        if not keep_audio:
+            if self.current_audio_file:
+                self._remove_file(self.current_audio_file)
+            self.current_audio_file = None
+
+        self.is_playing = False
+        self.is_paused = False
+
+        self._clear_highlighting()
+
+        self.progress_bar.setVisible(False)
+        self.btn_play.setEnabled(True)
+        self.btn_pause.setEnabled(False)
+        self.btn_stop.setEnabled(False)
+        self.statusBar().showMessage("Stopped")
+
+    @staticmethod
+    def _remove_file(path: str):
+        """Delete ``path``; cleanup is best effort, so OS errors are ignored."""
+        try:
+            os.unlink(path)
+        except OSError:
+            pass
+
+    def _on_tts_progress(self, value: int):
+        """Show generation progress (percentage)."""
+        self.progress_bar.setValue(value)
+
+    def _on_tts_finished(self, audio_file: str):
+        """Take ownership of the generated file and start playback."""
+        if not self.is_playing:
+            # Stop was pressed while this result was already queued.
+            self._remove_file(audio_file)
+            return
+
+        if self.current_audio_file and self.current_audio_file != audio_file:
+            self._remove_file(self.current_audio_file)
+        self.current_audio_file = audio_file
+        self.progress_bar.setVisible(False)
+
+        if not AUDIO_AVAILABLE:
+            QMessageBox.information(
+                self,
+                "Audio Generated",
+                f"Audio file generated: {audio_file}\n\nInstall pygame for playback."
+            )
+            # Keep the file that was just announced so it can be exported.
+            self._finish_playback(keep_audio=True)
+            self.statusBar().showMessage("Audio generated")
+            return
+
+        try:
+            pygame.mixer.music.load(audio_file)
+            pygame.mixer.music.play()
+        except pygame.error as e:
+            QMessageBox.critical(self, "Playback Error", f"Failed to play audio: {e}")
+            self._stop()
+            return
+
+        self.btn_pause.setEnabled(True)
+        self.statusBar().showMessage("Playing...")
+
+        # Always poll: the timer also detects the end of playback, which is
+        # needed even when highlighting is switched off.
+        self._start_highlighting()
+
+    def _on_tts_error(self, error_msg: str):
+        """Report a generation failure and reset the UI."""
+        QMessageBox.critical(self, "TTS Error", f"Failed to generate speech:\n{error_msg}")
+        self._stop()
+
+    def _on_sentence_started(self, index: int):
+        """Record which sentence the worker is currently generating."""
+        self.current_sentence_idx = index
+
+    def _split_text_into_sentences(self, text: str) -> List[str]:
+        """Split ``text`` after runs of ``.``/``!``/``?`` followed by whitespace.
+
+        Returns the stripped, non-empty sentences, or ``[text]`` when the
+        split produced nothing.
+        """
+        # Odd indices are the captured delimiters, even indices the bodies.
+        parts = re.split(r'([.!?]+\s+)', text)
+        result = []
+        current = ""
+
+        for i, part in enumerate(parts):
+            current += part
+            if i % 2 == 1:
+                if current.strip():
+                    result.append(current.strip())
+                current = ""
+
+        if current.strip():
+            result.append(current.strip())
+
+        return result if result else [text]
+
+    # -------------------------------------------------------- Highlighting
+
+    def _start_highlighting(self):
+        """Start polling playback and highlight the first sentence.
+
+        Sentence timing is an estimate: each sentence gets a share of the
+        audio duration proportional to its character count.
+        """
+        self._stop_highlight_timer()
+
+        self.current_sentence_idx = 0
+        self._played_ms = 0.0
+        self._sentence_spans = self._locate_sentences()
+        self._sentence_ends_ms = self._estimate_sentence_ends()
+
+        self._highlight_timer = QTimer(self)
+        self._highlight_timer.timeout.connect(self._update_highlighting)
+        self._highlight_timer.start(self._PLAYBACK_POLL_MS)
+
+        self._highlight_current_sentence()
+
+    def _stop_highlight_timer(self):
+        """Stop and release the playback poll timer, if any."""
+        if self._highlight_timer is not None:
+            self._highlight_timer.stop()
+            self._highlight_timer.deleteLater()
+            self._highlight_timer = None
+
+    @staticmethod
+    def _utf16_len(text: str) -> int:
+        """Length of ``text`` in UTF-16 code units.
+
+        QTextCursor positions count UTF-16 units, so characters outside the
+        Basic Multilingual Plane (e.g. emoji) occupy two positions.
+        """
+        return len(text.encode("utf-16-le", "surrogatepass")) // 2
+
+    def _locate_sentences(self) -> List[Optional[tuple]]:
+        """Find each sentence in the editor, searching forward only.
+
+        Returns one ``(start, length)`` pair per sentence in QTextCursor
+        (UTF-16) units, or None for a sentence that is not found. Searching
+        from the end of the previous match keeps repeated sentences from
+        all resolving to their first occurrence.
+        """
+        text = self.text_edit.toPlainText()
+        spans: List[Optional[tuple]] = []
+        scanned = 0  # index into ``text`` up to which offsets are known
+        scanned_units = 0  # the same offset in UTF-16 units
+
+        for sentence in self.sentences:
+            start = text.find(sentence, scanned)
+            if start < 0:
+                spans.append(None)
+                continue
+            scanned_units += self._utf16_len(text[scanned:start])
+            length_units = self._utf16_len(sentence)
+            spans.append((scanned_units, length_units))
+            scanned_units += length_units
+            scanned = start + len(sentence)
+
+        return spans
+
+    def _audio_duration_ms(self) -> float:
+        """Duration of the current audio file in ms, or 0.0 if unreadable."""
+        if not self.current_audio_file:
+            return 0.0
+        try:
+            with wave.open(self.current_audio_file, 'rb') as wav:
+                rate = wav.getframerate()
+                return wav.getnframes() / rate * 1000.0 if rate else 0.0
+        except (wave.Error, OSError, EOFError):
+            return 0.0
+
+    def _estimate_sentence_ends(self) -> List[float]:
+        """Estimate the playback time (ms) at which each sentence ends."""
+        lengths = [len(sentence) for sentence in self.sentences]
+        total_chars = sum(lengths)
+        if total_chars == 0:
+            return []
+
+        total_ms = self._audio_duration_ms()
+        if total_ms <= 0:
+            # Rough estimate: 150 words per minute at 1× speed
+            speed = self.speed_slider.value() / 10.0
+            total_ms = total_chars / (self._CHARS_PER_MINUTE * speed) * 60000
+
+        ends = []
+        seen = 0
+        for length in lengths:
+            seen += length
+            ends.append(total_ms * seen / total_chars)
+        return ends
+
+    def _update_highlighting(self):
+        """Poll-timer slot: detect end of playback and advance the highlight."""
+        if self.is_paused:
+            # Depending on the pygame version get_busy() is False while
+            # paused, which must not be mistaken for the end of playback.
+            return
+
+        if not AUDIO_AVAILABLE or not pygame.mixer.music.get_busy():
+            self._stop()
+            return
+
+        self._played_ms += self._PLAYBACK_POLL_MS
+
+        idx = self.current_sentence_idx
+        last = len(self._sentence_ends_ms) - 1
+        while idx < last and self._played_ms >= self._sentence_ends_ms[idx]:
+            idx += 1
+
+        if idx != self.current_sentence_idx:
+            self.current_sentence_idx = idx
+            self._highlight_current_sentence()
+
+    def _highlight_current_sentence(self):
+        """Highlight the sentence at ``current_sentence_idx`` and scroll to it."""
+        self._clear_highlighting()
+
+        if not self.highlight_checkbox.isChecked():
+            return
+        if self.current_sentence_idx >= len(self._sentence_spans):
+            return
+        span = self._sentence_spans[self.current_sentence_idx]
+        if span is None:
+            return
+
+        start, length = span
+        # The text may have been edited since the spans were computed.
+        last_position = self.text_edit.document().characterCount() - 1
+        if start + length > last_position:
+            return
+
+        cursor = self.text_edit.textCursor()
+        cursor.setPosition(start)
+        cursor.setPosition(start + length, QTextCursor.MoveMode.KeepAnchor)
+
+        fmt = QTextCharFormat()
+        fmt.setBackground(QColor(255, 255, 0, 100))  # Yellow highlight
+        cursor.setCharFormat(fmt)
+
+        # Scroll to the current sentence
+        self.text_edit.setTextCursor(cursor)
+        self.text_edit.ensureCursorVisible()
+
+    def _clear_highlighting(self):
+        """Reset the character format of the whole document."""
+        cursor = self.text_edit.textCursor()
+        cursor.select(QTextCursor.SelectionType.Document)
+        cursor.setCharFormat(QTextCharFormat())
+
+    # -------------------------------------------------------------- Export
+
+    def _export_audio(self):
+        """Save the current audio as WAV, or as MP3 via ffmpeg.
+
+        When the chosen name has neither a ``.wav`` nor an ``.mp3``
+        extension, the extension of the selected dialog filter is appended.
+        If MP3 conversion fails the audio is saved as WAV instead.
+        """
+        if not self.current_audio_file or not Path(self.current_audio_file).exists():
+            QMessageBox.warning(
+                self,
+                "No Audio",
+                "Please generate audio first by clicking Play."
+            )
+            return
+
+        file_path, selected_filter = QFileDialog.getSaveFileName(
+            self,
+            "Export Audio",
+            "",
+            "WAV Audio (*.wav);;MP3 Audio (*.mp3)"
+        )
+        if not file_path:
+            return
+
+        import shutil
+
+        target = Path(file_path)
+        suffix = target.suffix.lower()
+        if suffix not in (".wav", ".mp3"):
+            suffix = ".mp3" if "mp3" in (selected_filter or "").lower() else ".wav"
+            target = target.with_name(target.name + suffix)
+
+        try:
+            if suffix == ".mp3":
+                failure = self._convert_to_mp3(target)
+                if failure is not None:
+                    QMessageBox.warning(
+                        self,
+                        "MP3 Conversion Failed",
+                        f"{failure} Saving as WAV instead."
+                    )
+                    # Only the final extension is swapped; '.mp3' elsewhere
+                    # in the path is left alone.
+                    target = target.with_suffix(".wav")
+                    suffix = ".wav"
+
+            if suffix == ".wav":
+                shutil.copy(self.current_audio_file, target)
+        except OSError as e:
+            QMessageBox.critical(
+                self,
+                "Export Failed",
+                f"Failed to export audio:\n{e}"
+            )
+            return
+
+        QMessageBox.information(
+            self,
+            "Export Successful",
+            f"Audio saved to:\n{target}"
+        )
+
+    def _convert_to_mp3(self, target: Path) -> Optional[str]:
+        """Convert the current audio to ``target`` with ffmpeg.
+
+        Returns None on success, otherwise a short failure description.
+        """
+        try:
+            subprocess.run(
+                [
+                    'ffmpeg', '-y', '-i', self.current_audio_file,
+                    '-codec:a', 'libmp3lame', '-qscale:a', '2',
+                    str(target)
+                ],
+                check=True,
+                # -y plus a closed stdin: ffmpeg must never block on an
+                # overwrite prompt (the save dialog already confirmed it).
+                stdin=subprocess.DEVNULL,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+            )
+        except FileNotFoundError:
+            return "ffmpeg not found."
+        except (OSError, subprocess.CalledProcessError) as e:
+            return f"ffmpeg failed: {e}."
+        return None
+
+    # ------------------------------------------------------------ Settings
+
+    def _load_settings(self):
+        """Restore voice, speed and dark mode from ``QSettings``."""
+        saved_voice = self.settings.value("voice")
+        if saved_voice:
+            index = self.voice_combo.findText(saved_voice)
+            if index >= 0:
+                self.voice_combo.setCurrentIndex(index)
+
+        saved_speed = self.settings.value("speed")
+        if saved_speed:
+            try:
+                self.speed_slider.setValue(int(round(float(saved_speed) * 10)))
+            except (TypeError, ValueError):
+                pass  # corrupt setting: keep the default speed
+
+        saved_dark = self.settings.value("dark_mode")
+        if saved_dark is not None:
+            # Case-insensitive so values written as str(bool) ("True") by
+            # earlier runs are honoured as well.
+            self.dark_mode_checkbox.setChecked(
+                str(saved_dark).strip().lower() == "true"
+            )
+
+    def closeEvent(self, event):
+        """Persist settings and release playback resources on close."""
+        self.settings.setValue("voice", self.voice_combo.currentText())
+        self.settings.setValue("speed", str(self.speed_slider.value() / 10.0))
+        self.settings.setValue(
+            "dark_mode", "true" if self.dark_mode_checkbox.isChecked() else "false"
+        )
+
+        self._stop()
+
+        event.accept()
+
+
+def main():
+ """Application entry point"""
+ app = QApplication(sys.argv)
+ app.setApplicationName("Speechify Desktop Clone")
+ app.setOrganizationName("SpeechifyLocal")
+
+ # Set app-wide font
+ font = QFont("Arial", 10)
+ app.setFont(font)
+
+ window = SpeechifyDesktop()
+ window.show()
+
+ sys.exit(app.exec())
+
+
+if __name__ == "__main__":
+ main()
diff --git a/start-fresh.bat b/start-fresh.bat
new file mode 100644
index 0000000..f424451
--- /dev/null
+++ b/start-fresh.bat
@@ -0,0 +1,20 @@
+@echo off
+REM Restart helper: stops running Python processes, starts app.py and opens
+REM the app in the default browser.
+echo ====================================
+echo OpenWebTTS - Clear Cache Helper
+echo ====================================
+echo.
+echo This script will help you see the latest changes.
+echo.
+echo Step 1: Stopping any running Python servers...
+REM WARNING: /IM python.exe matches every python.exe process this user may
+REM terminate, not only OpenWebTTS; unrelated Python programs are killed too.
+taskkill /F /IM python.exe /T 2>nul
+timeout /t 2 /nobreak >nul
+echo.
+echo Step 2: Starting OpenWebTTS...
+echo.
+echo Opening browser in 3 seconds...
+echo.
+echo IMPORTANT: When browser opens, press Ctrl+Shift+R to hard refresh!
+echo.
+REM "python app.py" blocks until the server exits, so the browser is opened
+REM from a separate minimized shell after a 3 second delay. Opening it before
+REM the server process is started would load a connection-refused page.
+REM If start-up takes longer than the delay, refresh the page manually.
+start "" /min cmd /c "timeout /t 3 /nobreak >nul & start http://localhost:5000"
+python app.py
diff --git a/static/css/pre.css b/static/css/pre.css
index 914c696..4a03f10 100644
--- a/static/css/pre.css
+++ b/static/css/pre.css
@@ -58,8 +58,317 @@ button, a.button {
width: inherit;
}
+/* PDF viewer root: positioned so its z-index takes effect. */
+#pdf-viewer {
+ position: relative;
+ z-index: 1;
+}
+
+/* Framed card around a rendered PDF page; position: relative makes it the
+   containing block for absolutely positioned children. */
+.pdf-page-wrapper {
+ position: relative;
+ margin-bottom: 40px;
+ padding: 25px;
+ border: 5px solid #000;
+ border-radius: 6px;
+ background: white;
+ box-shadow: 0 4px 12px rgba(0, 0, 0, 0.1);
+ transition: box-shadow 0.3s ease;
+ /* Legacy and current properties asking not to split a page card across a
+    printed page or column. */
+ page-break-inside: avoid;
+ break-inside: avoid;
+}
+
+/* Slightly stronger shadow on hover (animated by the transition above). */
+.pdf-page-wrapper:hover {
+ box-shadow: 0 6px 16px rgba(0, 0, 0, 0.15);
+}
+
+.pdf-page-wrapper canvas {
+ display: block;
+ position: relative;
+}
+
+/* Dark theme variants: applied under any ancestor carrying the .dark class. */
+.dark .pdf-page-wrapper {
+ border-color: #6b7280;
+ background: #1f2937;
+ box-shadow: 0 4px 12px rgba(0, 0, 0, 0.3);
+}
+
+.dark .pdf-page-wrapper:hover {
+ box-shadow: 0 6px 16px rgba(0, 0, 0, 0.5);
+}
+
+/* Separator elements are hidden; spacing comes from the wrapper's margin. */
+.pdf-page-separator {
+ display: none;
+}
+
#pdf-viewer canvas {
max-width: 100%;
+ position: relative;
+ z-index: 1;
+ display: block;
+}
+
+/* Positioned wrapper: containing block for the absolutely positioned
+   overlay layers below. */
+#pdf-viewer-wrapper {
+ position: relative;
+}
+
+/* Full-size overlay for highlights; never intercepts pointer events and is
+   stacked above the text layers (z-index 10). */
+#highlight-layer {
+ position: absolute;
+ top: 0;
+ left: 0;
+ width: 100%;
+ height: 100%;
+ pointer-events: none;
+ z-index: 10;
+}
+
+/* Full-size overlay holding the clickable text chunks. The layer itself
+   ignores pointer events and is not selectable. */
+#pdf-text-layer {
+ position: absolute;
+ top: 0;
+ left: 0;
+ width: 100%;
+ height: 100%;
+ pointer-events: none;
+ z-index: 1;
+ overflow: visible;
+ user-select: none;
+}
+
+/* Per-page container inside the text layer; also transparent to the pointer. */
+.pdf-text-layer-page {
+ pointer-events: none;
+ width: 100%;
+ height: 100%;
+ position: relative;
+}
+
+/* PDF.js standard text layer for text selection and copying.
+   Stretched over its positioned ancestor (all four offsets are 0), accepts
+   pointer events and allows text selection. z-index 3 places it above
+   #pdf-text-layer (z-index 1) by default. */
+.textLayer {
+ position: absolute;
+ text-align: initial;
+ left: 0;
+ top: 0;
+ right: 0;
+ bottom: 0;
+ overflow: hidden;
+ opacity: 1;
+ line-height: 1;
+ /* Prevent automatic text inflation (prefixed and standard forms). */
+ -webkit-text-size-adjust: none;
+ -moz-text-size-adjust: none;
+ text-size-adjust: none;
+ forced-color-adjust: none;
+ z-index: 3;
+ pointer-events: auto;
+ /* Selectable text (standard property plus vendor prefixes). */
+ user-select: text;
+ -webkit-user-select: text;
+ -moz-user-select: text;
+ -ms-user-select: text;
+}
+
+/* Hover feedback for text spans.
+   The previous selector, span:not(:has(::selection)):hover, was invalid
+   because pseudo-elements are not allowed inside :has(), so browsers dropped
+   the whole rule and the effect never appeared. A plain :hover is used
+   instead. Only the background is changed: the rule used to also set
+   transform: scale(1.02), which is left out on purpose because it would
+   replace any transform the text layer uses to fit each span to its glyphs
+   (NOTE(review): confirm against the bundled PDF.js text layer). */
+.textLayer span:hover {
+    background-color: rgba(59, 130, 246, 0.12) !important;
+}
+
+/* Text layer items: invisible (transparent) but selectable text, absolutely
+   positioned, with transforms applied from the top-left corner. */
+.textLayer span,
+.textLayer br {
+ color: transparent;
+ position: absolute;
+ white-space: pre;
+ cursor: pointer;
+ transform-origin: 0% 0%;
+ transition: background-color 0.2s ease, transform 0.1s ease;
+ border-radius: 2px;
+ user-select: text;
+ -webkit-user-select: text;
+}
+
+/* Marked-content wrapper spans occupy no height of their own. */
+.textLayer span.markedContent {
+ top: 0;
+ height: 0;
+}
+
+/* Selection colour inside the text layer (standard and Firefox-prefixed). */
+.textLayer ::selection {
+ background: rgba(59, 130, 246, 0.5);
+ color: inherit;
+}
+
+.textLayer ::-moz-selection {
+ background: rgba(59, 130, 246, 0.5);
+ color: inherit;
+}
+
+/* Line breaks show no selection background. */
+.textLayer br::selection {
+ background: transparent;
+}
+
+/* Highlight boxes inside the text layer. The negative margin and equal
+   padding grow the painted area by 1px without shifting the text. */
+.textLayer .highlight {
+ margin: -1px;
+ padding: 1px;
+ background-color: rgba(180, 0, 170, 0.2);
+ border-radius: 4px;
+}
+
+.textLayer .highlight.appended {
+ position: initial;
+}
+
+/* begin / middle / end: round only the outer corners of a multi-part highlight. */
+.textLayer .highlight.begin {
+ border-radius: 4px 0 0 4px;
+}
+
+.textLayer .highlight.end {
+ border-radius: 0 4px 4px 0;
+}
+
+.textLayer .highlight.middle {
+ border-radius: 0;
+}
+
+/* The selected highlight uses the blue accent instead of magenta. */
+.textLayer .highlight.selected {
+ background-color: rgba(59, 130, 246, 0.3);
+}
+
+/* Selectable text overlay: transparent text that accepts pointer events and
+   can be selected; stacked at z-index 3 like .textLayer. */
+.pdf-selectable-text-layer {
+ position: absolute;
+ pointer-events: auto;
+ z-index: 3;
+ user-select: text;
+ -webkit-user-select: text;
+ -moz-user-select: text;
+ -ms-user-select: text;
+ cursor: text;
+ line-height: 1;
+ color: transparent;
+}
+
+/* Spans stay transparent even if another rule sets a colour (!important). */
+.pdf-selectable-text-layer span {
+ user-select: text;
+ -webkit-user-select: text;
+ -moz-user-select: text;
+ -ms-user-select: text;
+ color: transparent !important;
+ position: absolute;
+}
+
+/* Faint yellow background on hover to reveal span alignment: every span
+   while the layer is hovered, stronger on the span under the pointer.
+   The text itself remains transparent. */
+.pdf-selectable-text-layer:hover span {
+ background: rgba(255, 255, 0, 0.05);
+}
+
+.pdf-selectable-text-layer span:hover {
+ background: rgba(255, 255, 0, 0.15);
+}
+
+/* Selection colour for the layer and its spans. */
+.pdf-selectable-text-layer::selection {
+ background: rgba(59, 130, 246, 0.3);
+}
+
+.pdf-selectable-text-layer span::selection {
+ background: rgba(59, 130, 246, 0.3);
+}
+
+/* Chunk overlay boxes. By default they are decorative only: no pointer
+   events, no visible text (font-size / line-height 0), faint blue tint. */
+.pdf-text-chunk {
+ pointer-events: none;
+ cursor: default;
+ transition: all 0.2s cubic-bezier(0.4, 0, 0.2, 1);
+ font-size: 0;
+ line-height: 0;
+ border: 2px solid rgba(59, 130, 246, 0.2);
+ background-color: rgba(59, 130, 246, 0.08);
+ border-radius: 3px;
+ user-select: none;
+ opacity: 0.6;
+ z-index: 1;
+ will-change: opacity, transform, z-index;
+}
+
+/* Make chunks more visible and clickable when Ctrl key is held
+   (the page script toggles the ctrl-held class on <body>). */
+body.ctrl-held .pdf-text-chunk {
+ pointer-events: auto;
+ cursor: pointer;
+ border-color: rgba(59, 130, 246, 0.5);
+ background-color: rgba(59, 130, 246, 0.28);
+ opacity: 1;
+ z-index: 100;
+ transform: translateZ(0);
+}
+
+/* While Ctrl is held the chunk layer is raised (z-index 10) and both
+   selectable text layers are lowered and made click-through, so clicks
+   reach the chunks instead of starting a text selection. */
+body.ctrl-held #pdf-text-layer {
+ z-index: 10;
+}
+
+body.ctrl-held .textLayer {
+ pointer-events: none;
+ z-index: 1;
+}
+
+body.ctrl-held .pdf-selectable-text-layer {
+ pointer-events: none;
+ z-index: 1;
+}
+
+/* Hovered chunk: stronger tint, glow ring, slight enlargement, raised above
+   its siblings. */
+body.ctrl-held .pdf-text-chunk:hover {
+ background-color: rgba(59, 130, 246, 0.4) !important;
+ border-color: rgba(59, 130, 246, 0.7);
+ box-shadow: 0 0 0 3px rgba(59, 130, 246, 0.25),
+ 0 4px 12px rgba(59, 130, 246, 0.3);
+ transform: scale(1.03) translateZ(0);
+ z-index: 200;
+}
+
+/* Pressed chunk: shrinks slightly as click feedback. */
+body.ctrl-held .pdf-text-chunk:active {
+ background-color: rgba(59, 130, 246, 0.45) !important;
+ border-color: rgba(59, 130, 246, 0.8);
+ transform: scale(0.98);
+}
+
+/* Dark theme colours for chunks while Ctrl is held.
+   The original selector only matched when .dark sat on an element between
+   <body> and the chunk, while the hint rule in this stylesheet
+   (body.ctrl-held.dark::after) expects .dark on <body> itself. Each rule
+   now lists all three placements (<html>, <body>, or a descendant
+   container) so the dark colours apply wherever the class is set. */
+.dark body.ctrl-held .pdf-text-chunk,
+body.ctrl-held.dark .pdf-text-chunk,
+body.ctrl-held .dark .pdf-text-chunk {
+    border-color: rgba(96, 165, 250, 0.3);
+    background-color: rgba(96, 165, 250, 0.15);
+}
+
+.dark body.ctrl-held .pdf-text-chunk:hover,
+body.ctrl-held.dark .pdf-text-chunk:hover,
+body.ctrl-held .dark .pdf-text-chunk:hover {
+    background-color: rgba(96, 165, 250, 0.4) !important;
+    border-color: rgba(96, 165, 250, 0.7);
+    box-shadow: 0 0 0 2px rgba(96, 165, 250, 0.5), 0 2px 8px rgba(96, 165, 250, 0.4);
+}
+
+/* Visual indicator when Ctrl is held: a centred banner near the bottom of
+   the viewport, generated from <body>'s ::after pseudo-element. */
+body.ctrl-held::after {
+    content: '🖱️ Click on highlighted chunks to start reading from that point';
+    position: fixed;
+    bottom: 30px;
+    left: 50%;
+    /* A generic body::after rule later in this stylesheet sets right: 20px
+       and opacity: 0. Without the two resets below the banner was stretched
+       between left: 50% and right: 20px, and vanished as soon as the
+       entrance animation ended. */
+    right: auto;
+    opacity: 1;
+    transform: translateX(-50%);
+    background: linear-gradient(135deg, rgba(59, 130, 246, 0.98), rgba(99, 102, 241, 0.98));
+    color: white;
+    padding: 12px 24px;
+    border-radius: 12px;
+    font-size: 14px;
+    font-weight: 500;
+    box-shadow: 0 8px 24px rgba(59, 130, 246, 0.4),
+                0 2px 8px rgba(0, 0, 0, 0.2);
+    z-index: 9999;
+    pointer-events: none;
+    animation: slideUpFade 0.3s cubic-bezier(0.4, 0, 0.2, 1);
+    backdrop-filter: blur(8px);
+    letter-spacing: 0.3px;
+}
+
+/* Entrance animation: fade in while rising 10px. translateX(-50%) is
+   repeated in both frames so the banner stays horizontally centred. */
+@keyframes slideUpFade {
+    from {
+        opacity: 0;
+        transform: translateX(-50%) translateY(10px);
+    }
+    to {
+        opacity: 1;
+        transform: translateX(-50%) translateY(0);
+    }
+}
+
+/* Dark theme banner colours; matches .dark on <body> or on <html>. */
+body.ctrl-held.dark::after,
+.dark body.ctrl-held::after {
+    background: linear-gradient(135deg, rgba(96, 165, 250, 0.98), rgba(129, 140, 248, 0.98));
+    box-shadow: 0 8px 24px rgba(96, 165, 250, 0.4),
+                0 2px 8px rgba(0, 0, 0, 0.4);
+}
.highlight {
@@ -78,6 +387,7 @@ button, a.button {
border-radius: inherit;
margin-right: inherit;
padding: inherit;
+ z-index: 11;
}
/* Modal Styles */
@@ -255,4 +565,190 @@ i.fas {
height: auto;
border-radius: 4px;
margin: 1em 0;
-}
\ No newline at end of file
+}
+/* PDF Reading Tools Enhancements */
+/* Search bar: animate any property change. */
+#pdf-search-bar {
+ transition: all 0.3s ease-in-out;
+}
+
+#pdf-search-input {
+ transition: border-color 0.2s ease-in-out;
+}
+
+/* Focus ring drawn with box-shadow instead of the default outline. */
+#pdf-search-input:focus {
+ border-color: var(--color-indigo-500);
+ outline: none;
+ box-shadow: 0 0 0 3px rgba(99, 102, 241, 0.1);
+}
+
+#pdf-search-results {
+ animation: fadeIn 0.2s ease-in-out;
+}
+
+/* Fade in while sliding down 5px into place. */
+@keyframes fadeIn {
+ from { opacity: 0; transform: translateY(-5px); }
+ to { opacity: 1; transform: translateY(0); }
+}
+
+/* PDF page controls styling */
+/* Hide the number input's spin buttons (WebKit pseudo-elements). */
+#page-input::-webkit-outer-spin-button,
+#page-input::-webkit-inner-spin-button {
+ -webkit-appearance: none;
+ margin: 0;
+}
+
+/* Firefox: render the number input like a plain text field. */
+#page-input[type=number] {
+ -moz-appearance: textfield;
+}
+
+#page-input:focus {
+ border-color: var(--color-indigo-500);
+ outline: none;
+ box-shadow: 0 0 0 2px rgba(99, 102, 241, 0.1);
+}
+
+/* Zoom level display */
+#zoom-level {
+ font-weight: 600;
+ letter-spacing: 0.5px;
+}
+
+/* Button hover effects for PDF tools */
+#pagination-controls button {
+ transition: all 0.2s ease-in-out;
+}
+
+/* Lift by 1px on hover, settle back while pressed. */
+#pagination-controls button:hover {
+ transform: translateY(-1px);
+ box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
+}
+
+#pagination-controls button:active {
+ transform: translateY(0);
+}
+
+/* Book info panel responsive width */
+/* The minimum width only applies from 768px viewport width upwards. */
+@media (min-width: 768px) {
+ #book-info {
+ min-width: 450px;
+ }
+}
+
+/* Search result highlighting in PDF */
+.pdf-search-highlight {
+ background-color: rgba(255, 255, 0, 0.3);
+ border: 2px solid rgba(255, 200, 0, 0.5);
+ animation: pulseHighlight 1s ease-in-out;
+}
+
+/* One pulse: dip to 70% opacity at the midpoint and return to full opacity. */
+@keyframes pulseHighlight {
+ 0%, 100% { opacity: 1; }
+ 50% { opacity: 0.7; }
+}
+
+/* Smooth transitions for PDF viewer */
+#pdf-viewer canvas {
+ transition: opacity 0.2s ease-in-out;
+}
+
+/* Improved notification styles */
+/* Two chained animations: slide in from the right (0.3s), then fadeOut
+   starts after a 4.7s delay and runs 0.3s; "forwards" keeps its final state.
+   NOTE(review): the fadeOut keyframes are not defined in this part of the
+   stylesheet - confirm they exist elsewhere. */
+.notification {
+ animation: slideInRight 0.3s cubic-bezier(0.4, 0, 0.2, 1),
+ fadeOut 0.3s ease-in-out 4.7s forwards;
+ backdrop-filter: blur(8px);
+ box-shadow: 0 4px 16px rgba(0, 0, 0, 0.1),
+ 0 2px 4px rgba(0, 0, 0, 0.06);
+}
+
+/* Enter from fully off to the right (100% of own width) while fading in. */
+@keyframes slideInRight {
+ from {
+ transform: translateX(100%);
+ opacity: 0;
+ }
+ to {
+ transform: translateX(0);
+ opacity: 1;
+ }
+}
+
+/* Text hover instruction tooltip */
+/* Shown only for spans that carry a data-text attribute. The tooltip sits
+   above the span, horizontally centred on it, and never intercepts the
+   pointer. It starts invisible and is revealed by fadeInTooltip after a
+   0.5s hover delay; "forwards" keeps it visible afterwards. */
+.textLayer span[data-text]:hover::after {
+ content: 'Click to read from here';
+ position: absolute;
+ bottom: 100%;
+ left: 50%;
+ transform: translateX(-50%) translateY(-8px);
+ background: rgba(30, 30, 30, 0.95);
+ color: white;
+ padding: 6px 12px;
+ border-radius: 6px;
+ font-size: 11px;
+ white-space: nowrap;
+ pointer-events: none;
+ z-index: 1000;
+ opacity: 0;
+ animation: fadeInTooltip 0.2s ease-in-out 0.5s forwards;
+ box-shadow: 0 2px 8px rgba(0, 0, 0, 0.2);
+}
+
+/* Show Alt+Q hint when text is selected.
+   The hint is generated from <body>'s ::after pseudo-element and is shown
+   while <body> carries the has-selection class.
+   (A former "::selection { position: relative; }" rule was dropped here:
+   position is not a property that applies to ::selection, so it had no
+   effect.) */
+
+/* Hidden resting state. Scoped with :not(.ctrl-held) because the Ctrl hint
+   (body.ctrl-held::after) uses the same pseudo-element; an unscoped
+   body::after would leak right: 20px and opacity: 0 into that banner. */
+body:not(.ctrl-held)::after {
+    content: '';
+    position: fixed;
+    bottom: 20px;
+    right: 20px;
+    padding: 0;
+    opacity: 0;
+    pointer-events: none;
+    transition: opacity 0.2s ease;
+}
+
+/* Visible state. Not applied while Ctrl is held so that the two hints do
+   not overwrite each other's content and position. */
+body.has-selection:not(.ctrl-held)::after {
+    content: '💡 Press Alt+Q to read selected text';
+    background: linear-gradient(135deg, rgba(59, 130, 246, 0.98), rgba(99, 102, 241, 0.98));
+    color: white;
+    padding: 10px 16px;
+    border-radius: 8px;
+    font-size: 13px;
+    opacity: 1;
+    box-shadow: 0 4px 12px rgba(59, 130, 246, 0.3),
+                0 2px 4px rgba(0, 0, 0, 0.1);
+    z-index: 9998;
+    /* Dedicated keyframes: slideUpFade contains translateX(-50%) for the
+       centred Ctrl banner, which shifted this right-anchored hint sideways
+       during the animation and made it jump back afterwards. */
+    animation: selectionHintFade 0.3s cubic-bezier(0.4, 0, 0.2, 1);
+    backdrop-filter: blur(8px);
+}
+
+/* Fade in while rising 10px, with no horizontal shift. */
+@keyframes selectionHintFade {
+    from {
+        opacity: 0;
+        transform: translateY(10px);
+    }
+    to {
+        opacity: 1;
+        transform: translateY(0);
+    }
+}
+
+/* Tooltip reveal: fade in while moving up from -4px to -8px. translateX(-50%)
+   is repeated in both frames to keep the tooltip horizontally centred. */
+@keyframes fadeInTooltip {
+ from {
+ opacity: 0;
+ transform: translateX(-50%) translateY(-4px);
+ }
+ to {
+ opacity: 1;
+ transform: translateX(-50%) translateY(-8px);
+ }
+}
+
+/* Improve modal appearance */
+.modal-content {
+ animation: scaleIn 0.2s cubic-bezier(0.4, 0, 0.2, 1);
+}
+
+/* Grow from 95% to full size while fading in. */
+@keyframes scaleIn {
+ from {
+ transform: scale(0.95);
+ opacity: 0;
+ }
+ to {
+ transform: scale(1);
+ opacity: 1;
+ }
+}
+
+/* Scripted and anchor scrolling inside the PDF wrapper is animated. */
+#pdf-viewer-wrapper {
+ scroll-behavior: smooth;
+}
diff --git a/static/js/UI.js b/static/js/UI.js
index f70592d..8b79532 100644
--- a/static/js/UI.js
+++ b/static/js/UI.js
@@ -52,7 +52,10 @@ export function handleSidebarCollapse(appState) {
* @param {object} appState The main appState object.
*/
export function renderNotifications(appState) {
- const notifications = JSON.parse(localStorage.getItem('notifications')).reverse();
+ const notificationsData = localStorage.getItem('notifications');
+ if (!notificationsData) return;
+
+ const notifications = JSON.parse(notificationsData).reverse();
if (notifications?.length < 0) return;
// Stop here, since default is already a no notification message.
@@ -294,7 +297,11 @@ export function resetPdfView(appState) {
appState.elements.pdfViewerWrapper.classList.add('hidden');
appState.elements.textboxViewerWrapper.classList.remove('hidden');
appState.elements.pdfViewer.innerHTML = '';
+ if (appState.elements.pdfTextLayer) {
+ appState.elements.pdfTextLayer.innerHTML = '';
+ }
appState.variables.pdfDoc = null;
+ appState.variables.pdfTextPositions = []; // Clear backend text positions
// When resetting PDF view, ensure PDF-specific controls are disabled
appState.elements.zoomInBtn.disabled = true;
diff --git a/static/js/config.js b/static/js/config.js
index a12e6c2..9b2d785 100644
--- a/static/js/config.js
+++ b/static/js/config.js
@@ -24,6 +24,7 @@ document.addEventListener('DOMContentLoaded', () => {
const cacheSizeDisplay = document.getElementById('cache-size-display');
const chunkSizeSlider = document.getElementById('chunk-size-slider');
const chunkSizeDisplay = document.getElementById('chunk-size-display');
+ const semanticSplittingToggle = document.getElementById('semantic-splitting-toggle');
const downloadStatus = document.getElementById('download-status');
const googleVoiceInput = document.getElementById('google-voice');
@@ -169,6 +170,14 @@ document.addEventListener('DOMContentLoaded', () => {
chunkSizeSlider.value = parseInt(prefs.chunkSize);
chunkSizeDisplay.textContent = prefs.chunkSize;
}
+
+ // Set semantic splitting toggle state
+ if (prefs.useSemanticSplitting !== undefined) {
+ semanticSplittingToggle.checked = prefs.useSemanticSplitting;
+ } else {
+ prefs.useSemanticSplitting = true; // Default to enabled
+ semanticSplittingToggle.checked = true;
+ }
prefs.accessibleFontEnabled = accessibleFontCheckbox.checked || false;
prefs.accessibleFontUIEnabled = accessibleFontUICheckbox.checked || false;
@@ -186,7 +195,19 @@ document.addEventListener('DOMContentLoaded', () => {
block.classList.remove('shadow-xl');
});
- document.querySelector(`#highlight-customization > div.${prefs.highlightColor}`).classList.add('shadow-xl');
+ document.querySelector(`#highlight-customization > div.${prefs.highlightColor}`)?.classList.add('shadow-xl');
+ }
+
+ // Handle word highlight color
+ if (prefs.wordHighlightColor) {
+ document.querySelectorAll(`#word-highlight-customization > div`).forEach(block => {
+ block.classList.remove('shadow-xl');
+ });
+ document.querySelector(`#word-highlight-customization > div.${prefs.wordHighlightColor}`)?.classList.add('shadow-xl');
+ } else {
+ // Default to same as chunk color if not set
+ prefs.wordHighlightColor = prefs.highlightColor || 'yellow';
+ handlePrefs(prefs);
}
if (prefs.useBlackBG) {
@@ -196,12 +217,29 @@ document.addEventListener('DOMContentLoaded', () => {
clearCacheButton.addEventListener('click', async () => {
+ // Clear server-side caches (audio + PDF)
const response = await fetch('/api/clear_cache');
if (!response.ok) {
throw new Error('Failed to clear cache.');
}
+ // Clear client-side PDF caches from localStorage
+ const keysToRemove = [];
+ for (let i = 0; i < localStorage.length; i++) {
+ const key = localStorage.key(i);
+ if (key && key.startsWith('pdf_positions_')) {
+ keysToRemove.push(key);
+ }
+ }
+
+ keysToRemove.forEach(key => localStorage.removeItem(key));
+
+ console.log(`🗑️ Cleared ${keysToRemove.length} PDF caches from localStorage`);
+
+ const data = await response.json();
+ alert(`Cache cleared successfully!\n\n${data.message}\n\nCleared ${keysToRemove.length} PDF caches from browser storage.`);
+
getCacheSize(); // Refresh cache size after clearing
});
@@ -217,6 +255,12 @@ document.addEventListener('DOMContentLoaded', () => {
handlePrefs(prefs);
});
+
+ semanticSplittingToggle.addEventListener('change', () => {
+ prefs.useSemanticSplitting = semanticSplittingToggle.checked;
+ handlePrefs(prefs);
+ console.log('Semantic splitting:', prefs.useSemanticSplitting ? 'enabled' : 'disabled');
+ });
const apiKeyContainer = document.getElementById('explanation-container');
apiKeyContainer.style.display = 'none';
@@ -355,7 +399,7 @@ document.addEventListener('DOMContentLoaded', () => {
handlePrefs(prefs);
});
- // Add events to highlight buttons
+ // Add events to highlight buttons (chunk colors)
document.querySelectorAll("#highlight-customization button").forEach((item) => {
item.addEventListener('click', () => {
prefs.highlightColor = item.dataset.value;
@@ -363,6 +407,24 @@ document.addEventListener('DOMContentLoaded', () => {
populatePrefsInputs();
});
});
+
+ // Add events to highlight buttons (chunk colors)
+ document.querySelectorAll("#highlight-customization button").forEach((item) => {
+ item.addEventListener('click', () => {
+ prefs.highlightColor = item.dataset.value;
+ handlePrefs(prefs);
+ populatePrefsInputs();
+ });
+ });
+
+ // Add events to word highlight buttons
+ document.querySelectorAll("#word-highlight-customization button").forEach((item) => {
+ item.addEventListener('click', () => {
+ prefs.wordHighlightColor = item.dataset.value;
+ handlePrefs(prefs);
+ populatePrefsInputs();
+ });
+ });
// Call init functions
getPiperVoices();
diff --git a/static/js/helpers.js b/static/js/helpers.js
index b9c342e..6f6a4fb 100644
--- a/static/js/helpers.js
+++ b/static/js/helpers.js
@@ -51,11 +51,18 @@ export const handlePrefs = (data, defaultValue = null) => {
};
/**
- * Save the current local books object to localStorage.
+ * Save local books to localStorage, excluding pdfData to avoid size limits.
* @param {object} appState The main app state object.
*/
export function saveLocalBooks(appState) {
- localStorage.setItem('books', JSON.stringify(appState.variables.localBooks));
+ // Clone the books object and remove pdfData (ArrayBuffer) before saving
+ const booksToSave = {};
+ for (const [bookId, book] of Object.entries(appState.variables.localBooks)) {
+ booksToSave[bookId] = { ...book };
+ // Remove pdfData as it can't be serialized and is too large for localStorage
+ delete booksToSave[bookId].pdfData;
+ }
+ localStorage.setItem('books', JSON.stringify(booksToSave));
}
/**
@@ -70,6 +77,9 @@ export function splitTextIntoChunks(text, chunkSize) {
let prefs = JSON.parse(localStorage.getItem('prefs') || '{}');
const parsedChunkSize = parseInt(prefs.chunkSize);
chunkSize = (!isNaN(parsedChunkSize) && parsedChunkSize > 0) ? parsedChunkSize : 200;
+
+ // Check if semantic processing is enabled
+ const useSemanticSplitting = prefs.useSemanticSplitting !== false; // Default true
const chunks = [];
let currentIndex = 0;
@@ -80,14 +90,19 @@ export function splitTextIntoChunks(text, chunkSize) {
// If we're not at the end of text, look for a good break point
if (chunkEnd < text.length) {
- // Look for word boundaries (space, punctuation followed by space)
- let wordBoundary = text.lastIndexOf(' ', chunkEnd);
+ // Look for sentence boundaries (. ! ? followed by space)
let sentenceBoundary = Math.max(
text.lastIndexOf('. ', chunkEnd),
text.lastIndexOf('! ', chunkEnd),
- text.lastIndexOf('? ', chunkEnd)
+ text.lastIndexOf('? ', chunkEnd),
+ text.lastIndexOf('." ', chunkEnd), // Handle quotes after periods
+ text.lastIndexOf('. "', chunkEnd), // Space before quote
+ text.lastIndexOf('.\\" ', chunkEnd) // Escaped quotes
);
+ // Word boundary as fallback
+ let wordBoundary = text.lastIndexOf(' ', chunkEnd);
+
// Look for HTML/XML tag boundaries
let tagBoundary = -1;
let inTag = false;
@@ -136,6 +151,53 @@ export function splitTextIntoChunks(text, chunkSize) {
return chunks;
}
+/**
+ * Async version of splitTextIntoChunks that uses semantic sentence splitting via API.
+ * Falls back to local rule-based splitting if API fails.
+ * @param {string} text - Text to split into chunks
+ * @param {number} chunkSize - Maximum size of each chunk
+ * @returns {Promise} Array of text chunks
+ */
+export async function splitTextIntoChunksAsync(text, chunkSize) {
+ let prefs = JSON.parse(localStorage.getItem('prefs') || '{}');
+ const parsedChunkSize = parseInt(prefs.chunkSize);
+ chunkSize = (!isNaN(parsedChunkSize) && parsedChunkSize > 0) ? parsedChunkSize : 200;
+
+ const useSemanticSplitting = prefs.useSemanticSplitting !== false;
+
+ // If semantic splitting disabled or text is short, use local splitting
+ if (!useSemanticSplitting || text.length < 200) {
+ return splitTextIntoChunks(text, chunkSize);
+ }
+
+ try {
+ const response = await fetch('/api/process_text', {
+ method: 'POST',
+ headers: { 'Content-Type': 'application/json' },
+ body: JSON.stringify({
+ text: text,
+ chunk_size: chunkSize,
+ use_llm: true
+ })
+ });
+
+ if (response.ok) {
+ const data = await response.json();
+ if (data.status === 'success' && data.chunks && data.chunks.length > 0) {
+ console.log(`✅ Semantic splitting: ${data.chunk_count} chunks`);
+ return data.chunks;
+ }
+ }
+
+ console.log('⚠️ API semantic splitting failed, using local fallback');
+ } catch (error) {
+ console.log('⚠️ Semantic splitting error:', error.message);
+ }
+
+ // Fallback to local splitting
+ return splitTextIntoChunks(text, chunkSize);
+}
+
/**
* Calculates the similarity between two strings and checks if it meets a minimum percentage.
* Similarity is determined using the Levenshtein distance.
diff --git a/static/js/index.js b/static/js/index.js
index 30e1e63..5177474 100644
--- a/static/js/index.js
+++ b/static/js/index.js
@@ -6,11 +6,14 @@
*
*/
+// Import PDF.js
+import * as pdfjsLib from './pdf.min.mjs';
+
// Import podcast generation
import { getPodcasts, generatePodcast, deletePodcast } from './podcast.js';
// Import Speech Generation functions
-import { generateSpeech } from "./speechGen.js";
+import { generateSpeech, generateSpeechWithTiming } from "./speechGen.js";
// Import helpers
import {
@@ -23,6 +26,7 @@ import {
getCurrentMainVisiblePage,
mapTextContent,
splitTextIntoChunks,
+ splitTextIntoChunksAsync,
handlePrefs,
createSleepTimer,
startRecording,
@@ -55,6 +59,113 @@ import {
document.addEventListener('DOMContentLoaded', () => {
+ // Track Ctrl key state for PDF chunk interaction
+ // Chunks are only clickable when Ctrl is held, allowing normal text selection otherwise
+ document.addEventListener('keydown', (e) => {
+ if (e.key === 'Control' || e.key === 'Meta') { // Meta for Mac Cmd key
+ document.body.classList.add('ctrl-held');
+ }
+
+ // Alt+Q: Read selected text
+ if (e.altKey && e.key === 'q') {
+ e.preventDefault();
+ const selectedText = window.getSelection().toString().trim();
+
+ if (selectedText && selectedText.length > 0) {
+ console.log('🎯 Alt+Q pressed - Reading selected text:', selectedText.substring(0, 50) + '...');
+ showNotification('📢 Reading selected text: "' + selectedText.substring(0, 40) + '..."', 'info');
+
+ // Process and play the selected text
+ processTextAndPlay(selectedText, appState.variables.currentPageNum || 1);
+ } else {
+ showNotification('⚠️ No text selected. Select text and press Alt+Q to read it.', 'warning');
+ }
+ }
+
+ // ?: Show keyboard shortcuts
+ if (e.key === '?' && !e.ctrlKey && !e.altKey && !e.metaKey) {
+ // Only if not typing in an input
+ if (!['INPUT', 'TEXTAREA'].includes(document.activeElement.tagName)) {
+ e.preventDefault();
+ if (appState.elements.shortcutsModal) {
+ appState.elements.shortcutsModal.classList.remove('hidden');
+ }
+ }
+ }
+
+ // Space: Play/Pause audio
+ if (e.key === ' ' && !e.ctrlKey && !e.altKey && !e.metaKey) {
+ // Only if not typing in an input
+ if (!['INPUT', 'TEXTAREA'].includes(document.activeElement.tagName)) {
+ e.preventDefault();
+ if (appState.variables.isPlaying && !appState.variables.isPaused) {
+ // Pause
+ appState.elements.audioPlayer.pause();
+ appState.variables.isPaused = true;
+ showNotification('⏸️ Paused', 'info');
+ } else if (appState.variables.isPaused) {
+ // Resume
+ appState.elements.audioPlayer.play();
+ appState.variables.isPaused = false;
+ showNotification('▶️ Playing', 'info');
+ }
+ }
+ }
+
+ // Arrow keys: Navigate pages (only in PDF mode)
+ if ((e.key === 'ArrowRight' || e.key === 'ArrowLeft') && !e.ctrlKey && !e.altKey && !e.metaKey) {
+ if (!['INPUT', 'TEXTAREA'].includes(document.activeElement.tagName)) {
+ if (appState.variables.pdfDoc) {
+ e.preventDefault();
+ if (e.key === 'ArrowRight') {
+ // Next page
+ const nextPage = Math.min(appState.variables.currentPageNum + 1, appState.variables.pdfDoc.numPages);
+ if (nextPage !== appState.variables.currentPageNum) {
+ appState.variables.isManualPageChange = true;
+ appState.elements.pdfViewer.innerHTML = '';
+ renderPage(nextPage).then(() => {
+ appState.variables.isManualPageChange = false;
+ });
+ }
+ } else if (e.key === 'ArrowLeft') {
+ // Previous page
+ const prevPage = Math.max(appState.variables.currentPageNum - 1, 1);
+ if (prevPage !== appState.variables.currentPageNum) {
+ appState.variables.isManualPageChange = true;
+ appState.elements.pdfViewer.innerHTML = '';
+ renderPage(prevPage).then(() => {
+ appState.variables.isManualPageChange = false;
+ });
+ }
+ }
+ }
+ }
+ }
+ });
+
+ document.addEventListener('keyup', (e) => {
+ if (e.key === 'Control' || e.key === 'Meta') {
+ document.body.classList.remove('ctrl-held');
+ }
+ });
+
+ // Remove class when window loses focus
+ window.addEventListener('blur', () => {
+ document.body.classList.remove('ctrl-held');
+ });
+
+ // Detect text selection and show Alt+Q hint
+ document.addEventListener('selectionchange', () => {
+ const selection = window.getSelection();
+ const selectedText = selection.toString().trim();
+
+ if (selectedText && selectedText.length > 0) {
+ document.body.classList.add('has-selection');
+ } else {
+ document.body.classList.remove('has-selection');
+ }
+ });
+
let appState = {
elements: {
// Inputs
@@ -73,6 +184,9 @@ document.addEventListener('DOMContentLoaded', () => {
newBookBtn: document.getElementById('new-book-btn'),
libraryBtn: document.getElementById('library-btn'),
commandsBtn: document.getElementById('commands-btn'),
+ shortcutsBtn: document.getElementById('shortcuts-btn'),
+ shortcutsModal: document.getElementById('shortcuts-modal'),
+ shortcutsCloseBtn: document.getElementById('shortcuts-close-btn'),
accountSwitcherBtn: document.getElementById('account-switcher-btn'),
accountSwitcherMenu: document.getElementById('account-switcher-menu'),
currentUserButton: document.getElementById('current-user-button'),
@@ -80,6 +194,23 @@ document.addEventListener('DOMContentLoaded', () => {
downloadAudioBtn: document.getElementById('download-link'),
zoomInBtn: document.getElementById('zoom-in-btn'),
zoomOutBtn: document.getElementById('zoom-out-btn'),
+ fitWidthBtn: document.getElementById('fit-width-btn'),
+ fitPageBtn: document.getElementById('fit-page-btn'),
+ rotateLeftBtn: document.getElementById('rotate-left-btn'),
+ rotateRightBtn: document.getElementById('rotate-right-btn'),
+ firstPageBtn: document.getElementById('first-page-btn'),
+ lastPageBtn: document.getElementById('last-page-btn'),
+ pageInput: document.getElementById('page-input'),
+ zoomLevel: document.getElementById('zoom-level'),
+ pdfSearchBtn: document.getElementById('pdf-search-btn'),
+ pdfSearchBar: document.getElementById('pdf-search-bar'),
+ pdfSearchInput: document.getElementById('pdf-search-input'),
+ pdfSearchPrev: document.getElementById('pdf-search-prev'),
+ pdfSearchNext: document.getElementById('pdf-search-next'),
+ pdfSearchClose: document.getElementById('pdf-search-close'),
+ pdfSearchResults: document.getElementById('pdf-search-results'),
+ debugModeBtn: document.getElementById('debug-mode-btn'),
+ pdfLoading: document.getElementById('pdf-loading'),
generateBtn: document.getElementById('generate-btn'),
generateBtnIcon: document.getElementById('generate-btn-icon'),
stopBtn: document.getElementById('stop-btn'),
@@ -105,6 +236,7 @@ document.addEventListener('DOMContentLoaded', () => {
textDisplay: document.getElementById('text-display'),
pdfViewer: document.getElementById('pdf-viewer'),
pdfViewerWrapper: document.getElementById('pdf-viewer-wrapper'),
+ pdfTextLayer: document.getElementById('pdf-text-layer'),
generateBtnText: document.getElementById('generate-btn-text'),
audioOutput: document.getElementById('audio-output'),
audioPlayer: document.getElementById('audio-player'),
@@ -156,6 +288,9 @@ document.addEventListener('DOMContentLoaded', () => {
currentChunkIndex: 0,
localPrefs: handlePrefs(),
pdfTextContent: {},
+ pdfTextPositions: [], // Backend-extracted text positions
+ debugMode: false, // Debug mode for development
+ keyboardShortcutsVisible: false, // Keyboard shortcuts overlay
// Speech to Text variables
mediaRecorder: null,
@@ -164,6 +299,10 @@ document.addEventListener('DOMContentLoaded', () => {
currentUser: null,
isTwoPageView: false,
currentScale: 1.5, // Initial scale
+ currentRotation: 0, // PDF rotation angle
+ searchMatches: [], // Store PDF search matches
+ currentSearchIndex: -1, // Current search result index
+ isManualPageChange: false, // Flag to prevent infinite scroll interference
// Pagination
textCurrentPage: 1,
@@ -177,6 +316,16 @@ document.addEventListener('DOMContentLoaded', () => {
currentReadingPage: null,
// Global sleep timer
playerSleepTimer: null,
+ // Word-by-word highlighting
+ currentWordIndex: 0,
+ currentWordCount: 0,
+ currentChunkId: null,
+ currentPdfWordIndex: 0, // For PDF word-by-word highlighting
+
+ // Timing-based highlighting for backend PDF rendering
+ useTimingBasedHighlighting: false, // Enable for PDFs with backend processing
+ currentChunkTimingData: null, // Store timing data for current chunk
+ audioTimingData: {}, // Cache timing data for all chunks
},
functions: {
showNotification: showNotification,
@@ -187,6 +336,21 @@ document.addEventListener('DOMContentLoaded', () => {
// Set workerSrc for PDF.js
pdfjsLib.GlobalWorkerOptions.workerSrc = './static/js/pdf.worker.min.mjs';
+ // Debug: Log PDF control button initialization
+ console.log('🔧 PDF Control Elements Initialization:');
+ console.log(' Zoom In Button:', appState.elements.zoomInBtn ? '✅ Found' : '❌ Missing');
+ console.log(' Zoom Out Button:', appState.elements.zoomOutBtn ? '✅ Found' : '❌ Missing');
+ console.log(' Fit Width Button:', appState.elements.fitWidthBtn ? '✅ Found' : '❌ Missing');
+ console.log(' Fit Page Button:', appState.elements.fitPageBtn ? '✅ Found' : '❌ Missing');
+ console.log(' Rotate Left Button:', appState.elements.rotateLeftBtn ? '✅ Found' : '❌ Missing');
+ console.log(' Rotate Right Button:', appState.elements.rotateRightBtn ? '✅ Found' : '❌ Missing');
+ console.log(' First Page Button:', appState.elements.firstPageBtn ? '✅ Found' : '❌ Missing');
+ console.log(' Last Page Button:', appState.elements.lastPageBtn ? '✅ Found' : '❌ Missing');
+ console.log(' Page Input:', appState.elements.pageInput ? '✅ Found' : '❌ Missing');
+ console.log(' Zoom Level Display:', appState.elements.zoomLevel ? '✅ Found' : '❌ Missing');
+ console.log(' PDF Search Button:', appState.elements.pdfSearchBtn ? '✅ Found' : '❌ Missing');
+ console.log(' PDF Viewer:', appState.elements.pdfViewer ? '✅ Found' : '❌ Missing');
+
appState.elements.pageNumInput.type = 'number';
appState.elements.pageNumInput.className = 'w-16 text-center bg-gray-200 rounded-lg';
appState.elements.pageNumInput.style.display = 'none';
@@ -618,49 +782,112 @@ document.addEventListener('DOMContentLoaded', () => {
);
}
- // This function highlights only plain-text.
- function highlightChunk(chunkObject) {
+ // This function highlights only plain-text with word-by-word tracking
+ function highlightChunk(chunkText) {
+ if (!appState.variables.currentReadingPage) {
+ console.warn('No currentReadingPage set');
+ return;
+ }
+
+ // Clear any existing word highlights
+ const existingWords = appState.variables.currentReadingPage.querySelectorAll('.word-span');
+ existingWords.forEach(span => {
+ const text = span.textContent;
+ span.replaceWith(document.createTextNode(text));
+ });
+
+ // Get clean text content
const fullText = appState.variables.currentReadingPage.textContent;
- const chunkText = chunkObject.text;
- const startIndex = fullText.indexOf(chunkText);
+
+ // Normalize whitespace for better matching
+ const normalizedChunk = chunkText.trim().replace(/\s+/g, ' ');
+ const normalizedFull = fullText.replace(/\s+/g, ' ');
+ let startIndex = normalizedFull.indexOf(normalizedChunk);
+ // If exact match not found, try first few words
if (startIndex === -1) {
- console.error("Could not find chunk text to highlight:", chunkObject);
- console.debug(fullText);
- return;
+ const firstWords = normalizedChunk.split(' ').slice(0, 5).join(' ');
+ startIndex = normalizedFull.indexOf(firstWords);
}
- let highlightClass = "highlight";
+ if (startIndex === -1) {
+ console.error("Could not find chunk text to highlight");
+ console.debug('Looking for:', normalizedChunk.substring(0, 100));
+ console.debug('In text:', normalizedFull.substring(0, 200));
+ return;
+ }
- if (appState.variables.localPrefs.highlightColor)
- highlightClass += ` ${localPrefs.highlightColor}`;
+ let highlightColor = appState.variables.localPrefs.highlightColor || '';
+ const chunkId = `chunk-${appState.variables.currentChunkIndex}`;
- // Create the HTML for the highlighted chunk (with spans for each word)
- const words = chunkText.split(/(\s+)/); // Keep spaces
+ // Map the match offset in normalizedFull back to an offset in fullText.
+ // normalizedFull collapses every run of whitespace (spaces, tabs, newlines)
+ // into a single character, so each run must advance normPos exactly once.
+ let actualStartIdx = 0;
+ let normPos = 0;
+ for (let i = 0; i < fullText.length && normPos < startIndex; i++) {
+     const isWhitespace = /\s/.test(fullText[i]);
+     if (!isWhitespace || i === 0 || !/\s/.test(fullText[i - 1])) {
+         normPos++;
+     }
+     actualStartIdx = i + 1;
+ }
+ // The searched text is trimmed, so the match begins on a non-whitespace
+ // character; step over the remainder of a whitespace run we stopped inside.
+ while (actualStartIdx < fullText.length && /\s/.test(fullText[actualStartIdx])) {
+     actualStartIdx++;
+ }
+
+ // Split chunk into words for highlighting (no sentence wrapper)
+ const words = chunkText.split(/(\s+)/);
let highlightedHtml = '';
+
+ let wordIndex = 0;
words.forEach(word => {
if (word.trim() !== '') {
- // Use the same data-chunk-id for easy removal later
- highlightedHtml += `${word} `;
+ // Each word gets a span for word-by-word highlighting
+ highlightedHtml += `${word} `;
+ wordIndex++;
} else {
highlightedHtml += word;
}
});
- // Replace the plain text of the chunk with our new highlighted HTML
- appState.variables.currentReadingPage.innerHTML = fullText.substring(0, startIndex) +
+ // Find end position by counting words in original text
+ let actualEndIdx = actualStartIdx;
+ let wordsFound = 0;
+ let inWord = false;
+ for (let i = actualStartIdx; i < fullText.length && wordsFound < wordIndex; i++) {
+ const char = fullText[i];
+ if (/\S/.test(char)) {
+ if (!inWord) {
+ wordsFound++;
+ inWord = true;
+ }
+ } else {
+ inWord = false;
+ }
+ actualEndIdx = i + 1;
+ }
+
+ // Replace text with highlighted version
+ appState.variables.currentReadingPage.innerHTML = fullText.substring(0, actualStartIdx) +
highlightedHtml +
- fullText.substring(startIndex + chunkText.length);
+ fullText.substring(actualEndIdx);
+
+ // Store word count and chunk ID
+ appState.variables.currentWordCount = wordIndex;
+ appState.variables.currentWordIndex = 0;
+ appState.variables.currentChunkId = chunkId;
}
- function unhighlightChunk(chunkObject) {
- // Find all the spans for the chunk we just played
- const spans = appState.variables.currentReadingPage.querySelectorAll(`span[data-chunk-id="${chunkObject.id}"]`);
- if (spans.length === 0) return;
-
- // We can simply replace the entire innerHTML with the plain text again.
- // This is efficient because the highlight/unhighlight is the only change.
- appState.variables.currentReadingPage.textContent = appState.variables.allTextChunks.map(chunk => chunk.text).join(' ');
+ /**
+  * Remove word-by-word highlighting from the page currently being read.
+  *
+  * Every `.word-span` element under `currentReadingPage` is replaced by a plain
+  * text node holding the same text, and the word-tracking state
+  * (`currentWordIndex`, `currentWordCount`, `currentChunkId`) is reset.
+  * Does nothing when no reading page is set.
+  *
+  * @param {*} chunkText - Not read by this function.
+  */
+ function unhighlightChunk(chunkText) {
+ // Remove all word-span elements and restore plain text
+ if (!appState.variables.currentReadingPage) return;
+
+ const wordSpans = appState.variables.currentReadingPage.querySelectorAll('.word-span');
+ wordSpans.forEach(span => {
+ const text = span.textContent;
+ // Swap the wrapper for a bare text node so the visible text is unchanged
+ span.replaceWith(document.createTextNode(text));
+ });
+
+ // Reset word tracking
+ appState.variables.currentWordIndex = 0;
+ appState.variables.currentWordCount = 0;
+ appState.variables.currentChunkId = null;
}
let renderBookContent = async (book) => {
@@ -696,20 +923,129 @@ document.addEventListener('DOMContentLoaded', () => {
appState.elements.bookView.classList.remove('hidden');
appState.variables.currentChunkIndex = 0; // Reset chunk index
- if (isPdfBook && appState.variables.currentUser) {
+ if (isPdfBook) {
let pdfData;
- // Fetch PDF from server
- const response = await fetch(book.content); // book.content is the URL to the PDF
- if (!response.ok) {
- throw new Error(`Failed to fetch PDF from ${book.content}`);
- }
- pdfData = await response.arrayBuffer();
+ if (appState.variables.currentUser) {
+ // Use POST endpoint to get PDF data as base64 JSON to bypass download managers like IDM
+ const dataUrl = book.content + '/data'; // Append /data to the PDF URL
+
+ const response = await fetch(dataUrl, {
+ method: 'POST',
+ headers: {
+ 'Content-Type': 'application/json',
+ 'Accept': 'application/json'
+ }
+ });
+
+ if (!response.ok) {
+ const errorText = await response.text();
+ throw new Error(`Failed to fetch PDF data from ${dataUrl}: ${response.status} - ${errorText}`);
+ }
+
+ const jsonData = await response.json();
+
+ if (!jsonData.data || jsonData.size === 0) {
+ throw new Error('PDF file is empty (0 bytes)');
+ }
+
+ // Store text positions from backend
+ if (jsonData.text_positions && jsonData.text_positions.length > 0) {
+ appState.variables.pdfTextPositions = jsonData.text_positions;
+ console.log('✅ Received text positions for', jsonData.text_positions.length, 'pages from backend');
+ console.log('📊 First page dimensions:', {
+ width: jsonData.text_positions[0]?.width,
+ height: jsonData.text_positions[0]?.height,
+ items: jsonData.text_positions[0]?.text_items?.length
+ });
+
+ // Show helpful notification about PDF interaction features
+ setTimeout(() => {
+ showNotification('💡 PDF Tips: Hover over text to highlight • Hold Ctrl+Click to start reading • Select text to copy', 'info');
+ }, 1500);
+ } else {
+ console.warn('⚠️ No text positions received from backend');
+ appState.variables.pdfTextPositions = [];
+ showNotification('⚠️ Text selection may not be available for this PDF', 'warning');
+ }
+
+ // Decode base64 to binary
+ const binaryString = atob(jsonData.data);
+ const bytes = new Uint8Array(binaryString.length);
+ for (let i = 0; i < binaryString.length; i++) {
+ bytes[i] = binaryString.charCodeAt(i);
+ }
+ pdfData = bytes.buffer;
+
+ } else if (book.source === 'local' && appState.variables.localBooks[book.id].pdfData) {
+ // Use locally stored PDF data for anonymous users
+ pdfData = appState.variables.localBooks[book.id].pdfData;
+ }
if (pdfData) {
- appState.variables.pdfDoc = await pdfjsLib.getDocument({ data: pdfData, }).promise;
- const lastPage = parseInt(localStorage.getItem(appState.variables.pdfDoc.fingerprints[0])) || 1;
- renderPage(lastPage);
+ // Show loading indicator
+ if (appState.elements.pdfLoading) {
+ appState.elements.pdfLoading.classList.remove('hidden');
+ }
+
+ try {
+ appState.variables.pdfDoc = await pdfjsLib.getDocument({ data: pdfData }).promise;
+ const lastPage = parseInt(localStorage.getItem(appState.variables.pdfDoc.fingerprints[0])) || 1;
+
+ // Render initial page WITHOUT observers to prevent multiple triggers
+ appState.variables.isManualPageChange = true;
+ await renderPage(lastPage);
+
+ // Delay observer attachment to let DOM fully settle
+ setTimeout(() => {
+ const firstPage = appState.elements.pdfViewer.children[0];
+ if (firstPage && window.upwardsScroll) {
+ window.upwardsScroll.observe(firstPage);
+ console.log('✅ [DEBUG] Initial upward observer attached to first page');
+ }
+ if (window.downwardsScroll) {
+ const toolbarSpace = document.querySelector("#toolbar-space");
+ if (toolbarSpace) {
+ window.downwardsScroll.observe(toolbarSpace);
+ console.log('✅ [DEBUG] Initial downward observer attached to toolbar');
+ }
+ }
+ appState.variables.isManualPageChange = false;
+ }, 500); // Wait for DOM to fully settle
+
+ // Hide loading indicator
+ if (appState.elements.pdfLoading) {
+ appState.elements.pdfLoading.classList.add('hidden');
+ }
+
+ // Show helpful notification about Ctrl+Click feature
+ setTimeout(() => {
+ showNotification('💡 Tip: Hold Ctrl and click on blue chunks to start reading from that point', 'info');
+ }, 1000);
+ } catch (error) {
+ // isManualPageChange is set to true before the initial render and is only
+ // cleared by a delayed callback scheduled after a successful render. When
+ // loading or rendering throws, that callback is never scheduled, so clear
+ // the flag here before falling back to the text view.
+ appState.variables.isManualPageChange = false;
+
+ // Hide loading indicator
+ if (appState.elements.pdfLoading) {
+ appState.elements.pdfLoading.classList.add('hidden');
+ }
+
+ console.error('❌ Failed to load PDF:', error);
+ let errorMsg = 'Failed to load PDF: ';
+ if (error.message.includes('empty')) {
+ errorMsg += 'The PDF file is empty or corrupted. Try re-uploading.';
+ } else if (error.message.includes('Invalid')) {
+ errorMsg += 'The PDF format is not supported or is corrupted.';
+ } else {
+ errorMsg += error.message || 'Unknown error occurred';
+ }
+ showNotification(errorMsg, 'error');
+ resetPdfView(appState);
+ // Fall back to text display
+ isPdfBook = false;
+ appState.variables.fullBookText = bookContent || '';
+ appState.variables.totalTextPages = Math.max(1, Math.ceil(appState.variables.fullBookText.length / appState.variables.charsPerPage));
+ const lastTextPage = parseInt(localStorage.getItem(`text-page-${book.id}`)) || 1;
+ renderTextPage(lastTextPage);
+ }
} else {
resetPdfView(appState);
const lastTextPage = parseInt(localStorage.getItem(`text-page-${book.id}`)) || 1;
@@ -760,21 +1096,35 @@ document.addEventListener('DOMContentLoaded', () => {
await renderBookContent(book);
let scrollCompensationElement = null;
- // Start scroll events
- if (book.is_pdf) {
- scrollCompensationElement = appState.elements.pdfViewer.children[0];
- upwardsScroll.observe(appState.elements.pdfViewer.children[0]);
- } else {
- scrollCompensationElement = appState.elements.textDisplay.children[0];
- upwardsScroll.observe(appState.elements.textDisplay.children[0]);
- }
+ // Delay observer attachment for PDFs to prevent multiple triggers
+ const observerDelay = book.is_pdf ? 500 : 0;
+
+ setTimeout(() => {
+ // Start scroll events
+ if (book.is_pdf) {
+ scrollCompensationElement = appState.elements.pdfViewer.children[0];
+ if (scrollCompensationElement) {
+ upwardsScroll.observe(scrollCompensationElement);
+ console.log('✅ [DEBUG] Library PDF - upward observer attached');
+ }
+ } else {
+ scrollCompensationElement = appState.elements.textDisplay.children[0];
+ if (scrollCompensationElement) {
+ upwardsScroll.observe(scrollCompensationElement);
+ }
+ }
- // Only scroll if not on first page.
- if (scrollCompensationElement && scrollCompensationElement.dataset.page > 1) {
- document.scrollingElement.scrollTop = scrollCompensationElement.scrollHeight + 100;
- }
+ // Only scroll if not on first page.
+ if (scrollCompensationElement && scrollCompensationElement.dataset.page > 1) {
+ document.scrollingElement.scrollTop = scrollCompensationElement.scrollHeight + 100;
+ }
- downwardsScroll.observe(document.querySelector("#toolbar-space"));
+ downwardsScroll.observe(document.querySelector("#toolbar-space"));
+ if (book.is_pdf) {
+ console.log('✅ [DEBUG] Library PDF - downward observer attached');
+ }
+ }, observerDelay);
+
checkTextContent(appState);
if (appState.variables.currentUser && !book.is_pdf) { // Only show save button for non-PDF online books
@@ -851,7 +1201,7 @@ document.addEventListener('DOMContentLoaded', () => {
appState.elements.skipHeadersCheckbox.addEventListener('change', () => {
if (appState.variables.activeBook?.source === 'local') {
- appState.variables.localBooks[appState.variables.activeBook.id].skipHeadersNFooters = skipHeadersCheckbox.checked;
+ appState.variables.localBooks[appState.variables.activeBook.id].skipHeadersNFooters = appState.elements.skipHeadersCheckbox.checked;
saveLocalBooks(appState);
} else {
handlePrefs({ skipHeaders: appState.elements.skipHeadersCheckbox.checked })
@@ -930,8 +1280,16 @@ document.addEventListener('DOMContentLoaded', () => {
const currentAudio = appState.variables.audioQueue[appState.variables.currentChunkIndex];
updateTextChunkReader(appState);
- if (appState.variables.pdfDoc)
- await highlightPdfChunk(currentAudio.text);
+ // Check if we're using timing-based highlighting with backend rendering
+ const hasBackendRendering = document.querySelector('.backend-text-span') !== null;
+ const hasTimingData = currentAudio.timingData && appState.variables.useTimingBasedHighlighting;
+
+ if (appState.variables.pdfDoc) {
+ // Skip chunk highlighting if we have timing-based word highlighting
+ if (!(hasBackendRendering && hasTimingData)) {
+ await highlightPdfChunk(currentAudio.text);
+ }
+ }
else {
if (fastFormatDetect(appState.elements.textDisplay.innerHTML) == 'html') {
highlightHTML(currentAudio.text); // HTML highlights
@@ -949,6 +1307,159 @@ document.addEventListener('DOMContentLoaded', () => {
currentAudio.retries = 0;
}
+ // Setup word-by-word highlighting for plain text
+ if (!appState.variables.pdfDoc && fastFormatDetect(appState.elements.textDisplay.innerHTML) !== 'html') {
+ appState.elements.audioPlayer.ontimeupdate = () => {
+ // Check if word highlighting is enabled
+ const enableWordHighlight = appState.variables.localPrefs?.enableWordHighlight !== false;
+ if (!enableWordHighlight) return;
+
+ const currentTime = appState.elements.audioPlayer.currentTime;
+ const duration = appState.elements.audioPlayer.duration;
+
+ if (duration && appState.variables.currentWordCount > 0 && appState.variables.currentChunkId) {
+ // Calculate which word should be highlighted based on time
+ const progress = currentTime / duration;
+ const targetWordIndex = Math.floor(progress * appState.variables.currentWordCount);
+
+ if (targetWordIndex !== appState.variables.currentWordIndex && targetWordIndex < appState.variables.currentWordCount) {
+ // Get word highlight color from prefs
+ const wordColor = appState.variables.localPrefs?.wordHighlightColor || appState.variables.localPrefs?.highlightColor || '';
+
+ // Remove highlight from previous word
+ const prevWord = document.querySelector(`span.word-span.highlight-word[data-chunk-id="${appState.variables.currentChunkId}"]`);
+ if (prevWord) {
+ prevWord.classList.remove('highlight-word');
+ // Remove color class
+ if (wordColor) prevWord.classList.remove(wordColor);
+ }
+
+ // Highlight new word
+ const newWord = document.querySelector(`span.word-span[data-chunk-id="${appState.variables.currentChunkId}"][data-word-index="${targetWordIndex}"]`);
+ if (newWord) {
+ newWord.classList.add('highlight-word');
+ // Add color class
+ if (wordColor) newWord.classList.add(wordColor);
+ }
+
+ appState.variables.currentWordIndex = targetWordIndex;
+ }
+ }
+ };
+ } else if (appState.variables.pdfDoc) {
+ // Setup word-by-word highlighting for PDF text spans
+ appState.elements.audioPlayer.ontimeupdate = () => {
+ // Check if word highlighting is enabled
+ const enableWordHighlight = appState.variables.localPrefs?.enableWordHighlight !== false;
+ if (!enableWordHighlight) return;
+
+ const currentTime = appState.elements.audioPlayer.currentTime;
+ const duration = appState.elements.audioPlayer.duration;
+
+ if (!duration) return;
+
+ // Check if we have timing data for precision highlighting
+ const currentAudio = appState.variables.audioQueue[appState.variables.currentChunkIndex];
+ const timingData = currentAudio?.timingData;
+
+ if (timingData && timingData.words && appState.variables.useTimingBasedHighlighting) {
+ // Precision timing-based highlighting using backend timing data
+ const wordColor = appState.variables.localPrefs?.wordHighlightColor || appState.variables.localPrefs?.highlightColor || '';
+
+ // Find the word that should be highlighted at current time
+ const currentWord = timingData.words.find(word =>
+ currentTime >= word.startTime && currentTime < word.endTime && !word.skip
+ );
+
+ if (currentWord && currentWord.index !== undefined) {
+ // Remove previous highlight
+ const prevHighlighted = document.querySelector('.backend-text-span.pdf-text-word-highlight');
+ if (prevHighlighted) {
+ prevHighlighted.classList.remove('pdf-text-word-highlight');
+ if (wordColor) prevHighlighted.classList.remove(wordColor);
+ }
+
+ // Calculate global word index
+ // timingData.startOffset tells us where this chunk starts in global text
+ // currentWord.index is the word position within the chunk
+ const chunk = appState.variables.allTextChunks[appState.variables.currentChunkIndex];
+
+ // Chunks may be plain strings or objects carrying a `text` field, and the
+ // entry may be missing altogether; normalise to a trimmed string so this
+ // handler never throws while audio is playing.
+ const chunkText = ((typeof chunk === 'string' ? chunk : chunk?.text) || '').trim();
+ const firstChunkWord = chunkText.split(/\s+/)[0];
+
+ // Search for the chunk's starting word in the spans to get base index
+ let baseWordIdx = -1;
+
+ if (firstChunkWord) {
+     for (const span of document.querySelectorAll('.backend-text-span')) {
+         const spanText = span.textContent.trim();
+         // An empty span would match every word through includes(''), so skip it
+         if (!spanText) continue;
+         if (spanText.includes(firstChunkWord) || firstChunkWord.includes(spanText)) {
+             const candidateIdx = parseInt(span.dataset.wordIdx || '-1', 10);
+             if (candidateIdx >= 0) {
+                 baseWordIdx = candidateIdx;
+                 break;
+             }
+         }
+     }
+ }
+
+ if (baseWordIdx >= 0) {
+ const targetWordIdx = baseWordIdx + currentWord.index;
+
+ // Find and highlight the element with this word index
+ const targetSpan = document.querySelector(`.backend-text-span[data-word-idx="${targetWordIdx}"]`);
+ if (targetSpan) {
+ targetSpan.classList.add('pdf-text-word-highlight');
+ if (wordColor) targetSpan.classList.add(wordColor);
+
+ // Scroll to keep highlighted word visible
+ targetSpan.scrollIntoView({ behavior: 'smooth', block: 'center', inline: 'nearest' });
+ }
+ }
+ }
+ } else {
+ // Fallback: duration-based highlighting (less precise)
+ const currentPageNum = appState.variables.currentReadingPage?.dataset?.page;
+ if (!currentPageNum) return;
+
+ const textSpans = document.querySelectorAll(`.backend-text-span[data-page="${currentPageNum}"]`);
+ if (textSpans.length === 0) return;
+
+ // Calculate progress and target word index
+ const progress = currentTime / duration;
+ const targetWordIndex = Math.floor(progress * textSpans.length);
+
+ if (targetWordIndex !== appState.variables.currentPdfWordIndex && targetWordIndex < textSpans.length) {
+ // Get word highlight color from prefs
+ const wordColor = appState.variables.localPrefs?.wordHighlightColor || appState.variables.localPrefs?.highlightColor || '';
+
+ // Remove previous word highlight
+ const prevHighlighted = document.querySelector('.pdf-text-word-highlight');
+ if (prevHighlighted) {
+ prevHighlighted.classList.remove('pdf-text-word-highlight');
+ // Remove color class
+ if (wordColor) {
+ prevHighlighted.classList.remove(wordColor);
+ }
+ }
+
+ // Add word highlight to current word
+ const currentWord = textSpans[targetWordIndex];
+ if (currentWord) {
+ currentWord.classList.add('pdf-text-word-highlight');
+ // Add color class
+ if (wordColor) {
+ currentWord.classList.add(wordColor);
+ }
+ }
+
+ appState.variables.currentPdfWordIndex = targetWordIndex;
+ }
+ }
+ };
+ } else {
+ // Clear ontimeupdate for HTML modes
+ appState.elements.audioPlayer.ontimeupdate = null;
+ }
+
// Try playing the URL
try {
await appState.elements.audioPlayer.play();
@@ -985,6 +1496,17 @@ document.addEventListener('DOMContentLoaded', () => {
appState.elements.audioPlayer.onended = async () => {
clearAllHighlights();
appState.elements.currentChunk.classList.add('hidden');
+ appState.variables.currentWordIndex = 0;
+ appState.variables.currentWordCount = 0;
+ appState.variables.currentPdfWordIndex = 0; // Reset PDF word index
+
+ // Clear PDF text span highlights
+ const pdfHighlights = document.querySelectorAll('.pdf-text-chunk-highlight, .pdf-text-word-highlight');
+ // The timeupdate handlers add the colour class configured in localPrefs,
+ // so remove that class too rather than only the two hard-coded names.
+ const configuredWordColor = appState.variables.localPrefs?.wordHighlightColor || appState.variables.localPrefs?.highlightColor || '';
+ pdfHighlights.forEach(span => {
+ span.classList.remove('pdf-text-chunk-highlight', 'pdf-text-word-highlight');
+ span.classList.remove('green', 'blue'); // Remove color classes
+ if (configuredWordColor) span.classList.remove(configuredWordColor);
+ });
+
appState.variables.currentChunkIndex++;
processAndQueueChunk(appState.variables.currentChunkIndex + 1); // Pre-fetch next chunk
@@ -1004,6 +1526,14 @@ document.addEventListener('DOMContentLoaded', () => {
appState.variables.audioQueue = [];
appState.elements.audioPlayer.pause();
appState.elements.audioPlayer.src = '';
+ appState.variables.currentPdfWordIndex = 0; // Reset PDF word index
+
+ // Clear PDF text span highlights
+ const pdfHighlights = document.querySelectorAll('.pdf-text-chunk-highlight, .pdf-text-word-highlight');
+ // The timeupdate handlers add the colour class configured in localPrefs,
+ // so remove that class too rather than only the two hard-coded names.
+ const configuredWordColor = appState.variables.localPrefs?.wordHighlightColor || appState.variables.localPrefs?.highlightColor || '';
+ pdfHighlights.forEach(span => {
+ span.classList.remove('pdf-text-chunk-highlight', 'pdf-text-word-highlight');
+ span.classList.remove('green', 'blue'); // Remove color classes
+ if (configuredWordColor) span.classList.remove(configuredWordColor);
+ });
updatePlayerUI('IDLE', appState);
appState.elements.speechToTextSection.classList.remove('hidden');
@@ -1068,7 +1598,7 @@ document.addEventListener('DOMContentLoaded', () => {
playAudioQueue();
}
- function startSpeechGeneration() {
+ async function startSpeechGeneration() {
updatePlayerUI('BUFFERING', appState);
// If currentReadingPage is not set, we are going to find
// the page the user is currently looking at.
@@ -1090,8 +1620,8 @@ document.addEventListener('DOMContentLoaded', () => {
}
}
- // Now that we have our page text, prepare for generation.
- appState.variables.allTextChunks = splitTextIntoChunks(text);
+ // Now that we have our page text, prepare for generation with semantic splitting.
+ appState.variables.allTextChunks = await splitTextIntoChunksAsync(text);
if (appState.variables.allTextChunks.length === 0) return;
@@ -1117,8 +1647,67 @@ document.addEventListener('DOMContentLoaded', () => {
if (chunkIndex >= appState.variables.allTextChunks.length || chunkIndex < 0) return false;
const chunk = appState.variables.allTextChunks[chunkIndex];
- let cleanedChunk = chunk.text.replaceAll('\n', ' '); // Clean new lines
+ // Handle both string chunks (from API) and object chunks (from local splitting)
+ const chunkText = typeof chunk === 'string' ? chunk : chunk.text;
+ if (!chunkText) return false;
+ let cleanedChunk = chunkText.replaceAll('\n', ' '); // Clean new lines
+
+ // Check if we should use timing-based highlighting (for PDF backend rendering)
+ const usePdfBackendTiming = appState.variables.pdfTextPositions &&
+ appState.variables.pdfTextPositions.length > 0 &&
+ appState.variables.pdfDoc;
+ if (usePdfBackendTiming) {
+ // Use timing-based audio generation for precise highlighting
+ console.log(`🎵 Using timing-based audio for chunk${chunkIndex}`);
+
+ generateSpeechWithTiming(
+ cleanedChunk,
+ appState.variables.bookDetectedLang,
+ appState.elements.engineSelect.value,
+ appState.elements.voiceSelect.value,
+ 50, // chunk size in words
+ 1.0 // speed
+ ).then(timingData => {
+ if (timingData && timingData.chunks && timingData.chunks.length > 0) {
+ // Use the first chunk (since we're generating one chunk at a time)
+ const chunkData = timingData.chunks[0];
+ const audioUrl = chunkData.audioUrl;
+
+ // Store timing data with the chunk
+ appState.variables.audioQueue[chunkIndex] = {
+ url: audioUrl,
+ text: chunk,
+ timingData: chunkData // Contains words array with startTime/endTime
+ };
+
+ appState.variables.audioTimingData[chunkIndex] = chunkData;
+ appState.variables.useTimingBasedHighlighting = true;
+
+ console.log(`✅ Chunk ${chunkIndex} audio ready with timing data:`, chunkData);
+
+ // If playback isn't running and this is the chunk we're waiting for, start playing
+ if (!appState.variables.isPlaying && chunkIndex === appState.variables.currentChunkIndex) {
+ playAudioQueue();
+ }
+ } else {
+ console.debug(`Failed to get timing data for chunk ${chunkIndex}, falling back to regular generation`);
+ // Fallback to regular generation
+ generateRegularAudio(chunkIndex, cleanedChunk, chunk);
+ }
+ }).catch(error => {
+ console.error(`Error generating timing-based audio for chunk ${chunkIndex}:`, error);
+ // Fallback to regular generation
+ generateRegularAudio(chunkIndex, cleanedChunk, chunk);
+ });
+ } else {
+ // Regular audio generation without timing
+ generateRegularAudio(chunkIndex, cleanedChunk, chunk);
+ }
+ }
+
+ // Helper function for regular audio generation
+ function generateRegularAudio(chunkIndex, cleanedChunk, chunk) {
generateSpeech(cleanedChunk, appState.variables.bookDetectedLang, appState.elements.engineSelect.value, appState.elements.voiceSelect.value).then(audioUrl => {
if (audioUrl) {
appState.variables.audioQueue[chunkIndex] = { url: audioUrl, text: chunk };
@@ -1130,8 +1719,21 @@ document.addEventListener('DOMContentLoaded', () => {
}
+ // Remove every kind of reading highlight from the text display, then
+ // delegate to clearPdfHighlights() for the PDF view.
function clearAllHighlights() {
+ // Remove old style highlights
const allWordElements = appState.elements.textDisplay.querySelectorAll('span.highlight');
allWordElements.forEach(span => span.classList.remove('highlight'));
+
+ // Remove word-span elements
+ // (each wrapper is replaced by a text node with the same text)
+ const allWordSpans = appState.elements.textDisplay.querySelectorAll('.word-span');
+ allWordSpans.forEach(span => {
+ const text = span.textContent;
+ span.replaceWith(document.createTextNode(text));
+ });
+
+ // Clear active word highlights
+ // (runs after the word-span wrappers are gone, so it only affects
+ // .highlight-word elements that were not word-span wrappers)
+ const activeWords = appState.elements.textDisplay.querySelectorAll('.highlight-word');
+ activeWords.forEach(word => word.classList.remove('highlight-word'));
+
clearPdfHighlights();
}
@@ -1155,19 +1757,440 @@ document.addEventListener('DOMContentLoaded', () => {
appState.elements.textboxViewerWrapper.classList.remove('hidden');
appState.elements.zoomInBtn.disabled = true;
appState.elements.zoomOutBtn.disabled = true;
+ // Hide the PDF search controls. The startup log treats the search button as
+ // possibly missing from the DOM, and this function also runs on the
+ // PDF-failure fallback path, so a missing element must not throw here.
+ appState.elements.pdfSearchBtn?.parentElement?.classList.add('hidden');
+ appState.elements.pdfSearchBar?.classList.add('hidden');
+ }
+
+ async function renderTextLayer(pageNum) {
+ if (!appState.variables.pdfDoc || !appState.elements.pdfTextLayer) {
+ console.log('⚠️ Text layer rendering skipped - missing pdfDoc or pdfTextLayer element');
+ return;
+ }
+
+ // Check if we have backend text positions
+ if (!appState.variables.pdfTextPositions || appState.variables.pdfTextPositions.length === 0) {
+ console.warn('⚠️ No text positions from backend, skipping text layer');
+ return;
+ }
+
+ console.log(`📝 Rendering text layer for page ${pageNum} using backend positions`);
+
+ try {
+ const page = await appState.variables.pdfDoc.getPage(pageNum);
+ const viewport = page.getViewport({
+ scale: appState.variables.currentScale,
+ rotation: appState.variables.currentRotation
+ });
+
+ // Get text positions for this page from backend data
+ const pageData = appState.variables.pdfTextPositions.find(p => p.page_number === pageNum);
+ if (!pageData || !pageData.text_items || pageData.text_items.length === 0) {
+ console.warn(`⚠️ No text items for page ${pageNum}`);
+ return;
+ }
+
+ console.log(`📄 Page ${pageNum} has ${pageData.text_items.length} text items from backend`);
+
+ // Find the page wrapper and canvas - adaptive retry logic based on operation type
+ let pageWrapper = null;
+ let pageCanvas = null;
+
+ // During scroll operations, use more attempts with longer delays
+ const isScrollOperation = !appState.variables.isManualPageChange;
+ const maxAttempts = isScrollOperation ? 12 : 5; // More attempts for scroll
+ const baseDelay = isScrollOperation ? 300 : 150; // Longer base delay for scroll
+
+ console.log(`🔍 [DEBUG] Looking for page ${pageNum} wrapper (${isScrollOperation ? 'SCROLL' : 'MANUAL'}, max attempts: ${maxAttempts})`);
+
+ // First quick check
+ pageWrapper = appState.elements.pdfViewer.querySelector(`.pdf-page-wrapper[data-page="${pageNum}"]`);
+ pageCanvas = pageWrapper?.querySelector('canvas');
+
+ if (!pageCanvas || !pageWrapper) {
+ console.warn(`⚠️ [DEBUG] Canvas/wrapper for page ${pageNum} not immediately found, retrying...`);
+
+ // Retry with progressive delays
+ for (let attempt = 0; attempt < maxAttempts && (!pageCanvas || !pageWrapper); attempt++) {
+ await new Promise(resolve => setTimeout(resolve, baseDelay * (attempt + 1)));
+ pageWrapper = appState.elements.pdfViewer.querySelector(`.pdf-page-wrapper[data-page="${pageNum}"]`);
+ pageCanvas = pageWrapper?.querySelector('canvas');
+
+ if (pageCanvas && pageWrapper) {
+ console.log(`✅ [DEBUG] Found wrapper for page ${pageNum} on attempt ${attempt + 1}`);
+ break;
+ } else {
+ console.warn(`⏳ [DEBUG] Attempt ${attempt + 1}/${maxAttempts} failed for page ${pageNum}`);
+ }
+ }
+
+ if (!pageCanvas || !pageWrapper) {
+ console.error(`❌ [DEBUG] Could not find canvas/wrapper for page ${pageNum} after ${maxAttempts} attempts`);
+ if (isScrollOperation) {
+ console.warn(`⏭️ [DEBUG] Skipping text layer - page ${pageNum} may have been removed during scroll`);
+ }
+ return;
+ }
+ }
+
+ console.log(`📍 [DEBUG] Page ${pageNum} wrapper offset: top=${pageWrapper.offsetTop}, left=${pageWrapper.offsetLeft}`);
+
+ // Clear existing text layers for this page INSIDE the wrapper
+ const existingLayers = pageWrapper.querySelectorAll('.textLayer, .pdf-text-chunk');
+ if (existingLayers.length > 0) {
+ console.log(`🧹 [DEBUG] Removing ${existingLayers.length} existing text layers from page ${pageNum}`);
+ existingLayers.forEach(layer => layer.remove());
+ }
+
+ // Create container for this page's text layer INSIDE the page wrapper
+ // This ensures text stays with its page regardless of scrolling/DOM changes
+ const textLayerDiv = document.createElement('div');
+ textLayerDiv.className = 'textLayer';
+ textLayerDiv.dataset.page = pageNum;
+ textLayerDiv.style.position = 'absolute';
+ textLayerDiv.style.left = '0'; // Relative to pageWrapper
+ textLayerDiv.style.top = '0'; // Relative to pageWrapper
+ textLayerDiv.style.width = viewport.width + 'px';
+ textLayerDiv.style.height = viewport.height + 'px';
+ textLayerDiv.style.pointerEvents = 'auto';
+ textLayerDiv.style.zIndex = '1';
+
+ // Calculate scale factors (backend uses PDF coordinates, we need to scale to viewport)
+ const scaleX = viewport.width / pageData.width;
+ const scaleY = viewport.height / pageData.height;
+
+ // Create selectable text spans from backend positions with enhanced interaction
+ let textSpansCreated = 0;
+ for (const item of pageData.text_items) {
+ if (!item.text || item.text.trim() === '') continue;
+
+ const span = document.createElement('span');
+ span.textContent = item.text;
+ span.classList.add('pdf-text-span', 'selectable-text');
+
+ // Enhanced positioning with better accuracy
+ span.style.position = 'absolute';
+ span.style.left = (item.x * scaleX) + 'px';
+ span.style.top = (item.y * scaleY) + 'px';
+ span.style.fontSize = (item.size * scaleY) + 'px';
+ span.style.fontFamily = item.font || 'sans-serif';
+ span.style.lineHeight = '1.2';
+
+ // Make text selectable and visible on hover
+ span.style.color = 'transparent';
+ span.style.userSelect = 'text';
+ span.style.webkitUserSelect = 'text';
+ span.style.mozUserSelect = 'text';
+ span.style.msUserSelect = 'text';
+ span.style.cursor = 'text';
+ span.style.whiteSpace = 'pre';
+ span.style.transformOrigin = '0% 0%';
+ span.style.pointerEvents = 'auto';
+ span.style.display = 'inline-block';
+ span.style.width = (item.width * scaleX) + 'px';
+ span.style.height = (item.height * scaleY) + 'px';
+
+ // Add smooth transitions
+ span.style.transition = 'background-color 0.2s ease, color 0.2s ease, transform 0.15s ease';
+
+ // Store data
+ span.dataset.text = item.text;
+ span.dataset.page = pageNum;
+ span.dataset.itemId = item.id || textSpansCreated;
+
+ // Track mouse position for click vs drag detection
+ let mouseDownPos = null;
+
+ span.addEventListener('mousedown', function(e) {
+ mouseDownPos = { x: e.clientX, y: e.clientY };
+ // Prevent default browser behavior when Ctrl is held to ensure proper event handling
+ if (e.ctrlKey || e.metaKey) {
+ e.preventDefault();
+ }
+ });
+
+ // Enhanced click handler - distinguishes click from drag/select
+ span.addEventListener('mouseup', function(e) {
+ // Check if this was a drag (text selection) or a click
+ if (!mouseDownPos) return;
+
+ const dragDistance = Math.sqrt(
+ Math.pow(e.clientX - mouseDownPos.x, 2) +
+ Math.pow(e.clientY - mouseDownPos.y, 2)
+ );
+
+ // If mouse moved more than 5px, it's a drag/selection, not a click
+ const isDrag = dragDistance > 5;
+ mouseDownPos = null;
+
+ if (isDrag) {
+ // This was a text selection attempt, don't trigger playback
+ return;
+ }
+
+ // Read the current selection so a Ctrl/Cmd+click made while text is selected does not start playback
+ const selection = window.getSelection();
+ const hasSelection = selection && selection.toString().length > 0;
+
+ // Only trigger playback if Ctrl is held and no selection and it was a click
+ if ((e.ctrlKey || e.metaKey) && !hasSelection && !isDrag) {
+ e.stopPropagation();
+ e.preventDefault();
+
+ const clickedText = this.textContent;
+ console.log('📝 Text clicked with Ctrl:', clickedText.substring(0, 50) + '...');
+ console.log('📄 Clicked on page:', pageNum);
+
+ // Get full text from PDF viewer (all text spans across all pages)
+ const pdfViewer = appState.elements.pdfViewer;
+ const allTextSpans = pdfViewer.querySelectorAll('.pdf-text-span');
+ console.log('📚 Found', allTextSpans.length, 'text spans in PDF');
+
+ // Build full text from all pages starting from the clicked page
+ let fullText = '';
+ let textBeforeClick = '';
+ let foundClickedSpan = false;
+
+ for (const span of allTextSpans) {
+ const spanPage = parseInt(span.dataset.page);
+ const spanText = span.textContent;
+
+ // Check if this is the clicked span
+ if (span === this) {
+ foundClickedSpan = true;
+ console.log('✅ Found clicked span at position', textBeforeClick.length);
+ }
+
+ // Only include text from current page onwards
+ if (spanPage >= pageNum) {
+ fullText += spanText + ' ';
+ if (!foundClickedSpan) {
+ textBeforeClick += spanText + ' ';
+ }
+ }
+ }
+
+ console.log('📄 Full text from page', pageNum, 'onwards:', fullText.length, 'characters');
+ console.log('📍 Text before click:', textBeforeClick.length, 'characters');
+
+ if (foundClickedSpan) {
+ // Start from the clicked position
+ const textFromPoint = fullText.substring(textBeforeClick.length);
+ console.log('🎵 Starting playback from clicked position, remaining text:', textFromPoint.length, 'characters');
+ showNotification('🎵 Starting from: "' + clickedText.substring(0, 30) + '..."', 'info');
+ processTextAndPlay(textFromPoint, pageNum);
+ } else {
+ console.error('❌ Could not determine click position in document');
+ showNotification('❌ Could not locate text in document', 'error');
+ }
+ }
+ });
+
+ // Enhanced hover effects
+ span.addEventListener('mouseenter', function(e) {
+ // Don't highlight during text selection
+ const selection = window.getSelection();
+ if (selection && selection.toString().length > 0) return;
+
+ // Show background and make text slightly visible
+ this.style.backgroundColor = 'rgba(59, 130, 246, 0.15)';
+ this.style.color = 'rgba(0, 0, 0, 0.05)';
+ this.style.transform = 'scale(1.02)';
+
+ // Show tooltip if Ctrl is not held
+ if (!e.ctrlKey && !e.metaKey) {
+ this.title = 'Hold Ctrl+Click to start reading from here, or select to copy text';
+ } else {
+ this.title = 'Click to start reading from here';
+ }
+ });
+
+ span.addEventListener('mouseleave', function() {
+ this.style.backgroundColor = 'transparent';
+ this.style.color = 'transparent';
+ this.style.transform = 'scale(1)';
+ });
+
+ // Update cursor based on Ctrl key state
+ span.addEventListener('mousemove', function(e) {
+ if (e.ctrlKey || e.metaKey) {
+ this.style.cursor = 'pointer';
+ } else {
+ this.style.cursor = 'text';
+ }
+ });
+
+ // Store word index for highlighting during reading
+ span.dataset.wordIndex = textSpansCreated;
+
+ // Debug mode visualization
+ if (appState.variables.debugMode) {
+ span.style.outline = '1px solid rgba(255, 0, 0, 0.3)';
+ span.style.backgroundColor = 'rgba(0, 255, 0, 0.05)';
+ }
+
+ textLayerDiv.appendChild(span);
+ textSpansCreated++;
+ }
+
+ // Append text layer to pageWrapper (not global overlay) so it stays with its page
+ pageWrapper.appendChild(textLayerDiv);
+ console.log(`✅ [DEBUG] Text layer with ${textSpansCreated} spans appended to page ${pageNum} wrapper`);
+
+ // Now create clickable chunks from the text
+ await renderClickableChunks(pageNum, pageData, viewport, pageWrapper, scaleX, scaleY);
+
+ } catch (error) {
+ console.error('❌ Error rendering text layer for page', pageNum, ':', error);
+ showNotification('Error rendering text layer: ' + error.message, 'error');
+ }
+ }
+
+ /**
+  * Log that text interaction is available for a rendered page.
+  *
+  * Ctrl+Click handling lives on the individual text spans, so this function
+  * no longer builds any DOM; it only prints how many text items the page has
+  * plus two usage tips. Only `pageNum` and `pageData.text_items` are read;
+  * the remaining parameters are accepted but unused.
+  */
+ async function renderClickableChunks(pageNum, pageData, viewport, pageWrapper, scaleX, scaleY) {
+     const itemCount = pageData.text_items.length;
+     const messages = [
+         `✅ [DEBUG] Text interaction enabled for ${itemCount} items on page ${pageNum}`,
+         `ℹ️ TIP: Hold Ctrl and click on any text to start reading from that point`,
+         `ℹ️ TIP: Select text normally to copy it`,
+     ];
+     for (const message of messages) {
+         console.log(message);
+     }
+ }
+
+ /**
+  * Chunk `text` on the server and start TTS playback from its beginning.
+  *
+  * Stops any playback in progress, resets the queue state held on
+  * `appState.variables`, POSTs the text to `/api/process_text`, stores the
+  * returned chunks as `{ text, index }` objects, requests audio for up to
+  * three chunks and starts the audio queue 500 ms later.
+  *
+  * Calls can overlap because each one awaits a network round trip. Every
+  * call therefore takes a sequence number and abandons its work as soon as
+  * a newer call has started; otherwise a late response would add its chunks
+  * to the newer request's list and start playback a second time.
+  *
+  * @param {string} text - Text to read aloud.
+  * @param {number} [startPage] - Page the text starts on. Currently unused
+  *     by this function.
+  * @returns {Promise<void>} Resolves once playback has been scheduled or the
+  *     call was abandoned. Failures are logged and shown via showNotification
+  *     rather than thrown.
+  */
+ async function processTextAndPlay(text, startPage) {
+     // Nothing to read: report it and leave any current playback untouched.
+     if (typeof text !== 'string' || text.trim() === '') {
+         console.warn('⚠️ processTextAndPlay called without text');
+         showNotification('⚠️ No text to process', 'warning');
+         return;
+     }
+
+     // Sequence number of this call; a larger stored value means a newer call owns playback.
+     const requestId = (processTextAndPlay.latestRequestId || 0) + 1;
+     processTextAndPlay.latestRequestId = requestId;
+     const isSuperseded = () => processTextAndPlay.latestRequestId !== requestId;
+
+     try {
+         console.log('🎬 processTextAndPlay called with', text.length, 'characters');
+
+         // First, completely stop any current playback and reset state
+         if (appState.variables.isPlaying) {
+             console.log('⏹️ Stopping current playback before starting new one');
+             stopAudioQueue();
+
+             // Wait a bit for cleanup
+             await new Promise(resolve => setTimeout(resolve, 200));
+             if (isSuperseded()) return;
+         }
+
+         // Reset the state completely
+         appState.variables.audioQueue = [];
+         appState.variables.allTextChunks = [];
+         appState.variables.currentChunkIndex = 0;
+         appState.variables.isPlaying = false;
+         appState.variables.isPaused = false;
+
+         const response = await fetch('/api/process_text', {
+             method: 'POST',
+             headers: { 'Content-Type': 'application/json' },
+             body: JSON.stringify({
+                 text: text,
+                 chunk_size: 200,
+                 use_llm: true
+             })
+         });
+
+         // A newer call reset the shared state while we were waiting; do not touch it.
+         if (isSuperseded()) return;
+
+         if (!response.ok) {
+             console.error('Failed to process text:', response.status, response.statusText);
+             showNotification('❌ Failed to process text', 'error');
+             return;
+         }
+
+         const data = await response.json();
+         if (isSuperseded()) return;
+
+         const chunks = Array.isArray(data.chunks) ? data.chunks : [];
+         console.log('✅ Received', chunks.length, 'chunks from server');
+
+         if (data.status !== 'success' || chunks.length === 0) {
+             console.warn('⚠️ No chunks received from server');
+             showNotification('⚠️ No text to process', 'warning');
+             return;
+         }
+
+         // Convert chunks to the format expected by the player. Assigning a fresh
+         // array (instead of pushing) guarantees the list holds this response only.
+         appState.variables.allTextChunks = chunks.map((chunkText, index) => ({
+             text: chunkText,
+             index: index
+         }));
+
+         console.log('📝 Prepared', appState.variables.allTextChunks.length, 'chunks for TTS');
+
+         // Set playback state
+         appState.variables.currentChunkIndex = 0;
+         appState.variables.isPlaying = true;
+         appState.variables.isPaused = false;
+
+         // Start generating speech for the first few chunks
+         const chunksToPreload = Math.min(3, chunks.length);
+         console.log('🔊 Pre-loading', chunksToPreload, 'audio chunks');
+
+         for (let i = 0; i < chunksToPreload; i++) {
+             processAndQueueChunk(i);
+         }
+
+         // Start playback after a short delay, unless a newer call took over meanwhile
+         setTimeout(() => {
+             if (isSuperseded()) return;
+             console.log('▶️ Starting playback');
+             playAudioQueue();
+         }, 500);
+     } catch (error) {
+         console.error('❌ Error processing text:', error);
+         showNotification('❌ Error: ' + error.message, 'error');
+     }
}
- async function renderPage(num, skipTextExtraction = false, append = true) {
+ async function renderPage(num, skipTextExtraction = false, append = true, prepend = false) {
if (!appState.variables.pdfDoc) return;
+
+ console.log(`📄 [DEBUG] renderPage called - page: ${num}, skipText: ${skipTextExtraction}, append: ${append}, prepend: ${prepend}`);
+
+ // Validate page number
+ if (!num || num < 1 || num > appState.variables.pdfDoc.numPages) {
+ console.error('❌ [DEBUG] Invalid page number:', num, '(valid range: 1-' + appState.variables.pdfDoc.numPages + ')');
+ return;
+ }
+
+ // Clear text layer when replacing (not appending or prepending)
+ if (!append && !prepend && appState.elements.pdfTextLayer) {
+ console.log(`🧹 [DEBUG] Clearing global text layer (replacing)`);
+ appState.elements.pdfTextLayer.innerHTML = '';
+ }
+
+ // Clear PDF viewer when replacing (not appending or prepending)
+ if (!append && !prepend) {
+ console.log(`🧹 [DEBUG] Clearing PDF viewer (replacing)`);
+ appState.elements.pdfViewer.innerHTML = '';
+ }
+
appState.elements.pdfViewerWrapper.classList.remove('hidden');
appState.elements.textboxViewerWrapper.classList.add('hidden');
appState.elements.zoomInBtn.disabled = false;
appState.elements.zoomOutBtn.disabled = false;
+ appState.elements.pdfSearchBtn.parentElement.classList.remove('hidden');
appState.variables.currentPageNum = num;
+
+ console.log(`📌 [DEBUG] Current page set to ${num}, scale: ${appState.variables.currentScale}, rotation: ${appState.variables.currentRotation}`);
+
+ // Show book-info panel with PDF controls
+ const bookInfo = document.querySelector('#book-info');
+ if (bookInfo) {
+ bookInfo.classList.remove('hidden', 'opacity-0');
+ }
+
+ // Update zoom level display only during initial render or non-manual operations
+ if (!appState.variables.isManualPageChange) {
+ appState.elements.zoomLevel.textContent = Math.round(appState.variables.currentScale * 100) + '%';
+ }
+
+ // Update page input field max value
+ if (appState.elements.pageInput) {
+ appState.elements.pageInput.max = appState.variables.pdfDoc.numPages;
+ appState.elements.pageInput.value = num;
+ }
const renderSinglePage = async (pageNumber, container) => {
+ // Create a page wrapper for canvas and separator
+ const pageWrapper = document.createElement('div');
+ pageWrapper.classList.add('pdf-page-wrapper');
+ pageWrapper.dataset.page = pageNumber;
+
const canvas = document.createElement('canvas');
- const lastPage = appState.elements.pdfViewer.lastChild;
+ const lastPage = appState.elements.pdfViewer.querySelector('canvas');
canvas.classList.add('dark:invert');
canvas.ariaLabel = 'PDF page';
canvas.dataset.page = pageNumber;
@@ -1175,15 +2198,26 @@ document.addEventListener('DOMContentLoaded', () => {
if (lastPage) {
// Populate with last pages info, while we await.
canvas.height = lastPage.offsetHeight;
- canvas.width = lastPage.OffsetWidth;
+ canvas.width = lastPage.offsetWidth;
}
+ pageWrapper.appendChild(canvas);
+
+ // Add page separator with page number
+ const separator = document.createElement('div');
+ separator.classList.add('pdf-page-separator');
+ separator.innerHTML = `Page ${pageNumber} `;
+ pageWrapper.appendChild(separator);
+
// Immediately render the page, even while we wait for PDF.js
- if (append) container.appendChild(canvas);
- else container.prepend(canvas);
+ if (append) container.appendChild(pageWrapper);
+ else container.prepend(pageWrapper);
const page = await appState.variables.pdfDoc.getPage(pageNumber);
- const viewport = page.getViewport({ scale: appState.variables.currentScale });
+ const viewport = page.getViewport({
+ scale: appState.variables.currentScale,
+ rotation: appState.variables.currentRotation
+ });
const context = canvas.getContext('2d');
canvas.height = viewport.height;
canvas.width = viewport.width;
@@ -1195,6 +2229,18 @@ document.addEventListener('DOMContentLoaded', () => {
await page.render(renderContext).promise;
+ // Create a selectable text layer for copying text using PDF.js standard class
+ const selectableTextLayer = document.createElement('div');
+ selectableTextLayer.className = 'textLayer';
+ selectableTextLayer.style.position = 'absolute';
+ selectableTextLayer.style.left = '0';
+ selectableTextLayer.style.top = '0';
+ selectableTextLayer.style.width = viewport.width + 'px';
+ selectableTextLayer.style.height = viewport.height + 'px';
+
+ // Add the selectable text layer to the page wrapper
+ pageWrapper.appendChild(selectableTextLayer);
+
if (skipTextExtraction) return '';
let textContent = await page.getTextContent();
@@ -1204,6 +2250,7 @@ document.addEventListener('DOMContentLoaded', () => {
textContent = parsedTextContent.body;
}
+ // Store text content for reference (selection, etc.)
appState.variables.pdfTextContent[pageNumber] = textContent;
return mapTextContent(textContent);
};
@@ -1212,28 +2259,64 @@ document.addEventListener('DOMContentLoaded', () => {
const page1Text = await renderSinglePage(num, appState.elements.pdfViewer, append);
let page2Text = '';
- if (num + 1 <= pdfDoc.numPages)
+ if (num + 1 <= appState.variables.pdfDoc.numPages)
page2Text = await renderSinglePage(num + 1, appState.elements.pdfViewer, append);
if (!skipTextExtraction) {
const combinedText = page1Text + ' ' + page2Text;
appState.elements.textDisplay.textContent = combinedText;
}
+
+ // Render text layer for clickable chunks with adaptive delays
+ if (!skipTextExtraction) {
+ const isScrollOp = !appState.variables.isManualPageChange;
+ const delay = isScrollOp ? 300 : 150; // Longer delay for scroll operations
+
+ console.log(`📄 [DEBUG] Queuing text layer render for pages ${num} and ${num + 1} (${isScrollOp ? 'scroll' : 'manual'})`);
+ await new Promise(resolve => setTimeout(resolve, delay));
+ await renderTextLayer(num);
+ if (num + 1 <= appState.variables.pdfDoc.numPages) {
+ await renderTextLayer(num + 1);
+ }
+ }
} else {
- const pageText = await renderSinglePage(num, appState.elements.pdfViewer);
+ const pageText = await renderSinglePage(num, appState.elements.pdfViewer, append);
if (!skipTextExtraction)
appState.elements.textDisplay.textContent = pageText;
+
+ // Render text layer for clickable chunks with adaptive delays
+ if (!skipTextExtraction) {
+ const isScrollOp = !appState.variables.isManualPageChange;
+ const delay = isScrollOp ? 300 : 150; // Longer delay for scroll operations
+
+ console.log(`📄 [DEBUG] Queuing text layer render for page ${num} (${isScrollOp ? 'scroll' : 'manual'})`);
+ await new Promise(resolve => setTimeout(resolve, delay));
+ await renderTextLayer(num);
+ }
}
if (appState.variables.activeBook?.source === 'local' && !skipTextExtraction) {
appState.variables.localBooks[appState.variables.activeBook.id].text = appState.elements.textDisplay.textContent;
saveLocalBooks(appState);
}
+
+ // Update page number display
+ updateCurrentPage(appState);
}
async function highlightPdfChunk(chunkObject) {
const highlightLayer = document.getElementById('highlight-layer');
- if (!highlightLayer) return; // Quit if there is an error with the layer.
+ if (!highlightLayer) {
+ console.error('Highlight layer not found!');
+ return;
+ }
+
+ // Check if currentReadingPage exists before accessing properties
+ if (!appState.variables.currentReadingPage || !appState.variables.currentReadingPage.dataset) {
+ console.warn('⚠️ currentReadingPage not available for highlighting');
+ return;
+ }
+
highlightLayer.innerHTML = ''; // Clear previous highlights
const currentReadingPageNum = Number.parseInt(appState.variables.currentReadingPage.dataset.page);
@@ -1247,6 +2330,11 @@ document.addEventListener('DOMContentLoaded', () => {
let textToFind = chunkText;
let currentPage = appState.variables.pdfTextContent[currentReadingPageNum];
+ if (!currentPage) {
+ console.debug('No PDF text content for page:', currentReadingPageNum);
+ return;
+ }
+
// Build complete text from page.
const pageText = normalizeText(mapTextContent(currentPage));
@@ -1265,7 +2353,7 @@ document.addEventListener('DOMContentLoaded', () => {
}
if (bestMatchStart === -1) {
- console.debug('Could not find any match in the text: ', pageText);
+ console.debug('Could not find any match in the text. Chunk:', chunkText.substring(0, 50), '... Page text:', pageText.substring(0, 100));
return;
}
@@ -1287,6 +2375,7 @@ document.addEventListener('DOMContentLoaded', () => {
currentTextPos = itemEndPos;
}
+ console.debug(`Highlighting ${itemsToHighlight.length} items on page ${currentReadingPageNum}`);
// Now actually highlight the items
for (const item of itemsToHighlight) {
@@ -1295,6 +2384,64 @@ document.addEventListener('DOMContentLoaded', () => {
createAndAppendHighlight(item, viewport, currentReadingPageNum, highlightLayer);
}
+
+ // Also add chunk-level highlighting to text spans
+ addPdfChunkHighlightToSpans(currentReadingPageNum, chunkText);
+ }
+
+ /**
+  * Add chunk-level highlighting to the text-layer spans of one page.
+  *
+  * Removes the `pdf-text-chunk-highlight` class (and the `green` / `blue`
+  * colour classes) from every previously highlighted span, then locates the
+  * chunk inside the page's span text and marks each span that overlaps it.
+  * Matching uses whitespace-collapsed text and only the first 100 characters
+  * of the chunk; the highlighted range still covers the chunk's full length.
+  *
+  * @param {number} pageNum - Page whose spans are searched (`data-page` value).
+  * @param {string} chunkText - Text of the chunk currently being read.
+  * @returns {void}
+  */
+ function addPdfChunkHighlightToSpans(pageNum, chunkText) {
+     // Clear previous chunk highlights, including their colour classes
+     document.querySelectorAll('.pdf-text-chunk-highlight').forEach(span => {
+         span.classList.remove('pdf-text-chunk-highlight', 'green', 'blue');
+     });
+
+     const collapseWhitespace = (value) => String(value ?? '').trim().replace(/\s+/g, ' ');
+
+     const targetText = collapseWhitespace(chunkText);
+     if (!targetText) return;
+
+     // The text-layer spans created in this file carry the class `pdf-text-span`.
+     // `backend-text-span` is still accepted so spans tagged that way keep working.
+     // NOTE(review): confirm whether anything still emits `backend-text-span`.
+     const spanSelector =
+         `.backend-text-span[data-page="${pageNum}"], .pdf-text-span[data-page="${pageNum}"]`;
+
+     // Normalise every span once and record its offsets in the joined text.
+     // Spans that normalise to '' are skipped so the offsets stay aligned with
+     // the string that is actually searched.
+     const entries = [];
+     let offset = 0;
+     for (const span of document.querySelectorAll(spanSelector)) {
+         const spanText = collapseWhitespace(span.textContent);
+         if (!spanText) continue;
+         entries.push({ span, text: spanText, start: offset, end: offset + spanText.length });
+         offset += spanText.length + 1; // +1 for the space between spans
+     }
+     if (entries.length === 0) return;
+
+     const spansText = entries.map(entry => entry.text).join(' ');
+
+     // Try to find the chunk in the spans text (first 100 characters only)
+     const matchStart = spansText.indexOf(targetText.substring(0, 100));
+     if (matchStart === -1) {
+         console.debug('Could not match chunk to text spans');
+         return;
+     }
+     const matchEnd = matchStart + targetText.length;
+
+     // Apply chunk highlighting to every span overlapping [matchStart, matchEnd)
+     const highlightColor = appState.variables.localPrefs?.highlightColor || '';
+     for (const entry of entries) {
+         if (entry.end > matchStart && entry.start < matchEnd) {
+             entry.span.classList.add('pdf-text-chunk-highlight');
+             if (highlightColor) {
+                 entry.span.classList.add(highlightColor);
+             }
+         }
+     }
}
// Helper function to create and append highlight (extracted for cleaner code)
@@ -1304,10 +2451,10 @@ document.addEventListener('DOMContentLoaded', () => {
highlight.className = 'highlight pdf-highlight';
let layerOffset = 0;
- // Calculate how much we scrolled.
+ // Calculate how much we scrolled - accumulate heights of all previous pages
for (const child of appState.elements.pdfViewer.children) {
if (Number.parseInt(child.dataset.page) < pageIndex) {
- layerOffset += child.height;
+ layerOffset += child.offsetHeight || child.height;
} else {
break;
}
@@ -1316,7 +2463,7 @@ document.addEventListener('DOMContentLoaded', () => {
if (appState.variables.localPrefs.highlightColor)
highlight.className += ` ${appState.variables.localPrefs.highlightColor}`;
- const leftOffset = (appState.variables.isTwoPageView && pageIndex === 1) ? appState.elements.pdfViewer.children[0].width : 0;
+ const leftOffset = (appState.variables.isTwoPageView && pageIndex === 1) ? (appState.elements.pdfViewer.children[0].offsetWidth || appState.elements.pdfViewer.children[0].width) : 0;
highlight.style.left = `${tx[4] + leftOffset}px`;
highlight.style.top = `${tx[5] - 10 + layerOffset}px`;
@@ -1416,11 +2563,64 @@ document.addEventListener('DOMContentLoaded', () => {
appState.elements.libraryBtn.classList.add('bg-indigo-100', 'dark:bg-indigo-900', 'dark:bg-opacity-30');
appState.elements.bookView.classList.add('hidden');
- if (appState.variables.currentUser)
- appState.elements.mainDiv.appendChild(await renderUserPdfs(currentUser));
- else appState.elements.mainDiv.appendChild(createFilesGrid([]));
+ if (appState.variables.currentUser) {
+ // Define callback to load PDF when clicked
+ const onPdfClick = async (pdf) => {
+ // Close the library view
+ const library = document.getElementById('library-file-grid');
+ if (library) library.remove();
+
+ // Find the book in onlineBooks that matches this PDF
+ const book = appState.variables.onlineBooks.find(b => b.content === pdf.url);
+ if (book) {
+ setActiveBook({ ...book, source: 'online' });
+ } else {
+ // If not found, create a book object from the PDF data
+ const newBook = {
+ id: pdf.bookId || pdf.id,
+ title: pdf.name,
+ content: pdf.url,
+ is_pdf: true,
+ source: 'online'
+ };
+ setActiveBook(newBook);
+ }
+ };
+
+ appState.elements.mainDiv.appendChild(await renderUserPdfs(appState.variables.currentUser, onPdfClick));
+ } else {
+ appState.elements.mainDiv.appendChild(createFilesGrid([]));
+ }
});
+ // Keyboard-shortcuts help modal.
+ // The modal is shown/hidden by toggling its `hidden` class. The dismiss
+ // handlers (click on the backdrop, Escape key) are wired whenever the modal
+ // exists, so it can always be closed even if the dedicated close button is
+ // missing from the page.
+ if (appState.elements.shortcutsModal) {
+     const { shortcutsBtn, shortcutsCloseBtn, shortcutsModal } = appState.elements;
+     const openShortcuts = () => shortcutsModal.classList.remove('hidden');
+     const closeShortcuts = () => shortcutsModal.classList.add('hidden');
+
+     // Shortcuts button - Open keyboard shortcuts help
+     if (shortcutsBtn) {
+         shortcutsBtn.addEventListener('click', openShortcuts);
+     }
+
+     // Close button inside the modal
+     if (shortcutsCloseBtn) {
+         shortcutsCloseBtn.addEventListener('click', closeShortcuts);
+     }
+
+     // Also close on click outside: only when the click lands on the modal
+     // element itself, not on something inside it
+     shortcutsModal.addEventListener('click', (e) => {
+         if (e.target === shortcutsModal) {
+             closeShortcuts();
+         }
+     });
+
+     // Close on Escape key, but only while the modal is visible
+     document.addEventListener('keydown', (e) => {
+         if (e.key === 'Escape' && !shortcutsModal.classList.contains('hidden')) {
+             closeShortcuts();
+         }
+     });
+ }
+
+
appState.elements.playbackSpeed.addEventListener('input', () => {
appState.elements.audioPlayer.playbackRate = appState.elements.playbackSpeed.value;
appState.elements.playbackSpeedDisplay.textContent = appState.elements.playbackSpeed.value.toString() + "x";
@@ -1505,33 +2705,48 @@ document.addEventListener('DOMContentLoaded', () => {
if (data.status === 'completed') {
clearInterval(interval);
- showNotification('PDF OCR completed successfully.', 'success');
+
+ // OCR completed - update with enhanced/merged text
+ const previousTextLength = appState.variables.fullBookText?.length || 0;
+ const newTextLength = data.text?.length || 0;
+
+ if (newTextLength > previousTextLength) {
+ console.log(`✨ OCR enhanced text: ${previousTextLength} → ${newTextLength} chars`);
+ showNotification('OCR completed! Text enhanced with additional content.', 'success');
+ } else {
+ console.log('✅ OCR completed (no additional content found)');
+ showNotification('OCR completed successfully.', 'success');
+ }
+
+ appState.variables.fullBookText = data.text;
if (bookId && currentUser) {
const newOnlineBook = await saveOcrText(bookId, data.text);
if (newOnlineBook) {
setActiveBook({ ...newOnlineBook, source: 'online' });
} else {
- // Fallback if the book couldn't be found after saving
console.error("Could not find online book after saving OCR text. Falling back to text view.");
- appState.variables.fullBookText = data.text;
+ appState.variables.totalTextPages = Math.max(1, Math.ceil(appState.variables.fullBookText.length / appState.variables.charsPerPage));
renderTextPage(1);
}
} else {
- // Anonymous user flow
+ // Anonymous user - update local storage
if (appState.variables.activeBook?.source === 'local') {
appState.variables.localBooks[appState.variables.activeBook.id].text = appState.variables.fullBookText;
saveLocalBooks(appState);
}
+ appState.variables.totalTextPages = Math.max(1, Math.ceil(appState.variables.fullBookText.length / appState.variables.charsPerPage));
renderTextPage(1);
}
} else if (data.status === 'failed') {
clearInterval(interval);
console.error('OCR failed:', data.detail);
- showBookModal(`OCR failed: ${data.detail}`, 'error');
+ showNotification(`OCR failed: ${data.detail}`, 'error');
- } else console.log('OCR in progress...');
+ } else {
+ console.log('🔄 OCR in progress...');
+ }
} catch (error) {
clearInterval(interval);
console.error('Error polling for OCR result:', error);
@@ -1552,6 +2767,7 @@ document.addEventListener('DOMContentLoaded', () => {
const data = await response.json();
if (data.status === 'completed') {
+ // PDF fully processed with both extraction and OCR
if (bookId && appState.variables.currentUser) {
saveOcrText(bookId, data.text);
}
@@ -1563,7 +2779,22 @@ document.addEventListener('DOMContentLoaded', () => {
appState.variables.totalTextPages = Math.max(1, Math.ceil(appState.variables.fullBookText.length / appState.variables.charsPerPage));
renderTextPage(1);
} else if (data.status === 'ocr_started') {
- showNotification('PDF contains no text. Starting background OCR...', 'info');
+ // OCR is running in background
+ if (data.partial && data.text) {
+ // We have extracted text, show it immediately while OCR enhances
+ console.log('📄 Using extracted text immediately, OCR will enhance it');
+ appState.variables.fullBookText = data.text;
+ if (appState.variables.activeBook?.source === 'local') {
+ appState.variables.localBooks[appState.variables.activeBook.id].text = appState.variables.fullBookText;
+ saveLocalBooks(appState);
+ }
+ appState.variables.totalTextPages = Math.max(1, Math.ceil(appState.variables.fullBookText.length / appState.variables.charsPerPage));
+ renderTextPage(1);
+ showNotification('Text loaded. OCR running in background to enhance...', 'info');
+ } else {
+ // No extracted text, waiting for OCR
+ showNotification('PDF contains no extractable text. Running OCR...', 'info');
+ }
pollOcrResult(data.task_id, bookId);
} else {
throw new Error('Received an unexpected response from the server.');
@@ -1576,6 +2807,9 @@ document.addEventListener('DOMContentLoaded', () => {
}
appState.elements.pdfFileInput.addEventListener('change', async (e) => {
+ e.preventDefault(); // Prevent default behavior
+ e.stopPropagation(); // Stop event bubbling
+
const file = e.target.files[0];
if (!file) return;
@@ -1615,9 +2849,58 @@ document.addEventListener('DOMContentLoaded', () => {
return;
}
} else {
- // Fallback to local IndexedDB for anonymous users
- await handlePdfUpload(file);
- showNotification('PDF text extracted! Sign in to save PDF files.')
+ // Fallback to local PDF rendering for anonymous users
+ console.log('📄 Loading PDF locally for anonymous user');
+ const arrayBuffer = await file.arrayBuffer();
+ if (appState.variables.activeBook?.source === 'local') {
+ // Store PDF data in memory (not localStorage due to size limits)
+ appState.variables.localBooks[appState.variables.activeBook.id].pdfData = arrayBuffer;
+ appState.variables.localBooks[appState.variables.activeBook.id].pdfId = 'local';
+ // Don't save to localStorage - pdfData will be ignored by JSON.stringify
+ }
+ // Clear text positions for local PDFs (no backend extraction)
+ appState.variables.pdfTextPositions = [];
+ console.log('⚠️ Local PDF mode: Text positions not available without backend');
+ // Render the PDF directly
+ console.log('🔄 Initializing PDF.js with loaded data');
+ try {
+ appState.variables.pdfDoc = await pdfjsLib.getDocument({ data: arrayBuffer }).promise;
+ console.log('✅ PDF loaded successfully!', {
+ numPages: appState.variables.pdfDoc.numPages,
+ initialScale: appState.variables.currentScale,
+ initialRotation: appState.variables.currentRotation
+ });
+ console.log('🎨 Rendering first page...');
+
+ // Set manual page change flag to prevent observer triggers during initial load
+ appState.variables.isManualPageChange = true;
+ await renderPage(1);
+
+ // Delay observer attachment to prevent multiple triggers
+ setTimeout(() => {
+ const firstPage = appState.elements.pdfViewer.children[0];
+ if (firstPage && window.upwardsScroll) {
+ window.upwardsScroll.observe(firstPage);
+ console.log('✅ [DEBUG] Local PDF - upward observer attached');
+ }
+ if (window.downwardsScroll) {
+ const toolbarSpace = document.querySelector("#toolbar-space");
+ if (toolbarSpace) {
+ window.downwardsScroll.observe(toolbarSpace);
+ console.log('✅ [DEBUG] Local PDF - downward observer attached');
+ }
+ }
+ appState.variables.isManualPageChange = false;
+ }, 500);
+ } catch (error) {
+ console.error('❌ Failed to load PDF:', error);
+ showNotification('Failed to load PDF: Invalid or corrupted PDF file', 'error');
+ hideFileModal(appState);
+ return;
+ }
+ showNotification('PDF loaded! Note: PDFs are not persisted. Sign in to save permanently.', 'info');
+ hideFileModal(appState);
+ return;
}
} else if (fileExtension === 'epub') {
if (appState.variables.activeBook.source === 'local') {
@@ -1677,23 +2960,369 @@ document.addEventListener('DOMContentLoaded', () => {
showNotification(`An error occurred: ${error.message}`, 'error');
appState.elements.textDisplay.innerHTML = '';
}
- } showNotification('Please select a valid PDF, EPUB, or DOCX file.', 'warn');
+ } else {
+ showNotification('Please select a valid PDF, EPUB, or DOCX file.', 'warn');
+ }
+ // Reset the file input so the same file can be selected again
+ e.target.value = '';
hideFileModal(appState);
});
appState.elements.zoomInBtn.addEventListener('click', () => {
if (!appState.variables.pdfDoc) return;
+ console.log(`🔍 [DEBUG] Zoom In clicked - current scale: ${appState.variables.currentScale}`);
+
+ // Disable infinite scroll during manual zoom
+ appState.variables.isManualPageChange = true;
+ if (window.downwardsScroll) window.downwardsScroll.disconnect();
+ if (window.upwardsScroll) window.upwardsScroll.disconnect();
+
appState.variables.currentScale += 0.25;
- renderPage(appState.variables.currentPageNum);
+ console.log(`➕ [DEBUG] New scale: ${appState.variables.currentScale}`);
+ appState.elements.zoomLevel.textContent = Math.round(appState.variables.currentScale * 100) + '%';
+ appState.elements.pdfViewer.innerHTML = '';
+ if (appState.elements.pdfTextLayer) {
+ console.log(`🧹 [DEBUG] Clearing text layer for zoom in`);
+ appState.elements.pdfTextLayer.innerHTML = '';
+ }
+
+ renderPage(appState.variables.currentPageNum).then(() => {
+ appState.variables.isManualPageChange = false;
+ setTimeout(() => {
+ const topPage = appState.elements.pdfViewer.children[0];
+ const bottomPage = appState.elements.pdfViewer.children[appState.elements.pdfViewer.children.length - 1];
+ if (topPage && window.upwardsScroll) window.upwardsScroll.observe(topPage);
+ if (bottomPage && window.downwardsScroll) window.downwardsScroll.observe(bottomPage);
+ console.log('✅ [DEBUG] Zoom in complete, observers reconnected');
+ }, 100);
+ });
});
appState.elements.zoomOutBtn.addEventListener('click', () => {
if (!appState.variables.pdfDoc) return;
+ console.log(`🔍 [DEBUG] Zoom Out clicked - current scale: ${appState.variables.currentScale}`);
+ appState.variables.isManualPageChange = true;
+
+ if (window.downwardsScroll) window.downwardsScroll.disconnect();
+ if (window.upwardsScroll) window.upwardsScroll.disconnect();
+
appState.variables.currentScale = Math.max(0.25, appState.variables.currentScale - 0.25);
- renderPage(appState.variables.currentPageNum);
+ console.log(`➖ [DEBUG] New scale: ${appState.variables.currentScale}`);
+ appState.elements.zoomLevel.textContent = Math.round(appState.variables.currentScale * 100) + '%';
+ appState.elements.pdfViewer.innerHTML = '';
+ if (appState.elements.pdfTextLayer) {
+ console.log(`🧹 [DEBUG] Clearing text layer for zoom out`);
+ appState.elements.pdfTextLayer.innerHTML = '';
+ }
+
+ renderPage(appState.variables.currentPageNum).then(() => {
+ appState.variables.isManualPageChange = false;
+ setTimeout(() => {
+ const topPage = appState.elements.pdfViewer.children[0];
+ const bottomPage = appState.elements.pdfViewer.children[appState.elements.pdfViewer.children.length - 1];
+ if (topPage && window.upwardsScroll) window.upwardsScroll.observe(topPage);
+ if (bottomPage && window.downwardsScroll) window.downwardsScroll.observe(bottomPage);
+ console.log('✅ [DEBUG] Zoom out complete, observers reconnected');
+ }, 100);
+ });
+ });
+
+    // Fit to width button: rescale the PDF so a page spans the viewer wrapper's width.
+    appState.elements.fitWidthBtn.addEventListener('click', () => {
+        if (!appState.variables.pdfDoc) return;
+
+        // While this flag is set the infinite-scroll observers return early, so they
+        // cannot react to the viewer being emptied and re-rendered below.
+        appState.variables.isManualPageChange = true;
+
+        // Disconnect infinite scroll to prevent interference
+        if (window.downwardsScroll) window.downwardsScroll.disconnect();
+        if (window.upwardsScroll) window.upwardsScroll.disconnect();
+
+        const container = appState.elements.pdfViewerWrapper;
+        const containerWidth = container.clientWidth;
+
+        // Page 1 at scale 1 (with the current rotation) is the reference size.
+        appState.variables.pdfDoc.getPage(1).then(page => {
+            const viewport = page.getViewport({ scale: 1, rotation: appState.variables.currentRotation });
+            // 40 is subtracted from the wrapper width before dividing
+            // (presumably a padding allowance - confirm against the stylesheet).
+            const scale = (containerWidth - 40) / viewport.width;
+            if (!(scale > 0)) {
+                // Hidden or very narrow wrapper: a zero, negative or NaN scale cannot be
+                // rendered, so keep the current view instead of clearing it.
+                console.warn('⚠️ Fit width skipped: computed scale is not positive', scale);
+                return undefined;
+            }
+
+            appState.variables.currentScale = scale;
+            appState.elements.zoomLevel.textContent = Math.round(scale * 100) + '%';
+            appState.elements.pdfViewer.innerHTML = '';
+            if (appState.elements.pdfTextLayer) {
+                appState.elements.pdfTextLayer.innerHTML = '';
+            }
+
+            return renderPage(appState.variables.currentPageNum);
+        }).catch(error => {
+            console.error('❌ Fit width failed:', error);
+        }).finally(() => {
+            // Runs after success and failure alike; otherwise a rejected promise would
+            // leave the flag set and the observers disconnected for good.
+            appState.variables.isManualPageChange = false;
+            // Reconnect observers
+            setTimeout(() => {
+                const topPage = appState.elements.pdfViewer.children[0];
+                const bottomPage = appState.elements.pdfViewer.children[appState.elements.pdfViewer.children.length - 1];
+                if (topPage && window.upwardsScroll) window.upwardsScroll.observe(topPage);
+                if (bottomPage && window.downwardsScroll) window.downwardsScroll.observe(bottomPage);
+                console.log('✅ [DEBUG] Fit width complete, observers reconnected');
+            }, 100);
+        });
+    });
+
+    // Fit to page button: rescale the PDF so a whole page fits in the visible area.
+    appState.elements.fitPageBtn.addEventListener('click', () => {
+        if (!appState.variables.pdfDoc) return;
+
+        // While this flag is set the infinite-scroll observers return early, so they
+        // cannot react to the viewer being emptied and re-rendered below.
+        appState.variables.isManualPageChange = true;
+
+        // Disconnect infinite scroll to prevent interference
+        if (window.downwardsScroll) window.downwardsScroll.disconnect();
+        if (window.upwardsScroll) window.upwardsScroll.disconnect();
+
+        const container = appState.elements.pdfViewerWrapper;
+        const containerWidth = container.clientWidth;
+        // 200 is reserved out of the window height
+        // (presumably for toolbars/controls - confirm against the layout).
+        const containerHeight = window.innerHeight - 200;
+
+        // Page 1 at scale 1 (with the current rotation) is the reference size.
+        appState.variables.pdfDoc.getPage(1).then(page => {
+            const viewport = page.getViewport({ scale: 1, rotation: appState.variables.currentRotation });
+            const scaleWidth = (containerWidth - 40) / viewport.width;
+            const scaleHeight = (containerHeight - 40) / viewport.height;
+            // The smaller ratio is the one at which both dimensions fit.
+            const scale = Math.min(scaleWidth, scaleHeight);
+            if (!(scale > 0)) {
+                // Window shorter than the reserved space (or a hidden wrapper): a zero,
+                // negative or NaN scale cannot be rendered, so keep the current view.
+                console.warn('⚠️ Fit page skipped: computed scale is not positive', scale);
+                return undefined;
+            }
+
+            appState.variables.currentScale = scale;
+            appState.elements.zoomLevel.textContent = Math.round(scale * 100) + '%';
+            appState.elements.pdfViewer.innerHTML = '';
+            if (appState.elements.pdfTextLayer) {
+                appState.elements.pdfTextLayer.innerHTML = '';
+            }
+
+            return renderPage(appState.variables.currentPageNum);
+        }).catch(error => {
+            console.error('❌ Fit page failed:', error);
+        }).finally(() => {
+            // Runs after success and failure alike; otherwise a rejected promise would
+            // leave the flag set and the observers disconnected for good.
+            appState.variables.isManualPageChange = false;
+            // Reconnect observers
+            setTimeout(() => {
+                const topPage = appState.elements.pdfViewer.children[0];
+                const bottomPage = appState.elements.pdfViewer.children[appState.elements.pdfViewer.children.length - 1];
+                if (topPage && window.upwardsScroll) window.upwardsScroll.observe(topPage);
+                if (bottomPage && window.downwardsScroll) window.downwardsScroll.observe(bottomPage);
+                console.log('✅ [DEBUG] Fit page complete, observers reconnected');
+            }, 100);
+        });
+    });
+
+    // Rotate left button: turn the PDF 90 degrees counterclockwise and re-render.
+    appState.elements.rotateLeftBtn.addEventListener('click', () => {
+        if (!appState.variables.pdfDoc) return;
+
+        // The infinite-scroll observers return early while this flag is set.
+        appState.variables.isManualPageChange = true;
+        // Adding 360 before the modulo keeps the result in 0..359 when stepping below 0.
+        appState.variables.currentRotation = (appState.variables.currentRotation - 90 + 360) % 360;
+        appState.elements.pdfViewer.innerHTML = '';
+        if (appState.elements.pdfTextLayer) {
+            appState.elements.pdfTextLayer.innerHTML = '';
+        }
+        renderPage(appState.variables.currentPageNum)
+            .catch(error => {
+                console.error('❌ Rotate left failed:', error);
+            })
+            .finally(() => {
+                // Reset on failure too, so a rejected render cannot leave the flag stuck.
+                appState.variables.isManualPageChange = false;
+            });
+    });
+
+    // Rotate right button: turn the PDF 90 degrees clockwise and re-render.
+    appState.elements.rotateRightBtn.addEventListener('click', () => {
+        if (!appState.variables.pdfDoc) return;
+
+        // The infinite-scroll observers return early while this flag is set.
+        appState.variables.isManualPageChange = true;
+        appState.variables.currentRotation = (appState.variables.currentRotation + 90) % 360;
+        appState.elements.pdfViewer.innerHTML = '';
+        if (appState.elements.pdfTextLayer) {
+            appState.elements.pdfTextLayer.innerHTML = '';
+        }
+        renderPage(appState.variables.currentPageNum)
+            .catch(error => {
+                console.error('❌ Rotate right failed:', error);
+            })
+            .finally(() => {
+                // Reset on failure too, so a rejected render cannot leave the flag stuck.
+                appState.variables.isManualPageChange = false;
+            });
+    });
+
+    // Debug mode toggle button (optional element: wired only when it exists)
+    if (appState.elements.debugModeBtn) {
+        appState.elements.debugModeBtn.addEventListener('click', () => {
+            const button = appState.elements.debugModeBtn;
+            const enabled = !appState.variables.debugMode;
+            appState.variables.debugMode = enabled;
+
+            // Active styling is present exactly when debug mode is on; the neutral
+            // styling is present exactly when it is off.
+            ['bg-indigo-500', 'text-white'].forEach(cls => button.classList.toggle(cls, enabled));
+            ['bg-gray-100', 'dark:bg-gray-800'].forEach(cls => button.classList.toggle(cls, !enabled));
+
+            const message = enabled
+                ? '🐛 Debug mode enabled - text boundaries visible'
+                : 'Debug mode disabled';
+            showNotification(message, 'info');
+
+            // Nothing to redraw unless a PDF is loaded.
+            if (!appState.variables.pdfDoc) return;
+
+            // Re-render the current page so the debug styling is applied or removed.
+            appState.elements.pdfViewer.innerHTML = '';
+            if (appState.elements.pdfTextLayer) {
+                appState.elements.pdfTextLayer.innerHTML = '';
+            }
+            renderPage(appState.variables.currentPageNum);
+        });
+    }
+
+    // First page button: jump to page 1 of the PDF, or of the paginated text view.
+    appState.elements.firstPageBtn.addEventListener('click', () => {
+        if (appState.variables.pdfDoc) {
+            // The infinite-scroll observers return early while this flag is set.
+            appState.variables.isManualPageChange = true;
+            appState.elements.pdfViewer.innerHTML = '';
+            renderPage(1)
+                .catch(error => {
+                    console.error('❌ Failed to render first page:', error);
+                })
+                .finally(() => {
+                    // Reset on failure too, so a rejected render cannot leave the flag stuck.
+                    appState.variables.isManualPageChange = false;
+                });
+        } else if (appState.variables.fullBookText) {
+            appState.elements.textDisplay.innerHTML = '';
+            renderTextPage(1);
+        }
+    });
+
+    // Last page button: jump to the final page of the PDF, or of the paginated text view.
+    appState.elements.lastPageBtn.addEventListener('click', () => {
+        if (appState.variables.pdfDoc) {
+            const lastPage = appState.variables.pdfDoc.numPages;
+            // The infinite-scroll observers return early while this flag is set.
+            appState.variables.isManualPageChange = true;
+            appState.elements.pdfViewer.innerHTML = '';
+            renderPage(lastPage)
+                .catch(error => {
+                    console.error('❌ Failed to render last page:', error);
+                })
+                .finally(() => {
+                    // Reset on failure too, so a rejected render cannot leave the flag stuck.
+                    appState.variables.isManualPageChange = false;
+                });
+        } else if (appState.variables.fullBookText) {
+            appState.elements.textDisplay.innerHTML = '';
+            renderTextPage(appState.variables.totalTextPages);
+        }
+    });
+
+    // Page input - jump to page when Enter is pressed
+    appState.elements.pageInput.addEventListener('keydown', (e) => {
+        if (e.key !== 'Enter') return;
+
+        // Explicit radix 10. Non-numeric input yields NaN, which fails both range
+        // checks below and is therefore rejected.
+        const pageNum = parseInt(appState.elements.pageInput.value, 10);
+        console.log(`📌 [DEBUG] Goto page triggered - page: ${pageNum}`);
+
+        if (appState.variables.pdfDoc) {
+            if (pageNum >= 1 && pageNum <= appState.variables.pdfDoc.numPages) {
+                console.log(`✅ [DEBUG] Valid page number, rendering page ${pageNum}`);
+                // The infinite-scroll observers return early while this flag is set.
+                appState.variables.isManualPageChange = true;
+                appState.elements.pdfViewer.innerHTML = '';
+                if (appState.elements.pdfTextLayer) {
+                    console.log(`🧹 [DEBUG] Clearing text layer for goto page`);
+                    appState.elements.pdfTextLayer.innerHTML = '';
+                }
+                renderPage(pageNum).then(() => {
+                    appState.variables.isManualPageChange = false;
+                    console.log(`✅ [DEBUG] Page ${pageNum} rendered, isManualPageChange reset`);
+                }).catch(error => {
+                    // Reset on failure too, so a rejected render cannot leave the flag stuck.
+                    appState.variables.isManualPageChange = false;
+                    console.error(`❌ [DEBUG] Failed to render page ${pageNum}:`, error);
+                });
+            } else {
+                console.error(`❌ [DEBUG] Invalid page number: ${pageNum} (valid: 1-${appState.variables.pdfDoc.numPages})`);
+            }
+        } else if (appState.variables.fullBookText) {
+            // Text mode: out-of-range input is ignored silently.
+            if (pageNum >= 1 && pageNum <= appState.variables.totalTextPages) {
+                appState.elements.textDisplay.innerHTML = '';
+                renderTextPage(pageNum);
+            }
+        }
+    });
+
+    // PDF Search functionality: the toolbar button shows or hides the search bar.
+    appState.elements.pdfSearchBtn.addEventListener('click', () => {
+        console.log('🔍 PDF Search button clicked');
+        // classList.toggle returns true when the class is present after the call.
+        const nowHidden = appState.elements.pdfSearchBar.classList.toggle('hidden');
+        if (nowHidden) {
+            console.log('❌ Search bar hidden, clearing results');
+            clearSearchHighlights();
+            return;
+        }
+        console.log('📝 Search bar shown, focusing input');
+        appState.elements.pdfSearchInput.focus();
+    });
+
+    // The close button hides the bar, empties the query and drops stored matches.
+    appState.elements.pdfSearchClose.addEventListener('click', () => {
+        appState.elements.pdfSearchBar.classList.add('hidden');
+        appState.elements.pdfSearchInput.value = '';
+        clearSearchHighlights();
+    });
+
+    // Search in PDF: Enter in the query field starts a search
+    appState.elements.pdfSearchInput.addEventListener('keydown', (e) => {
+        if (e.key !== 'Enter') return;
+        performPdfSearch();
+    });
+
+    // Step forward through the matches.
+    appState.elements.pdfSearchNext.addEventListener('click', () => {
+        const forward = 1;
+        navigateSearchResults(forward);
+    });
+
+ appState.elements.pdfSearchPrev.addEventListener('click', () => {
+ navigateSearchResults(-1);
});
+    // Function to perform PDF search
+    /**
+     * Scans every page of the loaded PDF for the (trimmed, lower-cased) query in the
+     * search input and stores the hits in appState.variables.searchMatches.
+     *
+     * Each hit records the page number, the index of the text item on that page, the
+     * character offset inside the item and the item's original text. Matching is a
+     * case-insensitive substring test per text item, so a phrase split across two
+     * items is not found. After the scan the result label is updated and, if there
+     * are hits, the first one is selected via navigateSearchResults(1).
+     *
+     * Matches are collected locally and published only when the scan completes, and
+     * a run is discarded if a newer search started or another document was loaded
+     * meanwhile, so overlapping searches cannot mix their results.
+     *
+     * @returns {Promise<void>}
+     */
+    async function performPdfSearch() {
+        const searchText = appState.elements.pdfSearchInput.value.trim().toLowerCase();
+        const pdfDoc = appState.variables.pdfDoc;
+        // An empty query must be rejected: indexOf('') would match at every offset.
+        if (!searchText || !pdfDoc) return;
+
+        // Tag this run so a slower, older run can tell it has been superseded.
+        const runId = (appState.variables.pdfSearchRunId || 0) + 1;
+        appState.variables.pdfSearchRunId = runId;
+        const isStale = () =>
+            appState.variables.pdfSearchRunId !== runId || appState.variables.pdfDoc !== pdfDoc;
+
+        appState.variables.searchMatches = [];
+        appState.variables.currentSearchIndex = -1;
+
+        const matches = [];
+        try {
+            // Search through all pages
+            for (let pageNum = 1; pageNum <= pdfDoc.numPages; pageNum++) {
+                const page = await pdfDoc.getPage(pageNum);
+                const textContent = await page.getTextContent();
+                if (isStale()) return;
+
+                textContent.items.forEach((item, index) => {
+                    // Defensive: skip items without a text string.
+                    if (typeof item.str !== 'string') return;
+                    const text = item.str.toLowerCase();
+                    let startIndex = 0;
+                    while ((startIndex = text.indexOf(searchText, startIndex)) !== -1) {
+                        matches.push({
+                            pageNum: pageNum,
+                            itemIndex: index,
+                            startIndex: startIndex,
+                            text: item.str
+                        });
+                        // Continue after this hit: matches do not overlap.
+                        startIndex += searchText.length;
+                    }
+                });
+            }
+        } catch (error) {
+            // Called from an event handler without await, so report failures here
+            // instead of leaving an unhandled rejection.
+            console.error('❌ PDF search failed:', error);
+            if (!isStale()) showNotification(`PDF search failed: ${error.message}`, 'error');
+            return;
+        }
+
+        if (isStale()) return;
+        appState.variables.searchMatches = matches;
+        appState.variables.currentSearchIndex = -1;
+
+        // Update results display
+        appState.elements.pdfSearchResults.classList.remove('hidden');
+        if (matches.length > 0) {
+            appState.elements.pdfSearchResults.textContent = `Found ${matches.length} matches`;
+            // Index starts at -1, so stepping by 1 selects the first match.
+            navigateSearchResults(1);
+        } else {
+            appState.elements.pdfSearchResults.textContent = 'No matches found';
+        }
+    }
+
+    // Navigate between search results
+    /**
+     * Moves the current search selection by `direction` (1 = next, -1 = previous),
+     * wrapping at both ends, updates the "Match i of n" label and renders the match's
+     * page when it differs from the current page. Does nothing without matches.
+     *
+     * @param {number} direction Step applied to the current match index.
+     */
+    function navigateSearchResults(direction) {
+        if (appState.variables.searchMatches.length === 0) return;
+
+        appState.variables.currentSearchIndex += direction;
+
+        // Wrap around
+        if (appState.variables.currentSearchIndex >= appState.variables.searchMatches.length) {
+            appState.variables.currentSearchIndex = 0;
+        } else if (appState.variables.currentSearchIndex < 0) {
+            appState.variables.currentSearchIndex = appState.variables.searchMatches.length - 1;
+        }
+
+        const match = appState.variables.searchMatches[appState.variables.currentSearchIndex];
+
+        // Update results display
+        appState.elements.pdfSearchResults.textContent =
+            `Match ${appState.variables.currentSearchIndex + 1} of ${appState.variables.searchMatches.length}`;
+
+        // Navigate to the page with the match
+        if (match.pageNum !== appState.variables.currentPageNum) {
+            // Same guard the other page-jump handlers use: the infinite-scroll
+            // observers return early while this flag is set, so they do not react to
+            // the viewer being emptied and re-rendered.
+            appState.variables.isManualPageChange = true;
+            appState.elements.pdfViewer.innerHTML = '';
+            renderPage(match.pageNum)
+                .catch(error => {
+                    console.error('❌ Failed to render search result page:', error);
+                })
+                .finally(() => {
+                    // Reset on failure too, so a rejected render cannot leave the flag stuck.
+                    appState.variables.isManualPageChange = false;
+                });
+        }
+    }
+
+    // Clear search highlights
+    /**
+     * Forgets the stored search matches, resets the selection index to -1 and hides
+     * the results label.
+     */
+    function clearSearchHighlights() {
+        const { variables, elements } = appState;
+        Object.assign(variables, { searchMatches: [], currentSearchIndex: -1 });
+        elements.pdfSearchResults.classList.add('hidden');
+    }
+
appState.elements.engineSelect.addEventListener('change', (e) => { e.preventDefault(); updateVoices(appState) });
appState.elements.generateBtn.addEventListener('click', () => {
@@ -1856,6 +3485,41 @@ document.addEventListener('DOMContentLoaded', () => {
appState.elements.zoomOutBtn?.click(); hideCommandPalette();
} else showNotification('No book is currently active.');
} },
+ { name: 'Fit PDF to Width', icon: 'fa-arrows-left-right', description: 'Fit PDF page to width', action: () => {
+ if (appState.variables.activeBook && appState.variables.pdfDoc) {
+ appState.elements.fitWidthBtn?.click(); hideCommandPalette();
+ } else showNotification('No PDF is currently active.');
+ } },
+ { name: 'Fit PDF to Page', icon: 'fa-maximize', description: 'Fit entire PDF page to screen', action: () => {
+ if (appState.variables.activeBook && appState.variables.pdfDoc) {
+ appState.elements.fitPageBtn?.click(); hideCommandPalette();
+ } else showNotification('No PDF is currently active.');
+ } },
+ { name: 'Rotate PDF Left', icon: 'fa-rotate-left', description: 'Rotate PDF counterclockwise', action: () => {
+ if (appState.variables.activeBook && appState.variables.pdfDoc) {
+ appState.elements.rotateLeftBtn?.click(); hideCommandPalette();
+ } else showNotification('No PDF is currently active.');
+ } },
+ { name: 'Rotate PDF Right', icon: 'fa-rotate-right', description: 'Rotate PDF clockwise', action: () => {
+ if (appState.variables.activeBook && appState.variables.pdfDoc) {
+ appState.elements.rotateRightBtn?.click(); hideCommandPalette();
+ } else showNotification('No PDF is currently active.');
+ } },
+ { name: 'Search in PDF', icon: 'fa-search', description: 'Search for text in PDF', action: () => {
+ if (appState.variables.activeBook && appState.variables.pdfDoc) {
+ appState.elements.pdfSearchBtn?.click(); hideCommandPalette();
+ } else showNotification('No PDF is currently active.');
+ } },
+ { name: 'First Page', icon: 'fa-angles-left', description: 'Go to first page', action: () => {
+ if (appState.variables.activeBook) {
+ appState.elements.firstPageBtn?.click(); hideCommandPalette();
+ } else showNotification('No book is currently active.');
+ } },
+ { name: 'Last Page', icon: 'fa-angles-right', description: 'Go to last page', action: () => {
+ if (appState.variables.activeBook) {
+ appState.elements.lastPageBtn?.click(); hideCommandPalette();
+ } else showNotification('No book is currently active.');
+ } },
];
let filteredCommands = [];
@@ -2464,6 +4128,43 @@ document.addEventListener('DOMContentLoaded', () => {
if (bgNoiseAudio) bgNoiseAudio.volume = e.target.value;
});
+    // Initialize highlighting settings
+    const wordHighlightToggle = document.getElementById('word-highlight-toggle');
+    const chunkHighlightColorSelect = document.getElementById('chunk-highlight-color');
+    const wordHighlightColorSelect = document.getElementById('word-highlight-color');
+
+    // Set default values if not already set
+    if (appState.variables.localPrefs.enableWordHighlight === undefined) {
+        appState.variables.localPrefs.enableWordHighlight = true; // Default to enabled
+    }
+    if (!appState.variables.localPrefs.highlightColor) {
+        appState.variables.localPrefs.highlightColor = 'yellow'; // Default chunk color
+    }
+    if (!appState.variables.localPrefs.wordHighlightColor) {
+        appState.variables.localPrefs.wordHighlightColor = appState.variables.localPrefs.highlightColor; // Default to chunk color
+    }
+
+    // Each control is loaded and wired only if it exists in the page. A missing
+    // element would otherwise throw here and abort the rest of this
+    // DOMContentLoaded handler, including the infinite-scroll setup that follows.
+    if (wordHighlightToggle) {
+        // Load saved setting
+        wordHighlightToggle.checked = appState.variables.localPrefs.enableWordHighlight;
+        // Save setting when changed
+        wordHighlightToggle.addEventListener('change', () => {
+            appState.variables.localPrefs.enableWordHighlight = wordHighlightToggle.checked;
+            handlePrefs(appState.variables.localPrefs);
+        });
+    }
+
+    if (chunkHighlightColorSelect) {
+        chunkHighlightColorSelect.value = appState.variables.localPrefs.highlightColor;
+        chunkHighlightColorSelect.addEventListener('change', () => {
+            appState.variables.localPrefs.highlightColor = chunkHighlightColorSelect.value;
+            handlePrefs(appState.variables.localPrefs);
+        });
+    }
+
+    if (wordHighlightColorSelect) {
+        wordHighlightColorSelect.value = appState.variables.localPrefs.wordHighlightColor;
+        wordHighlightColorSelect.addEventListener('change', () => {
+            appState.variables.localPrefs.wordHighlightColor = wordHighlightColorSelect.value;
+            handlePrefs(appState.variables.localPrefs);
+        });
+    }
+
// Infinite Scroll handlers
const infiniteScrollPageCache = 4;
let scrollTimeout = 0;
@@ -2483,81 +4184,134 @@ document.addEventListener('DOMContentLoaded', () => {
});
+ let downwardScrollInProgress = false;
+
const downwardsScroll = new IntersectionObserver((entries) => {
if (!entries[0].isIntersecting) return;
+ if (appState.variables.isManualPageChange) return; // Skip during manual operations
+ if (downwardScrollInProgress) return; // Prevent concurrent executions
+
+ downwardScrollInProgress = true;
+
+ try {
+ let topPage = null;
+ let currentPage = appState.variables.textCurrentPage;
+ let pageLimit = appState.variables.totalTextPages;
- let topPage = null;
- let currentPage = appState.variables.textCurrentPage;
- let pageLimit = appState.variables.totalTextPages;
-
- let container = {
- type: 'text',
- element: appState.elements.textDisplay
- };
+ let container = {
+ type: 'text',
+ element: appState.elements.textDisplay
+ };
- if (appState.variables.pdfDoc) {
- container.type = 'pdf';
- container.element = appState.elements.pdfViewer;
- pageLimit = appState.variables.isTwoPageView ? appState.variables.pdfDoc.numPages -1 : appState.variables.pdfDoc.numPages;
- // Careful not to mix these. currentPage is local
- // currentPageNum is the global PDF page tracker.
- currentPage = appState.variables.currentPageNum;
- };
+ if (appState.variables.pdfDoc) {
+ container.type = 'pdf';
+ container.element = appState.elements.pdfViewer;
+ pageLimit = appState.variables.isTwoPageView ? appState.variables.pdfDoc.numPages -1 : appState.variables.pdfDoc.numPages;
+ // Careful not to mix these. currentPage is local
+ // currentPageNum is the global PDF page tracker.
+ currentPage = appState.variables.currentPageNum;
+ };
- if (currentPage >= pageLimit) return;
+ if (currentPage >= pageLimit) {
+ downwardScrollInProgress = false;
+ return;
+ }
const lastRenderedPage = container.element.children[container.element.children.length - 1];
- if (container.type == 'pdf')
- renderPage(Number.parseInt(lastRenderedPage.dataset.page) + (appState.variables.isTwoPageView ? 2 : 1));
- else renderTextPage(Number.parseInt(lastRenderedPage.dataset.page) + 1);
-
- // Do a simple DOM cleanup. We lock this during generation to avoid unloading
- // a page being read.
- if (container.element.children.length > infiniteScrollPageCache && !appState.variables.isPlaying)
- container.element.children[0].remove();
-
- // Try to force DOM update.
- void container.element.offsetHeight;
- requestAnimationFrame(() => {
- topPage = container.element.children[0];
+ // Calculate next page number and validate
+ if (container.type == 'pdf' && !appState.variables.isManualPageChange) {
+ const nextPageNum = Number.parseInt(lastRenderedPage.dataset.page) + (appState.variables.isTwoPageView ? 2 : 1);
+ if (nextPageNum <= appState.variables.pdfDoc.numPages) {
+ renderPage(nextPageNum);
+ }
+ } else if (container.type != 'pdf') {
+ const nextPageNum = Number.parseInt(lastRenderedPage.dataset.page) + 1;
+ if (nextPageNum <= pageLimit) {
+ renderTextPage(nextPageNum);
+ }
+ }
- // Update upwards scrol
- upwardsScroll.disconnect();
- upwardsScroll.observe(topPage);
- });
+ // Do a simple DOM cleanup. We lock this during generation to avoid unloading
+ // a page being read.
+ if (container.element.children.length > infiniteScrollPageCache && !appState.variables.isPlaying)
+ container.element.children[0].remove();
+
+ // Try to force DOM update.
+ void container.element.offsetHeight;
+ requestAnimationFrame(() => {
+ topPage = container.element.children[0];
+
+ // Update upwards scroll with delay
+ upwardsScroll.disconnect();
+ if (topPage) {
+ setTimeout(() => {
+ upwardsScroll.observe(topPage);
+ downwardScrollInProgress = false;
+ }, 100);
+ } else {
+ downwardScrollInProgress = false;
+ }
+ });
+ } catch (error) {
+ console.error('❌ Downward scroll error:', error);
+ downwardScrollInProgress = false;
+ }
});
// This is the more complicated of the two.
+ let upwardScrollInProgress = false;
+
const upwardsScroll = new IntersectionObserver((entries) => {
if (!entries[0].isIntersecting) return;
+ if (appState.variables.isManualPageChange) return; // Skip during manual operations
+ if (upwardScrollInProgress) return; // Prevent concurrent executions
+
+ upwardScrollInProgress = true;
+
+ try {
+ const oldTopPage = entries[0].target;
+ if (oldTopPage) upwardsScroll.unobserve(oldTopPage);
- const oldTopPage = entries[0].target;
- if (oldTopPage) upwardsScroll.unobserve(oldTopPage);
-
- let currentPage = appState.variables.textCurrentPage;
+ let currentPage = appState.variables.textCurrentPage;
- let container = {
- type: 'text',
- element: appState.elements.textDisplay
- };
+ let container = {
+ type: 'text',
+ element: appState.elements.textDisplay
+ };
- if (appState.variables.pdfDoc) {
- container.type = 'pdf';
- container.element = appState.elements.pdfViewer;
- // Careful not to mix these. currentPage is local
- // currentPageNum is the global PDF page tracker.
- currentPage = appState.variables.currentPageNum;
- };
+ if (appState.variables.pdfDoc) {
+ container.type = 'pdf';
+ container.element = appState.elements.pdfViewer;
+ // Careful not to mix these. currentPage is local
+ // currentPageNum is the global PDF page tracker.
+ currentPage = appState.variables.currentPageNum;
+ };
- if (currentPage == 1) return;
+ // Check if we're already at the first page to avoid render
+ const firstRenderedPage = container.element.children[0];
+ if (!firstRenderedPage) {
+ upwardScrollInProgress = false;
+ return;
+ }
+
+ const firstPageNum = Number.parseInt(firstRenderedPage.dataset.page);
+ if (firstPageNum <= 1) {
+ upwardScrollInProgress = false;
+ return;
+ }
- const oldScrollHeight = container.element.scrollHeight;
- const firstRenderedPage = container.element.children[0];
+ const oldScrollHeight = container.element.scrollHeight;
- if (container.type == 'pdf') renderPage(Number.parseInt(firstRenderedPage.dataset.page) - (appState.variables.isTwoPageView ? 2 : 1), false, false);
- else renderTextPage(Number.parseInt(firstRenderedPage.dataset.page) - 1, false);
+ // Calculate previous page number, ensuring it's >= 1
+ const prevPageNum = Number.parseInt(firstRenderedPage.dataset.page) - (appState.variables.isTwoPageView ? 2 : 1);
+ if (container.type == 'pdf' && !appState.variables.isManualPageChange && prevPageNum >= 1) {
+ // Use prepend mode (4th param = true) to add page at beginning without clearing
+ renderPage(prevPageNum, false, false, true);
+ } else if (container.type != 'pdf') {
+ renderTextPage(Number.parseInt(firstRenderedPage.dataset.page) - 1, false);
+ }
// Do a simple DOM cleanup. We lock this during generation to avoid unloading
// a page being read.
@@ -2574,15 +4328,30 @@ document.addEventListener('DOMContentLoaded', () => {
const newScrollHeight = container.element.scrollHeight;
const heightAdded = newScrollHeight - (oldScrollHeight * 1.5);
- // Adjust the scroll position to keep the user in the same place
- container.element.scrollTop = container.element.scrollTop + heightAdded;
+ // Adjust the scroll position to keep the user in the same place
+ container.element.scrollTop = container.element.scrollTop + heightAdded;
- // Observe the new top page
- const newTopPage = container.element.children[0];
- if (newTopPage) upwardsScroll.observe(newTopPage);
- });
+ // Observe the new top page with delay
+ const newTopPage = container.element.children[0];
+ if (newTopPage) {
+ setTimeout(() => {
+ upwardsScroll.observe(newTopPage);
+ upwardScrollInProgress = false;
+ }, 100);
+ } else {
+ upwardScrollInProgress = false;
+ }
+ });
+ } catch (error) {
+ console.error('❌ Upward scroll error:', error);
+ upwardScrollInProgress = false;
+ }
});
+ // Store observers globally so fit buttons can access them
+ window.downwardsScroll = downwardsScroll;
+ window.upwardsScroll = upwardsScroll;
+
// Smooth animations
appState.elements.settingsDropupMenu.style.transition = 'opacity 0.2s ease-in-out, transform 0.2s ease-in-out';
appState.elements.settingsDropupMenu.style.opacity = '0';
diff --git a/static/js/library.js b/static/js/library.js
index 68dff82..3ac10a7 100644
--- a/static/js/library.js
+++ b/static/js/library.js
@@ -2,9 +2,10 @@
/**
* Generates an HTML grid display for a list of files.
* @param {Array} files An array of file objects, each with at least 'name' and 'type'.
+ * @param {Function} [onFileClick] Optional callback function when a file is clicked. Receives the file object.
* @returns {HTMLElement} The container div element with the file grid.
*/
-export function createFilesGrid(files) {
+export function createFilesGrid(files, onFileClick = null) {
const container = document.createElement('div');
container.id = 'library-file-grid';
container.className = 'm-5 file-grid-container grid grid-cols-[repeat(auto-fill,minmax(180px,1fr))] gap-4 p-5 border border-gray-200 rounded-lg bg-gray-50 dark:bg-gray-900 dark:border-gray-700 shadow-md relative transition-opacity duration-300 ease-in-out';
@@ -49,8 +50,17 @@ export function createFilesGrid(files) {
if (file.uploadDate) detailsText += (detailsText ? ' | ' : '') + `Uploaded: ${file.uploadDate}`;
detailsElement.textContent = detailsText;
- fileItem.addEventListener('click', () => {
- window.open(file.url, '_blank');
+ fileItem.addEventListener('click', (e) => {
+ e.preventDefault();
+ e.stopPropagation();
+
+ if (onFileClick) {
+ // Use the callback if provided
+ onFileClick(file);
+ } else {
+ // Fallback to opening in new tab
+ window.open(file.url, '_blank');
+ }
});
fileItem.appendChild(iconElement);
@@ -66,9 +76,10 @@ export function createFilesGrid(files) {
/**
* Fetches and renders a grid display of PDFs for the current user.
* @param {string} currentUser The username of the currently logged-in user.
+ * @param {Function} [onPdfClick] Optional callback function when a PDF is clicked. Receives the PDF object.
* @returns {Promise} A promise that resolves to the container div element with the PDF grid.
*/
-export async function renderUserPdfs(currentUser) {
+export async function renderUserPdfs(currentUser, onPdfClick = null) {
try {
const response = await fetch(`/api/users/${currentUser}/pdfs`);
if (!response.ok) {
@@ -80,10 +91,11 @@ export async function renderUserPdfs(currentUser) {
name: pdf.title,
type: 'application/pdf',
id: pdf.id, // Keep the ID for potential future interactions
+ bookId: pdf.book_id, // Book ID for loading
url: pdf.content // The URL to fetch the PDF content
}));
- return createFilesGrid(pdfsForGrid);
+ return createFilesGrid(pdfsForGrid, onPdfClick);
} catch (error) {
console.error('Error rendering user PDFs:', error);
const errorContainer = document.createElement('div');
@@ -91,4 +103,4 @@ export async function renderUserPdfs(currentUser) {
errorContainer.textContent = `Failed to load PDFs: ${error.message}`;
return errorContainer;
}
-}
\ No newline at end of file
+}
diff --git a/static/js/pdfBackend.js b/static/js/pdfBackend.js
new file mode 100644
index 0000000..bf74ca8
--- /dev/null
+++ b/static/js/pdfBackend.js
@@ -0,0 +1,293 @@
+/**
+ * pdfBackend.js - Client-side handler for backend-processed PDF data
+ *
+ * This module handles:
+ * - Loading backend-processed PDF data
+ * - Rendering interactive text overlays
+ * - Word-level highlighting during reading
+ * - Clickable text elements for navigation
+ */
+
+/**
+ * Process a PDF file using the backend comprehensive processor
+ * @param {File} file - The PDF file to process
+ * @param {number} chunkSize - Words per reading chunk (default: 50); sent as form field and query parameter
+ * @returns {Promise} Complete structured PDF data
+ * @throws {Error} If the request fails or the backend responds with a non-OK status
+ */
+export async function processPDFWithBackend(file, chunkSize = 50) {
+    const formData = new FormData();
+    formData.append('file', file);
+    formData.append('chunk_size', chunkSize);
+
+    try {
+        const response = await fetch(`/api/process_pdf_interactive?chunk_size=${chunkSize}`, {
+            method: 'POST',
+            body: formData
+        });
+        if (!response.ok) {
+            // The error body may not be JSON (e.g. a proxy error page); fall back to the generic message
+            const errorData = await response.json().catch(() => ({}));
+            throw new Error(errorData.detail || 'Failed to process PDF');
+        }
+
+        const data = await response.json();
+        console.log('✅ PDF processed by backend:', {
+            pages: data.document.total_pages,
+            words: data.document.word_count,
+            chunks: data.metadata.total_chunks
+        });
+        return data;
+    } catch (error) {
+        console.error('❌ Backend PDF processing failed:', error);
+        throw error;
+    }
+}
+
+/**
+ * Build (or rebuild) the transparent, clickable text overlay for one page from backend data
+ * @param {Object} pageData - Page data from backend (reads page_num, width, height, elements)
+ * @param {HTMLElement} pageWrapper - The page wrapper element; without a canvas inside, the function warns and returns
+ * @param {number} currentScale - Current PDF zoom scale (currently unused; scaling is derived from the canvas size)
+ * @param {Function} onTextClick - Callback when text is clicked; invoked as onTextClick(element, pageNum)
+ */
+export function renderInteractiveTextLayer(pageData, pageWrapper, currentScale, onTextClick) {
+    // Remove existing text layers
+    const existingLayers = pageWrapper.querySelectorAll('.backend-text-layer, .backend-text-span');
+    existingLayers.forEach(layer => layer.remove());
+
+    // Create container for interactive text
+    const textLayerDiv = document.createElement('div');
+    textLayerDiv.className = 'backend-text-layer';
+    textLayerDiv.dataset.page = pageData.page_num;
+    textLayerDiv.style.position = 'absolute';
+    textLayerDiv.style.left = '0';
+    textLayerDiv.style.top = '0';
+    textLayerDiv.style.width = '100%';
+    textLayerDiv.style.height = '100%';
+    textLayerDiv.style.pointerEvents = 'auto';
+    textLayerDiv.style.zIndex = '10';
+
+    // Scale factors use the canvas pixel size; NOTE(review): confirm it matches the CSS size on HiDPI screens
+    const canvas = pageWrapper.querySelector('canvas');
+    if (!canvas) {
+        console.warn('⚠️ Canvas not found for page', pageData.page_num);
+        return;
+    }
+
+    const scaleX = canvas.width / pageData.width;
+    const scaleY = canvas.height / pageData.height;
+
+    // Create text spans for each element (a page without elements yields an empty layer)
+    let spansCreated = 0;
+    for (const element of pageData.elements || []) {
+        if (!element.text || element.text.trim() === '') continue;
+
+        const span = document.createElement('span');
+        span.textContent = element.text;
+        span.className = 'backend-text-span';
+        span.dataset.elementId = element.id;
+        span.dataset.wordIdx = element.word_idx;
+        span.dataset.page = pageData.page_num;
+
+        // Position and style the span
+        span.style.position = 'absolute';
+        span.style.left = (element.x * scaleX) + 'px';
+        span.style.top = (element.y * scaleY) + 'px';
+        span.style.width = (element.width * scaleX) + 'px';
+        span.style.height = (element.height * scaleY) + 'px';
+        span.style.fontSize = (element.size * scaleY) + 'px';
+        span.style.fontFamily = element.font || 'sans-serif';
+        span.style.color = 'transparent';
+        span.style.whiteSpace = 'nowrap';
+        span.style.cursor = 'pointer';
+        span.style.userSelect = 'text';
+        span.style.overflow = 'hidden';
+
+        // Hover tint; never painted over (or used to erase) a chunk or word highlight
+        const isHighlighted = () =>
+            span.classList.contains('highlighted') || span.classList.contains('current-word-highlight');
+        span.style.transition = 'background-color 0.2s';
+        span.addEventListener('mouseenter', () => {
+            if (!isHighlighted()) span.style.backgroundColor = 'rgba(66, 153, 225, 0.2)';
+        });
+        span.addEventListener('mouseleave', () => {
+            if (!isHighlighted()) span.style.backgroundColor = 'transparent';
+        });
+
+        // Handle clicks (ignored while the user has text selected)
+        span.addEventListener('click', (e) => {
+            const selection = window.getSelection();
+            if (!selection || selection.toString().length === 0) {
+                e.stopPropagation();
+                if (onTextClick) {
+                    onTextClick(element, pageData.page_num);
+                }
+            }
+        });
+
+        textLayerDiv.appendChild(span);
+        spansCreated++;
+    }
+
+    // Add to page wrapper
+    pageWrapper.appendChild(textLayerDiv);
+
+    console.log(`✅ Rendered ${spansCreated} text spans for page ${pageData.page_num}`);
+}
+
+/**
+ * Highlight every rendered text span whose word index falls inside a chunk
+ * @param {Object} chunkData - Chunk data from backend (reads id, word_start inclusive, word_end exclusive)
+ * @param {Object} pdfData - Complete PDF data from backend (reads pages[].page_num)
+ * @param {string} highlightColor - CSS class name(s) added to highlighted spans; may be empty or space-separated
+ */
+export function highlightChunk(chunkData, pdfData, highlightColor = 'bg-yellow-300') {
+    // Clear previous highlights
+    clearAllHighlights();
+
+    const wordStart = chunkData.word_start;
+    const wordEnd = chunkData.word_end;
+    // classList.add() throws on empty or whitespace-containing tokens, so split the class list first
+    const colorClasses = (highlightColor || '').split(/\s+/).filter(Boolean);
+
+    // Find and highlight all text spans within the word range (pages without a text layer are skipped)
+    for (const pageData of pdfData.pages) {
+        const pageNum = pageData.page_num;
+        const textLayer = document.querySelector(`.backend-text-layer[data-page="${pageNum}"]`);
+        if (!textLayer) continue;
+
+        const spans = textLayer.querySelectorAll('.backend-text-span');
+        spans.forEach(span => {
+            const wordIdx = parseInt(span.dataset.wordIdx, 10);
+            if (wordIdx >= wordStart && wordIdx < wordEnd) {
+                span.classList.add('highlighted', ...colorClasses);
+                span.style.backgroundColor = 'rgba(254, 240, 138, 0.5)'; // yellow-200 with opacity
+            }
+        });
+    }
+    console.log(`🎯 Highlighted chunk ${chunkData.id}: words ${wordStart}-${wordEnd}`);
+}
+
+/**
+ * Highlight a single word (for more granular highlighting during playback)
+ * @param {number} wordIndex - Global word index to highlight; nothing changes when no span carries it
+ * @param {string} highlightColor - Background color for highlight
+ */
+export function highlightWord(wordIndex, highlightColor = 'rgba(96, 165, 250, 0.6)') {
+    // Find the span for this word
+    const span = document.querySelector(`.backend-text-span[data-word-idx="${wordIndex}"]`);
+    if (span) {
+        // Remove previous word highlight: back to the chunk color only if it is still inside a highlighted chunk
+        const prevHighlighted = document.querySelector('.current-word-highlight');
+        if (prevHighlighted) {
+            prevHighlighted.classList.remove('current-word-highlight');
+            prevHighlighted.style.backgroundColor = prevHighlighted.classList.contains('highlighted')
+                ? 'rgba(254, 240, 138, 0.5)'
+                : 'transparent';
+        }
+
+        // Highlight current word
+        span.classList.add('current-word-highlight');
+        span.style.backgroundColor = highlightColor;
+        // Scroll word into view if needed
+        span.scrollIntoView({ behavior: 'smooth', block: 'center' });
+    }
+}
+
+/**
+ * Clear all text highlights (chunk highlights and the current-word highlight, even outside a chunk)
+ */
+export function clearAllHighlights() {
+    const highlightedSpans = document.querySelectorAll('.backend-text-span.highlighted, .backend-text-span.current-word-highlight');
+    highlightedSpans.forEach(span => {
+        span.classList.remove('highlighted', 'current-word-highlight');
+        span.style.backgroundColor = 'transparent';
+    });
+}
+
+/**
+ * Build the reading text that begins at a clicked element and runs to the end of the document
+ * @param {Object} element - The clicked text element (reads word_idx and text)
+ * @param {Object} pdfData - Complete PDF data from backend (reads document.full_text)
+ * @returns {Object} { text: string, startWordIndex: number, clickedText: string }
+ */
+export function getTextFromElement(element, pdfData) {
+    const { word_idx: startWordIndex, text: clickedText } = element;
+
+    // Tokenise on whitespace; match() yields null when the text contains no words
+    const tokens = pdfData.document.full_text.match(/\S+/g);
+    const remaining = tokens === null ? [] : tokens.slice(startWordIndex);
+
+    // Words are re-joined with single spaces, so original line breaks are not preserved
+    return {
+        text: remaining.join(' '),
+        startWordIndex,
+        clickedText
+    };
+}
+
+/**
+ * Look up the page for a word index in pdfData.document.word_to_page
+ * @param {number} wordIndex - Global word index
+ * @param {Object} pdfData - Complete PDF data from backend
+ * @returns {number} Page number (1-indexed); 1 when the map or the entry is missing or falsy
+ */
+export function getPageForWord(wordIndex, pdfData) {
+ return pdfData.document.word_to_page?.[wordIndex] || 1;
+}
+
+/**
+ * Get all chunks for the PDF
+ * @param {Object} pdfData - Complete PDF data from backend
+ * @returns {Array} pdfData.chunks, or a new empty array when that property is missing or falsy
+ */
+export function getChunks(pdfData) {
+ return pdfData.chunks || [];
+}
+
+/**
+ * Copy the processed PDF payload, its chunks and its full text onto appState.variables
+ * @param {Object} appState - Application state object
+ * @param {Object} pdfData - Processed PDF data from backend
+ */
+export function storePDFData(appState, pdfData) {
+    const vars = appState.variables;
+    vars.backendPdfData = pdfData;
+    vars.pdfChunks = pdfData.chunks;
+    const doc = pdfData.document;
+    vars.fullBookText = doc.full_text;
+
+    // Short summary for the console only; nothing below changes state
+    const summary = { pages: doc.total_pages, chunks: pdfData.chunks.length, textLength: doc.full_text.length };
+    console.log('✅ Stored PDF data in appState', summary);
+}
+
+/**
+ * Schedule interactive text layers for every page found in the stored backend data
+ * @param {Object} appState - Application state object
+ * @param {Function} onTextClick - Callback when text is clicked
+ */
+export function renderAllPagesWithText(appState, onTextClick) {
+    const { backendPdfData } = appState.variables;
+    if (!backendPdfData) {
+        console.warn('⚠️ No backend PDF data available');
+        return;
+    }
+
+    // The overlays are added after a fixed 500 ms delay so the page wrappers
+    // have a chance to be rendered first; the function itself returns at once.
+    setTimeout(() => {
+        backendPdfData.pages.forEach((pageData) => {
+            const selector = `.pdf-page-wrapper[data-page="${pageData.page_num}"]`;
+            const wrapper = document.querySelector(selector);
+            if (!wrapper) {
+                return;
+            }
+
+            // Scale is read when the timer fires and defaults to 1.0 when unset
+            const scale = appState.variables.currentScale || 1.0;
+            renderInteractiveTextLayer(pageData, wrapper, scale, onTextClick);
+        });
+    }, 500);
+}
diff --git a/static/js/pdfBackendIntegrationExample.js b/static/js/pdfBackendIntegrationExample.js
new file mode 100644
index 0000000..7d70b60
--- /dev/null
+++ b/static/js/pdfBackendIntegrationExample.js
@@ -0,0 +1,331 @@
+/**
+ * PDF Backend Integration Example
+ *
+ * This file demonstrates how to integrate the backend PDF processing
+ * into the existing index.js workflow.
+ *
+ * Copy and adapt these code snippets into your index.js file.
+ */
+
+// ==========================================
+// STEP 1: Import the PDF backend module
+// ==========================================
+// Add this import at the top of index.js with other imports
+import {
+ processPDFWithBackend,
+ storePDFData,
+ renderInteractiveTextLayer,
+ renderAllPagesWithText,
+ highlightChunk,
+ highlightWord,
+ clearAllHighlights,
+ getTextFromElement,
+ getPageForWord,
+ getChunks
+} from './pdfBackend.js';
+
+
+// ==========================================
+// STEP 2: Modify PDF File Upload Handler
+// ==========================================
+// Find the existing PDF file input handler and replace it with:
+
+appState.elements.pdfFileInput.addEventListener('change', async (e) => {
+    const file = e.target.files[0];
+    if (!file || !file.name.toLowerCase().endsWith('.pdf')) { // extension check is case-insensitive (accepts .PDF)
+        showNotification('Please select a valid PDF file', 'warning');
+        return;
+    }
+
+    try {
+        // Show loading state
+        showNotification('🔄 Processing PDF with backend...', 'info');
+        appState.elements.pdfFileInput.disabled = true;
+
+        // BACKEND PROCESSING - This does all the heavy lifting
+        const pdfData = await processPDFWithBackend(file, 50); // 50 words per chunk
+
+        // Store processed data in appState
+        storePDFData(appState, pdfData);
+
+        // Update UI with document info
+        appState.elements.textDisplay.textContent = pdfData.document.full_text;
+        // (appState.variables.fullBookText was already set by storePDFData above)
+
+        // Continue with PDF.js for canvas rendering (visual display)
+        const arrayBuffer = await file.arrayBuffer();
+        const pdfDoc = await pdfjsLib.getDocument({ data: arrayBuffer }).promise;
+        appState.variables.pdfDoc = pdfDoc;
+
+        // Render first page (or first two pages if two-page view)
+        await renderPage(1, false, false, false);
+
+        // Add interactive text layers using backend data
+        renderAllPagesWithText(appState, handleBackendTextClick);
+
+        showNotification(`✅ PDF ready! ${pdfData.document.total_pages} pages, ${pdfData.document.word_count} words`, 'success');
+
+        // Show save button if logged in
+        if (appState.variables.currentUser) {
+            appState.elements.saveBookBtn.classList.remove('hidden');
+        }
+
+    } catch (error) {
+        console.error('❌ PDF processing failed:', error);
+        showNotification(`Failed to process PDF: ${error.message}`, 'error');
+    } finally {
+        appState.elements.pdfFileInput.disabled = false;
+    }
+});
+
+
+// ==========================================
+// STEP 3: Handle Text Clicks
+// ==========================================
+// Add this function to handle clicks on text elements
+
+/**
+ * Click handler for backend text spans: starts reading at the clicked element
+ * @param {Object} element - Backend text element that was clicked
+ * @param {number} pageNum - Page number the element belongs to
+ */
+function handleBackendTextClick(element, pageNum) {
+    // A non-empty selection means the user is selecting text, not clicking
+    const selectedText = window.getSelection();
+    if (selectedText && selectedText.toString().length > 0) return;
+
+    const backendData = appState.variables.backendPdfData;
+    if (!backendData) {
+        console.warn('No PDF data available');
+        return;
+    }
+
+    // Everything from the clicked word to the end of the document
+    const clicked = getTextFromElement(element, backendData);
+    console.log(`📍 Clicked on word ${clicked.startWordIndex}: "${clicked.clickedText}"`);
+    showNotification(`🎵 Starting from: "${clicked.clickedText.substring(0, 30)}..."`, 'info');
+
+    // Record the new reading position, then start playback from it
+    appState.variables.currentPageNum = pageNum;
+    appState.variables.currentWordIndex = clicked.startWordIndex;
+    processTextAndPlayFromWord(clicked.text, pageNum, clicked.startWordIndex);
+}
+
+
+// ==========================================
+// STEP 4: Update Page Rendering
+// ==========================================
+// Modify the existing renderPage function to add text layers
+// Add this at the end of the renderPage function:
+
+async function renderPage(num, skipTextExtraction = false, append = true, prepend = false) {
+ // ... existing render code ... (placeholder: the real canvas rendering is not part of this example)
+
+ // After canvas rendering completes, add interactive text layer (skipped when skipTextExtraction is true)
+ if (appState.variables.backendPdfData && !skipTextExtraction) {
+ // Wait a bit for canvas to be ready. NOTE(review): backendPdfData is re-read when the timer fires and not re-checked for null
+ setTimeout(() => {
+ const pdfData = appState.variables.backendPdfData;
+ const pageData = pdfData.pages.find(p => p.page_num === num);
+ const pageWrapper = appState.elements.pdfViewer.querySelector(
+ `.pdf-page-wrapper[data-page="${num}"]`
+ );
+
+ if (pageData && pageWrapper) {
+ renderInteractiveTextLayer(
+ pageData,
+ pageWrapper,
+ appState.variables.currentScale,
+ handleBackendTextClick
+ );
+ }
+
+ // If two-page view, render text layer for second page too (only when page num + 1 exists)
+ if (appState.variables.isTwoPageView && num + 1 <= pdfData.document.total_pages) {
+ const pageData2 = pdfData.pages.find(p => p.page_num === num + 1);
+ const pageWrapper2 = appState.elements.pdfViewer.querySelector(
+ `.pdf-page-wrapper[data-page="${num + 1}"]`
+ );
+
+ if (pageData2 && pageWrapper2) {
+ renderInteractiveTextLayer(
+ pageData2,
+ pageWrapper2,
+ appState.variables.currentScale,
+ handleBackendTextClick
+ );
+ }
+ }
+ }, 300); // Fixed 300 ms delay to ensure canvas is ready
+ }
+
+ // ... rest of existing code ... (placeholder)
+}
+
+
+// ==========================================
+// STEP 5: Integrate with Audio Playback
+// ==========================================
+// Modify the audio playback to highlight chunks/words
+
+/**
+ * Play the audio of one backend chunk, highlight it, and track the spoken word while it plays
+ * @param {number} chunkIndex - Index into appState.variables.backendPdfData.chunks
+ */
+function playAudioChunkWithHighlight(chunkIndex) {
+    const pdfData = appState.variables.backendPdfData;
+    if (!pdfData || !pdfData.chunks[chunkIndex]) {
+        console.warn('No chunk data for highlighting');
+        return;
+    }
+    const chunk = pdfData.chunks[chunkIndex];
+    // Stop the audio that is still playing (e.g. after a click-to-jump) so two playback chains never overlap
+    const previousAudio = appState.variables.currentAudio;
+    if (previousAudio) {
+        previousAudio.onended = null;
+        previousAudio.ontimeupdate = null;
+        previousAudio.pause();
+    }
+
+    // Highlight the entire chunk (highlightChunk clears earlier highlights itself)
+    highlightChunk(chunk, pdfData, 'bg-yellow-200');
+    // Ensure the page containing this chunk is visible
+    const chunkPage = chunk.page_start;
+    if (appState.variables.currentPageNum !== chunkPage) {
+        renderPage(chunkPage);
+    }
+    // Play audio for this chunk
+    const audioUrl = `/audio_cache/${appState.variables.currentUser}/${chunk.id}.mp3`;
+    const audio = new Audio(audioUrl);
+    // Word-by-word highlighting during playback (position estimated linearly from playback progress)
+    let currentWord = chunk.word_start;
+    const wordsInChunk = chunk.word_end - chunk.word_start;
+    audio.ontimeupdate = () => {
+        const progress = audio.currentTime / audio.duration;
+        const wordOffset = Math.floor(progress * wordsInChunk);
+        const wordToHighlight = chunk.word_start + wordOffset;
+        if (wordToHighlight !== currentWord && wordToHighlight < chunk.word_end) {
+            highlightWord(wordToHighlight, 'rgba(59, 130, 246, 0.7)'); // blue highlight
+            currentWord = wordToHighlight;
+        }
+    };
+    audio.onended = () => {
+        // Play next chunk or clear highlights
+        if (chunkIndex + 1 < pdfData.chunks.length) {
+            playAudioChunkWithHighlight(chunkIndex + 1);
+        } else {
+            clearAllHighlights();
+        }
+    };
+    appState.variables.currentAudio = audio;
+    audio.play().catch((err) => console.warn('Audio playback failed:', err)); // play() rejects when blocked or unloadable
+}
+
+
+// ==========================================
+// STEP 6: Enhanced Text Processing
+// ==========================================
+// New function to process text starting from a specific word
+
+/**
+ * Start chunked playback at the chunk that contains a given word
+ * @param {string} text - Text from the clicked word onward (not used here; kept for callers)
+ * @param {number} pageNum - Page of the clicked word (not used here; kept for callers)
+ * @param {number} startWordIndex - Global index of the word to start from
+ */
+async function processTextAndPlayFromWord(text, pageNum, startWordIndex) {
+    const pdfData = appState.variables.backendPdfData;
+    if (!pdfData || !Array.isArray(pdfData.chunks)) {
+        console.warn('No PDF data available');
+        return;
+    }
+    // Find which chunk this word belongs to; fall back to the first chunk when none matches
+    const matchIndex = pdfData.chunks.findIndex(c => startWordIndex >= c.word_start && startWordIndex < c.word_end);
+    const startChunkIndex = Math.max(matchIndex, 0);
+    console.log(`🎵 Starting playback from chunk ${startChunkIndex} (word ${startWordIndex})`);
+
+    // Update state
+    appState.variables.currentChunkIndex = startChunkIndex;
+    appState.variables.currentWordIndex = startWordIndex;
+    appState.variables.isPlaying = true;
+    playAudioChunkWithHighlight(startChunkIndex);
+}
+
+
+// ==========================================
+// STEP 7: Handle Zoom Changes
+// ==========================================
+// Update text layers when zoom changes
+
+appState.elements.zoomInBtn.addEventListener('click', () => {
+ appState.variables.currentScale *= 1.2;
+ appState.elements.zoomLevel.textContent = Math.round(appState.variables.currentScale * 100) + '%';
+
+ // Re-render current page with new scale (skipTextExtraction=false, append=false, prepend=false)
+ const currentPage = appState.variables.currentPageNum;
+ renderPage(currentPage, false, false, false);
+});
+
+appState.elements.zoomOutBtn.addEventListener('click', () => {
+ appState.variables.currentScale /= 1.2;
+ appState.elements.zoomLevel.textContent = Math.round(appState.variables.currentScale * 100) + '%';
+
+ // Re-render current page with new scale (skipTextExtraction=false, append=false, prepend=false)
+ const currentPage = appState.variables.currentPageNum;
+ renderPage(currentPage, false, false, false);
+});
+
+
+// ==========================================
+// STEP 8: Clean Up on Reset
+// ==========================================
+// Clear backend data when closing PDF
+
+/** Reset the PDF view through resetPdfView, then drop all highlights and backend-derived state. */
+function resetPdfViewEnhanced(appState) {
+    resetPdfView(appState);
+    clearAllHighlights();
+    // Put the backend-related variables back to their initial values
+    Object.assign(appState.variables, {
+        backendPdfData: null,
+        pdfChunks: null,
+        currentWordIndex: 0,
+        currentChunkIndex: 0,
+    });
+    console.log('✅ PDF view and backend data cleared');
+}
+
+
+// ==========================================
+// STEP 9: Add to appState variables
+// ==========================================
+// Add these variables to your appState.variables object initialization
+
+appState.variables = {
+ // ... existing variables ... (placeholder: merge the keys below into the existing object, do not replace it)
+
+ // Backend PDF data (resetPdfViewEnhanced restores these same initial values)
+ backendPdfData: null, // Complete processed PDF data from backend
+ pdfChunks: null, // Array of reading chunks
+ currentWordIndex: 0, // Current word being read
+ currentChunkIndex: 0, // Current chunk being played
+
+ // ... rest of existing variables ... (placeholder)
+};
+
+
+// ==========================================
+// USAGE EXAMPLE
+// ==========================================
+
+/*
+When user uploads a PDF:
+
+1. Backend processes PDF -> returns complete structured data
+2. Store data in appState.variables.backendPdfData
+3. Render PDF pages using PDF.js (visual display)
+4. Overlay interactive text using backend data
+5. User clicks on any word -> starts reading from that word
+6. During playback, highlight words in real-time
+7. User can click any other word to jump to that position
+*/
diff --git a/static/js/speechGen.js b/static/js/speechGen.js
index 6a9e129..6979039 100644
--- a/static/js/speechGen.js
+++ b/static/js/speechGen.js
@@ -66,4 +66,54 @@ export async function generateSpeech(textChunk, lang='en', engine, voice) {
console.error('Error generating speech:', error);
return false;
}
+}
+
+/**
+ * Generate speech with word-level timing information for precise highlighting.
+ * This is used for PDF backend rendering where we need to highlight specific text elements.
+ * @param {string} textChunk Text to generate; a falsy value makes the function return null.
+ * @param {string} [lang] ISO language code (currently not included in the request body).
+ * @param {string} engine Available engines: piper, kokoro, coqui. Sent as the request's `voice` field ('piper' when falsy) - NOTE(review): confirm the backend expects the engine there.
+ * @param {string} voice Voice to use; must be truthy or null is returned. NOTE(review): the value itself is not sent - confirm this is intended.
+ * @param {number} [chunkSize] Number of words per chunk.
+ * @param {number} [speed] Playback speed multiplier.
+ * @returns {Promise} Parsed JSON response, or null on missing input, a non-OK response, or any thrown error
+ */
+export async function generateSpeechWithTiming(textChunk, lang='en', engine, voice, chunkSize=50, speed=1.0) {
+ if (!textChunk) return null;
+ if (!voice) return null;
+
+ try {
+ const requestBody = {
+ text: textChunk,
+ voice: engine || 'piper',
+ speed: speed,
+ chunkSize: chunkSize
+ };
+
+ console.debug('🎵 Generating speech with timing:', requestBody);
+
+ const response = await fetch('/api/generate_speech_with_timing', {
+ method: 'POST',
+ headers: {
+ 'Content-Type': 'application/json',
+ },
+ body: JSON.stringify(requestBody),
+ });
+
+ if (!response.ok) {
+ const errorData = await response.json();
+ console.error('❌ Failed to generate speech with timing:', errorData);
+ return null;
+ }
+
+ const data = await response.json();
+ console.debug('✅ Received timing data:', data);
+
+ return data; // Returns { chunks: [...], originalText, normalizedText }
+
+ } catch (error) {
+ console.error('❌ Error generating speech with timing:', error);
+ return null;
+ }
}
\ No newline at end of file
diff --git a/static/test-pdf-click.html b/static/test-pdf-click.html
new file mode 100644
index 0000000..5b8689d
--- /dev/null
+++ b/static/test-pdf-click.html
@@ -0,0 +1,160 @@
+
+
+
+ PDF Click Test
+
+
+
+
+
PDF Click-to-Read Test
+
+
+
📋 Test Instructions:
+
+ Hover over the colored boxes below
+ You should see a blue highlight when hovering
+ Click on any box to test the click functionality
+ The canvas (checkered pattern) should remain visible under the boxes
+
+
+
+
Layering Test
+
This simulates the PDF viewer structure with canvas and text layer.
+
+
+
+
+
Canvas Layer (PDF)
+
This represents the PDF canvas. You should be able to see this clearly.
+
+
+
+
+
+
+
+
Test Results:
+
+ ✓ You should see the checkered pattern (canvas layer)
+ ✓ You should see "Chunk 1", "Chunk 2", "Chunk 3" labels
+ ✓ Hovering over chunks shows blue highlight
+ ✓ Clicking chunks triggers the event below
+
+
+
+
+
+
diff --git a/templates/config.html b/templates/config.html
index 56b6367..86b8ab1 100644
--- a/templates/config.html
+++ b/templates/config.html
@@ -37,33 +37,76 @@ Configuration
Personalization
-
Highlighting
-
Customize highlights color.
+
Chunk Highlight Color
+
Customize the sentence/chunk background highlight color.
-
-
-
-
The quick brown fox jumps over the lazy dog
+
+
+
+ Quick fox
-
- Classic
+ Yellow
+
+
-
-
-
The quick brown fox jumps over the lazy dog
+
+
+ Quick fox
-
- Greenish
+ Blue
+
+
-
-
-
The quick brown fox jumps over the lazy dog
+
+
+ Quick fox
-
- Sky
+ Orange
+
+
+
+
+
Word Highlight Color
+
Customize the individual word highlight color (appears while reading).
+
+
@@ -260,6 +303,22 @@
Chunk Size (in charac
+
+
+
Semantic Text Processing
+
Use AI (Qwen2.5) to split text into semantically meaningful sentences for more natural reading. Requires local Ollama with qwen2.5 model.
+
+
Enable Semantic Splitting
+
+
+
+
+
+
+
Setup: Install Ollama from
ollama.com , then run:
ollama pull qwen2.5
+
+
+
Cache Management
diff --git a/templates/index.html b/templates/index.html
index d45d9b4..64f41db 100644
--- a/templates/index.html
+++ b/templates/index.html
@@ -7,7 +7,6 @@
-
@@ -53,6 +52,10 @@
Commands
+
+
+ Shortcuts
+
@@ -236,9 +239,9 @@
URL
TTS Engine
+ Kokoro
Piper
Chatterbox
- Kokoro
Kitten
Coqui
Gemini Voice
@@ -299,6 +302,37 @@ URL
+
+
+
+
Highlighting
+
+
+ Word-by-word highlighting
+
+
+
+ Chunk Color
+
+ Yellow
+ Green
+ Blue
+ Pink
+ Orange
+
+
+
+ Word Color
+
+ Yellow
+ Green
+ Blue
+ Pink
+ Orange
+
+
+
+
@@ -321,21 +355,87 @@
URL