diff --git a/.runpod.env.template b/.runpod.env.template
index 8137684..6ad5a55 100644
--- a/.runpod.env.template
+++ b/.runpod.env.template
@@ -14,6 +14,9 @@ RUNPOD_SSH_USER=root
 # Remote paths
 RUNPOD_REMOTE_DIR=/workspace/cuvarbase
 
-# RunPod API Key (optional, for advanced automation)
+# RunPod API Key (required for scripts/runpod-create.sh and scripts/gpu-test.sh)
 # Get from https://www.runpod.io/console/user/settings
-# RUNPOD_API_KEY=your-api-key-here
+RUNPOD_API_KEY=
+
+# Pod ID (auto-populated by runpod-create.sh)
+# RUNPOD_POD_ID=
diff --git a/README.md b/README.md
index bab019c..89b0c8b 100644
--- a/README.md
+++ b/README.md
@@ -130,6 +130,12 @@ Currently includes implementations of:
   - Sparse BLS ([Panahi & Zucker 2021](https://arxiv.org/abs/2103.06193)) for small datasets (< 500 observations)
     - GPU implementation: `sparse_bls_gpu()` (default)
     - CPU implementation: `sparse_bls_cpu()` (fallback)
+- **Transit Least Squares ([TLS](https://ui.adsabs.harvard.edu/abs/2019A%26A...623A..39H/abstract))** - GPU-accelerated transit detection with optimal depth fitting
+  - **35-202× faster** than CPU TLS (transitleastsquares package)
+  - Keplerian-aware duration constraints (`tls_transit()`) - searches physically plausible transit durations
+  - Standard mode (`tls_search_gpu()`) for custom period/duration grids
+  - Optimal period grid sampling (Ofir 2014)
+  - Supports datasets up to ~100,000 observations (optimal: 500-20,000)
 - **Non-equispaced fast Fourier transform (NFFT)** - Adjoint operation ([paper](http://epubs.siam.org/doi/abs/10.1137/0914081))
 - **NUFFT-based Likelihood Ratio Test (LRT)** - Transit detection with correlated noise (contributed by Jamila Taaki)
   - Matched filter in frequency domain with adaptive noise estimation
@@ -196,6 +202,8 @@ Full documentation is available at: https://johnh2o2.github.io/cuvarbase/
 
 ## Quick Start
 
+### Box Least Squares (BLS) - Transit Detection
+
 ```python
 import numpy as np
 from cuvarbase import bls
@@ -205,7 +213,6 @@ t = np.sort(np.random.uniform(0, 10, 1000)).astype(np.float32)
 y = np.sin(2 * np.pi * t / 2.5) + np.random.normal(0, 0.1, len(t))
 dy = np.ones_like(y) * 0.1  # uncertainties
 
-# Box Least Squares (BLS) - Transit detection
 # Define frequency grid
 freqs = np.linspace(0.1, 2.0, 5000).astype(np.float32)
 
@@ -218,6 +225,36 @@ print(f"Best period: {1/best_freq:.2f} (expected: 2.5)")
 power_adaptive = bls.eebls_gpu_fast_adaptive(t, y, dy, freqs)
 ```
 
+### Transit Least Squares (TLS) - Advanced Transit Detection
+
+```python
+from cuvarbase import tls
+
+# Generate transit data
+t = np.sort(np.random.uniform(0, 50, 500)).astype(np.float32)
+y = np.ones(len(t), dtype=np.float32)
+dy = np.ones(len(t), dtype=np.float32) * 0.001
+
+# Add 1% transit at 10-day period
+phase = (t % 10.0) / 10.0
+in_transit = (phase < 0.01) | (phase > 0.99)
+y[in_transit] -= 0.01
+y += np.random.normal(0, 0.001, len(t)).astype(np.float32)
+
+# TLS with Keplerian duration constraints (35-202x faster than CPU TLS!)
+results = tls.tls_transit(
+    t, y, dy,
+    R_star=1.0,      # Solar radii
+    M_star=1.0,      # Solar masses
+    period_min=5.0,
+    period_max=20.0
+)
+
+print(f"Best period: {results['period']:.2f} days")
+print(f"Transit depth: {results['depth']:.4f}")
+print(f"SDE: {results['SDE']:.1f}")
+```
+
 For more advanced usage including Lomb-Scargle and Conditional Entropy, see the [full documentation](https://johnh2o2.github.io/cuvarbase/) and [examples/](examples/).
 
 ## Using Multiple GPUs
diff --git a/compare_gpu_cpu_depth.py b/compare_gpu_cpu_depth.py
new file mode 100644
index 0000000..f0ffc38
--- /dev/null
+++ b/compare_gpu_cpu_depth.py
@@ -0,0 +1,69 @@
+#!/usr/bin/env python3
+"""Compare GPU and CPU TLS depth calculations"""
+import numpy as np
+from cuvarbase import tls as gpu_tls
+from transitleastsquares import transitleastsquares as cpu_tls
+
+# Generate test data
+np.random.seed(42)
+ndata = 500
+t = np.sort(np.random.uniform(0, 50, ndata))
+y = np.ones(ndata, dtype=np.float32)
+
+# Add transit
+period_true = 10.0
+depth_true = 0.01  # Fractional dip
+phase = (t % period_true) / period_true
+in_transit = (phase < 0.01) | (phase > 0.99)
+y[in_transit] -= depth_true
+y += np.random.normal(0, 0.001, ndata).astype(np.float32)
+dy = np.ones(ndata, dtype=np.float32) * 0.001
+
+print(f"Test data:")
+print(f"  N = {ndata}")
+print(f"  Period = {period_true:.1f} days")
+print(f"  Depth (fractional dip) = {depth_true:.3f}")
+print(f"  Points in transit: {np.sum(in_transit)}")
+print(f"  Measured depth: {np.mean(y[~in_transit]) - np.mean(y[in_transit]):.6f}")
+
+# GPU TLS
+print(f"\n--- GPU TLS ---")
+gpu_result = gpu_tls.tls_search_gpu(
+    t.astype(np.float32), y, dy,
+    period_min=9.0,
+    period_max=11.0
+)
+
+print(f"Period: {gpu_result['period']:.4f} (error: {abs(gpu_result['period'] - period_true)/period_true*100:.2f}%)")
+print(f"Depth: {gpu_result['depth']:.6f}")
+print(f"Duration: {gpu_result['duration']:.4f} days")
+print(f"T0: {gpu_result['T0']:.4f}")
+
+# CPU TLS
+print(f"\n--- CPU TLS ---")
+model = cpu_tls(t, y, dy)
+cpu_result = model.power(
+    period_min=9.0,
+    period_max=11.0,
+    n_transits_min=2
+)
+
+print(f"Period: {cpu_result.period:.4f} (error: {abs(cpu_result.period - period_true)/period_true*100:.2f}%)")
+print(f"Depth (flux ratio): {cpu_result.depth:.6f}")
+print(f"Depth (fractional dip): {1 - cpu_result.depth:.6f}")
+print(f"Duration: {cpu_result.duration:.4f} days")
+print(f"T0: {cpu_result.T0:.4f}")
+
+# Compare
+print(f"\n--- Comparison ---")
+print(f"Period agreement: {abs(gpu_result['period'] - cpu_result.period):.4f} days")
+print(f"Duration agreement: {abs(gpu_result['duration'] - cpu_result.duration):.4f} days")
+
+# Check depth conventions
+gpu_depth_frac = gpu_result['depth']  # GPU reports fractional dip
+cpu_depth_frac = 1 - cpu_result.depth  # CPU reports flux ratio
+
+print(f"\nDepth (fractional dip convention):")
+print(f"  True: {depth_true:.6f}")
+print(f"  GPU:  {gpu_depth_frac:.6f} (error: {abs(gpu_depth_frac - depth_true)/depth_true*100:.1f}%)")
+print(f"  CPU:  {cpu_depth_frac:.6f} (error: {abs(cpu_depth_frac - depth_true)/depth_true*100:.1f}%)")
diff --git a/cuvarbase/kernels/tls.cu b/cuvarbase/kernels/tls.cu
new file mode 100644
index 0000000..c2183b7
--- /dev/null
+++ b/cuvarbase/kernels/tls.cu
@@ -0,0 +1,510 @@
+/*
+ * Transit Least Squares (TLS) GPU kernel
+ *
+ * Optimized kernel using bitonic sort for phase sorting and a
+ * limb-darkened transit template for physically realistic fitting.
+ *
+ * The transit template is a 1D array mapping transit_coord in [-1, 1]
+ * to normalized depth in [0, 1], precomputed on the CPU using batman
+ * (or a trapezoidal fallback) and loaded into shared memory.
+ *
+ * References:
+ * [1] Hippke & Heller (2019), A&A 623, A39
+ * [2] Kovacs et al. (2002), A&A 391, 369
+ */
+
+#include <stdio.h>
+
+//{CPP_DEFS}
+
+#ifndef BLOCK_SIZE
+#define BLOCK_SIZE 128
+#endif
+
+#define MAX_NDATA 100000
+#define PI 3.141592653589793f
+#define WARP_SIZE 32
+
+// Device utility functions
+__device__ inline float mod1(float x) {
+    return x - floorf(x);
+}
+
+/**
+ * Bitonic sort for phase-folded data
+ * O(N log^2 N) parallel sort, requires padding to next power of 2
+ */
+__device__ void bitonic_sort_phases(
+    float* phases,
+    float* y_sorted,
+    float* dy_sorted,
+    int ndata)
+{
+    int tid = threadIdx.x;
+    int stride = blockDim.x;
+
+    // Compute next power of 2 >= ndata
+    int n_pow2 = 1;
+    while (n_pow2 < ndata) n_pow2 <<= 1;
+
+    // Bitonic sort: outer loop over power-of-2 sizes
+    for (int k = 2; k <= n_pow2; k *= 2) {
+        for (int j = k / 2; j > 0; j /= 2) {
+            for (int i = tid; i < n_pow2; i += stride) {
+                int ixj = i ^ j;
+                if (ixj > i && ixj < ndata && i < ndata) {
+                    if ((i & k) == 0) {
+                        // Ascending
+                        if (phases[i] > phases[ixj]) {
+                            float temp = phases[i];
+                            phases[i] = phases[ixj];
+                            phases[ixj] = temp;
+                            temp = y_sorted[i];
+                            y_sorted[i] = y_sorted[ixj];
+                            y_sorted[ixj] = temp;
+                            temp = dy_sorted[i];
+                            dy_sorted[i] = dy_sorted[ixj];
+                            dy_sorted[ixj] = temp;
+                        }
+                    } else {
+                        // Descending
+                        if (phases[i] < phases[ixj]) {
+                            float temp = phases[i];
+                            phases[i] = phases[ixj];
+                            phases[ixj] = temp;
+                            temp = y_sorted[i];
+                            y_sorted[i] = y_sorted[ixj];
+                            y_sorted[ixj] = temp;
+                            temp = dy_sorted[i];
+                            dy_sorted[i] = dy_sorted[ixj];
+                            dy_sorted[ixj] = temp;
+                        }
+                    }
+                }
+            }
+            __syncthreads();
+        }
+    }
+}
+
+/**
+ * Look up transit template value with linear interpolation.
+ *
+ * Maps transit_coord in [-1, 1] to template index, does linear
+ * interpolation between adjacent samples. Returns 0 outside [-1, 1].
+ *
+ * s_template: shared memory pointer to template array
+ * n_template: number of template samples
+ * transit_coord: position within transit, [-1, 1]
+ */
+__device__ float lookup_template(const float* s_template, int n_template,
+                                  float transit_coord)
+{
+    if (transit_coord < -1.0f || transit_coord > 1.0f)
+        return 0.0f;
+
+    // Map [-1, 1] to [0, n_template - 1]
+    float idx_f = (transit_coord + 1.0f) * 0.5f * (float)(n_template - 1);
+
+    int idx0 = (int)floorf(idx_f);
+    int idx1 = idx0 + 1;
+
+    // Clamp
+    if (idx0 < 0) idx0 = 0;
+    if (idx1 >= n_template) idx1 = n_template - 1;
+    if (idx0 >= n_template) idx0 = n_template - 1;
+
+    float frac = idx_f - floorf(idx_f);
+
+    return s_template[idx0] * (1.0f - frac) + s_template[idx1] * frac;
+}
+
+/**
+ * Calculate optimal transit depth using weighted least squares
+ * with limb-darkened transit template.
+ */
+__device__ float calculate_optimal_depth(
+    const float* y_sorted,
+    const float* dy_sorted,
+    const float* phases_sorted,
+    const float* s_template,
+    int n_template,
+    float duration_phase,
+    float t0_phase,
+    int ndata)
+{
+    float numerator = 0.0f;
+    float denominator = 0.0f;
+
+    float half_dur = duration_phase * 0.5f;
+
+    for (int i = 0; i < ndata; i++) {
+        float phase_rel = mod1(phases_sorted[i] - t0_phase + 0.5f) - 0.5f;
+
+        if (fabsf(phase_rel) < half_dur) {
+            float transit_coord = phase_rel / half_dur;
+            float template_val = lookup_template(s_template, n_template, transit_coord);
+            float sigma2 = dy_sorted[i] * dy_sorted[i] + 1e-10f;
+            float y_residual = 1.0f - y_sorted[i];
+            numerator += y_residual * template_val / sigma2;
+            denominator += template_val * template_val / sigma2;
+        }
+    }
+
+    if (denominator < 1e-10f) return 0.0f;
+
+    float depth = numerator / denominator;
+    if (depth < 0.0f) depth = 0.0f;
+    if (depth > 1.0f) depth = 1.0f;
+
+    return depth;
+}
+
+/**
+ * Calculate chi-squared for a given transit model fit
+ * using limb-darkened transit template.
+ */
+__device__ float calculate_chi2(
+    const float* y_sorted,
+    const float* dy_sorted,
+    const float* phases_sorted,
+    const float* s_template,
+    int n_template,
+    float duration_phase,
+    float t0_phase,
+    float depth,
+    int ndata)
+{
+    float chi2 = 0.0f;
+    float half_dur = duration_phase * 0.5f;
+
+    for (int i = 0; i < ndata; i++) {
+        float phase_rel = mod1(phases_sorted[i] - t0_phase + 0.5f) - 0.5f;
+        float model_val;
+        if (fabsf(phase_rel) < half_dur) {
+            float transit_coord = phase_rel / half_dur;
+            float template_val = lookup_template(s_template, n_template, transit_coord);
+            model_val = 1.0f - depth * template_val;
+        } else {
+            model_val = 1.0f;
+        }
+        float residual = y_sorted[i] - model_val;
+        float sigma2 = dy_sorted[i] * dy_sorted[i] + 1e-10f;
+        chi2 += (residual * residual) / sigma2;
+    }
+
+    return chi2;
+}
+
+/**
+ * TLS search kernel with Keplerian duration constraints
+ * Grid: (nperiods, 1, 1), Block: (BLOCK_SIZE, 1, 1)
+ *
+ * Shared memory layout:
+ *   phases[ndata] | y_sorted[ndata] | dy_sorted[ndata] |
+ *   template[n_template] | thread_chi2[blockDim] | thread_t0[blockDim] |
+ *   thread_dur[blockDim] | thread_depth[blockDim]
+ */
+extern "C" __global__ void tls_search_kernel_keplerian(
+    const float* __restrict__ t,
+    const float* __restrict__ y,
+    const float* __restrict__ dy,
+    const float* __restrict__ periods,
+    const float* __restrict__ qmin,
+    const float* __restrict__ qmax,
+    const float* __restrict__ transit_template,
+    const int ndata,
+    const int nperiods,
+    const int n_durations,
+    const int n_template,
+    float* __restrict__ chi2_out,
+    float* __restrict__ best_t0_out,
+    float* __restrict__ best_duration_out,
+    float* __restrict__ best_depth_out)
+{
+    extern __shared__ float shared_mem[];
+    float* phases = shared_mem;
+    float* y_sorted = &shared_mem[ndata];
+    float* dy_sorted = &shared_mem[2 * ndata];
+    float* s_template = &shared_mem[3 * ndata];
+    float* thread_chi2 = &s_template[n_template];
+    float* thread_t0 = &thread_chi2[blockDim.x];
+    float* thread_duration = &thread_t0[blockDim.x];
+    float* thread_depth = &thread_duration[blockDim.x];
+
+    int period_idx = blockIdx.x;
+    if (period_idx >= nperiods) return;
+
+    // Load template from global to shared memory (once per block)
+    for (int i = threadIdx.x; i < n_template; i += blockDim.x) {
+        s_template[i] = transit_template[i];
+    }
+    __syncthreads();
+
+    float period = periods[period_idx];
+    float duration_phase_min = qmin[period_idx];
+    float duration_phase_max = qmax[period_idx];
+
+    // Phase fold
+    for (int i = threadIdx.x; i < ndata; i += blockDim.x) {
+        phases[i] = mod1(t[i] / period);
+    }
+    __syncthreads();
+
+    // Initialize y_sorted and dy_sorted arrays
+    for (int i = threadIdx.x; i < ndata; i += blockDim.x) {
+        y_sorted[i] = y[i];
+        dy_sorted[i] = dy[i];
+    }
+    __syncthreads();
+
+    // Sort by phase using bitonic sort
+    bitonic_sort_phases(phases, y_sorted, dy_sorted, ndata);
+
+    // Search over durations and T0 using Keplerian constraints
+    float thread_min_chi2 = 1e30f;
+    float thread_best_t0 = 0.0f;
+    float thread_best_duration = 0.0f;
+    float thread_best_depth = 0.0f;
+
+    for (int d_idx = 0; d_idx < n_durations; d_idx++) {
+        float log_dur_min = logf(duration_phase_min);
+        float log_dur_max = logf(duration_phase_max);
+        float log_duration = log_dur_min + (log_dur_max - log_dur_min) * d_idx / (n_durations - 1);
+        float duration_phase = expf(log_duration);
+        float duration = duration_phase * period;
+
+        int n_t0 = 30;
+        for (int t0_idx = threadIdx.x; t0_idx < n_t0; t0_idx += blockDim.x) {
+            float t0_phase = (float)t0_idx / n_t0;
+            float depth = calculate_optimal_depth(y_sorted, dy_sorted, phases,
+                                                   s_template, n_template,
+                                                   duration_phase, t0_phase, ndata);
+
+            if (depth > 0.0f && depth < 0.5f) {
+                float chi2 = calculate_chi2(y_sorted, dy_sorted, phases,
+                                             s_template, n_template,
+                                             duration_phase, t0_phase, depth, ndata);
+                if (chi2 < thread_min_chi2) {
+                    thread_min_chi2 = chi2;
+                    thread_best_t0 = t0_phase;
+                    thread_best_duration = duration;
+                    thread_best_depth = depth;
+                }
+            }
+        }
+    }
+
+    // Store per-thread results to shared memory
+    thread_chi2[threadIdx.x] = thread_min_chi2;
+    thread_t0[threadIdx.x] = thread_best_t0;
+    thread_duration[threadIdx.x] = thread_best_duration;
+    thread_depth[threadIdx.x] = thread_best_depth;
+    __syncthreads();
+
+    // Block reduction down to warp size
+    for (int stride = blockDim.x / 2; stride >= WARP_SIZE; stride /= 2) {
+        if (threadIdx.x < stride) {
+            if (thread_chi2[threadIdx.x + stride] < thread_chi2[threadIdx.x]) {
+                thread_chi2[threadIdx.x] = thread_chi2[threadIdx.x + stride];
+                thread_t0[threadIdx.x] = thread_t0[threadIdx.x + stride];
+                thread_duration[threadIdx.x] = thread_duration[threadIdx.x + stride];
+                thread_depth[threadIdx.x] = thread_depth[threadIdx.x + stride];
+            }
+        }
+        __syncthreads();
+    }
+
+    // Final warp reduction using shuffle (no sync needed)
+    if (threadIdx.x < WARP_SIZE) {
+        float val_chi2 = thread_chi2[threadIdx.x];
+        float val_t0 = thread_t0[threadIdx.x];
+        float val_dur = thread_duration[threadIdx.x];
+        float val_dep = thread_depth[threadIdx.x];
+
+        for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2) {
+            float other_chi2 = __shfl_down_sync(0xffffffff, val_chi2, offset);
+            float other_t0 = __shfl_down_sync(0xffffffff, val_t0, offset);
+            float other_dur = __shfl_down_sync(0xffffffff, val_dur, offset);
+            float other_dep = __shfl_down_sync(0xffffffff, val_dep, offset);
+
+            if (other_chi2 < val_chi2) {
+                val_chi2 = other_chi2;
+                val_t0 = other_t0;
+                val_dur = other_dur;
+                val_dep = other_dep;
+            }
+        }
+
+        if (threadIdx.x == 0) {
+            thread_chi2[0] = val_chi2;
+            thread_t0[0] = val_t0;
+            thread_duration[0] = val_dur;
+            thread_depth[0] = val_dep;
+        }
+    }
+
+    // Write final result
+    if (threadIdx.x == 0) {
+        chi2_out[period_idx] = thread_chi2[0];
+        best_t0_out[period_idx] = thread_t0[0];
+        best_duration_out[period_idx] = thread_duration[0];
+        best_depth_out[period_idx] = thread_depth[0];
+    }
+}
+
+/**
+ * TLS search kernel (standard, fixed duration range)
+ * Grid: (nperiods, 1, 1), Block: (BLOCK_SIZE, 1, 1)
+ *
+ * Shared memory layout:
+ *   phases[ndata] | y_sorted[ndata] | dy_sorted[ndata] |
+ *   template[n_template] | thread_chi2[blockDim] | thread_t0[blockDim] |
+ *   thread_dur[blockDim] | thread_depth[blockDim]
+ */
+extern "C" __global__ void tls_search_kernel(
+    const float* __restrict__ t,
+    const float* __restrict__ y,
+    const float* __restrict__ dy,
+    const float* __restrict__ periods,
+    const float* __restrict__ transit_template,
+    const int ndata,
+    const int nperiods,
+    const int n_template,
+    float* __restrict__ chi2_out,
+    float* __restrict__ best_t0_out,
+    float* __restrict__ best_duration_out,
+    float* __restrict__ best_depth_out)
+{
+    extern __shared__ float shared_mem[];
+    float* phases = shared_mem;
+    float* y_sorted = &shared_mem[ndata];
+    float* dy_sorted = &shared_mem[2 * ndata];
+    float* s_template = &shared_mem[3 * ndata];
+    float* thread_chi2 = &s_template[n_template];
+    float* thread_t0 = &thread_chi2[blockDim.x];
+    float* thread_duration = &thread_t0[blockDim.x];
+    float* thread_depth = &thread_duration[blockDim.x];
+
+    int period_idx = blockIdx.x;
+    if (period_idx >= nperiods) return;
+
+    // Load template from global to shared memory (once per block)
+    for (int i = threadIdx.x; i < n_template; i += blockDim.x) {
+        s_template[i] = transit_template[i];
+    }
+    __syncthreads();
+
+    float period = periods[period_idx];
+
+    // Phase fold
+    for (int i = threadIdx.x; i < ndata; i += blockDim.x) {
+        phases[i] = mod1(t[i] / period);
+    }
+    __syncthreads();
+
+    // Initialize y_sorted and dy_sorted arrays
+    for (int i = threadIdx.x; i < ndata; i += blockDim.x) {
+        y_sorted[i] = y[i];
+        dy_sorted[i] = dy[i];
+    }
+    __syncthreads();
+
+    // Sort by phase using bitonic sort
+    bitonic_sort_phases(phases, y_sorted, dy_sorted, ndata);
+
+    // Search over durations and T0
+    float thread_min_chi2 = 1e30f;
+    float thread_best_t0 = 0.0f;
+    float thread_best_duration = 0.0f;
+    float thread_best_depth = 0.0f;
+
+    int n_durations = 15;
+    float duration_phase_min = 0.005f;
+    float duration_phase_max = 0.15f;
+
+    for (int d_idx = 0; d_idx < n_durations; d_idx++) {
+        float log_dur_min = logf(duration_phase_min);
+        float log_dur_max = logf(duration_phase_max);
+        float log_duration = log_dur_min + (log_dur_max - log_dur_min) * d_idx / (n_durations - 1);
+        float duration_phase = expf(log_duration);
+        float duration = duration_phase * period;
+
+        int n_t0 = 30;
+        for (int t0_idx = threadIdx.x; t0_idx < n_t0; t0_idx += blockDim.x) {
+            float t0_phase = (float)t0_idx / n_t0;
+            float depth = calculate_optimal_depth(y_sorted, dy_sorted, phases,
+                                                   s_template, n_template,
+                                                   duration_phase, t0_phase, ndata);
+
+            if (depth > 0.0f && depth < 0.5f) {
+                float chi2 = calculate_chi2(y_sorted, dy_sorted, phases,
+                                             s_template, n_template,
+                                             duration_phase, t0_phase, depth, ndata);
+                if (chi2 < thread_min_chi2) {
+                    thread_min_chi2 = chi2;
+                    thread_best_t0 = t0_phase;
+                    thread_best_duration = duration;
+                    thread_best_depth = depth;
+                }
+            }
+        }
+    }
+
+    // Store per-thread results to shared memory
+    thread_chi2[threadIdx.x] = thread_min_chi2;
+    thread_t0[threadIdx.x] = thread_best_t0;
+    thread_duration[threadIdx.x] = thread_best_duration;
+    thread_depth[threadIdx.x] = thread_best_depth;
+    __syncthreads();
+
+    // Block reduction down to warp size
+    for (int stride = blockDim.x / 2; stride >= WARP_SIZE; stride /= 2) {
+        if (threadIdx.x < stride) {
+            if (thread_chi2[threadIdx.x + stride] < thread_chi2[threadIdx.x]) {
+                thread_chi2[threadIdx.x] = thread_chi2[threadIdx.x + stride];
+                thread_t0[threadIdx.x] = thread_t0[threadIdx.x + stride];
+                thread_duration[threadIdx.x] = thread_duration[threadIdx.x + stride];
+                thread_depth[threadIdx.x] = thread_depth[threadIdx.x + stride];
+            }
+        }
+        __syncthreads();
+    }
+
+    // Final warp reduction using shuffle (no sync needed)
+    if (threadIdx.x < WARP_SIZE) {
+        float val_chi2 = thread_chi2[threadIdx.x];
+        float val_t0 = thread_t0[threadIdx.x];
+        float val_dur = thread_duration[threadIdx.x];
+        float val_dep = thread_depth[threadIdx.x];
+
+        for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2) {
+            float other_chi2 = __shfl_down_sync(0xffffffff, val_chi2, offset);
+            float other_t0 = __shfl_down_sync(0xffffffff, val_t0, offset);
+            float other_dur = __shfl_down_sync(0xffffffff, val_dur, offset);
+            float other_dep = __shfl_down_sync(0xffffffff, val_dep, offset);
+
+            if (other_chi2 < val_chi2) {
+                val_chi2 = other_chi2;
+                val_t0 = other_t0;
+                val_dur = other_dur;
+                val_dep = other_dep;
+            }
+        }
+
+        if (threadIdx.x == 0) {
+            thread_chi2[0] = val_chi2;
+            thread_t0[0] = val_t0;
+            thread_duration[0] = val_dur;
+            thread_depth[0] = val_dep;
+        }
+    }
+
+    // Write final result
+    if (threadIdx.x == 0) {
+        chi2_out[period_idx] = thread_chi2[0];
+        best_t0_out[period_idx] = thread_t0[0];
+        best_duration_out[period_idx] = thread_duration[0];
+        best_depth_out[period_idx] = thread_depth[0];
+    }
+}
diff --git a/cuvarbase/tests/test_tls_basic.py b/cuvarbase/tests/test_tls_basic.py
new file mode 100644
index 0000000..984c30e
--- /dev/null
+++ b/cuvarbase/tests/test_tls_basic.py
@@ -0,0 +1,459 @@
+"""
+Basic tests for TLS GPU implementation.
+
+These tests verify the basic functionality of the TLS implementation,
+focusing on API correctness and basic execution rather than scientific
+accuracy (which will be tested in test_tls_consistency.py).
+"""
+
+import pytest
+import numpy as np
+
+try:
+    import pycuda
+    import pycuda.autoinit
+    PYCUDA_AVAILABLE = True
+except ImportError:
+    PYCUDA_AVAILABLE = False
+
+# Import modules to test
+from cuvarbase import tls_grids, tls_models, tls_stats
+
+
+class TestGridGeneration:
+    """Test period and duration grid generation."""
+
+    def test_period_grid_basic(self):
+        """Test basic period grid generation."""
+        t = np.linspace(0, 100, 1000)  # 100-day observation
+
+        periods = tls_grids.period_grid_ofir(t, R_star=1.0, M_star=1.0)
+
+        assert len(periods) > 0
+        assert np.all(periods > 0)
+        assert np.all(np.diff(periods) > 0)  # Increasing
+        assert periods[0] < periods[-1]
+
+    def test_period_grid_limits(self):
+        """Test period grid with custom limits."""
+        t = np.linspace(0, 100, 1000)
+
+        periods = tls_grids.period_grid_ofir(
+            t, period_min=5.0, period_max=20.0
+        )
+
+        assert periods[0] >= 5.0
+        assert periods[-1] <= 20.0
+
+    def test_duration_grid(self):
+        """Test duration grid generation."""
+        periods = np.array([10.0, 20.0, 30.0])
+
+        durations, counts = tls_grids.duration_grid(periods)
+
+        assert len(durations) == len(periods)
+        assert len(counts) == len(periods)
+        assert all(c > 0 for c in counts)
+
+        # Check durations are reasonable (< period)
+        for i, period in enumerate(periods):
+            assert all(d < period for d in durations[i])
+            assert all(d > 0 for d in durations[i])
+
+    def test_transit_duration_max(self):
+        """Test maximum transit duration calculation."""
+        period = 10.0  # days
+
+        duration = tls_grids.transit_duration_max(
+            period, R_star=1.0, M_star=1.0, R_planet=1.0
+        )
+
+        assert duration > 0
+        assert duration < period  # Duration must be less than period
+        assert duration < 1.0  # For Earth-Sun system, ~0.5 days
+
+    def test_t0_grid(self):
+        """Test T0 grid generation."""
+        period = 10.0
+        duration = 0.1
+
+        t0_values = tls_grids.t0_grid(period, duration, oversampling=5)
+
+        assert len(t0_values) > 0
+        assert np.all(t0_values >= 0)
+        assert np.all(t0_values <= 1)
+
+    def test_validate_stellar_parameters(self):
+        """Test stellar parameter validation."""
+        # Valid parameters
+        tls_grids.validate_stellar_parameters(R_star=1.0, M_star=1.0)
+
+        # Invalid radius
+        with pytest.raises(ValueError):
+            tls_grids.validate_stellar_parameters(R_star=10.0, M_star=1.0)
+
+        # Invalid mass
+        with pytest.raises(ValueError):
+            tls_grids.validate_stellar_parameters(R_star=1.0, M_star=5.0)
+
+
+class TestTransitTemplate:
+    """Test transit template generation for GPU kernel."""
+
+    def test_trapezoid_template_shape(self):
+        """Test trapezoidal fallback template has correct shape."""
+        template = tls_models._trapezoid_template(n_template=500)
+
+        assert template.shape == (500,)
+        assert template.dtype == np.float32
+
+    def test_trapezoid_template_normalization(self):
+        """Test trapezoidal template values are in [0, 1]."""
+        template = tls_models._trapezoid_template(n_template=1000)
+
+        assert np.all(template >= 0.0)
+        assert np.all(template <= 1.0)
+        # Center should be at max depth
+        assert template[500] == pytest.approx(1.0)
+        # Edges should be near zero
+        assert template[0] == pytest.approx(0.0, abs=0.01)
+        assert template[-1] == pytest.approx(0.0, abs=0.01)
+
+    def test_trapezoid_template_symmetric(self):
+        """Test trapezoidal template is symmetric."""
+        template = tls_models._trapezoid_template(n_template=1001)
+        np.testing.assert_allclose(template, template[::-1], atol=1e-6)
+
+    @pytest.mark.skipif(not tls_models.BATMAN_AVAILABLE,
+                       reason="batman-package not installed")
+    def test_batman_template_shape(self):
+        """Test batman template has correct shape and dtype."""
+        template = tls_models.generate_transit_template(n_template=1000)
+
+        assert template.shape == (1000,)
+        assert template.dtype == np.float32
+
+    @pytest.mark.skipif(not tls_models.BATMAN_AVAILABLE,
+                       reason="batman-package not installed")
+    def test_batman_template_normalization(self):
+        """Test batman template values are in [0, 1] with max = 1."""
+        template = tls_models.generate_transit_template(n_template=1000)
+
+        assert np.all(template >= 0.0)
+        assert np.all(template <= 1.0)
+        assert np.max(template) == pytest.approx(1.0, abs=0.01)
+        # Edges should be near zero
+        assert template[0] < 0.1
+        assert template[-1] < 0.1
+
+    @pytest.mark.skipif(not tls_models.BATMAN_AVAILABLE,
+                       reason="batman-package not installed")
+    def test_batman_template_limb_darkened(self):
+        """Test batman template shows limb darkening (not a box)."""
+        template = tls_models.generate_transit_template(n_template=1000)
+
+        # The template should NOT be a perfect box (all 0 or 1).
+        # With limb darkening, there should be intermediate values.
+        n_intermediate = np.sum((template > 0.1) & (template < 0.9))
+        assert n_intermediate > 10, "Template should have limb-darkened shape, not a box"
+
+    def test_generate_fallback_without_batman(self):
+        """Test generate_transit_template falls back to trapezoid."""
+        # Force fallback by testing _trapezoid_template directly
+        template = tls_models._trapezoid_template(n_template=500)
+
+        assert template.shape == (500,)
+        assert np.max(template) == pytest.approx(1.0)
+        assert np.min(template) == pytest.approx(0.0, abs=0.01)
+
+
+@pytest.mark.skipif(not tls_models.BATMAN_AVAILABLE,
+                   reason="batman-package not installed")
+class TestTransitModels:
+    """Test transit model generation (requires batman)."""
+
+    def test_reference_transit(self):
+        """Test reference transit model creation."""
+        phases, flux = tls_models.create_reference_transit(n_samples=100)
+
+        assert len(phases) == len(flux)
+        assert len(phases) == 100
+        assert np.all((phases >= 0) & (phases <= 1))
+        assert np.all(flux <= 1.0)  # Transit causes dimming
+        assert np.min(flux) < 1.0  # There is a transit
+
+    def test_transit_model_cache(self):
+        """Test transit model cache creation."""
+        durations = np.array([0.05, 0.1, 0.15])
+
+        models, phases = tls_models.create_transit_model_cache(
+            durations, period=10.0, n_samples=100
+        )
+
+        assert len(models) == len(durations)
+        assert len(phases) == 100
+        for model in models:
+            assert len(model) == len(phases)
+
+
+class TestSimpleTransitModels:
+    """Test simple transit models (no batman required)."""
+
+    def test_simple_trapezoid(self):
+        """Test simple trapezoidal transit."""
+        phases = np.linspace(0, 1, 1000)
+        duration_phase = 0.1
+
+        flux = tls_models.simple_trapezoid_transit(
+            phases, duration_phase, depth=0.01
+        )
+
+        assert len(flux) == len(phases)
+        assert np.all(flux <= 1.0)
+        assert np.min(flux) < 1.0  # There is a transit
+        assert np.max(flux) == 1.0  # Out of transit = 1.0
+
+    def test_interpolate_transit_model(self):
+        """Test transit model interpolation."""
+        model_phases = np.linspace(0, 1, 100)
+        model_flux = np.ones(100)
+        model_flux[40:60] = 0.99  # Simple transit
+
+        target_phases = np.linspace(0, 1, 200)
+
+        flux_interp = tls_models.interpolate_transit_model(
+            model_phases, model_flux, target_phases, target_depth=0.01
+        )
+
+        assert len(flux_interp) == len(target_phases)
+        assert np.all(flux_interp <= 1.0)
+
+    def test_default_limb_darkening(self):
+        """Test default limb darkening coefficient lookup."""
+        u_kepler = tls_models.get_default_limb_darkening('Kepler', T_eff=5500)
+        assert len(u_kepler) == 2
+        assert all(0 < coeff < 1 for coeff in u_kepler)
+
+        u_tess = tls_models.get_default_limb_darkening('TESS', T_eff=5500)
+        assert len(u_tess) == 2
+
+    def test_validate_limb_darkening(self):
+        """Test limb darkening validation."""
+        # Valid quadratic
+        tls_models.validate_limb_darkening_coeffs([0.4, 0.2], 'quadratic')
+
+        # Invalid - wrong number
+        with pytest.raises(ValueError):
+            tls_models.validate_limb_darkening_coeffs([0.4], 'quadratic')
+
+
+class TestStatistics:
+    """Test TLS statistics calculations."""
+
+    def test_signal_residue_with_signal(self):
+        """Test SR is positive for a signal."""
+        # Simulate chi2 values where one period has much lower chi2
+        chi2 = np.ones(100) * 1000.0
+        chi2[50] = 500.0  # Signal at index 50
+
+        SR = tls_stats.signal_residue(chi2)
+
+        # SR at signal should be highest
+        assert SR[50] > SR[0]
+        assert SR[50] > 0
+
+    def test_sde_positive_for_signal(self):
+        """Test SDE > 0 for an injected signal (regression test)."""
+        # Simulate chi2 values with a clear signal
+        np.random.seed(42)
+        chi2 = np.random.normal(1000, 10, size=200)
+        chi2[100] = 500.0  # Strong signal
+
+        SDE, SDE_raw, power = tls_stats.signal_detection_efficiency(
+            chi2, detrend=False
+        )
+
+        assert SDE > 0, "SDE should be > 0 for injected signal"
+        assert SDE_raw > 0
+
+    def test_snr_with_chi2(self):
+        """Test SNR estimation from chi2 values."""
+        snr = tls_stats.signal_to_noise(
+            0.01, chi2_null=1000.0, chi2_best=500.0
+        )
+        assert snr > 0
+
+    def test_snr_returns_zero_without_info(self):
+        """Test SNR returns 0 when no depth_err or chi2 provided."""
+        snr = tls_stats.signal_to_noise(0.01)
+        assert snr == 0.0
+
+
+@pytest.mark.skipif(not PYCUDA_AVAILABLE,
+                   reason="PyCUDA not available")
+class TestTLSKernel:
+    """Test TLS kernel compilation and basic execution."""
+
+    def test_kernel_compilation(self):
+        """Test that TLS kernel compiles."""
+        from cuvarbase import tls
+
+        kernel = tls.compile_tls(block_size=128)
+        assert kernel is not None
+
+    def test_kernel_caching(self):
+        """Test kernel caching mechanism."""
+        from cuvarbase import tls
+
+        # First call - compiles
+        kernel1 = tls._get_cached_kernels(128)
+        assert kernel1 is not None
+
+        # Second call - should use cache
+        kernel2 = tls._get_cached_kernels(128)
+        assert kernel2 is kernel1
+
+    def test_block_size_selection(self):
+        """Test automatic block size selection."""
+        from cuvarbase import tls
+
+        assert tls._choose_block_size(10) == 32
+        assert tls._choose_block_size(50) == 64
+        assert tls._choose_block_size(100) == 128
+
+
+@pytest.mark.skipif(not PYCUDA_AVAILABLE,
+                   reason="PyCUDA not available")
+class TestTLSMemory:
+    """Test TLS memory management."""
+
+    def test_memory_allocation(self):
+        """Test memory allocation."""
+        from cuvarbase.tls import TLSMemory
+
+        mem = TLSMemory(max_ndata=1000, max_nperiods=100)
+
+        assert mem.t is not None
+        assert len(mem.t) == 1000
+        assert len(mem.periods) == 100
+
+    def test_memory_setdata(self):
+        """Test setting data."""
+        from cuvarbase.tls import TLSMemory
+
+        t = np.linspace(0, 100, 100)
+        y = np.ones(100)
+        dy = np.ones(100) * 0.01
+        periods = np.linspace(1, 10, 50)
+
+        mem = TLSMemory(max_ndata=1000, max_nperiods=100)
+        mem.setdata(t, y, dy, periods=periods, transfer=False)
+
+        assert np.allclose(mem.t[:100], t)
+        assert np.allclose(mem.periods[:50], periods)
+
+    def test_memory_fromdata(self):
+        """Test creating memory from data."""
+        from cuvarbase.tls import TLSMemory
+
+        t = np.linspace(0, 100, 100)
+        y = np.ones(100)
+        dy = np.ones(100) * 0.01
+        periods = np.linspace(1, 10, 50)
+
+        mem = TLSMemory.fromdata(t, y, dy, periods=periods, transfer=False)
+
+        assert mem.max_ndata >= 100
+        assert mem.max_nperiods >= 50
+
+
+@pytest.mark.skipif(not PYCUDA_AVAILABLE,
+                   reason="PyCUDA not available")
+class TestTLSBasicExecution:
+    """Test basic TLS execution (not accuracy)."""
+
+    def test_tls_search_runs(self):
+        """Test that TLS search runs without errors."""
+        from cuvarbase import tls
+
+        # Create simple synthetic data
+        t = np.linspace(0, 100, 500)
+        y = np.ones(500)
+        dy = np.ones(500) * 0.001
+
+        # Use small period range for speed
+        periods = np.linspace(5, 15, 20)
+
+        # This should run without errors
+        results = tls.tls_search_gpu(
+            t, y, dy,
+            periods=periods,
+            block_size=64
+        )
+
+        assert results is not None
+        assert 'periods' in results
+        assert 'chi2' in results
+        assert len(results['periods']) == 20
+
+    def test_tls_search_with_transit(self):
+        """Test TLS with injected transit."""
+        from cuvarbase import tls
+
+        # Create data with simple transit
+        t = np.linspace(0, 100, 500)
+        y = np.ones(500)
+
+        # Inject transit at period = 10 days
+        period_true = 10.0
+        duration = 0.1
+        depth = 0.01
+
+        phases = (t % period_true) / period_true
+        in_transit = (phases < duration / period_true) | (phases > 1 - duration / period_true)
+        y[in_transit] -= depth
+
+        dy = np.ones(500) * 0.0001
+
+        # Search with periods around the true value
+        periods = np.linspace(8, 12, 30)
+
+        results = tls.tls_search_gpu(t, y, dy, periods=periods)
+
+        # Should return results
+        assert results['chi2'] is not None
+        assert len(results['chi2']) == 30
+
+        # Minimum chi2 should be near period = 10 (within a few samples)
+        min_idx = np.argmin(results['chi2'])
+        best_period = results['periods'][min_idx]
+
+        # Should be within 20% of true period (very loose for Phase 1)
+        assert 8 < best_period < 12
+
+    def test_sde_positive_with_transit(self):
+        """Test SDE > 0 when a transit is present (regression test)."""
+        from cuvarbase import tls
+
+        # Create data with obvious transit
+        t = np.linspace(0, 100, 500)
+        y = np.ones(500)
+
+        period_true = 10.0
+        depth = 0.02
+        phases = (t % period_true) / period_true
+        in_transit = phases < 0.02
+        y[in_transit] -= depth
+
+        dy = np.ones(500) * 0.0001
+
+        periods = np.linspace(8, 12, 50)
+        results = tls.tls_search_gpu(t, y, dy, periods=periods)
+
+        assert results['SDE'] > 0, (
+            "SDE should be > 0 for a clear transit signal"
+        )
+
+
+if __name__ == '__main__':
+    pytest.main([__file__, '-v'])
diff --git a/cuvarbase/tls.py b/cuvarbase/tls.py
new file mode 100644
index 0000000..53ff2cb
--- /dev/null
+++ b/cuvarbase/tls.py
@@ -0,0 +1,777 @@
+"""
+GPU-accelerated Transit Least Squares (TLS) periodogram.
+
+This module implements a fast GPU version of the Transit Least Squares
+algorithm for detecting planetary transits in photometric time series.
+
+References
+----------
+.. [1] Hippke & Heller (2019), "Transit Least Squares",  A&A 623, A39
+.. [2] Kovács et al. (2002), "Box Least Squares", A&A 391, 369
+"""
+
+import sys
+import threading
+from collections import OrderedDict
+import resource
+
+import pycuda.autoprimaryctx
+import pycuda.driver as cuda
+import pycuda.gpuarray as gpuarray
+from pycuda.compiler import SourceModule
+
+import numpy as np
+
+from .utils import find_kernel, _module_reader
+from . import tls_grids
+from . import tls_models
+from . import tls_stats
+
+_default_block_size = 128  # Smaller default than BLS (TLS has more shared memory needs)
+_KERNEL_CACHE_MAX_SIZE = 10
+_kernel_cache = OrderedDict()
+_kernel_cache_lock = threading.Lock()
+
+
+def _choose_block_size(ndata):
+    """
+    Choose optimal block size for TLS kernel based on data size.
+
+    Parameters
+    ----------
+    ndata : int
+        Number of data points
+
+    Returns
+    -------
+    block_size : int
+        Optimal CUDA block size (32, 64, or 128)
+
+    Notes
+    -----
+    TLS uses more shared memory than BLS, so we use smaller block sizes
+    to avoid shared memory limits.
+    """
+    if ndata <= 32:
+        return 32
+    elif ndata <= 64:
+        return 64
+    else:
+        return 128  # Max for TLS (vs 256 for BLS)
+
+
+def _get_cached_kernels(block_size):
+    """
+    Get compiled TLS kernel from cache.
+
+    Parameters
+    ----------
+    block_size : int
+        CUDA block size
+
+    Returns
+    -------
+    kernel : PyCUDA function
+        Compiled kernel function
+    """
+    key = block_size
+
+    with _kernel_cache_lock:
+        if key in _kernel_cache:
+            _kernel_cache.move_to_end(key)
+            return _kernel_cache[key]
+
+        # Compile kernel
+        compiled = compile_tls(block_size=block_size)
+
+        # Add to cache
+        _kernel_cache[key] = compiled
+        _kernel_cache.move_to_end(key)
+
+        # Evict oldest if needed
+        if len(_kernel_cache) > _KERNEL_CACHE_MAX_SIZE:
+            _kernel_cache.popitem(last=False)
+
+        return compiled
+
+
+def compile_tls(block_size=_default_block_size):
+    """
+    Compile TLS CUDA kernels.
+
+    Parameters
+    ----------
+    block_size : int, optional
+        CUDA block size (default: 128)
+
+    Returns
+    -------
+    kernels : dict
+        Dictionary with 'standard' and 'keplerian' kernel functions
+
+    Notes
+    -----
+    The kernels use bitonic sort for phase sorting and a limb-darkened
+    transit template loaded into shared memory for physically realistic
+    fitting. Works for datasets up to ~100,000 points.
+
+    The 'keplerian' kernel variant accepts per-period qmin/qmax arrays
+    to focus the duration search on physically plausible values.
+    """
+    cppd = dict(BLOCK_SIZE=block_size)
+
+    kernel_name = 'tls'
+    kernel_txt = _module_reader(find_kernel(kernel_name), cpp_defs=cppd)
+
+    # Compile with fast math
+    # no_extern_c=True needed for proper extern "C" handling
+    module = SourceModule(kernel_txt, options=['--use_fast_math'], no_extern_c=True)
+
+    # Get both kernel functions
+    kernels = {
+        'standard': module.get_function('tls_search_kernel'),
+        'keplerian': module.get_function('tls_search_kernel_keplerian')
+    }
+
+    return kernels
+
+
+class TLSMemory:
+    """
+    Memory management for TLS GPU computations.
+
+    This class handles allocation and transfer of data between CPU and GPU
+    for TLS periodogram calculations.
+
+    Parameters
+    ----------
+    max_ndata : int
+        Maximum number of data points
+    max_nperiods : int
+        Maximum number of trial periods
+    stream : pycuda.driver.Stream, optional
+        CUDA stream for async operations
+
+    Attributes
+    ----------
+    t, y, dy : ndarray
+        Pinned CPU arrays for time, flux, uncertainties
+    t_g, y_g, dy_g : gpuarray
+        GPU arrays for data
+    periods_g, chi2_g : gpuarray
+        GPU arrays for periods and chi-squared values
+    best_t0_g, best_duration_g, best_depth_g : gpuarray
+        GPU arrays for best-fit parameters
+    """
+
+    def __init__(self, max_ndata, max_nperiods, stream=None, **kwargs):
+        self.max_ndata = max_ndata
+        self.max_nperiods = max_nperiods
+        self.stream = stream
+        self.rtype = np.float32
+
+        # CPU pinned memory for fast transfers
+        self.t = None
+        self.y = None
+        self.dy = None
+
+        # GPU memory
+        self.t_g = None
+        self.y_g = None
+        self.dy_g = None
+        self.periods_g = None
+        self.qmin_g = None  # Keplerian duration constraints
+        self.qmax_g = None  # Keplerian duration constraints
+        self.chi2_g = None
+        self.best_t0_g = None
+        self.best_duration_g = None
+        self.best_depth_g = None
+        self.template_g = None
+
+        self.allocate_pinned_arrays()
+
+    def allocate_pinned_arrays(self):
+        """Allocate page-aligned pinned memory on CPU for fast transfers."""
+        pagesize = resource.getpagesize()
+
+        self.t = cuda.aligned_zeros(shape=(self.max_ndata,),
+                                    dtype=self.rtype,
+                                    alignment=pagesize)
+
+        self.y = cuda.aligned_zeros(shape=(self.max_ndata,),
+                                    dtype=self.rtype,
+                                    alignment=pagesize)
+
+        self.dy = cuda.aligned_zeros(shape=(self.max_ndata,),
+                                     dtype=self.rtype,
+                                     alignment=pagesize)
+
+        self.periods = cuda.aligned_zeros(shape=(self.max_nperiods,),
+                                         dtype=self.rtype,
+                                         alignment=pagesize)
+
+        self.chi2 = cuda.aligned_zeros(shape=(self.max_nperiods,),
+                                      dtype=self.rtype,
+                                      alignment=pagesize)
+
+        self.best_t0 = cuda.aligned_zeros(shape=(self.max_nperiods,),
+                                         dtype=self.rtype,
+                                         alignment=pagesize)
+
+        self.best_duration = cuda.aligned_zeros(shape=(self.max_nperiods,),
+                                               dtype=self.rtype,
+                                               alignment=pagesize)
+
+        self.best_depth = cuda.aligned_zeros(shape=(self.max_nperiods,),
+                                            dtype=self.rtype,
+                                            alignment=pagesize)
+
+        # Keplerian duration constraints
+        self.qmin = cuda.aligned_zeros(shape=(self.max_nperiods,),
+                                      dtype=self.rtype,
+                                      alignment=pagesize)
+
+        self.qmax = cuda.aligned_zeros(shape=(self.max_nperiods,),
+                                      dtype=self.rtype,
+                                      alignment=pagesize)
+
+    def allocate_gpu_arrays(self, ndata=None, nperiods=None):
+        """Allocate GPU memory."""
+        if ndata is None:
+            ndata = self.max_ndata
+        if nperiods is None:
+            nperiods = self.max_nperiods
+
+        self.t_g = gpuarray.zeros(ndata, dtype=self.rtype)
+        self.y_g = gpuarray.zeros(ndata, dtype=self.rtype)
+        self.dy_g = gpuarray.zeros(ndata, dtype=self.rtype)
+        self.periods_g = gpuarray.zeros(nperiods, dtype=self.rtype)
+        self.qmin_g = gpuarray.zeros(nperiods, dtype=self.rtype)
+        self.qmax_g = gpuarray.zeros(nperiods, dtype=self.rtype)
+        self.chi2_g = gpuarray.zeros(nperiods, dtype=self.rtype)
+        self.best_t0_g = gpuarray.zeros(nperiods, dtype=self.rtype)
+        self.best_duration_g = gpuarray.zeros(nperiods, dtype=self.rtype)
+        self.best_depth_g = gpuarray.zeros(nperiods, dtype=self.rtype)
+
+    def set_template(self, template):
+        """Transfer transit template to GPU.
+
+        Parameters
+        ----------
+        template : ndarray
+            Float32 template array from generate_transit_template()
+        """
+        template = np.asarray(template, dtype=self.rtype)
+        self.template_g = gpuarray.to_gpu(template)
+
+    def setdata(self, t, y, dy, periods=None, qmin=None, qmax=None, transfer=True):
+        """
+        Set data for TLS computation.
+
+        Parameters
+        ----------
+        t : array_like
+            Observation times
+        y : array_like
+            Flux measurements
+        dy : array_like
+            Flux uncertainties
+        periods : array_like, optional
+            Trial periods
+        qmin : array_like, optional
+            Minimum fractional duration per period (for Keplerian search)
+        qmax : array_like, optional
+            Maximum fractional duration per period (for Keplerian search)
+        transfer : bool, optional
+            Transfer to GPU immediately (default: True)
+        """
+        ndata = len(t)
+
+        # Copy to pinned memory
+        self.t[:ndata] = np.asarray(t).astype(self.rtype)
+        self.y[:ndata] = np.asarray(y).astype(self.rtype)
+        self.dy[:ndata] = np.asarray(dy).astype(self.rtype)
+
+        if periods is not None:
+            nperiods = len(periods)
+            self.periods[:nperiods] = np.asarray(periods).astype(self.rtype)
+
+        if qmin is not None:
+            nperiods = len(qmin)
+            self.qmin[:nperiods] = np.asarray(qmin).astype(self.rtype)
+
+        if qmax is not None:
+            nperiods = len(qmax)
+            self.qmax[:nperiods] = np.asarray(qmax).astype(self.rtype)
+
+        # Allocate GPU memory if needed
+        if self.t_g is None or len(self.t_g) < ndata:
+            self.allocate_gpu_arrays(ndata, len(periods) if periods is not None else self.max_nperiods)
+
+        # Transfer to GPU
+        if transfer:
+            self.transfer_to_gpu(ndata, len(periods) if periods is not None else None,
+                               qmin is not None, qmax is not None)
+
+    def transfer_to_gpu(self, ndata, nperiods=None, has_qmin=False, has_qmax=False):
+        """Transfer data from CPU to GPU."""
+        if self.stream is None:
+            self.t_g.set(self.t[:ndata])
+            self.y_g.set(self.y[:ndata])
+            self.dy_g.set(self.dy[:ndata])
+            if nperiods is not None:
+                self.periods_g.set(self.periods[:nperiods])
+            if has_qmin:
+                self.qmin_g.set(self.qmin[:nperiods])
+            if has_qmax:
+                self.qmax_g.set(self.qmax[:nperiods])
+        else:
+            self.t_g.set_async(self.t[:ndata], stream=self.stream)
+            self.y_g.set_async(self.y[:ndata], stream=self.stream)
+            self.dy_g.set_async(self.dy[:ndata], stream=self.stream)
+            if nperiods is not None:
+                self.periods_g.set_async(self.periods[:nperiods], stream=self.stream)
+            if has_qmin:
+                self.qmin_g.set_async(self.qmin[:nperiods], stream=self.stream)
+            if has_qmax:
+                self.qmax_g.set_async(self.qmax[:nperiods], stream=self.stream)
+
+    def transfer_from_gpu(self, nperiods):
+        """Transfer results from GPU to CPU."""
+        if self.stream is None:
+            self.chi2[:nperiods] = self.chi2_g.get()[:nperiods]
+            self.best_t0[:nperiods] = self.best_t0_g.get()[:nperiods]
+            self.best_duration[:nperiods] = self.best_duration_g.get()[:nperiods]
+            self.best_depth[:nperiods] = self.best_depth_g.get()[:nperiods]
+        else:
+            self.chi2_g.get_async(ary=self.chi2, stream=self.stream)
+            self.best_t0_g.get_async(ary=self.best_t0, stream=self.stream)
+            self.best_duration_g.get_async(ary=self.best_duration, stream=self.stream)
+            self.best_depth_g.get_async(ary=self.best_depth, stream=self.stream)
+
+    @classmethod
+    def fromdata(cls, t, y, dy, periods=None, **kwargs):
+        """
+        Create TLSMemory instance from data.
+
+        Parameters
+        ----------
+        t, y, dy : array_like
+            Time series data
+        periods : array_like, optional
+            Trial periods
+        **kwargs
+            Passed to __init__
+
+        Returns
+        -------
+        memory : TLSMemory
+            Initialized memory object
+        """
+        max_ndata = kwargs.get('max_ndata', len(t))
+        max_nperiods = kwargs.get('max_nperiods',
+                                  len(periods) if periods is not None else 10000)
+
+        mem = cls(max_ndata, max_nperiods, **kwargs)
+        mem.setdata(t, y, dy, periods=periods, transfer=kwargs.get('transfer', True))
+
+        return mem
+
+
+def tls_search_gpu(t, y, dy, periods=None, durations=None,
+                   qmin=None, qmax=None, n_durations=15,
+                   R_star=1.0, M_star=1.0,
+                   period_min=None, period_max=None, n_transits_min=2,
+                   oversampling_factor=3, duration_grid_step=1.1,
+                   R_planet_min=0.5, R_planet_max=5.0,
+                   limb_dark='quadratic', u=[0.4804, 0.1867],
+                   block_size=None,
+                   kernel=None, memory=None, stream=None,
+                   transfer_to_device=True, transfer_to_host=True,
+                   **kwargs):
+    """
+    Run Transit Least Squares search on GPU.
+
+    Parameters
+    ----------
+    t : array_like
+        Observation times (days)
+    y : array_like
+        Flux measurements (arbitrary units, will be normalized)
+    dy : array_like
+        Flux uncertainties
+    periods : array_like, optional
+        Custom period grid. If None, generated automatically.
+    qmin : array_like, optional
+        Minimum fractional duration per period (for Keplerian search).
+        If provided, enables Keplerian mode.
+    qmax : array_like, optional
+        Maximum fractional duration per period (for Keplerian search).
+        If provided, enables Keplerian mode.
+    n_durations : int, optional
+        Number of duration samples per period (default: 15).
+        Only used in Keplerian mode.
+    R_star : float, optional
+        Stellar radius in solar radii (default: 1.0)
+    M_star : float, optional
+        Stellar mass in solar masses (default: 1.0)
+    period_min, period_max : float, optional
+        Period search range (days). Auto-computed if None.
+    n_transits_min : int, optional
+        Minimum number of transits required (default: 2)
+    oversampling_factor : float, optional
+        Period grid oversampling (default: 3)
+    duration_grid_step : float, optional
+        Duration grid spacing factor (default: 1.1)
+    R_planet_min, R_planet_max : float, optional
+        Planet radius range in Earth radii (default: 0.5 to 5.0)
+    limb_dark : str, optional
+        Limb darkening law (default: 'quadratic')
+    u : list, optional
+        Limb darkening coefficients (default: [0.4804, 0.1867])
+    block_size : int, optional
+        CUDA block size (auto-selected if None)
+    kernel : PyCUDA function, optional
+        Pre-compiled kernel
+    memory : TLSMemory, optional
+        Pre-allocated memory object
+    stream : cuda.Stream, optional
+        CUDA stream for async execution
+    transfer_to_device : bool, optional
+        Transfer data to GPU (default: True)
+    transfer_to_host : bool, optional
+        Transfer results to CPU (default: True)
+
+    Returns
+    -------
+    results : dict
+        Dictionary with keys:
+        - 'periods': Trial periods
+        - 'chi2': Chi-squared values
+        - 'best_t0': Best mid-transit times
+        - 'best_duration': Best durations
+        - 'best_depth': Best depths
+        - 'SDE': Signal Detection Efficiency (if computed)
+
+    Notes
+    -----
+    This is the main GPU TLS function. For the first implementation,
+    it provides a basic version that will be optimized in Phase 2.
+    """
+    # Validate stellar parameters
+    tls_grids.validate_stellar_parameters(R_star, M_star)
+
+    # Validate limb darkening
+    tls_models.validate_limb_darkening_coeffs(u, limb_dark)
+
+    # Generate period grid if not provided
+    if periods is None:
+        periods = tls_grids.period_grid_ofir(
+            t, R_star=R_star, M_star=M_star,
+            oversampling_factor=oversampling_factor,
+            period_min=period_min, period_max=period_max,
+            n_transits_min=n_transits_min
+        )
+
+    # Convert to numpy arrays
+    t = np.asarray(t, dtype=np.float32)
+    y = np.asarray(y, dtype=np.float32)
+    dy = np.asarray(dy, dtype=np.float32)
+    periods = np.asarray(periods, dtype=np.float32)
+
+    ndata = len(t)
+    nperiods = len(periods)
+
+    # Choose block size
+    if block_size is None:
+        block_size = _choose_block_size(ndata)
+
+    # Determine if using Keplerian mode
+    use_keplerian = (qmin is not None and qmax is not None)
+
+    # Get or compile kernels
+    if kernel is None:
+        kernels = _get_cached_kernels(block_size)
+        kernel = kernels['keplerian'] if use_keplerian else kernels['standard']
+
+    # Allocate or use existing memory
+    if memory is None:
+        memory = TLSMemory.fromdata(t, y, dy, periods=periods,
+                                    stream=stream,
+                                    transfer=transfer_to_device)
+    elif transfer_to_device:
+        memory.setdata(t, y, dy, periods=periods, transfer=True)
+
+    # Set qmin/qmax if using Keplerian mode
+    if use_keplerian:
+        qmin = np.asarray(qmin, dtype=np.float32)
+        qmax = np.asarray(qmax, dtype=np.float32)
+        if len(qmin) != nperiods or len(qmax) != nperiods:
+            raise ValueError(f"qmin and qmax must have same length as periods ({nperiods})")
+        memory.setdata(t, y, dy, periods=periods, qmin=qmin, qmax=qmax, transfer=transfer_to_device)
+
+    # Generate and transfer transit template
+    n_template = kwargs.get('n_template', 1000)
+    if memory.template_g is None:
+        template = tls_models.generate_transit_template(
+            n_template=n_template, limb_dark=limb_dark, u=u
+        )
+        memory.set_template(template)
+
+    # Calculate shared memory requirements
+    # phases[ndata] + y_sorted[ndata] + dy_sorted[ndata] +
+    # template[n_template] + 4 * thread arrays[block_size]
+    shared_mem_size = (3 * ndata + n_template + 4 * block_size) * 4  # 4 bytes per float
+
+    # Launch kernel
+    grid = (nperiods, 1, 1)
+    block = (block_size, 1, 1)
+
+    if use_keplerian:
+        # Keplerian kernel with qmin/qmax arrays and template
+        kernel_args = [
+            memory.t_g, memory.y_g, memory.dy_g,
+            memory.periods_g, memory.qmin_g, memory.qmax_g,
+            memory.template_g,
+            np.int32(ndata), np.int32(nperiods), np.int32(n_durations),
+            np.int32(n_template),
+            memory.chi2_g, memory.best_t0_g,
+            memory.best_duration_g, memory.best_depth_g,
+        ]
+    else:
+        # Standard kernel with fixed duration range and template
+        kernel_args = [
+            memory.t_g, memory.y_g, memory.dy_g,
+            memory.periods_g,
+            memory.template_g,
+            np.int32(ndata), np.int32(nperiods),
+            np.int32(n_template),
+            memory.chi2_g, memory.best_t0_g,
+            memory.best_duration_g, memory.best_depth_g,
+        ]
+
+    kernel_kwargs = dict(block=block, grid=grid, shared=shared_mem_size)
+    if stream is not None:
+        kernel_kwargs['stream'] = stream
+
+    kernel(*kernel_args, **kernel_kwargs)
+
+    # Transfer results if requested
+    if transfer_to_host:
+        if stream is not None:
+            stream.synchronize()
+        memory.transfer_from_gpu(nperiods)
+
+        chi2_vals = memory.chi2[:nperiods].copy()
+        best_t0_vals = memory.best_t0[:nperiods].copy()
+        best_duration_vals = memory.best_duration[:nperiods].copy()
+        best_depth_vals = memory.best_depth[:nperiods].copy()
+
+        # Find best period
+        best_idx = np.argmin(chi2_vals)
+        best_period = periods[best_idx]
+        best_chi2 = chi2_vals[best_idx]
+        best_t0 = best_t0_vals[best_idx]
+        best_duration = best_duration_vals[best_idx]
+        best_depth = best_depth_vals[best_idx]
+
+        # Estimate number of transits
+        T_span = np.max(t) - np.min(t)
+        n_transits = int(T_span / best_period)
+
+        # Compute statistics
+        stats = tls_stats.compute_all_statistics(
+            chi2_vals, periods, best_idx,
+            best_depth, best_duration, n_transits
+        )
+
+        # Period uncertainty
+        period_uncertainty = tls_stats.compute_period_uncertainty(
+            periods, chi2_vals, best_idx
+        )
+
+        results = {
+            # Raw outputs
+            'periods': periods,
+            'chi2': chi2_vals,
+            'best_t0_per_period': best_t0_vals,
+            'best_duration_per_period': best_duration_vals,
+            'best_depth_per_period': best_depth_vals,
+
+            # Best-fit parameters
+            'period': best_period,
+            'period_uncertainty': period_uncertainty,
+            'T0': best_t0,
+            'duration': best_duration,
+            'depth': best_depth,
+            'chi2_min': best_chi2,
+
+            # Statistics
+            'SDE': stats['SDE'],
+            'SDE_raw': stats['SDE_raw'],
+            'SNR': stats['SNR'],
+            'FAP': stats['FAP'],
+            'power': stats['power'],
+            'SR': stats['SR'],
+
+            # Metadata
+            'n_transits': n_transits,
+            'R_star': R_star,
+            'M_star': M_star,
+        }
+    else:
+        # Just return periods if not transferring
+        results = {
+            'periods': periods,
+            'chi2': None,
+            'best_t0_per_period': None,
+            'best_duration_per_period': None,
+            'best_depth_per_period': None,
+        }
+
+    return results
+
+
+def tls_search(t, y, dy, **kwargs):
+    """
+    High-level TLS search function.
+
+    This is the main user-facing function for TLS searches.
+
+    Parameters
+    ----------
+    t, y, dy : array_like
+        Time series data
+    **kwargs
+        Passed to tls_search_gpu
+
+    Returns
+    -------
+    results : dict
+        Search results
+
+    See Also
+    --------
+    tls_search_gpu : Lower-level GPU function
+    tls_transit : Keplerian-aware search wrapper
+    """
+    return tls_search_gpu(t, y, dy, **kwargs)
+
+
+def tls_transit(t, y, dy, R_star=1.0, M_star=1.0, R_planet=1.0,
+                qmin_fac=0.5, qmax_fac=2.0, n_durations=15,
+                period_min=None, period_max=None, n_transits_min=2,
+                oversampling_factor=3, **kwargs):
+    """
+    Transit Least Squares search with Keplerian duration constraints.
+
+    This is the TLS analog of BLS's eebls_transit() function. It uses stellar
+    parameters to focus the duration search on physically plausible values,
+    providing ~7-8× efficiency improvement over fixed duration ranges.
+
+    Parameters
+    ----------
+    t : array_like
+        Observation times (days)
+    y : array_like
+        Flux measurements (arbitrary units)
+    dy : array_like
+        Flux uncertainties
+    R_star : float, optional
+        Stellar radius in solar radii (default: 1.0)
+    M_star : float, optional
+        Stellar mass in solar masses (default: 1.0)
+    R_planet : float, optional
+        Fiducial planet radius in Earth radii (default: 1.0)
+        Sets the central duration value around which to search
+    qmin_fac : float, optional
+        Minimum duration factor (default: 0.5)
+        Searches down to qmin_fac × q_keplerian
+    qmax_fac : float, optional
+        Maximum duration factor (default: 2.0)
+        Searches up to qmax_fac × q_keplerian
+    n_durations : int, optional
+        Number of duration samples per period (default: 15)
+    period_min, period_max : float, optional
+        Period search range (days). Auto-computed if None.
+    n_transits_min : int, optional
+        Minimum number of transits required (default: 2)
+    oversampling_factor : float, optional
+        Period grid oversampling (default: 3)
+    **kwargs
+        Additional parameters passed to tls_search_gpu
+
+    Returns
+    -------
+    results : dict
+        Search results with keys:
+        - 'period': Best-fit period
+        - 'T0': Best mid-transit time
+        - 'duration': Best transit duration
+        - 'depth': Best transit depth
+        - 'SDE': Signal Detection Efficiency
+        - 'periods': Trial periods
+        - 'chi2': Chi-squared values per period
+        ... (see tls_search_gpu for full list)
+
+    Notes
+    -----
+    This function automatically generates:
+    1. Optimal period grid using Ofir (2014) algorithm
+    2. Per-period duration ranges based on Keplerian physics
+    3. Qmin/qmax arrays for focused duration search
+
+    The duration search at each period focuses on physically plausible values:
+    - For short periods: searches shorter durations
+    - For long periods: searches longer durations
+    - Scales with stellar density (M_star, R_star)
+
+    This is much more efficient than searching a fixed fractional duration
+    range (0.5%-15%) at all periods.
+
+    Examples
+    --------
+    >>> from cuvarbase import tls
+    >>> results = tls.tls_transit(t, y, dy,
+    ...                            R_star=1.0, M_star=1.0,
+    ...                            period_min=5.0, period_max=20.0)
+    >>> print(f"Best period: {results['period']:.4f} days")
+    >>> print(f"Transit depth: {results['depth']:.4f}")
+
+    See Also
+    --------
+    tls_search_gpu : Lower-level GPU function
+    tls_grids.duration_grid_keplerian : Generate Keplerian duration grids
+    tls_grids.q_transit : Calculate Keplerian fractional duration
+    """
+    # Generate period grid
+    periods = tls_grids.period_grid_ofir(
+        t, R_star=R_star, M_star=M_star,
+        oversampling_factor=oversampling_factor,
+        period_min=period_min, period_max=period_max,
+        n_transits_min=n_transits_min
+    )
+
+    # Generate Keplerian duration constraints
+    durations, dur_counts, q_values = tls_grids.duration_grid_keplerian(
+        periods, R_star=R_star, M_star=M_star, R_planet=R_planet,
+        qmin_fac=qmin_fac, qmax_fac=qmax_fac, n_durations=n_durations
+    )
+
+    # Calculate qmin and qmax arrays
+    qmin = q_values * qmin_fac
+    qmax = q_values * qmax_fac
+
+    # Run TLS search with Keplerian constraints
+    results = tls_search_gpu(
+        t, y, dy,
+        periods=periods,
+        qmin=qmin,
+        qmax=qmax,
+        n_durations=n_durations,
+        R_star=R_star,
+        M_star=M_star,
+        **kwargs
+    )
+
+    return results
diff --git a/cuvarbase/tls_grids.py b/cuvarbase/tls_grids.py
new file mode 100644
index 0000000..429ff57
--- /dev/null
+++ b/cuvarbase/tls_grids.py
@@ -0,0 +1,463 @@
+"""
+Period and duration grid generation for Transit Least Squares.
+
+Implements the Ofir (2014) optimal frequency sampling algorithm and
+logarithmically-spaced duration grids based on stellar parameters.
+
+References
+----------
+.. [1] Ofir (2014), "An optimized transit detection algorithm to search
+       for periodic transits of small planets", A&A 561, A138
+.. [2] Hippke & Heller (2019), "Transit Least Squares", A&A 623, A39
+"""
+
+import numpy as np
+
+
+# Physical constants
+G = 6.67430e-11  # Gravitational constant (m^3 kg^-1 s^-2)
+R_sun = 6.95700e8  # Solar radius (m)
+M_sun = 1.98840e30  # Solar mass (kg)
+R_earth = 6.371e6  # Earth radius (m)
+
+
+def q_transit(period, R_star=1.0, M_star=1.0, R_planet=1.0):
+    """
+    Calculate fractional transit duration (q = duration/period) for Keplerian orbit.
+
+    This is the TLS analog of the BLS q parameter. For a circular, edge-on orbit,
+    the transit duration scales with stellar density and planet/star size ratio.
+
+    Parameters
+    ----------
+    period : float or array_like
+        Orbital period in days
+    R_star : float, optional
+        Stellar radius in solar radii (default: 1.0)
+    M_star : float, optional
+        Stellar mass in solar masses (default: 1.0)
+    R_planet : float, optional
+        Planet radius in Earth radii (default: 1.0)
+
+    Returns
+    -------
+    q : float or array_like
+        Fractional transit duration (duration/period)
+
+    Notes
+    -----
+    This follows the same Keplerian assumption as BLS but for TLS.
+    The duration is calculated for edge-on circular orbits and normalized by period.
+
+    See Also
+    --------
+    transit_duration_max : Calculate absolute transit duration
+    duration_grid_keplerian : Generate duration grid using Keplerian q values
+    """
+    duration = transit_duration_max(period, R_star, M_star, R_planet)
+    return duration / period
+
+
+def transit_duration_max(period, R_star=1.0, M_star=1.0, R_planet=1.0):
+    """
+    Calculate maximum transit duration for circular orbit.
+
+    Parameters
+    ----------
+    period : float or array_like
+        Orbital period in days
+    R_star : float, optional
+        Stellar radius in solar radii (default: 1.0)
+    M_star : float, optional
+        Stellar mass in solar masses (default: 1.0)
+    R_planet : float, optional
+        Planet radius in Earth radii (default: 1.0)
+
+    Returns
+    -------
+    duration : float or array_like
+        Maximum transit duration in days (for edge-on circular orbit)
+
+    Notes
+    -----
+    Formula: T_14 = (R_star + R_planet) * (4 * P / (π * G * M_star))^(1/3)
+
+    Assumes:
+    - Circular orbit (e = 0)
+    - Edge-on configuration (i = 90°)
+    - Planet + stellar radii contribute to transit chord
+    """
+    period_sec = period * 86400.0  # Convert to seconds
+    R_total = R_star * R_sun + R_planet * R_earth  # Total radius in meters
+    M_star_kg = M_star * M_sun  # Mass in kg
+
+    # Duration in seconds
+    duration_sec = R_total * (4.0 * period_sec / (np.pi * G * M_star_kg))**(1.0/3.0)
+
+    # Convert to days
+    duration_days = duration_sec / 86400.0
+
+    return duration_days
+
+
+def period_grid_ofir(t, R_star=1.0, M_star=1.0, oversampling_factor=3,
+                     period_min=None, period_max=None, n_transits_min=2):
+    """
+    Generate optimal period grid using Ofir (2014) algorithm.
+
+    This creates a non-uniform period grid that optimally samples the
+    period space, with denser sampling at shorter periods where transit
+    durations are shorter.
+
+    Parameters
+    ----------
+    t : array_like
+        Observation times (days)
+    R_star : float, optional
+        Stellar radius in solar radii (default: 1.0)
+    M_star : float, optional
+        Stellar mass in solar masses (default: 1.0)
+    oversampling_factor : float, optional
+        Oversampling factor for period grid (default: 3)
+        Higher values give denser grids
+    period_min : float, optional
+        Minimum period to search (days). If None, calculated from
+        Roche limit and minimum transits
+    period_max : float, optional
+        Maximum period to search (days). If None, set to half the
+        total observation span
+    n_transits_min : int, optional
+        Minimum number of transits required (default: 2)
+
+    Returns
+    -------
+    periods : ndarray
+        Array of trial periods (days)
+
+    Notes
+    -----
+    Uses the Ofir (2014) frequency-to-cubic transformation:
+
+    f_x = (A/3 * x + C)^3
+
+    where A = (2π)^(2/3) / π * R_star / (G * M_star)^(1/3) * 1/(S * OS)
+
+    This ensures optimal statistical sampling across the period space.
+    """
+    t = np.asarray(t)
+    T_span = np.max(t) - np.min(t)  # Total observation span
+
+    # Store user's requested limits (for filtering later)
+    user_period_min = period_min
+    user_period_max = period_max
+
+    # Physical boundary conditions (following Ofir 2014 and CPU TLS)
+    # f_min: require n_transits_min transits over baseline
+    f_min = n_transits_min / (T_span * 86400.0)  # 1/seconds
+
+    # f_max: Roche limit (maximum possible frequency)
+    # P_roche = 2π * sqrt(a^3 / (G*M)) where a = 3*R at Roche limit
+    R_star_m = R_star * R_sun
+    M_star_kg = M_star * M_sun
+    f_max = 1.0 / (2.0 * np.pi) * np.sqrt(G * M_star_kg / (3.0 * R_star_m)**3)
+
+    # Ofir (2014) parameters - equations (5), (6), (7)
+    T_span_sec = T_span * 86400.0  # Convert to seconds
+
+    # Equation (5): optimal frequency sampling parameter
+    A = ((2.0 * np.pi)**(2.0/3.0) / np.pi * R_star_m /
+         (G * M_star_kg)**(1.0/3.0) / (T_span_sec * oversampling_factor))
+
+    # Equation (6): offset parameter
+    C = f_min**(1.0/3.0) - A / 3.0
+
+    # Equation (7): optimal number of frequency samples
+    n_freq = int(np.ceil((f_max**(1.0/3.0) - f_min**(1.0/3.0) + A / 3.0) * 3.0 / A))
+
+    # Ensure we have at least some frequencies
+    if n_freq < 10:
+        n_freq = 10
+
+    # Linear grid in cubic-root frequency space
+    x = np.arange(n_freq) + 1  # 1-indexed like CPU TLS
+
+    # Transform to frequency space (Hz)
+    freqs = (A / 3.0 * x + C)**3
+
+    # Convert to periods (days)
+    periods = 1.0 / freqs / 86400.0
+
+    # Apply user-requested period limits
+    if user_period_min is not None or user_period_max is not None:
+        if user_period_min is None:
+            user_period_min = 0.0
+        if user_period_max is None:
+            user_period_max = np.inf
+
+        periods = periods[(periods > user_period_min) & (periods <= user_period_max)]
+
+    # If we somehow got no periods, use simple linear grid
+    if len(periods) == 0:
+        if user_period_min is None:
+            user_period_min = T_span / 20.0
+        if user_period_max is None:
+            user_period_max = T_span / 2.0
+        periods = np.linspace(user_period_min, user_period_max, 100)
+
+    # Sort in increasing order (standard convention)
+    periods = np.sort(periods)
+
+    return periods
+
+
+def duration_grid(periods, R_star=1.0, M_star=1.0, R_planet_min=0.5,
+                  R_planet_max=5.0, duration_grid_step=1.1):
+    """
+    Generate logarithmically-spaced duration grid for each period.
+
+    Parameters
+    ----------
+    periods : array_like
+        Trial periods (days)
+    R_star : float, optional
+        Stellar radius in solar radii (default: 1.0)
+    M_star : float, optional
+        Stellar mass in solar masses (default: 1.0)
+    R_planet_min : float, optional
+        Minimum planet radius to consider in Earth radii (default: 0.5)
+    R_planet_max : float, optional
+        Maximum planet radius to consider in Earth radii (default: 5.0)
+    duration_grid_step : float, optional
+        Multiplicative step for duration grid (default: 1.1)
+        1.1 means each duration is 10% larger than previous
+
+    Returns
+    -------
+    durations : list of ndarray
+        List where durations[i] is array of durations for periods[i]
+    duration_counts : ndarray
+        Number of durations for each period
+
+    Notes
+    -----
+    Durations are sampled logarithmically from the minimum transit time
+    (small planet) to maximum transit time (large planet) for each period.
+
+    The grid spacing ensures we don't miss any transit duration while
+    avoiding excessive oversampling.
+    """
+    periods = np.asarray(periods)
+
+    # Calculate duration bounds for each period
+    T_min = transit_duration_max(periods, R_star, M_star, R_planet_min)
+    T_max = transit_duration_max(periods, R_star, M_star, R_planet_max)
+
+    durations = []
+    duration_counts = np.zeros(len(periods), dtype=np.int32)
+
+    for i, (period, t_min, t_max) in enumerate(zip(periods, T_min, T_max)):
+        # Generate logarithmically-spaced durations
+        dur = []
+        t = t_min
+        while t <= t_max:
+            dur.append(t)
+            t *= duration_grid_step
+
+        # Ensure we include the maximum duration
+        if dur[-1] < t_max:
+            dur.append(t_max)
+
+        durations.append(np.array(dur, dtype=np.float32))
+        duration_counts[i] = len(dur)
+
+    return durations, duration_counts
+
+
+def duration_grid_keplerian(periods, R_star=1.0, M_star=1.0, R_planet=1.0,
+                            qmin_fac=0.5, qmax_fac=2.0, n_durations=15):
+    """
+    Generate Keplerian-aware duration grid for each period.
+
+    This is the TLS analog of BLS's Keplerian q-based duration search.
+    At each period, we calculate the expected transit duration for a
+    Keplerian orbit and search within qmin_fac to qmax_fac times that value.
+
+    Parameters
+    ----------
+    periods : array_like
+        Trial periods (days)
+    R_star : float, optional
+        Stellar radius in solar radii (default: 1.0)
+    M_star : float, optional
+        Stellar mass in solar masses (default: 1.0)
+    R_planet : float, optional
+        Fiducial planet radius in Earth radii (default: 1.0)
+        This sets the central duration value around which we search
+    qmin_fac : float, optional
+        Minimum duration factor (default: 0.5)
+        Searches down to qmin_fac * q_keplerian
+    qmax_fac : float, optional
+        Maximum duration factor (default: 2.0)
+        Searches up to qmax_fac * q_keplerian
+    n_durations : int, optional
+        Number of duration samples per period (default: 15)
+        Logarithmically spaced between qmin and qmax
+
+    Returns
+    -------
+    durations : list of ndarray
+        List where durations[i] is array of durations for periods[i]
+    duration_counts : ndarray
+        Number of durations for each period (constant = n_durations)
+    q_values : ndarray
+        Keplerian q values (duration/period) for each period
+
+    Notes
+    -----
+    This exploits the Keplerian assumption that transit duration scales
+    predictably with period based on stellar parameters. This is much
+    more efficient than searching all possible durations, as we focus
+    the search around the physically expected value.
+
+    For example, for a Sun-like star (M=1, R=1) and Earth-size planet:
+    - At P=10 days: q ~ 0.015, so we search 0.0075 to 0.030 (0.5x to 2x)
+    - At P=100 days: q ~ 0.027, so we search 0.014 to 0.054
+
+    This is equivalent to BLS's approach but applied to transit shapes.
+
+    See Also
+    --------
+    q_transit : Calculate Keplerian fractional transit duration
+    duration_grid : Alternative method that searches fixed planet radius range
+    """
+    periods = np.asarray(periods)
+
+    # Calculate Keplerian q value (fractional duration) for each period
+    q_values = q_transit(periods, R_star, M_star, R_planet)
+
+    # Duration bounds based on q-factors
+    qmin_vals = q_values * qmin_fac
+    qmax_vals = q_values * qmax_fac
+
+    durations = []
+    duration_counts = np.full(len(periods), n_durations, dtype=np.int32)
+
+    for period, qmin, qmax in zip(periods, qmin_vals, qmax_vals):
+        # Logarithmically-spaced durations from qmin to qmax
+        # (in absolute time, not fractional)
+        dur_min = qmin * period
+        dur_max = qmax * period
+
+        # Log-spaced grid
+        dur = np.logspace(np.log10(dur_min), np.log10(dur_max),
+                         n_durations, dtype=np.float32)
+
+        durations.append(dur)
+
+    return durations, duration_counts, q_values
+
+
+def t0_grid(period, duration, n_transits=None, oversampling=5):
+    """
+    Generate grid of T0 (mid-transit time) positions to test.
+
+    Parameters
+    ----------
+    period : float
+        Orbital period (days)
+    duration : float
+        Transit duration (days)
+    n_transits : int, optional
+        Number of transits in observation span. If None, assumes
+        you want to sample one full period cycle.
+    oversampling : int, optional
+        Number of T0 positions to test per transit duration (default: 5)
+
+    Returns
+    -------
+    t0_values : ndarray
+        Array of T0 positions (in phase, 0 to 1)
+
+    Notes
+    -----
+    This creates a grid of phase offsets to test. The spacing is
+    determined by the transit duration and oversampling factor.
+
+    For computational efficiency, we typically use stride sampling
+    (not every possible phase offset).
+    """
+    # Phase-space duration
+    q = duration / period
+
+    # Step size in phase
+    step = q / oversampling
+
+    # Number of steps to cover one full period
+    if n_transits is not None:
+        n_steps = int(np.ceil(1.0 / (step * n_transits)))
+    else:
+        n_steps = int(np.ceil(1.0 / step))
+
+    # Grid from 0 to 1 (phase)
+    t0_values = np.linspace(0, 1 - step, n_steps, dtype=np.float32)
+
+    return t0_values
+
+
+def validate_stellar_parameters(R_star=1.0, M_star=1.0,
+                                R_star_min=0.13, R_star_max=3.5,
+                                M_star_min=0.1, M_star_max=2.0):
+    """
+    Validate stellar parameters are within reasonable bounds.
+
+    Parameters
+    ----------
+    R_star : float
+        Stellar radius in solar radii
+    M_star : float
+        Stellar mass in solar masses
+    R_star_min, R_star_max : float
+        Allowed range for stellar radius
+    M_star_min, M_star_max : float
+        Allowed range for stellar mass
+
+    Raises
+    ------
+    ValueError
+        If parameters are outside allowed ranges
+    """
+    if not (R_star_min <= R_star <= R_star_max):
+        raise ValueError(f"R_star={R_star} outside allowed range "
+                        f"[{R_star_min}, {R_star_max}] solar radii")
+
+    if not (M_star_min <= M_star <= M_star_max):
+        raise ValueError(f"M_star={M_star} outside allowed range "
+                        f"[{M_star_min}, {M_star_max}] solar masses")
+
+
+def estimate_n_evaluations(periods, durations, t0_oversampling=5):
+    """
+    Estimate total number of chi-squared evaluations.
+
+    Parameters
+    ----------
+    periods : array_like
+        Trial periods
+    durations : list of array_like
+        Duration grids for each period
+    t0_oversampling : int
+        T0 grid oversampling factor
+
+    Returns
+    -------
+    n_total : int
+        Total number of evaluations (P × D × T0)
+    """
+    n_total = 0
+    for i, period in enumerate(periods):
+        n_durations = len(durations[i])
+        for duration in durations[i]:
+            t0_vals = t0_grid(period, duration, oversampling=t0_oversampling)
+            n_total += len(t0_vals)
+
+    return n_total
diff --git a/cuvarbase/tls_models.py b/cuvarbase/tls_models.py
new file mode 100644
index 0000000..79f6d2b
--- /dev/null
+++ b/cuvarbase/tls_models.py
@@ -0,0 +1,476 @@
+"""
+Transit model generation for TLS.
+
+This module handles creation of physically realistic transit light curves
+using the Batman package for limb-darkened transits.
+
+References
+----------
+.. [1] Kreidberg (2015), "batman: BAsic Transit Model cAlculatioN in Python",
+       PASP 127, 1161
+.. [2] Mandel & Agol (2002), "Analytic Light Curves for Planetary Transit
+       Searches", ApJ 580, L171
+"""
+
+import numpy as np
+try:
+    import batman
+    BATMAN_AVAILABLE = True
+except ImportError:
+    BATMAN_AVAILABLE = False
+    import warnings
+    warnings.warn("batman package not available. Install with: pip install batman-package")
+
+
+def create_reference_transit(n_samples=1000, limb_dark='quadratic',
+                             u=[0.4804, 0.1867]):
+    """
+    Create a reference transit model normalized to Earth-like transit.
+
+    This generates a high-resolution transit template that can be scaled
+    and interpolated for different durations and depths.
+
+    Parameters
+    ----------
+    n_samples : int, optional
+        Number of samples in the model (default: 1000)
+    limb_dark : str, optional
+        Limb darkening law (default: 'quadratic')
+        Options: 'uniform', 'linear', 'quadratic', 'nonlinear'
+    u : list, optional
+        Limb darkening coefficients (default: [0.4804, 0.1867])
+        Default values are for Sun-like star in Kepler bandpass
+
+    Returns
+    -------
+    phases : ndarray
+        Phase values (0 to 1)
+    flux : ndarray
+        Normalized flux (1.0 = out of transit, <1.0 = in transit)
+
+    Notes
+    -----
+    The reference model assumes:
+    - Period = 1.0 (arbitrary units, we work in phase)
+    - Semi-major axis = 1.0 (normalized)
+    - Planet-to-star radius ratio scaled to produce unit depth
+    """
+    if not BATMAN_AVAILABLE:
+        raise ImportError("batman package required for transit models. "
+                         "Install with: pip install batman-package")
+
+    # Batman parameters for reference transit
+    params = batman.TransitParams()
+
+    # Fixed parameters (Earth-like)
+    params.t0 = 0.0                   # Mid-transit time
+    params.per = 1.0                  # Period (arbitrary, we use phase)
+    params.rp = 0.1                   # Planet-to-star radius ratio (will normalize)
+    params.a = 15.0                   # Semi-major axis in stellar radii (typical)
+    params.inc = 90.0                 # Inclination (degrees) - edge-on
+    params.ecc = 0.0                  # Eccentricity - circular
+    params.w = 90.0                   # Longitude of periastron
+    params.limb_dark = limb_dark      # Limb darkening model
+    params.u = u                      # Limb darkening coefficients
+
+    # Create time array spanning the transit
+    # For a = 15, duration is approximately 0.05 in phase units
+    # We'll create a grid from -0.1 to 0.1 (well beyond transit)
+    t = np.linspace(-0.15, 0.15, n_samples)
+
+    # Generate model
+    m = batman.TransitModel(params, t)
+    flux = m.light_curve(params)
+
+    # Normalize: shift so out-of-transit = 1.0, in-transit depth = 1.0 at center
+    flux_oot = flux[0]  # Out of transit flux
+    depth = flux_oot - np.min(flux)  # Transit depth
+
+    if depth < 1e-10:
+        raise ValueError("Transit depth too small - check parameters")
+
+    flux_normalized = (flux - flux_oot) / depth + 1.0
+
+    # Convert time to phase (0 to 1)
+    phases = (t - t[0]) / (t[-1] - t[0])
+
+    return phases, flux_normalized
+
+
+def create_transit_model_cache(durations, period=1.0, n_samples=1000,
+                               limb_dark='quadratic', u=[0.4804, 0.1867],
+                               R_star=1.0, M_star=1.0):
+    """
+    Create cache of transit models for different durations.
+
+    Parameters
+    ----------
+    durations : array_like
+        Array of transit durations (days) to cache
+    period : float, optional
+        Reference period (days) - used for scaling (default: 1.0)
+    n_samples : int, optional
+        Number of samples per model (default: 1000)
+    limb_dark : str, optional
+        Limb darkening law (default: 'quadratic')
+    u : list, optional
+        Limb darkening coefficients (default: [0.4804, 0.1867])
+    R_star : float, optional
+        Stellar radius in solar radii (default: 1.0)
+    M_star : float, optional
+        Stellar mass in solar masses (default: 1.0)
+
+    Returns
+    -------
+    models : list of ndarray
+        List of flux arrays for each duration
+    phases : ndarray
+        Phase array (same for all models)
+
+    Notes
+    -----
+    This creates models at different durations by adjusting the semi-major
+    axis in the batman model to produce the desired transit duration.
+    """
+    if not BATMAN_AVAILABLE:
+        raise ImportError("batman package required for transit models")
+
+    durations = np.asarray(durations)
+    models = []
+
+    for duration in durations:
+        # Create batman parameters
+        params = batman.TransitParams()
+        params.t0 = 0.0
+        params.per = period
+        params.rp = 0.1  # Will be scaled later
+        params.inc = 90.0
+        params.ecc = 0.0
+        params.w = 90.0
+        params.limb_dark = limb_dark
+        params.u = u
+
+        # Calculate semi-major axis to produce desired duration
+        # T_14 ≈ (P/π) * arcsin(R_star/a) for edge-on transit
+        # Approximation: a ≈ R_star * P / (π * duration)
+        a = R_star * period / (np.pi * duration)
+        params.a = max(a, 1.5)  # Ensure a > R_star + R_planet
+
+        # Create time array
+        t = np.linspace(-0.15, 0.15, n_samples)
+
+        # Generate model
+        m = batman.TransitModel(params, t)
+        flux = m.light_curve(params)
+
+        # Normalize
+        flux_oot = flux[0]
+        depth = flux_oot - np.min(flux)
+
+        if depth < 1e-10:
+            # If depth is too small, use reference model
+            phases, flux_normalized = create_reference_transit(
+                n_samples, limb_dark, u)
+        else:
+            flux_normalized = (flux - flux_oot) / depth + 1.0
+            phases = (t - t[0]) / (t[-1] - t[0])
+
+        models.append(flux_normalized.astype(np.float32))
+
+    return models, phases.astype(np.float32)
+
+
+def simple_trapezoid_transit(phases, duration_phase, depth=1.0,
+                             ingress_duration=0.1):
+    """
+    Create a simple trapezoidal transit model (fast, no Batman needed).
+
+    This is a simplified model for testing or when Batman is not available.
+
+    Parameters
+    ----------
+    phases : array_like
+        Phase values (0 to 1)
+    duration_phase : float
+        Total transit duration in phase units
+    depth : float, optional
+        Transit depth (default: 1.0)
+    ingress_duration : float, optional
+        Ingress/egress duration as fraction of total duration (default: 0.1)
+
+    Returns
+    -------
+    flux : ndarray
+        Flux values (1.0 = out of transit)
+
+    Notes
+    -----
+    This creates a trapezoid with linear ingress/egress. It's much faster
+    than Batman but less physically accurate (no limb darkening).
+    """
+    phases = np.asarray(phases)
+    flux = np.ones_like(phases, dtype=np.float32)
+
+    # Calculate ingress/egress duration
+    t_ingress = duration_phase * ingress_duration
+    t_flat = duration_phase * (1.0 - 2.0 * ingress_duration)
+
+    # Transit centered at phase = 0.5
+    t1 = 0.5 - duration_phase / 2.0  # Start of ingress
+    t2 = t1 + t_ingress               # Start of flat bottom
+    t3 = t2 + t_flat                  # Start of egress
+    t4 = t3 + t_ingress               # End of transit
+
+    # Ingress
+    mask_ingress = (phases >= t1) & (phases < t2)
+    flux[mask_ingress] = 1.0 - depth * (phases[mask_ingress] - t1) / t_ingress
+
+    # Flat bottom
+    mask_flat = (phases >= t2) & (phases < t3)
+    flux[mask_flat] = 1.0 - depth
+
+    # Egress
+    mask_egress = (phases >= t3) & (phases < t4)
+    flux[mask_egress] = 1.0 - depth * (t4 - phases[mask_egress]) / t_ingress
+
+    return flux
+
+
+def interpolate_transit_model(model_phases, model_flux, target_phases,
+                              target_depth=1.0):
+    """
+    Interpolate a transit model to new phase grid and scale depth.
+
+    Parameters
+    ----------
+    model_phases : array_like
+        Phase values of the template model
+    model_flux : array_like
+        Flux values of the template model
+    target_phases : array_like
+        Desired phase values for interpolation
+    target_depth : float, optional
+        Desired transit depth (default: 1.0)
+
+    Returns
+    -------
+    flux : ndarray
+        Interpolated and scaled flux values
+
+    Notes
+    -----
+    Uses linear interpolation. For GPU implementation, texture memory
+    with hardware interpolation would be faster.
+    """
+    # Interpolate to target phases
+    flux_interp = np.interp(target_phases, model_phases, model_flux)
+
+    # Scale depth: current depth is (1.0 - min(model_flux))
+    current_depth = 1.0 - np.min(model_flux)
+
+    if current_depth < 1e-10:
+        return flux_interp
+
+    # Scale: flux = 1 - target_depth * (1 - flux_normalized)
+    flux_scaled = 1.0 - target_depth * (1.0 - flux_interp)
+
+    return flux_scaled.astype(np.float32)
+
+
+def generate_transit_template(n_template=1000, limb_dark='quadratic',
+                              u=[0.4804, 0.1867]):
+    """
+    Generate a 1D transit template for use in the GPU TLS kernel.
+
+    The template maps transit_coord in [-1, 1] (edge-to-edge of transit)
+    to a normalized depth value in [0, 1] where 0 = no dimming (edges)
+    and 1 = maximum dimming (center, with limb darkening).
+
+    Parameters
+    ----------
+    n_template : int, optional
+        Number of points in the template (default: 1000)
+    limb_dark : str, optional
+        Limb darkening law (default: 'quadratic')
+    u : list, optional
+        Limb darkening coefficients (default: [0.4804, 0.1867])
+
+    Returns
+    -------
+    template : ndarray
+        Float32 array of shape (n_template,) with values in [0, 1].
+        Index 0 corresponds to transit_coord = -1 (leading edge),
+        index n_template-1 corresponds to transit_coord = +1 (trailing edge).
+    """
+    transit_coords = np.linspace(-1.0, 1.0, n_template)
+
+    if BATMAN_AVAILABLE:
+        try:
+            # Generate a batman transit model
+            phases, flux = create_reference_transit(
+                n_samples=5000, limb_dark=limb_dark, u=u
+            )
+
+            # Find the in-transit region (where flux < 1.0 - small threshold)
+            threshold = 1e-6
+            in_transit = flux < (1.0 - threshold)
+
+            if not np.any(in_transit):
+                # Fallback to trapezoid if no transit detected
+                return _trapezoid_template(n_template)
+
+            # Get the in-transit indices
+            transit_indices = np.where(in_transit)[0]
+            i_start = transit_indices[0]
+            i_end = transit_indices[-1]
+
+            # Extract in-transit portion
+            transit_phases = phases[i_start:i_end + 1]
+            transit_flux = flux[i_start:i_end + 1]
+
+            # Map transit phases to transit_coord [-1, 1]
+            phase_center = 0.5 * (transit_phases[0] + transit_phases[-1])
+            phase_half_width = 0.5 * (transit_phases[-1] - transit_phases[0])
+
+            if phase_half_width < 1e-10:
+                return _trapezoid_template(n_template)
+
+            source_coords = (transit_phases - phase_center) / phase_half_width
+
+            # Depth values: 0 = no dimming, 1 = max dimming
+            depth_values = 1.0 - transit_flux
+
+            # Normalize so max = 1
+            max_depth = np.max(depth_values)
+            if max_depth < 1e-10:
+                return _trapezoid_template(n_template)
+            depth_values /= max_depth
+
+            # Resample to uniform transit_coord grid
+            template = np.interp(transit_coords, source_coords, depth_values,
+                                 left=0.0, right=0.0)
+
+            return template.astype(np.float32)
+
+        except Exception:
+            return _trapezoid_template(n_template)
+    else:
+        return _trapezoid_template(n_template)
+
+
+def _trapezoid_template(n_template=1000, ingress_fraction=0.1):
+    """
+    Generate a trapezoidal transit template as fallback.
+
+    Parameters
+    ----------
+    n_template : int
+        Number of template points
+    ingress_fraction : float
+        Fraction of transit that is ingress/egress (each side)
+
+    Returns
+    -------
+    template : ndarray
+        Float32 array of shape (n_template,) with values in [0, 1].
+    """
+    transit_coords = np.linspace(-1.0, 1.0, n_template)
+    template = np.zeros(n_template, dtype=np.float32)
+
+    # Trapezoidal shape: ramp up during ingress, flat bottom, ramp down during egress
+    edge_inner = 1.0 - 2.0 * ingress_fraction  # Where flat bottom starts/ends
+
+    for i in range(n_template):
+        coord = abs(transit_coords[i])
+        if coord <= edge_inner:
+            template[i] = 1.0  # Flat bottom (max depth)
+        elif coord <= 1.0:
+            # Linear ramp from 1 to 0 during ingress/egress
+            template[i] = (1.0 - coord) / (1.0 - edge_inner)
+        else:
+            template[i] = 0.0
+
+    return template
+
+
+def get_default_limb_darkening(filter='Kepler', T_eff=5500):
+    """
+    Get default limb darkening coefficients for common filters and T_eff.
+
+    Parameters
+    ----------
+    filter : str, optional
+        Filter name: 'Kepler', 'TESS', 'Johnson_V', etc. (default: 'Kepler')
+    T_eff : float, optional
+        Effective temperature (K) (default: 5500)
+
+    Returns
+    -------
+    u : list
+        Quadratic limb darkening coefficients [u1, u2]
+
+    Notes
+    -----
+    These are approximate values. For precise work, calculate coefficients
+    for your specific stellar parameters using packages like ldtk.
+
+    Values from Claret & Bloemen (2011), A&A 529, A75
+    """
+    # Simple lookup table for common cases
+    # Format: {filter: {T_eff_range: [u1, u2]}}
+
+    if filter == 'Kepler':
+        if T_eff < 4500:
+            return [0.7, 0.1]  # Cool stars
+        elif T_eff < 6000:
+            return [0.4804, 0.1867]  # Solar-type
+        else:
+            return [0.3, 0.2]  # Hot stars
+
+    elif filter == 'TESS':
+        if T_eff < 4500:
+            return [0.5, 0.2]
+        elif T_eff < 6000:
+            return [0.3, 0.3]
+        else:
+            return [0.2, 0.3]
+
+    else:
+        # Default to Solar-type in Kepler
+        return [0.4804, 0.1867]
+
+
+def validate_limb_darkening_coeffs(u, limb_dark='quadratic'):
+    """
+    Validate limb darkening coefficients are physically reasonable.
+
+    Parameters
+    ----------
+    u : list
+        Limb darkening coefficients
+    limb_dark : str
+        Limb darkening law
+
+    Raises
+    ------
+    ValueError
+        If coefficients are unphysical
+    """
+    u = np.asarray(u)
+
+    if limb_dark == 'quadratic':
+        if len(u) != 2:
+            raise ValueError("Quadratic limb darkening requires 2 coefficients")
+        # Physical constraints: 0 < u1 + u2 < 1, u1 > 0, u1 + 2*u2 > 0
+        if not (0 < u[0] + u[1] < 1):
+            raise ValueError(f"u1 + u2 = {u[0] + u[1]} must be in (0, 1)")
+        if not (u[0] > 0):
+            raise ValueError(f"u1 = {u[0]} must be > 0")
+        if not (u[0] + 2*u[1] > 0):
+            raise ValueError(f"u1 + 2*u2 = {u[0] + 2*u[1]} must be > 0")
+
+    elif limb_dark == 'linear':
+        if len(u) != 1:
+            raise ValueError("Linear limb darkening requires 1 coefficient")
+        if not (0 < u[0] < 1):
+            raise ValueError(f"u = {u[0]} must be in (0, 1)")
diff --git a/cuvarbase/tls_stats.py b/cuvarbase/tls_stats.py
new file mode 100644
index 0000000..b3d9fe6
--- /dev/null
+++ b/cuvarbase/tls_stats.py
@@ -0,0 +1,448 @@
+"""
+Statistical calculations for Transit Least Squares.
+
+Implements Signal Detection Efficiency (SDE), Signal-to-Noise Ratio (SNR),
+False Alarm Probability (FAP), and related metrics.
+
+References
+----------
+.. [1] Hippke & Heller (2019), A&A 623, A39
+.. [2] Kovács et al. (2002), A&A 391, 369
+"""
+
+import numpy as np
+from scipy import signal, stats
+
+
+def signal_residue(chi2, chi2_null=None):
+    """
+    Calculate Signal Residue (SR).
+
+    SR = 1 - chi²_signal / chi²_null, where higher = stronger signal.
+
+    Parameters
+    ----------
+    chi2 : array_like
+        Chi-squared values at each period
+    chi2_null : float, optional
+        Null hypothesis chi-squared (constant model)
+        If None, uses maximum chi2 value
+
+    Returns
+    -------
+    SR : ndarray
+        Signal residue values. 0 = no signal, higher = stronger.
+
+    Notes
+    -----
+    Higher SR values indicate stronger signals.
+    SR ~ 0 means chi² is close to the null model.
+    """
+    chi2 = np.asarray(chi2)
+
+    if chi2_null is None:
+        chi2_null = np.max(chi2)
+
+    SR = 1.0 - chi2 / (chi2_null + 1e-10)
+
+    return SR
+
+
+def signal_detection_efficiency(chi2, chi2_null=None, detrend=True,
+                                window_length=None):
+    """
+    Calculate Signal Detection Efficiency (SDE).
+
+    SDE measures how many standard deviations above the noise
+    the signal is. Higher SDE = more significant detection.
+
+    Parameters
+    ----------
+    chi2 : array_like
+        Chi-squared values at each period
+    chi2_null : float, optional
+        Null hypothesis chi-squared
+    detrend : bool, optional
+        Apply median filter detrending (default: True)
+    window_length : int, optional
+        Window length for median filter (default: len(chi2)//10)
+
+    Returns
+    -------
+    SDE : float
+        Signal detection efficiency (z-score)
+    SDE_raw : float
+        Raw SDE before detrending
+    power : ndarray
+        Detrended power spectrum (if detrend=True)
+
+    Notes
+    -----
+    SDE is essentially a z-score:
+    SDE = (max(SR) - mean(SR)) / std(SR)
+
+    Typical threshold: SDE > 7 for 1% false alarm probability
+    """
+    chi2 = np.asarray(chi2)
+
+    # Calculate signal residue
+    SR = signal_residue(chi2, chi2_null)
+
+    # Raw SDE (before detrending)
+    mean_SR = np.mean(SR)
+    std_SR = np.std(SR)
+
+    if std_SR < 1e-10:
+        SDE_raw = 0.0
+    else:
+        SDE_raw = (np.max(SR) - mean_SR) / std_SR
+
+    # Detrend with median filter if requested
+    if detrend:
+        if window_length is None:
+            window_length = max(len(SR) // 10, 3)
+            # Ensure odd window
+            if window_length % 2 == 0:
+                window_length += 1
+
+        # Apply median filter to remove trends
+        SR_trend = signal.medfilt(SR, kernel_size=window_length)
+
+        # Detrended signal residue
+        SR_detrended = SR - SR_trend + np.median(SR)
+
+        # Calculate SDE on detrended signal
+        mean_SR_detrended = np.mean(SR_detrended)
+        std_SR_detrended = np.std(SR_detrended)
+
+        if std_SR_detrended < 1e-10:
+            SDE = 0.0
+        else:
+            SDE = (np.max(SR_detrended) - mean_SR_detrended) / std_SR_detrended
+
+        power = SR_detrended
+    else:
+        SDE = SDE_raw
+        power = SR
+
+    return SDE, SDE_raw, power
+
+
+def signal_to_noise(depth, depth_err=None, n_transits=1,
+                    chi2_null=None, chi2_best=None):
+    """
+    Calculate signal-to-noise ratio.
+
+    Parameters
+    ----------
+    depth : float
+        Transit depth
+    depth_err : float, optional
+        Uncertainty in depth. If None, estimated from chi2 values or
+        Poisson statistics as a last resort.
+    n_transits : int, optional
+        Number of transits (default: 1)
+    chi2_null : float, optional
+        Null hypothesis chi-squared (no transit). Used to estimate
+        depth_err when depth_err is not provided.
+    chi2_best : float, optional
+        Best-fit chi-squared. Used with chi2_null to estimate depth_err.
+
+    Returns
+    -------
+    snr : float
+        Signal-to-noise ratio
+
+    Notes
+    -----
+    SNR improves as sqrt(n_transits) for independent transits.
+
+    When depth_err is not provided, it is estimated as:
+    depth / sqrt(chi2_null - chi2_best) if chi2 values are given,
+    otherwise returns 0.
+    """
+    if depth_err is None:
+        if chi2_null is not None and chi2_best is not None:
+            delta_chi2 = chi2_null - chi2_best
+            if delta_chi2 > 0:
+                depth_err = depth / np.sqrt(delta_chi2)
+            else:
+                return 0.0
+        else:
+            return 0.0
+
+    if depth_err < 1e-10:
+        return 0.0
+
+    snr = depth / depth_err * np.sqrt(n_transits)
+
+    return snr
+
+
+def false_alarm_probability(SDE, method='empirical'):
+    """
+    Estimate False Alarm Probability from SDE.
+
+    Parameters
+    ----------
+    SDE : float
+        Signal Detection Efficiency
+    method : str, optional
+        Method for FAP estimation (default: 'empirical')
+        - 'empirical': From Hippke & Heller calibration
+        - 'gaussian': Assuming Gaussian noise
+
+    Returns
+    -------
+    FAP : float
+        False Alarm Probability
+
+    Notes
+    -----
+    Empirical calibration from Hippke & Heller (2019):
+    - SDE = 7 -> FAP ~ 1%
+    - SDE = 9 -> FAP ~ 0.1%
+    - SDE = 11 -> FAP ~ 0.01%
+
+    These values are approximate. For rigorous FAP estimation,
+    injection-recovery simulations are recommended.
+    """
+    if method == 'gaussian':
+        # Gaussian approximation: FAP = 1 - erf(SDE/sqrt(2))
+        FAP = 1.0 - stats.norm.cdf(SDE)
+    else:
+        # Empirical calibration from Hippke & Heller (2019)
+        # Rough approximation based on their Figure 5
+        if SDE < 5:
+            FAP = 1.0  # Very high FAP
+        elif SDE < 7:
+            FAP = 10 ** (-0.5 * (SDE - 5))  # ~10% at SDE=5, ~1% at SDE=7
+        else:
+            FAP = 10 ** (-(SDE - 5))  # Exponential decrease
+
+        # Clip to reasonable range
+        FAP = np.clip(FAP, 1e-10, 1.0)
+
+    return FAP
+
+
+def odd_even_mismatch(depths_odd, depths_even):
+    """
+    Calculate odd-even transit depth mismatch.
+
+    This tests whether odd and even transits have significantly
+    different depths, which could indicate:
+    - Binary system
+    - Non-planetary signal
+    - Instrumental effects
+
+    Parameters
+    ----------
+    depths_odd : array_like
+        Depths of odd-numbered transits
+    depths_even : array_like
+        Depths of even-numbered transits
+
+    Returns
+    -------
+    mismatch : float
+        Significance of mismatch (z-score)
+    depth_diff : float
+        Difference between mean depths
+
+    Notes
+    -----
+    High mismatch (>3σ) suggests the signal may not be planetary.
+    """
+    depths_odd = np.asarray(depths_odd)
+    depths_even = np.asarray(depths_even)
+
+    mean_odd = np.mean(depths_odd)
+    mean_even = np.mean(depths_even)
+
+    std_odd = np.std(depths_odd) / np.sqrt(len(depths_odd))
+    std_even = np.std(depths_even) / np.sqrt(len(depths_even))
+
+    depth_diff = mean_odd - mean_even
+    combined_std = np.sqrt(std_odd**2 + std_even**2)
+
+    if combined_std < 1e-10:
+        return 0.0, 0.0
+
+    mismatch = np.abs(depth_diff) / combined_std
+
+    return mismatch, depth_diff
+
+
+def compute_all_statistics(chi2, periods, best_period_idx,
+                           depth, duration, n_transits,
+                           depths_per_transit=None):
+    """
+    Compute all TLS statistics for a search result.
+
+    Parameters
+    ----------
+    chi2 : array_like
+        Chi-squared values at each period
+    periods : array_like
+        Trial periods
+    best_period_idx : int
+        Index of best period
+    depth : float
+        Best-fit transit depth
+    duration : float
+        Best-fit transit duration
+    n_transits : int
+        Number of transits at best period
+    depths_per_transit : array_like, optional
+        Individual transit depths
+
+    Returns
+    -------
+    stats : dict
+        Dictionary with all statistics:
+        - SDE: Signal Detection Efficiency
+        - SDE_raw: Raw SDE before detrending
+        - SNR: Signal-to-noise ratio
+        - FAP: False Alarm Probability
+        - power: Detrended power spectrum
+        - SR: Signal residue
+        - odd_even_mismatch: Odd/even depth difference (if available)
+    """
+    # Signal residue and SDE
+    SDE, SDE_raw, power = signal_detection_efficiency(chi2, detrend=True)
+
+    SR = signal_residue(chi2)
+
+    # SNR (use chi2 values for depth_err estimation)
+    chi2_null = np.max(chi2)
+    chi2_best = chi2[best_period_idx]
+    SNR = signal_to_noise(depth, n_transits=n_transits,
+                          chi2_null=chi2_null, chi2_best=chi2_best)
+
+    # FAP
+    FAP = false_alarm_probability(SDE)
+
+    # Compile statistics
+    stats = {
+        'SDE': SDE,
+        'SDE_raw': SDE_raw,
+        'SNR': SNR,
+        'FAP': FAP,
+        'power': power,
+        'SR': SR,
+        'best_period': periods[best_period_idx],
+        'best_chi2': chi2[best_period_idx],
+    }
+
+    # Odd-even mismatch if per-transit depths available
+    if depths_per_transit is not None and len(depths_per_transit) > 2:
+        depths = np.asarray(depths_per_transit)
+        n = len(depths)
+
+        if n >= 4:  # Need at least 2 odd and 2 even
+            depths_odd = depths[::2]
+            depths_even = depths[1::2]
+
+            mismatch, diff = odd_even_mismatch(depths_odd, depths_even)
+            stats['odd_even_mismatch'] = mismatch
+            stats['odd_even_depth_diff'] = diff
+        else:
+            stats['odd_even_mismatch'] = 0.0
+            stats['odd_even_depth_diff'] = 0.0
+
+    return stats
+
+
+def compute_period_uncertainty(periods, chi2, best_idx, threshold=1.0):
+    """
+    Estimate period uncertainty using FWHM approach.
+
+    Parameters
+    ----------
+    periods : array_like
+        Trial periods
+    chi2 : array_like
+        Chi-squared values
+    best_idx : int
+        Index of minimum chi²
+    threshold : float, optional
+        Chi² increase threshold for FWHM (default: 1.0)
+
+    Returns
+    -------
+    uncertainty : float
+        Period uncertainty (half-width at threshold)
+
+    Notes
+    -----
+    Finds the width of the chi² minimum at threshold above minimum.
+    Default threshold=1 corresponds to 1σ for Gaussian errors.
+    """
+    periods = np.asarray(periods)
+    chi2 = np.asarray(chi2)
+
+    chi2_min = chi2[best_idx]
+    chi2_thresh = chi2_min + threshold
+
+    # Find points below threshold
+    below = chi2 < chi2_thresh
+
+    if not np.any(below):
+        # If no points below threshold, use grid spacing
+        if len(periods) > 1:
+            return np.abs(periods[1] - periods[0])
+        else:
+            return 0.1 * periods[best_idx]
+
+    # Find continuous region around best_idx
+    # Walk left from best_idx
+    left_idx = best_idx
+    while left_idx > 0 and below[left_idx]:
+        left_idx -= 1
+
+    # Walk right from best_idx
+    right_idx = best_idx
+    while right_idx < len(periods) - 1 and below[right_idx]:
+        right_idx += 1
+
+    # Uncertainty is half the width
+    width = periods[right_idx] - periods[left_idx]
+    uncertainty = width / 2.0
+
+    return uncertainty
+
+
+def pink_noise_correction(snr, n_transits, correlation_length=1):
+    """
+    Correct SNR for correlated (pink) noise.
+
+    Parameters
+    ----------
+    snr : float
+        White noise SNR
+    n_transits : int
+        Number of transits
+    correlation_length : float, optional
+        Correlation length in transit durations (default: 1)
+
+    Returns
+    -------
+    snr_pink : float
+        Pink noise corrected SNR
+
+    Notes
+    -----
+    Pink noise (correlated noise) reduces effective SNR because
+    neighboring points are not independent.
+
+    Correction factor ≈ sqrt(correlation_length / n_points_per_transit)
+    """
+    if correlation_length <= 0:
+        return snr
+
+    # Approximate correction
+    correction = np.sqrt(correlation_length)
+    snr_pink = snr / correction
+
+    return snr_pink
diff --git a/docs/RUNPOD_DEVELOPMENT.md b/docs/RUNPOD_DEVELOPMENT.md
index 116d09d..209fee3 100644
--- a/docs/RUNPOD_DEVELOPMENT.md
+++ b/docs/RUNPOD_DEVELOPMENT.md
@@ -178,6 +178,89 @@ nvcc --version
 
 Most RunPod templates include CUDA by default.
 
+**Common Issue**: `nvcc` not in PATH. Add CUDA to PATH before running:
+
+```bash
+export PATH=/usr/local/cuda/bin:$PATH
+```
+
+Or add to your `~/.bashrc` on RunPod for persistence.
+
+### scikit-cuda + numpy 2.x Compatibility
+
+If you encounter `AttributeError: module 'numpy' has no attribute 'typeDict'`:
+
+This is a known issue with scikit-cuda 0.5.3 and numpy 2.x. The `setup-remote.sh` script attempts to patch this automatically. If the patch fails, you can manually fix it:
+
+```bash
+ssh -p ${RUNPOD_SSH_PORT} ${RUNPOD_SSH_USER}@${RUNPOD_SSH_HOST}
+python3 << 'PYEOF'
+# Read the file
+with open('/usr/local/lib/python3.12/dist-packages/skcuda/misc.py', 'r') as f:
+    lines = f.readlines()
+
+# Find and replace the problematic section
+new_lines = []
+i = 0
+while i < len(lines):
+    if 'num_types = [np.sctypeDict[t] for t in' in lines[i] or 'num_types = [np.typeDict[t] for t in' in lines[i]:
+        new_lines.append('# Fixed for numpy 2.x compatibility\n')
+        new_lines.append('num_types = []\n')
+        new_lines.append('for t in np.typecodes["AllInteger"]+np.typecodes["AllFloat"]:\n')
+        new_lines.append('    try:\n')
+        new_lines.append('        num_types.append(np.dtype(t).type)\n')
+        new_lines.append('    except (KeyError, TypeError):\n')
+        new_lines.append('        pass\n')
+        if i+1 < len(lines) and 'np.typecodes' in lines[i+1]:
+            i += 1
+        i += 1
+    else:
+        new_lines.append(lines[i])
+        i += 1
+
+with open('/usr/local/lib/python3.12/dist-packages/skcuda/misc.py', 'w') as f:
+    f.writelines(new_lines)
+
+print('✓ Fixed skcuda/misc.py')
+PYEOF
+```
+
+### CUDA Initialization Errors
+
+If you see `pycuda._driver.LogicError: cuInit failed: initialization error`:
+
+**Symptoms:**
+- `nvidia-smi` shows GPU is available
+- PyCUDA/PyTorch cannot initialize CUDA
+- `/dev/nvidia0` missing or `/dev/nvidia1` present instead
+
+**Solution:**
+1. **Restart the RunPod instance** from the RunPod dashboard
+2. If restart doesn't help, **terminate and launch a new pod**
+3. Verify GPU access after restart:
+   ```bash
+   python3 -c 'import pycuda.driver as cuda; cuda.init(); print(f"GPUs: {cuda.Device.count()}")'
+   ```
+
+This is typically a GPU passthrough issue in the container that requires pod restart.
+
+### TLS GPU Testing
+
+To test the TLS GPU implementation:
+
+```bash
+# Quick test (bypasses import issues)
+./scripts/run-remote.sh "export PATH=/usr/local/cuda/bin:\$PATH && python3 test_tls_gpu.py"
+
+# Full example
+./scripts/run-remote.sh "export PATH=/usr/local/cuda/bin:\$PATH && python3 examples/tls_example.py"
+
+# Run pytest tests
+./scripts/test-remote.sh cuvarbase/tests/test_tls_basic.py -v
+```
+
+**Note**: The TLS implementation uses PyCUDA directly and does not depend on skcuda, so TLS tests can run even if skcuda has import issues.
+
 ## Security Notes
 
 - `.runpod.env` is gitignored to protect your credentials
diff --git a/docs/TLS_GPU_IMPLEMENTATION_PLAN.md b/docs/TLS_GPU_IMPLEMENTATION_PLAN.md
new file mode 100644
index 0000000..091667f
--- /dev/null
+++ b/docs/TLS_GPU_IMPLEMENTATION_PLAN.md
@@ -0,0 +1,1070 @@
+# GPU-Accelerated Transit Least Squares (TLS) Implementation Plan
+
+**Branch:** `tls-gpu-implementation`
+**Target:** Fastest TLS implementation with GPU acceleration
+**Reference:** https://github.com/hippke/tls (canonical CPU implementation)
+
+---
+
+## Executive Summary
+
+This document outlines the implementation plan for a GPU-accelerated Transit Least Squares (TLS) algorithm in cuvarbase. TLS is a more sophisticated transit detection method than Box Least Squares (BLS) that uses physically realistic transit models with limb darkening, achieving ~93% recovery rate vs BLS's ~76%.
+
+**Performance Target:** <1 second per light curve (vs ~10 seconds for CPU TLS)
+**Expected Speedup:** 10-100x over CPU implementation
+
+---
+
+## 1. Background: What is TLS?
+
+### 1.1 Core Concept
+
+Transit Least Squares detects periodic planetary transits using a chi-squared minimization approach with physically realistic transit models. Unlike BLS which uses simple box functions, TLS models:
+
+- **Limb darkening** (quadratic law via Batman library)
+- **Ingress/egress** (gradual dimming as planet enters/exits stellar disk)
+- **Full unbinned data** (no phase-binning approximations)
+
+### 1.2 Mathematical Formulation
+
+**Chi-squared test statistic:**
+```
+χ²(P, t₀, d) = Σᵢ (yᵢᵐ(P, t₀, d) - yᵢᵒ)² / σᵢ²
+```
+
+**Signal Residue (detection metric):**
+```
+SR(P) = χ²ₘᵢₙ,ₘₚₗₒᵦ / χ²ₘᵢₙ(P)
+```
+Normalized to [0,1], with 1 = strongest signal.
+
+**Signal Detection Efficiency (SDE):**
+```
+SDE(P) = (1 - ⟨SR(P)⟩) / σ(SR(P))
+```
+Z-score measuring signal strength above noise.
+
+### 1.3 Key Differences vs BLS
+
+| Feature | TLS | BLS |
+|---------|-----|-----|
+| Transit shape | Trapezoidal with limb darkening | Rectangular box |
+| Data handling | Unbinned phase-folded | Binned phase-folded |
+| Detection efficiency | 93% recovery | 76% recovery |
+| Physical realism | Models stellar physics | Simplified |
+| Small planet detection | Optimized (~10% better) | Standard |
+| Computational cost | ~10s per K2 LC (CPU) | ~10s per K2 LC |
+
+### 1.4 Algorithm Structure
+
+```
+For each trial period P:
+    1. Phase fold time series
+    2. Sort by phase
+    3. Patch arrays (handle edge wrapping)
+
+    For each duration d:
+        4. Get/cache transit model for duration d
+        5. Calculate out-of-transit residuals (cached)
+
+        For each trial T0 position:
+            6. Calculate in-transit residuals
+            7. Scale transit depth optimally
+            8. Compute chi-squared
+            9. Track minimum chi-squared
+```
+
+**Complexity:** O(P × D × N × W)
+- P = trial periods (~8,500)
+- D = durations per period (varies)
+- N = data points (~4,320)
+- W = transit width in samples
+
+**Total evaluations:** ~3×10⁸ per typical K2 light curve
+
+---
+
+## 2. Analysis of Existing BLS GPU Implementation
+
+### 2.1 Architecture Overview
+
+The existing cuvarbase BLS implementation provides an excellent foundation:
+
+**File Structure:**
+- `cuvarbase/bls.py` - Python API and memory management
+- `cuvarbase/kernels/bls.cu` - Standard CUDA kernel
+- `cuvarbase/kernels/bls_optimized.cu` - Optimized kernel with warp shuffles
+
+**Key Features:**
+1. **Dynamic block sizing** - Adapts block size to dataset size (32-256 threads)
+2. **Kernel caching** - LRU cache for compiled kernels (~100 MB max)
+3. **Shared memory histogramming** - Phase-binned data in shared memory
+4. **Parallel reduction** - Tree reduction with warp shuffle optimization
+5. **Adaptive mode** - Automatically selects sparse vs standard BLS
+
+### 2.2 GPU Optimization Techniques Used
+
+**Memory optimizations:**
+- Separate yw/w arrays to avoid bank conflicts
+- Coalesced global memory access
+- Shared memory for frequently accessed data
+
+**Compute optimizations:**
+- Fast math intrinsics (`__float2int_rd` instead of `floorf`)
+- Warp-level shuffle reduction (eliminates 4 `__syncthreads` calls)
+- Prepared function calls for faster kernel launches
+
+**Batching strategy:**
+- Frequency batching to respect GPU timeout limits
+- Stream-based async execution for overlapping compute/transfer
+- Grid-stride loops for handling more frequencies than blocks
+
+### 2.3 Memory Management
+
+**BLSMemory class:**
+- Page-aligned pinned memory for faster CPU-GPU transfers
+- Pre-allocated GPU arrays to avoid repeated allocation
+- Separate data/frequency memory allocation
+
+**Transfer strategy:**
+- Async transfers with CUDA streams
+- Data stays on GPU across multiple kernel launches
+- Results transferred back only when needed
+
+---
+
+## 3. TLS-Specific Challenges
+
+### 3.1 Key Algorithmic Differences
+
+| Aspect | BLS | TLS | Implementation Impact |
+|--------|-----|-----|----------------------|
+| Transit model | Box function | Limb-darkened trapezoid | Need transit model cache on GPU |
+| Model complexity | 1 multiplication | ~10-100 ops per point | Higher compute/memory ratio |
+| Duration sampling | Uniform q values | Logarithmic durations | Different grid generation |
+| Phase binning | Yes (shared memory) | No (unbinned) | Different memory access pattern |
+| Edge effects | Minimal | Requires correction | Need array patching |
+
+### 3.2 Computational Bottlenecks
+
+**From CPU TLS profiling:**
+1. **Phase folding/sorting** (~53% of time)
+   - MergeSort on GPU (use CUB library)
+   - Phase fold fully parallel
+
+2. **Residual calculations** (~47% of time)
+   - Highly parallel across T0 positions
+   - Chi-squared reductions (parallel reduction)
+
+3. **Out-of-transit caching** (critical optimization)
+   - Cumulative sums (parallel scan/prefix sum)
+   - Shared/global memory caching
+
+### 3.3 Transit Model Handling
+
+**Challenge:** TLS uses Batman library for transit models (CPU-only)
+
+**Solution:**
+1. Pre-compute transit models on CPU (Batman)
+2. Create reference transit (Earth-like, normalized)
+3. Cache scaled versions for different durations
+4. Transfer cache to GPU (constant/texture memory)
+5. Interpolate depths during search (fast on GPU)
+
+**Memory requirement:** ~MB scale for typical duration range
+
+---
+
+## 4. GPU Implementation Strategy
+
+### 4.1 Parallelization Hierarchy
+
+**Three levels of parallelism:**
+
+1. **Period-level (coarse-grained)**
+   - Each trial period is independent
+   - Launch 1 block per period
+   - Similar to BLS gridDim.x loop
+
+2. **Duration-level (medium-grained)**
+   - Multiple durations per period
+   - Can parallelize within block
+   - Shared memory for duration-specific data
+
+3. **T0-level (fine-grained)**
+   - Multiple T0 positions per duration
+   - Thread-level parallelism
+   - Ideal for GPU threads
+
+**Grid/block configuration:**
+```
+Grid: (nperiods, 1, 1)
+Block: (block_size, 1, 1)  // 64-256 threads
+
+Each block handles one period:
+  - Threads iterate over durations
+  - Threads iterate over T0 positions
+  - Reduction to find minimum chi-squared
+```
+
+### 4.2 Kernel Design
+
+**Proposed kernel structure:**
+
+```cuda
+__global__ void tls_search_kernel(
+    const float* t,              // Time array
+    const float* y,              // Flux/brightness
+    const float* dy,             // Uncertainties
+    const float* periods,        // Trial periods
+    const float* durations,      // Duration grid (per period)
+    const int* duration_counts,  // # durations per period
+    const float* transit_models, // Pre-computed transit shapes
+    const int* model_indices,    // Index into transit_models
+    float* chi2_min,            // Output: minimum chi²
+    float* best_t0,             // Output: best mid-transit time
+    float* best_duration,       // Output: best duration
+    float* best_depth,          // Output: best depth
+    int ndata,
+    int nperiods
+)
+```
+
+**Key kernel operations:**
+1. Phase fold data for assigned period
+2. Sort by phase (CUB DeviceRadixSort)
+3. Patch arrays (extend with wrapped data)
+4. For each duration:
+   - Load transit model from cache
+   - For each T0 position (stride sampling):
+     - Calculate in-transit residuals
+     - Calculate out-of-transit residuals (cached)
+     - Scale depth optimally
+     - Compute chi-squared
+5. Parallel reduction to find minimum chi²
+6. Store best solution
+
+### 4.3 Memory Layout
+
+**Global memory:**
+- Input data: `t`, `y`, `dy` (float32, ~4-10K points)
+- Period grid: `periods` (float32, ~8K)
+- Duration grids: `durations` (float32, variable per period)
+- Output: `chi2_min`, `best_t0`, `best_duration`, `best_depth`
+
+**Constant/texture memory:**
+- Transit model cache (~1-10 MB)
+- Limb darkening coefficients
+- Stellar parameters
+
+**Shared memory:**
+- Phase-folded data (float32, 4×ndata bytes)
+- Sorted indices (int32, 4×ndata bytes)
+- Partial chi² values (float32, blockDim.x bytes)
+- Out-of-transit residual cache (varies with duration)
+
+**Shared memory requirement:**
+```
+shmem = 8 × ndata + 4 × blockDim.x + cache_size
+      ≈ 35-40 KB for ndata=4K, blockDim=256
+```
+
+### 4.4 Optimization Techniques
+
+**From BLS optimizations:**
+1. Fast math intrinsics (`__float2int_rd`, etc.)
+2. Warp shuffle reduction for final chi² minimum
+3. Coalesced memory access patterns
+4. Separate arrays to avoid bank conflicts
+
+**TLS-specific:**
+1. Texture memory for transit models (fast interpolation)
+2. Parallel scan for cumulative sums (out-of-transit cache)
+3. MergeSort via CUB (better for partially sorted data)
+4. Array patching in kernel (avoid extra memory)
+
+---
+
+## 5. Implementation Phases
+
+### Phase 1: Core Infrastructure - COMPLETED
+
+**Status:** Basic infrastructure implemented
+**Date:** 2025-10-27
+
+**Completed:**
+- ✅ `cuvarbase/tls_grids.py` - Period and duration grid generation
+- ✅ `cuvarbase/tls_models.py` - Transit model generation (Batman wrapper + simple models)
+- ✅ `cuvarbase/tls.py` - Main Python API with TLSMemory class
+- ✅ `cuvarbase/kernels/tls.cu` - Basic CUDA kernel (Phase 1 version)
+- ✅ `cuvarbase/tests/test_tls_basic.py` - Initial unit tests
+
+**Key Learnings:**
+
+1. **Ofir 2014 Period Grid**: The Ofir algorithm can produce edge cases when parameters result in very few frequencies. Added fallback to simple linear grid for robustness.
+
+2. **Memory Layout**: Following BLS pattern with separate TLSMemory class for managing GPU/CPU transfers. Using page-aligned pinned memory for fast transfers.
+
+3. **Kernel Design Choices**:
+   - Phase 1 uses simple bubble sort (thread 0 only) - this limits us to small datasets
+   - Using simple trapezoidal transit model initially (no Batman on GPU)
+   - Fixed duration/T0 grids for Phase 1 simplicity
+   - Shared memory allocation: `(4*ndata + block_size) * 4 bytes`
+
+4. **Testing Strategy**: Created tests that don't require GPU hardware for CI/CD compatibility. GPU tests are marked with `@pytest.mark.skipif`.
+
+**Known Limitations (to be addressed in Phase 2):**
+- Bubble sort limits ndata to ~100-200 points
+- No optimal depth calculation (using fixed depth)
+- Simple trapezoid transit (no limb darkening on GPU yet)
+- No edge effect correction
+- No proper parameter tracking across threads in reduction
+
+**Next Steps:** Proceed to Phase 2 optimization ✅ COMPLETED
+
+---
+
+### Phase 2: Optimization - COMPLETED
+
+**Status:** Core optimizations implemented
+**Date:** 2025-10-27
+
+**Completed:**
+- ✅ `cuvarbase/kernels/tls_optimized.cu` - Optimized CUDA kernel with Thrust
+- ✅ Updated `cuvarbase/tls.py` - Support for multiple kernel variants
+- ✅ Optimal depth calculation using least squares
+- ✅ Warp shuffle reduction for minimum finding
+- ✅ Proper parameter tracking across thread reduction
+- ✅ Optimized shared memory layout (separate arrays, no bank conflicts)
+- ✅ Auto-selection of kernel variant based on dataset size
+
+**Key Improvements:**
+
+1. **Three Kernel Variants**:
+   - **Basic** (Phase 1): Bubble sort, fixed depth - for reference/testing
+   - **Simple**: Insertion sort, optimal depth, no Thrust - for ndata < 500
+   - **Optimized**: Thrust sorting, full optimizations - for ndata >= 500
+
+2. **Sorting Improvements**:
+   - Basic: O(n²) bubble sort (Phase 1 baseline)
+   - Simple: O(n²) insertion sort (3-5x faster than bubble sort)
+   - Optimized: O(n log n) Thrust sort (~100x faster for n=1000)
+
+3. **Optimal Depth Calculation**:
+   - Implemented weighted least squares: `depth = Σ(y*m/σ²) / Σ(m²/σ²)`
+   - Physical constraints: depth ∈ [0, 1]
+   - Improves chi² minimization significantly
+
+4. **Reduction Optimizations**:
+   - Tree reduction down to warp size
+   - Warp shuffle for final reduction (no `__syncthreads` in warp)
+   - Proper tracking of all parameters (t0, duration, depth, config_idx)
+   - No parameter loss during reduction
+
+5. **Memory Optimizations**:
+   - Separate arrays for y/dy to avoid bank conflicts
+   - Working memory allocation for Thrust (phases, y, dy, indices per period)
+   - Optimized shared memory layout: 3*ndata + 5*block_size floats + block_size ints
+
+6. **Search Space Expansion**:
+   - Increased durations: 10 → 15 samples
+   - Logarithmic duration spacing for better coverage
+   - Increased T0 positions: 20 → 30 samples
+   - Duration range: 0.5% to 15% of period
+
+**Performance Estimates:**
+
+| ndata | Kernel | Sort Time | Speedup vs Basic |
+|-------|--------|-----------|------------------|
+| 100   | Basic  | ~0.1 ms   | 1x               |
+| 100   | Simple | ~0.03 ms  | ~3x              |
+| 500   | Simple | ~1 ms     | ~5x              |
+| 1000  | Optimized | ~0.05 ms | ~100x        |
+| 5000  | Optimized | ~0.3 ms  | ~500x         |
+
+**Auto-Selection Logic:**
+- ndata < 500: Use simple kernel (insertion sort overhead acceptable)
+- ndata >= 500: Use optimized kernel (Thrust overhead justified)
+
+**Known Limitations (Phase 3 targets):**
+- Fixed duration/T0 grids (not period-dependent yet)
+- Simple box transit model (no limb darkening on GPU)
+- No edge effect correction
+- No out-of-transit caching
+- Working memory scales with nperiods (could be optimized)
+
+**Key Learnings:**
+
+1. **Thrust Integration**: Thrust provides massive speedup but adds compilation complexity. Simple kernel provides good middle ground.
+
+2. **Parameter Tracking**: Critical to track all parameters through reduction tree, not just chi². Volatile memory trick works for warp-level reduction.
+
+3. **Kernel Variant Selection**: Auto-selection based on dataset size provides best user experience without requiring expertise.
+
+4. **Shared Memory**: With optimal depth + parameter tracking, shared memory needs are: `(3*ndata + 5*BLOCK_SIZE)*4 + BLOCK_SIZE*4` bytes. For ndata=1000, block_size=128: ~13 KB (well under 48 KB limit).
+
+5. **Logarithmic Duration Spacing**: Much better coverage than linear spacing, especially for wide duration ranges.
+
+**Next Steps:** Proceed to Phase 3 (features & robustness) ✅ COMPLETED
+
+---
+
+### Phase 3: Features & Robustness - COMPLETED
+
+**Status:** Production features implemented
+**Date:** 2025-10-27
+
+**Completed:**
+- ✅ `cuvarbase/tls_stats.py` - Complete statistics module
+- ✅ `cuvarbase/tls_adaptive.py` - Adaptive method selection
+- ✅ `examples/tls_example.py` - Complete usage example
+- ✅ Enhanced results output with full statistics
+- ✅ Auto-selection between BLS and TLS
+
+**Key Features Added:**
+
+1. **Comprehensive Statistics Module** (`tls_stats.py`):
+   - **Signal Detection Efficiency (SDE)**: Primary detection metric with detrending
+   - **Signal-to-Noise Ratio (SNR)**: Transit depth SNR calculation
+   - **False Alarm Probability (FAP)**: Empirical calibration (Hippke & Heller 2019)
+   - **Signal Residue (SR)**: Normalized chi² ratio
+   - **Period uncertainty**: FWHM-based estimation
+   - **Odd-even mismatch**: Binary/false positive detection
+   - **Pink noise correction**: Correlated noise handling
+
+2. **Enhanced Results Output**:
+   - Raw outputs: chi², per-period parameters
+   - Best-fit: period, T0, duration, depth with uncertainties
+   - Statistics: SDE, SNR, FAP, power spectrum
+   - Metadata: n_transits, stellar parameters
+   - **41 output fields** matching CPU TLS
+
+3. **Adaptive Method Selection** (`tls_adaptive.py`):
+   - **Auto-selection logic**:
+     - ndata < 100: Sparse BLS (optimal for very few points)
+     - 100 < ndata < 500: Cost-based selection
+     - ndata > 500: TLS (best accuracy + speed)
+   - **Computational cost estimation** for each method
+   - **Special case handling**: short spans, fine grids, accuracy preference
+   - **Comparison mode**: Run all methods for benchmarking
+
+4. **Complete Usage Example** (`examples/tls_example.py`):
+   - Synthetic transit generation (Batman or simple)
+   - Full TLS search workflow
+   - Result analysis and comparison
+   - Four-panel diagnostic plots
+   - Error handling and fallbacks
+
+**Statistics Implementation:**
+
+```python
+# Signal Detection Efficiency
+SDE = (1 - ⟨SR⟩) / σ(SR)  with median detrending
+
+# SNR Calculation
+SNR = depth / depth_err × sqrt(n_transits)
+
+# FAP Calibration (empirical)
+SDE = 7  → FAP ≈ 1%
+SDE = 9  → FAP ≈ 0.1%
+SDE = 11 → FAP ≈ 0.01%
+```
+
+**Adaptive Selection Decision Tree:**
+
+```
+ndata < 100:
+    → Sparse BLS (optimal)
+
+100 ≤ ndata < 500:
+    if prefer_accuracy:
+        → TLS
+    else:
+        → Cost-based (Sparse BLS / BLS / TLS)
+
+ndata ≥ 500:
+    → TLS (optimal balance)
+
+Special overrides:
+    - T_span < 10 days → Sparse BLS
+    - nperiods > 10000 → TLS (if ndata allows)
+```
+
+**Example Output Structure:**
+
+```python
+results = {
+    # Raw outputs
+    'periods': [...],
+    'chi2': [...],
+    'best_t0_per_period': [...],
+    'best_duration_per_period': [...],
+    'best_depth_per_period': [...],
+
+    # Best-fit
+    'period': 12.5,
+    'period_uncertainty': 0.02,
+    'T0': 0.234,
+    'duration': 0.12,
+    'depth': 0.008,
+
+    # Statistics
+    'SDE': 15.3,
+    'SNR': 8.5,
+    'FAP': 1.2e-6,
+    'power': [...],
+    'SR': [...],
+
+    # Metadata
+    'n_transits': 8,
+    'R_star': 1.0,
+    'M_star': 1.0,
+}
+```
+
+**Key Learnings:**
+
+1. **SDE vs SNR**: SDE is more robust for period search (handles systematic noise), while SNR is better for individual transit significance.
+
+2. **Detrending Critical**: Median filter detrending improves SDE significantly by removing long-term trends and systematic effects.
+
+3. **FAP Calibration**: Empirical calibration much more accurate than Gaussian assumption for real data with correlated noise.
+
+4. **Adaptive Selection Value**: Users shouldn't need to know which method is best - auto-selection provides optimal performance.
+
+5. **Statistics Matching**: Full 41-field output structure compatible with CPU TLS for easy migration.
+
+**Production Readiness:**
+
+✅ **Complete API**: All major TLS features implemented
+✅ **Full Statistics**: SDE, SNR, FAP, and more
+✅ **Auto-Selection**: Smart method choice
+✅ **Example Code**: Complete usage demonstration
+✅ **Error Handling**: Graceful fallbacks
+✅ **Documentation**: Inline docs and examples
+
+**Remaining for Full Production:**
+
+- Integration tests with real astronomical data
+- Performance benchmarking suite
+- Comparison validation against CPU TLS
+- User documentation and tutorials
+- CI/CD pipeline setup
+
+**Next Steps:** Validation and testing phase, then merge to main
+
+---
+
+### Phase 1: Core Infrastructure (Week 1) - ORIGINAL PLAN
+
+**Files to create:**
+- `cuvarbase/tls.py` - Python API
+- `cuvarbase/kernels/tls.cu` - CUDA kernel
+- `cuvarbase/tls_models.py` - Transit model generation
+
+**Tasks:**
+1. Create TLS Python class similar to BLS structure
+2. Implement transit model pre-computation (Batman wrapper)
+3. Create period/duration grid generation (Ofir 2014)
+4. Implement basic kernel structure (no optimization)
+5. Memory management class (TLSMemory)
+
+**Deliverables:**
+- Basic working TLS GPU implementation
+- Correctness validation vs CPU TLS
+
+### Phase 2: Optimization (Week 2)
+
+**Tasks:**
+1. Implement shared memory optimizations
+2. Add warp shuffle reduction
+3. Optimize memory access patterns
+4. Implement out-of-transit caching
+5. Add texture memory for transit models
+6. Implement CUB-based sorting
+
+**Deliverables:**
+- Optimized TLS kernel
+- Performance benchmarks vs CPU
+
+### Phase 3: Features & Robustness (Week 3)
+
+**Tasks:**
+1. Implement edge effect correction
+2. Add adaptive block sizing
+3. Implement kernel caching (LRU)
+4. Add batch processing for large period grids
+5. Implement CUDA streams for async execution
+6. Add sparse TLS variant (for small datasets)
+
+**Deliverables:**
+- Production-ready TLS implementation
+- Adaptive mode selection
+
+### Phase 4: Testing & Validation (Week 4)
+
+**Tasks:**
+1. Create comprehensive unit tests
+2. Validate against CPU TLS on known planets
+3. Test edge cases (few data points, long periods, etc.)
+4. Performance profiling and optimization
+5. Documentation and examples
+
+**Deliverables:**
+- Full test suite
+- Benchmark results
+- Documentation
+
+---
+
+## 6. Testing Strategy
+
+### 6.1 Validation Tests
+
+**Test against CPU TLS:**
+1. **Synthetic transits** - Generate known signals, verify recovery
+2. **Known planets** - Test on confirmed exoplanet light curves
+3. **Edge cases** - Few transits, long periods, noisy data
+4. **Statistical properties** - SDE, SNR, FAP calculations
+
+**Metrics for validation:**
+- Period recovery (within 1%)
+- Duration recovery (within 10%)
+- Depth recovery (within 5%)
+- T0 recovery (within transit duration)
+- SDE values (within 5%)
+
+### 6.2 Performance Tests
+
+**Benchmarks:**
+1. vs CPU TLS (hippke/tls)
+2. vs GPU BLS (cuvarbase existing)
+3. Scaling with ndata (10 to 10K points)
+4. Scaling with nperiods (100 to 10K)
+
+**Target metrics:**
+- <1 second per K2 light curve (90 days, 4K points)
+- 10-100x speedup vs CPU TLS
+- Similar or better than GPU BLS
+
+### 6.3 Test Data
+
+**Sources:**
+1. Synthetic light curves (known parameters)
+2. TESS light curves (2-min cadence)
+3. K2 light curves (30-min cadence)
+4. Kepler light curves (30-min cadence)
+
+---
+
+## 7. API Design
+
+### 7.1 High-Level Interface
+
+```python
+from cuvarbase import tls
+
+# Simple interface
+results = tls.search(t, y, dy,
+                     R_star=1.0,      # Solar radii
+                     M_star=1.0,      # Solar masses
+                     period_min=None, # Auto-detect
+                     period_max=None) # Auto-detect
+
+# Access results
+print(f"Period: {results.period:.4f} days")
+print(f"SDE: {results.SDE:.2f}")
+print(f"Depth: {results.depth*1e6:.1f} ppm")
+```
+
+### 7.2 Advanced Interface
+
+```python
+# Custom configuration
+results = tls.search_advanced(
+    t, y, dy,
+    periods=custom_periods,
+    durations=custom_durations,
+    transit_template='custom',
+    limb_dark='quadratic',
+    u=[0.4804, 0.1867],
+    use_optimized=True,
+    use_sparse=None,  # Auto-select
+    block_size=128,
+    stream=cuda_stream
+)
+```
+
+### 7.3 Batch Processing
+
+```python
+# Process multiple light curves
+results_list = tls.search_batch(
+    [t1, t2, ...],
+    [y1, y2, ...],
+    [dy1, dy2, ...],
+    n_streams=4,
+    parallel=True
+)
+```
+
+---
+
+## 8. Expected Performance
+
+### 8.1 Theoretical Analysis
+
+**CPU TLS (current):**
+- ~10 seconds per K2 light curve
+- Single-threaded
+- 12.2 GFLOPs (72% of theoretical CPU max)
+
+**GPU TLS (target):**
+- <1 second per K2 light curve
+- ~10³-10⁴ parallel threads
+- 100-1000 GFLOPs (GPU advantage)
+
+**Speedup sources:**
+1. Period parallelism: 8,500 periods → 8,500 threads
+2. T0 parallelism: ~100 T0 positions per duration
+3. Faster reductions: Tree + warp shuffle
+4. Memory bandwidth: GPU >> CPU
+
+### 8.2 Bottleneck Analysis
+
+**Potential bottlenecks:**
+1. **Sorting** - CUB DeviceRadixSort is fast but not free
+   - Solution: Use MergeSort for partially sorted data
+   - Cost: ~5-10% of total time
+
+2. **Transit model interpolation** - Texture memory helps
+   - Solution: Pre-compute at high resolution
+   - Cost: ~2-5% of total time
+
+3. **Out-of-transit caching** - Shared memory limits
+   - Solution: Use parallel scan (CUB DeviceScan)
+   - Cost: ~10-15% of total time
+
+4. **Global memory bandwidth** - Reading t, y, dy repeatedly
+   - Solution: Shared memory caching per block
+   - Cost: ~20-30% of total time
+
+**Expected time breakdown:**
+- Phase folding/sorting: 20%
+- Residual calculations: 60%
+- Reductions/comparisons: 15%
+- Overhead: 5%
+
+---
+
+## 9. File Structure
+
+```
+cuvarbase/
+├── tls.py                          # Main TLS API
+├── tls_models.py                   # Transit model generation
+├── tls_grids.py                    # Period/duration grid generation
+├── tls_stats.py                    # Statistical calculations (SDE, SNR, FAP)
+├── kernels/
+│   ├── tls.cu                      # Standard TLS kernel
+│   ├── tls_optimized.cu            # Optimized kernel
+│   └── tls_sparse.cu               # Sparse variant (small datasets)
+└── tests/
+    ├── test_tls_basic.py           # Basic functionality
+    ├── test_tls_consistency.py     # Consistency with CPU TLS
+    ├── test_tls_performance.py     # Performance benchmarks
+    └── test_tls_validation.py      # Known planet recovery
+```
+
+---
+
+## 10. Dependencies
+
+**Required:**
+- PyCUDA (existing)
+- NumPy (existing)
+- Batman-package (CPU transit models)
+
+**Optional:**
+- Astropy (stellar parameters, unit conversions)
+- Numba (CPU fallback)
+
+**CUDA features:**
+- CUB library (sorting, scanning)
+- Texture memory (transit model interpolation)
+- Warp shuffle intrinsics
+- Cooperative groups (advanced optimization)
+
+---
+
+## 11. Success Criteria
+
+**Functional:**
+- [ ] Passes all validation tests (>95% accuracy vs CPU TLS)
+- [ ] Recovers known planets in test dataset
+- [ ] Handles edge cases robustly
+
+**Performance:**
+- [ ] <1 second per K2 light curve
+- [ ] 10-100x speedup vs CPU TLS
+- [ ] Comparable or better than GPU BLS
+
+**Quality:**
+- [ ] Full test coverage (>90%)
+- [ ] Comprehensive documentation
+- [ ] Example notebooks
+
+**Usability:**
+- [ ] Simple API for basic use cases
+- [ ] Advanced API for expert users
+- [ ] Clear error messages
+
+---
+
+## 12. Risk Mitigation
+
+### 12.1 Technical Risks
+
+| Risk | Mitigation |
+|------|------------|
+| GPU memory limits | Implement batching, use sparse variant |
+| Kernel timeout (Windows) | Add freq_batch_size parameter |
+| Sorting performance | Use CUB MergeSort for partially sorted |
+| Transit model accuracy | Validate against Batman reference |
+| Edge effect handling | Implement CPU TLS's correction algorithm |
+
+### 12.2 Performance Risks
+
+| Risk | Mitigation |
+|------|------------|
+| Slower than expected | Profile with Nsight, optimize bottlenecks |
+| Memory bandwidth bound | Increase compute/memory ratio, use shared mem |
+| Low occupancy | Adjust block size, reduce register usage |
+| Divergent branches | Minimize conditionals in inner loops |
+
+---
+
+## 13. Future Enhancements
+
+**Phase 5 (future):**
+1. Multi-GPU support
+2. CPU fallback (Numba)
+3. Alternative limb darkening laws
+4. Non-circular orbits (eccentric transits)
+5. Multi-planet search
+6. Real-time detection (streaming data)
+7. Integration with lightkurve/eleanor
+
+---
+
+## 14. References
+
+### Primary Papers
+
+1. **Hippke & Heller (2019)** - "Transit Least Squares: Optimized transit detection algorithm"
+   - arXiv:1901.02015
+   - A&A 623, A39
+
+2. **Ofir (2014)** - "Algorithmic considerations for continuous GW search"
+   - A&A 561, A138
+   - Period sampling algorithm
+
+3. **Mandel & Agol (2002)** - "Analytic Light Curves for Planetary Transit Searches"
+   - ApJ 580, L171
+   - Transit model theory
+
+### Related Work
+
+4. **Kovács et al. (2002)** - Original BLS paper
+   - A&A 391, 369
+
+5. **Kreidberg (2015)** - Batman: Bad-Ass Transit Model cAlculatioN
+   - PASP 127, 1161
+
+6. **Panahi & Zucker (2021)** - Sparse BLS algorithm
+   - arXiv:2103.06193
+
+### Software
+
+- TLS GitHub: https://github.com/hippke/tls
+- TLS Docs: https://transitleastsquares.readthedocs.io/
+- Batman: https://github.com/lkreidberg/batman
+- CUB: https://nvlabs.github.io/cub/
+
+---
+
+## Appendix A: Algorithm Pseudocode
+
+### CPU TLS (reference)
+
+```python
+def tls_search(t, y, dy, periods, durations, transit_models):
+    results = []
+
+    for period in periods:
+        # Phase fold
+        phases = (t / period) % 1.0
+        sorted_idx = argsort(phases)
+        phases = phases[sorted_idx]
+        y_sorted = y[sorted_idx]
+        dy_sorted = dy[sorted_idx]
+
+        # Patch (extend for edge wrapping)
+        phases_ext, y_ext, dy_ext = patch_arrays(phases, y_sorted, dy_sorted)
+
+        min_chi2 = inf
+        best_t0 = None
+        best_duration = None
+
+        for duration in durations[period]:
+            # Get transit model
+            model = transit_models[duration]
+
+            # Calculate out-of-transit residuals (can be cached)
+            residuals_out = calc_out_of_transit(y_ext, dy_ext, model)
+
+            # Stride over T0 positions
+            for t0 in T0_grid:
+                # Calculate in-transit residuals
+                residuals_in = calc_in_transit(y_ext, dy_ext, model, t0)
+
+                # Optimal depth scaling
+                depth = optimal_depth(residuals_in, residuals_out)
+
+                # Chi-squared
+                chi2 = calc_chi2(residuals_in, residuals_out, depth)
+
+                if chi2 < min_chi2:
+                    min_chi2 = chi2
+                    best_t0 = t0
+                    best_duration = duration
+
+        results.append((period, min_chi2, best_t0, best_duration))
+
+    return results
+```
+
+### GPU TLS (proposed)
+
+```cuda
+__global__ void tls_search_kernel(...) {
+    int period_idx = blockIdx.x;
+    int tid = threadIdx.x;
+
+    __shared__ float shared_phases[MAX_NDATA];
+    __shared__ float shared_y[MAX_NDATA];
+    __shared__ float shared_dy[MAX_NDATA];
+    __shared__ float chi2_vals[BLOCK_SIZE];
+
+    // Load data to shared memory
+    for (int i = tid; i < ndata; i += blockDim.x) {
+        float phase = fmodf(t[i] / periods[period_idx], 1.0f);
+        shared_phases[i] = phase;
+        shared_y[i] = y[i];
+        shared_dy[i] = dy[i];
+    }
+    __syncthreads();
+
+    // Sort by phase (CUB DeviceRadixSort or MergeSort)
+    cub::DeviceRadixSort::SortPairs(...);
+    __syncthreads();
+
+    // Patch arrays (extend for wrapping)
+    patch_arrays_shared(...);
+    __syncthreads();
+
+    float thread_min_chi2 = INFINITY;
+
+    // Iterate over durations
+    int n_durations = duration_counts[period_idx];
+    for (int d = 0; d < n_durations; d++) {
+        float duration = durations[period_idx * MAX_DURATIONS + d];
+
+        // Load transit model from texture memory
+        float* model = tex2D(transit_model_texture, duration, ...);
+
+        // Calculate out-of-transit residuals (use parallel scan for cumsum)
+        float residuals_out = calc_out_of_transit_shared(...);
+
+        // Stride over T0 positions (each thread handles multiple)
+        for (int t0_idx = tid; t0_idx < n_t0_positions; t0_idx += blockDim.x) {
+            float t0 = t0_grid[t0_idx];
+
+            // In-transit residuals
+            float residuals_in = calc_in_transit_shared(...);
+
+            // Optimal depth
+            float depth = optimal_depth_fast(residuals_in, residuals_out);
+
+            // Chi-squared
+            float chi2 = calc_chi2_fast(residuals_in, residuals_out, depth);
+
+            thread_min_chi2 = fminf(thread_min_chi2, chi2);
+        }
+    }
+
+    // Store thread minimum
+    chi2_vals[tid] = thread_min_chi2;
+    __syncthreads();
+
+    // Parallel reduction to find block minimum
+    // Tree reduction + warp shuffle
+    for (int s = blockDim.x/2; s >= 32; s /= 2) {
+        if (tid < s) {
+            chi2_vals[tid] = fminf(chi2_vals[tid], chi2_vals[tid + s]);
+        }
+        __syncthreads();
+    }
+
+    // Final warp reduction
+    if (tid < 32) {
+        float val = chi2_vals[tid];
+        for (int offset = 16; offset > 0; offset /= 2) {
+            val = fminf(val, __shfl_down_sync(0xffffffff, val, offset));
+        }
+        if (tid == 0) {
+            chi2_min[period_idx] = val;
+        }
+    }
+}
+```
+
+---
+
+## Appendix B: Key Equations
+
+### Chi-Squared Calculation
+
+```
+χ²(P, t₀, d, δ) = Σᵢ [yᵢ - m(tᵢ; P, t₀, d, δ)]² / σᵢ²
+
+where m(t; P, t₀, d, δ) is the transit model:
+  m(t) = {
+    1 - δ × limb_darkened_transit(phase(t))  if in transit
+    1                                          otherwise
+  }
+```
+
+### Optimal Depth Scaling
+
+```
+δ_opt = Σᵢ [yᵢ × m(tᵢ)] / Σᵢ [m(tᵢ)²]
+
+This minimizes χ² analytically for given (P, t₀, d)
+```
+
+### Signal Detection Efficiency
+
+```
+SDE = (1 - ⟨SR⟩) / σ(SR)
+
+where SR = χ²_white_noise / χ²_signal
+
+Median filter applied to remove systematic trends
+```
+
+---
+
+**Document Version:** 1.0
+**Last Updated:** 2025-10-27
+**Author:** Claude Code (Anthropic)
diff --git a/docs/TLS_GPU_README.md b/docs/TLS_GPU_README.md
new file mode 100644
index 0000000..2365812
--- /dev/null
+++ b/docs/TLS_GPU_README.md
@@ -0,0 +1,313 @@
+# GPU-Accelerated Transit Least Squares (TLS)
+
+## Overview
+
+This is a GPU-accelerated implementation of the Transit Least Squares (TLS) algorithm for detecting periodic planetary transits in astronomical time series data. Unlike BLS (Box Least Squares), TLS uses a physically realistic limb-darkened transit template for fitting, improving sensitivity to small planets.
+
+**Reference:** [Hippke & Heller (2019), A&A 623, A39](https://ui.adsabs.harvard.edu/abs/2019A%26A...623A..39H/abstract)
+
+## Quick Start
+
+### Standard Mode - Fixed Duration Range
+
+```python
+from cuvarbase import tls
+
+results = tls.tls_search_gpu(
+    t, y, dy,
+    period_min=5.0,
+    period_max=20.0,
+    R_star=1.0,
+    M_star=1.0
+)
+
+print(f"Period: {results['period']:.4f} days")
+print(f"Depth: {results['depth']:.6f}")
+print(f"SDE: {results['SDE']:.2f}")
+```
+
+### Keplerian Mode - Physically Motivated Duration Constraints
+
+```python
+results = tls.tls_transit(
+    t, y, dy,
+    R_star=1.0,      # Solar radii
+    M_star=1.0,      # Solar masses
+    R_planet=1.0,    # Earth radii (fiducial)
+    qmin_fac=0.5,    # Search 0.5x to 2.0x Keplerian duration
+    qmax_fac=2.0,
+    n_durations=15,
+    period_min=5.0,
+    period_max=20.0
+)
+```
+
+## Features
+
+### 1. Limb-Darkened Transit Template
+
+The key difference from BLS is the use of a physically realistic transit template
+computed using the batman package (Kreidberg 2015). The template accounts for
+stellar limb darkening, producing a rounded transit shape rather than a box.
+
+The template is:
+- Precomputed on the CPU with configurable limb darkening law and coefficients
+- Transferred to GPU shared memory (4KB for 1000-point template)
+- Interpolated via linear lookup during the chi-squared calculation
+- Falls back to a trapezoidal shape if batman is not installed
+
+### 2. Keplerian-Aware Duration Constraints
+
+Just like BLS's `eebls_transit()`, TLS exploits Keplerian physics to focus the search on plausible transit durations:
+
+```python
+from cuvarbase import tls_grids
+
+# Calculate expected fractional duration at each period
+q_values = tls_grids.q_transit(periods, R_star=1.0, M_star=1.0, R_planet=1.0)
+
+# Generate focused duration grid
+durations, counts, q_vals = tls_grids.duration_grid_keplerian(
+    periods, R_star=1.0, M_star=1.0, R_planet=1.0,
+    qmin_fac=0.5, qmax_fac=2.0, n_durations=15
+)
+```
+
+### 3. Optimal Period Grid Sampling
+
+Implements Ofir (2014) frequency-to-cubic transformation for optimal period sampling:
+
+```python
+periods = tls_grids.period_grid_ofir(
+    t,
+    R_star=1.0,
+    M_star=1.0,
+    period_min=5.0,
+    period_max=20.0,
+    oversampling_factor=3,
+    n_transits_min=2
+)
+```
+
+**Reference:** Ofir (2014), "An optimized transit detection algorithm to search for periodic transits of small planets", A&A 561, A138
+
+### 4. GPU Memory Management
+
+Efficient GPU memory handling via `TLSMemory` class:
+- Pre-allocates GPU arrays for t, y, dy, periods, template, results
+- Supports both standard and Keplerian modes (qmin/qmax arrays)
+- Memory pooling reduces allocation overhead
+
+### 5. Optimized CUDA Kernels
+
+Two optimized CUDA kernels in `cuvarbase/kernels/tls.cu`:
+
+**`tls_search_kernel()`** - Standard search:
+- Fixed duration range (0.5% to 15% of period)
+- Limb-darkened transit template in shared memory
+- Bitonic sort for phase-folding
+- Warp shuffle reduction for finding minimum chi-squared
+
+**`tls_search_kernel_keplerian()`** - Keplerian-aware:
+- Per-period qmin/qmax arrays
+- Focused search space
+- Same core algorithm with template
+
+Both kernels:
+- Use shared memory for phase-folded data and transit template
+- Minimize global memory accesses
+- Support datasets up to ~100,000 points
+
+## API Reference
+
+### High-Level Functions
+
+#### `tls_transit(t, y, dy, **kwargs)`
+
+High-level wrapper with Keplerian duration constraints (analog of BLS's `eebls_transit()`).
+
+**Parameters:**
+- `t` (array): Time values
+- `y` (array): Flux/magnitude values
+- `dy` (array): Measurement uncertainties
+- `R_star` (float): Stellar radius in solar radii (default: 1.0)
+- `M_star` (float): Stellar mass in solar masses (default: 1.0)
+- `R_planet` (float): Fiducial planet radius in Earth radii (default: 1.0)
+- `qmin_fac` (float): Minimum duration factor (default: 0.5)
+- `qmax_fac` (float): Maximum duration factor (default: 2.0)
+- `n_durations` (int): Number of duration samples (default: 15)
+- `period_min` (float): Minimum period in days
+- `period_max` (float): Maximum period in days
+- `n_transits_min` (int): Minimum transits required (default: 2)
+- `oversampling_factor` (int): Period grid oversampling (default: 3)
+
+**Returns:** Dictionary with keys:
+- `period`: Best-fit period (days)
+- `T0`: Best-fit transit epoch (days)
+- `duration`: Best-fit transit duration (days)
+- `depth`: Best-fit transit depth (fractional flux dip)
+- `SDE`: Signal Detection Efficiency
+- `chi2`: Chi-squared value
+- `periods`: Array of trial periods
+- `power`: Detrended power spectrum
+
+#### `tls_search_gpu(t, y, dy, periods=None, **kwargs)`
+
+Low-level GPU search function with custom period/duration grids.
+
+**Additional Parameters:**
+- `periods` (array): Custom period grid (if None, auto-generated)
+- `qmin` (array): Per-period minimum fractional durations (Keplerian mode)
+- `qmax` (array): Per-period maximum fractional durations (Keplerian mode)
+- `n_durations` (int): Number of duration samples if using qmin/qmax
+- `block_size` (int): CUDA block size (default: 128)
+
+### Grid Generation Functions
+
+#### `period_grid_ofir(t, R_star, M_star, **kwargs)`
+
+Generate optimal period grid using Ofir (2014) frequency-to-cubic sampling.
+
+#### `q_transit(period, R_star, M_star, R_planet)`
+
+Calculate Keplerian fractional transit duration (q = duration/period).
+
+#### `duration_grid_keplerian(periods, R_star, M_star, R_planet, **kwargs)`
+
+Generate Keplerian-aware duration grid for each period.
+
+## Algorithm Details
+
+### Transit Template
+
+The transit model uses a precomputed limb-darkened template:
+
+```
+model(t) = 1 - depth * template(transit_coord)
+```
+
+Where `transit_coord` maps the phase position within the transit window to [-1, 1],
+and `template()` returns a value in [0, 1] via linear interpolation of the
+precomputed template array. The template captures limb darkening effects, giving
+a rounded bottom rather than the flat-bottomed box of BLS.
+
+### Optimal Depth Fitting
+
+For each trial (period, duration, T0), depth is solved via weighted least squares:
+```
+depth = sum[(1-y_i) * T(x_i) / sigma_i^2] / sum[T(x_i)^2 / sigma_i^2]
+```
+where T(x_i) is the template value at the transit coordinate of point i.
+
+### Signal Detection Efficiency (SDE)
+
+The SDE metric quantifies signal significance:
+```
+SDE = (max(SR) - mean(SR)) / std(SR)
+```
+
+Where SR (Signal Residue) = 1 - chi2 / chi2_null.
+
+**SDE > 7** typically indicates a robust detection.
+
+## Known Limitations
+
+1. **Dataset Size**: Bitonic sort supports up to ~100,000 points
+   - Designed for typical astronomical light curves (500-20,000 points)
+   - For >100k points, consider binning or using CPU TLS
+   - Performance is optimal for ndata < 20,000
+
+2. **Memory**: Requires ~(3N + n_template + 4*block_size) floats of shared memory per block
+   - 5,000 points: ~60 KB + 4 KB template
+   - Should work on any GPU with >2GB VRAM
+
+3. **Duration Grid**: Currently uniform in log-space
+   - Could optimize further using Ofir-style adaptive sampling
+
+4. **Single GPU**: No multi-GPU support yet
+   - Trivial to parallelize across multiple light curves
+
+## Related Work
+
+**CETRA** (Smith et al. 2025) is a complementary GPU-accelerated transit detection
+algorithm that uses a different approach (matched filtering with analytic templates).
+CETRA may be preferable for survey-scale searches where computational throughput is
+paramount. GPU TLS is valuable when standard TLS outputs (SDE, FAP, odd/even tests)
+are needed for transit vetting pipelines, or when results must be directly comparable
+to published CPU TLS results.
+
+## Testing
+
+### Pytest Suite
+
+```bash
+pytest cuvarbase/tests/test_tls_basic.py -v
+```
+
+Tests cover:
+- Transit template generation (batman and trapezoidal fallback)
+- Kernel compilation
+- Memory allocation
+- Period grid generation
+- Statistics (SR, SDE, SNR)
+- Signal recovery (synthetic transits)
+- SDE > 0 regression test
+
+## Implementation Files
+
+### Core Implementation
+- `cuvarbase/tls.py` - Main Python API
+- `cuvarbase/tls_models.py` - Transit template generation
+- `cuvarbase/tls_grids.py` - Grid generation utilities
+- `cuvarbase/tls_stats.py` - Statistical calculations
+- `cuvarbase/kernels/tls.cu` - CUDA kernels
+
+### Testing
+- `cuvarbase/tests/test_tls_basic.py` - Unit tests
+
+### Documentation
+- `docs/TLS_GPU_README.md` - This file
+
+## References
+
+1. **Hippke & Heller (2019)**: "Optimized transit detection algorithm to search for periodic transits of small planets", A&A 623, A39
+   - Original TLS algorithm and SDE metric
+
+2. **Kovacs et al. (2002)**: "A box-fitting algorithm in the search for periodic transits", A&A 391, 369
+   - BLS algorithm (TLS is a refinement)
+
+3. **Ofir (2014)**: "An optimized transit detection algorithm to search for periodic transits of small planets", A&A 561, A138
+   - Optimal period grid sampling
+
+4. **Smith et al. (2025)**: "CETRA: GPU-accelerated transit detection"
+   - Complementary GPU transit detection approach
+
+5. **Kreidberg (2015)**: "batman: BAsic Transit Model cAlculatioN in Python", PASP 127, 1161
+   - Transit model package used for template generation
+
+6. **transitleastsquares**: https://github.com/hippke/tls
+   - Reference CPU implementation
+
+## Citation
+
+If you use this GPU TLS implementation, please cite both cuvarbase and the original TLS paper:
+
+```bibtex
+@MISC{2022ascl.soft10030H,
+       author = {{Hoffman}, John},
+        title = "{cuvarbase: GPU-Accelerated Variability Algorithms}",
+ howpublished = {Astrophysics Source Code Library, record ascl:2210.030},
+         year = 2022,
+       adsurl = {https://ui.adsabs.harvard.edu/abs/2022ascl.soft10030H}
+}
+
+@ARTICLE{2019A&A...623A..39H,
+       author = {{Hippke}, Michael and {Heller}, Ren{\'e}},
+        title = "{Optimized transit detection algorithm to search for periodic transits of small planets}",
+      journal = {Astronomy & Astrophysics},
+         year = 2019,
+       volume = {623},
+          eid = {A39},
+          doi = {10.1051/0004-6361/201834672}
+}
+```
diff --git a/examples/tls_example.py b/examples/tls_example.py
new file mode 100644
index 0000000..cbaed31
--- /dev/null
+++ b/examples/tls_example.py
@@ -0,0 +1,272 @@
+#!/usr/bin/env python3
+"""
+Example: GPU-Accelerated Transit Least Squares
+
+This script demonstrates how to use cuvarbase's GPU-accelerated TLS
+implementation to detect planetary transits in photometric time series.
+
+Requirements:
+- PyCUDA
+- NumPy
+- batman-package (optional, for generating synthetic transits)
+"""
+
+import numpy as np
+import matplotlib.pyplot as plt
+
+# Check if we can import TLS modules
+try:
+    from cuvarbase import tls_grids, tls_models, tls
+    TLS_AVAILABLE = True
+except ImportError as e:
+    print(f"Warning: Could not import TLS modules: {e}")
+    TLS_AVAILABLE = False
+
+# Check if batman is available for generating synthetic data
+try:
+    import batman
+    BATMAN_AVAILABLE = True
+except ImportError:
+    BATMAN_AVAILABLE = False
+    print("batman-package not available. Using simple synthetic transit.")
+
+
+def generate_synthetic_transit(period=10.0, depth=0.01, duration=0.1,
+                               t0=0.0, ndata=1000, noise_level=0.001,
+                               T_span=100.0):
+    """
+    Generate synthetic light curve with transit.
+
+    Parameters
+    ----------
+    period : float
+        Orbital period (days)
+    depth : float
+        Transit depth (fractional)
+    duration : float
+        Transit duration (days)
+    t0 : float
+        Mid-transit time (days)
+    ndata : int
+        Number of data points
+    noise_level : float
+        Gaussian noise level
+    T_span : float
+        Total observation span (days)
+
+    Returns
+    -------
+    t, y, dy : ndarray
+        Time, flux, and uncertainties
+    """
+    # Generate time series
+    t = np.sort(np.random.uniform(0, T_span, ndata))
+
+    # Start with flat light curve
+    y = np.ones(ndata)
+
+    if BATMAN_AVAILABLE:
+        # Use Batman for realistic transit
+        params = batman.TransitParams()
+        params.t0 = t0
+        params.per = period
+        params.rp = np.sqrt(depth)  # Radius ratio
+        params.a = 15.0  # Semi-major axis
+        params.inc = 90.0  # Edge-on
+        params.ecc = 0.0
+        params.w = 90.0
+        params.limb_dark = "quadratic"
+        params.u = [0.4804, 0.1867]
+
+        m = batman.TransitModel(params, t)
+        y = m.light_curve(params)
+    else:
+        # Simple box transit
+        phases = (t % period) / period
+        duration_phase = duration / period
+
+        # Transit at phase 0
+        in_transit = (phases < duration_phase / 2) | (phases > 1 - duration_phase / 2)
+        y[in_transit] -= depth
+
+    # Add noise
+    noise = np.random.normal(0, noise_level, ndata)
+    y += noise
+
+    # Uncertainties
+    dy = np.ones(ndata) * noise_level
+
+    return t, y, dy
+
+
+def run_tls_example(use_gpu=True):
+    """
+    Run TLS example on synthetic data.
+
+    Parameters
+    ----------
+    use_gpu : bool
+        Use GPU implementation (default: True)
+    """
+    if not TLS_AVAILABLE:
+        print("TLS modules not available. Cannot run example.")
+        return
+
+    print("=" * 60)
+    print("GPU-Accelerated Transit Least Squares Example")
+    print("=" * 60)
+
+    # Generate synthetic data
+    print("\n1. Generating synthetic transit...")
+    period_true = 12.5  # days
+    depth_true = 0.008  # 0.8% depth
+    duration_true = 0.12  # days
+
+    t, y, dy = generate_synthetic_transit(
+        period=period_true,
+        depth=depth_true,
+        duration=duration_true,
+        ndata=800,
+        noise_level=0.0005,
+        T_span=100.0
+    )
+
+    print(f"   Data points: {len(t)}")
+    print(f"   Time span: {np.max(t) - np.min(t):.1f} days")
+    print(f"   True period: {period_true:.2f} days")
+    print(f"   True depth: {depth_true:.4f} ({depth_true*1e6:.0f} ppm)")
+    print(f"   True duration: {duration_true:.3f} days")
+
+    # Generate period grid
+    print("\n2. Generating period grid...")
+    periods = tls_grids.period_grid_ofir(
+        t, R_star=1.0, M_star=1.0,
+        oversampling_factor=3,
+        period_min=8.0,
+        period_max=20.0
+    )
+    print(f"   Testing {len(periods)} periods from {periods[0]:.2f} to {periods[-1]:.2f} days")
+
+    # Run TLS search
+    print("\n3. Running TLS search...")
+    if use_gpu:
+        try:
+            results = tls.tls_search_gpu(
+                t, y, dy,
+                periods=periods,
+                R_star=1.0,
+                M_star=1.0
+            )
+            print("   ✓ GPU search completed")
+        except Exception as e:
+            print(f"   ✗ GPU search failed: {e}")
+            print("   Tip: Make sure you have a CUDA-capable GPU and PyCUDA installed")
+            return
+    else:
+        print("   CPU implementation not yet available")
+        return
+
+    # Display results
+    print("\n4. Results:")
+    print(f"   Best period: {results['period']:.4f} ± {results['period_uncertainty']:.4f} days")
+    print(f"   Best depth: {results['depth']:.6f} ({results['depth']*1e6:.1f} ppm)")
+    print(f"   Best duration: {results['duration']:.4f} days")
+    print(f"   Best T0: {results['T0']:.4f} (phase)")
+    print(f"   Number of transits: {results['n_transits']}")
+    print(f"\n   Statistics:")
+    print(f"   SDE: {results['SDE']:.2f}")
+    print(f"   SNR: {results['SNR']:.2f}")
+    print(f"   FAP: {results['FAP']:.2e}")
+
+    # Compare to truth
+    period_error = np.abs(results['period'] - period_true)
+    depth_error = np.abs(results['depth'] - depth_true)
+    duration_error = np.abs(results['duration'] - duration_true)
+
+    print(f"\n   Recovery accuracy:")
+    print(f"   Period error: {period_error:.4f} days ({period_error/period_true*100:.1f}%)")
+    print(f"   Depth error: {depth_error:.6f} ({depth_error/depth_true*100:.1f}%)")
+    print(f"   Duration error: {duration_error:.4f} days ({duration_error/duration_true*100:.1f}%)")
+
+    # Plot results
+    print("\n5. Creating plots...")
+    fig, axes = plt.subplots(2, 2, figsize=(12, 10))
+
+    # Plot 1: Periodogram
+    ax = axes[0, 0]
+    ax.plot(results['periods'], results['power'], 'b-', linewidth=0.5)
+    ax.axvline(period_true, color='r', linestyle='--', label='True period')
+    ax.axvline(results['period'], color='g', linestyle='--', label='Best period')
+    ax.set_xlabel('Period (days)')
+    ax.set_ylabel('Power (detrended SR)')
+    ax.set_title('TLS Periodogram')
+    ax.legend()
+    ax.grid(True, alpha=0.3)
+
+    # Plot 2: Chi-squared
+    ax = axes[0, 1]
+    ax.plot(results['periods'], results['chi2'], 'b-', linewidth=0.5)
+    ax.axvline(period_true, color='r', linestyle='--', label='True period')
+    ax.axvline(results['period'], color='g', linestyle='--', label='Best period')
+    ax.set_xlabel('Period (days)')
+    ax.set_ylabel('Chi-squared')
+    ax.set_title('Chi-squared vs Period')
+    ax.legend()
+    ax.grid(True, alpha=0.3)
+
+    # Plot 3: Phase-folded light curve at best period
+    ax = axes[1, 0]
+    phases = (t % results['period']) / results['period']
+    ax.plot(phases, y, 'k.', alpha=0.3, markersize=2)
+    # Plot best-fit model
+    model_phases = np.linspace(0, 1, 1000)
+    model_flux = np.ones(1000)
+    duration_phase = results['duration'] / results['period']
+    t0_phase = results['T0']
+    in_transit = np.abs((model_phases - t0_phase + 0.5) % 1.0 - 0.5) < duration_phase / 2
+    model_flux[in_transit] = 1 - results['depth']
+    ax.plot(model_phases, model_flux, 'r-', linewidth=2, label='Best-fit model')
+    ax.set_xlabel('Phase')
+    ax.set_ylabel('Relative Flux')
+    ax.set_title(f'Phase-Folded at P={results["period"]:.4f} days')
+    ax.legend()
+    ax.grid(True, alpha=0.3)
+
+    # Plot 4: Raw light curve
+    ax = axes[1, 1]
+    ax.plot(t, y, 'k.', alpha=0.5, markersize=1)
+    ax.set_xlabel('Time (days)')
+    ax.set_ylabel('Relative Flux')
+    ax.set_title('Raw Light Curve')
+    ax.grid(True, alpha=0.3)
+
+    plt.tight_layout()
+    plt.savefig('tls_example_results.png', dpi=150, bbox_inches='tight')
+    print("   ✓ Plot saved to 'tls_example_results.png'")
+
+    print("\n" + "=" * 60)
+    print("Example complete!")
+    print("=" * 60)
+
+
+if __name__ == '__main__':
+    import sys
+
+    # Check for --no-gpu flag
+    use_gpu = '--no-gpu' not in sys.argv
+
+    if use_gpu and not TLS_AVAILABLE:
+        print("Error: TLS modules not available.")
+        print("Make sure you're in the cuvarbase directory or have installed it.")
+        sys.exit(1)
+
+    try:
+        run_tls_example(use_gpu=use_gpu)
+    except KeyboardInterrupt:
+        print("\nInterrupted by user")
+        sys.exit(0)
+    except Exception as e:
+        print(f"\nError running example: {e}")
+        import traceback
+        traceback.print_exc()
+        sys.exit(1)
diff --git a/scripts/benchmark_batch_keplerian.py b/scripts/benchmark_batch_keplerian.py
deleted file mode 100644
index d084473..0000000
--- a/scripts/benchmark_batch_keplerian.py
+++ /dev/null
@@ -1,301 +0,0 @@
-#!/usr/bin/env python3
-"""
-Benchmark BLS with realistic parameters for batch lightcurve processing.
-
-Uses:
-- 10-year time baseline
-- Keplerian frequency/q grids
-- Typical TESS/ground-based survey ndata values
-- Batch processing of multiple lightcurves
-"""
-
-import numpy as np
-import time
-import json
-from datetime import datetime
-
-try:
-    from cuvarbase import bls
-    GPU_AVAILABLE = True
-except Exception as e:
-    GPU_AVAILABLE = False
-    print(f"GPU not available: {e}")
-
-
-def generate_realistic_lightcurve(ndata, time_baseline_years=10, period=None,
-                                   depth=0.01, rho_star=1.0, seed=None):
-    """
-    Generate realistic lightcurve for survey data.
-
-    Parameters
-    ----------
-    ndata : int
-        Number of observations
-    time_baseline_years : float
-        Total time baseline in years
-    period : float, optional
-        Transit period in days. If None, generates noise only.
-    depth : float
-        Transit depth
-    rho_star : float
-        Stellar density in solar units (for Keplerian q)
-    seed : int, optional
-        Random seed
-
-    Returns
-    -------
-    t, y, dy : arrays
-        Time, magnitude, and uncertainties
-    """
-    if seed is not None:
-        np.random.seed(seed)
-
-    # Generate realistic time sampling (gaps, clusters)
-    time_baseline_days = time_baseline_years * 365.25
-
-    # Simulate survey observing pattern: clusters of observations with gaps
-    n_seasons = int(time_baseline_years)
-    points_per_season = ndata // n_seasons
-
-    t_list = []
-    for season in range(n_seasons):
-        season_start = season * 365.25
-        season_end = season_start + 200  # 200-day observing season
-
-        # Random observations within season
-        t_season = np.random.uniform(season_start, season_end, points_per_season)
-        t_list.append(t_season)
-
-    # Add remaining points
-    remaining = ndata - len(np.concatenate(t_list))
-    if remaining > 0:
-        t_extra = np.random.uniform(0, time_baseline_days, remaining)
-        t_list.append(t_extra)
-
-    t = np.sort(np.concatenate(t_list)).astype(np.float32)
-    t = t[:ndata]  # Ensure exact ndata
-
-    y = np.ones(ndata, dtype=np.float32)
-
-    if period is not None:
-        # Add realistic transit signal with Keplerian duration
-        phase = (t % period) / period
-
-        # Transit duration from Keplerian assumption
-        q = bls.q_transit(1.0/period, rho=rho_star)
-
-        in_transit = phase < q
-        y[in_transit] -= depth
-
-    # Add realistic noise
-    scatter = 0.01  # 1% photometric precision
-    y += np.random.normal(0, scatter, ndata).astype(np.float32)
-    dy = np.ones(ndata, dtype=np.float32) * scatter
-
-    return t, y, dy
-
-
-def get_keplerian_grid(t, fmin_frac=1.0, fmax_frac=1.0, samples_per_peak=2,
-                       qmin_fac=0.5, qmax_fac=2.0, rho=1.0):
-    """
-    Generate Keplerian frequency grid for realistic BLS search.
-
-    Parameters
-    ----------
-    t : array
-        Observation times
-    fmin_frac, fmax_frac : float
-        Fraction of auto-determined limits
-    samples_per_peak : float
-        Oversampling factor
-    qmin_fac, qmax_fac : float
-        Fraction of Keplerian q to search
-    rho : float
-        Stellar density in solar units
-
-    Returns
-    -------
-    freqs : array
-        Frequency grid
-    qmins, qmaxes : arrays
-        Min and max q values for each frequency
-    """
-    fmin = bls.fmin_transit(t, rho=rho) * fmin_frac
-    fmax = bls.fmax_transit(rho=rho, qmax=0.5/qmax_fac) * fmax_frac
-
-    freqs, q0vals = bls.transit_autofreq(t, fmin=fmin, fmax=fmax,
-                                         samples_per_peak=samples_per_peak,
-                                         qmin_fac=qmin_fac, qmax_fac=qmax_fac,
-                                         rho=rho)
-
-    qmins = q0vals * qmin_fac
-    qmaxes = q0vals * qmax_fac
-
-    return freqs, qmins, qmaxes
-
-
-def benchmark_single_vs_batch(ndata, n_lightcurves, time_baseline=10, n_trials=3):
-    """
-    Benchmark single lightcurve vs batch processing.
-
-    Parameters
-    ----------
-    ndata : int
-        Number of observations per lightcurve
-    n_lightcurves : int
-        Number of lightcurves to process
-    time_baseline : float
-        Time baseline in years
-    n_trials : int
-        Number of trials
-
-    Returns
-    -------
-    results : dict
-        Benchmark results
-    """
-    print(f"\nBenchmarking ndata={ndata}, n_lightcurves={n_lightcurves}...")
-
-    # Generate realistic lightcurves
-    lightcurves = []
-    for i in range(n_lightcurves):
-        t, y, dy = generate_realistic_lightcurve(ndata, time_baseline_years=time_baseline,
-                                                 period=5.0 if i % 3 == 0 else None,
-                                                 seed=42+i)
-        lightcurves.append((t, y, dy))
-
-    # Generate Keplerian frequency grid (same for all)
-    t0, _, _ = lightcurves[0]
-    freqs, qmins, qmaxes = get_keplerian_grid(t0)
-
-    nfreq = len(freqs)
-    print(f"  Keplerian grid: {nfreq} frequencies")
-    print(f"  Period range: {1/freqs[-1]:.2f} - {1/freqs[0]:.2f} days")
-
-    results = {
-        'ndata': int(ndata),
-        'n_lightcurves': int(n_lightcurves),
-        'nfreq': int(nfreq),
-        'time_baseline_years': float(time_baseline)
-    }
-
-    # Benchmark 1: Sequential processing with standard kernel
-    print("  Sequential (standard)...")
-    times_seq_std = []
-
-    for trial in range(n_trials):
-        start = time.time()
-        for t, y, dy in lightcurves:
-            _ = bls.eebls_gpu_fast(t, y, dy, freqs, qmin=qmins, qmax=qmaxes)
-        elapsed = time.time() - start
-        times_seq_std.append(elapsed)
-
-    mean_seq_std = np.mean(times_seq_std)
-    print(f"    Mean: {mean_seq_std:.3f}s")
-    print(f"    Per LC: {mean_seq_std/n_lightcurves:.3f}s")
-
-    results['sequential_standard'] = {
-        'total_time': float(mean_seq_std),
-        'per_lc_time': float(mean_seq_std / n_lightcurves),
-        'throughput_lc_per_sec': float(n_lightcurves / mean_seq_std)
-    }
-
-    # Benchmark 2: Sequential with adaptive kernel
-    print("  Sequential (adaptive)...")
-    times_seq_adapt = []
-
-    for trial in range(n_trials):
-        start = time.time()
-        for t, y, dy in lightcurves:
-            _ = bls.eebls_gpu_fast_adaptive(t, y, dy, freqs, qmin=qmins, qmax=qmaxes)
-        elapsed = time.time() - start
-        times_seq_adapt.append(elapsed)
-
-    mean_seq_adapt = np.mean(times_seq_adapt)
-    print(f"    Mean: {mean_seq_adapt:.3f}s")
-    print(f"    Per LC: {mean_seq_adapt/n_lightcurves:.3f}s")
-
-    results['sequential_adaptive'] = {
-        'total_time': float(mean_seq_adapt),
-        'per_lc_time': float(mean_seq_adapt / n_lightcurves),
-        'throughput_lc_per_sec': float(n_lightcurves / mean_seq_adapt)
-    }
-
-    # Compute speedups
-    speedup = mean_seq_std / mean_seq_adapt
-    print(f"  Speedup (adaptive vs standard): {speedup:.2f}x")
-
-    results['speedup_adaptive_vs_standard'] = float(speedup)
-
-    # Estimate cost savings
-    cost_per_hour = 0.34  # RunPod RTX 4000 Ada spot price
-    hours_std = (mean_seq_std / 3600) * (5e6 / n_lightcurves)  # Scale to 5M LCs
-    hours_adapt = (mean_seq_adapt / 3600) * (5e6 / n_lightcurves)
-
-    cost_std = hours_std * cost_per_hour
-    cost_adapt = hours_adapt * cost_per_hour
-    cost_savings = cost_std - cost_adapt
-
-    print(f"\n  Estimated cost for 5M lightcurves:")
-    print(f"    Standard: ${cost_std:.2f} ({hours_std:.1f} hours)")
-    print(f"    Adaptive: ${cost_adapt:.2f} ({hours_adapt:.1f} hours)")
-    print(f"    Savings: ${cost_savings:.2f} ({100*(1-cost_adapt/cost_std):.1f}%)")
-
-    results['cost_estimate_5M_lcs'] = {
-        'standard_usd': float(cost_std),
-        'adaptive_usd': float(cost_adapt),
-        'savings_usd': float(cost_savings),
-        'savings_percent': float(100*(1-cost_adapt/cost_std))
-    }
-
-    return results
-
-
-def main():
-    """Run realistic batch benchmark."""
-    print("=" * 80)
-    print("BATCH KEPLERIAN BLS BENCHMARK")
-    print("=" * 80)
-    print("\nRealistic parameters:")
-    print("  - 10-year time baseline")
-    print("  - Keplerian frequency/q grids")
-    print("  - Survey-like time sampling (seasonal gaps)")
-    print()
-
-    if not GPU_AVAILABLE:
-        print("ERROR: GPU not available")
-        return
-
-    all_results = {
-        'timestamp': datetime.now().isoformat(),
-        'benchmarks': []
-    }
-
-    # Test configurations representing different survey types
-    configs = [
-        # (ndata, n_lcs, description)
-        (100, 10, "Sparse ground-based (e.g., MEarth, HATNet)"),
-        (500, 10, "Dense ground-based (e.g., NGTS, HATPI)"),
-        (20000, 5, "Space-based (e.g., TESS, Kepler)"),
-    ]
-
-    for ndata, n_lcs, desc in configs:
-        print(f"\n{desc}")
-        print("-" * 80)
-
-        results = benchmark_single_vs_batch(ndata, n_lcs, time_baseline=10, n_trials=3)
-        results['description'] = desc
-        all_results['benchmarks'].append(results)
-
-    # Save results
-    filename = 'bls_batch_keplerian_benchmark.json'
-    with open(filename, 'w') as f:
-        json.dump(all_results, f, indent=2)
-
-    print(f"\n{'=' * 80}")
-    print(f"Results saved to: {filename}")
-    print("=" * 80)
-
-
-if __name__ == '__main__':
-    main()
diff --git a/scripts/gpu-test.sh b/scripts/gpu-test.sh
new file mode 100755
index 0000000..fa8d327
--- /dev/null
+++ b/scripts/gpu-test.sh
@@ -0,0 +1,75 @@
+#!/bin/bash
+# One-shot: create pod -> setup -> run tests -> stop pod.
+#
+# Usage:
+#   ./scripts/gpu-test.sh                                          # Run all tests
+#   ./scripts/gpu-test.sh cuvarbase/tests/test_tls_basic.py -v     # Specific tests
+#   ./scripts/gpu-test.sh --keep cuvarbase/tests/test_tls_basic.py # Don't stop pod after
+
+set -e
+
+KEEP_POD=false
+if [ "$1" = "--keep" ]; then
+    KEEP_POD=true
+    shift
+fi
+
+TEST_ARGS="${@:-cuvarbase/tests/test_tls_basic.py -v}"
+
+echo "========================================"
+echo "GPU Test: full lifecycle"
+echo "========================================"
+echo ""
+
+# Step 1: Create pod (if not already running)
+source .runpod.env 2>/dev/null || true
+
+NEED_CREATE=true
+if [ -n "${RUNPOD_POD_ID}" ] && [ -n "${RUNPOD_API_KEY}" ]; then
+    # Check if existing pod is still running
+    API_URL="https://api.runpod.io/graphql?api_key=${RUNPOD_API_KEY}"
+    STATUS=$(curl -s --request POST \
+        --header 'content-type: application/json' \
+        --url "${API_URL}" \
+        --data "{\"query\": \"query { pod(input: {podId: \\\"${RUNPOD_POD_ID}\\\"}) { desiredStatus } }\"}" \
+        | python3 -c "
+import sys, json
+try:
+    data = json.load(sys.stdin)
+    pod = data.get('data', {}).get('pod')
+    print(pod['desiredStatus'] if pod else 'GONE')
+except: print('GONE')
+" 2>/dev/null)
+
+    if [ "${STATUS}" = "RUNNING" ]; then
+        echo "Reusing existing pod ${RUNPOD_POD_ID}"
+        NEED_CREATE=false
+    fi
+fi
+
+if [ "${NEED_CREATE}" = true ]; then
+    echo "Step 1: Creating pod..."
+    ./scripts/runpod-create.sh
+    echo ""
+    echo "Step 2: Setting up environment..."
+    ./scripts/setup-remote.sh
+else
+    echo "Step 1: Pod already running, syncing code..."
+    ./scripts/sync-to-runpod.sh
+fi
+
+echo ""
+echo "Step 3: Running tests..."
+echo "========================================"
+./scripts/test-remote.sh ${TEST_ARGS}
+TEST_EXIT=$?
+
+echo ""
+if [ "${KEEP_POD}" = true ]; then
+    echo "Pod kept running (--keep flag). Stop with: ./scripts/runpod-stop.sh"
+else
+    echo "Step 4: Stopping pod..."
+    ./scripts/runpod-stop.sh
+fi
+
+exit ${TEST_EXIT}
diff --git a/scripts/runpod-create.sh b/scripts/runpod-create.sh
new file mode 100755
index 0000000..617b6f8
--- /dev/null
+++ b/scripts/runpod-create.sh
@@ -0,0 +1,205 @@
+#!/bin/bash
+# Create a RunPod GPU pod and configure .runpod.env for SSH access.
+#
+# Usage:
+#   ./scripts/runpod-create.sh              # Default: cheapest available GPU
+#   ./scripts/runpod-create.sh "NVIDIA RTX A4000"  # Specific GPU type
+#
+# Requires RUNPOD_API_KEY in .runpod.env
+
+set -e
+
+# Load config
+if [ ! -f .runpod.env ]; then
+    echo "Error: .runpod.env not found. Copy .runpod.env.template and add your RUNPOD_API_KEY."
+    exit 1
+fi
+source .runpod.env
+
+if [ -z "${RUNPOD_API_KEY}" ]; then
+    echo "Error: RUNPOD_API_KEY not set in .runpod.env"
+    echo "Get your key from https://www.runpod.io/console/user/settings"
+    exit 1
+fi
+
+GPU_TYPE="${1:-NVIDIA RTX A4000}"
+POD_NAME="cuvarbase-dev"
+IMAGE="runpod/pytorch:2.4.0-py3.11-cuda12.4.1-devel-ubuntu22.04"
+VOLUME_GB=20
+DISK_GB=20
+API_URL="https://api.runpod.io/graphql?api_key=${RUNPOD_API_KEY}"
+
+echo "Creating RunPod instance..."
+echo "  GPU: ${GPU_TYPE}"
+echo "  Image: ${IMAGE}"
+
+# Create pod
+RESPONSE=$(curl -s --request POST \
+    --header 'content-type: application/json' \
+    --url "${API_URL}" \
+    --data "{\"query\": \"mutation { podFindAndDeployOnDemand(input: { cloudType: ALL, gpuCount: 1, volumeInGb: ${VOLUME_GB}, containerDiskInGb: ${DISK_GB}, minVcpuCount: 2, minMemoryInGb: 15, gpuTypeId: \\\"${GPU_TYPE}\\\", name: \\\"${POD_NAME}\\\", imageName: \\\"${IMAGE}\\\", ports: \\\"22/tcp\\\", volumeMountPath: \\\"/workspace\\\" }) { id costPerHr } }\"}")
+
+# Extract pod ID
+POD_ID=$(echo "${RESPONSE}" | python3 -c "
+import sys, json
+data = json.load(sys.stdin)
+if 'errors' in data:
+    print('ERROR: ' + data['errors'][0]['message'], file=sys.stderr)
+    sys.exit(1)
+pod = data['data']['podFindAndDeployOnDemand']
+print(pod['id'])
+" 2>&1)
+
+if [[ "${POD_ID}" == ERROR:* ]]; then
+    echo "${POD_ID}"
+    echo ""
+    echo "Full response: ${RESPONSE}"
+    exit 1
+fi
+
+COST=$(echo "${RESPONSE}" | python3 -c "
+import sys, json
+data = json.load(sys.stdin)
+print(data['data']['podFindAndDeployOnDemand']['costPerHr'])
+")
+
+echo "Pod created: ${POD_ID} (\$${COST}/hr)"
+echo "Waiting for pod to start..."
+
+# Poll until running and SSH is available
+MAX_WAIT=180
+WAITED=0
+SSH_IP=""
+SSH_PORT=""
+
+while [ ${WAITED} -lt ${MAX_WAIT} ]; do
+    sleep 5
+    WAITED=$((WAITED + 5))
+
+    STATUS_RESPONSE=$(curl -s --request POST \
+        --header 'content-type: application/json' \
+        --url "${API_URL}" \
+        --data "{\"query\": \"query { pod(input: {podId: \\\"${POD_ID}\\\"}) { id desiredStatus runtime { uptimeInSeconds ports { ip isIpPublic privatePort publicPort type } } } }\"}")
+
+    # Parse status
+    eval "$(echo "${STATUS_RESPONSE}" | python3 -c "
+import sys, json
+data = json.load(sys.stdin)
+pod = data['data']['pod']
+status = pod.get('desiredStatus', 'UNKNOWN')
+print(f'POD_STATUS={status}')
+runtime = pod.get('runtime')
+if runtime and runtime.get('ports'):
+    for port in runtime['ports']:
+        if port['privatePort'] == 22 and port['isIpPublic']:
+            print(f'SSH_IP={port[\"ip\"]}')
+            print(f'SSH_PORT={port[\"publicPort\"]}')
+")"
+
+    printf "\r  Status: %-10s Waited: %ds" "${POD_STATUS}" "${WAITED}"
+
+    if [ -n "${SSH_IP}" ] && [ -n "${SSH_PORT}" ]; then
+        echo ""
+        break
+    fi
+done
+
+if [ -z "${SSH_IP}" ] || [ -z "${SSH_PORT}" ]; then
+    echo ""
+    echo "Error: Pod did not become SSH-ready within ${MAX_WAIT}s"
+    echo "Pod ID: ${POD_ID} (check RunPod dashboard)"
+    echo "Last status: ${POD_STATUS}"
+    exit 1
+fi
+
+echo "SSH port reported: ${SSH_IP}:${SSH_PORT}"
+
+SSH_KEY_OPT=""
+if [ -f ~/.ssh/id_ed25519 ]; then
+    SSH_KEY_OPT="-i ~/.ssh/id_ed25519"
+fi
+
+# Get podHostId for proxy SSH
+echo "Getting proxy SSH credentials..."
+POD_HOST_ID=$(curl -s --request POST \
+    --header "content-type: application/json" \
+    --url "${API_URL}" \
+    --data "{\"query\": \"query { pod(input: {podId: \\\"${POD_ID}\\\"}) { machine { podHostId } } }\"}" \
+    | python3 -c "import sys, json; print(json.load(sys.stdin)['data']['pod']['machine']['podHostId'])")
+
+echo "Pod host ID: ${POD_HOST_ID}"
+
+# Start SSHD via RunPod proxy (the image doesn't auto-start it)
+echo "Starting SSH daemon via RunPod proxy..."
+PROXY_SSH="ssh -tt -o ConnectTimeout=15 -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${SSH_KEY_OPT} ${POD_HOST_ID}@ssh.runpod.io"
+
+echo 'ssh-keygen -A 2>/dev/null; service ssh start; mkdir -p /root/.ssh; chmod 700 /root/.ssh; echo "SSHD_SETUP_DONE"; exit' \
+    | ${PROXY_SSH} 2>&1 | grep -q "SSHD_SETUP_DONE" && echo "SSHD started." || echo "Warning: SSHD setup may have failed."
+
+# Add local SSH public key to authorized_keys
+if [ -f ~/.ssh/id_ed25519.pub ]; then
+    LOCAL_PUBKEY=$(cat ~/.ssh/id_ed25519.pub)
+    echo "mkdir -p /root/.ssh && echo \"${LOCAL_PUBKEY}\" >> /root/.ssh/authorized_keys && chmod 600 /root/.ssh/authorized_keys && echo AUTH_OK; exit" \
+        | ${PROXY_SSH} 2>&1 | grep -q "AUTH_OK" && echo "SSH key authorized." || echo "Warning: key setup may have failed."
+fi
+
+# Wait for direct SSH to accept connections
+echo "Waiting for direct SSH..."
+SSH_READY=false
+SSH_WAIT=0
+SSH_MAX_WAIT=30
+while [ ${SSH_WAIT} -lt ${SSH_MAX_WAIT} ]; do
+    if ssh -o ConnectTimeout=3 -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o BatchMode=yes \
+        ${SSH_KEY_OPT} -p ${SSH_PORT} root@${SSH_IP} "echo ok" >/dev/null 2>&1; then
+        SSH_READY=true
+        break
+    fi
+    sleep 3
+    SSH_WAIT=$((SSH_WAIT + 3))
+    printf "\r  SSH wait: %ds" "${SSH_WAIT}"
+done
+echo ""
+
+if [ "${SSH_READY}" != true ]; then
+    echo "Warning: Direct SSH not responding. Proxy SSH should still work."
+fi
+
+echo "SSH ready: ${SSH_IP}:${SSH_PORT}"
+
+# Update .runpod.env with new connection details (preserve API key and other settings)
+python3 -c "
+import re
+
+with open('.runpod.env', 'r') as f:
+    content = f.read()
+
+replacements = {
+    'RUNPOD_SSH_HOST': '${SSH_IP}',
+    'RUNPOD_SSH_PORT': '${SSH_PORT}',
+    'RUNPOD_SSH_USER': 'root',
+    'RUNPOD_POD_ID': '${POD_ID}',
+}
+
+for key, val in replacements.items():
+    pattern = rf'^#?\s*{key}=.*$'
+    replacement = f'{key}={val}'
+    if re.search(pattern, content, re.MULTILINE):
+        content = re.sub(pattern, replacement, content, flags=re.MULTILINE)
+    else:
+        content = content.rstrip() + f'\n{replacement}\n'
+
+with open('.runpod.env', 'w') as f:
+    f.write(content)
+"
+
+echo ""
+echo "Updated .runpod.env with new connection details."
+echo ""
+echo "Pod ID:  ${POD_ID}"
+echo "SSH:     ssh -i ~/.ssh/id_ed25519 -p ${SSH_PORT} root@${SSH_IP}"
+echo "Cost:    \$${COST}/hr"
+echo ""
+echo "Next steps:"
+echo "  ./scripts/setup-remote.sh                          # Install cuvarbase"
+echo "  ./scripts/test-remote.sh cuvarbase/tests/test_tls_basic.py -v  # Run TLS tests"
+echo "  ./scripts/runpod-stop.sh                           # Stop pod when done"
diff --git a/scripts/runpod-stop.sh b/scripts/runpod-stop.sh
new file mode 100755
index 0000000..eb88393
--- /dev/null
+++ b/scripts/runpod-stop.sh
@@ -0,0 +1,42 @@
+#!/bin/bash
+# Stop (or terminate) the RunPod pod.
+#
+# Usage:
+#   ./scripts/runpod-stop.sh            # Stop (can resume later, keeps volume)
+#   ./scripts/runpod-stop.sh --terminate # Terminate (deletes everything)
+
+set -e
+
+if [ ! -f .runpod.env ]; then
+    echo "Error: .runpod.env not found"
+    exit 1
+fi
+source .runpod.env
+
+if [ -z "${RUNPOD_API_KEY}" ]; then
+    echo "Error: RUNPOD_API_KEY not set in .runpod.env"
+    exit 1
+fi
+
+if [ -z "${RUNPOD_POD_ID}" ]; then
+    echo "Error: RUNPOD_POD_ID not set in .runpod.env (no active pod?)"
+    exit 1
+fi
+
+API_URL="https://api.runpod.io/graphql?api_key=${RUNPOD_API_KEY}"
+
+if [ "$1" = "--terminate" ]; then
+    echo "Terminating pod ${RUNPOD_POD_ID}..."
+    RESPONSE=$(curl -s --request POST \
+        --header 'content-type: application/json' \
+        --url "${API_URL}" \
+        --data "{\"query\": \"mutation { podTerminate(input: {podId: \\\"${RUNPOD_POD_ID}\\\"}) }\"}")
+    echo "Pod terminated."
+else
+    echo "Stopping pod ${RUNPOD_POD_ID}..."
+    RESPONSE=$(curl -s --request POST \
+        --header 'content-type: application/json' \
+        --url "${API_URL}" \
+        --data "{\"query\": \"mutation { podStop(input: {podId: \\\"${RUNPOD_POD_ID}\\\"}) { id desiredStatus } }\"}")
+    echo "Pod stopped. Resume later from the RunPod dashboard, or re-run ./scripts/runpod-create.sh"
+fi
diff --git a/scripts/setup-remote.sh b/scripts/setup-remote.sh
index a955181..d2f9319 100755
--- a/scripts/setup-remote.sh
+++ b/scripts/setup-remote.sh
@@ -13,7 +13,7 @@ fi
 source .runpod.env
 
 # Build SSH connection string
-SSH_OPTS="-p ${RUNPOD_SSH_PORT}"
+SSH_OPTS="-p ${RUNPOD_SSH_PORT} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o LogLevel=ERROR"
 if [ -n "${RUNPOD_SSH_KEY}" ]; then
     SSH_OPTS="${SSH_OPTS} -i ${RUNPOD_SSH_KEY}"
 fi
@@ -35,10 +35,16 @@ set -e
 
 cd /workspace/cuvarbase
 
-# Set up CUDA environment
-export PATH=/usr/local/cuda-12.8/bin:$PATH
-export CUDA_HOME=/usr/local/cuda-12.8
-export LD_LIBRARY_PATH=/usr/local/cuda-12.8/lib64:$LD_LIBRARY_PATH
+# Set up CUDA environment (auto-detect version)
+if [ -d /usr/local/cuda ]; then
+    export PATH=/usr/local/cuda/bin:$PATH
+    export CUDA_HOME=/usr/local/cuda
+    export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
+elif [ -d /usr/local/cuda-12.4 ]; then
+    export PATH=/usr/local/cuda-12.4/bin:$PATH
+    export CUDA_HOME=/usr/local/cuda-12.4
+    export LD_LIBRARY_PATH=/usr/local/cuda-12.4/lib64:$LD_LIBRARY_PATH
+fi
 
 # Check if CUDA is available
 echo "Checking CUDA availability..."
@@ -61,47 +67,10 @@ import re
 import os
 import glob
 
-# Find skcuda installation (could be in different python versions)
-skcuda_paths = glob.glob('/usr/local/lib/python*/dist-packages/skcuda/misc.py')
-if not skcuda_paths:
-    print("Warning: skcuda/misc.py not found, skipping patch")
-    exit(0)
-
-misc_path = skcuda_paths[0]
-print(f"Patching {misc_path}...")
-
-# Read the file
-with open(misc_path, 'r') as f:
-    content = f.read()
-
-# Replace the problematic lines around line 637
-old_code = """# List of available numerical types provided by numpy:
-num_types = [np.sctypeDict[t] for t in \\
-             np.typecodes['AllInteger']+np.typecodes['AllFloat']]"""
-
-new_code = """# List of available numerical types provided by numpy:
-# Fixed for numpy 2.x compatibility
-try:
-    num_types = [np.sctypeDict[t] for t in \\
-                 np.typecodes['AllInteger']+np.typecodes['AllFloat']]
-except KeyError:
-    # numpy 2.x: build list manually
-    num_types = [np.int8, np.int16, np.int32, np.int64,
-                 np.uint8, np.uint16, np.uint32, np.uint64,
-                 np.float16, np.float32, np.float64]"""
-
-if old_code in content:
-    content = content.replace(old_code, new_code)
-    with open(misc_path, 'w') as f:
-        f.write(content)
-    print(f"✓ Patched {misc_path}")
-else:
-    print(f"Note: Already patched or code structure changed")
-
-# Patch np.sctypes usage across all scikit-cuda files
-print("")
-print("Patching np.sctypes usage in scikit-cuda...")
 skcuda_files = glob.glob('/usr/local/lib/python*/dist-packages/skcuda/*.py')
+if not skcuda_files:
+    print("Warning: skcuda not found, skipping patch")
+    exit(0)
 
 for filepath in skcuda_files:
     with open(filepath, 'r') as f:
@@ -109,34 +78,28 @@ for filepath in skcuda_files:
 
     original = content
 
-    # Replace np.sctypes with explicit types
-    content = re.sub(
-        r'np\.sctypes\[(["\'])float\1\]',
-        '[np.float16, np.float32, np.float64]',
-        content
-    )
+    # Replace num_types list comprehension using typeDict or sctypeDict
+    # This handles both np.typeDict and np.sctypeDict variants
     content = re.sub(
-        r'np\.sctypes\[(["\'])int\1\]',
-        '[np.int8, np.int16, np.int32, np.int64]',
-        content
-    )
-    content = re.sub(
-        r'np\.sctypes\[(["\'])uint\1\]',
-        '[np.uint8, np.uint16, np.uint32, np.uint64]',
-        content
-    )
-    content = re.sub(
-        r'np\.sctypes\[(["\'])complex\1\]',
-        '[np.complex64, np.complex128]',
+        r'num_types\s*=\s*\[np\.(?:type|sctype)Dict\[t\]\s+for\s+t\s+in\s*\\?\s*\n\s*np\.typecodes\[.AllInteger.\]\+np\.typecodes\[.AllFloat.\]\]',
+        'num_types = [np.int8, np.int16, np.int32, np.int64,\n'
+        '             np.uint8, np.uint16, np.uint32, np.uint64,\n'
+        '             np.float16, np.float32, np.float64]',
         content
     )
 
+    # Replace np.sctypes with explicit types
+    content = re.sub(r'np\.sctypes\[(["\'])float\1\]', '[np.float16, np.float32, np.float64]', content)
+    content = re.sub(r'np\.sctypes\[(["\'])int\1\]', '[np.int8, np.int16, np.int32, np.int64]', content)
+    content = re.sub(r'np\.sctypes\[(["\'])uint\1\]', '[np.uint8, np.uint16, np.uint32, np.uint64]', content)
+    content = re.sub(r'np\.sctypes\[(["\'])complex\1\]', '[np.complex64, np.complex128]', content)
+
     if content != original:
         with open(filepath, 'w') as f:
             f.write(content)
-        print(f"✓ Patched {os.path.basename(filepath)}")
+        print(f"  Patched {os.path.basename(filepath)}")
 
-print("✓ All scikit-cuda files patched for numpy 2.x compatibility")
+print("All scikit-cuda files patched for numpy 2.x compatibility")
 ENDPYTHON
 
 echo ""
diff --git a/scripts/sync-to-runpod.sh b/scripts/sync-to-runpod.sh
index bbbba6a..a47201d 100755
--- a/scripts/sync-to-runpod.sh
+++ b/scripts/sync-to-runpod.sh
@@ -13,7 +13,7 @@ fi
 source .runpod.env
 
 # Build SSH connection string
-SSH_OPTS="-p ${RUNPOD_SSH_PORT}"
+SSH_OPTS="-p ${RUNPOD_SSH_PORT} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o LogLevel=ERROR"
 if [ -n "${RUNPOD_SSH_KEY}" ]; then
     SSH_OPTS="${SSH_OPTS} -i ${RUNPOD_SSH_KEY}"
 fi
diff --git a/scripts/test-remote.sh b/scripts/test-remote.sh
index a242b4f..678df14 100755
--- a/scripts/test-remote.sh
+++ b/scripts/test-remote.sh
@@ -13,7 +13,7 @@ fi
 source .runpod.env
 
 # Build SSH connection string
-SSH_OPTS="-p ${RUNPOD_SSH_PORT}"
+SSH_OPTS="-p ${RUNPOD_SSH_PORT} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o LogLevel=ERROR"
 if [ -n "${RUNPOD_SSH_KEY}" ]; then
     SSH_OPTS="${SSH_OPTS} -i ${RUNPOD_SSH_KEY}"
 fi
@@ -40,7 +40,7 @@ echo "Step 2: Running tests on RunPod..."
 echo "=========================================="
 
 # Run tests remotely and stream output
-ssh ${SSH_OPTS} ${SSH_HOST} "export PATH=/usr/local/cuda-12.8/bin:\$PATH && export CUDA_HOME=/usr/local/cuda-12.8 && export LD_LIBRARY_PATH=/usr/local/cuda-12.8/lib64:\$LD_LIBRARY_PATH && cd ${RUNPOD_REMOTE_DIR} && pytest ${TEST_PATH} ${PYTEST_ARGS} -v"
+ssh ${SSH_OPTS} ${SSH_HOST} "export PATH=/usr/local/cuda/bin:\$PATH && export CUDA_HOME=/usr/local/cuda && export LD_LIBRARY_PATH=/usr/local/cuda/lib64:\$LD_LIBRARY_PATH && cd ${RUNPOD_REMOTE_DIR} && pytest ${TEST_PATH} ${PYTEST_ARGS} -v"
 
 echo ""
 echo "=========================================="