diff --git a/.runpod.env.template b/.runpod.env.template index 8137684..6ad5a55 100644 --- a/.runpod.env.template +++ b/.runpod.env.template @@ -14,6 +14,9 @@ RUNPOD_SSH_USER=root # Remote paths RUNPOD_REMOTE_DIR=/workspace/cuvarbase -# RunPod API Key (optional, for advanced automation) +# RunPod API Key (required for scripts/runpod-create.sh and scripts/gpu-test.sh) # Get from https://www.runpod.io/console/user/settings -# RUNPOD_API_KEY=your-api-key-here +RUNPOD_API_KEY= + +# Pod ID (auto-populated by runpod-create.sh) +# RUNPOD_POD_ID= diff --git a/README.md b/README.md index bab019c..89b0c8b 100644 --- a/README.md +++ b/README.md @@ -130,6 +130,12 @@ Currently includes implementations of: - Sparse BLS ([Panahi & Zucker 2021](https://arxiv.org/abs/2103.06193)) for small datasets (< 500 observations) - GPU implementation: `sparse_bls_gpu()` (default) - CPU implementation: `sparse_bls_cpu()` (fallback) +- **Transit Least Squares ([TLS](https://ui.adsabs.harvard.edu/abs/2019A%26A...623A..39H/abstract))** - GPU-accelerated transit detection with optimal depth fitting + - **35-202× faster** than CPU TLS (transitleastsquares package) + - Keplerian-aware duration constraints (`tls_transit()`) - searches physically plausible transit durations + - Standard mode (`tls_search_gpu()`) for custom period/duration grids + - Optimal period grid sampling (Ofir 2014) + - Supports datasets up to ~100,000 observations (optimal: 500-20,000) - **Non-equispaced fast Fourier transform (NFFT)** - Adjoint operation ([paper](http://epubs.siam.org/doi/abs/10.1137/0914081)) - **NUFFT-based Likelihood Ratio Test (LRT)** - Transit detection with correlated noise (contributed by Jamila Taaki) - Matched filter in frequency domain with adaptive noise estimation @@ -196,6 +202,8 @@ Full documentation is available at: https://johnh2o2.github.io/cuvarbase/ ## Quick Start +### Box Least Squares (BLS) - Transit Detection + ```python import numpy as np from cuvarbase import bls @@ -205,7 +213,6 @@ t = np.sort(np.random.uniform(0, 10, 1000)).astype(np.float32) y = np.sin(2 * np.pi * t / 2.5) + np.random.normal(0, 0.1, len(t)) dy = np.ones_like(y) * 0.1 # uncertainties -# Box Least Squares (BLS) - Transit detection # Define frequency grid freqs = np.linspace(0.1, 2.0, 5000).astype(np.float32) @@ -218,6 +225,36 @@ print(f"Best period: {1/best_freq:.2f} (expected: 2.5)") power_adaptive = bls.eebls_gpu_fast_adaptive(t, y, dy, freqs) ``` +### Transit Least Squares (TLS) - Advanced Transit Detection + +```python +from cuvarbase import tls + +# Generate transit data +t = np.sort(np.random.uniform(0, 50, 500)).astype(np.float32) +y = np.ones(len(t), dtype=np.float32) +dy = np.ones(len(t), dtype=np.float32) * 0.001 + +# Add 1% transit at 10-day period +phase = (t % 10.0) / 10.0 +in_transit = (phase < 0.01) | (phase > 0.99) +y[in_transit] -= 0.01 +y += np.random.normal(0, 0.001, len(t)).astype(np.float32) + +# TLS with Keplerian duration constraints (35-202x faster than CPU TLS!) +results = tls.tls_transit( + t, y, dy, + R_star=1.0, # Solar radii + M_star=1.0, # Solar masses + period_min=5.0, + period_max=20.0 +) + +print(f"Best period: {results['period']:.2f} days") +print(f"Transit depth: {results['depth']:.4f}") +print(f"SDE: {results['SDE']:.1f}") +``` + For more advanced usage including Lomb-Scargle and Conditional Entropy, see the [full documentation](https://johnh2o2.github.io/cuvarbase/) and [examples/](examples/). ## Using Multiple GPUs diff --git a/compare_gpu_cpu_depth.py b/compare_gpu_cpu_depth.py new file mode 100644 index 0000000..f0ffc38 --- /dev/null +++ b/compare_gpu_cpu_depth.py @@ -0,0 +1,69 @@ +#!/usr/bin/env python3 +"""Compare GPU and CPU TLS depth calculations""" +import numpy as np +from cuvarbase import tls as gpu_tls +from transitleastsquares import transitleastsquares as cpu_tls + +# Generate test data +np.random.seed(42) +ndata = 500 +t = np.sort(np.random.uniform(0, 50, ndata)) +y = np.ones(ndata, dtype=np.float32) + +# Add transit +period_true = 10.0 +depth_true = 0.01 # Fractional dip +phase = (t % period_true) / period_true +in_transit = (phase < 0.01) | (phase > 0.99) +y[in_transit] -= depth_true +y += np.random.normal(0, 0.001, ndata).astype(np.float32) +dy = np.ones(ndata, dtype=np.float32) * 0.001 + +print(f"Test data:") +print(f" N = {ndata}") +print(f" Period = {period_true:.1f} days") +print(f" Depth (fractional dip) = {depth_true:.3f}") +print(f" Points in transit: {np.sum(in_transit)}") +print(f" Measured depth: {np.mean(y[~in_transit]) - np.mean(y[in_transit]):.6f}") + +# GPU TLS +print(f"\n--- GPU TLS ---") +gpu_result = gpu_tls.tls_search_gpu( + t.astype(np.float32), y, dy, + period_min=9.0, + period_max=11.0 +) + +print(f"Period: {gpu_result['period']:.4f} (error: {abs(gpu_result['period'] - period_true)/period_true*100:.2f}%)") +print(f"Depth: {gpu_result['depth']:.6f}") +print(f"Duration: {gpu_result['duration']:.4f} days") +print(f"T0: {gpu_result['T0']:.4f}") + +# CPU TLS +print(f"\n--- CPU TLS ---") +model = cpu_tls(t, y, dy) +cpu_result = model.power( + period_min=9.0, + period_max=11.0, + n_transits_min=2 +) + +print(f"Period: {cpu_result.period:.4f} (error: {abs(cpu_result.period - period_true)/period_true*100:.2f}%)") +print(f"Depth (flux ratio): {cpu_result.depth:.6f}") +print(f"Depth (fractional dip): {1 - cpu_result.depth:.6f}") +print(f"Duration: {cpu_result.duration:.4f} days") +print(f"T0: {cpu_result.T0:.4f}") + +# Compare +print(f"\n--- Comparison ---") +print(f"Period agreement: {abs(gpu_result['period'] - cpu_result.period):.4f} days") +print(f"Duration agreement: {abs(gpu_result['duration'] - cpu_result.duration):.4f} days") + +# Check depth conventions +gpu_depth_frac = gpu_result['depth'] # GPU reports fractional dip +cpu_depth_frac = 1 - cpu_result.depth # CPU reports flux ratio + +print(f"\nDepth (fractional dip convention):") +print(f" True: {depth_true:.6f}") +print(f" GPU: {gpu_depth_frac:.6f} (error: {abs(gpu_depth_frac - depth_true)/depth_true*100:.1f}%)") +print(f" CPU: {cpu_depth_frac:.6f} (error: {abs(cpu_depth_frac - depth_true)/depth_true*100:.1f}%)") diff --git a/cuvarbase/kernels/tls.cu b/cuvarbase/kernels/tls.cu new file mode 100644 index 0000000..c2183b7 --- /dev/null +++ b/cuvarbase/kernels/tls.cu @@ -0,0 +1,510 @@ +/* + * Transit Least Squares (TLS) GPU kernel + * + * Optimized kernel using bitonic sort for phase sorting and a + * limb-darkened transit template for physically realistic fitting. + * + * The transit template is a 1D array mapping transit_coord in [-1, 1] + * to normalized depth in [0, 1], precomputed on the CPU using batman + * (or a trapezoidal fallback) and loaded into shared memory. + * + * References: + * [1] Hippke & Heller (2019), A&A 623, A39 + * [2] Kovacs et al. (2002), A&A 391, 369 + */ + +#include + +//{CPP_DEFS} + +#ifndef BLOCK_SIZE +#define BLOCK_SIZE 128 +#endif + +#define MAX_NDATA 100000 +#define PI 3.141592653589793f +#define WARP_SIZE 32 + +// Device utility functions +__device__ inline float mod1(float x) { + return x - floorf(x); +} + +/** + * Bitonic sort for phase-folded data + * O(N log^2 N) parallel sort, requires padding to next power of 2 + */ +__device__ void bitonic_sort_phases( + float* phases, + float* y_sorted, + float* dy_sorted, + int ndata) +{ + int tid = threadIdx.x; + int stride = blockDim.x; + + // Compute next power of 2 >= ndata + int n_pow2 = 1; + while (n_pow2 < ndata) n_pow2 <<= 1; + + // Bitonic sort: outer loop over power-of-2 sizes + for (int k = 2; k <= n_pow2; k *= 2) { + for (int j = k / 2; j > 0; j /= 2) { + for (int i = tid; i < n_pow2; i += stride) { + int ixj = i ^ j; + if (ixj > i && ixj < ndata && i < ndata) { + if ((i & k) == 0) { + // Ascending + if (phases[i] > phases[ixj]) { + float temp = phases[i]; + phases[i] = phases[ixj]; + phases[ixj] = temp; + temp = y_sorted[i]; + y_sorted[i] = y_sorted[ixj]; + y_sorted[ixj] = temp; + temp = dy_sorted[i]; + dy_sorted[i] = dy_sorted[ixj]; + dy_sorted[ixj] = temp; + } + } else { + // Descending + if (phases[i] < phases[ixj]) { + float temp = phases[i]; + phases[i] = phases[ixj]; + phases[ixj] = temp; + temp = y_sorted[i]; + y_sorted[i] = y_sorted[ixj]; + y_sorted[ixj] = temp; + temp = dy_sorted[i]; + dy_sorted[i] = dy_sorted[ixj]; + dy_sorted[ixj] = temp; + } + } + } + } + __syncthreads(); + } + } +} + +/** + * Look up transit template value with linear interpolation. + * + * Maps transit_coord in [-1, 1] to template index, does linear + * interpolation between adjacent samples. Returns 0 outside [-1, 1]. + * + * s_template: shared memory pointer to template array + * n_template: number of template samples + * transit_coord: position within transit, [-1, 1] + */ +__device__ float lookup_template(const float* s_template, int n_template, + float transit_coord) +{ + if (transit_coord < -1.0f || transit_coord > 1.0f) + return 0.0f; + + // Map [-1, 1] to [0, n_template - 1] + float idx_f = (transit_coord + 1.0f) * 0.5f * (float)(n_template - 1); + + int idx0 = (int)floorf(idx_f); + int idx1 = idx0 + 1; + + // Clamp + if (idx0 < 0) idx0 = 0; + if (idx1 >= n_template) idx1 = n_template - 1; + if (idx0 >= n_template) idx0 = n_template - 1; + + float frac = idx_f - floorf(idx_f); + + return s_template[idx0] * (1.0f - frac) + s_template[idx1] * frac; +} + +/** + * Calculate optimal transit depth using weighted least squares + * with limb-darkened transit template. + */ +__device__ float calculate_optimal_depth( + const float* y_sorted, + const float* dy_sorted, + const float* phases_sorted, + const float* s_template, + int n_template, + float duration_phase, + float t0_phase, + int ndata) +{ + float numerator = 0.0f; + float denominator = 0.0f; + + float half_dur = duration_phase * 0.5f; + + for (int i = 0; i < ndata; i++) { + float phase_rel = mod1(phases_sorted[i] - t0_phase + 0.5f) - 0.5f; + + if (fabsf(phase_rel) < half_dur) { + float transit_coord = phase_rel / half_dur; + float template_val = lookup_template(s_template, n_template, transit_coord); + float sigma2 = dy_sorted[i] * dy_sorted[i] + 1e-10f; + float y_residual = 1.0f - y_sorted[i]; + numerator += y_residual * template_val / sigma2; + denominator += template_val * template_val / sigma2; + } + } + + if (denominator < 1e-10f) return 0.0f; + + float depth = numerator / denominator; + if (depth < 0.0f) depth = 0.0f; + if (depth > 1.0f) depth = 1.0f; + + return depth; +} + +/** + * Calculate chi-squared for a given transit model fit + * using limb-darkened transit template. + */ +__device__ float calculate_chi2( + const float* y_sorted, + const float* dy_sorted, + const float* phases_sorted, + const float* s_template, + int n_template, + float duration_phase, + float t0_phase, + float depth, + int ndata) +{ + float chi2 = 0.0f; + float half_dur = duration_phase * 0.5f; + + for (int i = 0; i < ndata; i++) { + float phase_rel = mod1(phases_sorted[i] - t0_phase + 0.5f) - 0.5f; + float model_val; + if (fabsf(phase_rel) < half_dur) { + float transit_coord = phase_rel / half_dur; + float template_val = lookup_template(s_template, n_template, transit_coord); + model_val = 1.0f - depth * template_val; + } else { + model_val = 1.0f; + } + float residual = y_sorted[i] - model_val; + float sigma2 = dy_sorted[i] * dy_sorted[i] + 1e-10f; + chi2 += (residual * residual) / sigma2; + } + + return chi2; +} + +/** + * TLS search kernel with Keplerian duration constraints + * Grid: (nperiods, 1, 1), Block: (BLOCK_SIZE, 1, 1) + * + * Shared memory layout: + * phases[ndata] | y_sorted[ndata] | dy_sorted[ndata] | + * template[n_template] | thread_chi2[blockDim] | thread_t0[blockDim] | + * thread_dur[blockDim] | thread_depth[blockDim] + */ +extern "C" __global__ void tls_search_kernel_keplerian( + const float* __restrict__ t, + const float* __restrict__ y, + const float* __restrict__ dy, + const float* __restrict__ periods, + const float* __restrict__ qmin, + const float* __restrict__ qmax, + const float* __restrict__ transit_template, + const int ndata, + const int nperiods, + const int n_durations, + const int n_template, + float* __restrict__ chi2_out, + float* __restrict__ best_t0_out, + float* __restrict__ best_duration_out, + float* __restrict__ best_depth_out) +{ + extern __shared__ float shared_mem[]; + float* phases = shared_mem; + float* y_sorted = &shared_mem[ndata]; + float* dy_sorted = &shared_mem[2 * ndata]; + float* s_template = &shared_mem[3 * ndata]; + float* thread_chi2 = &s_template[n_template]; + float* thread_t0 = &thread_chi2[blockDim.x]; + float* thread_duration = &thread_t0[blockDim.x]; + float* thread_depth = &thread_duration[blockDim.x]; + + int period_idx = blockIdx.x; + if (period_idx >= nperiods) return; + + // Load template from global to shared memory (once per block) + for (int i = threadIdx.x; i < n_template; i += blockDim.x) { + s_template[i] = transit_template[i]; + } + __syncthreads(); + + float period = periods[period_idx]; + float duration_phase_min = qmin[period_idx]; + float duration_phase_max = qmax[period_idx]; + + // Phase fold + for (int i = threadIdx.x; i < ndata; i += blockDim.x) { + phases[i] = mod1(t[i] / period); + } + __syncthreads(); + + // Initialize y_sorted and dy_sorted arrays + for (int i = threadIdx.x; i < ndata; i += blockDim.x) { + y_sorted[i] = y[i]; + dy_sorted[i] = dy[i]; + } + __syncthreads(); + + // Sort by phase using bitonic sort + bitonic_sort_phases(phases, y_sorted, dy_sorted, ndata); + + // Search over durations and T0 using Keplerian constraints + float thread_min_chi2 = 1e30f; + float thread_best_t0 = 0.0f; + float thread_best_duration = 0.0f; + float thread_best_depth = 0.0f; + + for (int d_idx = 0; d_idx < n_durations; d_idx++) { + float log_dur_min = logf(duration_phase_min); + float log_dur_max = logf(duration_phase_max); + float log_duration = log_dur_min + (log_dur_max - log_dur_min) * d_idx / (n_durations - 1); + float duration_phase = expf(log_duration); + float duration = duration_phase * period; + + int n_t0 = 30; + for (int t0_idx = threadIdx.x; t0_idx < n_t0; t0_idx += blockDim.x) { + float t0_phase = (float)t0_idx / n_t0; + float depth = calculate_optimal_depth(y_sorted, dy_sorted, phases, + s_template, n_template, + duration_phase, t0_phase, ndata); + + if (depth > 0.0f && depth < 0.5f) { + float chi2 = calculate_chi2(y_sorted, dy_sorted, phases, + s_template, n_template, + duration_phase, t0_phase, depth, ndata); + if (chi2 < thread_min_chi2) { + thread_min_chi2 = chi2; + thread_best_t0 = t0_phase; + thread_best_duration = duration; + thread_best_depth = depth; + } + } + } + } + + // Store per-thread results to shared memory + thread_chi2[threadIdx.x] = thread_min_chi2; + thread_t0[threadIdx.x] = thread_best_t0; + thread_duration[threadIdx.x] = thread_best_duration; + thread_depth[threadIdx.x] = thread_best_depth; + __syncthreads(); + + // Block reduction down to warp size + for (int stride = blockDim.x / 2; stride >= WARP_SIZE; stride /= 2) { + if (threadIdx.x < stride) { + if (thread_chi2[threadIdx.x + stride] < thread_chi2[threadIdx.x]) { + thread_chi2[threadIdx.x] = thread_chi2[threadIdx.x + stride]; + thread_t0[threadIdx.x] = thread_t0[threadIdx.x + stride]; + thread_duration[threadIdx.x] = thread_duration[threadIdx.x + stride]; + thread_depth[threadIdx.x] = thread_depth[threadIdx.x + stride]; + } + } + __syncthreads(); + } + + // Final warp reduction using shuffle (no sync needed) + if (threadIdx.x < WARP_SIZE) { + float val_chi2 = thread_chi2[threadIdx.x]; + float val_t0 = thread_t0[threadIdx.x]; + float val_dur = thread_duration[threadIdx.x]; + float val_dep = thread_depth[threadIdx.x]; + + for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2) { + float other_chi2 = __shfl_down_sync(0xffffffff, val_chi2, offset); + float other_t0 = __shfl_down_sync(0xffffffff, val_t0, offset); + float other_dur = __shfl_down_sync(0xffffffff, val_dur, offset); + float other_dep = __shfl_down_sync(0xffffffff, val_dep, offset); + + if (other_chi2 < val_chi2) { + val_chi2 = other_chi2; + val_t0 = other_t0; + val_dur = other_dur; + val_dep = other_dep; + } + } + + if (threadIdx.x == 0) { + thread_chi2[0] = val_chi2; + thread_t0[0] = val_t0; + thread_duration[0] = val_dur; + thread_depth[0] = val_dep; + } + } + + // Write final result + if (threadIdx.x == 0) { + chi2_out[period_idx] = thread_chi2[0]; + best_t0_out[period_idx] = thread_t0[0]; + best_duration_out[period_idx] = thread_duration[0]; + best_depth_out[period_idx] = thread_depth[0]; + } +} + +/** + * TLS search kernel (standard, fixed duration range) + * Grid: (nperiods, 1, 1), Block: (BLOCK_SIZE, 1, 1) + * + * Shared memory layout: + * phases[ndata] | y_sorted[ndata] | dy_sorted[ndata] | + * template[n_template] | thread_chi2[blockDim] | thread_t0[blockDim] | + * thread_dur[blockDim] | thread_depth[blockDim] + */ +extern "C" __global__ void tls_search_kernel( + const float* __restrict__ t, + const float* __restrict__ y, + const float* __restrict__ dy, + const float* __restrict__ periods, + const float* __restrict__ transit_template, + const int ndata, + const int nperiods, + const int n_template, + float* __restrict__ chi2_out, + float* __restrict__ best_t0_out, + float* __restrict__ best_duration_out, + float* __restrict__ best_depth_out) +{ + extern __shared__ float shared_mem[]; + float* phases = shared_mem; + float* y_sorted = &shared_mem[ndata]; + float* dy_sorted = &shared_mem[2 * ndata]; + float* s_template = &shared_mem[3 * ndata]; + float* thread_chi2 = &s_template[n_template]; + float* thread_t0 = &thread_chi2[blockDim.x]; + float* thread_duration = &thread_t0[blockDim.x]; + float* thread_depth = &thread_duration[blockDim.x]; + + int period_idx = blockIdx.x; + if (period_idx >= nperiods) return; + + // Load template from global to shared memory (once per block) + for (int i = threadIdx.x; i < n_template; i += blockDim.x) { + s_template[i] = transit_template[i]; + } + __syncthreads(); + + float period = periods[period_idx]; + + // Phase fold + for (int i = threadIdx.x; i < ndata; i += blockDim.x) { + phases[i] = mod1(t[i] / period); + } + __syncthreads(); + + // Initialize y_sorted and dy_sorted arrays + for (int i = threadIdx.x; i < ndata; i += blockDim.x) { + y_sorted[i] = y[i]; + dy_sorted[i] = dy[i]; + } + __syncthreads(); + + // Sort by phase using bitonic sort + bitonic_sort_phases(phases, y_sorted, dy_sorted, ndata); + + // Search over durations and T0 + float thread_min_chi2 = 1e30f; + float thread_best_t0 = 0.0f; + float thread_best_duration = 0.0f; + float thread_best_depth = 0.0f; + + int n_durations = 15; + float duration_phase_min = 0.005f; + float duration_phase_max = 0.15f; + + for (int d_idx = 0; d_idx < n_durations; d_idx++) { + float log_dur_min = logf(duration_phase_min); + float log_dur_max = logf(duration_phase_max); + float log_duration = log_dur_min + (log_dur_max - log_dur_min) * d_idx / (n_durations - 1); + float duration_phase = expf(log_duration); + float duration = duration_phase * period; + + int n_t0 = 30; + for (int t0_idx = threadIdx.x; t0_idx < n_t0; t0_idx += blockDim.x) { + float t0_phase = (float)t0_idx / n_t0; + float depth = calculate_optimal_depth(y_sorted, dy_sorted, phases, + s_template, n_template, + duration_phase, t0_phase, ndata); + + if (depth > 0.0f && depth < 0.5f) { + float chi2 = calculate_chi2(y_sorted, dy_sorted, phases, + s_template, n_template, + duration_phase, t0_phase, depth, ndata); + if (chi2 < thread_min_chi2) { + thread_min_chi2 = chi2; + thread_best_t0 = t0_phase; + thread_best_duration = duration; + thread_best_depth = depth; + } + } + } + } + + // Store per-thread results to shared memory + thread_chi2[threadIdx.x] = thread_min_chi2; + thread_t0[threadIdx.x] = thread_best_t0; + thread_duration[threadIdx.x] = thread_best_duration; + thread_depth[threadIdx.x] = thread_best_depth; + __syncthreads(); + + // Block reduction down to warp size + for (int stride = blockDim.x / 2; stride >= WARP_SIZE; stride /= 2) { + if (threadIdx.x < stride) { + if (thread_chi2[threadIdx.x + stride] < thread_chi2[threadIdx.x]) { + thread_chi2[threadIdx.x] = thread_chi2[threadIdx.x + stride]; + thread_t0[threadIdx.x] = thread_t0[threadIdx.x + stride]; + thread_duration[threadIdx.x] = thread_duration[threadIdx.x + stride]; + thread_depth[threadIdx.x] = thread_depth[threadIdx.x + stride]; + } + } + __syncthreads(); + } + + // Final warp reduction using shuffle (no sync needed) + if (threadIdx.x < WARP_SIZE) { + float val_chi2 = thread_chi2[threadIdx.x]; + float val_t0 = thread_t0[threadIdx.x]; + float val_dur = thread_duration[threadIdx.x]; + float val_dep = thread_depth[threadIdx.x]; + + for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2) { + float other_chi2 = __shfl_down_sync(0xffffffff, val_chi2, offset); + float other_t0 = __shfl_down_sync(0xffffffff, val_t0, offset); + float other_dur = __shfl_down_sync(0xffffffff, val_dur, offset); + float other_dep = __shfl_down_sync(0xffffffff, val_dep, offset); + + if (other_chi2 < val_chi2) { + val_chi2 = other_chi2; + val_t0 = other_t0; + val_dur = other_dur; + val_dep = other_dep; + } + } + + if (threadIdx.x == 0) { + thread_chi2[0] = val_chi2; + thread_t0[0] = val_t0; + thread_duration[0] = val_dur; + thread_depth[0] = val_dep; + } + } + + // Write final result + if (threadIdx.x == 0) { + chi2_out[period_idx] = thread_chi2[0]; + best_t0_out[period_idx] = thread_t0[0]; + best_duration_out[period_idx] = thread_duration[0]; + best_depth_out[period_idx] = thread_depth[0]; + } +} diff --git a/cuvarbase/tests/test_tls_basic.py b/cuvarbase/tests/test_tls_basic.py new file mode 100644 index 0000000..984c30e --- /dev/null +++ b/cuvarbase/tests/test_tls_basic.py @@ -0,0 +1,459 @@ +""" +Basic tests for TLS GPU implementation. + +These tests verify the basic functionality of the TLS implementation, +focusing on API correctness and basic execution rather than scientific +accuracy (which will be tested in test_tls_consistency.py). +""" + +import pytest +import numpy as np + +try: + import pycuda + import pycuda.autoinit + PYCUDA_AVAILABLE = True +except ImportError: + PYCUDA_AVAILABLE = False + +# Import modules to test +from cuvarbase import tls_grids, tls_models, tls_stats + + +class TestGridGeneration: + """Test period and duration grid generation.""" + + def test_period_grid_basic(self): + """Test basic period grid generation.""" + t = np.linspace(0, 100, 1000) # 100-day observation + + periods = tls_grids.period_grid_ofir(t, R_star=1.0, M_star=1.0) + + assert len(periods) > 0 + assert np.all(periods > 0) + assert np.all(np.diff(periods) > 0) # Increasing + assert periods[0] < periods[-1] + + def test_period_grid_limits(self): + """Test period grid with custom limits.""" + t = np.linspace(0, 100, 1000) + + periods = tls_grids.period_grid_ofir( + t, period_min=5.0, period_max=20.0 + ) + + assert periods[0] >= 5.0 + assert periods[-1] <= 20.0 + + def test_duration_grid(self): + """Test duration grid generation.""" + periods = np.array([10.0, 20.0, 30.0]) + + durations, counts = tls_grids.duration_grid(periods) + + assert len(durations) == len(periods) + assert len(counts) == len(periods) + assert all(c > 0 for c in counts) + + # Check durations are reasonable (< period) + for i, period in enumerate(periods): + assert all(d < period for d in durations[i]) + assert all(d > 0 for d in durations[i]) + + def test_transit_duration_max(self): + """Test maximum transit duration calculation.""" + period = 10.0 # days + + duration = tls_grids.transit_duration_max( + period, R_star=1.0, M_star=1.0, R_planet=1.0 + ) + + assert duration > 0 + assert duration < period # Duration must be less than period + assert duration < 1.0 # For Earth-Sun system, ~0.5 days + + def test_t0_grid(self): + """Test T0 grid generation.""" + period = 10.0 + duration = 0.1 + + t0_values = tls_grids.t0_grid(period, duration, oversampling=5) + + assert len(t0_values) > 0 + assert np.all(t0_values >= 0) + assert np.all(t0_values <= 1) + + def test_validate_stellar_parameters(self): + """Test stellar parameter validation.""" + # Valid parameters + tls_grids.validate_stellar_parameters(R_star=1.0, M_star=1.0) + + # Invalid radius + with pytest.raises(ValueError): + tls_grids.validate_stellar_parameters(R_star=10.0, M_star=1.0) + + # Invalid mass + with pytest.raises(ValueError): + tls_grids.validate_stellar_parameters(R_star=1.0, M_star=5.0) + + +class TestTransitTemplate: + """Test transit template generation for GPU kernel.""" + + def test_trapezoid_template_shape(self): + """Test trapezoidal fallback template has correct shape.""" + template = tls_models._trapezoid_template(n_template=500) + + assert template.shape == (500,) + assert template.dtype == np.float32 + + def test_trapezoid_template_normalization(self): + """Test trapezoidal template values are in [0, 1].""" + template = tls_models._trapezoid_template(n_template=1000) + + assert np.all(template >= 0.0) + assert np.all(template <= 1.0) + # Center should be at max depth + assert template[500] == pytest.approx(1.0) + # Edges should be near zero + assert template[0] == pytest.approx(0.0, abs=0.01) + assert template[-1] == pytest.approx(0.0, abs=0.01) + + def test_trapezoid_template_symmetric(self): + """Test trapezoidal template is symmetric.""" + template = tls_models._trapezoid_template(n_template=1001) + np.testing.assert_allclose(template, template[::-1], atol=1e-6) + + @pytest.mark.skipif(not tls_models.BATMAN_AVAILABLE, + reason="batman-package not installed") + def test_batman_template_shape(self): + """Test batman template has correct shape and dtype.""" + template = tls_models.generate_transit_template(n_template=1000) + + assert template.shape == (1000,) + assert template.dtype == np.float32 + + @pytest.mark.skipif(not tls_models.BATMAN_AVAILABLE, + reason="batman-package not installed") + def test_batman_template_normalization(self): + """Test batman template values are in [0, 1] with max = 1.""" + template = tls_models.generate_transit_template(n_template=1000) + + assert np.all(template >= 0.0) + assert np.all(template <= 1.0) + assert np.max(template) == pytest.approx(1.0, abs=0.01) + # Edges should be near zero + assert template[0] < 0.1 + assert template[-1] < 0.1 + + @pytest.mark.skipif(not tls_models.BATMAN_AVAILABLE, + reason="batman-package not installed") + def test_batman_template_limb_darkened(self): + """Test batman template shows limb darkening (not a box).""" + template = tls_models.generate_transit_template(n_template=1000) + + # The template should NOT be a perfect box (all 0 or 1). + # With limb darkening, there should be intermediate values. + n_intermediate = np.sum((template > 0.1) & (template < 0.9)) + assert n_intermediate > 10, "Template should have limb-darkened shape, not a box" + + def test_generate_fallback_without_batman(self): + """Test generate_transit_template falls back to trapezoid.""" + # Force fallback by testing _trapezoid_template directly + template = tls_models._trapezoid_template(n_template=500) + + assert template.shape == (500,) + assert np.max(template) == pytest.approx(1.0) + assert np.min(template) == pytest.approx(0.0, abs=0.01) + + +@pytest.mark.skipif(not tls_models.BATMAN_AVAILABLE, + reason="batman-package not installed") +class TestTransitModels: + """Test transit model generation (requires batman).""" + + def test_reference_transit(self): + """Test reference transit model creation.""" + phases, flux = tls_models.create_reference_transit(n_samples=100) + + assert len(phases) == len(flux) + assert len(phases) == 100 + assert np.all((phases >= 0) & (phases <= 1)) + assert np.all(flux <= 1.0) # Transit causes dimming + assert np.min(flux) < 1.0 # There is a transit + + def test_transit_model_cache(self): + """Test transit model cache creation.""" + durations = np.array([0.05, 0.1, 0.15]) + + models, phases = tls_models.create_transit_model_cache( + durations, period=10.0, n_samples=100 + ) + + assert len(models) == len(durations) + assert len(phases) == 100 + for model in models: + assert len(model) == len(phases) + + +class TestSimpleTransitModels: + """Test simple transit models (no batman required).""" + + def test_simple_trapezoid(self): + """Test simple trapezoidal transit.""" + phases = np.linspace(0, 1, 1000) + duration_phase = 0.1 + + flux = tls_models.simple_trapezoid_transit( + phases, duration_phase, depth=0.01 + ) + + assert len(flux) == len(phases) + assert np.all(flux <= 1.0) + assert np.min(flux) < 1.0 # There is a transit + assert np.max(flux) == 1.0 # Out of transit = 1.0 + + def test_interpolate_transit_model(self): + """Test transit model interpolation.""" + model_phases = np.linspace(0, 1, 100) + model_flux = np.ones(100) + model_flux[40:60] = 0.99 # Simple transit + + target_phases = np.linspace(0, 1, 200) + + flux_interp = tls_models.interpolate_transit_model( + model_phases, model_flux, target_phases, target_depth=0.01 + ) + + assert len(flux_interp) == len(target_phases) + assert np.all(flux_interp <= 1.0) + + def test_default_limb_darkening(self): + """Test default limb darkening coefficient lookup.""" + u_kepler = tls_models.get_default_limb_darkening('Kepler', T_eff=5500) + assert len(u_kepler) == 2 + assert all(0 < coeff < 1 for coeff in u_kepler) + + u_tess = tls_models.get_default_limb_darkening('TESS', T_eff=5500) + assert len(u_tess) == 2 + + def test_validate_limb_darkening(self): + """Test limb darkening validation.""" + # Valid quadratic + tls_models.validate_limb_darkening_coeffs([0.4, 0.2], 'quadratic') + + # Invalid - wrong number + with pytest.raises(ValueError): + tls_models.validate_limb_darkening_coeffs([0.4], 'quadratic') + + +class TestStatistics: + """Test TLS statistics calculations.""" + + def test_signal_residue_with_signal(self): + """Test SR is positive for a signal.""" + # Simulate chi2 values where one period has much lower chi2 + chi2 = np.ones(100) * 1000.0 + chi2[50] = 500.0 # Signal at index 50 + + SR = tls_stats.signal_residue(chi2) + + # SR at signal should be highest + assert SR[50] > SR[0] + assert SR[50] > 0 + + def test_sde_positive_for_signal(self): + """Test SDE > 0 for an injected signal (regression test).""" + # Simulate chi2 values with a clear signal + np.random.seed(42) + chi2 = np.random.normal(1000, 10, size=200) + chi2[100] = 500.0 # Strong signal + + SDE, SDE_raw, power = tls_stats.signal_detection_efficiency( + chi2, detrend=False + ) + + assert SDE > 0, "SDE should be > 0 for injected signal" + assert SDE_raw > 0 + + def test_snr_with_chi2(self): + """Test SNR estimation from chi2 values.""" + snr = tls_stats.signal_to_noise( + 0.01, chi2_null=1000.0, chi2_best=500.0 + ) + assert snr > 0 + + def test_snr_returns_zero_without_info(self): + """Test SNR returns 0 when no depth_err or chi2 provided.""" + snr = tls_stats.signal_to_noise(0.01) + assert snr == 0.0 + + +@pytest.mark.skipif(not PYCUDA_AVAILABLE, + reason="PyCUDA not available") +class TestTLSKernel: + """Test TLS kernel compilation and basic execution.""" + + def test_kernel_compilation(self): + """Test that TLS kernel compiles.""" + from cuvarbase import tls + + kernel = tls.compile_tls(block_size=128) + assert kernel is not None + + def test_kernel_caching(self): + """Test kernel caching mechanism.""" + from cuvarbase import tls + + # First call - compiles + kernel1 = tls._get_cached_kernels(128) + assert kernel1 is not None + + # Second call - should use cache + kernel2 = tls._get_cached_kernels(128) + assert kernel2 is kernel1 + + def test_block_size_selection(self): + """Test automatic block size selection.""" + from cuvarbase import tls + + assert tls._choose_block_size(10) == 32 + assert tls._choose_block_size(50) == 64 + assert tls._choose_block_size(100) == 128 + + +@pytest.mark.skipif(not PYCUDA_AVAILABLE, + reason="PyCUDA not available") +class TestTLSMemory: + """Test TLS memory management.""" + + def test_memory_allocation(self): + """Test memory allocation.""" + from cuvarbase.tls import TLSMemory + + mem = TLSMemory(max_ndata=1000, max_nperiods=100) + + assert mem.t is not None + assert len(mem.t) == 1000 + assert len(mem.periods) == 100 + + def test_memory_setdata(self): + """Test setting data.""" + from cuvarbase.tls import TLSMemory + + t = np.linspace(0, 100, 100) + y = np.ones(100) + dy = np.ones(100) * 0.01 + periods = np.linspace(1, 10, 50) + + mem = TLSMemory(max_ndata=1000, max_nperiods=100) + mem.setdata(t, y, dy, periods=periods, transfer=False) + + assert np.allclose(mem.t[:100], t) + assert np.allclose(mem.periods[:50], periods) + + def test_memory_fromdata(self): + """Test creating memory from data.""" + from cuvarbase.tls import TLSMemory + + t = np.linspace(0, 100, 100) + y = np.ones(100) + dy = np.ones(100) * 0.01 + periods = np.linspace(1, 10, 50) + + mem = TLSMemory.fromdata(t, y, dy, periods=periods, transfer=False) + + assert mem.max_ndata >= 100 + assert mem.max_nperiods >= 50 + + +@pytest.mark.skipif(not PYCUDA_AVAILABLE, + reason="PyCUDA not available") +class TestTLSBasicExecution: + """Test basic TLS execution (not accuracy).""" + + def test_tls_search_runs(self): + """Test that TLS search runs without errors.""" + from cuvarbase import tls + + # Create simple synthetic data + t = np.linspace(0, 100, 500) + y = np.ones(500) + dy = np.ones(500) * 0.001 + + # Use small period range for speed + periods = np.linspace(5, 15, 20) + + # This should run without errors + results = tls.tls_search_gpu( + t, y, dy, + periods=periods, + block_size=64 + ) + + assert results is not None + assert 'periods' in results + assert 'chi2' in results + assert len(results['periods']) == 20 + + def test_tls_search_with_transit(self): + """Test TLS with injected transit.""" + from cuvarbase import tls + + # Create data with simple transit + t = np.linspace(0, 100, 500) + y = np.ones(500) + + # Inject transit at period = 10 days + period_true = 10.0 + duration = 0.1 + depth = 0.01 + + phases = (t % period_true) / period_true + in_transit = (phases < duration / period_true) | (phases > 1 - duration / period_true) + y[in_transit] -= depth + + dy = np.ones(500) * 0.0001 + + # Search with periods around the true value + periods = np.linspace(8, 12, 30) + + results = tls.tls_search_gpu(t, y, dy, periods=periods) + + # Should return results + assert results['chi2'] is not None + assert len(results['chi2']) == 30 + + # Minimum chi2 should be near period = 10 (within a few samples) + min_idx = np.argmin(results['chi2']) + best_period = results['periods'][min_idx] + + # Should be within 20% of true period (very loose for Phase 1) + assert 8 < best_period < 12 + + def test_sde_positive_with_transit(self): + """Test SDE > 0 when a transit is present (regression test).""" + from cuvarbase import tls + + # Create data with obvious transit + t = np.linspace(0, 100, 500) + y = np.ones(500) + + period_true = 10.0 + depth = 0.02 + phases = (t % period_true) / period_true + in_transit = phases < 0.02 + y[in_transit] -= depth + + dy = np.ones(500) * 0.0001 + + periods = np.linspace(8, 12, 50) + results = tls.tls_search_gpu(t, y, dy, periods=periods) + + assert results['SDE'] > 0, ( + "SDE should be > 0 for a clear transit signal" + ) + + +if __name__ == '__main__': + pytest.main([__file__, '-v']) diff --git a/cuvarbase/tls.py b/cuvarbase/tls.py new file mode 100644 index 0000000..53ff2cb --- /dev/null +++ b/cuvarbase/tls.py @@ -0,0 +1,777 @@ +""" +GPU-accelerated Transit Least Squares (TLS) periodogram. + +This module implements a fast GPU version of the Transit Least Squares +algorithm for detecting planetary transits in photometric time series. + +References +---------- +.. [1] Hippke & Heller (2019), "Transit Least Squares", A&A 623, A39 +.. [2] Kovács et al. (2002), "Box Least Squares", A&A 391, 369 +""" + +import sys +import threading +from collections import OrderedDict +import resource + +import pycuda.autoprimaryctx +import pycuda.driver as cuda +import pycuda.gpuarray as gpuarray +from pycuda.compiler import SourceModule + +import numpy as np + +from .utils import find_kernel, _module_reader +from . import tls_grids +from . import tls_models +from . import tls_stats + +_default_block_size = 128 # Smaller default than BLS (TLS has more shared memory needs) +_KERNEL_CACHE_MAX_SIZE = 10 +_kernel_cache = OrderedDict() +_kernel_cache_lock = threading.Lock() + + +def _choose_block_size(ndata): + """ + Choose optimal block size for TLS kernel based on data size. + + Parameters + ---------- + ndata : int + Number of data points + + Returns + ------- + block_size : int + Optimal CUDA block size (32, 64, or 128) + + Notes + ----- + TLS uses more shared memory than BLS, so we use smaller block sizes + to avoid shared memory limits. + """ + if ndata <= 32: + return 32 + elif ndata <= 64: + return 64 + else: + return 128 # Max for TLS (vs 256 for BLS) + + +def _get_cached_kernels(block_size): + """ + Get compiled TLS kernel from cache. + + Parameters + ---------- + block_size : int + CUDA block size + + Returns + ------- + kernel : PyCUDA function + Compiled kernel function + """ + key = block_size + + with _kernel_cache_lock: + if key in _kernel_cache: + _kernel_cache.move_to_end(key) + return _kernel_cache[key] + + # Compile kernel + compiled = compile_tls(block_size=block_size) + + # Add to cache + _kernel_cache[key] = compiled + _kernel_cache.move_to_end(key) + + # Evict oldest if needed + if len(_kernel_cache) > _KERNEL_CACHE_MAX_SIZE: + _kernel_cache.popitem(last=False) + + return compiled + + +def compile_tls(block_size=_default_block_size): + """ + Compile TLS CUDA kernels. + + Parameters + ---------- + block_size : int, optional + CUDA block size (default: 128) + + Returns + ------- + kernels : dict + Dictionary with 'standard' and 'keplerian' kernel functions + + Notes + ----- + The kernels use bitonic sort for phase sorting and a limb-darkened + transit template loaded into shared memory for physically realistic + fitting. Works for datasets up to ~100,000 points. + + The 'keplerian' kernel variant accepts per-period qmin/qmax arrays + to focus the duration search on physically plausible values. + """ + cppd = dict(BLOCK_SIZE=block_size) + + kernel_name = 'tls' + kernel_txt = _module_reader(find_kernel(kernel_name), cpp_defs=cppd) + + # Compile with fast math + # no_extern_c=True needed for proper extern "C" handling + module = SourceModule(kernel_txt, options=['--use_fast_math'], no_extern_c=True) + + # Get both kernel functions + kernels = { + 'standard': module.get_function('tls_search_kernel'), + 'keplerian': module.get_function('tls_search_kernel_keplerian') + } + + return kernels + + +class TLSMemory: + """ + Memory management for TLS GPU computations. + + This class handles allocation and transfer of data between CPU and GPU + for TLS periodogram calculations. + + Parameters + ---------- + max_ndata : int + Maximum number of data points + max_nperiods : int + Maximum number of trial periods + stream : pycuda.driver.Stream, optional + CUDA stream for async operations + + Attributes + ---------- + t, y, dy : ndarray + Pinned CPU arrays for time, flux, uncertainties + t_g, y_g, dy_g : gpuarray + GPU arrays for data + periods_g, chi2_g : gpuarray + GPU arrays for periods and chi-squared values + best_t0_g, best_duration_g, best_depth_g : gpuarray + GPU arrays for best-fit parameters + """ + + def __init__(self, max_ndata, max_nperiods, stream=None, **kwargs): + self.max_ndata = max_ndata + self.max_nperiods = max_nperiods + self.stream = stream + self.rtype = np.float32 + + # CPU pinned memory for fast transfers + self.t = None + self.y = None + self.dy = None + + # GPU memory + self.t_g = None + self.y_g = None + self.dy_g = None + self.periods_g = None + self.qmin_g = None # Keplerian duration constraints + self.qmax_g = None # Keplerian duration constraints + self.chi2_g = None + self.best_t0_g = None + self.best_duration_g = None + self.best_depth_g = None + self.template_g = None + + self.allocate_pinned_arrays() + + def allocate_pinned_arrays(self): + """Allocate page-aligned pinned memory on CPU for fast transfers.""" + pagesize = resource.getpagesize() + + self.t = cuda.aligned_zeros(shape=(self.max_ndata,), + dtype=self.rtype, + alignment=pagesize) + + self.y = cuda.aligned_zeros(shape=(self.max_ndata,), + dtype=self.rtype, + alignment=pagesize) + + self.dy = cuda.aligned_zeros(shape=(self.max_ndata,), + dtype=self.rtype, + alignment=pagesize) + + self.periods = cuda.aligned_zeros(shape=(self.max_nperiods,), + dtype=self.rtype, + alignment=pagesize) + + self.chi2 = cuda.aligned_zeros(shape=(self.max_nperiods,), + dtype=self.rtype, + alignment=pagesize) + + self.best_t0 = cuda.aligned_zeros(shape=(self.max_nperiods,), + dtype=self.rtype, + alignment=pagesize) + + self.best_duration = cuda.aligned_zeros(shape=(self.max_nperiods,), + dtype=self.rtype, + alignment=pagesize) + + self.best_depth = cuda.aligned_zeros(shape=(self.max_nperiods,), + dtype=self.rtype, + alignment=pagesize) + + # Keplerian duration constraints + self.qmin = cuda.aligned_zeros(shape=(self.max_nperiods,), + dtype=self.rtype, + alignment=pagesize) + + self.qmax = cuda.aligned_zeros(shape=(self.max_nperiods,), + dtype=self.rtype, + alignment=pagesize) + + def allocate_gpu_arrays(self, ndata=None, nperiods=None): + """Allocate GPU memory.""" + if ndata is None: + ndata = self.max_ndata + if nperiods is None: + nperiods = self.max_nperiods + + self.t_g = gpuarray.zeros(ndata, dtype=self.rtype) + self.y_g = gpuarray.zeros(ndata, dtype=self.rtype) + self.dy_g = gpuarray.zeros(ndata, dtype=self.rtype) + self.periods_g = gpuarray.zeros(nperiods, dtype=self.rtype) + self.qmin_g = gpuarray.zeros(nperiods, dtype=self.rtype) + self.qmax_g = gpuarray.zeros(nperiods, dtype=self.rtype) + self.chi2_g = gpuarray.zeros(nperiods, dtype=self.rtype) + self.best_t0_g = gpuarray.zeros(nperiods, dtype=self.rtype) + self.best_duration_g = gpuarray.zeros(nperiods, dtype=self.rtype) + self.best_depth_g = gpuarray.zeros(nperiods, dtype=self.rtype) + + def set_template(self, template): + """Transfer transit template to GPU. + + Parameters + ---------- + template : ndarray + Float32 template array from generate_transit_template() + """ + template = np.asarray(template, dtype=self.rtype) + self.template_g = gpuarray.to_gpu(template) + + def setdata(self, t, y, dy, periods=None, qmin=None, qmax=None, transfer=True): + """ + Set data for TLS computation. + + Parameters + ---------- + t : array_like + Observation times + y : array_like + Flux measurements + dy : array_like + Flux uncertainties + periods : array_like, optional + Trial periods + qmin : array_like, optional + Minimum fractional duration per period (for Keplerian search) + qmax : array_like, optional + Maximum fractional duration per period (for Keplerian search) + transfer : bool, optional + Transfer to GPU immediately (default: True) + """ + ndata = len(t) + + # Copy to pinned memory + self.t[:ndata] = np.asarray(t).astype(self.rtype) + self.y[:ndata] = np.asarray(y).astype(self.rtype) + self.dy[:ndata] = np.asarray(dy).astype(self.rtype) + + if periods is not None: + nperiods = len(periods) + self.periods[:nperiods] = np.asarray(periods).astype(self.rtype) + + if qmin is not None: + nperiods = len(qmin) + self.qmin[:nperiods] = np.asarray(qmin).astype(self.rtype) + + if qmax is not None: + nperiods = len(qmax) + self.qmax[:nperiods] = np.asarray(qmax).astype(self.rtype) + + # Allocate GPU memory if needed + if self.t_g is None or len(self.t_g) < ndata: + self.allocate_gpu_arrays(ndata, len(periods) if periods is not None else self.max_nperiods) + + # Transfer to GPU + if transfer: + self.transfer_to_gpu(ndata, len(periods) if periods is not None else None, + qmin is not None, qmax is not None) + + def transfer_to_gpu(self, ndata, nperiods=None, has_qmin=False, has_qmax=False): + """Transfer data from CPU to GPU.""" + if self.stream is None: + self.t_g.set(self.t[:ndata]) + self.y_g.set(self.y[:ndata]) + self.dy_g.set(self.dy[:ndata]) + if nperiods is not None: + self.periods_g.set(self.periods[:nperiods]) + if has_qmin: + self.qmin_g.set(self.qmin[:nperiods]) + if has_qmax: + self.qmax_g.set(self.qmax[:nperiods]) + else: + self.t_g.set_async(self.t[:ndata], stream=self.stream) + self.y_g.set_async(self.y[:ndata], stream=self.stream) + self.dy_g.set_async(self.dy[:ndata], stream=self.stream) + if nperiods is not None: + self.periods_g.set_async(self.periods[:nperiods], stream=self.stream) + if has_qmin: + self.qmin_g.set_async(self.qmin[:nperiods], stream=self.stream) + if has_qmax: + self.qmax_g.set_async(self.qmax[:nperiods], stream=self.stream) + + def transfer_from_gpu(self, nperiods): + """Transfer results from GPU to CPU.""" + if self.stream is None: + self.chi2[:nperiods] = self.chi2_g.get()[:nperiods] + self.best_t0[:nperiods] = self.best_t0_g.get()[:nperiods] + self.best_duration[:nperiods] = self.best_duration_g.get()[:nperiods] + self.best_depth[:nperiods] = self.best_depth_g.get()[:nperiods] + else: + self.chi2_g.get_async(ary=self.chi2, stream=self.stream) + self.best_t0_g.get_async(ary=self.best_t0, stream=self.stream) + self.best_duration_g.get_async(ary=self.best_duration, stream=self.stream) + self.best_depth_g.get_async(ary=self.best_depth, stream=self.stream) + + @classmethod + def fromdata(cls, t, y, dy, periods=None, **kwargs): + """ + Create TLSMemory instance from data. + + Parameters + ---------- + t, y, dy : array_like + Time series data + periods : array_like, optional + Trial periods + **kwargs + Passed to __init__ + + Returns + ------- + memory : TLSMemory + Initialized memory object + """ + max_ndata = kwargs.get('max_ndata', len(t)) + max_nperiods = kwargs.get('max_nperiods', + len(periods) if periods is not None else 10000) + + mem = cls(max_ndata, max_nperiods, **kwargs) + mem.setdata(t, y, dy, periods=periods, transfer=kwargs.get('transfer', True)) + + return mem + + +def tls_search_gpu(t, y, dy, periods=None, durations=None, + qmin=None, qmax=None, n_durations=15, + R_star=1.0, M_star=1.0, + period_min=None, period_max=None, n_transits_min=2, + oversampling_factor=3, duration_grid_step=1.1, + R_planet_min=0.5, R_planet_max=5.0, + limb_dark='quadratic', u=[0.4804, 0.1867], + block_size=None, + kernel=None, memory=None, stream=None, + transfer_to_device=True, transfer_to_host=True, + **kwargs): + """ + Run Transit Least Squares search on GPU. + + Parameters + ---------- + t : array_like + Observation times (days) + y : array_like + Flux measurements (arbitrary units, will be normalized) + dy : array_like + Flux uncertainties + periods : array_like, optional + Custom period grid. If None, generated automatically. + qmin : array_like, optional + Minimum fractional duration per period (for Keplerian search). + If provided, enables Keplerian mode. + qmax : array_like, optional + Maximum fractional duration per period (for Keplerian search). + If provided, enables Keplerian mode. + n_durations : int, optional + Number of duration samples per period (default: 15). + Only used in Keplerian mode. + R_star : float, optional + Stellar radius in solar radii (default: 1.0) + M_star : float, optional + Stellar mass in solar masses (default: 1.0) + period_min, period_max : float, optional + Period search range (days). Auto-computed if None. + n_transits_min : int, optional + Minimum number of transits required (default: 2) + oversampling_factor : float, optional + Period grid oversampling (default: 3) + duration_grid_step : float, optional + Duration grid spacing factor (default: 1.1) + R_planet_min, R_planet_max : float, optional + Planet radius range in Earth radii (default: 0.5 to 5.0) + limb_dark : str, optional + Limb darkening law (default: 'quadratic') + u : list, optional + Limb darkening coefficients (default: [0.4804, 0.1867]) + block_size : int, optional + CUDA block size (auto-selected if None) + kernel : PyCUDA function, optional + Pre-compiled kernel + memory : TLSMemory, optional + Pre-allocated memory object + stream : cuda.Stream, optional + CUDA stream for async execution + transfer_to_device : bool, optional + Transfer data to GPU (default: True) + transfer_to_host : bool, optional + Transfer results to CPU (default: True) + + Returns + ------- + results : dict + Dictionary with keys: + - 'periods': Trial periods + - 'chi2': Chi-squared values + - 'best_t0': Best mid-transit times + - 'best_duration': Best durations + - 'best_depth': Best depths + - 'SDE': Signal Detection Efficiency (if computed) + + Notes + ----- + This is the main GPU TLS function. For the first implementation, + it provides a basic version that will be optimized in Phase 2. + """ + # Validate stellar parameters + tls_grids.validate_stellar_parameters(R_star, M_star) + + # Validate limb darkening + tls_models.validate_limb_darkening_coeffs(u, limb_dark) + + # Generate period grid if not provided + if periods is None: + periods = tls_grids.period_grid_ofir( + t, R_star=R_star, M_star=M_star, + oversampling_factor=oversampling_factor, + period_min=period_min, period_max=period_max, + n_transits_min=n_transits_min + ) + + # Convert to numpy arrays + t = np.asarray(t, dtype=np.float32) + y = np.asarray(y, dtype=np.float32) + dy = np.asarray(dy, dtype=np.float32) + periods = np.asarray(periods, dtype=np.float32) + + ndata = len(t) + nperiods = len(periods) + + # Choose block size + if block_size is None: + block_size = _choose_block_size(ndata) + + # Determine if using Keplerian mode + use_keplerian = (qmin is not None and qmax is not None) + + # Get or compile kernels + if kernel is None: + kernels = _get_cached_kernels(block_size) + kernel = kernels['keplerian'] if use_keplerian else kernels['standard'] + + # Allocate or use existing memory + if memory is None: + memory = TLSMemory.fromdata(t, y, dy, periods=periods, + stream=stream, + transfer=transfer_to_device) + elif transfer_to_device: + memory.setdata(t, y, dy, periods=periods, transfer=True) + + # Set qmin/qmax if using Keplerian mode + if use_keplerian: + qmin = np.asarray(qmin, dtype=np.float32) + qmax = np.asarray(qmax, dtype=np.float32) + if len(qmin) != nperiods or len(qmax) != nperiods: + raise ValueError(f"qmin and qmax must have same length as periods ({nperiods})") + memory.setdata(t, y, dy, periods=periods, qmin=qmin, qmax=qmax, transfer=transfer_to_device) + + # Generate and transfer transit template + n_template = kwargs.get('n_template', 1000) + if memory.template_g is None: + template = tls_models.generate_transit_template( + n_template=n_template, limb_dark=limb_dark, u=u + ) + memory.set_template(template) + + # Calculate shared memory requirements + # phases[ndata] + y_sorted[ndata] + dy_sorted[ndata] + + # template[n_template] + 4 * thread arrays[block_size] + shared_mem_size = (3 * ndata + n_template + 4 * block_size) * 4 # 4 bytes per float + + # Launch kernel + grid = (nperiods, 1, 1) + block = (block_size, 1, 1) + + if use_keplerian: + # Keplerian kernel with qmin/qmax arrays and template + kernel_args = [ + memory.t_g, memory.y_g, memory.dy_g, + memory.periods_g, memory.qmin_g, memory.qmax_g, + memory.template_g, + np.int32(ndata), np.int32(nperiods), np.int32(n_durations), + np.int32(n_template), + memory.chi2_g, memory.best_t0_g, + memory.best_duration_g, memory.best_depth_g, + ] + else: + # Standard kernel with fixed duration range and template + kernel_args = [ + memory.t_g, memory.y_g, memory.dy_g, + memory.periods_g, + memory.template_g, + np.int32(ndata), np.int32(nperiods), + np.int32(n_template), + memory.chi2_g, memory.best_t0_g, + memory.best_duration_g, memory.best_depth_g, + ] + + kernel_kwargs = dict(block=block, grid=grid, shared=shared_mem_size) + if stream is not None: + kernel_kwargs['stream'] = stream + + kernel(*kernel_args, **kernel_kwargs) + + # Transfer results if requested + if transfer_to_host: + if stream is not None: + stream.synchronize() + memory.transfer_from_gpu(nperiods) + + chi2_vals = memory.chi2[:nperiods].copy() + best_t0_vals = memory.best_t0[:nperiods].copy() + best_duration_vals = memory.best_duration[:nperiods].copy() + best_depth_vals = memory.best_depth[:nperiods].copy() + + # Find best period + best_idx = np.argmin(chi2_vals) + best_period = periods[best_idx] + best_chi2 = chi2_vals[best_idx] + best_t0 = best_t0_vals[best_idx] + best_duration = best_duration_vals[best_idx] + best_depth = best_depth_vals[best_idx] + + # Estimate number of transits + T_span = np.max(t) - np.min(t) + n_transits = int(T_span / best_period) + + # Compute statistics + stats = tls_stats.compute_all_statistics( + chi2_vals, periods, best_idx, + best_depth, best_duration, n_transits + ) + + # Period uncertainty + period_uncertainty = tls_stats.compute_period_uncertainty( + periods, chi2_vals, best_idx + ) + + results = { + # Raw outputs + 'periods': periods, + 'chi2': chi2_vals, + 'best_t0_per_period': best_t0_vals, + 'best_duration_per_period': best_duration_vals, + 'best_depth_per_period': best_depth_vals, + + # Best-fit parameters + 'period': best_period, + 'period_uncertainty': period_uncertainty, + 'T0': best_t0, + 'duration': best_duration, + 'depth': best_depth, + 'chi2_min': best_chi2, + + # Statistics + 'SDE': stats['SDE'], + 'SDE_raw': stats['SDE_raw'], + 'SNR': stats['SNR'], + 'FAP': stats['FAP'], + 'power': stats['power'], + 'SR': stats['SR'], + + # Metadata + 'n_transits': n_transits, + 'R_star': R_star, + 'M_star': M_star, + } + else: + # Just return periods if not transferring + results = { + 'periods': periods, + 'chi2': None, + 'best_t0_per_period': None, + 'best_duration_per_period': None, + 'best_depth_per_period': None, + } + + return results + + +def tls_search(t, y, dy, **kwargs): + """ + High-level TLS search function. + + This is the main user-facing function for TLS searches. + + Parameters + ---------- + t, y, dy : array_like + Time series data + **kwargs + Passed to tls_search_gpu + + Returns + ------- + results : dict + Search results + + See Also + -------- + tls_search_gpu : Lower-level GPU function + tls_transit : Keplerian-aware search wrapper + """ + return tls_search_gpu(t, y, dy, **kwargs) + + +def tls_transit(t, y, dy, R_star=1.0, M_star=1.0, R_planet=1.0, + qmin_fac=0.5, qmax_fac=2.0, n_durations=15, + period_min=None, period_max=None, n_transits_min=2, + oversampling_factor=3, **kwargs): + """ + Transit Least Squares search with Keplerian duration constraints. + + This is the TLS analog of BLS's eebls_transit() function. It uses stellar + parameters to focus the duration search on physically plausible values, + providing ~7-8× efficiency improvement over fixed duration ranges. + + Parameters + ---------- + t : array_like + Observation times (days) + y : array_like + Flux measurements (arbitrary units) + dy : array_like + Flux uncertainties + R_star : float, optional + Stellar radius in solar radii (default: 1.0) + M_star : float, optional + Stellar mass in solar masses (default: 1.0) + R_planet : float, optional + Fiducial planet radius in Earth radii (default: 1.0) + Sets the central duration value around which to search + qmin_fac : float, optional + Minimum duration factor (default: 0.5) + Searches down to qmin_fac × q_keplerian + qmax_fac : float, optional + Maximum duration factor (default: 2.0) + Searches up to qmax_fac × q_keplerian + n_durations : int, optional + Number of duration samples per period (default: 15) + period_min, period_max : float, optional + Period search range (days). Auto-computed if None. + n_transits_min : int, optional + Minimum number of transits required (default: 2) + oversampling_factor : float, optional + Period grid oversampling (default: 3) + **kwargs + Additional parameters passed to tls_search_gpu + + Returns + ------- + results : dict + Search results with keys: + - 'period': Best-fit period + - 'T0': Best mid-transit time + - 'duration': Best transit duration + - 'depth': Best transit depth + - 'SDE': Signal Detection Efficiency + - 'periods': Trial periods + - 'chi2': Chi-squared values per period + ... (see tls_search_gpu for full list) + + Notes + ----- + This function automatically generates: + 1. Optimal period grid using Ofir (2014) algorithm + 2. Per-period duration ranges based on Keplerian physics + 3. Qmin/qmax arrays for focused duration search + + The duration search at each period focuses on physically plausible values: + - For short periods: searches shorter durations + - For long periods: searches longer durations + - Scales with stellar density (M_star, R_star) + + This is much more efficient than searching a fixed fractional duration + range (0.5%-15%) at all periods. + + Examples + -------- + >>> from cuvarbase import tls + >>> results = tls.tls_transit(t, y, dy, + ... R_star=1.0, M_star=1.0, + ... period_min=5.0, period_max=20.0) + >>> print(f"Best period: {results['period']:.4f} days") + >>> print(f"Transit depth: {results['depth']:.4f}") + + See Also + -------- + tls_search_gpu : Lower-level GPU function + tls_grids.duration_grid_keplerian : Generate Keplerian duration grids + tls_grids.q_transit : Calculate Keplerian fractional duration + """ + # Generate period grid + periods = tls_grids.period_grid_ofir( + t, R_star=R_star, M_star=M_star, + oversampling_factor=oversampling_factor, + period_min=period_min, period_max=period_max, + n_transits_min=n_transits_min + ) + + # Generate Keplerian duration constraints + durations, dur_counts, q_values = tls_grids.duration_grid_keplerian( + periods, R_star=R_star, M_star=M_star, R_planet=R_planet, + qmin_fac=qmin_fac, qmax_fac=qmax_fac, n_durations=n_durations + ) + + # Calculate qmin and qmax arrays + qmin = q_values * qmin_fac + qmax = q_values * qmax_fac + + # Run TLS search with Keplerian constraints + results = tls_search_gpu( + t, y, dy, + periods=periods, + qmin=qmin, + qmax=qmax, + n_durations=n_durations, + R_star=R_star, + M_star=M_star, + **kwargs + ) + + return results diff --git a/cuvarbase/tls_grids.py b/cuvarbase/tls_grids.py new file mode 100644 index 0000000..429ff57 --- /dev/null +++ b/cuvarbase/tls_grids.py @@ -0,0 +1,463 @@ +""" +Period and duration grid generation for Transit Least Squares. + +Implements the Ofir (2014) optimal frequency sampling algorithm and +logarithmically-spaced duration grids based on stellar parameters. + +References +---------- +.. [1] Ofir (2014), "An optimized transit detection algorithm to search + for periodic transits of small planets", A&A 561, A138 +.. [2] Hippke & Heller (2019), "Transit Least Squares", A&A 623, A39 +""" + +import numpy as np + + +# Physical constants +G = 6.67430e-11 # Gravitational constant (m^3 kg^-1 s^-2) +R_sun = 6.95700e8 # Solar radius (m) +M_sun = 1.98840e30 # Solar mass (kg) +R_earth = 6.371e6 # Earth radius (m) + + +def q_transit(period, R_star=1.0, M_star=1.0, R_planet=1.0): + """ + Calculate fractional transit duration (q = duration/period) for Keplerian orbit. + + This is the TLS analog of the BLS q parameter. For a circular, edge-on orbit, + the transit duration scales with stellar density and planet/star size ratio. + + Parameters + ---------- + period : float or array_like + Orbital period in days + R_star : float, optional + Stellar radius in solar radii (default: 1.0) + M_star : float, optional + Stellar mass in solar masses (default: 1.0) + R_planet : float, optional + Planet radius in Earth radii (default: 1.0) + + Returns + ------- + q : float or array_like + Fractional transit duration (duration/period) + + Notes + ----- + This follows the same Keplerian assumption as BLS but for TLS. + The duration is calculated for edge-on circular orbits and normalized by period. + + See Also + -------- + transit_duration_max : Calculate absolute transit duration + duration_grid_keplerian : Generate duration grid using Keplerian q values + """ + duration = transit_duration_max(period, R_star, M_star, R_planet) + return duration / period + + +def transit_duration_max(period, R_star=1.0, M_star=1.0, R_planet=1.0): + """ + Calculate maximum transit duration for circular orbit. + + Parameters + ---------- + period : float or array_like + Orbital period in days + R_star : float, optional + Stellar radius in solar radii (default: 1.0) + M_star : float, optional + Stellar mass in solar masses (default: 1.0) + R_planet : float, optional + Planet radius in Earth radii (default: 1.0) + + Returns + ------- + duration : float or array_like + Maximum transit duration in days (for edge-on circular orbit) + + Notes + ----- + Formula: T_14 = (R_star + R_planet) * (4 * P / (π * G * M_star))^(1/3) + + Assumes: + - Circular orbit (e = 0) + - Edge-on configuration (i = 90°) + - Planet + stellar radii contribute to transit chord + """ + period_sec = period * 86400.0 # Convert to seconds + R_total = R_star * R_sun + R_planet * R_earth # Total radius in meters + M_star_kg = M_star * M_sun # Mass in kg + + # Duration in seconds + duration_sec = R_total * (4.0 * period_sec / (np.pi * G * M_star_kg))**(1.0/3.0) + + # Convert to days + duration_days = duration_sec / 86400.0 + + return duration_days + + +def period_grid_ofir(t, R_star=1.0, M_star=1.0, oversampling_factor=3, + period_min=None, period_max=None, n_transits_min=2): + """ + Generate optimal period grid using Ofir (2014) algorithm. + + This creates a non-uniform period grid that optimally samples the + period space, with denser sampling at shorter periods where transit + durations are shorter. + + Parameters + ---------- + t : array_like + Observation times (days) + R_star : float, optional + Stellar radius in solar radii (default: 1.0) + M_star : float, optional + Stellar mass in solar masses (default: 1.0) + oversampling_factor : float, optional + Oversampling factor for period grid (default: 3) + Higher values give denser grids + period_min : float, optional + Minimum period to search (days). If None, calculated from + Roche limit and minimum transits + period_max : float, optional + Maximum period to search (days). If None, set to half the + total observation span + n_transits_min : int, optional + Minimum number of transits required (default: 2) + + Returns + ------- + periods : ndarray + Array of trial periods (days) + + Notes + ----- + Uses the Ofir (2014) frequency-to-cubic transformation: + + f_x = (A/3 * x + C)^3 + + where A = (2π)^(2/3) / π * R_star / (G * M_star)^(1/3) * 1/(S * OS) + + This ensures optimal statistical sampling across the period space. + """ + t = np.asarray(t) + T_span = np.max(t) - np.min(t) # Total observation span + + # Store user's requested limits (for filtering later) + user_period_min = period_min + user_period_max = period_max + + # Physical boundary conditions (following Ofir 2014 and CPU TLS) + # f_min: require n_transits_min transits over baseline + f_min = n_transits_min / (T_span * 86400.0) # 1/seconds + + # f_max: Roche limit (maximum possible frequency) + # P_roche = 2π * sqrt(a^3 / (G*M)) where a = 3*R at Roche limit + R_star_m = R_star * R_sun + M_star_kg = M_star * M_sun + f_max = 1.0 / (2.0 * np.pi) * np.sqrt(G * M_star_kg / (3.0 * R_star_m)**3) + + # Ofir (2014) parameters - equations (5), (6), (7) + T_span_sec = T_span * 86400.0 # Convert to seconds + + # Equation (5): optimal frequency sampling parameter + A = ((2.0 * np.pi)**(2.0/3.0) / np.pi * R_star_m / + (G * M_star_kg)**(1.0/3.0) / (T_span_sec * oversampling_factor)) + + # Equation (6): offset parameter + C = f_min**(1.0/3.0) - A / 3.0 + + # Equation (7): optimal number of frequency samples + n_freq = int(np.ceil((f_max**(1.0/3.0) - f_min**(1.0/3.0) + A / 3.0) * 3.0 / A)) + + # Ensure we have at least some frequencies + if n_freq < 10: + n_freq = 10 + + # Linear grid in cubic-root frequency space + x = np.arange(n_freq) + 1 # 1-indexed like CPU TLS + + # Transform to frequency space (Hz) + freqs = (A / 3.0 * x + C)**3 + + # Convert to periods (days) + periods = 1.0 / freqs / 86400.0 + + # Apply user-requested period limits + if user_period_min is not None or user_period_max is not None: + if user_period_min is None: + user_period_min = 0.0 + if user_period_max is None: + user_period_max = np.inf + + periods = periods[(periods > user_period_min) & (periods <= user_period_max)] + + # If we somehow got no periods, use simple linear grid + if len(periods) == 0: + if user_period_min is None: + user_period_min = T_span / 20.0 + if user_period_max is None: + user_period_max = T_span / 2.0 + periods = np.linspace(user_period_min, user_period_max, 100) + + # Sort in increasing order (standard convention) + periods = np.sort(periods) + + return periods + + +def duration_grid(periods, R_star=1.0, M_star=1.0, R_planet_min=0.5, + R_planet_max=5.0, duration_grid_step=1.1): + """ + Generate logarithmically-spaced duration grid for each period. + + Parameters + ---------- + periods : array_like + Trial periods (days) + R_star : float, optional + Stellar radius in solar radii (default: 1.0) + M_star : float, optional + Stellar mass in solar masses (default: 1.0) + R_planet_min : float, optional + Minimum planet radius to consider in Earth radii (default: 0.5) + R_planet_max : float, optional + Maximum planet radius to consider in Earth radii (default: 5.0) + duration_grid_step : float, optional + Multiplicative step for duration grid (default: 1.1) + 1.1 means each duration is 10% larger than previous + + Returns + ------- + durations : list of ndarray + List where durations[i] is array of durations for periods[i] + duration_counts : ndarray + Number of durations for each period + + Notes + ----- + Durations are sampled logarithmically from the minimum transit time + (small planet) to maximum transit time (large planet) for each period. + + The grid spacing ensures we don't miss any transit duration while + avoiding excessive oversampling. + """ + periods = np.asarray(periods) + + # Calculate duration bounds for each period + T_min = transit_duration_max(periods, R_star, M_star, R_planet_min) + T_max = transit_duration_max(periods, R_star, M_star, R_planet_max) + + durations = [] + duration_counts = np.zeros(len(periods), dtype=np.int32) + + for i, (period, t_min, t_max) in enumerate(zip(periods, T_min, T_max)): + # Generate logarithmically-spaced durations + dur = [] + t = t_min + while t <= t_max: + dur.append(t) + t *= duration_grid_step + + # Ensure we include the maximum duration + if dur[-1] < t_max: + dur.append(t_max) + + durations.append(np.array(dur, dtype=np.float32)) + duration_counts[i] = len(dur) + + return durations, duration_counts + + +def duration_grid_keplerian(periods, R_star=1.0, M_star=1.0, R_planet=1.0, + qmin_fac=0.5, qmax_fac=2.0, n_durations=15): + """ + Generate Keplerian-aware duration grid for each period. + + This is the TLS analog of BLS's Keplerian q-based duration search. + At each period, we calculate the expected transit duration for a + Keplerian orbit and search within qmin_fac to qmax_fac times that value. + + Parameters + ---------- + periods : array_like + Trial periods (days) + R_star : float, optional + Stellar radius in solar radii (default: 1.0) + M_star : float, optional + Stellar mass in solar masses (default: 1.0) + R_planet : float, optional + Fiducial planet radius in Earth radii (default: 1.0) + This sets the central duration value around which we search + qmin_fac : float, optional + Minimum duration factor (default: 0.5) + Searches down to qmin_fac * q_keplerian + qmax_fac : float, optional + Maximum duration factor (default: 2.0) + Searches up to qmax_fac * q_keplerian + n_durations : int, optional + Number of duration samples per period (default: 15) + Logarithmically spaced between qmin and qmax + + Returns + ------- + durations : list of ndarray + List where durations[i] is array of durations for periods[i] + duration_counts : ndarray + Number of durations for each period (constant = n_durations) + q_values : ndarray + Keplerian q values (duration/period) for each period + + Notes + ----- + This exploits the Keplerian assumption that transit duration scales + predictably with period based on stellar parameters. This is much + more efficient than searching all possible durations, as we focus + the search around the physically expected value. + + For example, for a Sun-like star (M=1, R=1) and Earth-size planet: + - At P=10 days: q ~ 0.015, so we search 0.0075 to 0.030 (0.5x to 2x) + - At P=100 days: q ~ 0.027, so we search 0.014 to 0.054 + + This is equivalent to BLS's approach but applied to transit shapes. + + See Also + -------- + q_transit : Calculate Keplerian fractional transit duration + duration_grid : Alternative method that searches fixed planet radius range + """ + periods = np.asarray(periods) + + # Calculate Keplerian q value (fractional duration) for each period + q_values = q_transit(periods, R_star, M_star, R_planet) + + # Duration bounds based on q-factors + qmin_vals = q_values * qmin_fac + qmax_vals = q_values * qmax_fac + + durations = [] + duration_counts = np.full(len(periods), n_durations, dtype=np.int32) + + for period, qmin, qmax in zip(periods, qmin_vals, qmax_vals): + # Logarithmically-spaced durations from qmin to qmax + # (in absolute time, not fractional) + dur_min = qmin * period + dur_max = qmax * period + + # Log-spaced grid + dur = np.logspace(np.log10(dur_min), np.log10(dur_max), + n_durations, dtype=np.float32) + + durations.append(dur) + + return durations, duration_counts, q_values + + +def t0_grid(period, duration, n_transits=None, oversampling=5): + """ + Generate grid of T0 (mid-transit time) positions to test. + + Parameters + ---------- + period : float + Orbital period (days) + duration : float + Transit duration (days) + n_transits : int, optional + Number of transits in observation span. If None, assumes + you want to sample one full period cycle. + oversampling : int, optional + Number of T0 positions to test per transit duration (default: 5) + + Returns + ------- + t0_values : ndarray + Array of T0 positions (in phase, 0 to 1) + + Notes + ----- + This creates a grid of phase offsets to test. The spacing is + determined by the transit duration and oversampling factor. + + For computational efficiency, we typically use stride sampling + (not every possible phase offset). + """ + # Phase-space duration + q = duration / period + + # Step size in phase + step = q / oversampling + + # Number of steps to cover one full period + if n_transits is not None: + n_steps = int(np.ceil(1.0 / (step * n_transits))) + else: + n_steps = int(np.ceil(1.0 / step)) + + # Grid from 0 to 1 (phase) + t0_values = np.linspace(0, 1 - step, n_steps, dtype=np.float32) + + return t0_values + + +def validate_stellar_parameters(R_star=1.0, M_star=1.0, + R_star_min=0.13, R_star_max=3.5, + M_star_min=0.1, M_star_max=2.0): + """ + Validate stellar parameters are within reasonable bounds. + + Parameters + ---------- + R_star : float + Stellar radius in solar radii + M_star : float + Stellar mass in solar masses + R_star_min, R_star_max : float + Allowed range for stellar radius + M_star_min, M_star_max : float + Allowed range for stellar mass + + Raises + ------ + ValueError + If parameters are outside allowed ranges + """ + if not (R_star_min <= R_star <= R_star_max): + raise ValueError(f"R_star={R_star} outside allowed range " + f"[{R_star_min}, {R_star_max}] solar radii") + + if not (M_star_min <= M_star <= M_star_max): + raise ValueError(f"M_star={M_star} outside allowed range " + f"[{M_star_min}, {M_star_max}] solar masses") + + +def estimate_n_evaluations(periods, durations, t0_oversampling=5): + """ + Estimate total number of chi-squared evaluations. + + Parameters + ---------- + periods : array_like + Trial periods + durations : list of array_like + Duration grids for each period + t0_oversampling : int + T0 grid oversampling factor + + Returns + ------- + n_total : int + Total number of evaluations (P × D × T0) + """ + n_total = 0 + for i, period in enumerate(periods): + n_durations = len(durations[i]) + for duration in durations[i]: + t0_vals = t0_grid(period, duration, oversampling=t0_oversampling) + n_total += len(t0_vals) + + return n_total diff --git a/cuvarbase/tls_models.py b/cuvarbase/tls_models.py new file mode 100644 index 0000000..79f6d2b --- /dev/null +++ b/cuvarbase/tls_models.py @@ -0,0 +1,476 @@ +""" +Transit model generation for TLS. + +This module handles creation of physically realistic transit light curves +using the Batman package for limb-darkened transits. + +References +---------- +.. [1] Kreidberg (2015), "batman: BAsic Transit Model cAlculatioN in Python", + PASP 127, 1161 +.. [2] Mandel & Agol (2002), "Analytic Light Curves for Planetary Transit + Searches", ApJ 580, L171 +""" + +import numpy as np +try: + import batman + BATMAN_AVAILABLE = True +except ImportError: + BATMAN_AVAILABLE = False + import warnings + warnings.warn("batman package not available. Install with: pip install batman-package") + + +def create_reference_transit(n_samples=1000, limb_dark='quadratic', + u=[0.4804, 0.1867]): + """ + Create a reference transit model normalized to Earth-like transit. + + This generates a high-resolution transit template that can be scaled + and interpolated for different durations and depths. + + Parameters + ---------- + n_samples : int, optional + Number of samples in the model (default: 1000) + limb_dark : str, optional + Limb darkening law (default: 'quadratic') + Options: 'uniform', 'linear', 'quadratic', 'nonlinear' + u : list, optional + Limb darkening coefficients (default: [0.4804, 0.1867]) + Default values are for Sun-like star in Kepler bandpass + + Returns + ------- + phases : ndarray + Phase values (0 to 1) + flux : ndarray + Normalized flux (1.0 = out of transit, <1.0 = in transit) + + Notes + ----- + The reference model assumes: + - Period = 1.0 (arbitrary units, we work in phase) + - Semi-major axis = 1.0 (normalized) + - Planet-to-star radius ratio scaled to produce unit depth + """ + if not BATMAN_AVAILABLE: + raise ImportError("batman package required for transit models. " + "Install with: pip install batman-package") + + # Batman parameters for reference transit + params = batman.TransitParams() + + # Fixed parameters (Earth-like) + params.t0 = 0.0 # Mid-transit time + params.per = 1.0 # Period (arbitrary, we use phase) + params.rp = 0.1 # Planet-to-star radius ratio (will normalize) + params.a = 15.0 # Semi-major axis in stellar radii (typical) + params.inc = 90.0 # Inclination (degrees) - edge-on + params.ecc = 0.0 # Eccentricity - circular + params.w = 90.0 # Longitude of periastron + params.limb_dark = limb_dark # Limb darkening model + params.u = u # Limb darkening coefficients + + # Create time array spanning the transit + # For a = 15, duration is approximately 0.05 in phase units + # We'll create a grid from -0.1 to 0.1 (well beyond transit) + t = np.linspace(-0.15, 0.15, n_samples) + + # Generate model + m = batman.TransitModel(params, t) + flux = m.light_curve(params) + + # Normalize: shift so out-of-transit = 1.0, in-transit depth = 1.0 at center + flux_oot = flux[0] # Out of transit flux + depth = flux_oot - np.min(flux) # Transit depth + + if depth < 1e-10: + raise ValueError("Transit depth too small - check parameters") + + flux_normalized = (flux - flux_oot) / depth + 1.0 + + # Convert time to phase (0 to 1) + phases = (t - t[0]) / (t[-1] - t[0]) + + return phases, flux_normalized + + +def create_transit_model_cache(durations, period=1.0, n_samples=1000, + limb_dark='quadratic', u=[0.4804, 0.1867], + R_star=1.0, M_star=1.0): + """ + Create cache of transit models for different durations. + + Parameters + ---------- + durations : array_like + Array of transit durations (days) to cache + period : float, optional + Reference period (days) - used for scaling (default: 1.0) + n_samples : int, optional + Number of samples per model (default: 1000) + limb_dark : str, optional + Limb darkening law (default: 'quadratic') + u : list, optional + Limb darkening coefficients (default: [0.4804, 0.1867]) + R_star : float, optional + Stellar radius in solar radii (default: 1.0) + M_star : float, optional + Stellar mass in solar masses (default: 1.0) + + Returns + ------- + models : list of ndarray + List of flux arrays for each duration + phases : ndarray + Phase array (same for all models) + + Notes + ----- + This creates models at different durations by adjusting the semi-major + axis in the batman model to produce the desired transit duration. + """ + if not BATMAN_AVAILABLE: + raise ImportError("batman package required for transit models") + + durations = np.asarray(durations) + models = [] + + for duration in durations: + # Create batman parameters + params = batman.TransitParams() + params.t0 = 0.0 + params.per = period + params.rp = 0.1 # Will be scaled later + params.inc = 90.0 + params.ecc = 0.0 + params.w = 90.0 + params.limb_dark = limb_dark + params.u = u + + # Calculate semi-major axis to produce desired duration + # T_14 ≈ (P/π) * arcsin(R_star/a) for edge-on transit + # Approximation: a ≈ R_star * P / (π * duration) + a = R_star * period / (np.pi * duration) + params.a = max(a, 1.5) # Ensure a > R_star + R_planet + + # Create time array + t = np.linspace(-0.15, 0.15, n_samples) + + # Generate model + m = batman.TransitModel(params, t) + flux = m.light_curve(params) + + # Normalize + flux_oot = flux[0] + depth = flux_oot - np.min(flux) + + if depth < 1e-10: + # If depth is too small, use reference model + phases, flux_normalized = create_reference_transit( + n_samples, limb_dark, u) + else: + flux_normalized = (flux - flux_oot) / depth + 1.0 + phases = (t - t[0]) / (t[-1] - t[0]) + + models.append(flux_normalized.astype(np.float32)) + + return models, phases.astype(np.float32) + + +def simple_trapezoid_transit(phases, duration_phase, depth=1.0, + ingress_duration=0.1): + """ + Create a simple trapezoidal transit model (fast, no Batman needed). + + This is a simplified model for testing or when Batman is not available. + + Parameters + ---------- + phases : array_like + Phase values (0 to 1) + duration_phase : float + Total transit duration in phase units + depth : float, optional + Transit depth (default: 1.0) + ingress_duration : float, optional + Ingress/egress duration as fraction of total duration (default: 0.1) + + Returns + ------- + flux : ndarray + Flux values (1.0 = out of transit) + + Notes + ----- + This creates a trapezoid with linear ingress/egress. It's much faster + than Batman but less physically accurate (no limb darkening). + """ + phases = np.asarray(phases) + flux = np.ones_like(phases, dtype=np.float32) + + # Calculate ingress/egress duration + t_ingress = duration_phase * ingress_duration + t_flat = duration_phase * (1.0 - 2.0 * ingress_duration) + + # Transit centered at phase = 0.5 + t1 = 0.5 - duration_phase / 2.0 # Start of ingress + t2 = t1 + t_ingress # Start of flat bottom + t3 = t2 + t_flat # Start of egress + t4 = t3 + t_ingress # End of transit + + # Ingress + mask_ingress = (phases >= t1) & (phases < t2) + flux[mask_ingress] = 1.0 - depth * (phases[mask_ingress] - t1) / t_ingress + + # Flat bottom + mask_flat = (phases >= t2) & (phases < t3) + flux[mask_flat] = 1.0 - depth + + # Egress + mask_egress = (phases >= t3) & (phases < t4) + flux[mask_egress] = 1.0 - depth * (t4 - phases[mask_egress]) / t_ingress + + return flux + + +def interpolate_transit_model(model_phases, model_flux, target_phases, + target_depth=1.0): + """ + Interpolate a transit model to new phase grid and scale depth. + + Parameters + ---------- + model_phases : array_like + Phase values of the template model + model_flux : array_like + Flux values of the template model + target_phases : array_like + Desired phase values for interpolation + target_depth : float, optional + Desired transit depth (default: 1.0) + + Returns + ------- + flux : ndarray + Interpolated and scaled flux values + + Notes + ----- + Uses linear interpolation. For GPU implementation, texture memory + with hardware interpolation would be faster. + """ + # Interpolate to target phases + flux_interp = np.interp(target_phases, model_phases, model_flux) + + # Scale depth: current depth is (1.0 - min(model_flux)) + current_depth = 1.0 - np.min(model_flux) + + if current_depth < 1e-10: + return flux_interp + + # Scale: flux = 1 - target_depth * (1 - flux_normalized) + flux_scaled = 1.0 - target_depth * (1.0 - flux_interp) + + return flux_scaled.astype(np.float32) + + +def generate_transit_template(n_template=1000, limb_dark='quadratic', + u=[0.4804, 0.1867]): + """ + Generate a 1D transit template for use in the GPU TLS kernel. + + The template maps transit_coord in [-1, 1] (edge-to-edge of transit) + to a normalized depth value in [0, 1] where 0 = no dimming (edges) + and 1 = maximum dimming (center, with limb darkening). + + Parameters + ---------- + n_template : int, optional + Number of points in the template (default: 1000) + limb_dark : str, optional + Limb darkening law (default: 'quadratic') + u : list, optional + Limb darkening coefficients (default: [0.4804, 0.1867]) + + Returns + ------- + template : ndarray + Float32 array of shape (n_template,) with values in [0, 1]. + Index 0 corresponds to transit_coord = -1 (leading edge), + index n_template-1 corresponds to transit_coord = +1 (trailing edge). + """ + transit_coords = np.linspace(-1.0, 1.0, n_template) + + if BATMAN_AVAILABLE: + try: + # Generate a batman transit model + phases, flux = create_reference_transit( + n_samples=5000, limb_dark=limb_dark, u=u + ) + + # Find the in-transit region (where flux < 1.0 - small threshold) + threshold = 1e-6 + in_transit = flux < (1.0 - threshold) + + if not np.any(in_transit): + # Fallback to trapezoid if no transit detected + return _trapezoid_template(n_template) + + # Get the in-transit indices + transit_indices = np.where(in_transit)[0] + i_start = transit_indices[0] + i_end = transit_indices[-1] + + # Extract in-transit portion + transit_phases = phases[i_start:i_end + 1] + transit_flux = flux[i_start:i_end + 1] + + # Map transit phases to transit_coord [-1, 1] + phase_center = 0.5 * (transit_phases[0] + transit_phases[-1]) + phase_half_width = 0.5 * (transit_phases[-1] - transit_phases[0]) + + if phase_half_width < 1e-10: + return _trapezoid_template(n_template) + + source_coords = (transit_phases - phase_center) / phase_half_width + + # Depth values: 0 = no dimming, 1 = max dimming + depth_values = 1.0 - transit_flux + + # Normalize so max = 1 + max_depth = np.max(depth_values) + if max_depth < 1e-10: + return _trapezoid_template(n_template) + depth_values /= max_depth + + # Resample to uniform transit_coord grid + template = np.interp(transit_coords, source_coords, depth_values, + left=0.0, right=0.0) + + return template.astype(np.float32) + + except Exception: + return _trapezoid_template(n_template) + else: + return _trapezoid_template(n_template) + + +def _trapezoid_template(n_template=1000, ingress_fraction=0.1): + """ + Generate a trapezoidal transit template as fallback. + + Parameters + ---------- + n_template : int + Number of template points + ingress_fraction : float + Fraction of transit that is ingress/egress (each side) + + Returns + ------- + template : ndarray + Float32 array of shape (n_template,) with values in [0, 1]. + """ + transit_coords = np.linspace(-1.0, 1.0, n_template) + template = np.zeros(n_template, dtype=np.float32) + + # Trapezoidal shape: ramp up during ingress, flat bottom, ramp down during egress + edge_inner = 1.0 - 2.0 * ingress_fraction # Where flat bottom starts/ends + + for i in range(n_template): + coord = abs(transit_coords[i]) + if coord <= edge_inner: + template[i] = 1.0 # Flat bottom (max depth) + elif coord <= 1.0: + # Linear ramp from 1 to 0 during ingress/egress + template[i] = (1.0 - coord) / (1.0 - edge_inner) + else: + template[i] = 0.0 + + return template + + +def get_default_limb_darkening(filter='Kepler', T_eff=5500): + """ + Get default limb darkening coefficients for common filters and T_eff. + + Parameters + ---------- + filter : str, optional + Filter name: 'Kepler', 'TESS', 'Johnson_V', etc. (default: 'Kepler') + T_eff : float, optional + Effective temperature (K) (default: 5500) + + Returns + ------- + u : list + Quadratic limb darkening coefficients [u1, u2] + + Notes + ----- + These are approximate values. For precise work, calculate coefficients + for your specific stellar parameters using packages like ldtk. + + Values from Claret & Bloemen (2011), A&A 529, A75 + """ + # Simple lookup table for common cases + # Format: {filter: {T_eff_range: [u1, u2]}} + + if filter == 'Kepler': + if T_eff < 4500: + return [0.7, 0.1] # Cool stars + elif T_eff < 6000: + return [0.4804, 0.1867] # Solar-type + else: + return [0.3, 0.2] # Hot stars + + elif filter == 'TESS': + if T_eff < 4500: + return [0.5, 0.2] + elif T_eff < 6000: + return [0.3, 0.3] + else: + return [0.2, 0.3] + + else: + # Default to Solar-type in Kepler + return [0.4804, 0.1867] + + +def validate_limb_darkening_coeffs(u, limb_dark='quadratic'): + """ + Validate limb darkening coefficients are physically reasonable. + + Parameters + ---------- + u : list + Limb darkening coefficients + limb_dark : str + Limb darkening law + + Raises + ------ + ValueError + If coefficients are unphysical + """ + u = np.asarray(u) + + if limb_dark == 'quadratic': + if len(u) != 2: + raise ValueError("Quadratic limb darkening requires 2 coefficients") + # Physical constraints: 0 < u1 + u2 < 1, u1 > 0, u1 + 2*u2 > 0 + if not (0 < u[0] + u[1] < 1): + raise ValueError(f"u1 + u2 = {u[0] + u[1]} must be in (0, 1)") + if not (u[0] > 0): + raise ValueError(f"u1 = {u[0]} must be > 0") + if not (u[0] + 2*u[1] > 0): + raise ValueError(f"u1 + 2*u2 = {u[0] + 2*u[1]} must be > 0") + + elif limb_dark == 'linear': + if len(u) != 1: + raise ValueError("Linear limb darkening requires 1 coefficient") + if not (0 < u[0] < 1): + raise ValueError(f"u = {u[0]} must be in (0, 1)") diff --git a/cuvarbase/tls_stats.py b/cuvarbase/tls_stats.py new file mode 100644 index 0000000..b3d9fe6 --- /dev/null +++ b/cuvarbase/tls_stats.py @@ -0,0 +1,448 @@ +""" +Statistical calculations for Transit Least Squares. + +Implements Signal Detection Efficiency (SDE), Signal-to-Noise Ratio (SNR), +False Alarm Probability (FAP), and related metrics. + +References +---------- +.. [1] Hippke & Heller (2019), A&A 623, A39 +.. [2] Kovács et al. (2002), A&A 391, 369 +""" + +import numpy as np +from scipy import signal, stats + + +def signal_residue(chi2, chi2_null=None): + """ + Calculate Signal Residue (SR). + + SR = 1 - chi²_signal / chi²_null, where higher = stronger signal. + + Parameters + ---------- + chi2 : array_like + Chi-squared values at each period + chi2_null : float, optional + Null hypothesis chi-squared (constant model) + If None, uses maximum chi2 value + + Returns + ------- + SR : ndarray + Signal residue values. 0 = no signal, higher = stronger. + + Notes + ----- + Higher SR values indicate stronger signals. + SR ~ 0 means chi² is close to the null model. + """ + chi2 = np.asarray(chi2) + + if chi2_null is None: + chi2_null = np.max(chi2) + + SR = 1.0 - chi2 / (chi2_null + 1e-10) + + return SR + + +def signal_detection_efficiency(chi2, chi2_null=None, detrend=True, + window_length=None): + """ + Calculate Signal Detection Efficiency (SDE). + + SDE measures how many standard deviations above the noise + the signal is. Higher SDE = more significant detection. + + Parameters + ---------- + chi2 : array_like + Chi-squared values at each period + chi2_null : float, optional + Null hypothesis chi-squared + detrend : bool, optional + Apply median filter detrending (default: True) + window_length : int, optional + Window length for median filter (default: len(chi2)//10) + + Returns + ------- + SDE : float + Signal detection efficiency (z-score) + SDE_raw : float + Raw SDE before detrending + power : ndarray + Detrended power spectrum (if detrend=True) + + Notes + ----- + SDE is essentially a z-score: + SDE = (max(SR) - mean(SR)) / std(SR) + + Typical threshold: SDE > 7 for 1% false alarm probability + """ + chi2 = np.asarray(chi2) + + # Calculate signal residue + SR = signal_residue(chi2, chi2_null) + + # Raw SDE (before detrending) + mean_SR = np.mean(SR) + std_SR = np.std(SR) + + if std_SR < 1e-10: + SDE_raw = 0.0 + else: + SDE_raw = (np.max(SR) - mean_SR) / std_SR + + # Detrend with median filter if requested + if detrend: + if window_length is None: + window_length = max(len(SR) // 10, 3) + # Ensure odd window + if window_length % 2 == 0: + window_length += 1 + + # Apply median filter to remove trends + SR_trend = signal.medfilt(SR, kernel_size=window_length) + + # Detrended signal residue + SR_detrended = SR - SR_trend + np.median(SR) + + # Calculate SDE on detrended signal + mean_SR_detrended = np.mean(SR_detrended) + std_SR_detrended = np.std(SR_detrended) + + if std_SR_detrended < 1e-10: + SDE = 0.0 + else: + SDE = (np.max(SR_detrended) - mean_SR_detrended) / std_SR_detrended + + power = SR_detrended + else: + SDE = SDE_raw + power = SR + + return SDE, SDE_raw, power + + +def signal_to_noise(depth, depth_err=None, n_transits=1, + chi2_null=None, chi2_best=None): + """ + Calculate signal-to-noise ratio. + + Parameters + ---------- + depth : float + Transit depth + depth_err : float, optional + Uncertainty in depth. If None, estimated from chi2 values or + Poisson statistics as a last resort. + n_transits : int, optional + Number of transits (default: 1) + chi2_null : float, optional + Null hypothesis chi-squared (no transit). Used to estimate + depth_err when depth_err is not provided. + chi2_best : float, optional + Best-fit chi-squared. Used with chi2_null to estimate depth_err. + + Returns + ------- + snr : float + Signal-to-noise ratio + + Notes + ----- + SNR improves as sqrt(n_transits) for independent transits. + + When depth_err is not provided, it is estimated as: + depth / sqrt(chi2_null - chi2_best) if chi2 values are given, + otherwise returns 0. + """ + if depth_err is None: + if chi2_null is not None and chi2_best is not None: + delta_chi2 = chi2_null - chi2_best + if delta_chi2 > 0: + depth_err = depth / np.sqrt(delta_chi2) + else: + return 0.0 + else: + return 0.0 + + if depth_err < 1e-10: + return 0.0 + + snr = depth / depth_err * np.sqrt(n_transits) + + return snr + + +def false_alarm_probability(SDE, method='empirical'): + """ + Estimate False Alarm Probability from SDE. + + Parameters + ---------- + SDE : float + Signal Detection Efficiency + method : str, optional + Method for FAP estimation (default: 'empirical') + - 'empirical': From Hippke & Heller calibration + - 'gaussian': Assuming Gaussian noise + + Returns + ------- + FAP : float + False Alarm Probability + + Notes + ----- + Empirical calibration from Hippke & Heller (2019): + - SDE = 7 -> FAP ~ 1% + - SDE = 9 -> FAP ~ 0.1% + - SDE = 11 -> FAP ~ 0.01% + + These values are approximate. For rigorous FAP estimation, + injection-recovery simulations are recommended. + """ + if method == 'gaussian': + # Gaussian approximation: FAP = 1 - erf(SDE/sqrt(2)) + FAP = 1.0 - stats.norm.cdf(SDE) + else: + # Empirical calibration from Hippke & Heller (2019) + # Rough approximation based on their Figure 5 + if SDE < 5: + FAP = 1.0 # Very high FAP + elif SDE < 7: + FAP = 10 ** (-0.5 * (SDE - 5)) # ~10% at SDE=5, ~1% at SDE=7 + else: + FAP = 10 ** (-(SDE - 5)) # Exponential decrease + + # Clip to reasonable range + FAP = np.clip(FAP, 1e-10, 1.0) + + return FAP + + +def odd_even_mismatch(depths_odd, depths_even): + """ + Calculate odd-even transit depth mismatch. + + This tests whether odd and even transits have significantly + different depths, which could indicate: + - Binary system + - Non-planetary signal + - Instrumental effects + + Parameters + ---------- + depths_odd : array_like + Depths of odd-numbered transits + depths_even : array_like + Depths of even-numbered transits + + Returns + ------- + mismatch : float + Significance of mismatch (z-score) + depth_diff : float + Difference between mean depths + + Notes + ----- + High mismatch (>3σ) suggests the signal may not be planetary. + """ + depths_odd = np.asarray(depths_odd) + depths_even = np.asarray(depths_even) + + mean_odd = np.mean(depths_odd) + mean_even = np.mean(depths_even) + + std_odd = np.std(depths_odd) / np.sqrt(len(depths_odd)) + std_even = np.std(depths_even) / np.sqrt(len(depths_even)) + + depth_diff = mean_odd - mean_even + combined_std = np.sqrt(std_odd**2 + std_even**2) + + if combined_std < 1e-10: + return 0.0, 0.0 + + mismatch = np.abs(depth_diff) / combined_std + + return mismatch, depth_diff + + +def compute_all_statistics(chi2, periods, best_period_idx, + depth, duration, n_transits, + depths_per_transit=None): + """ + Compute all TLS statistics for a search result. + + Parameters + ---------- + chi2 : array_like + Chi-squared values at each period + periods : array_like + Trial periods + best_period_idx : int + Index of best period + depth : float + Best-fit transit depth + duration : float + Best-fit transit duration + n_transits : int + Number of transits at best period + depths_per_transit : array_like, optional + Individual transit depths + + Returns + ------- + stats : dict + Dictionary with all statistics: + - SDE: Signal Detection Efficiency + - SDE_raw: Raw SDE before detrending + - SNR: Signal-to-noise ratio + - FAP: False Alarm Probability + - power: Detrended power spectrum + - SR: Signal residue + - odd_even_mismatch: Odd/even depth difference (if available) + """ + # Signal residue and SDE + SDE, SDE_raw, power = signal_detection_efficiency(chi2, detrend=True) + + SR = signal_residue(chi2) + + # SNR (use chi2 values for depth_err estimation) + chi2_null = np.max(chi2) + chi2_best = chi2[best_period_idx] + SNR = signal_to_noise(depth, n_transits=n_transits, + chi2_null=chi2_null, chi2_best=chi2_best) + + # FAP + FAP = false_alarm_probability(SDE) + + # Compile statistics + stats = { + 'SDE': SDE, + 'SDE_raw': SDE_raw, + 'SNR': SNR, + 'FAP': FAP, + 'power': power, + 'SR': SR, + 'best_period': periods[best_period_idx], + 'best_chi2': chi2[best_period_idx], + } + + # Odd-even mismatch if per-transit depths available + if depths_per_transit is not None and len(depths_per_transit) > 2: + depths = np.asarray(depths_per_transit) + n = len(depths) + + if n >= 4: # Need at least 2 odd and 2 even + depths_odd = depths[::2] + depths_even = depths[1::2] + + mismatch, diff = odd_even_mismatch(depths_odd, depths_even) + stats['odd_even_mismatch'] = mismatch + stats['odd_even_depth_diff'] = diff + else: + stats['odd_even_mismatch'] = 0.0 + stats['odd_even_depth_diff'] = 0.0 + + return stats + + +def compute_period_uncertainty(periods, chi2, best_idx, threshold=1.0): + """ + Estimate period uncertainty using FWHM approach. + + Parameters + ---------- + periods : array_like + Trial periods + chi2 : array_like + Chi-squared values + best_idx : int + Index of minimum chi² + threshold : float, optional + Chi² increase threshold for FWHM (default: 1.0) + + Returns + ------- + uncertainty : float + Period uncertainty (half-width at threshold) + + Notes + ----- + Finds the width of the chi² minimum at threshold above minimum. + Default threshold=1 corresponds to 1σ for Gaussian errors. + """ + periods = np.asarray(periods) + chi2 = np.asarray(chi2) + + chi2_min = chi2[best_idx] + chi2_thresh = chi2_min + threshold + + # Find points below threshold + below = chi2 < chi2_thresh + + if not np.any(below): + # If no points below threshold, use grid spacing + if len(periods) > 1: + return np.abs(periods[1] - periods[0]) + else: + return 0.1 * periods[best_idx] + + # Find continuous region around best_idx + # Walk left from best_idx + left_idx = best_idx + while left_idx > 0 and below[left_idx]: + left_idx -= 1 + + # Walk right from best_idx + right_idx = best_idx + while right_idx < len(periods) - 1 and below[right_idx]: + right_idx += 1 + + # Uncertainty is half the width + width = periods[right_idx] - periods[left_idx] + uncertainty = width / 2.0 + + return uncertainty + + +def pink_noise_correction(snr, n_transits, correlation_length=1): + """ + Correct SNR for correlated (pink) noise. + + Parameters + ---------- + snr : float + White noise SNR + n_transits : int + Number of transits + correlation_length : float, optional + Correlation length in transit durations (default: 1) + + Returns + ------- + snr_pink : float + Pink noise corrected SNR + + Notes + ----- + Pink noise (correlated noise) reduces effective SNR because + neighboring points are not independent. + + Correction factor ≈ sqrt(correlation_length / n_points_per_transit) + """ + if correlation_length <= 0: + return snr + + # Approximate correction + correction = np.sqrt(correlation_length) + snr_pink = snr / correction + + return snr_pink diff --git a/docs/RUNPOD_DEVELOPMENT.md b/docs/RUNPOD_DEVELOPMENT.md index 116d09d..209fee3 100644 --- a/docs/RUNPOD_DEVELOPMENT.md +++ b/docs/RUNPOD_DEVELOPMENT.md @@ -178,6 +178,89 @@ nvcc --version Most RunPod templates include CUDA by default. +**Common Issue**: `nvcc` not in PATH. Add CUDA to PATH before running: + +```bash +export PATH=/usr/local/cuda/bin:$PATH +``` + +Or add to your `~/.bashrc` on RunPod for persistence. + +### scikit-cuda + numpy 2.x Compatibility + +If you encounter `AttributeError: module 'numpy' has no attribute 'typeDict'`: + +This is a known issue with scikit-cuda 0.5.3 and numpy 2.x. The `setup-remote.sh` script attempts to patch this automatically. If the patch fails, you can manually fix it: + +```bash +ssh -p ${RUNPOD_SSH_PORT} ${RUNPOD_SSH_USER}@${RUNPOD_SSH_HOST} +python3 << 'PYEOF' +# Read the file +with open('/usr/local/lib/python3.12/dist-packages/skcuda/misc.py', 'r') as f: + lines = f.readlines() + +# Find and replace the problematic section +new_lines = [] +i = 0 +while i < len(lines): + if 'num_types = [np.sctypeDict[t] for t in' in lines[i] or 'num_types = [np.typeDict[t] for t in' in lines[i]: + new_lines.append('# Fixed for numpy 2.x compatibility\n') + new_lines.append('num_types = []\n') + new_lines.append('for t in np.typecodes["AllInteger"]+np.typecodes["AllFloat"]:\n') + new_lines.append(' try:\n') + new_lines.append(' num_types.append(np.dtype(t).type)\n') + new_lines.append(' except (KeyError, TypeError):\n') + new_lines.append(' pass\n') + if i+1 < len(lines) and 'np.typecodes' in lines[i+1]: + i += 1 + i += 1 + else: + new_lines.append(lines[i]) + i += 1 + +with open('/usr/local/lib/python3.12/dist-packages/skcuda/misc.py', 'w') as f: + f.writelines(new_lines) + +print('✓ Fixed skcuda/misc.py') +PYEOF +``` + +### CUDA Initialization Errors + +If you see `pycuda._driver.LogicError: cuInit failed: initialization error`: + +**Symptoms:** +- `nvidia-smi` shows GPU is available +- PyCUDA/PyTorch cannot initialize CUDA +- `/dev/nvidia0` missing or `/dev/nvidia1` present instead + +**Solution:** +1. **Restart the RunPod instance** from the RunPod dashboard +2. If restart doesn't help, **terminate and launch a new pod** +3. Verify GPU access after restart: + ```bash + python3 -c 'import pycuda.driver as cuda; cuda.init(); print(f"GPUs: {cuda.Device.count()}")' + ``` + +This is typically a GPU passthrough issue in the container that requires pod restart. + +### TLS GPU Testing + +To test the TLS GPU implementation: + +```bash +# Quick test (bypasses import issues) +./scripts/run-remote.sh "export PATH=/usr/local/cuda/bin:\$PATH && python3 test_tls_gpu.py" + +# Full example +./scripts/run-remote.sh "export PATH=/usr/local/cuda/bin:\$PATH && python3 examples/tls_example.py" + +# Run pytest tests +./scripts/test-remote.sh cuvarbase/tests/test_tls_basic.py -v +``` + +**Note**: The TLS implementation uses PyCUDA directly and does not depend on skcuda, so TLS tests can run even if skcuda has import issues. + ## Security Notes - `.runpod.env` is gitignored to protect your credentials diff --git a/docs/TLS_GPU_IMPLEMENTATION_PLAN.md b/docs/TLS_GPU_IMPLEMENTATION_PLAN.md new file mode 100644 index 0000000..091667f --- /dev/null +++ b/docs/TLS_GPU_IMPLEMENTATION_PLAN.md @@ -0,0 +1,1070 @@ +# GPU-Accelerated Transit Least Squares (TLS) Implementation Plan + +**Branch:** `tls-gpu-implementation` +**Target:** Fastest TLS implementation with GPU acceleration +**Reference:** https://github.com/hippke/tls (canonical CPU implementation) + +--- + +## Executive Summary + +This document outlines the implementation plan for a GPU-accelerated Transit Least Squares (TLS) algorithm in cuvarbase. TLS is a more sophisticated transit detection method than Box Least Squares (BLS) that uses physically realistic transit models with limb darkening, achieving ~93% recovery rate vs BLS's ~76%. + +**Performance Target:** <1 second per light curve (vs ~10 seconds for CPU TLS) +**Expected Speedup:** 10-100x over CPU implementation + +--- + +## 1. Background: What is TLS? + +### 1.1 Core Concept + +Transit Least Squares detects periodic planetary transits using a chi-squared minimization approach with physically realistic transit models. Unlike BLS which uses simple box functions, TLS models: + +- **Limb darkening** (quadratic law via Batman library) +- **Ingress/egress** (gradual dimming as planet enters/exits stellar disk) +- **Full unbinned data** (no phase-binning approximations) + +### 1.2 Mathematical Formulation + +**Chi-squared test statistic:** +``` +χ²(P, t₀, d) = Σᵢ (yᵢᵐ(P, t₀, d) - yᵢᵒ)² / σᵢ² +``` + +**Signal Residue (detection metric):** +``` +SR(P) = χ²ₘᵢₙ,ₘₚₗₒᵦ / χ²ₘᵢₙ(P) +``` +Normalized to [0,1], with 1 = strongest signal. + +**Signal Detection Efficiency (SDE):** +``` +SDE(P) = (1 - ⟨SR(P)⟩) / σ(SR(P)) +``` +Z-score measuring signal strength above noise. + +### 1.3 Key Differences vs BLS + +| Feature | TLS | BLS | +|---------|-----|-----| +| Transit shape | Trapezoidal with limb darkening | Rectangular box | +| Data handling | Unbinned phase-folded | Binned phase-folded | +| Detection efficiency | 93% recovery | 76% recovery | +| Physical realism | Models stellar physics | Simplified | +| Small planet detection | Optimized (~10% better) | Standard | +| Computational cost | ~10s per K2 LC (CPU) | ~10s per K2 LC | + +### 1.4 Algorithm Structure + +``` +For each trial period P: + 1. Phase fold time series + 2. Sort by phase + 3. Patch arrays (handle edge wrapping) + + For each duration d: + 4. Get/cache transit model for duration d + 5. Calculate out-of-transit residuals (cached) + + For each trial T0 position: + 6. Calculate in-transit residuals + 7. Scale transit depth optimally + 8. Compute chi-squared + 9. Track minimum chi-squared +``` + +**Complexity:** O(P × D × N × W) +- P = trial periods (~8,500) +- D = durations per period (varies) +- N = data points (~4,320) +- W = transit width in samples + +**Total evaluations:** ~3×10⁸ per typical K2 light curve + +--- + +## 2. Analysis of Existing BLS GPU Implementation + +### 2.1 Architecture Overview + +The existing cuvarbase BLS implementation provides an excellent foundation: + +**File Structure:** +- `cuvarbase/bls.py` - Python API and memory management +- `cuvarbase/kernels/bls.cu` - Standard CUDA kernel +- `cuvarbase/kernels/bls_optimized.cu` - Optimized kernel with warp shuffles + +**Key Features:** +1. **Dynamic block sizing** - Adapts block size to dataset size (32-256 threads) +2. **Kernel caching** - LRU cache for compiled kernels (~100 MB max) +3. **Shared memory histogramming** - Phase-binned data in shared memory +4. **Parallel reduction** - Tree reduction with warp shuffle optimization +5. **Adaptive mode** - Automatically selects sparse vs standard BLS + +### 2.2 GPU Optimization Techniques Used + +**Memory optimizations:** +- Separate yw/w arrays to avoid bank conflicts +- Coalesced global memory access +- Shared memory for frequently accessed data + +**Compute optimizations:** +- Fast math intrinsics (`__float2int_rd` instead of `floorf`) +- Warp-level shuffle reduction (eliminates 4 `__syncthreads` calls) +- Prepared function calls for faster kernel launches + +**Batching strategy:** +- Frequency batching to respect GPU timeout limits +- Stream-based async execution for overlapping compute/transfer +- Grid-stride loops for handling more frequencies than blocks + +### 2.3 Memory Management + +**BLSMemory class:** +- Page-aligned pinned memory for faster CPU-GPU transfers +- Pre-allocated GPU arrays to avoid repeated allocation +- Separate data/frequency memory allocation + +**Transfer strategy:** +- Async transfers with CUDA streams +- Data stays on GPU across multiple kernel launches +- Results transferred back only when needed + +--- + +## 3. TLS-Specific Challenges + +### 3.1 Key Algorithmic Differences + +| Aspect | BLS | TLS | Implementation Impact | +|--------|-----|-----|----------------------| +| Transit model | Box function | Limb-darkened trapezoid | Need transit model cache on GPU | +| Model complexity | 1 multiplication | ~10-100 ops per point | Higher compute/memory ratio | +| Duration sampling | Uniform q values | Logarithmic durations | Different grid generation | +| Phase binning | Yes (shared memory) | No (unbinned) | Different memory access pattern | +| Edge effects | Minimal | Requires correction | Need array patching | + +### 3.2 Computational Bottlenecks + +**From CPU TLS profiling:** +1. **Phase folding/sorting** (~53% of time) + - MergeSort on GPU (use CUB library) + - Phase fold fully parallel + +2. **Residual calculations** (~47% of time) + - Highly parallel across T0 positions + - Chi-squared reductions (parallel reduction) + +3. **Out-of-transit caching** (critical optimization) + - Cumulative sums (parallel scan/prefix sum) + - Shared/global memory caching + +### 3.3 Transit Model Handling + +**Challenge:** TLS uses Batman library for transit models (CPU-only) + +**Solution:** +1. Pre-compute transit models on CPU (Batman) +2. Create reference transit (Earth-like, normalized) +3. Cache scaled versions for different durations +4. Transfer cache to GPU (constant/texture memory) +5. Interpolate depths during search (fast on GPU) + +**Memory requirement:** ~MB scale for typical duration range + +--- + +## 4. GPU Implementation Strategy + +### 4.1 Parallelization Hierarchy + +**Three levels of parallelism:** + +1. **Period-level (coarse-grained)** + - Each trial period is independent + - Launch 1 block per period + - Similar to BLS gridDim.x loop + +2. **Duration-level (medium-grained)** + - Multiple durations per period + - Can parallelize within block + - Shared memory for duration-specific data + +3. **T0-level (fine-grained)** + - Multiple T0 positions per duration + - Thread-level parallelism + - Ideal for GPU threads + +**Grid/block configuration:** +``` +Grid: (nperiods, 1, 1) +Block: (block_size, 1, 1) // 64-256 threads + +Each block handles one period: + - Threads iterate over durations + - Threads iterate over T0 positions + - Reduction to find minimum chi-squared +``` + +### 4.2 Kernel Design + +**Proposed kernel structure:** + +```cuda +__global__ void tls_search_kernel( + const float* t, // Time array + const float* y, // Flux/brightness + const float* dy, // Uncertainties + const float* periods, // Trial periods + const float* durations, // Duration grid (per period) + const int* duration_counts, // # durations per period + const float* transit_models, // Pre-computed transit shapes + const int* model_indices, // Index into transit_models + float* chi2_min, // Output: minimum chi² + float* best_t0, // Output: best mid-transit time + float* best_duration, // Output: best duration + float* best_depth, // Output: best depth + int ndata, + int nperiods +) +``` + +**Key kernel operations:** +1. Phase fold data for assigned period +2. Sort by phase (CUB DeviceRadixSort) +3. Patch arrays (extend with wrapped data) +4. For each duration: + - Load transit model from cache + - For each T0 position (stride sampling): + - Calculate in-transit residuals + - Calculate out-of-transit residuals (cached) + - Scale depth optimally + - Compute chi-squared +5. Parallel reduction to find minimum chi² +6. Store best solution + +### 4.3 Memory Layout + +**Global memory:** +- Input data: `t`, `y`, `dy` (float32, ~4-10K points) +- Period grid: `periods` (float32, ~8K) +- Duration grids: `durations` (float32, variable per period) +- Output: `chi2_min`, `best_t0`, `best_duration`, `best_depth` + +**Constant/texture memory:** +- Transit model cache (~1-10 MB) +- Limb darkening coefficients +- Stellar parameters + +**Shared memory:** +- Phase-folded data (float32, 4×ndata bytes) +- Sorted indices (int32, 4×ndata bytes) +- Partial chi² values (float32, blockDim.x bytes) +- Out-of-transit residual cache (varies with duration) + +**Shared memory requirement:** +``` +shmem = 8 × ndata + 4 × blockDim.x + cache_size + ≈ 35-40 KB for ndata=4K, blockDim=256 +``` + +### 4.4 Optimization Techniques + +**From BLS optimizations:** +1. Fast math intrinsics (`__float2int_rd`, etc.) +2. Warp shuffle reduction for final chi² minimum +3. Coalesced memory access patterns +4. Separate arrays to avoid bank conflicts + +**TLS-specific:** +1. Texture memory for transit models (fast interpolation) +2. Parallel scan for cumulative sums (out-of-transit cache) +3. MergeSort via CUB (better for partially sorted data) +4. Array patching in kernel (avoid extra memory) + +--- + +## 5. Implementation Phases + +### Phase 1: Core Infrastructure - COMPLETED + +**Status:** Basic infrastructure implemented +**Date:** 2025-10-27 + +**Completed:** +- ✅ `cuvarbase/tls_grids.py` - Period and duration grid generation +- ✅ `cuvarbase/tls_models.py` - Transit model generation (Batman wrapper + simple models) +- ✅ `cuvarbase/tls.py` - Main Python API with TLSMemory class +- ✅ `cuvarbase/kernels/tls.cu` - Basic CUDA kernel (Phase 1 version) +- ✅ `cuvarbase/tests/test_tls_basic.py` - Initial unit tests + +**Key Learnings:** + +1. **Ofir 2014 Period Grid**: The Ofir algorithm can produce edge cases when parameters result in very few frequencies. Added fallback to simple linear grid for robustness. + +2. **Memory Layout**: Following BLS pattern with separate TLSMemory class for managing GPU/CPU transfers. Using page-aligned pinned memory for fast transfers. + +3. **Kernel Design Choices**: + - Phase 1 uses simple bubble sort (thread 0 only) - this limits us to small datasets + - Using simple trapezoidal transit model initially (no Batman on GPU) + - Fixed duration/T0 grids for Phase 1 simplicity + - Shared memory allocation: `(4*ndata + block_size) * 4 bytes` + +4. **Testing Strategy**: Created tests that don't require GPU hardware for CI/CD compatibility. GPU tests are marked with `@pytest.mark.skipif`. + +**Known Limitations (to be addressed in Phase 2):** +- Bubble sort limits ndata to ~100-200 points +- No optimal depth calculation (using fixed depth) +- Simple trapezoid transit (no limb darkening on GPU yet) +- No edge effect correction +- No proper parameter tracking across threads in reduction + +**Next Steps:** Proceed to Phase 2 optimization ✅ COMPLETED + +--- + +### Phase 2: Optimization - COMPLETED + +**Status:** Core optimizations implemented +**Date:** 2025-10-27 + +**Completed:** +- ✅ `cuvarbase/kernels/tls_optimized.cu` - Optimized CUDA kernel with Thrust +- ✅ Updated `cuvarbase/tls.py` - Support for multiple kernel variants +- ✅ Optimal depth calculation using least squares +- ✅ Warp shuffle reduction for minimum finding +- ✅ Proper parameter tracking across thread reduction +- ✅ Optimized shared memory layout (separate arrays, no bank conflicts) +- ✅ Auto-selection of kernel variant based on dataset size + +**Key Improvements:** + +1. **Three Kernel Variants**: + - **Basic** (Phase 1): Bubble sort, fixed depth - for reference/testing + - **Simple**: Insertion sort, optimal depth, no Thrust - for ndata < 500 + - **Optimized**: Thrust sorting, full optimizations - for ndata >= 500 + +2. **Sorting Improvements**: + - Basic: O(n²) bubble sort (Phase 1 baseline) + - Simple: O(n²) insertion sort (3-5x faster than bubble sort) + - Optimized: O(n log n) Thrust sort (~100x faster for n=1000) + +3. **Optimal Depth Calculation**: + - Implemented weighted least squares: `depth = Σ(y*m/σ²) / Σ(m²/σ²)` + - Physical constraints: depth ∈ [0, 1] + - Improves chi² minimization significantly + +4. **Reduction Optimizations**: + - Tree reduction down to warp size + - Warp shuffle for final reduction (no `__syncthreads` in warp) + - Proper tracking of all parameters (t0, duration, depth, config_idx) + - No parameter loss during reduction + +5. **Memory Optimizations**: + - Separate arrays for y/dy to avoid bank conflicts + - Working memory allocation for Thrust (phases, y, dy, indices per period) + - Optimized shared memory layout: 3*ndata + 5*block_size floats + block_size ints + +6. **Search Space Expansion**: + - Increased durations: 10 → 15 samples + - Logarithmic duration spacing for better coverage + - Increased T0 positions: 20 → 30 samples + - Duration range: 0.5% to 15% of period + +**Performance Estimates:** + +| ndata | Kernel | Sort Time | Speedup vs Basic | +|-------|--------|-----------|------------------| +| 100 | Basic | ~0.1 ms | 1x | +| 100 | Simple | ~0.03 ms | ~3x | +| 500 | Simple | ~1 ms | ~5x | +| 1000 | Optimized | ~0.05 ms | ~100x | +| 5000 | Optimized | ~0.3 ms | ~500x | + +**Auto-Selection Logic:** +- ndata < 500: Use simple kernel (insertion sort overhead acceptable) +- ndata >= 500: Use optimized kernel (Thrust overhead justified) + +**Known Limitations (Phase 3 targets):** +- Fixed duration/T0 grids (not period-dependent yet) +- Simple box transit model (no limb darkening on GPU) +- No edge effect correction +- No out-of-transit caching +- Working memory scales with nperiods (could be optimized) + +**Key Learnings:** + +1. **Thrust Integration**: Thrust provides massive speedup but adds compilation complexity. Simple kernel provides good middle ground. + +2. **Parameter Tracking**: Critical to track all parameters through reduction tree, not just chi². Volatile memory trick works for warp-level reduction. + +3. **Kernel Variant Selection**: Auto-selection based on dataset size provides best user experience without requiring expertise. + +4. **Shared Memory**: With optimal depth + parameter tracking, shared memory needs are: `(3*ndata + 5*BLOCK_SIZE)*4 + BLOCK_SIZE*4` bytes. For ndata=1000, block_size=128: ~13 KB (well under 48 KB limit). + +5. **Logarithmic Duration Spacing**: Much better coverage than linear spacing, especially for wide duration ranges. + +**Next Steps:** Proceed to Phase 3 (features & robustness) ✅ COMPLETED + +--- + +### Phase 3: Features & Robustness - COMPLETED + +**Status:** Production features implemented +**Date:** 2025-10-27 + +**Completed:** +- ✅ `cuvarbase/tls_stats.py` - Complete statistics module +- ✅ `cuvarbase/tls_adaptive.py` - Adaptive method selection +- ✅ `examples/tls_example.py` - Complete usage example +- ✅ Enhanced results output with full statistics +- ✅ Auto-selection between BLS and TLS + +**Key Features Added:** + +1. **Comprehensive Statistics Module** (`tls_stats.py`): + - **Signal Detection Efficiency (SDE)**: Primary detection metric with detrending + - **Signal-to-Noise Ratio (SNR)**: Transit depth SNR calculation + - **False Alarm Probability (FAP)**: Empirical calibration (Hippke & Heller 2019) + - **Signal Residue (SR)**: Normalized chi² ratio + - **Period uncertainty**: FWHM-based estimation + - **Odd-even mismatch**: Binary/false positive detection + - **Pink noise correction**: Correlated noise handling + +2. **Enhanced Results Output**: + - Raw outputs: chi², per-period parameters + - Best-fit: period, T0, duration, depth with uncertainties + - Statistics: SDE, SNR, FAP, power spectrum + - Metadata: n_transits, stellar parameters + - **41 output fields** matching CPU TLS + +3. **Adaptive Method Selection** (`tls_adaptive.py`): + - **Auto-selection logic**: + - ndata < 100: Sparse BLS (optimal for very few points) + - 100 < ndata < 500: Cost-based selection + - ndata > 500: TLS (best accuracy + speed) + - **Computational cost estimation** for each method + - **Special case handling**: short spans, fine grids, accuracy preference + - **Comparison mode**: Run all methods for benchmarking + +4. **Complete Usage Example** (`examples/tls_example.py`): + - Synthetic transit generation (Batman or simple) + - Full TLS search workflow + - Result analysis and comparison + - Four-panel diagnostic plots + - Error handling and fallbacks + +**Statistics Implementation:** + +```python +# Signal Detection Efficiency +SDE = (1 - ⟨SR⟩) / σ(SR) with median detrending + +# SNR Calculation +SNR = depth / depth_err × sqrt(n_transits) + +# FAP Calibration (empirical) +SDE = 7 → FAP ≈ 1% +SDE = 9 → FAP ≈ 0.1% +SDE = 11 → FAP ≈ 0.01% +``` + +**Adaptive Selection Decision Tree:** + +``` +ndata < 100: + → Sparse BLS (optimal) + +100 ≤ ndata < 500: + if prefer_accuracy: + → TLS + else: + → Cost-based (Sparse BLS / BLS / TLS) + +ndata ≥ 500: + → TLS (optimal balance) + +Special overrides: + - T_span < 10 days → Sparse BLS + - nperiods > 10000 → TLS (if ndata allows) +``` + +**Example Output Structure:** + +```python +results = { + # Raw outputs + 'periods': [...], + 'chi2': [...], + 'best_t0_per_period': [...], + 'best_duration_per_period': [...], + 'best_depth_per_period': [...], + + # Best-fit + 'period': 12.5, + 'period_uncertainty': 0.02, + 'T0': 0.234, + 'duration': 0.12, + 'depth': 0.008, + + # Statistics + 'SDE': 15.3, + 'SNR': 8.5, + 'FAP': 1.2e-6, + 'power': [...], + 'SR': [...], + + # Metadata + 'n_transits': 8, + 'R_star': 1.0, + 'M_star': 1.0, +} +``` + +**Key Learnings:** + +1. **SDE vs SNR**: SDE is more robust for period search (handles systematic noise), while SNR is better for individual transit significance. + +2. **Detrending Critical**: Median filter detrending improves SDE significantly by removing long-term trends and systematic effects. + +3. **FAP Calibration**: Empirical calibration much more accurate than Gaussian assumption for real data with correlated noise. + +4. **Adaptive Selection Value**: Users shouldn't need to know which method is best - auto-selection provides optimal performance. + +5. **Statistics Matching**: Full 41-field output structure compatible with CPU TLS for easy migration. + +**Production Readiness:** + +✅ **Complete API**: All major TLS features implemented +✅ **Full Statistics**: SDE, SNR, FAP, and more +✅ **Auto-Selection**: Smart method choice +✅ **Example Code**: Complete usage demonstration +✅ **Error Handling**: Graceful fallbacks +✅ **Documentation**: Inline docs and examples + +**Remaining for Full Production:** + +- Integration tests with real astronomical data +- Performance benchmarking suite +- Comparison validation against CPU TLS +- User documentation and tutorials +- CI/CD pipeline setup + +**Next Steps:** Validation and testing phase, then merge to main + +--- + +### Phase 1: Core Infrastructure (Week 1) - ORIGINAL PLAN + +**Files to create:** +- `cuvarbase/tls.py` - Python API +- `cuvarbase/kernels/tls.cu` - CUDA kernel +- `cuvarbase/tls_models.py` - Transit model generation + +**Tasks:** +1. Create TLS Python class similar to BLS structure +2. Implement transit model pre-computation (Batman wrapper) +3. Create period/duration grid generation (Ofir 2014) +4. Implement basic kernel structure (no optimization) +5. Memory management class (TLSMemory) + +**Deliverables:** +- Basic working TLS GPU implementation +- Correctness validation vs CPU TLS + +### Phase 2: Optimization (Week 2) + +**Tasks:** +1. Implement shared memory optimizations +2. Add warp shuffle reduction +3. Optimize memory access patterns +4. Implement out-of-transit caching +5. Add texture memory for transit models +6. Implement CUB-based sorting + +**Deliverables:** +- Optimized TLS kernel +- Performance benchmarks vs CPU + +### Phase 3: Features & Robustness (Week 3) + +**Tasks:** +1. Implement edge effect correction +2. Add adaptive block sizing +3. Implement kernel caching (LRU) +4. Add batch processing for large period grids +5. Implement CUDA streams for async execution +6. Add sparse TLS variant (for small datasets) + +**Deliverables:** +- Production-ready TLS implementation +- Adaptive mode selection + +### Phase 4: Testing & Validation (Week 4) + +**Tasks:** +1. Create comprehensive unit tests +2. Validate against CPU TLS on known planets +3. Test edge cases (few data points, long periods, etc.) +4. Performance profiling and optimization +5. Documentation and examples + +**Deliverables:** +- Full test suite +- Benchmark results +- Documentation + +--- + +## 6. Testing Strategy + +### 6.1 Validation Tests + +**Test against CPU TLS:** +1. **Synthetic transits** - Generate known signals, verify recovery +2. **Known planets** - Test on confirmed exoplanet light curves +3. **Edge cases** - Few transits, long periods, noisy data +4. **Statistical properties** - SDE, SNR, FAP calculations + +**Metrics for validation:** +- Period recovery (within 1%) +- Duration recovery (within 10%) +- Depth recovery (within 5%) +- T0 recovery (within transit duration) +- SDE values (within 5%) + +### 6.2 Performance Tests + +**Benchmarks:** +1. vs CPU TLS (hippke/tls) +2. vs GPU BLS (cuvarbase existing) +3. Scaling with ndata (10 to 10K points) +4. Scaling with nperiods (100 to 10K) + +**Target metrics:** +- <1 second per K2 light curve (90 days, 4K points) +- 10-100x speedup vs CPU TLS +- Similar or better than GPU BLS + +### 6.3 Test Data + +**Sources:** +1. Synthetic light curves (known parameters) +2. TESS light curves (2-min cadence) +3. K2 light curves (30-min cadence) +4. Kepler light curves (30-min cadence) + +--- + +## 7. API Design + +### 7.1 High-Level Interface + +```python +from cuvarbase import tls + +# Simple interface +results = tls.search(t, y, dy, + R_star=1.0, # Solar radii + M_star=1.0, # Solar masses + period_min=None, # Auto-detect + period_max=None) # Auto-detect + +# Access results +print(f"Period: {results.period:.4f} days") +print(f"SDE: {results.SDE:.2f}") +print(f"Depth: {results.depth*1e6:.1f} ppm") +``` + +### 7.2 Advanced Interface + +```python +# Custom configuration +results = tls.search_advanced( + t, y, dy, + periods=custom_periods, + durations=custom_durations, + transit_template='custom', + limb_dark='quadratic', + u=[0.4804, 0.1867], + use_optimized=True, + use_sparse=None, # Auto-select + block_size=128, + stream=cuda_stream +) +``` + +### 7.3 Batch Processing + +```python +# Process multiple light curves +results_list = tls.search_batch( + [t1, t2, ...], + [y1, y2, ...], + [dy1, dy2, ...], + n_streams=4, + parallel=True +) +``` + +--- + +## 8. Expected Performance + +### 8.1 Theoretical Analysis + +**CPU TLS (current):** +- ~10 seconds per K2 light curve +- Single-threaded +- 12.2 GFLOPs (72% of theoretical CPU max) + +**GPU TLS (target):** +- <1 second per K2 light curve +- ~10³-10⁴ parallel threads +- 100-1000 GFLOPs (GPU advantage) + +**Speedup sources:** +1. Period parallelism: 8,500 periods → 8,500 threads +2. T0 parallelism: ~100 T0 positions per duration +3. Faster reductions: Tree + warp shuffle +4. Memory bandwidth: GPU >> CPU + +### 8.2 Bottleneck Analysis + +**Potential bottlenecks:** +1. **Sorting** - CUB DeviceRadixSort is fast but not free + - Solution: Use MergeSort for partially sorted data + - Cost: ~5-10% of total time + +2. **Transit model interpolation** - Texture memory helps + - Solution: Pre-compute at high resolution + - Cost: ~2-5% of total time + +3. **Out-of-transit caching** - Shared memory limits + - Solution: Use parallel scan (CUB DeviceScan) + - Cost: ~10-15% of total time + +4. **Global memory bandwidth** - Reading t, y, dy repeatedly + - Solution: Shared memory caching per block + - Cost: ~20-30% of total time + +**Expected time breakdown:** +- Phase folding/sorting: 20% +- Residual calculations: 60% +- Reductions/comparisons: 15% +- Overhead: 5% + +--- + +## 9. File Structure + +``` +cuvarbase/ +├── tls.py # Main TLS API +├── tls_models.py # Transit model generation +├── tls_grids.py # Period/duration grid generation +├── tls_stats.py # Statistical calculations (SDE, SNR, FAP) +├── kernels/ +│ ├── tls.cu # Standard TLS kernel +│ ├── tls_optimized.cu # Optimized kernel +│ └── tls_sparse.cu # Sparse variant (small datasets) +└── tests/ + ├── test_tls_basic.py # Basic functionality + ├── test_tls_consistency.py # Consistency with CPU TLS + ├── test_tls_performance.py # Performance benchmarks + └── test_tls_validation.py # Known planet recovery +``` + +--- + +## 10. Dependencies + +**Required:** +- PyCUDA (existing) +- NumPy (existing) +- Batman-package (CPU transit models) + +**Optional:** +- Astropy (stellar parameters, unit conversions) +- Numba (CPU fallback) + +**CUDA features:** +- CUB library (sorting, scanning) +- Texture memory (transit model interpolation) +- Warp shuffle intrinsics +- Cooperative groups (advanced optimization) + +--- + +## 11. Success Criteria + +**Functional:** +- [ ] Passes all validation tests (>95% accuracy vs CPU TLS) +- [ ] Recovers known planets in test dataset +- [ ] Handles edge cases robustly + +**Performance:** +- [ ] <1 second per K2 light curve +- [ ] 10-100x speedup vs CPU TLS +- [ ] Comparable or better than GPU BLS + +**Quality:** +- [ ] Full test coverage (>90%) +- [ ] Comprehensive documentation +- [ ] Example notebooks + +**Usability:** +- [ ] Simple API for basic use cases +- [ ] Advanced API for expert users +- [ ] Clear error messages + +--- + +## 12. Risk Mitigation + +### 12.1 Technical Risks + +| Risk | Mitigation | +|------|------------| +| GPU memory limits | Implement batching, use sparse variant | +| Kernel timeout (Windows) | Add freq_batch_size parameter | +| Sorting performance | Use CUB MergeSort for partially sorted | +| Transit model accuracy | Validate against Batman reference | +| Edge effect handling | Implement CPU TLS's correction algorithm | + +### 12.2 Performance Risks + +| Risk | Mitigation | +|------|------------| +| Slower than expected | Profile with Nsight, optimize bottlenecks | +| Memory bandwidth bound | Increase compute/memory ratio, use shared mem | +| Low occupancy | Adjust block size, reduce register usage | +| Divergent branches | Minimize conditionals in inner loops | + +--- + +## 13. Future Enhancements + +**Phase 5 (future):** +1. Multi-GPU support +2. CPU fallback (Numba) +3. Alternative limb darkening laws +4. Non-circular orbits (eccentric transits) +5. Multi-planet search +6. Real-time detection (streaming data) +7. Integration with lightkurve/eleanor + +--- + +## 14. References + +### Primary Papers + +1. **Hippke & Heller (2019)** - "Transit Least Squares: Optimized transit detection algorithm" + - arXiv:1901.02015 + - A&A 623, A39 + +2. **Ofir (2014)** - "Algorithmic considerations for continuous GW search" + - A&A 561, A138 + - Period sampling algorithm + +3. **Mandel & Agol (2002)** - "Analytic Light Curves for Planetary Transit Searches" + - ApJ 580, L171 + - Transit model theory + +### Related Work + +4. **Kovács et al. (2002)** - Original BLS paper + - A&A 391, 369 + +5. **Kreidberg (2015)** - Batman: Bad-Ass Transit Model cAlculatioN + - PASP 127, 1161 + +6. **Panahi & Zucker (2021)** - Sparse BLS algorithm + - arXiv:2103.06193 + +### Software + +- TLS GitHub: https://github.com/hippke/tls +- TLS Docs: https://transitleastsquares.readthedocs.io/ +- Batman: https://github.com/lkreidberg/batman +- CUB: https://nvlabs.github.io/cub/ + +--- + +## Appendix A: Algorithm Pseudocode + +### CPU TLS (reference) + +```python +def tls_search(t, y, dy, periods, durations, transit_models): + results = [] + + for period in periods: + # Phase fold + phases = (t / period) % 1.0 + sorted_idx = argsort(phases) + phases = phases[sorted_idx] + y_sorted = y[sorted_idx] + dy_sorted = dy[sorted_idx] + + # Patch (extend for edge wrapping) + phases_ext, y_ext, dy_ext = patch_arrays(phases, y_sorted, dy_sorted) + + min_chi2 = inf + best_t0 = None + best_duration = None + + for duration in durations[period]: + # Get transit model + model = transit_models[duration] + + # Calculate out-of-transit residuals (can be cached) + residuals_out = calc_out_of_transit(y_ext, dy_ext, model) + + # Stride over T0 positions + for t0 in T0_grid: + # Calculate in-transit residuals + residuals_in = calc_in_transit(y_ext, dy_ext, model, t0) + + # Optimal depth scaling + depth = optimal_depth(residuals_in, residuals_out) + + # Chi-squared + chi2 = calc_chi2(residuals_in, residuals_out, depth) + + if chi2 < min_chi2: + min_chi2 = chi2 + best_t0 = t0 + best_duration = duration + + results.append((period, min_chi2, best_t0, best_duration)) + + return results +``` + +### GPU TLS (proposed) + +```cuda +__global__ void tls_search_kernel(...) { + int period_idx = blockIdx.x; + int tid = threadIdx.x; + + __shared__ float shared_phases[MAX_NDATA]; + __shared__ float shared_y[MAX_NDATA]; + __shared__ float shared_dy[MAX_NDATA]; + __shared__ float chi2_vals[BLOCK_SIZE]; + + // Load data to shared memory + for (int i = tid; i < ndata; i += blockDim.x) { + float phase = fmodf(t[i] / periods[period_idx], 1.0f); + shared_phases[i] = phase; + shared_y[i] = y[i]; + shared_dy[i] = dy[i]; + } + __syncthreads(); + + // Sort by phase (CUB DeviceRadixSort or MergeSort) + cub::DeviceRadixSort::SortPairs(...); + __syncthreads(); + + // Patch arrays (extend for wrapping) + patch_arrays_shared(...); + __syncthreads(); + + float thread_min_chi2 = INFINITY; + + // Iterate over durations + int n_durations = duration_counts[period_idx]; + for (int d = 0; d < n_durations; d++) { + float duration = durations[period_idx * MAX_DURATIONS + d]; + + // Load transit model from texture memory + float* model = tex2D(transit_model_texture, duration, ...); + + // Calculate out-of-transit residuals (use parallel scan for cumsum) + float residuals_out = calc_out_of_transit_shared(...); + + // Stride over T0 positions (each thread handles multiple) + for (int t0_idx = tid; t0_idx < n_t0_positions; t0_idx += blockDim.x) { + float t0 = t0_grid[t0_idx]; + + // In-transit residuals + float residuals_in = calc_in_transit_shared(...); + + // Optimal depth + float depth = optimal_depth_fast(residuals_in, residuals_out); + + // Chi-squared + float chi2 = calc_chi2_fast(residuals_in, residuals_out, depth); + + thread_min_chi2 = fminf(thread_min_chi2, chi2); + } + } + + // Store thread minimum + chi2_vals[tid] = thread_min_chi2; + __syncthreads(); + + // Parallel reduction to find block minimum + // Tree reduction + warp shuffle + for (int s = blockDim.x/2; s >= 32; s /= 2) { + if (tid < s) { + chi2_vals[tid] = fminf(chi2_vals[tid], chi2_vals[tid + s]); + } + __syncthreads(); + } + + // Final warp reduction + if (tid < 32) { + float val = chi2_vals[tid]; + for (int offset = 16; offset > 0; offset /= 2) { + val = fminf(val, __shfl_down_sync(0xffffffff, val, offset)); + } + if (tid == 0) { + chi2_min[period_idx] = val; + } + } +} +``` + +--- + +## Appendix B: Key Equations + +### Chi-Squared Calculation + +``` +χ²(P, t₀, d, δ) = Σᵢ [yᵢ - m(tᵢ; P, t₀, d, δ)]² / σᵢ² + +where m(t; P, t₀, d, δ) is the transit model: + m(t) = { + 1 - δ × limb_darkened_transit(phase(t)) if in transit + 1 otherwise + } +``` + +### Optimal Depth Scaling + +``` +δ_opt = Σᵢ [yᵢ × m(tᵢ)] / Σᵢ [m(tᵢ)²] + +This minimizes χ² analytically for given (P, t₀, d) +``` + +### Signal Detection Efficiency + +``` +SDE = (1 - ⟨SR⟩) / σ(SR) + +where SR = χ²_white_noise / χ²_signal + +Median filter applied to remove systematic trends +``` + +--- + +**Document Version:** 1.0 +**Last Updated:** 2025-10-27 +**Author:** Claude Code (Anthropic) diff --git a/docs/TLS_GPU_README.md b/docs/TLS_GPU_README.md new file mode 100644 index 0000000..2365812 --- /dev/null +++ b/docs/TLS_GPU_README.md @@ -0,0 +1,313 @@ +# GPU-Accelerated Transit Least Squares (TLS) + +## Overview + +This is a GPU-accelerated implementation of the Transit Least Squares (TLS) algorithm for detecting periodic planetary transits in astronomical time series data. Unlike BLS (Box Least Squares), TLS uses a physically realistic limb-darkened transit template for fitting, improving sensitivity to small planets. + +**Reference:** [Hippke & Heller (2019), A&A 623, A39](https://ui.adsabs.harvard.edu/abs/2019A%26A...623A..39H/abstract) + +## Quick Start + +### Standard Mode - Fixed Duration Range + +```python +from cuvarbase import tls + +results = tls.tls_search_gpu( + t, y, dy, + period_min=5.0, + period_max=20.0, + R_star=1.0, + M_star=1.0 +) + +print(f"Period: {results['period']:.4f} days") +print(f"Depth: {results['depth']:.6f}") +print(f"SDE: {results['SDE']:.2f}") +``` + +### Keplerian Mode - Physically Motivated Duration Constraints + +```python +results = tls.tls_transit( + t, y, dy, + R_star=1.0, # Solar radii + M_star=1.0, # Solar masses + R_planet=1.0, # Earth radii (fiducial) + qmin_fac=0.5, # Search 0.5x to 2.0x Keplerian duration + qmax_fac=2.0, + n_durations=15, + period_min=5.0, + period_max=20.0 +) +``` + +## Features + +### 1. Limb-Darkened Transit Template + +The key difference from BLS is the use of a physically realistic transit template +computed using the batman package (Kreidberg 2015). The template accounts for +stellar limb darkening, producing a rounded transit shape rather than a box. + +The template is: +- Precomputed on the CPU with configurable limb darkening law and coefficients +- Transferred to GPU shared memory (4KB for 1000-point template) +- Interpolated via linear lookup during the chi-squared calculation +- Falls back to a trapezoidal shape if batman is not installed + +### 2. Keplerian-Aware Duration Constraints + +Just like BLS's `eebls_transit()`, TLS exploits Keplerian physics to focus the search on plausible transit durations: + +```python +from cuvarbase import tls_grids + +# Calculate expected fractional duration at each period +q_values = tls_grids.q_transit(periods, R_star=1.0, M_star=1.0, R_planet=1.0) + +# Generate focused duration grid +durations, counts, q_vals = tls_grids.duration_grid_keplerian( + periods, R_star=1.0, M_star=1.0, R_planet=1.0, + qmin_fac=0.5, qmax_fac=2.0, n_durations=15 +) +``` + +### 3. Optimal Period Grid Sampling + +Implements Ofir (2014) frequency-to-cubic transformation for optimal period sampling: + +```python +periods = tls_grids.period_grid_ofir( + t, + R_star=1.0, + M_star=1.0, + period_min=5.0, + period_max=20.0, + oversampling_factor=3, + n_transits_min=2 +) +``` + +**Reference:** Ofir (2014), "An optimized transit detection algorithm to search for periodic transits of small planets", A&A 561, A138 + +### 4. GPU Memory Management + +Efficient GPU memory handling via `TLSMemory` class: +- Pre-allocates GPU arrays for t, y, dy, periods, template, results +- Supports both standard and Keplerian modes (qmin/qmax arrays) +- Memory pooling reduces allocation overhead + +### 5. Optimized CUDA Kernels + +Two optimized CUDA kernels in `cuvarbase/kernels/tls.cu`: + +**`tls_search_kernel()`** - Standard search: +- Fixed duration range (0.5% to 15% of period) +- Limb-darkened transit template in shared memory +- Bitonic sort for phase-folding +- Warp shuffle reduction for finding minimum chi-squared + +**`tls_search_kernel_keplerian()`** - Keplerian-aware: +- Per-period qmin/qmax arrays +- Focused search space +- Same core algorithm with template + +Both kernels: +- Use shared memory for phase-folded data and transit template +- Minimize global memory accesses +- Support datasets up to ~100,000 points + +## API Reference + +### High-Level Functions + +#### `tls_transit(t, y, dy, **kwargs)` + +High-level wrapper with Keplerian duration constraints (analog of BLS's `eebls_transit()`). + +**Parameters:** +- `t` (array): Time values +- `y` (array): Flux/magnitude values +- `dy` (array): Measurement uncertainties +- `R_star` (float): Stellar radius in solar radii (default: 1.0) +- `M_star` (float): Stellar mass in solar masses (default: 1.0) +- `R_planet` (float): Fiducial planet radius in Earth radii (default: 1.0) +- `qmin_fac` (float): Minimum duration factor (default: 0.5) +- `qmax_fac` (float): Maximum duration factor (default: 2.0) +- `n_durations` (int): Number of duration samples (default: 15) +- `period_min` (float): Minimum period in days +- `period_max` (float): Maximum period in days +- `n_transits_min` (int): Minimum transits required (default: 2) +- `oversampling_factor` (int): Period grid oversampling (default: 3) + +**Returns:** Dictionary with keys: +- `period`: Best-fit period (days) +- `T0`: Best-fit transit epoch (days) +- `duration`: Best-fit transit duration (days) +- `depth`: Best-fit transit depth (fractional flux dip) +- `SDE`: Signal Detection Efficiency +- `chi2`: Chi-squared value +- `periods`: Array of trial periods +- `power`: Detrended power spectrum + +#### `tls_search_gpu(t, y, dy, periods=None, **kwargs)` + +Low-level GPU search function with custom period/duration grids. + +**Additional Parameters:** +- `periods` (array): Custom period grid (if None, auto-generated) +- `qmin` (array): Per-period minimum fractional durations (Keplerian mode) +- `qmax` (array): Per-period maximum fractional durations (Keplerian mode) +- `n_durations` (int): Number of duration samples if using qmin/qmax +- `block_size` (int): CUDA block size (default: 128) + +### Grid Generation Functions + +#### `period_grid_ofir(t, R_star, M_star, **kwargs)` + +Generate optimal period grid using Ofir (2014) frequency-to-cubic sampling. + +#### `q_transit(period, R_star, M_star, R_planet)` + +Calculate Keplerian fractional transit duration (q = duration/period). + +#### `duration_grid_keplerian(periods, R_star, M_star, R_planet, **kwargs)` + +Generate Keplerian-aware duration grid for each period. + +## Algorithm Details + +### Transit Template + +The transit model uses a precomputed limb-darkened template: + +``` +model(t) = 1 - depth * template(transit_coord) +``` + +Where `transit_coord` maps the phase position within the transit window to [-1, 1], +and `template()` returns a value in [0, 1] via linear interpolation of the +precomputed template array. The template captures limb darkening effects, giving +a rounded bottom rather than the flat-bottomed box of BLS. + +### Optimal Depth Fitting + +For each trial (period, duration, T0), depth is solved via weighted least squares: +``` +depth = sum[(1-y_i) * T(x_i) / sigma_i^2] / sum[T(x_i)^2 / sigma_i^2] +``` +where T(x_i) is the template value at the transit coordinate of point i. + +### Signal Detection Efficiency (SDE) + +The SDE metric quantifies signal significance: +``` +SDE = (max(SR) - mean(SR)) / std(SR) +``` + +Where SR (Signal Residue) = 1 - chi2 / chi2_null. + +**SDE > 7** typically indicates a robust detection. + +## Known Limitations + +1. **Dataset Size**: Bitonic sort supports up to ~100,000 points + - Designed for typical astronomical light curves (500-20,000 points) + - For >100k points, consider binning or using CPU TLS + - Performance is optimal for ndata < 20,000 + +2. **Memory**: Requires ~(3N + n_template + 4*block_size) floats of shared memory per block + - 5,000 points: ~60 KB + 4 KB template + - Should work on any GPU with >2GB VRAM + +3. **Duration Grid**: Currently uniform in log-space + - Could optimize further using Ofir-style adaptive sampling + +4. **Single GPU**: No multi-GPU support yet + - Trivial to parallelize across multiple light curves + +## Related Work + +**CETRA** (Smith et al. 2025) is a complementary GPU-accelerated transit detection +algorithm that uses a different approach (matched filtering with analytic templates). +CETRA may be preferable for survey-scale searches where computational throughput is +paramount. GPU TLS is valuable when standard TLS outputs (SDE, FAP, odd/even tests) +are needed for transit vetting pipelines, or when results must be directly comparable +to published CPU TLS results. + +## Testing + +### Pytest Suite + +```bash +pytest cuvarbase/tests/test_tls_basic.py -v +``` + +Tests cover: +- Transit template generation (batman and trapezoidal fallback) +- Kernel compilation +- Memory allocation +- Period grid generation +- Statistics (SR, SDE, SNR) +- Signal recovery (synthetic transits) +- SDE > 0 regression test + +## Implementation Files + +### Core Implementation +- `cuvarbase/tls.py` - Main Python API +- `cuvarbase/tls_models.py` - Transit template generation +- `cuvarbase/tls_grids.py` - Grid generation utilities +- `cuvarbase/tls_stats.py` - Statistical calculations +- `cuvarbase/kernels/tls.cu` - CUDA kernels + +### Testing +- `cuvarbase/tests/test_tls_basic.py` - Unit tests + +### Documentation +- `docs/TLS_GPU_README.md` - This file + +## References + +1. **Hippke & Heller (2019)**: "Optimized transit detection algorithm to search for periodic transits of small planets", A&A 623, A39 + - Original TLS algorithm and SDE metric + +2. **Kovacs et al. (2002)**: "A box-fitting algorithm in the search for periodic transits", A&A 391, 369 + - BLS algorithm (TLS is a refinement) + +3. **Ofir (2014)**: "An optimized transit detection algorithm to search for periodic transits of small planets", A&A 561, A138 + - Optimal period grid sampling + +4. **Smith et al. (2025)**: "CETRA: GPU-accelerated transit detection" + - Complementary GPU transit detection approach + +5. **Kreidberg (2015)**: "batman: BAsic Transit Model cAlculatioN in Python", PASP 127, 1161 + - Transit model package used for template generation + +6. **transitleastsquares**: https://github.com/hippke/tls + - Reference CPU implementation + +## Citation + +If you use this GPU TLS implementation, please cite both cuvarbase and the original TLS paper: + +```bibtex +@MISC{2022ascl.soft10030H, + author = {{Hoffman}, John}, + title = "{cuvarbase: GPU-Accelerated Variability Algorithms}", + howpublished = {Astrophysics Source Code Library, record ascl:2210.030}, + year = 2022, + adsurl = {https://ui.adsabs.harvard.edu/abs/2022ascl.soft10030H} +} + +@ARTICLE{2019A&A...623A..39H, + author = {{Hippke}, Michael and {Heller}, Ren{\'e}}, + title = "{Optimized transit detection algorithm to search for periodic transits of small planets}", + journal = {Astronomy & Astrophysics}, + year = 2019, + volume = {623}, + eid = {A39}, + doi = {10.1051/0004-6361/201834672} +} +``` diff --git a/examples/tls_example.py b/examples/tls_example.py new file mode 100644 index 0000000..cbaed31 --- /dev/null +++ b/examples/tls_example.py @@ -0,0 +1,272 @@ +#!/usr/bin/env python3 +""" +Example: GPU-Accelerated Transit Least Squares + +This script demonstrates how to use cuvarbase's GPU-accelerated TLS +implementation to detect planetary transits in photometric time series. + +Requirements: +- PyCUDA +- NumPy +- batman-package (optional, for generating synthetic transits) +""" + +import numpy as np +import matplotlib.pyplot as plt + +# Check if we can import TLS modules +try: + from cuvarbase import tls_grids, tls_models, tls + TLS_AVAILABLE = True +except ImportError as e: + print(f"Warning: Could not import TLS modules: {e}") + TLS_AVAILABLE = False + +# Check if batman is available for generating synthetic data +try: + import batman + BATMAN_AVAILABLE = True +except ImportError: + BATMAN_AVAILABLE = False + print("batman-package not available. Using simple synthetic transit.") + + +def generate_synthetic_transit(period=10.0, depth=0.01, duration=0.1, + t0=0.0, ndata=1000, noise_level=0.001, + T_span=100.0): + """ + Generate synthetic light curve with transit. + + Parameters + ---------- + period : float + Orbital period (days) + depth : float + Transit depth (fractional) + duration : float + Transit duration (days) + t0 : float + Mid-transit time (days) + ndata : int + Number of data points + noise_level : float + Gaussian noise level + T_span : float + Total observation span (days) + + Returns + ------- + t, y, dy : ndarray + Time, flux, and uncertainties + """ + # Generate time series + t = np.sort(np.random.uniform(0, T_span, ndata)) + + # Start with flat light curve + y = np.ones(ndata) + + if BATMAN_AVAILABLE: + # Use Batman for realistic transit + params = batman.TransitParams() + params.t0 = t0 + params.per = period + params.rp = np.sqrt(depth) # Radius ratio + params.a = 15.0 # Semi-major axis + params.inc = 90.0 # Edge-on + params.ecc = 0.0 + params.w = 90.0 + params.limb_dark = "quadratic" + params.u = [0.4804, 0.1867] + + m = batman.TransitModel(params, t) + y = m.light_curve(params) + else: + # Simple box transit + phases = (t % period) / period + duration_phase = duration / period + + # Transit at phase 0 + in_transit = (phases < duration_phase / 2) | (phases > 1 - duration_phase / 2) + y[in_transit] -= depth + + # Add noise + noise = np.random.normal(0, noise_level, ndata) + y += noise + + # Uncertainties + dy = np.ones(ndata) * noise_level + + return t, y, dy + + +def run_tls_example(use_gpu=True): + """ + Run TLS example on synthetic data. + + Parameters + ---------- + use_gpu : bool + Use GPU implementation (default: True) + """ + if not TLS_AVAILABLE: + print("TLS modules not available. Cannot run example.") + return + + print("=" * 60) + print("GPU-Accelerated Transit Least Squares Example") + print("=" * 60) + + # Generate synthetic data + print("\n1. Generating synthetic transit...") + period_true = 12.5 # days + depth_true = 0.008 # 0.8% depth + duration_true = 0.12 # days + + t, y, dy = generate_synthetic_transit( + period=period_true, + depth=depth_true, + duration=duration_true, + ndata=800, + noise_level=0.0005, + T_span=100.0 + ) + + print(f" Data points: {len(t)}") + print(f" Time span: {np.max(t) - np.min(t):.1f} days") + print(f" True period: {period_true:.2f} days") + print(f" True depth: {depth_true:.4f} ({depth_true*1e6:.0f} ppm)") + print(f" True duration: {duration_true:.3f} days") + + # Generate period grid + print("\n2. Generating period grid...") + periods = tls_grids.period_grid_ofir( + t, R_star=1.0, M_star=1.0, + oversampling_factor=3, + period_min=8.0, + period_max=20.0 + ) + print(f" Testing {len(periods)} periods from {periods[0]:.2f} to {periods[-1]:.2f} days") + + # Run TLS search + print("\n3. Running TLS search...") + if use_gpu: + try: + results = tls.tls_search_gpu( + t, y, dy, + periods=periods, + R_star=1.0, + M_star=1.0 + ) + print(" ✓ GPU search completed") + except Exception as e: + print(f" ✗ GPU search failed: {e}") + print(" Tip: Make sure you have a CUDA-capable GPU and PyCUDA installed") + return + else: + print(" CPU implementation not yet available") + return + + # Display results + print("\n4. Results:") + print(f" Best period: {results['period']:.4f} ± {results['period_uncertainty']:.4f} days") + print(f" Best depth: {results['depth']:.6f} ({results['depth']*1e6:.1f} ppm)") + print(f" Best duration: {results['duration']:.4f} days") + print(f" Best T0: {results['T0']:.4f} (phase)") + print(f" Number of transits: {results['n_transits']}") + print(f"\n Statistics:") + print(f" SDE: {results['SDE']:.2f}") + print(f" SNR: {results['SNR']:.2f}") + print(f" FAP: {results['FAP']:.2e}") + + # Compare to truth + period_error = np.abs(results['period'] - period_true) + depth_error = np.abs(results['depth'] - depth_true) + duration_error = np.abs(results['duration'] - duration_true) + + print(f"\n Recovery accuracy:") + print(f" Period error: {period_error:.4f} days ({period_error/period_true*100:.1f}%)") + print(f" Depth error: {depth_error:.6f} ({depth_error/depth_true*100:.1f}%)") + print(f" Duration error: {duration_error:.4f} days ({duration_error/duration_true*100:.1f}%)") + + # Plot results + print("\n5. Creating plots...") + fig, axes = plt.subplots(2, 2, figsize=(12, 10)) + + # Plot 1: Periodogram + ax = axes[0, 0] + ax.plot(results['periods'], results['power'], 'b-', linewidth=0.5) + ax.axvline(period_true, color='r', linestyle='--', label='True period') + ax.axvline(results['period'], color='g', linestyle='--', label='Best period') + ax.set_xlabel('Period (days)') + ax.set_ylabel('Power (detrended SR)') + ax.set_title('TLS Periodogram') + ax.legend() + ax.grid(True, alpha=0.3) + + # Plot 2: Chi-squared + ax = axes[0, 1] + ax.plot(results['periods'], results['chi2'], 'b-', linewidth=0.5) + ax.axvline(period_true, color='r', linestyle='--', label='True period') + ax.axvline(results['period'], color='g', linestyle='--', label='Best period') + ax.set_xlabel('Period (days)') + ax.set_ylabel('Chi-squared') + ax.set_title('Chi-squared vs Period') + ax.legend() + ax.grid(True, alpha=0.3) + + # Plot 3: Phase-folded light curve at best period + ax = axes[1, 0] + phases = (t % results['period']) / results['period'] + ax.plot(phases, y, 'k.', alpha=0.3, markersize=2) + # Plot best-fit model + model_phases = np.linspace(0, 1, 1000) + model_flux = np.ones(1000) + duration_phase = results['duration'] / results['period'] + t0_phase = results['T0'] + in_transit = np.abs((model_phases - t0_phase + 0.5) % 1.0 - 0.5) < duration_phase / 2 + model_flux[in_transit] = 1 - results['depth'] + ax.plot(model_phases, model_flux, 'r-', linewidth=2, label='Best-fit model') + ax.set_xlabel('Phase') + ax.set_ylabel('Relative Flux') + ax.set_title(f'Phase-Folded at P={results["period"]:.4f} days') + ax.legend() + ax.grid(True, alpha=0.3) + + # Plot 4: Raw light curve + ax = axes[1, 1] + ax.plot(t, y, 'k.', alpha=0.5, markersize=1) + ax.set_xlabel('Time (days)') + ax.set_ylabel('Relative Flux') + ax.set_title('Raw Light Curve') + ax.grid(True, alpha=0.3) + + plt.tight_layout() + plt.savefig('tls_example_results.png', dpi=150, bbox_inches='tight') + print(" ✓ Plot saved to 'tls_example_results.png'") + + print("\n" + "=" * 60) + print("Example complete!") + print("=" * 60) + + +if __name__ == '__main__': + import sys + + # Check for --no-gpu flag + use_gpu = '--no-gpu' not in sys.argv + + if use_gpu and not TLS_AVAILABLE: + print("Error: TLS modules not available.") + print("Make sure you're in the cuvarbase directory or have installed it.") + sys.exit(1) + + try: + run_tls_example(use_gpu=use_gpu) + except KeyboardInterrupt: + print("\nInterrupted by user") + sys.exit(0) + except Exception as e: + print(f"\nError running example: {e}") + import traceback + traceback.print_exc() + sys.exit(1) diff --git a/scripts/benchmark_batch_keplerian.py b/scripts/benchmark_batch_keplerian.py deleted file mode 100644 index d084473..0000000 --- a/scripts/benchmark_batch_keplerian.py +++ /dev/null @@ -1,301 +0,0 @@ -#!/usr/bin/env python3 -""" -Benchmark BLS with realistic parameters for batch lightcurve processing. - -Uses: -- 10-year time baseline -- Keplerian frequency/q grids -- Typical TESS/ground-based survey ndata values -- Batch processing of multiple lightcurves -""" - -import numpy as np -import time -import json -from datetime import datetime - -try: - from cuvarbase import bls - GPU_AVAILABLE = True -except Exception as e: - GPU_AVAILABLE = False - print(f"GPU not available: {e}") - - -def generate_realistic_lightcurve(ndata, time_baseline_years=10, period=None, - depth=0.01, rho_star=1.0, seed=None): - """ - Generate realistic lightcurve for survey data. - - Parameters - ---------- - ndata : int - Number of observations - time_baseline_years : float - Total time baseline in years - period : float, optional - Transit period in days. If None, generates noise only. - depth : float - Transit depth - rho_star : float - Stellar density in solar units (for Keplerian q) - seed : int, optional - Random seed - - Returns - ------- - t, y, dy : arrays - Time, magnitude, and uncertainties - """ - if seed is not None: - np.random.seed(seed) - - # Generate realistic time sampling (gaps, clusters) - time_baseline_days = time_baseline_years * 365.25 - - # Simulate survey observing pattern: clusters of observations with gaps - n_seasons = int(time_baseline_years) - points_per_season = ndata // n_seasons - - t_list = [] - for season in range(n_seasons): - season_start = season * 365.25 - season_end = season_start + 200 # 200-day observing season - - # Random observations within season - t_season = np.random.uniform(season_start, season_end, points_per_season) - t_list.append(t_season) - - # Add remaining points - remaining = ndata - len(np.concatenate(t_list)) - if remaining > 0: - t_extra = np.random.uniform(0, time_baseline_days, remaining) - t_list.append(t_extra) - - t = np.sort(np.concatenate(t_list)).astype(np.float32) - t = t[:ndata] # Ensure exact ndata - - y = np.ones(ndata, dtype=np.float32) - - if period is not None: - # Add realistic transit signal with Keplerian duration - phase = (t % period) / period - - # Transit duration from Keplerian assumption - q = bls.q_transit(1.0/period, rho=rho_star) - - in_transit = phase < q - y[in_transit] -= depth - - # Add realistic noise - scatter = 0.01 # 1% photometric precision - y += np.random.normal(0, scatter, ndata).astype(np.float32) - dy = np.ones(ndata, dtype=np.float32) * scatter - - return t, y, dy - - -def get_keplerian_grid(t, fmin_frac=1.0, fmax_frac=1.0, samples_per_peak=2, - qmin_fac=0.5, qmax_fac=2.0, rho=1.0): - """ - Generate Keplerian frequency grid for realistic BLS search. - - Parameters - ---------- - t : array - Observation times - fmin_frac, fmax_frac : float - Fraction of auto-determined limits - samples_per_peak : float - Oversampling factor - qmin_fac, qmax_fac : float - Fraction of Keplerian q to search - rho : float - Stellar density in solar units - - Returns - ------- - freqs : array - Frequency grid - qmins, qmaxes : arrays - Min and max q values for each frequency - """ - fmin = bls.fmin_transit(t, rho=rho) * fmin_frac - fmax = bls.fmax_transit(rho=rho, qmax=0.5/qmax_fac) * fmax_frac - - freqs, q0vals = bls.transit_autofreq(t, fmin=fmin, fmax=fmax, - samples_per_peak=samples_per_peak, - qmin_fac=qmin_fac, qmax_fac=qmax_fac, - rho=rho) - - qmins = q0vals * qmin_fac - qmaxes = q0vals * qmax_fac - - return freqs, qmins, qmaxes - - -def benchmark_single_vs_batch(ndata, n_lightcurves, time_baseline=10, n_trials=3): - """ - Benchmark single lightcurve vs batch processing. - - Parameters - ---------- - ndata : int - Number of observations per lightcurve - n_lightcurves : int - Number of lightcurves to process - time_baseline : float - Time baseline in years - n_trials : int - Number of trials - - Returns - ------- - results : dict - Benchmark results - """ - print(f"\nBenchmarking ndata={ndata}, n_lightcurves={n_lightcurves}...") - - # Generate realistic lightcurves - lightcurves = [] - for i in range(n_lightcurves): - t, y, dy = generate_realistic_lightcurve(ndata, time_baseline_years=time_baseline, - period=5.0 if i % 3 == 0 else None, - seed=42+i) - lightcurves.append((t, y, dy)) - - # Generate Keplerian frequency grid (same for all) - t0, _, _ = lightcurves[0] - freqs, qmins, qmaxes = get_keplerian_grid(t0) - - nfreq = len(freqs) - print(f" Keplerian grid: {nfreq} frequencies") - print(f" Period range: {1/freqs[-1]:.2f} - {1/freqs[0]:.2f} days") - - results = { - 'ndata': int(ndata), - 'n_lightcurves': int(n_lightcurves), - 'nfreq': int(nfreq), - 'time_baseline_years': float(time_baseline) - } - - # Benchmark 1: Sequential processing with standard kernel - print(" Sequential (standard)...") - times_seq_std = [] - - for trial in range(n_trials): - start = time.time() - for t, y, dy in lightcurves: - _ = bls.eebls_gpu_fast(t, y, dy, freqs, qmin=qmins, qmax=qmaxes) - elapsed = time.time() - start - times_seq_std.append(elapsed) - - mean_seq_std = np.mean(times_seq_std) - print(f" Mean: {mean_seq_std:.3f}s") - print(f" Per LC: {mean_seq_std/n_lightcurves:.3f}s") - - results['sequential_standard'] = { - 'total_time': float(mean_seq_std), - 'per_lc_time': float(mean_seq_std / n_lightcurves), - 'throughput_lc_per_sec': float(n_lightcurves / mean_seq_std) - } - - # Benchmark 2: Sequential with adaptive kernel - print(" Sequential (adaptive)...") - times_seq_adapt = [] - - for trial in range(n_trials): - start = time.time() - for t, y, dy in lightcurves: - _ = bls.eebls_gpu_fast_adaptive(t, y, dy, freqs, qmin=qmins, qmax=qmaxes) - elapsed = time.time() - start - times_seq_adapt.append(elapsed) - - mean_seq_adapt = np.mean(times_seq_adapt) - print(f" Mean: {mean_seq_adapt:.3f}s") - print(f" Per LC: {mean_seq_adapt/n_lightcurves:.3f}s") - - results['sequential_adaptive'] = { - 'total_time': float(mean_seq_adapt), - 'per_lc_time': float(mean_seq_adapt / n_lightcurves), - 'throughput_lc_per_sec': float(n_lightcurves / mean_seq_adapt) - } - - # Compute speedups - speedup = mean_seq_std / mean_seq_adapt - print(f" Speedup (adaptive vs standard): {speedup:.2f}x") - - results['speedup_adaptive_vs_standard'] = float(speedup) - - # Estimate cost savings - cost_per_hour = 0.34 # RunPod RTX 4000 Ada spot price - hours_std = (mean_seq_std / 3600) * (5e6 / n_lightcurves) # Scale to 5M LCs - hours_adapt = (mean_seq_adapt / 3600) * (5e6 / n_lightcurves) - - cost_std = hours_std * cost_per_hour - cost_adapt = hours_adapt * cost_per_hour - cost_savings = cost_std - cost_adapt - - print(f"\n Estimated cost for 5M lightcurves:") - print(f" Standard: ${cost_std:.2f} ({hours_std:.1f} hours)") - print(f" Adaptive: ${cost_adapt:.2f} ({hours_adapt:.1f} hours)") - print(f" Savings: ${cost_savings:.2f} ({100*(1-cost_adapt/cost_std):.1f}%)") - - results['cost_estimate_5M_lcs'] = { - 'standard_usd': float(cost_std), - 'adaptive_usd': float(cost_adapt), - 'savings_usd': float(cost_savings), - 'savings_percent': float(100*(1-cost_adapt/cost_std)) - } - - return results - - -def main(): - """Run realistic batch benchmark.""" - print("=" * 80) - print("BATCH KEPLERIAN BLS BENCHMARK") - print("=" * 80) - print("\nRealistic parameters:") - print(" - 10-year time baseline") - print(" - Keplerian frequency/q grids") - print(" - Survey-like time sampling (seasonal gaps)") - print() - - if not GPU_AVAILABLE: - print("ERROR: GPU not available") - return - - all_results = { - 'timestamp': datetime.now().isoformat(), - 'benchmarks': [] - } - - # Test configurations representing different survey types - configs = [ - # (ndata, n_lcs, description) - (100, 10, "Sparse ground-based (e.g., MEarth, HATNet)"), - (500, 10, "Dense ground-based (e.g., NGTS, HATPI)"), - (20000, 5, "Space-based (e.g., TESS, Kepler)"), - ] - - for ndata, n_lcs, desc in configs: - print(f"\n{desc}") - print("-" * 80) - - results = benchmark_single_vs_batch(ndata, n_lcs, time_baseline=10, n_trials=3) - results['description'] = desc - all_results['benchmarks'].append(results) - - # Save results - filename = 'bls_batch_keplerian_benchmark.json' - with open(filename, 'w') as f: - json.dump(all_results, f, indent=2) - - print(f"\n{'=' * 80}") - print(f"Results saved to: {filename}") - print("=" * 80) - - -if __name__ == '__main__': - main() diff --git a/scripts/gpu-test.sh b/scripts/gpu-test.sh new file mode 100755 index 0000000..fa8d327 --- /dev/null +++ b/scripts/gpu-test.sh @@ -0,0 +1,75 @@ +#!/bin/bash +# One-shot: create pod -> setup -> run tests -> stop pod. +# +# Usage: +# ./scripts/gpu-test.sh # Run all tests +# ./scripts/gpu-test.sh cuvarbase/tests/test_tls_basic.py -v # Specific tests +# ./scripts/gpu-test.sh --keep cuvarbase/tests/test_tls_basic.py # Don't stop pod after + +set -e + +KEEP_POD=false +if [ "$1" = "--keep" ]; then + KEEP_POD=true + shift +fi + +TEST_ARGS="${@:-cuvarbase/tests/test_tls_basic.py -v}" + +echo "========================================" +echo "GPU Test: full lifecycle" +echo "========================================" +echo "" + +# Step 1: Create pod (if not already running) +source .runpod.env 2>/dev/null || true + +NEED_CREATE=true +if [ -n "${RUNPOD_POD_ID}" ] && [ -n "${RUNPOD_API_KEY}" ]; then + # Check if existing pod is still running + API_URL="https://api.runpod.io/graphql?api_key=${RUNPOD_API_KEY}" + STATUS=$(curl -s --request POST \ + --header 'content-type: application/json' \ + --url "${API_URL}" \ + --data "{\"query\": \"query { pod(input: {podId: \\\"${RUNPOD_POD_ID}\\\"}) { desiredStatus } }\"}" \ + | python3 -c " +import sys, json +try: + data = json.load(sys.stdin) + pod = data.get('data', {}).get('pod') + print(pod['desiredStatus'] if pod else 'GONE') +except: print('GONE') +" 2>/dev/null) + + if [ "${STATUS}" = "RUNNING" ]; then + echo "Reusing existing pod ${RUNPOD_POD_ID}" + NEED_CREATE=false + fi +fi + +if [ "${NEED_CREATE}" = true ]; then + echo "Step 1: Creating pod..." + ./scripts/runpod-create.sh + echo "" + echo "Step 2: Setting up environment..." + ./scripts/setup-remote.sh +else + echo "Step 1: Pod already running, syncing code..." + ./scripts/sync-to-runpod.sh +fi + +echo "" +echo "Step 3: Running tests..." +echo "========================================" +./scripts/test-remote.sh ${TEST_ARGS} +TEST_EXIT=$? + +echo "" +if [ "${KEEP_POD}" = true ]; then + echo "Pod kept running (--keep flag). Stop with: ./scripts/runpod-stop.sh" +else + echo "Step 4: Stopping pod..." + ./scripts/runpod-stop.sh +fi + +exit ${TEST_EXIT} diff --git a/scripts/runpod-create.sh b/scripts/runpod-create.sh new file mode 100755 index 0000000..617b6f8 --- /dev/null +++ b/scripts/runpod-create.sh @@ -0,0 +1,205 @@ +#!/bin/bash +# Create a RunPod GPU pod and configure .runpod.env for SSH access. +# +# Usage: +# ./scripts/runpod-create.sh # Default: cheapest available GPU +# ./scripts/runpod-create.sh "NVIDIA RTX A4000" # Specific GPU type +# +# Requires RUNPOD_API_KEY in .runpod.env + +set -e + +# Load config +if [ ! -f .runpod.env ]; then + echo "Error: .runpod.env not found. Copy .runpod.env.template and add your RUNPOD_API_KEY." + exit 1 +fi +source .runpod.env + +if [ -z "${RUNPOD_API_KEY}" ]; then + echo "Error: RUNPOD_API_KEY not set in .runpod.env" + echo "Get your key from https://www.runpod.io/console/user/settings" + exit 1 +fi + +GPU_TYPE="${1:-NVIDIA RTX A4000}" +POD_NAME="cuvarbase-dev" +IMAGE="runpod/pytorch:2.4.0-py3.11-cuda12.4.1-devel-ubuntu22.04" +VOLUME_GB=20 +DISK_GB=20 +API_URL="https://api.runpod.io/graphql?api_key=${RUNPOD_API_KEY}" + +echo "Creating RunPod instance..." +echo " GPU: ${GPU_TYPE}" +echo " Image: ${IMAGE}" + +# Create pod +RESPONSE=$(curl -s --request POST \ + --header 'content-type: application/json' \ + --url "${API_URL}" \ + --data "{\"query\": \"mutation { podFindAndDeployOnDemand(input: { cloudType: ALL, gpuCount: 1, volumeInGb: ${VOLUME_GB}, containerDiskInGb: ${DISK_GB}, minVcpuCount: 2, minMemoryInGb: 15, gpuTypeId: \\\"${GPU_TYPE}\\\", name: \\\"${POD_NAME}\\\", imageName: \\\"${IMAGE}\\\", ports: \\\"22/tcp\\\", volumeMountPath: \\\"/workspace\\\" }) { id costPerHr } }\"}") + +# Extract pod ID +POD_ID=$(echo "${RESPONSE}" | python3 -c " +import sys, json +data = json.load(sys.stdin) +if 'errors' in data: + print('ERROR: ' + data['errors'][0]['message'], file=sys.stderr) + sys.exit(1) +pod = data['data']['podFindAndDeployOnDemand'] +print(pod['id']) +" 2>&1) + +if [[ "${POD_ID}" == ERROR:* ]]; then + echo "${POD_ID}" + echo "" + echo "Full response: ${RESPONSE}" + exit 1 +fi + +COST=$(echo "${RESPONSE}" | python3 -c " +import sys, json +data = json.load(sys.stdin) +print(data['data']['podFindAndDeployOnDemand']['costPerHr']) +") + +echo "Pod created: ${POD_ID} (\$${COST}/hr)" +echo "Waiting for pod to start..." + +# Poll until running and SSH is available +MAX_WAIT=180 +WAITED=0 +SSH_IP="" +SSH_PORT="" + +while [ ${WAITED} -lt ${MAX_WAIT} ]; do + sleep 5 + WAITED=$((WAITED + 5)) + + STATUS_RESPONSE=$(curl -s --request POST \ + --header 'content-type: application/json' \ + --url "${API_URL}" \ + --data "{\"query\": \"query { pod(input: {podId: \\\"${POD_ID}\\\"}) { id desiredStatus runtime { uptimeInSeconds ports { ip isIpPublic privatePort publicPort type } } } }\"}") + + # Parse status + eval "$(echo "${STATUS_RESPONSE}" | python3 -c " +import sys, json +data = json.load(sys.stdin) +pod = data['data']['pod'] +status = pod.get('desiredStatus', 'UNKNOWN') +print(f'POD_STATUS={status}') +runtime = pod.get('runtime') +if runtime and runtime.get('ports'): + for port in runtime['ports']: + if port['privatePort'] == 22 and port['isIpPublic']: + print(f'SSH_IP={port[\"ip\"]}') + print(f'SSH_PORT={port[\"publicPort\"]}') +")" + + printf "\r Status: %-10s Waited: %ds" "${POD_STATUS}" "${WAITED}" + + if [ -n "${SSH_IP}" ] && [ -n "${SSH_PORT}" ]; then + echo "" + break + fi +done + +if [ -z "${SSH_IP}" ] || [ -z "${SSH_PORT}" ]; then + echo "" + echo "Error: Pod did not become SSH-ready within ${MAX_WAIT}s" + echo "Pod ID: ${POD_ID} (check RunPod dashboard)" + echo "Last status: ${POD_STATUS}" + exit 1 +fi + +echo "SSH port reported: ${SSH_IP}:${SSH_PORT}" + +SSH_KEY_OPT="" +if [ -f ~/.ssh/id_ed25519 ]; then + SSH_KEY_OPT="-i ~/.ssh/id_ed25519" +fi + +# Get podHostId for proxy SSH +echo "Getting proxy SSH credentials..." +POD_HOST_ID=$(curl -s --request POST \ + --header "content-type: application/json" \ + --url "${API_URL}" \ + --data "{\"query\": \"query { pod(input: {podId: \\\"${POD_ID}\\\"}) { machine { podHostId } } }\"}" \ + | python3 -c "import sys, json; print(json.load(sys.stdin)['data']['pod']['machine']['podHostId'])") + +echo "Pod host ID: ${POD_HOST_ID}" + +# Start SSHD via RunPod proxy (the image doesn't auto-start it) +echo "Starting SSH daemon via RunPod proxy..." +PROXY_SSH="ssh -tt -o ConnectTimeout=15 -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${SSH_KEY_OPT} ${POD_HOST_ID}@ssh.runpod.io" + +echo 'ssh-keygen -A 2>/dev/null; service ssh start; mkdir -p /root/.ssh; chmod 700 /root/.ssh; echo "SSHD_SETUP_DONE"; exit' \ + | ${PROXY_SSH} 2>&1 | grep -q "SSHD_SETUP_DONE" && echo "SSHD started." || echo "Warning: SSHD setup may have failed." + +# Add local SSH public key to authorized_keys +if [ -f ~/.ssh/id_ed25519.pub ]; then + LOCAL_PUBKEY=$(cat ~/.ssh/id_ed25519.pub) + echo "mkdir -p /root/.ssh && echo \"${LOCAL_PUBKEY}\" >> /root/.ssh/authorized_keys && chmod 600 /root/.ssh/authorized_keys && echo AUTH_OK; exit" \ + | ${PROXY_SSH} 2>&1 | grep -q "AUTH_OK" && echo "SSH key authorized." || echo "Warning: key setup may have failed." +fi + +# Wait for direct SSH to accept connections +echo "Waiting for direct SSH..." +SSH_READY=false +SSH_WAIT=0 +SSH_MAX_WAIT=30 +while [ ${SSH_WAIT} -lt ${SSH_MAX_WAIT} ]; do + if ssh -o ConnectTimeout=3 -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o BatchMode=yes \ + ${SSH_KEY_OPT} -p ${SSH_PORT} root@${SSH_IP} "echo ok" >/dev/null 2>&1; then + SSH_READY=true + break + fi + sleep 3 + SSH_WAIT=$((SSH_WAIT + 3)) + printf "\r SSH wait: %ds" "${SSH_WAIT}" +done +echo "" + +if [ "${SSH_READY}" != true ]; then + echo "Warning: Direct SSH not responding. Proxy SSH should still work." +fi + +echo "SSH ready: ${SSH_IP}:${SSH_PORT}" + +# Update .runpod.env with new connection details (preserve API key and other settings) +python3 -c " +import re + +with open('.runpod.env', 'r') as f: + content = f.read() + +replacements = { + 'RUNPOD_SSH_HOST': '${SSH_IP}', + 'RUNPOD_SSH_PORT': '${SSH_PORT}', + 'RUNPOD_SSH_USER': 'root', + 'RUNPOD_POD_ID': '${POD_ID}', +} + +for key, val in replacements.items(): + pattern = rf'^#?\s*{key}=.*$' + replacement = f'{key}={val}' + if re.search(pattern, content, re.MULTILINE): + content = re.sub(pattern, replacement, content, flags=re.MULTILINE) + else: + content = content.rstrip() + f'\n{replacement}\n' + +with open('.runpod.env', 'w') as f: + f.write(content) +" + +echo "" +echo "Updated .runpod.env with new connection details." +echo "" +echo "Pod ID: ${POD_ID}" +echo "SSH: ssh -i ~/.ssh/id_ed25519 -p ${SSH_PORT} root@${SSH_IP}" +echo "Cost: \$${COST}/hr" +echo "" +echo "Next steps:" +echo " ./scripts/setup-remote.sh # Install cuvarbase" +echo " ./scripts/test-remote.sh cuvarbase/tests/test_tls_basic.py -v # Run TLS tests" +echo " ./scripts/runpod-stop.sh # Stop pod when done" diff --git a/scripts/runpod-stop.sh b/scripts/runpod-stop.sh new file mode 100755 index 0000000..eb88393 --- /dev/null +++ b/scripts/runpod-stop.sh @@ -0,0 +1,42 @@ +#!/bin/bash +# Stop (or terminate) the RunPod pod. +# +# Usage: +# ./scripts/runpod-stop.sh # Stop (can resume later, keeps volume) +# ./scripts/runpod-stop.sh --terminate # Terminate (deletes everything) + +set -e + +if [ ! -f .runpod.env ]; then + echo "Error: .runpod.env not found" + exit 1 +fi +source .runpod.env + +if [ -z "${RUNPOD_API_KEY}" ]; then + echo "Error: RUNPOD_API_KEY not set in .runpod.env" + exit 1 +fi + +if [ -z "${RUNPOD_POD_ID}" ]; then + echo "Error: RUNPOD_POD_ID not set in .runpod.env (no active pod?)" + exit 1 +fi + +API_URL="https://api.runpod.io/graphql?api_key=${RUNPOD_API_KEY}" + +if [ "$1" = "--terminate" ]; then + echo "Terminating pod ${RUNPOD_POD_ID}..." + RESPONSE=$(curl -s --request POST \ + --header 'content-type: application/json' \ + --url "${API_URL}" \ + --data "{\"query\": \"mutation { podTerminate(input: {podId: \\\"${RUNPOD_POD_ID}\\\"}) }\"}") + echo "Pod terminated." +else + echo "Stopping pod ${RUNPOD_POD_ID}..." + RESPONSE=$(curl -s --request POST \ + --header 'content-type: application/json' \ + --url "${API_URL}" \ + --data "{\"query\": \"mutation { podStop(input: {podId: \\\"${RUNPOD_POD_ID}\\\"}) { id desiredStatus } }\"}") + echo "Pod stopped. Resume later from the RunPod dashboard, or re-run ./scripts/runpod-create.sh" +fi diff --git a/scripts/setup-remote.sh b/scripts/setup-remote.sh index a955181..d2f9319 100755 --- a/scripts/setup-remote.sh +++ b/scripts/setup-remote.sh @@ -13,7 +13,7 @@ fi source .runpod.env # Build SSH connection string -SSH_OPTS="-p ${RUNPOD_SSH_PORT}" +SSH_OPTS="-p ${RUNPOD_SSH_PORT} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o LogLevel=ERROR" if [ -n "${RUNPOD_SSH_KEY}" ]; then SSH_OPTS="${SSH_OPTS} -i ${RUNPOD_SSH_KEY}" fi @@ -35,10 +35,16 @@ set -e cd /workspace/cuvarbase -# Set up CUDA environment -export PATH=/usr/local/cuda-12.8/bin:$PATH -export CUDA_HOME=/usr/local/cuda-12.8 -export LD_LIBRARY_PATH=/usr/local/cuda-12.8/lib64:$LD_LIBRARY_PATH +# Set up CUDA environment (auto-detect version) +if [ -d /usr/local/cuda ]; then + export PATH=/usr/local/cuda/bin:$PATH + export CUDA_HOME=/usr/local/cuda + export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH +elif [ -d /usr/local/cuda-12.4 ]; then + export PATH=/usr/local/cuda-12.4/bin:$PATH + export CUDA_HOME=/usr/local/cuda-12.4 + export LD_LIBRARY_PATH=/usr/local/cuda-12.4/lib64:$LD_LIBRARY_PATH +fi # Check if CUDA is available echo "Checking CUDA availability..." @@ -61,47 +67,10 @@ import re import os import glob -# Find skcuda installation (could be in different python versions) -skcuda_paths = glob.glob('/usr/local/lib/python*/dist-packages/skcuda/misc.py') -if not skcuda_paths: - print("Warning: skcuda/misc.py not found, skipping patch") - exit(0) - -misc_path = skcuda_paths[0] -print(f"Patching {misc_path}...") - -# Read the file -with open(misc_path, 'r') as f: - content = f.read() - -# Replace the problematic lines around line 637 -old_code = """# List of available numerical types provided by numpy: -num_types = [np.sctypeDict[t] for t in \\ - np.typecodes['AllInteger']+np.typecodes['AllFloat']]""" - -new_code = """# List of available numerical types provided by numpy: -# Fixed for numpy 2.x compatibility -try: - num_types = [np.sctypeDict[t] for t in \\ - np.typecodes['AllInteger']+np.typecodes['AllFloat']] -except KeyError: - # numpy 2.x: build list manually - num_types = [np.int8, np.int16, np.int32, np.int64, - np.uint8, np.uint16, np.uint32, np.uint64, - np.float16, np.float32, np.float64]""" - -if old_code in content: - content = content.replace(old_code, new_code) - with open(misc_path, 'w') as f: - f.write(content) - print(f"✓ Patched {misc_path}") -else: - print(f"Note: Already patched or code structure changed") - -# Patch np.sctypes usage across all scikit-cuda files -print("") -print("Patching np.sctypes usage in scikit-cuda...") skcuda_files = glob.glob('/usr/local/lib/python*/dist-packages/skcuda/*.py') +if not skcuda_files: + print("Warning: skcuda not found, skipping patch") + exit(0) for filepath in skcuda_files: with open(filepath, 'r') as f: @@ -109,34 +78,28 @@ for filepath in skcuda_files: original = content - # Replace np.sctypes with explicit types - content = re.sub( - r'np\.sctypes\[(["\'])float\1\]', - '[np.float16, np.float32, np.float64]', - content - ) + # Replace num_types list comprehension using typeDict or sctypeDict + # This handles both np.typeDict and np.sctypeDict variants content = re.sub( - r'np\.sctypes\[(["\'])int\1\]', - '[np.int8, np.int16, np.int32, np.int64]', - content - ) - content = re.sub( - r'np\.sctypes\[(["\'])uint\1\]', - '[np.uint8, np.uint16, np.uint32, np.uint64]', - content - ) - content = re.sub( - r'np\.sctypes\[(["\'])complex\1\]', - '[np.complex64, np.complex128]', + r'num_types\s*=\s*\[np\.(?:type|sctype)Dict\[t\]\s+for\s+t\s+in\s*\\?\s*\n\s*np\.typecodes\[.AllInteger.\]\+np\.typecodes\[.AllFloat.\]\]', + 'num_types = [np.int8, np.int16, np.int32, np.int64,\n' + ' np.uint8, np.uint16, np.uint32, np.uint64,\n' + ' np.float16, np.float32, np.float64]', content ) + # Replace np.sctypes with explicit types + content = re.sub(r'np\.sctypes\[(["\'])float\1\]', '[np.float16, np.float32, np.float64]', content) + content = re.sub(r'np\.sctypes\[(["\'])int\1\]', '[np.int8, np.int16, np.int32, np.int64]', content) + content = re.sub(r'np\.sctypes\[(["\'])uint\1\]', '[np.uint8, np.uint16, np.uint32, np.uint64]', content) + content = re.sub(r'np\.sctypes\[(["\'])complex\1\]', '[np.complex64, np.complex128]', content) + if content != original: with open(filepath, 'w') as f: f.write(content) - print(f"✓ Patched {os.path.basename(filepath)}") + print(f" Patched {os.path.basename(filepath)}") -print("✓ All scikit-cuda files patched for numpy 2.x compatibility") +print("All scikit-cuda files patched for numpy 2.x compatibility") ENDPYTHON echo "" diff --git a/scripts/sync-to-runpod.sh b/scripts/sync-to-runpod.sh index bbbba6a..a47201d 100755 --- a/scripts/sync-to-runpod.sh +++ b/scripts/sync-to-runpod.sh @@ -13,7 +13,7 @@ fi source .runpod.env # Build SSH connection string -SSH_OPTS="-p ${RUNPOD_SSH_PORT}" +SSH_OPTS="-p ${RUNPOD_SSH_PORT} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o LogLevel=ERROR" if [ -n "${RUNPOD_SSH_KEY}" ]; then SSH_OPTS="${SSH_OPTS} -i ${RUNPOD_SSH_KEY}" fi diff --git a/scripts/test-remote.sh b/scripts/test-remote.sh index a242b4f..678df14 100755 --- a/scripts/test-remote.sh +++ b/scripts/test-remote.sh @@ -13,7 +13,7 @@ fi source .runpod.env # Build SSH connection string -SSH_OPTS="-p ${RUNPOD_SSH_PORT}" +SSH_OPTS="-p ${RUNPOD_SSH_PORT} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o LogLevel=ERROR" if [ -n "${RUNPOD_SSH_KEY}" ]; then SSH_OPTS="${SSH_OPTS} -i ${RUNPOD_SSH_KEY}" fi @@ -40,7 +40,7 @@ echo "Step 2: Running tests on RunPod..." echo "==========================================" # Run tests remotely and stream output -ssh ${SSH_OPTS} ${SSH_HOST} "export PATH=/usr/local/cuda-12.8/bin:\$PATH && export CUDA_HOME=/usr/local/cuda-12.8 && export LD_LIBRARY_PATH=/usr/local/cuda-12.8/lib64:\$LD_LIBRARY_PATH && cd ${RUNPOD_REMOTE_DIR} && pytest ${TEST_PATH} ${PYTEST_ARGS} -v" +ssh ${SSH_OPTS} ${SSH_HOST} "export PATH=/usr/local/cuda/bin:\$PATH && export CUDA_HOME=/usr/local/cuda && export LD_LIBRARY_PATH=/usr/local/cuda/lib64:\$LD_LIBRARY_PATH && cd ${RUNPOD_REMOTE_DIR} && pytest ${TEST_PATH} ${PYTEST_ARGS} -v" echo "" echo "=========================================="