m96-chan
diff --git a/‎README.md‎
Lines changed: 53 additions & 0 deletions b/‎README.md‎
Lines changed: 53 additions & 0 deletions
diff --git a/‎bench_all_strategies.py‎
Lines changed: 20 additions & 9 deletions b/‎bench_all_strategies.py‎
Lines changed: 20 additions & 9 deletions
diff --git a/‎bench_batch_decode.py‎
Lines changed: 3 additions & 3 deletions b/‎bench_batch_decode.py‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎bench_e2e_batch.py‎
Lines changed: 11 additions & 7 deletions b/‎bench_e2e_batch.py‎
Lines changed: 11 additions & 7 deletions
diff --git a/‎bench_graph_replay_only.py‎
Lines changed: 15 additions & 7 deletions b/‎bench_graph_replay_only.py‎
Lines changed: 15 additions & 7 deletions
diff --git a/‎bench_jacobi_lookahead.py‎
Lines changed: 32 additions & 17 deletions b/‎bench_jacobi_lookahead.py‎
Lines changed: 32 additions & 17 deletions
@@ -33,6 +33,58 @@ PyGPUkit aims to be the "micro-runtime for GPU computing": small, fast, and idea
 
 ---
 
+## What's New in v0.2.12
+
+### GPU Audio Processing (Driver-Only)
+Comprehensive audio processing operations built on a custom Radix-2 FFT, with no cuFFT dependency.
+
+| Category | Operations |
+|----------|------------|
+| **Time-Frequency** | `stft`, `istft`, `griffin_lim` |
+| **Spectral Features** | `spectral_centroid`, `spectral_bandwidth`, `spectral_rolloff`, `spectral_flatness`, `spectral_contrast` |
+| **Pitch Detection** | `detect_pitch_yin`, `detect_pitch_yin_frames`, `autocorrelation` |
+| **Music Analysis** | `cqt`, `chroma_stft`, `chroma_cqt`, `zero_crossing_rate` |
+| **Source Separation** | `hpss`, `harmonic`, `percussive` |
+| **Time/Pitch** | `time_stretch`, `pitch_shift` |
+
+```python
+from pygpukit.ops import audio
+import numpy as np
+
+# Build an input buffer (random noise stands in for real audio)
+samples = np.random.randn(16000).astype(np.float32)  # 1 sec @ 16kHz
+buf = audio.from_pcm(samples, sample_rate=16000)
+
+# STFT -> Magnitude -> Griffin-Lim reconstruction
+stft_out = audio.stft(buf, n_fft=512, hop_length=160)
+mag = audio.magnitude_spectrum(stft_out)
+reconstructed = audio.griffin_lim(mag, n_iter=32)
+
+# Spectral features
+centroid = audio.spectral_centroid(mag, sample_rate=16000)
+flatness = audio.spectral_flatness(mag)
+
+# HPSS (Harmonic-Percussive Separation)
+harmonic, percussive = audio.hpss(mag, kernel_size=17)
+
+# Time stretch (slow down to half speed)
+slow = audio.time_stretch(buf, rate=0.5)
+
+# Pitch shift (+12 semitones = 1 octave up)
+higher = audio.pitch_shift(buf, sample_rate=16000, n_steps=12)
+```
+
+### Previous Audio Features (v0.2.11)
+| Feature | Description |
+|---------|-------------|
+| **STFT** | Custom Radix-2 FFT (no cuFFT) |
+| **Mel Filterbank** | Whisper-compatible preprocessing |
+| **MFCC** | DCT-II based extraction |
+| **VAD** | Voice Activity Detection |
+| **Streaming** | Ring buffer, windowing |
+
+---
+
 ## What's New in v0.2.11
 
 ### Batch Decode Support
@@ -624,6 +676,7 @@ PyGPUkit/
 | **v0.2.9** | **Unified LLM interface** (CausalTransformerModel), ModelSpec abstraction, GPT-2/LLaMA/Qwen3 support |
 | **v0.2.10** | **Dynamic cuBLASLt loading**, CUDA Graph optimizations, descriptor caching |
 | **v0.2.11** | **Batch decode** (6.8x speedup), Decode Strategy framework, Driver API async, Dual CUDA builds, RTX 5090 (SM120) |
+| **v0.2.12** | **Advanced audio processing** (ISTFT, Griffin-Lim, HPSS, CQT, pitch detection, time stretch) |
 
 ### Planned
 
 
@@ -162,8 +162,11 @@ def main():
 
         # Allocate batch buffers
         batch_buffers = DecodeBuffers.allocate(
-            model.config, dtype=dtype, use_qk_norm=use_qk_norm, vocab_size=vocab_size,
-            max_batch_size=batch_size
+            model.config,
+            dtype=dtype,
+            use_qk_norm=use_qk_norm,
+            vocab_size=vocab_size,
+            max_batch_size=batch_size,
         )
 
         init_kv_caches(model, MAX_SEQ_LEN, dtype)
@@ -269,11 +272,14 @@ def main():
         tps_spec = total_tokens / t_spec
         accept_rate = total_accepted / total_drafted if total_drafted > 0 else 0
         results["DecodeSpeculative"] = {
-            "time": t_spec, "tps": tps_spec, "tokens": total_tokens,
-            "accept_rate": accept_rate, "iterations": iterations
+            "time": t_spec,
+            "tps": tps_spec,
+            "tokens": total_tokens,
+            "accept_rate": accept_rate,
+            "iterations": iterations,
         }
         print(f"  Tokens generated: {total_tokens}")
-        print(f"  Iterations: {iterations} (avg {total_tokens/iterations:.1f} tok/iter)")
+        print(f"  Iterations: {iterations} (avg {total_tokens / iterations:.1f} tok/iter)")
         print(f"  Accept rate: {accept_rate:.1%}")
         print(f"  Time: {t_spec:.3f}s")
         print(f"  Throughput: {tps_spec:.1f} tok/s")
@@ -338,11 +344,14 @@ def main():
         tps_jacobi = total_tokens / t_jacobi
         converge_rate = total_converged / iterations if iterations > 0 else 0
         results["DecodeJacobi"] = {
-            "time": t_jacobi, "tps": tps_jacobi, "tokens": total_tokens,
-            "converge_rate": converge_rate, "iterations": iterations
+            "time": t_jacobi,
+            "tps": tps_jacobi,
+            "tokens": total_tokens,
+            "converge_rate": converge_rate,
+            "iterations": iterations,
         }
         print(f"  Tokens generated: {total_tokens}")
-        print(f"  Iterations: {iterations} (avg {total_tokens/iterations:.1f} tok/iter)")
+        print(f"  Iterations: {iterations} (avg {total_tokens / iterations:.1f} tok/iter)")
         print(f"  Convergence rate: {converge_rate:.1%}")
         print(f"  Time: {t_jacobi:.3f}s")
         print(f"  Throughput: {tps_jacobi:.1f} tok/s")
@@ -366,7 +375,9 @@ def main():
             print(f"{name:<25} {'SKIPPED':<10}")
         else:
             speedup = data["tps"] / baseline_tps
-            print(f"{name:<25} {data['tokens']:<10} {data['time']:<12.3f} {data['tps']:<10.1f} {speedup:<10.2f}x")
+            print(
+                f"{name:<25} {data['tokens']:<10} {data['time']:<12.3f} {data['tps']:<10.1f} {speedup:<10.2f}x"
+            )
 
     print()
     print("Notes:")
 
@@ -2,12 +2,14 @@
 """Benchmark batch decode vs sequential decode performance."""
 
 import numpy as np
-import time
 
 model_path = "C:/Users/y_har/.cache/huggingface/hub/models--Aratako--Qwen3-8B-ERP-v0.1/snapshots/8311aa4482f02c2de93872e4979887def1841faf/model.safetensors.index.json"
 tokenizer_path = "C:/Users/y_har/.cache/huggingface/hub/models--Aratako--Qwen3-8B-ERP-v0.1/snapshots/8311aa4482f02c2de93872e4979887def1841faf/tokenizer.json"
 
 from tokenizers import Tokenizer
+
+from pygpukit import CudaEvent, event_elapsed_us
+from pygpukit.core import default_stream, from_numpy
 from pygpukit.llm import (
     ChatMessage,
     detect_model_spec,
@@ -16,9 +18,7 @@
     load_safetensors,
 )
 from pygpukit.llm.model import precompute_freqs_cis, sample_token
-from pygpukit.core import default_stream, from_numpy
 from pygpukit.ops.basic import kv_cache_prefill_gqa
-from pygpukit import CudaEvent, event_elapsed_us
 
 MAX_SEQ_LEN = 512
 NUM_ITERATIONS = 10
 
@@ -2,12 +2,14 @@
 """End-to-end benchmark: Sequential vs Batch decode for text generation."""
 
 import numpy as np
-import time
 
 model_path = "C:/Users/y_har/.cache/huggingface/hub/models--Aratako--Qwen3-8B-ERP-v0.1/snapshots/8311aa4482f02c2de93872e4979887def1841faf/model.safetensors.index.json"
 tokenizer_path = "C:/Users/y_har/.cache/huggingface/hub/models--Aratako--Qwen3-8B-ERP-v0.1/snapshots/8311aa4482f02c2de93872e4979887def1841faf/tokenizer.json"
 
 from tokenizers import Tokenizer
+
+from pygpukit import CudaEvent, event_elapsed_ms
+from pygpukit.core import default_stream, from_numpy
 from pygpukit.llm import (
     ChatMessage,
     detect_model_spec,
@@ -16,9 +18,7 @@
     load_safetensors,
 )
 from pygpukit.llm.model import precompute_freqs_cis, sample_token
-from pygpukit.core import default_stream, from_numpy
 from pygpukit.ops.basic import kv_cache_prefill_gqa
-from pygpukit import CudaEvent, event_elapsed_ms
 
 MAX_SEQ_LEN = 512
 GEN_TOKENS = 32  # Number of tokens to generate
@@ -177,13 +177,13 @@ def generate_batch_parallel(model, tokenizer, first_token, prefill_len, kv_backu
         remaining = len(draft_tokens) - idx
         current_batch = min(batch_size, remaining)
 
-        batch_tokens = draft_tokens[idx:idx + current_batch]
+        batch_tokens = draft_tokens[idx : idx + current_batch]
 
         # Batch verify
         hidden = model._decode_step_fixed_cache_batch(
             batch_tokens,
             position,
-            context_len + current_batch  # Context includes new tokens
+            context_len + current_batch,  # Context includes new tokens
         )
 
         # Get logits for verification (would compare with draft in real speculative)
@@ -305,8 +305,12 @@ def main():
     print(f"\n{'Method':<30} {'Time (ms)':<12} {'tok/s':<10} {'Speedup':<10}")
     print("-" * 62)
     print(f"{'Sequential':<30} {seq_time:<12.1f} {seq_tps:<10.2f} {'1.00x':<10}")
-    print(f"{'Batch Verify (batch=4)':<30} {batch_time:<12.1f} {batch_tps:<10.2f} {batch_tps/seq_tps:<10.2f}x")
-    print(f"{'Batch Verify (batch=8)':<30} {batch8_time:<12.1f} {batch8_tps:<10.2f} {batch8_tps/seq_tps:<10.2f}x")
+    print(
+        f"{'Batch Verify (batch=4)':<30} {batch_time:<12.1f} {batch_tps:<10.2f} {batch_tps / seq_tps:<10.2f}x"
+    )
+    print(
+        f"{'Batch Verify (batch=8)':<30} {batch8_time:<12.1f} {batch8_tps:<10.2f} {batch8_tps / seq_tps:<10.2f}x"
+    )
 
     print("\nNote: 'Batch Verify' measures verification phase only.")
     print("Real speculative decoding would add draft model overhead.")
 
@@ -3,15 +3,17 @@
 
 import gc
 import time
+
 import numpy as np
 
 model_path = "C:/Users/y_har/.cache/huggingface/hub/models--Aratako--Qwen3-8B-ERP-v0.1/snapshots/8311aa4482f02c2de93872e4979887def1841faf/model.safetensors.index.json"
 
+from pygpukit._pygpukit_native import CudaGraph
+
+from pygpukit.core import default_stream, from_numpy
 from pygpukit.llm import detect_model_spec, load_model_from_safetensors, load_safetensors
 from pygpukit.llm.model import DecodeBuffers, precompute_freqs_cis
-from pygpukit.core import default_stream, from_numpy
-from pygpukit.ops.basic import kv_cache_prefill_gqa, rmsnorm, copy_to, add_inplace, embedding_lookup
-from pygpukit._pygpukit_native import CudaGraph
+from pygpukit.ops.basic import add_inplace, copy_to, embedding_lookup, kv_cache_prefill_gqa, rmsnorm
 
 MAX_SEQ_LEN = 512
 
@@ -53,14 +55,19 @@
 position = 5
 context_len = 6
 
+
 # Define inline decode step
 def _inline_decode_step():
     embedding_lookup(model.embed_tokens, buffers.hidden, token_id)
     for block in model.blocks:
         rmsnorm(buffers.hidden, block.attn_norm.weight, block.attn_norm.eps, out=buffers.norm_out)
         copy_to(buffers.hidden, buffers.residual)
         model._attention_forward_zero_alloc(
-            block.attn, buffers.norm_out, position, context_len, buffers,
+            block.attn,
+            buffers.norm_out,
+            position,
+            context_len,
+            buffers,
             use_position_ptr=False,
         )
         add_inplace(buffers.hidden, buffers.residual)
@@ -71,6 +78,7 @@ def _inline_decode_step():
     rmsnorm(buffers.hidden, model.final_norm.weight, model.final_norm.eps, out=buffers.norm_out)
     copy_to(buffers.norm_out, buffers.hidden)
 
+
 # ============================================================
 # Test 1: Direct kernel launches (no graph)
 # ============================================================
@@ -90,7 +98,7 @@ def _inline_decode_step():
     default_stream().synchronize()
     elapsed = (time.perf_counter() - start) * 1000
     times_direct.append(elapsed)
-    print(f"  {i+1}: {elapsed:.2f} ms")
+    print(f"  {i + 1}: {elapsed:.2f} ms")
 
 mean_direct = np.mean(times_direct)
 print(f"  Mean: {mean_direct:.2f} ms")
@@ -126,7 +134,7 @@ def _inline_decode_step():
     graph.synchronize()
     elapsed = (time.perf_counter() - start) * 1000
     times_graph.append(elapsed)
-    print(f"  {i+1}: {elapsed:.2f} ms")
+    print(f"  {i + 1}: {elapsed:.2f} ms")
 
 mean_graph = np.mean(times_graph)
 print(f"  Mean: {mean_graph:.2f} ms")
@@ -139,6 +147,6 @@ def _inline_decode_step():
 print("=" * 60)
 print(f"Direct launches: {mean_direct:.2f} ms")
 print(f"Graph replay:    {mean_graph:.2f} ms")
-print(f"Speedup:         {mean_direct/mean_graph:.2f}x")
+print(f"Speedup:         {mean_direct / mean_graph:.2f}x")
 print(f"Saved per step:  {mean_direct - mean_graph:.2f} ms")
 print("=" * 60)
@@ -52,8 +52,14 @@ def generate_sequential_greedy(model, first_token, prefill_len, kv_backup, num_t
 
 
 def generate_jacobi_original(
-    model, first_token, prefill_len, kv_backup, num_tokens,
-    n_tokens=8, max_iter=3, init_strategy="repeat"
+    model,
+    first_token,
+    prefill_len,
+    kv_backup,
+    num_tokens,
+    n_tokens=8,
+    max_iter=3,
+    init_strategy="repeat",
 ):
     """Generate tokens using Jacobi decoding (original, with CPU copies)."""
     model.restore_kv_cache(kv_backup)
@@ -74,7 +80,9 @@ def generate_jacobi_original(
             break
 
         accepted, new_pos, stats = model.decode_step_jacobi(
-            tokens[-1], position, context_len,
+            tokens[-1],
+            position,
+            context_len,
             n_tokens=current_n,
             max_iter=max_iter,
             init_strategy=init_strategy,
@@ -95,8 +103,7 @@ def generate_jacobi_original(
 
 
 def generate_jacobi_lookahead(
-    model, first_token, prefill_len, num_tokens,
-    n_tokens=8, max_iter=3, init_strategy="repeat"
+    model, first_token, prefill_len, num_tokens, n_tokens=8, max_iter=3, init_strategy="repeat"
 ):
     """Generate tokens using Jacobi decoding with lookahead KV (GPU-side)."""
     # Set confirmed position after prefill
@@ -195,9 +202,7 @@ def main():
     print(f"\n--- Sequential Baseline ({GEN_TOKENS} tokens) ---")
 
     start_event.record()
-    seq_tokens = generate_sequential_greedy(
-        model, first_token, prefill_len, kv_backup, GEN_TOKENS
-    )
+    seq_tokens = generate_sequential_greedy(model, first_token, prefill_len, kv_backup, GEN_TOKENS)
     stop_event.record()
     stop_event.synchronize()
 
@@ -215,8 +220,14 @@ def main():
 
     start_event.record()
     jacobi_orig_tokens, avg_iter_o, conv_rate_o = generate_jacobi_original(
-        model, first_token, prefill_len, kv_backup, GEN_TOKENS,
-        n_tokens=8, max_iter=3, init_strategy="repeat"
+        model,
+        first_token,
+        prefill_len,
+        kv_backup,
+        GEN_TOKENS,
+        n_tokens=8,
+        max_iter=3,
+        init_strategy="repeat",
     )
     stop_event.record()
     stop_event.synchronize()
@@ -239,8 +250,7 @@ def main():
 
     start_event.record()
     jacobi_look_tokens, avg_iter_l, conv_rate_l = generate_jacobi_lookahead(
-        model, first_token, prefill_len, GEN_TOKENS,
-        n_tokens=8, max_iter=3, init_strategy="repeat"
+        model, first_token, prefill_len, GEN_TOKENS, n_tokens=8, max_iter=3, init_strategy="repeat"
     )
     stop_event.record()
     stop_event.synchronize()
@@ -263,8 +273,7 @@ def main():
 
     start_event.record()
     jacobi_greedy_tokens, avg_iter_g, conv_rate_g = generate_jacobi_lookahead(
-        model, first_token, prefill_len, GEN_TOKENS,
-        n_tokens=8, max_iter=3, init_strategy="greedy"
+        model, first_token, prefill_len, GEN_TOKENS, n_tokens=8, max_iter=3, init_strategy="greedy"
     )
     stop_event.record()
     stop_event.synchronize()
@@ -291,9 +300,15 @@ def main():
     print(f"\n{'Method':<35} {'Time (ms)':<12} {'tok/s':<10} {'Speedup':<10} {'Match'}")
     print("-" * 77)
     print(f"{'Sequential (baseline)':<35} {seq_time:<12.1f} {seq_tps:<10.2f} {'1.00x':<10} {'N/A'}")
-    print(f"{'Jacobi Original (CPU copies)':<35} {jacobi_orig_time:<12.1f} {jacobi_orig_tps:<10.2f} {speedup_orig:.2f}x{'':<5} {'YES' if match_orig else 'NO'}")
-    print(f"{'Jacobi Lookahead (GPU-side)':<35} {jacobi_look_time:<12.1f} {jacobi_look_tps:<10.2f} {speedup_look:.2f}x{'':<5} {'YES' if match_look else 'NO'}")
-    print(f"{'Jacobi Lookahead (greedy init)':<35} {jacobi_greedy_time:<12.1f} {jacobi_greedy_tps:<10.2f} {(seq_time / jacobi_greedy_time):.2f}x{'':<5} {'YES' if match_greedy else 'NO'}")
+    print(
+        f"{'Jacobi Original (CPU copies)':<35} {jacobi_orig_time:<12.1f} {jacobi_orig_tps:<10.2f} {speedup_orig:.2f}x{'':<5} {'YES' if match_orig else 'NO'}"
+    )
+    print(
+        f"{'Jacobi Lookahead (GPU-side)':<35} {jacobi_look_time:<12.1f} {jacobi_look_tps:<10.2f} {speedup_look:.2f}x{'':<5} {'YES' if match_look else 'NO'}"
+    )
+    print(
+        f"{'Jacobi Lookahead (greedy init)':<35} {jacobi_greedy_time:<12.1f} {jacobi_greedy_tps:<10.2f} {(seq_time / jacobi_greedy_time):.2f}x{'':<5} {'YES' if match_greedy else 'NO'}"
+    )
 
     print(f"\nLookahead vs Original speedup: {speedup_look_vs_orig:.2f}x")