Skip to content

Commit 45aa16f

Browse files
authored
Merge pull request #99 from m96-chan/feature/v0.2.12
feat(audio): add advanced audio processing kernels (v0.2.12)
2 parents 6d5f5dc + bd5b49c commit 45aa16f

35 files changed

Lines changed: 8375 additions & 221 deletions

README.md

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,58 @@ PyGPUkit aims to be the "micro-runtime for GPU computing": small, fast, and idea
3333
3434
---
3535

36+
## What's New in v0.2.12
37+
38+
### GPU Audio Processing (Driver-Only)
39+
Comprehensive audio processing operations with custom Radix-2 FFT - no cuFFT dependency.
40+
41+
| Category | Operations |
42+
|----------|------------|
43+
| **Time-Frequency** | `stft`, `istft`, `griffin_lim` |
44+
| **Spectral Features** | `spectral_centroid`, `spectral_bandwidth`, `spectral_rolloff`, `spectral_flatness`, `spectral_contrast` |
45+
| **Pitch Detection** | `detect_pitch_yin`, `detect_pitch_yin_frames`, `autocorrelation` |
46+
| **Music Analysis** | `cqt`, `chroma_stft`, `chroma_cqt`, `zero_crossing_rate` |
47+
| **Source Separation** | `hpss`, `harmonic`, `percussive` |
48+
| **Time/Pitch** | `time_stretch`, `pitch_shift` |
49+
50+
```python
51+
from pygpukit.ops import audio
52+
import numpy as np
53+
54+
# Create a test signal (random noise) and wrap it in an audio buffer
55+
samples = np.random.randn(16000).astype(np.float32) # 1 sec @ 16kHz
56+
buf = audio.from_pcm(samples, sample_rate=16000)
57+
58+
# STFT -> Magnitude -> Griffin-Lim reconstruction (phase is estimated from the magnitude)
59+
stft_out = audio.stft(buf, n_fft=512, hop_length=160)
60+
mag = audio.magnitude_spectrum(stft_out)
61+
reconstructed = audio.griffin_lim(mag, n_iter=32)
62+
63+
# Spectral features
64+
centroid = audio.spectral_centroid(mag, sample_rate=16000)
65+
flatness = audio.spectral_flatness(mag)
66+
67+
# HPSS (Harmonic-Percussive Source Separation)
68+
harmonic, percussive = audio.hpss(mag, kernel_size=17)
69+
70+
# Time stretch (slow down to half speed)
71+
slow = audio.time_stretch(buf, rate=0.5)
72+
73+
# Pitch shift (+12 semitones = 1 octave up)
74+
higher = audio.pitch_shift(buf, sample_rate=16000, n_steps=12)
75+
```
76+
77+
### Previous Audio Features (v0.2.11)
78+
| Feature | Description |
79+
|---------|-------------|
80+
| **STFT** | Custom Radix-2 FFT (no cuFFT) |
81+
| **Mel Filterbank** | Whisper-compatible preprocessing |
82+
| **MFCC** | DCT-II based extraction |
83+
| **VAD** | Voice Activity Detection |
84+
| **Streaming** | Ring buffer, windowing |
85+
86+
---
87+
3688
## What's New in v0.2.11
3789

3890
### Batch Decode Support
@@ -624,6 +676,7 @@ PyGPUkit/
624676
| **v0.2.9** | **Unified LLM interface** (CausalTransformerModel), ModelSpec abstraction, GPT-2/LLaMA/Qwen3 support |
625677
| **v0.2.10** | **Dynamic cuBLASLt loading**, CUDA Graph optimizations, descriptor caching |
626678
| **v0.2.11** | **Batch decode** (6.8x speedup), Decode Strategy framework, Driver API async, Dual CUDA builds, RTX 5090 (SM120) |
679+
| **v0.2.12** | **Advanced audio processing** (ISTFT, Griffin-Lim, HPSS, CQT, pitch detection, time stretch) |
627680

628681
### Planned
629682

bench_all_strategies.py

Lines changed: 20 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -162,8 +162,11 @@ def main():
162162

163163
# Allocate batch buffers
164164
batch_buffers = DecodeBuffers.allocate(
165-
model.config, dtype=dtype, use_qk_norm=use_qk_norm, vocab_size=vocab_size,
166-
max_batch_size=batch_size
165+
model.config,
166+
dtype=dtype,
167+
use_qk_norm=use_qk_norm,
168+
vocab_size=vocab_size,
169+
max_batch_size=batch_size,
167170
)
168171

169172
init_kv_caches(model, MAX_SEQ_LEN, dtype)
@@ -269,11 +272,14 @@ def main():
269272
tps_spec = total_tokens / t_spec
270273
accept_rate = total_accepted / total_drafted if total_drafted > 0 else 0
271274
results["DecodeSpeculative"] = {
272-
"time": t_spec, "tps": tps_spec, "tokens": total_tokens,
273-
"accept_rate": accept_rate, "iterations": iterations
275+
"time": t_spec,
276+
"tps": tps_spec,
277+
"tokens": total_tokens,
278+
"accept_rate": accept_rate,
279+
"iterations": iterations,
274280
}
275281
print(f" Tokens generated: {total_tokens}")
276-
print(f" Iterations: {iterations} (avg {total_tokens/iterations:.1f} tok/iter)")
282+
print(f" Iterations: {iterations} (avg {total_tokens / iterations:.1f} tok/iter)")
277283
print(f" Accept rate: {accept_rate:.1%}")
278284
print(f" Time: {t_spec:.3f}s")
279285
print(f" Throughput: {tps_spec:.1f} tok/s")
@@ -338,11 +344,14 @@ def main():
338344
tps_jacobi = total_tokens / t_jacobi
339345
converge_rate = total_converged / iterations if iterations > 0 else 0
340346
results["DecodeJacobi"] = {
341-
"time": t_jacobi, "tps": tps_jacobi, "tokens": total_tokens,
342-
"converge_rate": converge_rate, "iterations": iterations
347+
"time": t_jacobi,
348+
"tps": tps_jacobi,
349+
"tokens": total_tokens,
350+
"converge_rate": converge_rate,
351+
"iterations": iterations,
343352
}
344353
print(f" Tokens generated: {total_tokens}")
345-
print(f" Iterations: {iterations} (avg {total_tokens/iterations:.1f} tok/iter)")
354+
print(f" Iterations: {iterations} (avg {total_tokens / iterations:.1f} tok/iter)")
346355
print(f" Convergence rate: {converge_rate:.1%}")
347356
print(f" Time: {t_jacobi:.3f}s")
348357
print(f" Throughput: {tps_jacobi:.1f} tok/s")
@@ -366,7 +375,9 @@ def main():
366375
print(f"{name:<25} {'SKIPPED':<10}")
367376
else:
368377
speedup = data["tps"] / baseline_tps
369-
print(f"{name:<25} {data['tokens']:<10} {data['time']:<12.3f} {data['tps']:<10.1f} {speedup:<10.2f}x")
378+
print(
379+
f"{name:<25} {data['tokens']:<10} {data['time']:<12.3f} {data['tps']:<10.1f} {speedup:<10.2f}x"
380+
)
370381

371382
print()
372383
print("Notes:")

bench_batch_decode.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,14 @@
22
"""Benchmark batch decode vs sequential decode performance."""
33

44
import numpy as np
5-
import time
65

76
model_path = "C:/Users/y_har/.cache/huggingface/hub/models--Aratako--Qwen3-8B-ERP-v0.1/snapshots/8311aa4482f02c2de93872e4979887def1841faf/model.safetensors.index.json"
87
tokenizer_path = "C:/Users/y_har/.cache/huggingface/hub/models--Aratako--Qwen3-8B-ERP-v0.1/snapshots/8311aa4482f02c2de93872e4979887def1841faf/tokenizer.json"
98

109
from tokenizers import Tokenizer
10+
11+
from pygpukit import CudaEvent, event_elapsed_us
12+
from pygpukit.core import default_stream, from_numpy
1113
from pygpukit.llm import (
1214
ChatMessage,
1315
detect_model_spec,
@@ -16,9 +18,7 @@
1618
load_safetensors,
1719
)
1820
from pygpukit.llm.model import precompute_freqs_cis, sample_token
19-
from pygpukit.core import default_stream, from_numpy
2021
from pygpukit.ops.basic import kv_cache_prefill_gqa
21-
from pygpukit import CudaEvent, event_elapsed_us
2222

2323
MAX_SEQ_LEN = 512
2424
NUM_ITERATIONS = 10

bench_e2e_batch.py

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,14 @@
22
"""End-to-end benchmark: Sequential vs Batch decode for text generation."""
33

44
import numpy as np
5-
import time
65

76
model_path = "C:/Users/y_har/.cache/huggingface/hub/models--Aratako--Qwen3-8B-ERP-v0.1/snapshots/8311aa4482f02c2de93872e4979887def1841faf/model.safetensors.index.json"
87
tokenizer_path = "C:/Users/y_har/.cache/huggingface/hub/models--Aratako--Qwen3-8B-ERP-v0.1/snapshots/8311aa4482f02c2de93872e4979887def1841faf/tokenizer.json"
98

109
from tokenizers import Tokenizer
10+
11+
from pygpukit import CudaEvent, event_elapsed_ms
12+
from pygpukit.core import default_stream, from_numpy
1113
from pygpukit.llm import (
1214
ChatMessage,
1315
detect_model_spec,
@@ -16,9 +18,7 @@
1618
load_safetensors,
1719
)
1820
from pygpukit.llm.model import precompute_freqs_cis, sample_token
19-
from pygpukit.core import default_stream, from_numpy
2021
from pygpukit.ops.basic import kv_cache_prefill_gqa
21-
from pygpukit import CudaEvent, event_elapsed_ms
2222

2323
MAX_SEQ_LEN = 512
2424
GEN_TOKENS = 32 # Number of tokens to generate
@@ -177,13 +177,13 @@ def generate_batch_parallel(model, tokenizer, first_token, prefill_len, kv_backu
177177
remaining = len(draft_tokens) - idx
178178
current_batch = min(batch_size, remaining)
179179

180-
batch_tokens = draft_tokens[idx:idx + current_batch]
180+
batch_tokens = draft_tokens[idx : idx + current_batch]
181181

182182
# Batch verify
183183
hidden = model._decode_step_fixed_cache_batch(
184184
batch_tokens,
185185
position,
186-
context_len + current_batch # Context includes new tokens
186+
context_len + current_batch, # Context includes new tokens
187187
)
188188

189189
# Get logits for verification (would compare with draft in real speculative)
@@ -305,8 +305,12 @@ def main():
305305
print(f"\n{'Method':<30} {'Time (ms)':<12} {'tok/s':<10} {'Speedup':<10}")
306306
print("-" * 62)
307307
print(f"{'Sequential':<30} {seq_time:<12.1f} {seq_tps:<10.2f} {'1.00x':<10}")
308-
print(f"{'Batch Verify (batch=4)':<30} {batch_time:<12.1f} {batch_tps:<10.2f} {batch_tps/seq_tps:<10.2f}x")
309-
print(f"{'Batch Verify (batch=8)':<30} {batch8_time:<12.1f} {batch8_tps:<10.2f} {batch8_tps/seq_tps:<10.2f}x")
308+
print(
309+
f"{'Batch Verify (batch=4)':<30} {batch_time:<12.1f} {batch_tps:<10.2f} {batch_tps / seq_tps:<10.2f}x"
310+
)
311+
print(
312+
f"{'Batch Verify (batch=8)':<30} {batch8_time:<12.1f} {batch8_tps:<10.2f} {batch8_tps / seq_tps:<10.2f}x"
313+
)
310314

311315
print("\nNote: 'Batch Verify' measures verification phase only.")
312316
print("Real speculative decoding would add draft model overhead.")

bench_graph_replay_only.py

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3,15 +3,17 @@
33

44
import gc
55
import time
6+
67
import numpy as np
78

89
model_path = "C:/Users/y_har/.cache/huggingface/hub/models--Aratako--Qwen3-8B-ERP-v0.1/snapshots/8311aa4482f02c2de93872e4979887def1841faf/model.safetensors.index.json"
910

11+
from pygpukit._pygpukit_native import CudaGraph
12+
13+
from pygpukit.core import default_stream, from_numpy
1014
from pygpukit.llm import detect_model_spec, load_model_from_safetensors, load_safetensors
1115
from pygpukit.llm.model import DecodeBuffers, precompute_freqs_cis
12-
from pygpukit.core import default_stream, from_numpy
13-
from pygpukit.ops.basic import kv_cache_prefill_gqa, rmsnorm, copy_to, add_inplace, embedding_lookup
14-
from pygpukit._pygpukit_native import CudaGraph
16+
from pygpukit.ops.basic import add_inplace, copy_to, embedding_lookup, kv_cache_prefill_gqa, rmsnorm
1517

1618
MAX_SEQ_LEN = 512
1719

@@ -53,14 +55,19 @@
5355
position = 5
5456
context_len = 6
5557

58+
5659
# Define inline decode step
5760
def _inline_decode_step():
5861
embedding_lookup(model.embed_tokens, buffers.hidden, token_id)
5962
for block in model.blocks:
6063
rmsnorm(buffers.hidden, block.attn_norm.weight, block.attn_norm.eps, out=buffers.norm_out)
6164
copy_to(buffers.hidden, buffers.residual)
6265
model._attention_forward_zero_alloc(
63-
block.attn, buffers.norm_out, position, context_len, buffers,
66+
block.attn,
67+
buffers.norm_out,
68+
position,
69+
context_len,
70+
buffers,
6471
use_position_ptr=False,
6572
)
6673
add_inplace(buffers.hidden, buffers.residual)
@@ -71,6 +78,7 @@ def _inline_decode_step():
7178
rmsnorm(buffers.hidden, model.final_norm.weight, model.final_norm.eps, out=buffers.norm_out)
7279
copy_to(buffers.norm_out, buffers.hidden)
7380

81+
7482
# ============================================================
7583
# Test 1: Direct kernel launches (no graph)
7684
# ============================================================
@@ -90,7 +98,7 @@ def _inline_decode_step():
9098
default_stream().synchronize()
9199
elapsed = (time.perf_counter() - start) * 1000
92100
times_direct.append(elapsed)
93-
print(f" {i+1}: {elapsed:.2f} ms")
101+
print(f" {i + 1}: {elapsed:.2f} ms")
94102

95103
mean_direct = np.mean(times_direct)
96104
print(f" Mean: {mean_direct:.2f} ms")
@@ -126,7 +134,7 @@ def _inline_decode_step():
126134
graph.synchronize()
127135
elapsed = (time.perf_counter() - start) * 1000
128136
times_graph.append(elapsed)
129-
print(f" {i+1}: {elapsed:.2f} ms")
137+
print(f" {i + 1}: {elapsed:.2f} ms")
130138

131139
mean_graph = np.mean(times_graph)
132140
print(f" Mean: {mean_graph:.2f} ms")
@@ -139,6 +147,6 @@ def _inline_decode_step():
139147
print("=" * 60)
140148
print(f"Direct launches: {mean_direct:.2f} ms")
141149
print(f"Graph replay: {mean_graph:.2f} ms")
142-
print(f"Speedup: {mean_direct/mean_graph:.2f}x")
150+
print(f"Speedup: {mean_direct / mean_graph:.2f}x")
143151
print(f"Saved per step: {mean_direct - mean_graph:.2f} ms")
144152
print("=" * 60)

bench_jacobi_lookahead.py

Lines changed: 32 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -52,8 +52,14 @@ def generate_sequential_greedy(model, first_token, prefill_len, kv_backup, num_t
5252

5353

5454
def generate_jacobi_original(
55-
model, first_token, prefill_len, kv_backup, num_tokens,
56-
n_tokens=8, max_iter=3, init_strategy="repeat"
55+
model,
56+
first_token,
57+
prefill_len,
58+
kv_backup,
59+
num_tokens,
60+
n_tokens=8,
61+
max_iter=3,
62+
init_strategy="repeat",
5763
):
5864
"""Generate tokens using Jacobi decoding (original, with CPU copies)."""
5965
model.restore_kv_cache(kv_backup)
@@ -74,7 +80,9 @@ def generate_jacobi_original(
7480
break
7581

7682
accepted, new_pos, stats = model.decode_step_jacobi(
77-
tokens[-1], position, context_len,
83+
tokens[-1],
84+
position,
85+
context_len,
7886
n_tokens=current_n,
7987
max_iter=max_iter,
8088
init_strategy=init_strategy,
@@ -95,8 +103,7 @@ def generate_jacobi_original(
95103

96104

97105
def generate_jacobi_lookahead(
98-
model, first_token, prefill_len, num_tokens,
99-
n_tokens=8, max_iter=3, init_strategy="repeat"
106+
model, first_token, prefill_len, num_tokens, n_tokens=8, max_iter=3, init_strategy="repeat"
100107
):
101108
"""Generate tokens using Jacobi decoding with lookahead KV (GPU-side)."""
102109
# Set confirmed position after prefill
@@ -195,9 +202,7 @@ def main():
195202
print(f"\n--- Sequential Baseline ({GEN_TOKENS} tokens) ---")
196203

197204
start_event.record()
198-
seq_tokens = generate_sequential_greedy(
199-
model, first_token, prefill_len, kv_backup, GEN_TOKENS
200-
)
205+
seq_tokens = generate_sequential_greedy(model, first_token, prefill_len, kv_backup, GEN_TOKENS)
201206
stop_event.record()
202207
stop_event.synchronize()
203208

@@ -215,8 +220,14 @@ def main():
215220

216221
start_event.record()
217222
jacobi_orig_tokens, avg_iter_o, conv_rate_o = generate_jacobi_original(
218-
model, first_token, prefill_len, kv_backup, GEN_TOKENS,
219-
n_tokens=8, max_iter=3, init_strategy="repeat"
223+
model,
224+
first_token,
225+
prefill_len,
226+
kv_backup,
227+
GEN_TOKENS,
228+
n_tokens=8,
229+
max_iter=3,
230+
init_strategy="repeat",
220231
)
221232
stop_event.record()
222233
stop_event.synchronize()
@@ -239,8 +250,7 @@ def main():
239250

240251
start_event.record()
241252
jacobi_look_tokens, avg_iter_l, conv_rate_l = generate_jacobi_lookahead(
242-
model, first_token, prefill_len, GEN_TOKENS,
243-
n_tokens=8, max_iter=3, init_strategy="repeat"
253+
model, first_token, prefill_len, GEN_TOKENS, n_tokens=8, max_iter=3, init_strategy="repeat"
244254
)
245255
stop_event.record()
246256
stop_event.synchronize()
@@ -263,8 +273,7 @@ def main():
263273

264274
start_event.record()
265275
jacobi_greedy_tokens, avg_iter_g, conv_rate_g = generate_jacobi_lookahead(
266-
model, first_token, prefill_len, GEN_TOKENS,
267-
n_tokens=8, max_iter=3, init_strategy="greedy"
276+
model, first_token, prefill_len, GEN_TOKENS, n_tokens=8, max_iter=3, init_strategy="greedy"
268277
)
269278
stop_event.record()
270279
stop_event.synchronize()
@@ -291,9 +300,15 @@ def main():
291300
print(f"\n{'Method':<35} {'Time (ms)':<12} {'tok/s':<10} {'Speedup':<10} {'Match'}")
292301
print("-" * 77)
293302
print(f"{'Sequential (baseline)':<35} {seq_time:<12.1f} {seq_tps:<10.2f} {'1.00x':<10} {'N/A'}")
294-
print(f"{'Jacobi Original (CPU copies)':<35} {jacobi_orig_time:<12.1f} {jacobi_orig_tps:<10.2f} {speedup_orig:.2f}x{'':<5} {'YES' if match_orig else 'NO'}")
295-
print(f"{'Jacobi Lookahead (GPU-side)':<35} {jacobi_look_time:<12.1f} {jacobi_look_tps:<10.2f} {speedup_look:.2f}x{'':<5} {'YES' if match_look else 'NO'}")
296-
print(f"{'Jacobi Lookahead (greedy init)':<35} {jacobi_greedy_time:<12.1f} {jacobi_greedy_tps:<10.2f} {(seq_time / jacobi_greedy_time):.2f}x{'':<5} {'YES' if match_greedy else 'NO'}")
303+
print(
304+
f"{'Jacobi Original (CPU copies)':<35} {jacobi_orig_time:<12.1f} {jacobi_orig_tps:<10.2f} {speedup_orig:.2f}x{'':<5} {'YES' if match_orig else 'NO'}"
305+
)
306+
print(
307+
f"{'Jacobi Lookahead (GPU-side)':<35} {jacobi_look_time:<12.1f} {jacobi_look_tps:<10.2f} {speedup_look:.2f}x{'':<5} {'YES' if match_look else 'NO'}"
308+
)
309+
print(
310+
f"{'Jacobi Lookahead (greedy init)':<35} {jacobi_greedy_time:<12.1f} {jacobi_greedy_tps:<10.2f} {(seq_time / jacobi_greedy_time):.2f}x{'':<5} {'YES' if match_greedy else 'NO'}"
311+
)
297312

298313
print(f"\nLookahead vs Original speedup: {speedup_look_vs_orig:.2f}x")
299314

0 commit comments

Comments
 (0)