-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcomparison.html
More file actions
596 lines (548 loc) · 35.2 KB
/
comparison.html
File metadata and controls
596 lines (548 loc) · 35.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Silk vs The World — Silk Docs</title>
<link rel="stylesheet" href="css/shared.css">
<style>
.chapter-num{font-family:'Fragment Mono',monospace;font-size:0.7rem;letter-spacing:0.15em;color:var(--text-muted);margin-bottom:0.5rem;display:block;}
.visual-break{height:2px;background:linear-gradient(90deg,var(--accent),transparent);margin:3rem 0;}
/* Hero comparison */
.comp-hero{background:linear-gradient(135deg,#1a1a2e,#16213e);border-radius:var(--radius-lg);padding:2.5rem 2rem;margin:1.5rem 0;text-align:center;}
.comp-hero h2{color:#fff;font-family:'DM Serif Display',Georgia,serif;font-size:2rem;font-style:italic;margin:0 0 0.5rem;}
.comp-hero p{color:rgba(255,255,255,0.6);font-size:0.95rem;max-width:600px;margin:0 auto;}
.comp-hero::before{content:'rmk-silk-005 · classified briefing';display:block;font-family:'Fragment Mono',monospace;font-size:0.5rem;color:rgba(255,255,255,0.2);letter-spacing:0.15em;margin-bottom:1rem;}
/* Approach cards */
.approach-grid{display:grid;grid-template-columns:1fr 1fr;gap:1.5rem;margin:1.5rem 0;}
.approach-card{border-radius:var(--radius-lg);padding:2rem;position:relative;overflow:hidden;}
.approach-card.others{background:#fcfaf6;border:2px solid #e3e3e3;}
.approach-card.silk{background:#f0faf3;border:2px solid #2d8a4e;}
.approach-card h3{margin:0 0 1rem;font-size:1.1rem;}
.approach-card .tag{display:inline-block;font-family:'Fragment Mono',monospace;font-size:0.65rem;padding:3px 8px;border-radius:3px;letter-spacing:0.05em;margin-bottom:0.75rem;}
.approach-card.others .tag{background:rgba(26,26,26,0.08);color:#1a1a1a;}
.approach-card.silk .tag{background:rgba(45,138,78,0.1);color:#2d8a4e;}
/* Step flow in approach cards */
.step-flow{display:flex;flex-direction:column;gap:0.5rem;margin:1rem 0;}
.step{display:flex;align-items:center;gap:0.75rem;padding:0.6rem 0.75rem;border-radius:8px;font-size:0.85rem;}
.approach-card.others .step{background:rgba(0,0,0,0.03);}
.approach-card.silk .step{background:rgba(45,138,78,0.06);}
.step-num{width:24px;height:24px;border-radius:50%;display:flex;align-items:center;justify-content:center;font-family:'Fragment Mono',monospace;font-size:0.7rem;font-weight:700;flex-shrink:0;}
.approach-card.others .step-num{background:rgba(26,26,26,0.1);color:#555;}
.approach-card.silk .step-num{background:rgba(45,138,78,0.15);color:#2d8a4e;}
/* Battle cards */
.battle-card{background:var(--bg-card);border:1px solid var(--border);border-radius:var(--radius-lg);padding:2rem;margin:1.5rem 0;transition:all var(--transition);}
.battle-card:hover{border-color:var(--accent);box-shadow:var(--shadow-md);}
.battle-header{display:flex;align-items:center;justify-content:space-between;margin-bottom:1.5rem;}
.battle-header h3{margin:0;font-size:1.1rem;}
.battle-badge{font-family:'Fragment Mono',monospace;font-size:0.65rem;padding:4px 10px;border-radius:4px;letter-spacing:0.05em;}
.badge-silk{background:rgba(45,138,78,0.12);color:#2d8a4e;}
.badge-tie{background:rgba(26,26,26,0.08);color:#555;}
/* Comparison bars */
.comp-bars{display:flex;flex-direction:column;gap:1rem;}
.comp-bar-row{display:grid;grid-template-columns:120px 1fr 60px;align-items:center;gap:0.75rem;}
.comp-bar-label{font-family:'Fragment Mono',monospace;font-size:0.75rem;color:var(--text-muted);text-align:right;}
.comp-bar-track{height:28px;background:var(--bg-alt);border-radius:6px;overflow:hidden;position:relative;}
.comp-bar-fill{height:100%;border-radius:6px;display:flex;align-items:center;padding:0 10px;font-family:'Fragment Mono',monospace;font-size:0.7rem;color:white;font-weight:600;transition:width 1.5s cubic-bezier(0.22,1,0.36,1);}
.comp-bar-val{font-family:'Fragment Mono',monospace;font-size:0.8rem;font-weight:700;min-width:50px;}
/* Analogy section */
.analogy-cards{display:grid;grid-template-columns:repeat(3,1fr);gap:1rem;margin:1.5rem 0;}
.analogy-item{background:var(--bg-card);border:1px solid var(--border);border-radius:var(--radius-md);padding:1.5rem;text-align:center;transition:all var(--transition);}
.analogy-item:hover{border-color:var(--accent);transform:translateY(-2px);box-shadow:var(--shadow-sm);}
.analogy-icon{font-size:2.5rem;margin-bottom:0.75rem;}
.analogy-item h4{margin:0 0 0.5rem;font-size:0.95rem;}
.analogy-item p{font-size:0.8rem;color:var(--text-muted);line-height:1.5;margin:0;}
/* Company comparison table */
.company-grid{display:grid;grid-template-columns:repeat(auto-fit,minmax(220px,1fr));gap:1rem;margin:1.5rem 0;}
.company-card{background:var(--bg-card);border:1px solid var(--border);border-radius:var(--radius-md);padding:1.25rem;transition:all var(--transition);}
.company-card:hover{border-color:var(--accent);box-shadow:var(--shadow-sm);}
.company-card.is-silk{border-color:#2d8a4e;background:#f0faf3;}
.company-name{font-family:'Fragment Mono',monospace;font-size:0.85rem;font-weight:700;margin-bottom:0.75rem;}
.company-card.is-silk .company-name{color:#2d8a4e;}
.company-detail{display:flex;justify-content:space-between;padding:0.35rem 0;border-bottom:1px solid var(--border-light);font-size:0.8rem;}
.company-detail:last-child{border-bottom:none;}
.company-detail .label{color:var(--text-muted);}
.company-detail .val{font-family:'Fragment Mono',monospace;font-weight:600;}
/* Race animation */
.race-track{background:var(--code-bg);border-radius:var(--radius-lg);padding:1.5rem;margin:1.5rem 0;overflow:hidden;}
.race-lane{display:flex;align-items:center;gap:0.75rem;margin:0.5rem 0;height:36px;}
.race-name{min-width:90px;font-family:'Fragment Mono',monospace;font-size:0.75rem;color:rgba(255,255,255,0.6);text-align:right;}
.race-bar-bg{flex:1;height:28px;background:rgba(255,255,255,0.06);border-radius:6px;overflow:hidden;position:relative;}
.race-bar{height:100%;border-radius:6px;transition:width 2.5s cubic-bezier(0.22,1,0.36,1);display:flex;align-items:center;justify-content:flex-end;padding-right:8px;}
.race-bar span{font-family:'Fragment Mono',monospace;font-size:0.7rem;color:white;font-weight:600;white-space:nowrap;}
.race-bar.silk-bar{background:linear-gradient(90deg,#1a7a3d,#2d8a4e);}
.race-bar.other-bar{background:linear-gradient(90deg,#555,#777);}
.race-bar.mimi-bar{background:linear-gradient(90deg,#5a5a8b,#7a7ab0);}
.race-time{min-width:60px;font-family:'Fragment Mono',monospace;font-size:0.75rem;color:rgba(255,255,255,0.4);text-align:left;}
/* Bottom line */
.bottom-line{background:linear-gradient(135deg,#1a2e1a,#16213e);border-radius:var(--radius-lg);padding:2.5rem;margin:2rem 0;text-align:center;}
.bottom-line h2{color:#fff;font-family:'DM Serif Display',Georgia,serif;font-size:1.8rem;font-style:italic;margin:0 0 1rem;}
.bottom-line p{color:rgba(255,255,255,0.7);font-size:0.95rem;max-width:600px;margin:0 auto 1.5rem;line-height:1.7;}
.bottom-line-stats{display:flex;justify-content:center;gap:2rem;flex-wrap:wrap;}
.bl-stat{text-align:center;}
.bl-stat .num{font-family:'Fragment Mono',monospace;font-size:2rem;font-weight:700;color:#fff;}
.bl-stat .label{font-size:0.75rem;color:rgba(255,255,255,0.4);margin-top:0.25rem;}
@media(max-width:768px){
.approach-grid{grid-template-columns:1fr;}
.analogy-cards{grid-template-columns:1fr;}
.comp-hero h2{font-size:1.5rem;}
.company-grid{grid-template-columns:1fr;}
.comp-bar-row{grid-template-columns:80px 1fr 50px;}
}
</style>
</head>
<body>
<!--
  Chapter sidebar. The same chapter list is rendered twice: once in the
  compact rail (numbered pips with tooltips) and once in the full panel.
  Both are <nav> landmarks, so each carries a distinct aria-label to let
  assistive technology tell them apart, and the link for this page
  (comparison.html) is marked with aria-current="page".
  NOTE(review): how the rail and panel are shown or hidden is decided by
  css/shared.css and js/shared.js, which are not visible here.
-->
<aside class="sidebar">
  <div class="sidebar-progress"><div class="sidebar-progress-fill"></div></div>
  <div class="sidebar-rail">
    <div class="sidebar-brand">S</div>
    <nav aria-label="Chapters (compact rail)">
      <a href="index.html" data-page="index.html" data-num="01" title="How Voice AI Works"><span class="pip-tooltip">How Voice AI Works</span></a>
      <a href="how-codecs-work.html" data-page="how-codecs-work.html" data-num="02" title="How Codecs Work"><span class="pip-tooltip">How Codecs Work</span></a>
      <a href="silk-codec.html" data-page="silk-codec.html" data-num="03" title="Latent Space Learning"><span class="pip-tooltip">Latent Space Learning</span></a>
      <a href="voice-ai-pipeline.html" data-page="voice-ai-pipeline.html" data-num="04" title="Voice AI Pipeline"><span class="pip-tooltip">Voice AI Pipeline</span></a>
      <a href="comparison.html" data-page="comparison.html" data-num="05" title="Silk vs The World" aria-current="page"><span class="pip-tooltip">Silk vs The World</span></a>
      <a href="geometry.html" data-page="geometry.html" data-num="06" title="Why It Works"><span class="pip-tooltip">Why It Works</span></a>
      <div class="quiz-pip"><a href="quiz.html" data-page="quiz.html" data-num="?" title="Test Yourself"><span class="pip-tooltip">Test Yourself</span></a></div>
    </nav>
  </div>
  <div class="sidebar-panel">
    <div class="sidebar-logo"><h2>Silk</h2><span>internal briefing</span></div>
    <nav aria-label="Chapters">
      <a href="index.html" data-page="index.html"><span class="nav-num">01</span> How Voice AI Works</a>
      <a href="how-codecs-work.html" data-page="how-codecs-work.html"><span class="nav-num">02</span> How Codecs Work</a>
      <a href="silk-codec.html" data-page="silk-codec.html"><span class="nav-num">03</span> Latent Space Learning</a>
      <a href="voice-ai-pipeline.html" data-page="voice-ai-pipeline.html"><span class="nav-num">04</span> Voice AI Pipeline</a>
      <a href="comparison.html" data-page="comparison.html" aria-current="page"><span class="nav-num">05</span> Silk vs The World</a>
      <a href="geometry.html" data-page="geometry.html"><span class="nav-num">06</span> Why It Works</a>
      <div class="quiz-link"><a href="quiz.html" data-page="quiz.html"><span class="nav-num">?</span> Test Yourself</a></div>
    </nav>
  </div>
</aside>
<main class="main-content">
<div class="page-content">
<span class="chapter-num">rmk-silk-005 · comparison</span>
<h1>Silk vs the world</h1>
<p class="subtitle">how Silk compares to ElevenLabs, OpenAI, and other voice AI systems — explained simply.</p>
<!-- ==================== THE SIMPLE EXPLANATION ==================== -->
<div class="comp-hero">
<h2>the short version</h2>
<p>every voice AI company uses the same recipe. Silk changes the one ingredient that matters most.</p>
</div>
<!-- ==================== KITCHEN ANALOGY ==================== -->
<h2>the restaurant analogy</h2>
<p>every voice AI system has the same three steps:</p>
<div class="analogy-cards fade-in">
<div class="analogy-item">
<div class="analogy-icon">📖</div>
<h4>step 1: the recipe</h4>
<p>a large language model (LLM) reads text and decides what sounds to make — like a chef reading a recipe.</p>
</div>
<div class="analogy-item" style="border-color:var(--accent);background:var(--accent-light);">
<div class="analogy-icon">🧂</div>
<h4>step 2: the ingredients</h4>
<p>the <strong>codec</strong> defines what "ingredients" (audio tokens) exist. this is where Silk is different. <strong>this is the part that matters most.</strong></p>
</div>
<div class="analogy-item">
<div class="analogy-icon">🍽️</div>
<h4>step 3: the plate</h4>
<p>the decoder assembles the final audio — like plating the dish.</p>
</div>
</div>
<div class="callout insight fade-in">
<div class="callout-title">what this means</div>
<p>ElevenLabs, OpenAI, and others compete on their LLMs. but they all use codebook-based codecs with the same limitations. Silk built a different codec. the codec sets the quality ceiling for the entire system.</p>
</div>
<div class="visual-break"></div>
<!-- ==================== TWO APPROACHES ==================== -->
<h2>two fundamentally different approaches</h2>
<p>here's what happens to your voice inside each system:</p>
<div class="approach-grid fade-in">
<div class="approach-card others">
<div class="tag">ElevenLabs / OpenAI / others</div>
<h3 style="color:var(--text);">the "snap to grid" approach</h3>
<div class="step-flow">
<div class="step">
<div class="step-num">1</div>
<div>your voice comes in as a smooth, continuous wave</div>
</div>
<div class="step">
<div class="step-num">2</div>
<div>the codec <strong>snaps</strong> every sound to the nearest entry in a fixed codebook — like forcing your voice through a grid</div>
</div>
<div class="step">
<div class="step-num">3</div>
<div>subtle details between grid points? <strong>gone forever</strong></div>
</div>
<div class="step">
<div class="step-num">4</div>
<div>to compensate, they use 600-800 tokens per second of audio</div>
</div>
</div>
<div style="text-align:center;margin-top:1rem;">
<!--
  Illustration for the codebook ("snap to grid") approach.
  role="img" + aria-label give the drawing one accessible name; the caption
  text is folded into the label because children of role="img" are not
  exposed individually.
-->
<svg viewBox="0 0 300 100" style="width:100%;max-width:300px;" role="img" aria-label="Codebook quantization: two different sounds that fall between grid points both snap to the same codebook entry">
  <!-- Axis with four fixed codebook entries (grey) -->
  <line x1="10" y1="50" x2="290" y2="50" stroke="#e3e3e3" stroke-width="2"/>
  <circle cx="50" cy="50" r="8" fill="#d4cfc7" stroke="#bbb" stroke-width="1.5"/>
  <circle cx="120" cy="50" r="8" fill="#d4cfc7" stroke="#bbb" stroke-width="1.5"/>
  <circle cx="190" cy="50" r="8" fill="#d4cfc7" stroke="#bbb" stroke-width="1.5"/>
  <circle cx="260" cy="50" r="8" fill="#d4cfc7" stroke="#bbb" stroke-width="1.5"/>
  <!-- Two input sounds (red) on either side of the entry at x=120, each joined to it by a dashed line -->
  <circle cx="85" cy="50" r="6" fill="#d32f2f" opacity="0.8"/>
  <line x1="85" y1="50" x2="120" y2="50" stroke="#d32f2f" stroke-width="2" stroke-dasharray="4,3"/>
  <circle cx="155" cy="50" r="6" fill="#d32f2f" opacity="0.8"/>
  <line x1="155" y1="50" x2="120" y2="50" stroke="#d32f2f" stroke-width="2" stroke-dasharray="4,3"/>
  <!-- Generic monospace fallback matches the page's CSS font stacks -->
  <text x="150" y="88" text-anchor="middle" fill="#d32f2f" font-size="11" font-weight="600" font-family="'Fragment Mono',monospace">both snap to same point</text>
</svg>
</div>
</div>
<div class="approach-card silk">
<div class="tag">Silk Codec</div>
<h3 style="color:#2d8a4e;">the "learn the space" approach</h3>
<div class="step-flow">
<div class="step">
<div class="step-num">1</div>
<div>your voice comes in as a smooth, continuous wave</div>
</div>
<div class="step">
<div class="step-num">2</div>
<div>the codec maps it into a <strong>learned continuous space</strong> — it places your sound exactly where it belongs</div>
</div>
<div class="step">
<div class="step-num">3</div>
<div>FSQ quantizes efficiently: <strong>128 dimensions x 8 levels each</strong></div>
</div>
<div class="step">
<div class="step-num">4</div>
<div>result: only <strong>47.5 tokens per second</strong> — 12x fewer, but more info kept</div>
</div>
</div>
<div style="text-align:center;margin-top:1rem;">
<!--
  Illustration for Silk's continuous ("learn the space") approach.
  role="img" + aria-label give the drawing one accessible name; the caption
  text is folded into the label because children of role="img" are not
  exposed individually.
-->
<svg viewBox="0 0 300 100" style="width:100%;max-width:300px;" role="img" aria-label="Continuous latent space: each of the two sounds keeps its own distinct position instead of snapping to a shared grid point">
  <!-- Horizontal green gradient band behind the axis, strongest in the middle -->
  <defs><linearGradient id="sg" x1="0" y1="0" x2="1" y2="0"><stop offset="0%" stop-color="#2d8a4e" stop-opacity="0.05"/><stop offset="50%" stop-color="#2d8a4e" stop-opacity="0.3"/><stop offset="100%" stop-color="#2d8a4e" stop-opacity="0.05"/></linearGradient></defs>
  <rect x="10" y="30" width="280" height="40" rx="20" fill="url(#sg)"/>
  <line x1="10" y1="50" x2="290" y2="50" stroke="#2d8a4e" stroke-width="1" opacity="0.3"/>
  <!-- The same two input positions as the codebook diagram (x=85 and x=155), each drawn in place with a halo ring -->
  <circle cx="85" cy="50" r="7" fill="#2d8a4e"/>
  <circle cx="85" cy="50" r="12" fill="none" stroke="#2d8a4e" stroke-width="1.5" opacity="0.4"/>
  <circle cx="155" cy="50" r="7" fill="#2d8a4e"/>
  <circle cx="155" cy="50" r="12" fill="none" stroke="#2d8a4e" stroke-width="1.5" opacity="0.4"/>
  <!-- Generic monospace fallback matches the page's CSS font stacks -->
  <text x="150" y="88" text-anchor="middle" fill="#2d8a4e" font-size="11" font-weight="600" font-family="'Fragment Mono',monospace">each sound lands in its own spot</text>
</svg>
</div>
</div>
</div>
<div class="visual-break"></div>
<!-- ==================== HEAD TO HEAD ==================== -->
<h2>head to head: the numbers</h2>
<p>here are the numbers:</p>
<!-- BATTLE 1: Speed -->
<div class="battle-card fade-in">
<div class="battle-header">
<h3>tokens per second of audio</h3>
<span class="battle-badge badge-silk">Silk wins by 12x</span>
</div>
<p style="font-size:0.85rem;color:var(--text-muted);margin-bottom:1rem;">fewer <span class="term" data-term="tokens">tokens</span> = the LLM generates speech faster. the single biggest bottleneck in voice AI latency.</p>
<div class="comp-bars">
<div class="comp-bar-row">
<div class="comp-bar-label">ElevenLabs<br><span style="font-size:0.65rem;opacity:0.6;">(EnCodec)</span></div>
<div class="comp-bar-track"><div class="comp-bar-fill" style="width:0%;background:#666;" data-width="77%"><span style="opacity:0.9;">600</span></div></div>
<div class="comp-bar-val" style="color:#555;">600/s</div>
</div>
<div class="comp-bar-row">
<div class="comp-bar-label">DAC<br><span style="font-size:0.65rem;opacity:0.6;">(Descript)</span></div>
<div class="comp-bar-track"><div class="comp-bar-fill" style="width:0%;background:#666;" data-width="100%"><span style="opacity:0.9;">774</span></div></div>
<div class="comp-bar-val" style="color:#555;">774/s</div>
</div>
<div class="comp-bar-row">
<div class="comp-bar-label">Mimi<br><span style="font-size:0.65rem;opacity:0.6;">(Kyutai)</span></div>
<div class="comp-bar-track"><div class="comp-bar-fill" style="width:0%;background:#7a7ab0;" data-width="13%"><span style="opacity:0.9;">100</span></div></div>
<div class="comp-bar-val" style="color:#7a7ab0;">100/s</div>
</div>
<div class="comp-bar-row">
<div class="comp-bar-label" style="color:#2d8a4e;font-weight:700;">Silk<br><span style="font-size:0.65rem;opacity:0.6;">(Rumik)</span></div>
<div class="comp-bar-track"><div class="comp-bar-fill" style="width:0%;background:#2d8a4e;" data-width="6%"><span style="opacity:0.9;">47.5</span></div></div>
<div class="comp-bar-val" style="color:#2d8a4e;font-size:0.9rem;">47.5/s</div>
</div>
</div>
<div class="callout insight" style="margin-top:1.25rem;">
<div class="callout-title">why this matters</div>
<p>a standard LLM can generate ~100 tokens per second. at 600 tokens/s, the LLM needs 6 seconds just to produce 1 second of audio. at 47.5 tokens/s, the LLM generates audio <strong>faster than real-time</strong>. that's the difference between noticeable lag and real-time conversation.</p>
</div>
</div>
<!-- BATTLE 2: Real-time race -->
<div class="battle-card fade-in">
<div class="battle-header">
<h3>the real-time generation race</h3>
<span class="battle-badge badge-silk">Silk is the only one faster than real-time</span>
</div>
<p style="font-size:0.85rem;color:var(--text-muted);margin-bottom:1rem;">how long does the LLM take to generate 1 second of audio? (assuming ~100 tokens/sec LLM speed)</p>
<div class="race-track" id="race-track">
<div style="display:flex;justify-content:space-between;margin-bottom:0.75rem;padding:0 100px 0 0;">
<span style="font-family:'Fragment Mono',monospace;font-size:0.65rem;color:rgba(255,255,255,0.3);">0s</span>
<span style="font-family:'Fragment Mono',monospace;font-size:0.65rem;color:rgba(255,255,255,0.25);">1s (real-time line)</span>
<span style="font-family:'Fragment Mono',monospace;font-size:0.65rem;color:rgba(255,255,255,0.3);">8s</span>
</div>
<!-- Real-time marker -->
<div style="position:relative;">
<div style="position:absolute;left:12.5%;top:-8px;bottom:-8px;width:2px;background:rgba(255,255,255,0.15);z-index:1;"></div>
<div style="position:absolute;left:12.5%;top:-20px;font-family:'Fragment Mono',monospace;font-size:0.6rem;color:rgba(255,255,255,0.3);transform:translateX(-50%);">real-time</div>
</div>
<div class="race-lane">
<div class="race-name">DAC</div>
<div class="race-bar-bg"><div class="race-bar other-bar" style="width:0%;" data-width="97%"><span>7.74s</span></div></div>
<div class="race-time">7.74s</div>
</div>
<div class="race-lane">
<div class="race-name">EnCodec</div>
<div class="race-bar-bg"><div class="race-bar other-bar" style="width:0%;" data-width="75%"><span>6.0s</span></div></div>
<div class="race-time">6.0s</div>
</div>
<div class="race-lane">
<div class="race-name">Mimi</div>
<div class="race-bar-bg"><div class="race-bar mimi-bar" style="width:0%;" data-width="12.5%"><span>1.0s</span></div></div>
<div class="race-time">1.0s</div>
</div>
<div class="race-lane">
<div class="race-name" style="color:#2d8a4e;font-weight:700;">Silk</div>
<div class="race-bar-bg"><div class="race-bar silk-bar" style="width:0%;" data-width="6%"><span>0.48s</span></div></div>
<div class="race-time" style="color:#2d8a4e;">0.48s</div>
</div>
</div>
<p style="font-size:0.8rem;color:var(--text-muted);text-align:center;margin-top:0.75rem;">Silk is the <strong>only codec</strong> where a standard transformer can generate speech faster than you can listen to it.</p>
</div>
<!-- BATTLE 3: Cross-lingual -->
<div class="battle-card fade-in">
<div class="battle-header">
<h3>cross-lingual quality (the surprise result)</h3>
<span class="battle-badge badge-silk">trained on English only</span>
</div>
<p style="font-size:0.85rem;color:var(--text-muted);margin-bottom:1rem;">Silk was trained on <strong>5,800 hours of English only</strong>. then tested on languages it has never heard:</p>
<div style="display:grid;grid-template-columns:repeat(3,1fr);gap:1rem;margin:1rem 0;">
<div style="background:var(--bg-alt);border-radius:var(--radius-md);padding:1.25rem;text-align:center;">
<div style="font-size:0.75rem;color:var(--text-muted);margin-bottom:0.5rem;">Hindi (0 hrs training)</div>
<div style="font-family:'Fragment Mono',monospace;font-size:2rem;font-weight:700;color:#2d8a4e;">3.80</div>
<div style="font-size:0.7rem;color:var(--text-light);margin-top:0.25rem;">PESQ score</div>
</div>
<div style="background:var(--bg-alt);border-radius:var(--radius-md);padding:1.25rem;text-align:center;">
<div style="font-size:0.75rem;color:var(--text-muted);margin-bottom:0.5rem;">Mandarin (0 hrs training)</div>
<div style="font-family:'Fragment Mono',monospace;font-size:2rem;font-weight:700;color:#2d8a4e;">3.55</div>
<div style="font-size:0.7rem;color:var(--text-light);margin-top:0.25rem;">PESQ score</div>
</div>
<div style="background:var(--bg-alt);border-radius:var(--radius-md);padding:1.25rem;text-align:center;">
<div style="font-size:0.75rem;color:var(--text-muted);margin-bottom:0.5rem;">PESQ scale</div>
<div style="font-family:'Fragment Mono',monospace;font-size:2rem;font-weight:700;color:var(--text-light);">1-4.5</div>
<div style="font-size:0.7rem;color:var(--text-light);margin-top:0.25rem;">higher = better</div>
</div>
</div>
<div class="callout insight">
<div class="callout-title">why this works</div>
<p>traditional codecs learn language-specific patterns — how english vowels sound. Silk learns <strong>the physics of sound</strong>: vibration patterns, harmonic structures, formant shapes. physics is the same across languages. mandarin tones and hindi retroflex consonants use the same physical principles as english.</p>
</div>
</div>
<div class="visual-break"></div>
<!-- ==================== COMPANY COMPARISON ==================== -->
<h2>who's who in voice AI</h2>
<p>here's how the major players stack up:</p>
<div class="company-grid fade-in">
<div class="company-card">
<div class="company-name">ElevenLabs</div>
<div class="company-detail"><span class="label">codec</span><span class="val">EnCodec-based</span></div>
<div class="company-detail"><span class="label">tokens/sec</span><span class="val" style="color:#555;">~600</span></div>
<div class="company-detail"><span class="label">approach</span><span class="val">RVQ codebook</span></div>
<div class="company-detail"><span class="label">languages</span><span class="val">29 (trained)</span></div>
<div class="company-detail"><span class="label">real-time?</span><span class="val" style="color:#555;">no (6x slower)</span></div>
</div>
<div class="company-card">
<div class="company-name">OpenAI (GPT-4o voice)</div>
<div class="company-detail"><span class="label">codec</span><span class="val">proprietary</span></div>
<div class="company-detail"><span class="label">tokens/sec</span><span class="val" style="color:#555;">~500+ (est.)</span></div>
<div class="company-detail"><span class="label">approach</span><span class="val">multi-codebook</span></div>
<div class="company-detail"><span class="label">languages</span><span class="val">50+ (trained)</span></div>
<div class="company-detail"><span class="label">real-time?</span><span class="val" style="color:#555;">custom hardware</span></div>
</div>
<div class="company-card">
<div class="company-name">Kyutai (Moshi)</div>
<div class="company-detail"><span class="label">codec</span><span class="val">Mimi</span></div>
<div class="company-detail"><span class="label">tokens/sec</span><span class="val" style="color:#7a7ab0;">100</span></div>
<div class="company-detail"><span class="label">approach</span><span class="val">RVQ + distill</span></div>
<div class="company-detail"><span class="label">languages</span><span class="val">English/French</span></div>
<div class="company-detail"><span class="label">real-time?</span><span class="val" style="color:#e6a817;">borderline</span></div>
</div>
<div class="company-card is-silk">
<div class="company-name">Silk (Rumik)</div>
<div class="company-detail"><span class="label">codec</span><span class="val" style="color:#2d8a4e;">Silk Codec</span></div>
<div class="company-detail"><span class="label">tokens/sec</span><span class="val" style="color:#2d8a4e;">47.5</span></div>
<div class="company-detail"><span class="label">approach</span><span class="val" style="color:#2d8a4e;">continuous + FSQ</span></div>
<div class="company-detail"><span class="label">languages</span><span class="val" style="color:#2d8a4e;">all (physics-based)</span></div>
<div class="company-detail"><span class="label">real-time?</span><span class="val" style="color:#2d8a4e;">yes (2x faster)</span></div>
</div>
</div>
<div class="visual-break"></div>
<!-- ==================== INTERACTIVE: WHY FEWER TOKENS WIN ==================== -->
<h2>interactive: why fewer tokens = better voice AI</h2>
<p>drag the slider to see how token count affects the entire system:</p>
<!--
  Token burden calculator widget.
  The inline script at the bottom of this page looks elements up by id
  (token-slider, token-val, calc-time, calc-realtime, calc-total, calc-cost)
  and rewrites the readouts on every slider `input` event, so those ids must
  stay in sync with the script. The text inside the readouts below is only
  the initial state for the default slider value of 600.
-->
<div class="interactive-block fade-in">
  <h3>token burden calculator</h3>
  <div style="padding:1rem 0;">
    <div style="display:flex;align-items:center;gap:1rem;margin-bottom:1.5rem;">
      <!-- `for` ties the visible label to the range input so it has an accessible name -->
      <label for="token-slider" style="font-family:'Fragment Mono',monospace;font-size:0.8rem;color:var(--text-muted);min-width:100px;">tokens/sec:</label>
      <input type="range" id="token-slider" min="40" max="800" value="600" style="flex:1;accent-color:var(--accent);">
      <span id="token-val" style="font-family:'Fragment Mono',monospace;font-size:1.1rem;font-weight:700;color:var(--text);min-width:60px;text-align:right;">600</span>
    </div>
    <div style="display:grid;grid-template-columns:repeat(3,1fr);gap:1rem;">
      <div style="background:var(--bg-alt);border-radius:var(--radius-md);padding:1.25rem;text-align:center;">
        <div style="font-size:0.75rem;color:var(--text-muted);margin-bottom:0.5rem;">LLM time for 1s audio</div>
        <div id="calc-time" style="font-family:'Fragment Mono',monospace;font-size:1.5rem;font-weight:700;color:var(--text);">6.00s</div>
        <div id="calc-realtime" style="font-size:0.7rem;margin-top:0.25rem;color:#555;">6x slower than real-time</div>
      </div>
      <div style="background:var(--bg-alt);border-radius:var(--radius-md);padding:1.25rem;text-align:center;">
        <div style="font-size:0.75rem;color:var(--text-muted);margin-bottom:0.5rem;">tokens for 10s clip</div>
        <div id="calc-total" style="font-family:'Fragment Mono',monospace;font-size:1.5rem;font-weight:700;color:var(--text);">6,000</div>
      </div>
      <div style="background:var(--bg-alt);border-radius:var(--radius-md);padding:1.25rem;text-align:center;">
        <div style="font-size:0.75rem;color:var(--text-muted);margin-bottom:0.5rem;">GPU cost relative to Silk</div>
        <div id="calc-cost" style="font-family:'Fragment Mono',monospace;font-size:1.5rem;font-weight:700;color:var(--text);">12.6x</div>
      </div>
    </div>
    <!--
      Codec presets: each button writes a token rate into the slider and then
      dispatches a synthetic `input` event so the readouts refresh.
      type="button" is explicit so the presets can never act as submit
      buttons if this widget is ever placed inside a <form>.
      NOTE(review): the slider has no `step` attribute (default step is 1),
      so the Silk preset sets 48 rather than the 47.5 shown in its label.
      Representing 47.5 exactly needs a fractional step on the slider AND a
      non-truncating parse in the calculator script.
    -->
    <div style="display:flex;justify-content:center;gap:0.5rem;margin-top:1rem;">
      <button type="button" onclick="document.getElementById('token-slider').value=774;document.getElementById('token-slider').dispatchEvent(new Event('input'));" style="font-family:'Fragment Mono',monospace;font-size:0.7rem;padding:4px 12px;border:1px solid var(--border);border-radius:4px;background:var(--bg-card);cursor:pointer;color:var(--text-muted);">DAC (774)</button>
      <button type="button" onclick="document.getElementById('token-slider').value=600;document.getElementById('token-slider').dispatchEvent(new Event('input'));" style="font-family:'Fragment Mono',monospace;font-size:0.7rem;padding:4px 12px;border:1px solid var(--border);border-radius:4px;background:var(--bg-card);cursor:pointer;color:var(--text-muted);">EnCodec (600)</button>
      <button type="button" onclick="document.getElementById('token-slider').value=100;document.getElementById('token-slider').dispatchEvent(new Event('input'));" style="font-family:'Fragment Mono',monospace;font-size:0.7rem;padding:4px 12px;border:1px solid var(--border);border-radius:4px;background:var(--bg-card);cursor:pointer;color:var(--text-muted);">Mimi (100)</button>
      <button type="button" onclick="document.getElementById('token-slider').value=48;document.getElementById('token-slider').dispatchEvent(new Event('input'));" style="font-family:'Fragment Mono',monospace;font-size:0.7rem;padding:4px 12px;border:1px solid #2d8a4e;border-radius:4px;background:rgba(45,138,78,0.08);cursor:pointer;color:#2d8a4e;font-weight:600;">Silk (47.5)</button>
    </div>
  </div>
</div>
<div class="visual-break"></div>
<!-- ==================== THE BOTTOM LINE ==================== -->
<h2>the bottom line</h2>
<p>here's what it all means in plain English:</p>
<div style="display:grid;grid-template-columns:1fr 1fr;gap:1.5rem;margin:1.5rem 0;" class="fade-in">
<div style="background:var(--bg-card);border:1px solid var(--border);border-radius:var(--radius-lg);padding:1.5rem;">
<h4 style="color:#1a1a1a;margin-top:0;">what everyone else does</h4>
<div style="font-size:0.85rem;line-height:1.8;">
<div>Train on many languages</div>
<div>Use 600+ tokens per second</div>
<div>Need expensive hardware for real-time</div>
<div>Quality limited by codebook size</div>
<div>Adding a new language = new training data</div>
</div>
</div>
<div style="background:#f0faf3;border:2px solid #2d8a4e;border-radius:var(--radius-lg);padding:1.5rem;">
<h4 style="color:#2d8a4e;margin-top:0;">what Silk does differently</h4>
<div style="font-size:0.85rem;line-height:1.8;">
<div><strong>Train on English, works on all languages</strong></div>
<div><strong>47.5 tokens per second (12x fewer)</strong></div>
<div><strong>Real-time on standard hardware</strong></div>
<div><strong>Quality limited only by physics</strong></div>
<div><strong>New language = already works</strong></div>
</div>
</div>
</div>
<div class="bottom-line fade-in">
<h2>different foundation, different results.</h2>
<p>others optimize the LLM. Silk optimized the codec — the layer that constrains everything above it.</p>
<div class="bottom-line-stats">
<div class="bl-stat">
<div class="num">12x</div>
<div class="label">fewer tokens</div>
</div>
<div class="bl-stat">
<div class="num">2x</div>
<div class="label">faster than real-time</div>
</div>
<div class="bl-stat">
<div class="num">0</div>
<div class="label">hours of non-English training</div>
</div>
</div>
</div>
<!-- Navigation -->
<div style="display:flex;justify-content:space-between;margin-top:3rem;padding-top:1.5rem;border-top:1px solid var(--border-light);">
<a href="voice-ai-pipeline.html" style="color:var(--text-muted);text-decoration:none;font-size:0.9rem;">← voice AI pipeline</a>
<a href="geometry.html" style="color:var(--accent);text-decoration:none;font-size:0.9rem;">next: why it works →</a>
</div>
</div>
</main>
<script src="js/shared.js"></script>
<script>
// === ANIMATE COMPARISON BARS ON SCROLL ===
// Sets each comparison bar's inline width to the value of its data-width
// attribute the first time the bar is reported as intersecting (0.3 threshold).
(function () {
  const watcher = new IntersectionObserver(
    (records) => {
      for (const record of records) {
        if (!record.isIntersecting) continue;
        const bar = record.target;
        bar.style.width = bar.dataset.width;
        // One-shot: stop watching a bar once its width has been applied.
        watcher.unobserve(bar);
      }
    },
    { threshold: 0.3 }
  );

  // Only bars that actually declare a target width are observed.
  const fills = document.querySelectorAll('.comp-bar-fill[data-width]');
  fills.forEach((fill) => {
    watcher.observe(fill);
  });
})();
// === ANIMATE RACE BARS ON SCROLL ===
// Sets every race bar's inline width to its data-width value, all at once,
// the first time the #race-track element is reported as intersecting
// (0.3 threshold). Does nothing on pages without a #race-track element.
(function () {
  const track = document.getElementById('race-track');
  if (!track) return;

  const bars = track.querySelectorAll('.race-bar[data-width]');
  const obs = new IntersectionObserver(
    (entries) => {
      // One callback may deliver several queued entries for the track, so
      // look at all of them instead of trusting only the first.
      if (!entries.some((entry) => entry.isIntersecting)) return;
      bars.forEach((bar) => {
        bar.style.width = bar.dataset.width;
      });
      // One-shot: the track is the only target, so stop observing it.
      obs.unobserve(track);
    },
    { threshold: 0.3 }
  );
  obs.observe(track);
})();
// === TOKEN BURDEN CALCULATOR ===
// Keeps the calculator read-outs in sync with the #token-slider value:
// the slider value itself, LLM time per second of audio, faster/slower than
// real-time, a x10 total, and the cost multiple relative to Silk.
(function () {
  const slider = document.getElementById('token-slider');
  const valEl = document.getElementById('token-val');
  const timeEl = document.getElementById('calc-time');
  const rtEl = document.getElementById('calc-realtime');
  const totalEl = document.getElementById('calc-total');
  const costEl = document.getElementById('calc-cost');
  // Require the whole widget. A missing read-out would otherwise throw inside
  // update() on load and abort every script section that follows this one.
  if (!slider || !valEl || !timeEl || !rtEl || !totalEl || !costEl) return;

  const LLM_SPEED = 100; // tokens per second
  const SILK_TOKENS = 47.5; // matches the "47.5 tokens per second" figure in the page copy
  const FAST_COLOR = '#2d8a4e';
  const SLOW_COLOR = '#555';

  function update() {
    const tokens = parseInt(slider.value, 10);
    if (Number.isNaN(tokens)) return; // nothing meaningful to display

    valEl.textContent = tokens;

    // Seconds of LLM generation needed per second of audio.
    const ratio = tokens / LLM_SPEED;
    timeEl.textContent = ratio.toFixed(2) + 's';

    const keepsUp = ratio <= 1;
    rtEl.textContent = keepsUp
      ? `${(1 / ratio).toFixed(1)}x faster than real-time`
      : `${ratio.toFixed(1)}x slower than real-time`;

    // The three read-outs share one color so the state reads at a glance.
    const color = keepsUp ? FAST_COLOR : SLOW_COLOR;
    for (const el of [rtEl, timeEl, valEl]) {
      el.style.color = color;
    }

    // NOTE(review): the factor 10 presumably means a 10-second utterance;
    // confirm against the label next to #calc-total.
    totalEl.textContent = (tokens * 10).toLocaleString();
    costEl.textContent = (tokens / SILK_TOKENS).toFixed(1) + 'x';
  }

  slider.addEventListener('input', update);
  update(); // render the initial slider position
})();
// === PIPELINE ANIMATION ===
// For every .process-flow container: once the container is reported as
// intersecting (0.3 threshold), add the `show` class to its .pf-arrow and
// .pf-data elements pairwise, one index per step, 200 ms between steps.
(function () {
  const STEP_DELAY_MS = 200;

  document.querySelectorAll('.process-flow').forEach((container) => {
    const arrows = container.querySelectorAll('.pf-arrow');
    const data = container.querySelectorAll('.pf-data');
    // Walk to the end of the longer list so that data elements beyond the
    // arrow count (or a flow with no arrows at all) still receive `show`.
    const stepCount = Math.max(arrows.length, data.length);
    let i = 0;

    function step() {
      if (i >= stepCount) return;
      // Either list may be shorter than stepCount, hence the optional calls.
      arrows[i]?.classList.add('show');
      data[i]?.classList.add('show');
      i++;
      if (i < stepCount) setTimeout(step, STEP_DELAY_MS);
    }

    const obs = new IntersectionObserver(
      (entries) => {
        // One callback may deliver several queued entries; check them all.
        if (!entries.some((entry) => entry.isIntersecting)) return;
        step();
        // One-shot: the sequence is started at most once per container.
        obs.unobserve(container);
      },
      { threshold: 0.3 }
    );
    obs.observe(container);
  });
})();
</script>
<!-- Mobile previous/next bar. The aria-label names this landmark for assistive technology.
     NOTE(review): the "#" hrefs and the empty current-page span look like placeholders,
     presumably filled in by js/shared.js; confirm. -->
<nav class="mobile-nav" aria-label="Previous and next page">
<a href="#" class="mobile-prev">← prev</a>
<span class="current-page"></span>
<a href="#" class="mobile-next">next →</a>
</nav>
</body>
</html>