-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathindex.html
More file actions
722 lines (651 loc) · 40.2 KB
/
index.html
File metadata and controls
722 lines (651 loc) · 40.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Silk — How Voice AI Works</title>
<link rel="stylesheet" href="css/shared.css">
<style>
/* Page-specific styles */
.hero-viz{position:relative;height:320px;background:linear-gradient(135deg,#1a1a2e,#16213e,#0f3460);border-radius:var(--radius-lg);overflow:hidden;margin:0 0 2rem;}
.hero-viz canvas{width:100%;height:100%;display:block;}
.hero-overlay{position:absolute;top:0;left:0;right:0;bottom:0;display:flex;flex-direction:column;align-items:center;justify-content:center;z-index:2;text-align:center;padding:2rem;}
.hero-overlay h1{color:#fff;font-family:'DM Serif Display',Georgia,serif;font-size:3.5rem;font-weight:700;letter-spacing:-0.03em;margin:0;}
.hero-overlay .tagline{font-family:'Fragment Mono',monospace;font-size:0.85rem;color:rgba(255,255,255,0.5);letter-spacing:0.05em;margin-top:0.5rem;}
.hero-overlay p{color:rgba(255,255,255,0.7);font-size:1rem;max-width:520px;margin-top:1rem;line-height:1.6;}
.chapter-num{font-family:'Fragment Mono',monospace;font-size:0.7rem;letter-spacing:0.15em;color:var(--text-muted);margin-bottom:0.5rem;display:block;}
.visual-break{height:2px;background:linear-gradient(90deg,var(--accent),transparent);margin:3rem 0;border-radius:1px;}
/* Sound wave demo */
.sound-demo{background:var(--code-bg);border-radius:var(--radius-lg);padding:0;overflow:hidden;margin:1.5rem 0;position:relative;}
.sound-demo canvas{display:block;width:100%;}
.sound-demo-label{position:absolute;bottom:0;left:0;right:0;padding:0.75rem 1rem;background:linear-gradient(transparent,rgba(44,42,38,0.9));display:flex;justify-content:space-between;font-family:'Fragment Mono',monospace;font-size:0.7rem;color:rgba(255,255,255,0.5);}
/* Infographic boxes */
.info-row{display:grid;grid-template-columns:repeat(3,1fr);gap:1rem;margin:1.5rem 0;}
.info-box{background:var(--bg-card);border:1px solid var(--border);border-radius:var(--radius-md);padding:1.5rem;text-align:center;transition:all var(--transition);}
.info-box:hover{border-color:var(--accent);transform:translateY(-2px);box-shadow:var(--shadow-md);}
.info-icon{font-size:2rem;margin-bottom:0.75rem;}
.info-title{font-family:'Fragment Mono',monospace;font-size:0.8rem;font-weight:600;color:var(--text);margin-bottom:0.5rem;}
.info-desc{font-size:0.8rem;color:var(--text-muted);line-height:1.5;}
/* Codec analogy */
.analogy-grid{display:grid;grid-template-columns:1fr 60px 1fr;align-items:center;gap:0;margin:1.5rem 0;}
.analogy-box{background:var(--bg-card);border:1px solid var(--border);border-radius:var(--radius-md);padding:1.25rem;text-align:center;}
.analogy-arrow{text-align:center;color:var(--accent);font-size:1.5rem;}
/* Engine analogy */
.engine-stack{display:flex;flex-direction:column;gap:0;margin:1.5rem 0;border:1px solid var(--border);border-radius:var(--radius-lg);overflow:hidden;}
.engine-layer{padding:1.25rem 1.5rem;display:flex;align-items:center;gap:1rem;border-bottom:1px solid var(--border-light);background:var(--bg-card);transition:all 0.3s;}
.engine-layer:last-child{border-bottom:none;}
.engine-layer:hover{background:var(--bg-alt);}
.engine-layer.foundation{background:var(--accent-light);border-left:4px solid var(--accent);}
.engine-label{font-family:'Fragment Mono',monospace;font-size:0.7rem;letter-spacing:0.1em;color:var(--text-light);min-width:80px;}
.engine-name{font-weight:600;font-size:0.95rem;color:var(--text);}
.engine-desc{font-size:0.8rem;color:var(--text-muted);margin-left:auto;}
/* Stat row */
.stat-row{display:grid;grid-template-columns:repeat(4,1fr);gap:1rem;margin:2rem 0;}
.stat-card{background:var(--bg-card);border:1px solid var(--border);border-radius:var(--radius-md);padding:1.25rem;text-align:center;transition:all var(--transition);}
.stat-card:hover{border-color:var(--accent);box-shadow:var(--shadow-md);transform:translateY(-2px);}
@media(max-width:768px){
.info-row{grid-template-columns:1fr;}
.analogy-grid{grid-template-columns:1fr;gap:0.5rem;}
.analogy-arrow{transform:rotate(90deg);}
.stat-row{grid-template-columns:repeat(2,1fr);}
.hero-overlay h1{font-size:2.5rem;}
}
</style>
</head>
<body>
<aside class="sidebar">
<div class="sidebar-progress"><div class="sidebar-progress-fill"></div></div>
<div class="sidebar-rail">
<div class="sidebar-brand">S</div>
<nav>
<a href="index.html" data-page="index.html" data-num="01" title="How Voice AI Works"><span class="pip-tooltip">How Voice AI Works</span></a>
<a href="how-codecs-work.html" data-page="how-codecs-work.html" data-num="02" title="How Codecs Work"><span class="pip-tooltip">How Codecs Work</span></a>
<a href="silk-codec.html" data-page="silk-codec.html" data-num="03" title="Latent Space Learning"><span class="pip-tooltip">Latent Space Learning</span></a>
<a href="voice-ai-pipeline.html" data-page="voice-ai-pipeline.html" data-num="04" title="Voice AI Pipeline"><span class="pip-tooltip">Voice AI Pipeline</span></a>
<a href="comparison.html" data-page="comparison.html" data-num="05" title="Silk vs The World"><span class="pip-tooltip">Silk vs The World</span></a>
<a href="geometry.html" data-page="geometry.html" data-num="06" title="Why It Works"><span class="pip-tooltip">Why It Works</span></a>
<div class="quiz-pip"><a href="quiz.html" data-page="quiz.html" data-num="?" title="Test Yourself"><span class="pip-tooltip">Test Yourself</span></a></div>
</nav>
</div>
<div class="sidebar-panel">
<div class="sidebar-logo"><h2>Silk</h2><span>internal briefing</span></div>
<nav>
<a href="index.html" data-page="index.html"><span class="nav-num">01</span> How Voice AI Works</a>
<a href="how-codecs-work.html" data-page="how-codecs-work.html"><span class="nav-num">02</span> How Codecs Work</a>
<a href="silk-codec.html" data-page="silk-codec.html"><span class="nav-num">03</span> Latent Space Learning</a>
<a href="voice-ai-pipeline.html" data-page="voice-ai-pipeline.html"><span class="nav-num">04</span> Voice AI Pipeline</a>
<a href="comparison.html" data-page="comparison.html"><span class="nav-num">05</span> Silk vs The World</a>
<a href="geometry.html" data-page="geometry.html"><span class="nav-num">06</span> Why It Works</a>
<div class="quiz-link"><a href="quiz.html" data-page="quiz.html"><span class="nav-num">?</span> Test Yourself</a></div>
</nav>
</div>
</aside>
<main class="main-content">
<div class="page-content">
<!-- ==================== HERO ==================== -->
<div class="hero-viz">
<canvas id="hero-particles"></canvas>
<div class="hero-overlay">
<div style="font-family:'Fragment Mono',monospace;font-size:0.55rem;color:rgba(255,255,255,0.25);letter-spacing:0.2em;margin-bottom:1rem;">doc ref: rmk-silk-001 · distribution: internal only</div>
<h1>Silk</h1>
<div class="tagline">rumik · latent space learning voice AI</div>
<p>a continuous audio codec built on finite scalar quantization.</p>
<div style="font-family:'Fragment Mono',monospace;font-size:0.5rem;color:rgba(255,255,255,0.15);margin-top:1.5rem;letter-spacing:0.15em;">prepared by the silk codec team · rumik research</div>
</div>
</div>
<!-- ==================== CH1: WHAT IS SOUND ==================== -->
<span class="chapter-num">section 01</span>
<h2 style="border:none;padding:0;margin-top:0;">what is sound, really?</h2>
<p>sound is air vibrating. your vocal cords push air molecules back and forth, those vibrations travel through space and hit a microphone, and the microphone writes them down as numbers.</p>
<p>it measures air pressure 24,000 times per second. that stream of numbers is a <span class="term" data-term="waveform"><strong>waveform</strong></span>.</p>
<div class="sound-demo">
<canvas id="waveform-demo" height="120"></canvas>
<div class="sound-demo-label">
<span>simulated speech waveform</span>
<span>24,000 samples/sec</span>
</div>
</div>
<p>one second of audio = 24,000 numbers. ten seconds = 240,000. most of it is redundant — patterns, repetitions, structure. codecs compress it.</p>
<div class="visual-break"></div>
<!-- ==================== CH2: WHAT IS A CODEC ==================== -->
<span class="chapter-num">section 02</span>
<h2 style="border:none;padding:0;margin-top:0;">what is a codec?</h2>
<p>a codec takes something big (a waveform with 24,000 numbers per second) and translates it into something small and organized.</p>
<div class="analogy-grid fade-in">
<div class="analogy-box">
<div style="font-size:2rem;margin-bottom:0.5rem;">📖</div>
<div style="font-family:'Fragment Mono',monospace;font-size:0.75rem;color:var(--text-muted);">a paragraph of text</div>
<div style="font-size:0.85rem;margin-top:0.5rem;">"the quick brown fox jumps over the lazy dog near the river"</div>
</div>
<div class="analogy-arrow">→</div>
<div class="analogy-box" style="border-color:var(--accent);background:var(--accent-light);">
<div style="font-size:2rem;margin-bottom:0.5rem;">🗜️</div>
<div style="font-family:'Fragment Mono',monospace;font-size:0.75rem;color:var(--accent);">compressed summary</div>
<div style="font-size:0.85rem;margin-top:0.5rem;">"fox jumped, dog rested by river"</div>
</div>
</div>
<p>but here's the key: a good codec can <strong>reconstruct</strong> the original from the compressed version. not just summarize — actually rebuild it.</p>
<div class="info-row fade-in">
<div class="info-box">
<div class="info-icon">🎤</div>
<div class="info-title">encode</div>
<div class="info-desc">take 24,000 numbers/sec and compress them into a tiny representation</div>
</div>
<div class="info-box">
<div class="info-icon">💾</div>
<div class="info-title">latent space</div>
<div class="info-desc">the compressed version. could be tokens, numbers, or vectors</div>
</div>
<div class="info-box">
<div class="info-icon">🔊</div>
<div class="info-title">decode</div>
<div class="info-desc">reconstruct the full audio from the tiny representation</div>
</div>
</div>
<p>MP3 and AAC are codecs — hand-crafted ones where humans wrote the rules for what to keep and throw away. <strong>neural audio codecs</strong> use neural networks instead. the network learns the best compression strategy on its own.</p>
<!-- Animated: waveform → encoder → latent → decoder → waveform -->
<div class="interactive-block fade-in">
<h3>watch a codec in action</h3>
<p class="description">raw audio flows in, gets compressed to a tiny latent representation, then gets reconstructed.</p>
<div class="process-flow" id="codec-pipeline">
<div class="pf-node"><div class="pf-box">🎤 audio in</div><div class="pf-sub">24,000 nums/sec</div></div>
<div class="pf-arrow">→</div>
<div class="pf-data">[1,440,000]</div>
<div class="pf-arrow">→</div>
<div class="pf-node"><div class="pf-box highlight">encoder</div><div class="pf-sub">neural network</div></div>
<div class="pf-arrow">→</div>
<div class="pf-data">[200 vectors]</div>
<div class="pf-arrow">→</div>
<div class="pf-node"><div class="pf-box green">latent</div><div class="pf-sub">compressed</div></div>
<div class="pf-arrow">→</div>
<div class="pf-data">[200 vectors]</div>
<div class="pf-arrow">→</div>
<div class="pf-node"><div class="pf-box highlight">decoder</div><div class="pf-sub">neural network</div></div>
<div class="pf-arrow">→</div>
<div class="pf-data">[1,440,000]</div>
<div class="pf-arrow">→</div>
<div class="pf-node"><div class="pf-box">🔊 audio out</div><div class="pf-sub">reconstructed</div></div>
</div>
</div>
<div class="visual-break"></div>
<!-- ==================== CH3: WHY THE CODEC IS THE ENGINE ==================== -->
<span class="chapter-num">section 03</span>
<h2 style="border:none;padding:0;margin-top:0;">why the codec is the engine of voice AI</h2>
<p>in modern voice AI, the codec isn't just a compressor. it defines <strong>what the entire system can represent.</strong> everything downstream — the LLM, the app — is constrained by what the codec can encode.</p>
<p>the stack looks like this:</p>
<div class="engine-stack fade-in">
<div class="engine-layer">
<span class="engine-label">layer 3</span>
<span class="engine-name">the app</span>
<span class="engine-desc">voice assistant, customer support bot, translator</span>
</div>
<div class="engine-layer">
<span class="engine-label">layer 2</span>
<span class="engine-name">the LLM</span>
<span class="engine-desc">GPT-style model that generates speech tokens</span>
</div>
<div class="engine-layer foundation">
<span class="engine-label">layer 1</span>
<span class="engine-name">the codec</span>
<span class="engine-desc">determines what sounds can exist. THE ceiling.</span>
</div>
</div>
<div class="callout insight fade-in">
<div class="callout-title">the car analogy</div>
<p>the LLM is the driver, the app is the destination, but the <strong>codec is the engine.</strong> if the engine caps out at 60mph, the driver's skill doesn't matter. if the codec can't represent mandarin tones, no LLM will ever produce good mandarin.</p>
</div>
<p>this is why every major voice AI company started with a codec:</p>
<div class="side-by-side fade-in">
<div class="side">
<h4>codec → model</h4>
<div style="font-size:0.85rem;line-height:2;">
<div><strong>Kyutai</strong> built Mimi → then Moshi</div>
<div><strong>Meta</strong> built EnCodec → then Voicebox</div>
<div><strong>Microsoft</strong> used EnCodec → then VALL-E</div>
<div><strong>Rumik</strong> built Silk Codec → then Silk</div>
</div>
</div>
<div class="side">
<h4>why codec first?</h4>
<p style="font-size:0.85rem;">the LLM is a well-understood problem — it's a transformer predicting the next output. the hard part, the novel part, is <strong>what that output represents.</strong> the codec defines that.</p>
<p style="font-size:0.85rem;">change the codec → change everything the model can do.</p>
</div>
</div>
<div class="visual-break"></div>
<!-- ==================== CH4: HOW NEURAL NETWORKS LEARN ==================== -->
<span class="chapter-num">section 04</span>
<h2 style="border:none;padding:0;margin-top:0;">how a neural network learns (60-second version)</h2>
<p>neural networks in 60 seconds. three ideas, no math.</p>
<!-- IDEA 1: Neurons — HTML/SVG based for clarity -->
<div class="interactive-block fade-in">
<h3>idea 1: neurons are just math</h3>
<p class="description">a neuron takes numbers in, multiplies them by "weights" (importance), adds them up, and outputs a number.</p>
<div style="background:#1a1a2e;border-radius:var(--radius-lg);padding:2.5rem 2rem;margin:1rem 0;">
<svg viewBox="0 0 700 280" style="width:100%;height:auto;display:block;">
<!-- Input nodes -->
<circle cx="80" cy="60" r="28" fill="rgba(100,100,100,0.2)" stroke="#888" stroke-width="2"/>
<text x="80" y="65" text-anchor="middle" fill="rgba(255,255,255,0.9)" font-size="15" font-weight="600" font-family="Fragment Mono">0.8</text>
<text x="80" y="105" text-anchor="middle" fill="rgba(255,255,255,0.45)" font-size="12" font-family="Fragment Mono">loudness</text>
<circle cx="80" cy="140" r="28" fill="rgba(100,100,100,0.2)" stroke="#888" stroke-width="2"/>
<text x="80" y="145" text-anchor="middle" fill="rgba(255,255,255,0.9)" font-size="15" font-weight="600" font-family="Fragment Mono">0.3</text>
<text x="80" y="185" text-anchor="middle" fill="rgba(255,255,255,0.45)" font-size="12" font-family="Fragment Mono">pitch</text>
<circle cx="80" cy="220" r="28" fill="rgba(100,100,100,0.2)" stroke="#888" stroke-width="2"/>
<text x="80" y="225" text-anchor="middle" fill="rgba(255,255,255,0.9)" font-size="15" font-weight="600" font-family="Fragment Mono">0.6</text>
<text x="80" y="265" text-anchor="middle" fill="rgba(255,255,255,0.45)" font-size="12" font-family="Fragment Mono">speed</text>
<!-- Connection lines -->
<line x1="108" y1="60" x2="332" y2="140" stroke="#888" stroke-width="3" opacity="0.5"/>
<line x1="108" y1="140" x2="332" y2="140" stroke="#888" stroke-width="4" opacity="0.7"/>
<line x1="108" y1="220" x2="332" y2="140" stroke="#888" stroke-width="2" opacity="0.3"/>
<!-- Weight labels on connections -->
<rect x="175" y="78" width="55" height="22" rx="4" fill="rgba(100,100,100,0.25)"/>
<text x="202" y="93" text-anchor="middle" fill="#aaa" font-size="13" font-weight="600" font-family="Fragment Mono">x0.5</text>
<rect x="195" y="122" width="55" height="22" rx="4" fill="rgba(100,100,100,0.35)"/>
<text x="222" y="137" text-anchor="middle" fill="#aaa" font-size="13" font-weight="600" font-family="Fragment Mono">x0.8</text>
<rect x="175" y="174" width="55" height="22" rx="4" fill="rgba(100,100,100,0.15)"/>
<text x="202" y="189" text-anchor="middle" fill="#aaa" font-size="13" font-weight="600" font-family="Fragment Mono">x0.2</text>
<!-- Neuron (sum) -->
<circle cx="360" cy="140" r="36" fill="rgba(100,100,100,0.25)" stroke="#888" stroke-width="2.5"/>
<text x="360" y="137" text-anchor="middle" fill="white" font-size="22" font-weight="700" font-family="Fragment Mono">+</text>
<text x="360" y="158" text-anchor="middle" fill="rgba(255,255,255,0.5)" font-size="10" font-family="Fragment Mono">sum</text>
<!-- Arrow to output -->
<line x1="396" y1="140" x2="520" y2="140" stroke="#2d8a4e" stroke-width="3"/>
<polygon points="520,132 536,140 520,148" fill="#2d8a4e"/>
<!-- Calculation label -->
<text x="458" y="128" text-anchor="middle" fill="rgba(255,255,255,0.35)" font-size="10" font-family="Fragment Mono">sigmoid</text>
<!-- Output node -->
<circle cx="580" cy="140" r="32" fill="rgba(45,138,78,0.25)" stroke="#2d8a4e" stroke-width="2.5"/>
<text x="580" y="145" text-anchor="middle" fill="rgba(255,255,255,0.95)" font-size="17" font-weight="700" font-family="Fragment Mono">0.76</text>
<text x="580" y="190" text-anchor="middle" fill="rgba(45,138,78,0.7)" font-size="13" font-weight="600" font-family="Fragment Mono">output</text>
<!-- Math explanation -->
<text x="350" y="260" text-anchor="middle" fill="rgba(255,255,255,0.3)" font-size="12" font-family="Fragment Mono">(0.8 x 0.5) + (0.3 x 0.8) + (0.6 x 0.2) = 0.76</text>
</svg>
</div>
</div>
<!-- IDEA 2: Layers — HTML/SVG based for clarity -->
<div class="interactive-block fade-in">
<h3>idea 2: stack neurons into layers</h3>
<p class="description">one neuron does one simple calculation. stack hundreds together and they can recognize patterns. stack layers of layers and they can understand audio.</p>
<div style="background:#1a1a2e;border-radius:var(--radius-lg);padding:2rem 1.5rem;">
<svg id="nn-layers-svg" viewBox="0 0 700 340" style="width:100%;height:auto;display:block;">
<!-- Layer labels at top -->
<text x="80" y="24" text-anchor="middle" fill="rgba(136,136,136,0.8)" font-size="12" font-weight="600" font-family="Fragment Mono">INPUT</text>
<text x="240" y="24" text-anchor="middle" fill="rgba(255,255,255,0.35)" font-size="11" font-family="Fragment Mono">hidden 1</text>
<text x="390" y="24" text-anchor="middle" fill="rgba(255,255,255,0.35)" font-size="11" font-family="Fragment Mono">hidden 2</text>
<text x="530" y="24" text-anchor="middle" fill="rgba(255,255,255,0.35)" font-size="11" font-family="Fragment Mono">hidden 3</text>
<text x="640" y="24" text-anchor="middle" fill="rgba(45,138,78,0.6)" font-size="12" font-weight="600" font-family="Fragment Mono">OUTPUT</text>
<!-- Connections will be drawn by JS for animation -->
<!-- Nodes -->
</svg>
<canvas id="nn-layers-anim" style="position:absolute;top:0;left:0;width:100%;height:100%;pointer-events:none;display:none;"></canvas>
</div>
<div style="display:flex;justify-content:space-between;margin-top:1rem;padding:0 0.5rem;">
<div style="text-align:center;">
<div style="font-family:'Fragment Mono',monospace;font-size:0.8rem;color:var(--accent);font-weight:600;">input layer</div>
<div style="font-size:0.8rem;color:var(--text-muted);">raw audio features</div>
</div>
<div style="text-align:center;">
<div style="font-family:'Fragment Mono',monospace;font-size:0.8rem;color:var(--text-light);">hidden layers</div>
<div style="font-size:0.8rem;color:var(--text-muted);">learned patterns</div>
</div>
<div style="text-align:center;">
<div style="font-family:'Fragment Mono',monospace;font-size:0.8rem;color:var(--success);font-weight:600;">output layer</div>
<div style="font-size:0.8rem;color:var(--text-muted);">compressed</div>
</div>
</div>
</div>
<!-- IDEA 3: Training -->
<div class="interactive-block fade-in">
<h3>idea 3: training = adjusting weights until the output is right</h3>
<p class="description">the network starts with random weights. feed audio in, measure how wrong the output is, adjust weights to reduce the error. this loop runs billions of times.</p>
<div style="display:grid;grid-template-columns:1fr 1fr 1fr 1fr;gap:0.75rem;margin:1.5rem 0;">
<div style="background:var(--bg);border-radius:var(--radius-md);padding:1rem;text-align:center;">
<div style="font-size:1.5rem;margin-bottom:0.5rem;">1️⃣</div>
<div style="font-size:0.8rem;font-weight:600;">feed audio in</div>
<div style="font-size:0.75rem;color:var(--text-muted);margin-top:0.25rem;">input → network</div>
</div>
<div style="background:var(--bg);border-radius:var(--radius-md);padding:1rem;text-align:center;">
<div style="font-size:1.5rem;margin-bottom:0.5rem;">2️⃣</div>
<div style="font-size:0.8rem;font-weight:600;">get output</div>
<div style="font-size:0.75rem;color:var(--text-muted);margin-top:0.25rem;">network → guess</div>
</div>
<div style="background:var(--bg);border-radius:var(--radius-md);padding:1rem;text-align:center;">
<div style="font-size:1.5rem;margin-bottom:0.5rem;">3️⃣</div>
<div style="font-size:0.8rem;font-weight:600;">compute error</div>
<div style="font-size:0.75rem;color:var(--text-muted);margin-top:0.25rem;">guess vs reality</div>
</div>
<div style="background:var(--bg);border-radius:var(--radius-md);padding:1rem;text-align:center;">
<div style="font-size:1.5rem;margin-bottom:0.5rem;">4️⃣</div>
<div style="font-size:0.8rem;font-weight:600;">adjust weights</div>
<div style="font-size:0.75rem;color:var(--text-muted);margin-top:0.25rem;">reduce error</div>
</div>
</div>
<!-- Animated training loop -->
<div style="background:var(--code-bg);border-radius:var(--radius-md);padding:1rem 1.5rem;margin-top:1rem;">
<div style="display:flex;align-items:center;justify-content:space-between;">
<span style="font-family:'Fragment Mono',monospace;font-size:0.75rem;color:rgba(255,255,255,0.5);">training progress</span>
<span id="training-epoch" style="font-family:'Fragment Mono',monospace;font-size:0.75rem;color:var(--accent);">epoch 0</span>
</div>
<div style="display:flex;align-items:center;gap:1rem;margin-top:0.75rem;">
<div style="font-family:'Fragment Mono',monospace;font-size:0.7rem;color:rgba(255,255,255,0.4);min-width:40px;">error</div>
<div style="flex:1;height:8px;background:rgba(255,255,255,0.1);border-radius:4px;overflow:hidden;">
<div id="training-error-bar" style="height:100%;background:var(--accent);border-radius:4px;transition:width 0.3s;width:95%;"></div>
</div>
<div id="training-error-val" style="font-family:'Fragment Mono',monospace;font-size:0.75rem;color:var(--accent);min-width:50px;text-align:right;">0.95</div>
</div>
<canvas id="training-loss-chart" height="60" style="width:100%;display:block;margin-top:0.75rem;"></canvas>
</div>
</div>
<div class="visual-break"></div>
<!-- ==================== CH5: INTRODUCING SILK ==================== -->
<span class="chapter-num">section 05</span>
<h2 style="border:none;padding:0;margin-top:0;">three breakthroughs behind Silk</h2>
<p>every codec in production today does the same thing: snap continuous audio representations to a fixed set of discrete <span class="term" data-term="tokens">tokens</span> — like rounding every number to the nearest integer. each snap destroys information.</p>
<p>Silk breaks this pattern with three advances. <strong><span class="term" data-term="latent space">latent space learning (LSL)</span></strong>: the <span class="term" data-term="encoder">encoder</span> learns a continuous space that captures the physics of sound — no fixed codebook, no language dependency. a <strong>codec architecture</strong> that operates on this continuous space, using <span class="term" data-term="fsq">Finite Scalar Quantization (FSQ)</span> to convert to tokens only after the space is already well-structured. and a <strong>2.5 Hz frame rate</strong> — 47.5 tokens/sec vs 600+ for every competitor — that makes real-time generation possible on a standard transformer.</p>
<!-- Side by side: discrete vs continuous -->
<div class="side-by-side fade-in">
<div class="side" style="border-color:var(--error);background:#fffbfb;">
<h4 style="color:var(--error);">every other codec</h4>
<div style="text-align:center;margin:1rem 0;">
<svg viewBox="0 0 280 130" style="width:100%;height:auto;">
<line x1="15" y1="55" x2="265" y2="55" stroke="#e5e0d8" stroke-width="2"/>
<!-- Grid points (codebook entries) -->
<circle cx="40" cy="55" r="10" fill="#d4cfc7"/>
<circle cx="100" cy="55" r="10" fill="#d4cfc7"/>
<circle cx="160" cy="55" r="10" fill="#d4cfc7"/>
<circle cx="220" cy="55" r="10" fill="#d4cfc7"/>
<text x="40" y="30" text-anchor="middle" fill="#9b9590" font-size="10" font-family="Fragment Mono">#1</text>
<text x="100" y="30" text-anchor="middle" fill="#9b9590" font-size="10" font-family="Fragment Mono">#2</text>
<text x="160" y="30" text-anchor="middle" fill="#9b9590" font-size="10" font-family="Fragment Mono">#3</text>
<text x="220" y="30" text-anchor="middle" fill="#9b9590" font-size="10" font-family="Fragment Mono">#4</text>
<!-- Original point -->
<circle cx="130" cy="55" r="8" fill="#d32f2f" opacity="0.8"/>
<!-- Snap line with arrow -->
<line x1="130" y1="55" x2="160" y2="55" stroke="#d32f2f" stroke-width="2" stroke-dasharray="4,3"/>
<text x="145" y="80" text-anchor="middle" fill="#d32f2f" font-size="11" font-weight="600" font-family="Fragment Mono">snap!</text>
<!-- Error annotation -->
<rect x="75" y="95" width="130" height="24" rx="4" fill="rgba(196,64,64,0.12)"/>
<text x="140" y="112" text-anchor="middle" fill="#d32f2f" font-size="12" font-weight="600" font-family="Fragment Mono">info lost forever</text>
</svg>
</div>
<p style="font-size:0.85rem;">encode → <strong style="color:var(--error);">snap to codebook</strong> → decode. the "snap" destroys subtle details the codebook doesn't have entries for.</p>
</div>
<div class="side" style="border-color:var(--success);background:#f9fefb;">
<h4 style="color:var(--success);">Silk Codec</h4>
<div style="text-align:center;margin:1rem 0;">
<svg viewBox="0 0 280 130" style="width:100%;height:auto;">
<!-- Continuous gradient bar -->
<defs><linearGradient id="cg" x1="0" y1="0" x2="1" y2="0"><stop offset="0%" stop-color="#2d8a4e" stop-opacity="0.05"/><stop offset="30%" stop-color="#2d8a4e" stop-opacity="0.25"/><stop offset="50%" stop-color="#2d8a4e" stop-opacity="0.4"/><stop offset="70%" stop-color="#2d8a4e" stop-opacity="0.25"/><stop offset="100%" stop-color="#2d8a4e" stop-opacity="0.05"/></linearGradient></defs>
<rect x="15" y="35" width="250" height="40" rx="20" fill="url(#cg)"/>
<line x1="15" y1="55" x2="265" y2="55" stroke="#2d8a4e" stroke-width="1" opacity="0.3"/>
<!-- Original point stays exactly where it is -->
<circle cx="130" cy="55" r="10" fill="#2d8a4e"/>
<circle cx="130" cy="55" r="16" fill="none" stroke="#2d8a4e" stroke-width="1.5" opacity="0.4"/>
<text x="130" y="25" text-anchor="middle" fill="#2d8a4e" font-size="11" font-weight="600" font-family="Fragment Mono">lands exactly here</text>
<!-- Success annotation -->
<rect x="65" y="95" width="150" height="24" rx="4" fill="rgba(45,138,78,0.12)"/>
<text x="140" y="112" text-anchor="middle" fill="#2d8a4e" font-size="12" font-weight="600" font-family="Fragment Mono">nothing lost</text>
</svg>
</div>
<p style="font-size:0.85rem;">encode → <strong style="color:var(--success);">continuous latent space</strong> → FSQ → decode. the encoder learns in a 128-dimensional continuous space. FSQ then quantizes each dimension to 8 levels, producing 19 packed token groups per frame.</p>
</div>
</div>
<!-- Key stats — 2 rows for readability -->
<div style="display:grid;grid-template-columns:repeat(3,1fr);gap:1rem;margin:2rem 0;" class="fade-in">
<div class="stat-card">
<div class="stat-counter">
<div class="big-num" data-target="19">0</div>
<div class="stat-label">FSQ token groups</div>
<div class="stat-sub">per frame</div>
</div>
</div>
<div class="stat-card">
<div class="stat-counter">
<div class="big-num" data-target="128">0</div>
<div class="stat-label">latent dimensions</div>
<div class="stat-sub">per frame (continuous)</div>
</div>
</div>
<div class="stat-card">
<div class="stat-counter">
<div class="big-num" data-target="5800">0</div>
<div class="stat-label">training hours</div>
<div class="stat-sub">english only</div>
</div>
</div>
</div>
<div style="display:grid;grid-template-columns:repeat(2,1fr);gap:1rem;margin:0 0 2rem;" class="fade-in">
<div class="stat-card">
<div class="stat-counter">
<div class="big-num" data-target="3.8">0</div>
<div class="stat-label">PESQ on hindi</div>
<div class="stat-sub">0 hrs hindi data</div>
</div>
</div>
<div class="stat-card">
<div class="stat-counter">
<div class="big-num" data-target="3.55">0</div>
<div class="stat-label">PESQ on mandarin</div>
<div class="stat-sub">0 hrs chinese data</div>
</div>
</div>
</div>
<div class="callout insight fade-in">
<div class="callout-title">cross-lingual result</div>
<p>trained on <strong>5,800 hours of english only.</strong> tested on languages it never heard: hindi scored <span class="term" data-term="pesq"><strong>PESQ</strong></span> <strong>3.8+</strong>, mandarin scored <strong>3.55</strong>. traditional <span class="term" data-term="codebook">codebook</span>-based codecs can't do this — their entries are tuned to training-language sounds. continuous latent space captures physics, not language-specific patterns.</p>
</div>
<div class="visual-break"></div>
<!-- ==================== NAVIGATION ==================== -->
<h2>pages</h2>
<p>each page has expandable "go deeper" sections for technical detail.</p>
<div class="card-grid fade-in">
<a href="how-codecs-work.html" class="card">
<div class="card-number">01</div>
<h3>how codecs work today</h3>
<p>RVQ explained visually. interactive quantization. see exactly where information gets destroyed.</p>
</a>
<a href="silk-codec.html" class="card">
<div class="card-number">02</div>
<h3>the Silk Codec</h3>
<p>latent space learning architecture. 8 Conformer blocks. FSQ quantization.</p>
</a>
<a href="voice-ai-pipeline.html" class="card">
<div class="card-number">03</div>
<h3>voice AI pipeline</h3>
<p>how codec → LLM → TTS works. how Silk changes the architecture. generation speed comparison.</p>
</a>
<a href="comparison.html" class="card">
<div class="card-number">04</div>
<h3>Silk vs the world</h3>
<p>how Silk compares to ElevenLabs, OpenAI, and others — explained for non-technical people.</p>
</a>
<a href="geometry.html" class="card">
<div class="card-number">05</div>
<h3>why it works: the geometry</h3>
<p>why continuous captures physics. interactive latent space. cross-lingual results breakdown.</p>
</a>
</div>
<div style="display:flex;justify-content:flex-end;margin-top:3rem;padding-top:1.5rem;border-top:1px solid var(--border-light);">
<a href="how-codecs-work.html" style="color:var(--accent);text-decoration:none;font-size:0.9rem;">next: how codecs work today →</a>
</div>
</div>
</main>
<script src="js/shared.js"></script>
<script>
// === HERO PARTICLES ===
// Drifting dots behind the hero, joined by faint lines when pairs come close.
(function(){
  const canvas=document.getElementById('hero-particles');
  if(!canvas)return;
  const ctx=canvas.getContext('2d');
  let width,height;
  const dots=[];
  // Size the backing store at 2x the parent element for crisp rendering.
  function fitToParent(){
    width=canvas.width=canvas.parentElement.offsetWidth*2;
    height=canvas.height=canvas.parentElement.offsetHeight*2;
  }
  fitToParent();
  for(let i=0;i<100;i++){
    dots.push({
      x:Math.random()*width,
      y:Math.random()*height,
      vx:(Math.random()-0.5)*0.6,
      vy:(Math.random()-0.5)*0.6,
      r:1+Math.random()*2,
      a:0.15+Math.random()*0.35,
    });
  }
  function frame(){
    ctx.clearRect(0,0,width,height);
    for(const d of dots){
      d.x+=d.vx;
      d.y+=d.vy;
      // Wrap around the edges so dots never leave the canvas.
      if(d.x<0)d.x=width;
      if(d.x>width)d.x=0;
      if(d.y<0)d.y=height;
      if(d.y>height)d.y=0;
      ctx.beginPath();
      ctx.arc(d.x,d.y,d.r,0,Math.PI*2);
      ctx.fillStyle=`rgba(180,180,180,${d.a})`;
      ctx.fill();
    }
    // Connect every pair closer than 150px, fading with distance.
    for(let i=0;i<dots.length;i++){
      for(let j=i+1;j<dots.length;j++){
        const dist=Math.hypot(dots[i].x-dots[j].x,dots[i].y-dots[j].y);
        if(dist>=150)continue;
        ctx.beginPath();
        ctx.moveTo(dots[i].x,dots[i].y);
        ctx.lineTo(dots[j].x,dots[j].y);
        ctx.strokeStyle=`rgba(180,180,180,${0.12*(1-dist/150)})`;
        ctx.lineWidth=0.5;
        ctx.stroke();
      }
    }
    requestAnimationFrame(frame);
  }
  frame();
  window.addEventListener('resize',fitToParent);
})();
// === WAVEFORM DEMO ===
// Animated speech-like waveform: three sines at different frequencies under a
// slow amplitude envelope, plus a little random jitter and a faint center line.
(function(){
  const canvas=document.getElementById('waveform-demo');
  if(!canvas)return;
  const ctx=canvas.getContext('2d');
  canvas.width=canvas.parentElement.offsetWidth*2;
  canvas.height=240;
  let tick=0;
  function render(){
    ctx.clearRect(0,0,canvas.width,canvas.height);
    const mid=canvas.height/2;
    // Composite waveform trace.
    ctx.strokeStyle='#555';
    ctx.lineWidth=1.5;
    ctx.beginPath();
    for(let x=0;x<canvas.width;x++){
      const base=Math.sin(x*0.015+tick*0.04)*40;
      const fast=Math.sin(x*0.04+tick*0.06)*20;
      const slow=Math.sin(x*0.008+tick*0.02)*15;
      const envelope=Math.sin(x*0.003+tick*0.01)*0.5+0.5;
      const jitter=(Math.random()-0.5)*3;
      const y=mid+(base+fast+slow)*envelope+jitter;
      if(x===0){
        ctx.moveTo(x,y);
      }else{
        ctx.lineTo(x,y);
      }
    }
    ctx.stroke();
    // Faint horizontal zero line.
    ctx.strokeStyle='rgba(255,255,255,0.1)';
    ctx.lineWidth=1;
    ctx.beginPath();
    ctx.moveTo(0,mid);
    ctx.lineTo(canvas.width,mid);
    ctx.stroke();
    tick++;
    requestAnimationFrame(render);
  }
  render();
})();
// === CODEC PIPELINE ANIMATION ===
// When the pipeline scrolls into view, reveal each arrow and its matching
// data label in lockstep, one pair every 200ms.
// (classList.add is idempotent, so repeat reveals are harmless no-ops.)
(function(){
  const container=document.getElementById('codec-pipeline');
  if(!container)return;
  const arrows=container.querySelectorAll('.pf-arrow');
  const data=container.querySelectorAll('.pf-data');
  const boxes=container.querySelectorAll('.pf-box');
  let i=0;
  function step(){
    if(i>=arrows.length)return;
    arrows[i].classList.add('show');
    if(i<data.length)data[i].classList.add('show');
    i++;
    setTimeout(step,200);
  }
  const obs=new IntersectionObserver((entries)=>{
    if(entries[0].isIntersecting){
      step();
      obs.unobserve(container);
    }
  },{threshold:0.3});
  obs.observe(container);
})();
// === NEURAL NETWORK: LAYERS (SVG-based with animated particles) ===
// Builds a 5-layer network diagram (static connections + nodes) and, once
// visible, animates 12 particles traveling along the connections.
(function(){
  const svg=document.getElementById('nn-layers-svg');
  if(!svg)return;
  const layers=[4,7,7,5,3];      // node count per layer
  const layerX=[80,240,390,530,640]; // x position of each layer column
  const topPad=40;
  // y coordinate of node i in layer l, vertically centered in a 340px band.
  function nodeY(l,i){
    const bandHeight=layers[l]*38;
    return (340-bandHeight)/2+topPad+i*38;
  }
  const parts=[];
  // Static connections between every node pair in adjacent layers.
  for(let l=0;l+1<layers.length;l++){
    for(let a=0;a<layers[l];a++){
      for(let b=0;b<layers[l+1];b++){
        parts.push(`<line x1="${layerX[l]}" y1="${nodeY(l,a)}" x2="${layerX[l+1]}" y2="${nodeY(l+1,b)}" stroke="rgba(136,136,136,0.15)" stroke-width="0.7"/>`);
      }
    }
  }
  // Nodes, drawn after connections so they sit on top.
  layers.forEach((count,l)=>{
    const isInput=l===0;
    const isOutput=l===layers.length-1;
    const fill=isInput?'rgba(136,136,136,0.25)':isOutput?'rgba(45,138,78,0.25)':'rgba(136,136,136,0.15)';
    const stroke=isInput?'#888':isOutput?'#2d8a4e':'rgba(136,136,136,0.4)';
    for(let i=0;i<count;i++){
      parts.push(`<circle cx="${layerX[l]}" cy="${nodeY(l,i)}" r="14" fill="${fill}" stroke="${stroke}" stroke-width="1.5"/>`);
    }
  });
  // Hidden particles that will travel along the connections.
  for(let p=0;p<12;p++){
    parts.push(`<circle class="nn-particle" cx="0" cy="0" r="4" fill="#888" opacity="0.7" style="display:none;"/>`);
  }
  svg.innerHTML+=parts.join('');
  const particles=svg.querySelectorAll('.nn-particle');
  let t=0;
  function animate(){
    particles.forEach((particle,idx)=>{
      // Phase advances slowly; each particle is offset by a quarter segment.
      const phase=(t*0.008+idx*0.25)%(layers.length-1);
      const seg=Math.floor(phase);
      const prog=phase-seg;
      const fromX=layerX[seg];
      const toX=layerX[seg+1];
      const fromY=nodeY(seg,idx%layers[seg]);
      const toY=nodeY(seg+1,idx%layers[seg+1]);
      particle.setAttribute('cx',fromX+(toX-fromX)*prog);
      particle.setAttribute('cy',fromY+(toY-fromY)*prog);
      particle.style.display='';
    });
    t++;
    requestAnimationFrame(animate);
  }
  const obs=new IntersectionObserver(e=>{
    if(e[0].isIntersecting){
      animate();
      obs.unobserve(svg);
    }
  },{threshold:0.3});
  obs.observe(svg);
})();
// === TRAINING ANIMATION ===
// Simulated training loop: exponentially decaying loss with noise, rendered as
// an epoch counter, an error bar, and a scrolling loss chart. Starts when the
// chart scrolls into view and stops after 200 epochs.
(function(){
  const epochEl=document.getElementById('training-epoch');
  const barEl=document.getElementById('training-error-bar');
  const valEl=document.getElementById('training-error-val');
  const chartEl=document.getElementById('training-loss-chart');
  // Guard every element: the original only checked chartEl and would throw a
  // TypeError on a partial DOM when updating the other three.
  if(!chartEl||!epochEl||!barEl||!valEl)return;
  const ctx=chartEl.getContext('2d');
  chartEl.width=chartEl.parentElement.offsetWidth*2;chartEl.height=120;
  let epoch=0;
  const losses=[]; // sliding window of the last 100 loss values
  function tick(){
    epoch++;
    // Exponential decay toward a 0.05 floor plus a little noise.
    const loss=0.95*Math.exp(-epoch*0.015)+0.05+Math.random()*0.02;
    losses.push(loss);
    if(losses.length>100)losses.shift();
    epochEl.textContent='epoch '+epoch;
    barEl.style.width=(loss*100)+'%';
    valEl.textContent=loss.toFixed(3);
    // Draw loss chart (x axis is fixed to 100 slots, matching the window).
    ctx.clearRect(0,0,chartEl.width,chartEl.height);
    ctx.strokeStyle='rgba(85,85,85,0.8)';ctx.lineWidth=1.5;ctx.beginPath();
    losses.forEach((l,i)=>{
      const x=i*(chartEl.width/100);
      const y=chartEl.height-l*chartEl.height;
      i===0?ctx.moveTo(x,y):ctx.lineTo(x,y);
    });
    ctx.stroke();
    // Fill the area under the curve.
    ctx.lineTo(chartEl.width,chartEl.height);ctx.lineTo(0,chartEl.height);ctx.closePath();
    ctx.fillStyle='rgba(85,85,85,0.1)';ctx.fill();
    if(epoch<200)setTimeout(tick,80);
  }
  // Start only once, on first intersection.
  const obs=new IntersectionObserver((entries)=>{
    if(entries[0].isIntersecting){tick();obs.unobserve(chartEl);}
  },{threshold:0.3});
  obs.observe(chartEl);
})();
// === STAT COUNTER ANIMATION ===
// Counts each .big-num element up from 0 to its data-target when it scrolls
// into view, with a cubic ease-out over 1.5s. Targets containing a decimal
// point render with 2 decimals; integer targets render rounded + localized.
(function(){
  const observer=new IntersectionObserver(entries=>{
    entries.forEach(entry=>{
      if(!entry.isIntersecting)return;
      const el=entry.target;
      const raw=el.dataset.target;
      const target=parseFloat(raw);
      if(!Number.isFinite(target))return; // skip malformed/empty targets
      // Decide formatting from the attribute text, not the parsed number:
      // parseFloat("4.0") === 4, so String(target) loses the decimal marker
      // and the original would wrongly format such targets as integers.
      const isFloat=raw.includes('.');
      const duration=1500;
      const start=performance.now();
      function tick(now){
        const t=Math.min((now-start)/duration,1);
        const ease=1-Math.pow(1-t,3); // cubic ease-out
        const val=ease*target;
        el.textContent=isFloat?val.toFixed(2):Math.round(val).toLocaleString();
        if(t<1)requestAnimationFrame(tick);
      }
      requestAnimationFrame(tick);
      observer.unobserve(el); // animate each stat only once
    });
  },{threshold:0.5});
  document.querySelectorAll('.big-num[data-target]').forEach(el=>observer.observe(el));
})();
</script>
<nav class="mobile-nav">
<a href="#" class="mobile-prev">← prev</a>
<span class="current-page"></span>
<a href="#" class="mobile-next">next →</a>
</nav>
</body>
</html>