"""
Simple FastAPI WebSocket app for FLOAT streaming:
- Client uploads one reference image and one audio file over a WebSocket.
- Server streams generated video frames (JPEG) back as binary WS messages.
Protocol (very simple):
  1) Client connects to ws://host:port/ws/float
  2) Client sends a JSON text message with optional params, e.g.:
     {"type":"start", "emo":"neutral", "a_cfg_scale":2.0, "r_cfg_scale":1.0,
      "e_cfg_scale":1.0, "nfe":10, "seed":25, "no_crop": false}
  3) Client sends one binary message with the image bytes (PNG/JPEG/etc.)
  4) Client sends one binary message with the audio bytes (WAV/MP3, anything librosa can load)
  5) Server responds with:
     - a JSON text message {"type":"config", "fps": ..., "total_frames": ...,
       "chunk_size": ..., "duration": ...} once processing starts
     - a JSON text message {"type":"chunk", "len": <int>} before each chunk's frames
     - many binary messages, each containing a single JPEG-encoded frame
     - a final JSON text message {"type":"done"}

There is also a GET / page with a minimal HTML client, and an illustrative
Python client sketch just below the imports.
"""
import io
import json
import os
import sys
import tempfile
from pathlib import Path
from typing import Any, Dict, IO, Optional
import cv2
import numpy as np
import torch
from fastapi import FastAPI, WebSocket, WebSocketDisconnect
from fastapi.responses import HTMLResponse
import uvicorn
# Ensure 'float' directory is on sys.path so that generate.py's
# absolute imports like 'from models....' resolve correctly.
_THIS_DIR = Path(__file__).resolve().parent
if str(_THIS_DIR) not in sys.path:
    sys.path.insert(0, str(_THIS_DIR))
from generate import InferenceAgent, InferenceOptions
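
# ---------------------------------------------------------------------------
# Illustrative client for the protocol described in the module docstring.
# A minimal sketch, not used by the server itself: it assumes the third-party
# `websockets` package is installed and that the server runs on the default
# host/port configured at the bottom of this file.
# ---------------------------------------------------------------------------
async def _example_client(image_path: str, audio_path: str,
                          uri: str = "ws://localhost:8001/ws/float") -> int:
    """Send one image and one audio file, then count the streamed JPEG frames."""
    import websockets  # local import: optional dependency, illustration only

    async with websockets.connect(uri) as ws:
        # 1) start message with optional params
        await ws.send(json.dumps({"type": "start", "emo": "neutral", "nfe": 10}))
        # 2) image bytes, then 3) audio bytes, as two binary messages
        await ws.send(Path(image_path).read_bytes())
        await ws.send(Path(audio_path).read_bytes())
        frames = 0
        while True:
            msg = await ws.recv()
            if isinstance(msg, bytes):  # a single JPEG-encoded frame
                frames += 1
                continue
            info = json.loads(msg)  # "config", "chunk", "done", or "error"
            if info.get("type") in ("done", "error"):
                break
    return frames
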
def build_agent() -> InferenceAgent:
    """Create and return a single global InferenceAgent instance."""
    opts = InferenceOptions().parse()
    # Device setup similar to generate.py
    opts.rank, opts.ngpus = 0, 1
    # Prefer an env-provided checkpoint; fall back to the option/default
    ckpt_env = os.environ.get("FLOAT_CKPT")
    if ckpt_env:
        opts.ckpt_path = ckpt_env
    # Resolve model asset paths relative to this package directory when they
    # are given as relative paths
    base_dir = Path(__file__).resolve().parent

    def _resolve(p: Optional[str]) -> Optional[str]:
        if p is None:
            return None
        pp = Path(p)
        if not pp.is_absolute():
            pp = (base_dir / pp).resolve()
        return str(pp)

    opts.wav2vec_model_path = _resolve(opts.wav2vec_model_path)
    opts.audio2emotion_path = _resolve(opts.audio2emotion_path)
    # If the checkpoint is not set or not found, try float/checkpoints/float.pth
    if not getattr(opts, "ckpt_path", None) or not Path(str(opts.ckpt_path)).exists():
        fallback = base_dir / "checkpoints/float.pth"
        if fallback.exists():
            opts.ckpt_path = str(fallback)
    return InferenceAgent(opts)
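
# For example, to point the server at a specific checkpoint via the environment
# (the path below is hypothetical):
#   FLOAT_CKPT=/models/float.pth python -m float.stream_server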
AGENT = build_agent()
APP = FastAPI(title="FLOAT Streaming Server", version="0.1.0")
INDEX_HTML = """
<!doctype html>
<html>
  <head>
    <meta charset="utf-8" />
    <title>FLOAT Stream Demo</title>
    <style>
      body { font-family: sans-serif; margin: 24px; }
      #frame { max-width: 512px; border: 1px solid #ccc; }
      .row { margin-bottom: 12px; }
    </style>
  </head>
  <body>
    <h2>FLOAT Streaming Demo</h2>
    <div class="row">
      <label>Reference Image: <input type="file" id="img" accept="image/*" /></label>
    </div>
    <div class="row">
      <label>Audio: <input type="file" id="aud" accept="audio/*" /></label>
    </div>
    <div class="row">
      <label>Emotion:
        <select id="emo">
          <option value="neutral">neutral</option>
          <option value="angry">angry</option>
          <option value="disgust">disgust</option>
          <option value="fear">fear</option>
          <option value="happy">happy</option>
          <option value="sad">sad</option>
          <option value="surprise">surprise</option>
        </select>
      </label>
      <label>nfe: <input id="nfe" type="number" min="1" max="50" value="10" style="width:60px"/></label>
      <label>seed: <input id="seed" type="number" value="25" style="width:80px"/></label>
      <label><input id="nocrop" type="checkbox"/> no_crop</label>
    </div>
    <div class="row">
      <button id="btn">Start Streaming</button>
    </div>
    <div class="row">
      <img id="frame" />
      <div id="meta"></div>
    </div>
    <div class="row">
      <audio id="audio" controls style="width:512px"></audio>
    </div>
    <script>
      const btn = document.getElementById('btn');
      const imgEl = document.getElementById('img');
      const audEl = document.getElementById('aud');
      const emoEl = document.getElementById('emo');
      const nfeEl = document.getElementById('nfe');
      const seedEl = document.getElementById('seed');
      const nocropEl = document.getElementById('nocrop');
      const frameEl = document.getElementById('frame');
      const metaEl = document.getElementById('meta');

      // Frame scheduling at target FPS (default 25)
      let framesUrls = [];          // ordered frames (object URLs)
      let displayTimer = null;      // legacy interval (unused)
      let rafHandle = null;         // requestAnimationFrame handle
      let targetFps = 25;
      let lastUrl = null;
      let shownFrames = 0;
      let startedAt = 0;
      let expectedFrames = null;
      let chunkSize = null;
      let firstBinaryAt = null;
      let sendCompletedAt = null;
      let startedPlayback = false;
      let minStartupBufferFrames = 25;  // will be refined dynamically
      let nextFrameIdx = 0;
      let uploadedAudioUrl = null;
      const audioEl = document.getElementById('audio');
      let streamDone = false;
      let wsClosed = false;

      function startDisplayLoop() {
        if (rafHandle) return;
        startedAt = performance.now();
        shownFrames = 0;
        // Start audio in sync
        if (uploadedAudioUrl) {
          audioEl.src = uploadedAudioUrl;
          audioEl.currentTime = 0;
          audioEl.play().catch(() => {});
        }
        const step = () => {
          const ct = audioEl.src ? audioEl.currentTime : ((performance.now() - startedAt) / 1000.0);
          const targetIdx = Math.floor(ct * (targetFps || 25));
          // Display frames up to targetIdx if available
          while (nextFrameIdx <= targetIdx && nextFrameIdx < framesUrls.length) {
            const url = framesUrls[nextFrameIdx];
            if (lastUrl) try { URL.revokeObjectURL(lastUrl); } catch {}
            frameEl.src = url;
            lastUrl = url;
            nextFrameIdx++;
            shownFrames++;
          }
          // Stats: buffer available and approx fps
          const elapsed = (performance.now() - startedAt) / 1000.0;
          const actualFps = elapsed > 0 ? (shownFrames / elapsed).toFixed(1) : '0.0';
          const bufferAvailable = Math.max(0, framesUrls.length - nextFrameIdx);
          const doneFlag = streamDone ? ' | Stream: done' : '';
          metaEl.textContent = `Target FPS: ${targetFps} | Displayed: ${shownFrames}/${expectedFrames ?? '?'} | Buffer frames: ${bufferAvailable} | ~${actualFps} fps${doneFlag}`;
          // Stop when we've consumed all frames and the stream is done
          const expected = (expectedFrames != null) ? expectedFrames : null;
          const allConsumed = (expected != null) ? (nextFrameIdx >= expected) : (streamDone && nextFrameIdx >= framesUrls.length);
          if (allConsumed) {
            stopDisplayLoop();
            return;
          }
          rafHandle = requestAnimationFrame(step);
        };
        rafHandle = requestAnimationFrame(step);
      }

      function stopDisplayLoop() {
        if (rafHandle) {
          cancelAnimationFrame(rafHandle);
          rafHandle = null;
        }
        if (displayTimer) {
          clearInterval(displayTimer);
          displayTimer = null;
        }
        // Revoke the URL of the last displayed frame
        if (lastUrl) {
          try { URL.revokeObjectURL(lastUrl); } catch {}
          lastUrl = null;
        }
        // Free buffered, never-displayed frames
        for (let i = nextFrameIdx; i < framesUrls.length; i++) {
          try { URL.revokeObjectURL(framesUrls[i]); } catch {}
        }
        framesUrls = [];
        nextFrameIdx = 0;
      }
      btn.onclick = async () => {
        if (!imgEl.files[0] || !audEl.files[0]) { alert('Select image and audio'); return; }
        const ws = new WebSocket(`ws://${location.host}/ws/float`);
        ws.binaryType = 'arraybuffer';
        ws.onopen = async () => {
          const start = {
            type: 'start',
            emo: emoEl.value,
            nfe: parseInt(nfeEl.value || '10'),
            seed: parseInt(seedEl.value || '25'),
            no_crop: nocropEl.checked,
          };
          ws.send(JSON.stringify(start));
          // Send the image file
          const imgBuf = await imgEl.files[0].arrayBuffer();
          ws.send(imgBuf);
          // Send the audio file
          const audioFile = audEl.files[0];
          const audBuf = await audioFile.arrayBuffer();
          ws.send(audBuf);
          sendCompletedAt = performance.now();
          // Prepare local audio playback from the uploaded file
          if (uploadedAudioUrl) { try { URL.revokeObjectURL(uploadedAudioUrl); } catch {} }
          uploadedAudioUrl = URL.createObjectURL(audioFile);
        };
        ws.onmessage = (evt) => {
          if (typeof evt.data === 'string') {
            const msg = JSON.parse(evt.data);
            if (msg.type === 'done') {
              console.log('Done');
              streamDone = true;
            }
            if (msg.type === 'config') {
              targetFps = msg.fps || 25;
              expectedFrames = msg.total_frames || null;
              chunkSize = msg.chunk_size || null;
              // Delay starting playback until enough frames are buffered
            }
            if (msg.type === 'chunk') {
              // Can be used to adjust the buffer target
              if (!chunkSize) chunkSize = msg.len;
            }
            return;
          }
          // Binary message: a single JPEG frame
          if (!firstBinaryAt) firstBinaryAt = performance.now();
          const blob = new Blob([evt.data], { type: 'image/jpeg' });
          const url = URL.createObjectURL(blob);
          framesUrls.push(url);
          // Decide when to start playback: ensure a sufficient startup buffer
          if (!startedPlayback && targetFps) {
            // Estimate the startup buffer size from the observed first-frame latency
            const firstLatencyMs = (firstBinaryAt && sendCompletedAt) ? (firstBinaryAt - sendCompletedAt) : 0;
            // At least one full chunk or 1s worth of frames; add extra if latency is high
            const base = chunkSize ? chunkSize : targetFps;
            const dynamic = Math.max(targetFps, Math.min(2 * targetFps, Math.round((firstLatencyMs / 1000) * targetFps)));
            minStartupBufferFrames = Math.max(base, dynamic);
            if (framesUrls.length >= minStartupBufferFrames) {
              startDisplayLoop();
              startedPlayback = true;
            }
          }
        };
        ws.onclose = () => {
          wsClosed = true;
        };
        ws.onerror = () => {
          wsClosed = true;
        };
      };
    </script>
  </body>
</html>
"""
@APP.get("/")
def index():
    return HTMLResponse(INDEX_HTML)
def tensor_frames_to_bgr_uint8(frames: torch.Tensor) -> np.ndarray:
    """Convert torch frames in [-1, 1], [B, T, C, H, W] to np [T, H, W, C] BGR uint8.

    Assumes B == 1. E.g., a (1, 50, 3, 512, 512) tensor maps to a
    (50, 512, 512, 3) uint8 array.
    """
    if frames.ndim == 5:
        frames = frames[0]  # [B, T, C, H, W] -> [T, C, H, W]
    elif frames.ndim == 4:
        pass  # already [T, C, H, W]
    else:
        raise ValueError(f"Unexpected tensor shape: {frames.shape}")
    frames = frames.detach().clamp(-1, 1)
    frames = (frames + 1.0) / 2.0  # [0, 1]
    frames = (frames * 255.0).byte().cpu().numpy()  # [T, C, H, W] uint8
    frames = np.transpose(frames, (0, 2, 3, 1))  # [T, H, W, C] RGB
    # RGB -> BGR for OpenCV; copy to a contiguous array because the reversed
    # view has negative strides, which cv2.imencode may reject.
    frames_bgr = np.ascontiguousarray(frames[..., ::-1])
    return frames_bgr
async def stream_frames_over_ws(ws: WebSocket, image_path: Path, audio_path: Path, params: Dict[str, Any]):
    # Build input tensors using the same preprocessing as generate.py
    data = AGENT.data_processor.preprocess(
        str(image_path), str(audio_path), no_crop=bool(params.get("no_crop", False))
    )
    # Compute the expected total frames from the audio length
    a = data["a"]  # shape [1, samples]
    sampling_rate = AGENT.opt.sampling_rate
    fps = int(AGENT.opt.fps)
    total_frames = int(np.ceil(a.shape[-1] * fps / sampling_rate))
    duration_sec = float(a.shape[-1]) / float(sampling_rate)
    # The typical chunk size equals wav2vec_sec * fps
    chunk_size = int(AGENT.opt.wav2vec_sec * fps)
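    # Worked example (illustrative numbers, not necessarily the model's defaults):
    # 4.0 s of audio at sampling_rate=16000 gives a.shape[-1] == 64000 samples;
    # at fps=25 that is ceil(64000 * 25 / 16000) == 100 total frames, and with
    # wav2vec_sec=2.0 the stream arrives in chunks of 2.0 * 25 == 50 frames.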
    # Inform the client about the stream configuration
    await ws.send_text(json.dumps({
        "type": "config",
        "fps": fps,
        "total_frames": total_frames,
        "chunk_size": chunk_size,
        "duration": duration_sec,
    }))
    # Iterate over streaming chunks
    chunk_iter = AGENT.G.inference_stream(
        data,
        a_cfg_scale=float(params.get("a_cfg_scale", AGENT.opt.a_cfg_scale)),
        r_cfg_scale=float(params.get("r_cfg_scale", AGENT.opt.r_cfg_scale)),
        e_cfg_scale=float(params.get("e_cfg_scale", AGENT.opt.e_cfg_scale)),
        emo=params.get("emo"),
        nfe=int(params.get("nfe", AGENT.opt.nfe)),
        seed=int(params.get("seed", AGENT.opt.seed)),
    )
    for out in chunk_iter:
        frames = out["d_hat"]  # [B, chunk_len, C, H, W] or [chunk_len, C, H, W]
        chunk_len = frames.shape[0] if frames.ndim == 4 else frames.shape[1]
        # Notify the client of chunk metadata before sending its frames
        await ws.send_text(json.dumps({"type": "chunk", "len": int(chunk_len)}))
        frames_bgr = tensor_frames_to_bgr_uint8(frames)
        for t in range(frames_bgr.shape[0]):
            ok, jpg = cv2.imencode(".jpg", frames_bgr[t], [int(cv2.IMWRITE_JPEG_QUALITY), 85])
            if not ok:
                continue
            await ws.send_bytes(jpg.tobytes())
    await ws.send_text(json.dumps({"type": "done"}))
@APP.websocket("/ws/float")
async def ws_float(websocket: WebSocket):
    await websocket.accept()
    # NamedTemporaryFile is a factory function, not a type, so annotate the
    # handles as IO[bytes] instead.
    img_tmp: Optional[IO[bytes]] = None
    aud_tmp: Optional[IO[bytes]] = None
    try:
        # 1) Expect a text JSON message with params
        first = await websocket.receive()
        if "text" not in first:
            await websocket.close(code=1003)
            return
        params = json.loads(first["text"]) if first.get("text") else {}
        # 2) Expect a binary image message
        img_msg = await websocket.receive()
        if "bytes" not in img_msg:
            await websocket.close(code=1003)
            return
        img_tmp = tempfile.NamedTemporaryFile(suffix=".img", delete=False)
        img_tmp.write(img_msg["bytes"])
        img_tmp.flush()
        # 3) Expect a binary audio message
        aud_msg = await websocket.receive()
        if "bytes" not in aud_msg:
            await websocket.close(code=1003)
            return
        aud_tmp = tempfile.NamedTemporaryFile(suffix=".aud", delete=False)
        aud_tmp.write(aud_msg["bytes"])
        aud_tmp.flush()
        await stream_frames_over_ws(
            websocket, Path(img_tmp.name), Path(aud_tmp.name), params
        )
    except WebSocketDisconnect:
        pass
    except Exception as e:
        try:
            await websocket.send_text(json.dumps({"type": "error", "message": str(e)}))
        except Exception:
            pass
    finally:
        if img_tmp is not None:
            try:
                img_tmp.close()
                os.unlink(img_tmp.name)
            except Exception:
                pass
        if aud_tmp is not None:
            try:
                aud_tmp.close()
                os.unlink(aud_tmp.name)
            except Exception:
                pass
if __name__ == "__main__":
    # Run with: python -m float.stream_server
    uvicorn.run("float.stream_server:APP", host="0.0.0.0", port=8001, reload=False)
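    # Equivalently, via the uvicorn CLI (assuming this file lives in a package
    # named `float` on the Python path, as the import string above implies):
    #   uvicorn float.stream_server:APP --host 0.0.0.0 --port 8001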