VibeVoice_server/Dockerfile at main · ValyrianTech/VibeVoice_server · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
# =============================================================================
# Stage 1: Builder - creates the Python virtual environment with all packages
# =============================================================================
# The base image can be overridden at build time (--build-arg DOCKER_FROM=...).
# Because it is declared before the first FROM, the runtime stage's FROM line
# resolves to the same image.
ARG DOCKER_FROM=nvidia/cuda:12.8.0-runtime-ubuntu22.04
FROM ${DOCKER_FROM} AS builder

# Build-time only: stops apt from prompting; not persisted in the image env.
ARG DEBIAN_FRONTEND=noninteractive

# Python plus the toolchain needed to create the venv and install packages.
# The apt lists are deleted in the same layer so they never reach the image.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        build-essential \
        git \
        python3 \
        python3-dev \
        python3-pip \
        python3-venv \
    && rm -rf /var/lib/apt/lists/*

# Isolated virtual environment. Its bin directory goes first on PATH so that
# every later `pip` / `python` call in this stage resolves to the venv.
RUN python3 -m venv /opt/venv
ENV PATH="/opt/venv/bin:${PATH}"

# Bring the packaging tools inside the venv up to date
RUN pip install --no-cache-dir --upgrade pip setuptools wheel

# Install PyTorch with CUDA 12.8 (required for RTX 5090/Blackwell sm_120).
# The pre-built flash-attention wheel installed later in this stage targets
# PyTorch 2.10, so torch is constrained to that release series instead of
# floating to whatever is newest on the index; a newer torch would otherwise be
# paired with a wheel compiled for a different version.
# Override with --build-arg TORCH_VERSION=... when switching to another wheel.
# torchaudio is left unpinned so pip resolves the build matching the chosen torch.
ARG TORCH_VERSION=2.10.*
RUN pip install --no-cache-dir "torch==${TORCH_VERSION}" torchaudio --index-url https://download.pytorch.org/whl/cu128

# Fetch the VibeVoice sources into /build/VibeVoice and install them into the
# venv in editable mode, so the venv refers to this checkout rather than
# holding its own copy of the package.
WORKDIR /build
RUN git clone https://github.com/vibevoice-community/VibeVoice.git \
    && pip install --no-cache-dir -e ./VibeVoice

# Remaining Python packages, installed into the venv in a single layer
# (listed alphabetically to keep diffs small).
RUN pip install --no-cache-dir \
        fastapi \
        huggingface_hub \
        openai-whisper \
        pydub \
        python-magic \
        python-multipart \
        soundfile \
        transformers \
        uvicorn

# Install flash-attention from pre-built wheel (PyTorch 2.10 + CUDA 12.8 + Python 3.10).
# This step is best-effort: a failed install must not abort the build, but it is
# reported on stderr so the missing package is visible in the build log instead
# of being swallowed silently.
RUN pip install --no-cache-dir https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/v0.7.12/flash_attn-2.6.3+cu128torch2.10-cp310-cp310-linux_x86_64.whl \
    || echo "WARNING: flash-attn wheel could not be installed; continuing without it" >&2

# =============================================================================
# Stage 2: Runtime - same base image, without compilers or development headers
# =============================================================================
FROM ${DOCKER_FROM} AS runtime

# Build-time only: stops apt from prompting; not persisted in the image env.
ARG DEBIAN_FRONTEND=noninteractive

LABEL github_repo="https://github.com/vibevoice-community/VibeVoice"

# Packages needed at run time only: the Python interpreter, audio tools and
# libraries, libmagic, and git with LFS support. The apt caches are cleaned in
# the same layer, and git-lfs is initialised once packages are in place.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        ffmpeg \
        git \
        git-lfs \
        libmagic1 \
        libsndfile1 \
        libsox-fmt-all \
        python3 \
        python3-pip \
        sox \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/* \
    && git lfs install

# Copy virtual environment from builder (same path, so the venv's scripts keep working)
COPY --from=builder /opt/venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"

# Copy VibeVoice installation to /app (not /workspace which RunPod overwrites)
COPY --from=builder /build/VibeVoice /app/VibeVoice

# The builder installed VibeVoice in editable mode from /build/VibeVoice. That
# path does not exist in this stage, so the venv's reference to the sources is
# dangling here. Put the relocated checkout on the module search path so the
# package can be imported from its new location.
# NOTE(review): assumes the importable package sits at the repository root, and
# that start.sh does not already handle this - confirm.
ENV PYTHONPATH=/app/VibeVoice

ENV SHELL=/bin/bash

# Directory layout: server output and resource folders live under /app, while
# the model directory is created under /workspace (models can be on the RunPod
# network volume).
RUN mkdir -p \
        /workspace/models/vibevoice \
        /app/VibeVoice/server/resources \
        /app/VibeVoice/server/outputs

# Drop any server.py / start.sh that came with the cloned repository, then
# add this project's own server files and the demo speaker sample.
RUN rm -f /app/VibeVoice/server/server.py /app/VibeVoice/server/start.sh 2>/dev/null || true
COPY server.py start.sh install_models.sh /app/VibeVoice/server/
COPY demo_speaker0.mp3 /app/VibeVoice/server/resources/

# Strip a trailing carriage return from every line of both shell scripts (in
# case they were saved with Windows CRLF endings), mark them executable, and
# print start.sh to the build log.
RUN sed -i 's/\r$//' \
        /app/VibeVoice/server/start.sh \
        /app/VibeVoice/server/install_models.sh \
    && chmod +x \
        /app/VibeVoice/server/start.sh \
        /app/VibeVoice/server/install_models.sh \
    && cat /app/VibeVoice/server/start.sh

# Default model and tokenizer locations, both under /workspace so they can
# live on the network volume.
ENV VIBEVOICE_MODEL_PATH=/workspace/models/vibevoice/VibeVoice-Large \
    VIBEVOICE_TOKENIZER_PATH=/workspace/models/vibevoice/tokenizer

# Containers start in the server directory
WORKDIR /app/VibeVoice/server

# Port the service is expected to listen on (EXPOSE only documents it; it does
# not publish the port)
EXPOSE 7860

# Launch through the start script, run explicitly with bash
ENTRYPOINT ["/bin/bash", "/app/VibeVoice/server/start.sh"]