1 change: 1 addition & 0 deletions packages.txt
@@ -0,0 +1 @@
ffmpeg
3 changes: 3 additions & 0 deletions speech-to-text/.gitignore
@@ -0,0 +1,3 @@
audio.mp3
video.MKV
/src/nb.ipynb
33 changes: 33 additions & 0 deletions speech-to-text/.streamlit/config.toml
@@ -0,0 +1,33 @@
# https://docs.streamlit.io/develop/api-reference/configuration/config.toml

[server]
# headless = true
# runOnSave = true
# allowRunOnSave = true
# # fastReruns = true
# # fileWatcherType = "auto"
# # fileWatcherType = "watchdog"
# fileWatcherType = "poll"


[theme]
base = "dark"

# Used to style primary interface elements. It's the color displayed most frequently across your app's
# screens and components. Examples of widgets using this color are st.slider and st.checkbox.
# primaryColor = "#919e8b" # green
# primaryColor = "purple"

# Background color for the main container.
# backgroundColor = "rgba(254,248,239,1)" # sepia yellow

# Used as the background for most widgets. Examples of widgets with this background are st.sidebar,
# st.text_input, st.date_input.
# secondaryBackgroundColor = "#ebd2b9" # wheat "#d7ab82" # yellow <-- not enough contrast

# Font color for the page.
# textColor = "#6e7074" # grey

# Font family (serif | sans serif | mono) for the page. Will not impact code areas.
# Default: "sans serif"
font = "sans serif"
21 changes: 21 additions & 0 deletions speech-to-text/Dockerfile
@@ -0,0 +1,21 @@
ARG PYTHON_VERSION=3.11.4

FROM python:${PYTHON_VERSION}-slim as base

ENV PYTHONDONTWRITEBYTECODE=1
ENV PYTHONUNBUFFERED=1

WORKDIR /app

COPY requirements.txt ./requirements.txt

# Install ffmpeg and the Python dependencies
RUN apt update && apt install ffmpeg -y && python -m pip install -r requirements.txt

EXPOSE 8501

# Copy the source code into the container.
COPY . .

# Run the application.
ENTRYPOINT ["streamlit", "run", "app.py"]
47 changes: 47 additions & 0 deletions speech-to-text/README.md
@@ -0,0 +1,47 @@
# Speech to text transcription


## Objective
This project transcribes audio files using the [openai-whisper](https://github.com/openai/whisper) model. Users can also enter a valid YouTube URL for transcription.


## Public Endpoints for Deployed App

Application is deployed on streamlit cloud [here](https://transcribe-whisper.streamlit.app/).

You can deploy it on your own easily and (possibly) free of charge on cloud. Scroll down to `Docker Playground Cloud Deployment` in `Deployment` section.


## Code Structure / Services
- `app` - Complete application code built in streamlit.
- `packages` - List of linux dependencies required to deploy code on streamlit cloud.

suggestion (typo): Capitalize proper nouns like "Linux" and brand names consistently.

For example, you could rephrase this bullet as: "List of Linux dependencies required to deploy the code on Streamlit Cloud."

Suggested implementation:

Application is deployed on Streamlit Cloud [here](https://transcribe-whisper.streamlit.app/).

- `app` - Complete application code built in Streamlit.
- `packages` - List of Linux dependencies required to deploy the code on Streamlit Cloud.
- `docker-compose` - Compose file which starts the application.

- `docker-compose` - Compose file which starts application.


## Deployment
- Local deployment
- Install Docker. Instructions are available [here](https://docs.docker.com/engine/install/). Make sure Docker is up and running before proceeding.
- Install Git. Instructions [here](https://git-scm.com/book/en/v2/Getting-Started-Installing-Git).
- Clone the repo and run Compose
```
git clone https://github.com/uditmanav17/ML-Projects.git && cd ./ML-Projects
git switch whisper && cd ./speech-to-text
docker compose --profile app up
```
- `--profile app` will start the app on `localhost:8501`.

- Docker Playground Cloud Deployment
- Navigate to [docker playground](https://labs.play-with-docker.com/).
- Log in using your Docker account. Click `Start`; this will direct you to a new page.
- Click `Add New Instance` in the left pane, then run the following commands in the terminal -
```
git clone https://github.com/uditmanav17/ML-Projects.git && cd ./ML-Projects
git switch whisper && cd ./speech-to-text
docker compose --profile app up
```
- To access the application, click the port number next to the `OPEN PORT` button.


## Future work / Improvements
- Add an option to download the transcription as a subtitles (`.srt`) file.
- Implement streaming transcription.
167 changes: 167 additions & 0 deletions speech-to-text/app.py
@@ -0,0 +1,167 @@
# ruff: noqa: F401
import contextlib
from datetime import timedelta
from pathlib import Path

import pyperclip
import streamlit as st
import torch
import whisper
import yt_dlp

st.set_page_config(
page_title="Speech to Text",
page_icon="🔉",
layout="centered",
initial_sidebar_state="auto",
)


@st.cache_resource
def load_model():
# model sizes
# https://github.com/openai/whisper?tab=readme-ov-file#available-models-and-languages
return whisper.load_model("base")


def duration_check(info, *, incomplete):
"""Download only videos less than 10 minute (or with unknown duration)"""
duration = info.get("duration")
if duration and duration > 601: # 10 mins limit
return "The video is too long"


def download_yt_audio(yt_url: str):
"""Download audio from given youtube video URL"""
# convert cli to python args - https://github.com/yt-dlp/yt-dlp/blob/master/devscripts/cli_to_api.py
ydl_opts = {
"match_filter": duration_check,
"format": "m4a/bestaudio/best",
"outtmpl": {"default": "audio.%(ext)s"},
# "cookiesfrombrowser": (None, None, None, None),
"cachedir": False,
"verbose": True,
"postprocessors": [
{ # Extract audio using ffmpeg
"key": "FFmpegExtractAudio",
"preferredcodec": "m4a",
}
],
}
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
error_code = ydl.download(yt_url)
print(error_code)


def postprocess_transcription(predictions: dict, include_timestamps: bool):
"""
Postprocesses the transcription predictions to include timestamps if specified.

Args:
predictions (dict): The transcription predictions.
include_timestamps (bool): Flag to indicate whether to include timestamps.

Returns:
str: The postprocessed transcription with or without timestamps.
"""

if not include_timestamps:
return predictions.get("text")
result = []
for segment in predictions.get("segments", {}):

issue (bug_risk): Use a list as the default for segments to avoid iterating over a dict's keys.

If segments is missing, predictions.get("segments", {}) returns a dict, so the loop iterates over its keys (strings) and segment["start"] will raise at runtime. Using an empty list as the default keeps the type consistent with Whisper’s output and avoids this failure:

for segment in predictions.get("segments", []):
    ...

startTime = str(0) + str(timedelta(seconds=int(segment["start"]))) + ",000"
endTime = str(0) + str(timedelta(seconds=int(segment["end"]))) + ",000"
text = segment["text"]
if not text.strip():
continue
segmentId = segment["id"] + 1
segment = f"{segmentId}\n{startTime} --> {endTime}\n{text.strip()}\n\n"

result.append(segment)
return "".join(result)


def main():
st.title("Audio|YouTube video Transcription")

st.sidebar.title("Settings")
with_timestamps = st.sidebar.selectbox("Include Timestamps", ["Yes", "No"], index=1)
language_options = {None: "Auto-detect", "en": "English"}
language = st.sidebar.selectbox(
"Select Language",
language_options.keys(),
format_func=lambda x: language_options.get(x),
)

# load model
model = load_model()
# YT video link input
text_input = st.text_input(label="Enter valid Youtube video URL")

# audio upload
audio = st.file_uploader(
"Upload an audio or short video file",
type=["mp3", "m4a", "mkv", "mp4"],
)
submit_button = st.button(label="Transcribe")

# submit with video link or uploaded audio
if submit_button and (text_input or audio):
# if previous audio exists, remove it
audio_path = Path("./audio.m4a")
audio_path.unlink(missing_ok=True)
toast_msg = st.toast("Model is running!", icon="🏃")
# download audio from YT video url
if text_input:
with contextlib.suppress(Exception):
download_yt_audio(text_input)
elif audio is not None:
# save uploaded audio
bytes_data = audio.getvalue()
with open("./audio.m4a", "wb") as f:
f.write(bytes_data)

if not audio_path.exists():
st.error(
"""Audio file generation failed! Please recheck YouTube URL or uploaded file.
YT videos only upto 10 mins are supported.
YouTube may have rate limited due to large number of requests.""",
icon="🚨",
)
else:
# start transcription
st.audio("./audio.m4a", format="audio/mpeg")
try:
with st.spinner("Transcribing..."):
result = model.transcribe(
str(audio_path),
verbose=True,
word_timestamps=True,
language=language,
)

st.success("🎉 Transcription completed successfully! 🎉")
if transcription := postprocess_transcription(
result,
with_timestamps == "Yes",
):
st.session_state.text = transcription
with st.expander("See Transcription"):
st.write(st.session_state.text)
audio_path.unlink()
except Exception:
st.error("Please refresh App")
else:
st.info("Please add YouTube URL or upload audio for transcription", icon="ℹ️")

if st.button("Refresh App"):
audio_path = Path("./audio.m4a")
audio_path.unlink(missing_ok=True)
# re-load model
model = load_model()



if __name__ == "__main__":
load_model()
main()
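Note on the timestamp formatting in `postprocess_transcription`: the `str(0) + str(timedelta(...)) + ",000"` pattern can be checked in isolation. Below is a minimal sketch of the same idea; `format_srt_time` is a hypothetical helper name, and it assumes whole-second precision and segments shorter than 10 hours:

```python
from datetime import timedelta


def format_srt_time(seconds: int) -> str:
    """Format whole seconds as an SRT-style timestamp, e.g. 65 -> "00:01:05,000".

    str(timedelta(seconds=65)) gives "0:01:05"; prefixing a single "0" pads it
    to the HH:MM:SS form SRT expects (valid only for durations under 10 hours),
    and ",000" appends zero milliseconds.
    """
    return "0" + str(timedelta(seconds=int(seconds))) + ",000"


print(format_srt_time(65))   # 00:01:05,000
print(format_srt_time(601))  # 00:10:01,000
```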
19 changes: 19 additions & 0 deletions speech-to-text/docker-compose.yml
@@ -0,0 +1,19 @@

services:

streamlit:
build:
context: .
dockerfile: Dockerfile
command: >
/app/app.py --server.port 8501
Comment on lines +8 to +9

issue (bug_risk): The custom command conflicts with the Dockerfile ENTRYPOINT and likely breaks streamlit startup.

In the Dockerfile, ENTRYPOINT is `["streamlit", "run", "app.py"]`, so this `command` is appended as extra arguments, producing `streamlit run app.py /app/app.py --server.port 8501`, which will not start the app as intended.
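A possible fix (a sketch, assuming the Dockerfile's ENTRYPOINT is kept as-is) is to pass only the extra flags in `command`, since Compose appends them after the entrypoint:

```yaml
  streamlit:
    build:
      context: .
      dockerfile: Dockerfile
    # ENTRYPOINT already runs `streamlit run app.py`; pass only extra flags here
    command: >
      --server.port 8501
```

Alternatively, drop `command` entirely and rely on the Dockerfile's defaults.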

ports:
- "8501:8501"
volumes:
- ./:/app
profiles:
- app

networks:
app:
driver: bridge
Comment on lines +17 to +19

suggestion (bug_risk): The declared app network is not attached to the streamlit service.

As written, streamlit will still use the default network because it never references app. If this network is needed, attach it explicitly under the streamlit service:

  streamlit:
    ...
    networks:
      - app

networks:
  app:
    driver: bridge

If you don’t need a custom network, you can remove the networks section entirely.

6 changes: 6 additions & 0 deletions speech-to-text/requirements.txt
@@ -0,0 +1,6 @@
pyperclip==1.9.0
streamlit==1.26.0
# torch==2.2.0
yt-dlp
openai-whisper
# ffmpeg-python