diff --git a/packages.txt b/packages.txt new file mode 100644 index 0000000..20645e6 --- /dev/null +++ b/packages.txt @@ -0,0 +1 @@ +ffmpeg diff --git a/speech-to-text/.gitignore b/speech-to-text/.gitignore new file mode 100644 index 0000000..7813aeb --- /dev/null +++ b/speech-to-text/.gitignore @@ -0,0 +1,3 @@ +audio.mp3 +video.MKV +/src/nb.ipynb \ No newline at end of file diff --git a/speech-to-text/.streamlit/config.toml b/speech-to-text/.streamlit/config.toml new file mode 100644 index 0000000..489062c --- /dev/null +++ b/speech-to-text/.streamlit/config.toml @@ -0,0 +1,33 @@ +# https://docs.streamlit.io/develop/api-reference/configuration/config.toml + +[server] +# headless = true +# runOnSave = true +# allowRunOnSave = true +# # fastReruns = true +# # fileWatcherType = "auto" +# # fileWatcherType = "watchdog" +# fileWatcherType = "poll" + + +[theme] +base = "dark" + +# Used to style primary interface elements. It's the color displayed most frequently across your app's +# screens and components. Examples of widgets using this color are st.slider and st.checkbox. +# primaryColor = "#919e8b" # green +# primaryColor = "purple" + +# Background color for the main container. +# backgroundColor = "rgba(254,248,239,1)" # sepia yellow + +# Used as the background for most widgets. Examples of widgets with this background are st.sidebar, +# st.text_input, st.date_input. +# secondaryBackgroundColor = "#ebd2b9" # wheat "#d7ab82" # yellow <-- not enough contrast + +# Font color for the page. +# textColor = "#6e7074" # grey + +# Font family (serif | sans serif | mono) for the page. Will not impact code areas. 
+# Default: "sans serif" +font = "sans serif" \ No newline at end of file diff --git a/speech-to-text/Dockerfile b/speech-to-text/Dockerfile new file mode 100644 index 0000000..dbe2880 --- /dev/null +++ b/speech-to-text/Dockerfile @@ -0,0 +1,21 @@ +ARG PYTHON_VERSION=3.11.4 + +FROM python:${PYTHON_VERSION}-slim as base + +ENV PYTHONDONTWRITEBYTECODE=1 +ENV PYTHONUNBUFFERED=1 + +WORKDIR /app + +COPY requirements.txt ./requirements.txt + +# Download dependencies using cache mount and bind mount +RUN apt update && apt install ffmpeg -y && python -m pip install -r requirements.txt + +EXPOSE 8501 + +# Copy the source code into the container. +COPY . . + +# Run the application. +ENTRYPOINT ["streamlit", "run", "app.py"] \ No newline at end of file diff --git a/speech-to-text/README.md b/speech-to-text/README.md new file mode 100644 index 0000000..13223e6 --- /dev/null +++ b/speech-to-text/README.md @@ -0,0 +1,47 @@ +# Speech to text transcription + + +## Objective +This project aims to transcribe audio file using [openai-whisper](https://github.com/openai/whisper) model. Users can also enter a valid YouTube URL for transcription. + + +## Public Endpoints for Deployed App + +Application is deployed on streamlit cloud [here](https://transcribe-whisper.streamlit.app/). + +You can deploy it on your own easily and (possibly) free of charge on cloud. Scroll down to `Docker Playground Cloud Deployment` in `Deployment` section. + + +## Code Structure / Services +- `app` - Complete application code built in streamlit. +- `packages` - List of linux dependencies required to deploy code on streamlit cloud. +- `docker-compose` - Compose file which starts application. + + +## Deployment +- Local deployment + - Install Docker. Instructions available [here](https://docs.docker.com/engine/install/). Make sure docker is up and running before proceeding. + - Install Git. Instructions [here](https://git-scm.com/book/en/v2/Getting-Started-Installing-Git). 
# ruff: noqa: F401
import contextlib
from datetime import timedelta
from pathlib import Path

import pyperclip
import streamlit as st
import torch
import whisper
import yt_dlp

st.set_page_config(
    page_title="Speech to Text",
    page_icon="🔉",
    layout="centered",
    initial_sidebar_state="auto",
)


@st.cache_resource
def load_model():
    """Load the Whisper "base" model, cached once per server process.

    ``st.cache_resource`` keeps the model in memory across Streamlit
    reruns instead of reloading it on every interaction.
    """
    # model sizes
    # https://github.com/openai/whisper?tab=readme-ov-file#available-models-and-languages
    return whisper.load_model("base")


def duration_check(info, *, incomplete):
    """yt-dlp ``match_filter``: accept only videos of at most 10 minutes
    (videos with unknown duration are also accepted).

    Args:
        info: Video metadata dict supplied by yt-dlp.
        incomplete: Keyword-only flag passed by yt-dlp (unused here).

    Returns:
        A rejection message string when the video exceeds the limit,
        or ``None`` to allow the download.
    """
    duration = info.get("duration")
    # Fix: was ``> 601``, which let a 10:01 video slip past the
    # 10-minute ceiling promised by the UI error message.
    if duration and duration > 600:
        return "The video is too long"
    return None


def download_yt_audio(yt_url: str):
    """Download the audio track of a YouTube video to ``./audio.m4a``.

    Videos longer than 10 minutes are rejected by ``duration_check``.

    Args:
        yt_url: URL of the YouTube video to download.
    """
    # convert cli to python args -
    # https://github.com/yt-dlp/yt-dlp/blob/master/devscripts/cli_to_api.py
    ydl_opts = {
        "match_filter": duration_check,
        "format": "m4a/bestaudio/best",
        "outtmpl": {"default": "audio.%(ext)s"},
        # "cookiesfrombrowser": (None, None, None, None),
        "cachedir": False,
        "verbose": True,
        "postprocessors": [
            {  # Extract audio using ffmpeg
                "key": "FFmpegExtractAudio",
                "preferredcodec": "m4a",
            }
        ],
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        error_code = ydl.download(yt_url)
        # Non-zero on failure; surfaced in the server log only, the UI
        # detects failure by the absence of ./audio.m4a.
        print(error_code)
def _format_srt_timestamp(seconds: float) -> str:
    """Render a second offset as an SRT timestamp ``HH:MM:SS,000``.

    Whisper segment times are floats; they are truncated to whole
    seconds, so the millisecond field is always ``,000`` (matching the
    original behaviour). Fixes the previous ``"0" + str(timedelta(...))``
    hack, which produced ill-formed ``010:MM:SS`` for hours >= 10.
    """
    total = int(seconds)
    hours, remainder = divmod(total, 3600)
    minutes, secs = divmod(remainder, 60)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},000"


def postprocess_transcription(predictions: dict, include_timestamps: bool):
    """
    Postprocesses the transcription predictions to include timestamps if specified.

    Args:
        predictions (dict): The transcription predictions; expected keys
            are "text" and, when timestamps are requested, "segments"
            (each segment having "id", "start", "end", "text").
        include_timestamps (bool): Flag to indicate whether to include timestamps.

    Returns:
        str: The postprocessed transcription — numbered SRT-style blocks
        when timestamps are requested, plain text otherwise.
    """
    if not include_timestamps:
        return predictions.get("text")

    blocks = []
    for segment in predictions.get("segments", []):
        text = segment["text"].strip()
        if not text:  # skip silence-only segments
            continue
        start_time = _format_srt_timestamp(segment["start"])
        end_time = _format_srt_timestamp(segment["end"])
        block_id = segment["id"] + 1  # SRT numbering is 1-based
        blocks.append(f"{block_id}\n{start_time} --> {end_time}\n{text}\n\n")
    return "".join(blocks)


def main():
    """Render the Streamlit UI and drive the transcription workflow."""
    st.title("Audio|YouTube video Transcription")

    st.sidebar.title("Settings")
    with_timestamps = st.sidebar.selectbox("Include Timestamps", ["Yes", "No"], index=1)
    # None (Auto-detect) lets Whisper guess the language.
    language_options = {None: "Auto-detect", "en": "English"}
    language = st.sidebar.selectbox(
        "Select Language",
        language_options.keys(),
        format_func=lambda x: language_options.get(x),
    )

    # load model (cached, see load_model)
    model = load_model()
    # YT video link input
    text_input = st.text_input(label="Enter valid Youtube video URL")

    # audio upload
    audio = st.file_uploader(
        "Upload an audio or short video file",
        type=["mp3", "m4a", "mkv", "mp4"],
    )
    submit_button = st.button(label="Transcribe")

    # submit with video link or uploaded audio
    if submit_button and (text_input or audio):
        # if previous audio exists, remove it
        audio_path = Path("./audio.m4a")
        audio_path.unlink(missing_ok=True)
        toast_msg = st.toast("Model is running!", icon="🏃")
        # download audio from YT video url
        if text_input:
            # best-effort: on any yt-dlp failure we fall through to the
            # "file missing" error below instead of crashing the app
            with contextlib.suppress(Exception):
                download_yt_audio(text_input)
        elif audio is not None:
            # save uploaded audio
            # NOTE(review): the upload is written as .m4a regardless of
            # its real container; ffmpeg/whisper sniff the actual format,
            # so transcription still works — confirm if renaming matters.
            bytes_data = audio.getvalue()
            with open("./audio.m4a", "wb") as f:
                f.write(bytes_data)

        if not audio_path.exists():
            st.error(
                """Audio file generation failed! Please recheck YouTube URL or uploaded file.
                YT videos only upto 10 mins are supported.
                YouTube may have rate limited due to large number of requests.""",
                icon="🚨",
            )
        else:
            # start transcription
            # Fix: was format="audio/mpeg"; .m4a is MP4 audio.
            st.audio("./audio.m4a", format="audio/mp4")
            try:
                with st.spinner("Transcribing..."):
                    result = model.transcribe(
                        str(audio_path),
                        verbose=True,
                        word_timestamps=True,
                        language=language,
                    )

                st.success("🎉 Transcription completed successfully! 🎉")
                if transcription := postprocess_transcription(
                    result,
                    with_timestamps == "Yes",
                ):
                    st.session_state.text = transcription
                    with st.expander("See Transcription"):
                        st.write(st.session_state.text)
                audio_path.unlink()
            except Exception:
                st.error("Please refresh App")
    else:
        # Fix: icon was mojibake ("â„šī¸") for the info emoji.
        st.info("Please add YouTube URL or upload audio for transcription", icon="ℹ️")

    if st.button("Refresh App"):
        audio_path = Path("./audio.m4a")
        audio_path.unlink(missing_ok=True)
        # re-load model
        model = load_model()


if __name__ == "__main__":
    load_model()
    main()