From e178c6ba7f8b1fd8b0345ae849a1d13c4e67e0e9 Mon Sep 17 00:00:00 2001 From: Udit Manav Date: Sun, 4 Aug 2024 16:26:03 +0530 Subject: [PATCH 01/23] whisper app v1 --- speech-to-text/.gitignore | 3 + speech-to-text/Dockerfile | 21 ++++ speech-to-text/docker-compose.yml | 19 ++++ speech-to-text/requirements.txt | 5 + speech-to-text/src/app.py | 157 ++++++++++++++++++++++++++++++ 5 files changed, 205 insertions(+) create mode 100644 speech-to-text/.gitignore create mode 100644 speech-to-text/Dockerfile create mode 100644 speech-to-text/docker-compose.yml create mode 100644 speech-to-text/requirements.txt create mode 100644 speech-to-text/src/app.py diff --git a/speech-to-text/.gitignore b/speech-to-text/.gitignore new file mode 100644 index 0000000..7813aeb --- /dev/null +++ b/speech-to-text/.gitignore @@ -0,0 +1,3 @@ +audio.mp3 +video.MKV +/src/nb.ipynb \ No newline at end of file diff --git a/speech-to-text/Dockerfile b/speech-to-text/Dockerfile new file mode 100644 index 0000000..5eebf52 --- /dev/null +++ b/speech-to-text/Dockerfile @@ -0,0 +1,21 @@ +ARG PYTHON_VERSION=3.11.4 + +FROM python:${PYTHON_VERSION}-slim as base + +ENV PYTHONDONTWRITEBYTECODE=1 +ENV PYTHONUNBUFFERED=1 + +WORKDIR /app + +COPY requirements.txt ./requirements.txt + +# Download dependencies using cache mount and bind mount +RUN python -m pip install -r requirements.txt + +EXPOSE 8501 + +# Copy the source code into the container. +COPY . . + +# Run the application. +ENTRYPOINT ["streamlit", "run", "app.py"] \ No newline at end of file diff --git a/speech-to-text/docker-compose.yml b/speech-to-text/docker-compose.yml new file mode 100644 index 0000000..507fadb --- /dev/null +++ b/speech-to-text/docker-compose.yml @@ -0,0 +1,19 @@ + +services: + + streamlit: + build: + context: . 
+ dockerfile: Dockerfile + command: > + /app/app.py --server.port 8501 + ports: + - "8501:8501" + volumes: + - ./:/app + profiles: + - app + +networks: + app: + driver: bridge \ No newline at end of file diff --git a/speech-to-text/requirements.txt b/speech-to-text/requirements.txt new file mode 100644 index 0000000..edee4f9 --- /dev/null +++ b/speech-to-text/requirements.txt @@ -0,0 +1,5 @@ +pyperclip==1.9.0 +streamlit==1.26.0 +yt-dlp +openai-whisper +pytorch diff --git a/speech-to-text/src/app.py b/speech-to-text/src/app.py new file mode 100644 index 0000000..16b7d94 --- /dev/null +++ b/speech-to-text/src/app.py @@ -0,0 +1,157 @@ +# ruff: noqa: F401 +from datetime import timedelta +from pathlib import Path + +import pyperclip +import streamlit as st +import torch +import whisper +import yt_dlp + +st.set_page_config( + page_title="Speech to Text", + page_icon="🔉", + layout="centered", + initial_sidebar_state="auto", +) + + +@st.cache_resource +def load_model(): + return whisper.load_model("base") + + +def duration_check(info, *, incomplete): + """Download only videos less than 10 minute (or with unknown duration)""" + duration = info.get("duration") + if duration and duration > 601: # 10 mins limit + return "The video is too long" + + +def download_yt_audio(yt_url: str): + """Download audio from given youtube video URL""" + # convert cli to python args - https://github.com/yt-dlp/yt-dlp/blob/master/devscripts/cli_to_api.py + ydl_opts = { + "match_filter": duration_check, + "format": "m4a/bestaudio/best", + "outtmpl": {"default": "audio.%(ext)s"}, + "postprocessors": [ + { # Extract audio using ffmpeg + "key": "FFmpegExtractAudio", + "preferredcodec": "m4a", + } + ], + } + with yt_dlp.YoutubeDL(ydl_opts) as ydl: + error_code = ydl.download(yt_url) + print(error_code) + + +def postprocess_transcription(predictions: dict, include_timestamps: bool): + if not include_timestamps: + return predictions.get("text") + result = [] + for segment in predictions.get("segments", 
+ {}): + startTime = str(0) + str(timedelta(seconds=int(segment["start"]))) + ",000" + endTime = str(0) + str(timedelta(seconds=int(segment["end"]))) + ",000" + text = segment["text"] + segmentId = segment["id"] + 1 + segment = f"{segmentId}\n{startTime} --> {endTime}\n{text[1:] if text[0] == ' ' else text}\n\n" + + result.append(segment) + return "".join(result) + + +def main(): + st.title("Audio|YouTube video Transcription") + + st.sidebar.title("Settings") + with_timestamps = st.sidebar.selectbox("Include Timestamps", ["Yes", "No"]) + language = st.sidebar.selectbox( + "Select Language", {"Auto-detect": None, "English": "en"} + ) + + # check GPU + cuda_available = torch.cuda.is_available() + print(cuda_available) + if cuda_available: + st.info("GPU available 🔥 - Transcriptions will be fast!") + else: + st.warning("GPU NOT available 🚨 - Transcriptions might take some time") + + # load model + model = load_model() + transcription = None + + # YT video link input + text_input = st.text_input(label="Enter valid Youtube video URL") + + # audio upload + audio = st.file_uploader( + "Upload an audio or short video file", + type=["mp3", "m4a", "mkv", "mp4"], + ) + submit_button = st.button(label="Transcribe") + + # submit with video link or uploaded audio + if submit_button and (text_input or audio): + st.session_state.text = None + toast_msg = st.toast("Model is running!", icon="🏃") + # download audio from YT video url + if text_input: + try: + download_yt_audio(text_input) + except Exception: + pass + elif audio is not None: + # save uploaded audio + bytes_data = audio.getvalue() + with open("./audio.m4a", "wb") as f: + f.write(bytes_data) + + audio_path = Path("./audio.m4a") + if not audio_path.exists(): + st.error( + """Audio file generation failed! Please recheck YouTube URL or uploaded file. 
+ YT videos only upto 10 mins are supported""", + icon="🚨", + ) + else: + # start transcription + with st.spinner("Transcribing..."): + result = model.transcribe( + str(audio_path), + verbose=True, + word_timestamps=True, + language=language, + ) + transcription = postprocess_transcription(result, with_timestamps == "Yes") + if transcription or st.session_state.text: + st.session_state.text = transcription + with st.expander("See Transcription"): + st.write(st.session_state.text) + # st.code(transcription) + audio_path.unlink() + else: + st.info("Please add YouTube URL or upload audio for transcription", icon="ℹ️") + + # download and copy transcription + # col1, col2 = st.columns([1, 1]) + # with col1: + # copy_btn = st.button("Copy", on_click=update_text, args=[st.session_state.text]) + # if copy_btn: + # pyperclip.copy(transcription) + # st.success("Text copied successfully!") + # with col2: + # if not transcription: + # transcription = "" + # dl_btn = st.download_button( + # "Download", + # transcription, + # "text/plain", + # ) + + +if __name__ == "__main__": + load_model() + main() From abf9b39a0facc0174b749306ba263024931077ae Mon Sep 17 00:00:00 2001 From: Udit Manav Date: Sun, 4 Aug 2024 16:49:47 +0530 Subject: [PATCH 02/23] path update --- speech-to-text/{src => }/app.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename speech-to-text/{src => }/app.py (100%) diff --git a/speech-to-text/src/app.py b/speech-to-text/app.py similarity index 100% rename from speech-to-text/src/app.py rename to speech-to-text/app.py From 9dc9ea1a1b622de55cdf5c86c053ebb99e07b8ff Mon Sep 17 00:00:00 2001 From: Udit Manav Date: Sun, 4 Aug 2024 17:27:36 +0530 Subject: [PATCH 03/23] update reqs --- speech-to-text/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/speech-to-text/requirements.txt b/speech-to-text/requirements.txt index edee4f9..d744f24 100644 --- a/speech-to-text/requirements.txt +++ b/speech-to-text/requirements.txt @@ -1,5 
+1,5 @@ pyperclip==1.9.0 streamlit==1.26.0 +torch==2.2.0 yt-dlp openai-whisper -pytorch From e3fd25ac2d94df8f1a03ddcbe042710124561f36 Mon Sep 17 00:00:00 2001 From: Udit Manav Date: Sun, 4 Aug 2024 18:16:15 +0530 Subject: [PATCH 04/23] minor fix + package for streamlit --- speech-to-text/Dockerfile | 2 +- speech-to-text/app.py | 5 ++++- speech-to-text/packages.txt | 1 + 3 files changed, 6 insertions(+), 2 deletions(-) create mode 100644 speech-to-text/packages.txt diff --git a/speech-to-text/Dockerfile b/speech-to-text/Dockerfile index 5eebf52..dbe2880 100644 --- a/speech-to-text/Dockerfile +++ b/speech-to-text/Dockerfile @@ -10,7 +10,7 @@ WORKDIR /app COPY requirements.txt ./requirements.txt # Download dependencies using cache mount and bind mount -RUN python -m pip install -r requirements.txt +RUN apt update && apt install ffmpeg -y && python -m pip install -r requirements.txt EXPOSE 8501 diff --git a/speech-to-text/app.py b/speech-to-text/app.py index 16b7d94..735b0a0 100644 --- a/speech-to-text/app.py +++ b/speech-to-text/app.py @@ -67,8 +67,11 @@ def main(): st.sidebar.title("Settings") with_timestamps = st.sidebar.selectbox("Include Timestamps", ["Yes", "No"]) + language_options = {None: "Auto-detect", "en": "English"} language = st.sidebar.selectbox( - "Select Language", {"Auto-detect": None, "English": "en"} + "Select Language", + language_options.keys(), + format_func=lambda x: language_options.get(x), ) # check GPU diff --git a/speech-to-text/packages.txt b/speech-to-text/packages.txt new file mode 100644 index 0000000..a9f1eea --- /dev/null +++ b/speech-to-text/packages.txt @@ -0,0 +1 @@ +ffmpeg \ No newline at end of file From 0f033d61c0d911bbf86cbc7b29e422763b02375c Mon Sep 17 00:00:00 2001 From: Udit Manav Date: Sun, 4 Aug 2024 18:57:05 +0530 Subject: [PATCH 05/23] readme update + streamlit config --- speech-to-text/.streamlit/config.toml | 33 +++++++++++++++++++ speech-to-text/README.md | 46 +++++++++++++++++++++++++++ speech-to-text/app.py | 37 
+++++++++++++-------- 3 files changed, 102 insertions(+), 14 deletions(-) create mode 100644 speech-to-text/.streamlit/config.toml create mode 100644 speech-to-text/README.md diff --git a/speech-to-text/.streamlit/config.toml b/speech-to-text/.streamlit/config.toml new file mode 100644 index 0000000..3161ea1 --- /dev/null +++ b/speech-to-text/.streamlit/config.toml @@ -0,0 +1,33 @@ +# https://docs.streamlit.io/develop/api-reference/configuration/config.toml + +[server] +headless = true +runOnSave = true +allowRunOnSave = true +# fastReruns = true +# fileWatcherType = "auto" +# fileWatcherType = "watchdog" +fileWatcherType = "poll" + + +[theme] +base = "dark" + +# Used to style primary interface elements. It's the color displayed most frequently across your app's +# screens and components. Examples of widgets using this color are st.slider and st.checkbox. +# primaryColor = "#919e8b" # green +# primaryColor = "purple" + +# Background color for the main container. +# backgroundColor = "rgba(254,248,239,1)" # sepia yellow + +# Used as the background for most widgets. Examples of widgets with this background are st.sidebar, +# st.text_input, st.date_input. +# secondaryBackgroundColor = "#ebd2b9" # wheat "#d7ab82" # yellow <-- not enough contrast + +# Font color for the page. +# textColor = "#6e7074" # grey + +# Font family (serif | sans serif | mono) for the page. Will not impact code areas. +# Default: "sans serif" +font = "sans serif" \ No newline at end of file diff --git a/speech-to-text/README.md b/speech-to-text/README.md new file mode 100644 index 0000000..d5e854f --- /dev/null +++ b/speech-to-text/README.md @@ -0,0 +1,46 @@ +# Speech to text transcription + + +## Objective +This project aims to transcribe audio file using [openai-whisper](https://github.com/openai/whisper) model. Users can also enter a valid YouTube URL for transcription. 
+ + +## Public Endpoints for Deployed App + +Application is deployed on streamlit cloud [here](https://transcribe-whisper.streamlit.app/). + +You can deploy it on your own easily and (possibly) free of charge on cloud. Scroll down to `Docker Playground Cloud Deployment` in `Deployment` section. + + +## Code Structure / Services +- `app` - Complete application code built in streamlit. +- `packages` - List of linux dependencies required to deploy code on streamlit cloud. +- `docker-compose` - Compose file which starts application. + + +## Deployment +- Local deployment + - Install Docker. Instructions available [here](https://docs.docker.com/engine/install/). Make sure docker is up and running before proceeding. + - Install Git. Instruction [here](https://git-scm.com/book/en/v2/Getting-Started-Installing-Git). + - Clone repo and run compose + ``` + git clone https://github.com/uditmanav17/ML-Projects.git && cd ./ML-Projects + git switch whisper && cd ./speech-to-text + docker compose --profile app up + ``` + - `--profile app` will start on `localhost:8501` and `localhost:8501` ports. + +- Docker Playground Cloud Deployment + - Navigate to [docker playground](https://labs.play-with-docker.com/). + - Login using your docker account. Click Start. This will direct you to a new page. + - Click `Add New Instance` on left pane. Then run following commands in terminal - + ``` + git clone https://github.com/uditmanav17/ML-Projects.git && cd ./ML-Projects + git switch whisper && cd ./speech-to-text + docker compose --profile app up + ``` + - To access application, click on port numbers next to `OPEN PORT` button to visit application. + + +## Future work/ Improvements +- Add download transcription as subtitles `.srt` file. 
diff --git a/speech-to-text/app.py b/speech-to-text/app.py index 735b0a0..3141039 100644 --- a/speech-to-text/app.py +++ b/speech-to-text/app.py @@ -1,4 +1,5 @@ # ruff: noqa: F401 +import contextlib from datetime import timedelta from pathlib import Path @@ -18,7 +19,9 @@ @st.cache_resource def load_model(): - return whisper.load_model("base") + # model sizes + # https://github.com/openai/whisper?tab=readme-ov-file#available-models-and-languages + return whisper.load_model("tiny") def duration_check(info, *, incomplete): @@ -48,6 +51,17 @@ def download_yt_audio(yt_url: str): def postprocess_transcription(predictions: dict, include_timestamps: bool): + """ + Postprocesses the transcription predictions to include timestamps if specified. + + Args: + predictions (dict): The transcription predictions. + include_timestamps (bool): Flag to indicate whether to include timestamps. + + Returns: + str: The postprocessed transcription with or without timestamps. + """ + if not include_timestamps: return predictions.get("text") result = [] @@ -74,18 +88,13 @@ def main(): format_func=lambda x: language_options.get(x), ) - # check GPU - cuda_available = torch.cuda.is_available() - print(cuda_available) - if cuda_available: + if cuda_available := torch.cuda.is_available(): st.info("GPU available 🔥 - Transcriptions will be fast!") else: st.warning("GPU NOT available 🚨 - Transcriptions might take some time") # load model model = load_model() - transcription = None - # YT video link input text_input = st.text_input(label="Enter valid Youtube video URL") @@ -98,14 +107,11 @@ def main(): # submit with video link or uploaded audio if submit_button and (text_input or audio): - st.session_state.text = None toast_msg = st.toast("Model is running!", icon="🏃") # download audio from YT video url if text_input: - try: + with contextlib.suppress(Exception): download_yt_audio(text_input) - except Exception: - pass elif audio is not None: # save uploaded audio bytes_data = audio.getvalue() @@ 
-128,12 +134,15 @@ def main(): word_timestamps=True, language=language, ) - transcription = postprocess_transcription(result, with_timestamps == "Yes") - if transcription or st.session_state.text: + + if transcription := postprocess_transcription( + result, + with_timestamps == "Yes", + ): st.session_state.text = transcription with st.expander("See Transcription"): st.write(st.session_state.text) - # st.code(transcription) + audio_path.unlink() else: st.info("Please add YouTube URL or upload audio for transcription", icon="ℹ️") From 322ee4c8f4bb75b738f4668f162f61bac473ff10 Mon Sep 17 00:00:00 2001 From: Udit Manav Date: Sun, 4 Aug 2024 19:24:55 +0530 Subject: [PATCH 06/23] minor fix --- speech-to-text/README.md | 1 + speech-to-text/app.py | 8 +++++--- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/speech-to-text/README.md b/speech-to-text/README.md index d5e854f..8e6b76d 100644 --- a/speech-to-text/README.md +++ b/speech-to-text/README.md @@ -44,3 +44,4 @@ You can deploy it on your own easily and (possibly) free of charge on cloud. Scr ## Future work/ Improvements - Add download transcription as subtitles `.srt` file. +- Implement streaming transcription. 
\ No newline at end of file diff --git a/speech-to-text/app.py b/speech-to-text/app.py index 3141039..c533aac 100644 --- a/speech-to-text/app.py +++ b/speech-to-text/app.py @@ -69,8 +69,10 @@ def postprocess_transcription(predictions: dict, include_timestamps: bool): startTime = str(0) + str(timedelta(seconds=int(segment["start"]))) + ",000" endTime = str(0) + str(timedelta(seconds=int(segment["end"]))) + ",000" text = segment["text"] + if not text.strip(): + continue segmentId = segment["id"] + 1 - segment = f"{segmentId}\n{startTime} --> {endTime}\n{text[1:] if text[0] == ' ' else text}\n\n" + segment = f"{segmentId}\n{startTime} --> {endTime}\n{text.strip()}\n\n" result.append(segment) return "".join(result) @@ -80,7 +82,7 @@ def main(): st.title("Audio|YouTube video Transcription") st.sidebar.title("Settings") - with_timestamps = st.sidebar.selectbox("Include Timestamps", ["Yes", "No"]) + with_timestamps = st.sidebar.selectbox("Include Timestamps", ["Yes", "No"], index=1) language_options = {None: "Auto-detect", "en": "English"} language = st.sidebar.selectbox( "Select Language", @@ -135,6 +137,7 @@ def main(): language=language, ) + st.success("🎉 Transcription completed successfully! 
🎉") if transcription := postprocess_transcription( result, with_timestamps == "Yes", @@ -142,7 +145,6 @@ def main(): st.session_state.text = transcription with st.expander("See Transcription"): st.write(st.session_state.text) - audio_path.unlink() else: st.info("Please add YouTube URL or upload audio for transcription", icon="ℹ️") From 8d0c9ca82fdcc418cce69ba20386f02784820ae6 Mon Sep 17 00:00:00 2001 From: Udit Manav Date: Sun, 4 Aug 2024 22:10:58 +0530 Subject: [PATCH 07/23] play audio --- speech-to-text/app.py | 1 + 1 file changed, 1 insertion(+) diff --git a/speech-to-text/app.py b/speech-to-text/app.py index c533aac..90d4c1d 100644 --- a/speech-to-text/app.py +++ b/speech-to-text/app.py @@ -129,6 +129,7 @@ def main(): ) else: # start transcription + st.audio("./audio.m4a", format="audio/mpeg") with st.spinner("Transcribing..."): result = model.transcribe( str(audio_path), From b7ee5409896ab765a301f265519eefdf3433d428 Mon Sep 17 00:00:00 2001 From: Udit Manav Date: Sun, 4 Aug 2024 22:14:37 +0530 Subject: [PATCH 08/23] revert audio player --- speech-to-text/app.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/speech-to-text/app.py b/speech-to-text/app.py index 90d4c1d..75c14de 100644 --- a/speech-to-text/app.py +++ b/speech-to-text/app.py @@ -129,7 +129,7 @@ def main(): ) else: # start transcription - st.audio("./audio.m4a", format="audio/mpeg") + # st.audio("./audio.m4a", format="audio/mpeg") with st.spinner("Transcribing..."): result = model.transcribe( str(audio_path), From ce6637480ffc197b322b87b2f506e52fc17c7a94 Mon Sep 17 00:00:00 2001 From: Udit Manav Date: Sun, 4 Aug 2024 22:47:19 +0530 Subject: [PATCH 09/23] test --- speech-to-text/app.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/speech-to-text/app.py b/speech-to-text/app.py index 75c14de..5eae243 100644 --- a/speech-to-text/app.py +++ b/speech-to-text/app.py @@ -21,7 +21,7 @@ def load_model(): # model sizes # 
https://github.com/openai/whisper?tab=readme-ov-file#available-models-and-languages - return whisper.load_model("tiny") + return whisper.load_model("base") def duration_check(info, *, incomplete): @@ -129,7 +129,7 @@ def main(): ) else: # start transcription - # st.audio("./audio.m4a", format="audio/mpeg") + st.audio("./audio.m4a", format="audio/mpeg") with st.spinner("Transcribing..."): result = model.transcribe( str(audio_path), From 247ddc161fdf6b56cba7428d47519408cbeff985 Mon Sep 17 00:00:00 2001 From: Udit Manav Date: Sun, 4 Aug 2024 22:53:29 +0530 Subject: [PATCH 10/23] update reqs --- speech-to-text/requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/speech-to-text/requirements.txt b/speech-to-text/requirements.txt index d744f24..60bffe3 100644 --- a/speech-to-text/requirements.txt +++ b/speech-to-text/requirements.txt @@ -3,3 +3,4 @@ streamlit==1.26.0 torch==2.2.0 yt-dlp openai-whisper +ffmpeg-python \ No newline at end of file From ef453062f4c19add5c0975f05026cdfd2c4e4d67 Mon Sep 17 00:00:00 2001 From: Udit Manav Date: Sun, 4 Aug 2024 23:11:07 +0530 Subject: [PATCH 11/23] pkg update --- speech-to-text/packages.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/speech-to-text/packages.txt b/speech-to-text/packages.txt index a9f1eea..dcbef9b 100644 --- a/speech-to-text/packages.txt +++ b/speech-to-text/packages.txt @@ -1 +1,2 @@ -ffmpeg \ No newline at end of file +ffmpeg +ffprobe \ No newline at end of file From 165145a52092fbbe941ddce72ce4a56a4e031f87 Mon Sep 17 00:00:00 2001 From: Udit Manav Date: Sun, 4 Aug 2024 23:15:43 +0530 Subject: [PATCH 12/23] config update --- speech-to-text/.streamlit/config.toml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/speech-to-text/.streamlit/config.toml b/speech-to-text/.streamlit/config.toml index 3161ea1..489062c 100644 --- a/speech-to-text/.streamlit/config.toml +++ b/speech-to-text/.streamlit/config.toml @@ -1,13 +1,13 @@ # 
https://docs.streamlit.io/develop/api-reference/configuration/config.toml [server] -headless = true -runOnSave = true -allowRunOnSave = true -# fastReruns = true -# fileWatcherType = "auto" -# fileWatcherType = "watchdog" -fileWatcherType = "poll" +# headless = true +# runOnSave = true +# allowRunOnSave = true +# # fastReruns = true +# # fileWatcherType = "auto" +# # fileWatcherType = "watchdog" +# fileWatcherType = "poll" [theme] From 43697fc59d4cfeeb2fdd30323524fab23aa828fe Mon Sep 17 00:00:00 2001 From: Udit Manav Date: Sun, 4 Aug 2024 23:25:38 +0530 Subject: [PATCH 13/23] test 2 --- speech-to-text/app.py | 8 ++++---- speech-to-text/requirements.txt | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/speech-to-text/app.py b/speech-to-text/app.py index 5eae243..ea13135 100644 --- a/speech-to-text/app.py +++ b/speech-to-text/app.py @@ -90,10 +90,10 @@ def main(): format_func=lambda x: language_options.get(x), ) - if cuda_available := torch.cuda.is_available(): - st.info("GPU available 🔥 - Transcriptions will be fast!") - else: - st.warning("GPU NOT available 🚨 - Transcriptions might take some time") + # if cuda_available := torch.cuda.is_available(): + # st.info("GPU available 🔥 - Transcriptions will be fast!") + # else: + # st.warning("GPU NOT available 🚨 - Transcriptions might take some time") # load model model = load_model() diff --git a/speech-to-text/requirements.txt b/speech-to-text/requirements.txt index 60bffe3..f8a1ea7 100644 --- a/speech-to-text/requirements.txt +++ b/speech-to-text/requirements.txt @@ -1,6 +1,6 @@ pyperclip==1.9.0 streamlit==1.26.0 -torch==2.2.0 +# torch==2.2.0 yt-dlp openai-whisper -ffmpeg-python \ No newline at end of file +# ffmpeg-python \ No newline at end of file From 21ed027c32962a8a322ca8ffaa10c4a9f7bc03b6 Mon Sep 17 00:00:00 2001 From: Udit Manav Date: Mon, 5 Aug 2024 11:10:52 +0530 Subject: [PATCH 14/23] pkgs update --- speech-to-text/packages.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) 
diff --git a/speech-to-text/packages.txt b/speech-to-text/packages.txt index dcbef9b..bedc20b 100644 --- a/speech-to-text/packages.txt +++ b/speech-to-text/packages.txt @@ -1,2 +1,3 @@ ffmpeg -ffprobe \ No newline at end of file +ffprobe +setuptools-rust \ No newline at end of file From cbf75cb518b8f333bb5286a3c4890101be3a0582 Mon Sep 17 00:00:00 2001 From: Udit Manav Date: Mon, 5 Aug 2024 11:24:31 +0530 Subject: [PATCH 15/23] update pkgs --- packages.txt | 1 + speech-to-text/packages.txt | 3 --- 2 files changed, 1 insertion(+), 3 deletions(-) create mode 100644 packages.txt delete mode 100644 speech-to-text/packages.txt diff --git a/packages.txt b/packages.txt new file mode 100644 index 0000000..20645e6 --- /dev/null +++ b/packages.txt @@ -0,0 +1 @@ +ffmpeg diff --git a/speech-to-text/packages.txt b/speech-to-text/packages.txt deleted file mode 100644 index bedc20b..0000000 --- a/speech-to-text/packages.txt +++ /dev/null @@ -1,3 +0,0 @@ -ffmpeg -ffprobe -setuptools-rust \ No newline at end of file From aadee4a4973d12078d554814034706f96545544e Mon Sep 17 00:00:00 2001 From: Udit Manav Date: Mon, 5 Aug 2024 14:11:13 +0530 Subject: [PATCH 16/23] remove previous audio file --- speech-to-text/app.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/speech-to-text/app.py b/speech-to-text/app.py index ea13135..6f029de 100644 --- a/speech-to-text/app.py +++ b/speech-to-text/app.py @@ -109,6 +109,9 @@ def main(): # submit with video link or uploaded audio if submit_button and (text_input or audio): + # if previous audio exists, remove it + audio_path = Path("./audio.m4a") + audio_path.unlink(missing_ok=True) toast_msg = st.toast("Model is running!", icon="🏃") # download audio from YT video url if text_input: @@ -120,7 +123,6 @@ def main(): with open("./audio.m4a", "wb") as f: f.write(bytes_data) - audio_path = Path("./audio.m4a") if not audio_path.exists(): st.error( """Audio file generation failed! Please recheck YouTube URL or uploaded file. 
From 1896743f5b4d324e98c64d35ac29269292b7ea9f Mon Sep 17 00:00:00 2001 From: Udit Manav Date: Mon, 5 Aug 2024 14:51:22 +0530 Subject: [PATCH 17/23] streamlit issue --- speech-to-text/app.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/speech-to-text/app.py b/speech-to-text/app.py index 6f029de..b2ca515 100644 --- a/speech-to-text/app.py +++ b/speech-to-text/app.py @@ -126,7 +126,8 @@ def main(): if not audio_path.exists(): st.error( """Audio file generation failed! Please recheck YouTube URL or uploaded file. - YT videos only upto 10 mins are supported""", + YT videos only upto 10 mins are supported. + YouTube may have rate limited due to large number of requests.""", icon="🚨", ) else: @@ -152,6 +153,10 @@ def main(): else: st.info("Please add YouTube URL or upload audio for transcription", icon="ℹ️") + if st.button("Remove previous results."): + audio_path = Path("./audio.m4a") + audio_path.unlink(missing_ok=True) + # download and copy transcription # col1, col2 = st.columns([1, 1]) # with col1: From 706fd98ad1259fdd97ff05d1b94708c2d532fcea Mon Sep 17 00:00:00 2001 From: Udit Manav Date: Mon, 5 Aug 2024 14:56:33 +0530 Subject: [PATCH 18/23] streamlit issue fix --- speech-to-text/app.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/speech-to-text/app.py b/speech-to-text/app.py index b2ca515..a52459f 100644 --- a/speech-to-text/app.py +++ b/speech-to-text/app.py @@ -153,9 +153,11 @@ def main(): else: st.info("Please add YouTube URL or upload audio for transcription", icon="ℹ️") - if st.button("Remove previous results."): + if st.button("Refresh App"): audio_path = Path("./audio.m4a") audio_path.unlink(missing_ok=True) + # re-load model + model = load_model() # download and copy transcription # col1, col2 = st.columns([1, 1]) From 262fc2074d013527fd59754fae05be7fb8d05947 Mon Sep 17 00:00:00 2001 From: Udit Manav Date: Mon, 5 Aug 2024 21:03:08 +0530 Subject: [PATCH 19/23] yt-dlp rate limit fix --- 
speech-to-text/app.py | 1 + 1 file changed, 1 insertion(+) diff --git a/speech-to-text/app.py b/speech-to-text/app.py index a52459f..eea1b04 100644 --- a/speech-to-text/app.py +++ b/speech-to-text/app.py @@ -38,6 +38,7 @@ def download_yt_audio(yt_url: str): "match_filter": duration_check, "format": "m4a/bestaudio/best", "outtmpl": {"default": "audio.%(ext)s"}, + "cookiesfrombrowser": ("edge", None, None, None), "postprocessors": [ { # Extract audio using ffmpeg "key": "FFmpegExtractAudio", From f28aa041937a33789290491c272c707c1a83c59b Mon Sep 17 00:00:00 2001 From: Udit Manav Date: Mon, 5 Aug 2024 21:54:01 +0530 Subject: [PATCH 20/23] disable cookies --- speech-to-text/app.py | 41 +++++++++++++++++++++++------------------ 1 file changed, 23 insertions(+), 18 deletions(-) diff --git a/speech-to-text/app.py b/speech-to-text/app.py index eea1b04..bfe6c93 100644 --- a/speech-to-text/app.py +++ b/speech-to-text/app.py @@ -38,7 +38,9 @@ def download_yt_audio(yt_url: str): "match_filter": duration_check, "format": "m4a/bestaudio/best", "outtmpl": {"default": "audio.%(ext)s"}, - "cookiesfrombrowser": ("edge", None, None, None), + # "cookiesfrombrowser": (None, None, None, None), + "cachedir": False, + "verbose": True, "postprocessors": [ { # Extract audio using ffmpeg "key": "FFmpegExtractAudio", @@ -134,23 +136,26 @@ def main(): else: # start transcription st.audio("./audio.m4a", format="audio/mpeg") - with st.spinner("Transcribing..."): - result = model.transcribe( - str(audio_path), - verbose=True, - word_timestamps=True, - language=language, - ) - - st.success("🎉 Transcription completed successfully! 
🎉") - if transcription := postprocess_transcription( - result, - with_timestamps == "Yes", - ): - st.session_state.text = transcription - with st.expander("See Transcription"): - st.write(st.session_state.text) - audio_path.unlink() + try: + with st.spinner("Transcribing..."): + result = model.transcribe( + str(audio_path), + verbose=True, + word_timestamps=True, + language=language, + ) + + st.success("🎉 Transcription completed successfully! 🎉") + if transcription := postprocess_transcription( + result, + with_timestamps == "Yes", + ): + st.session_state.text = transcription + with st.expander("See Transcription"): + st.write(st.session_state.text) + audio_path.unlink() + except Exception: + st.error("Please refresh App") else: st.info("Please add YouTube URL or upload audio for transcription", icon="ℹ️") From cd66ca42482198f5f2e57ece1d70a22baa47c487 Mon Sep 17 00:00:00 2001 From: Udit Manav Date: Sun, 18 Aug 2024 10:30:00 +0530 Subject: [PATCH 21/23] removed stale code --- speech-to-text/app.py | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/speech-to-text/app.py b/speech-to-text/app.py index bfe6c93..1a1a02f 100644 --- a/speech-to-text/app.py +++ b/speech-to-text/app.py @@ -93,11 +93,6 @@ def main(): format_func=lambda x: language_options.get(x), ) - # if cuda_available := torch.cuda.is_available(): - # st.info("GPU available 🔥 - Transcriptions will be fast!") - # else: - # st.warning("GPU NOT available 🚨 - Transcriptions might take some time") - # load model model = load_model() # YT video link input @@ -165,21 +160,6 @@ def main(): # re-load model model = load_model() - # download and copy transcription - # col1, col2 = st.columns([1, 1]) - # with col1: - # copy_btn = st.button("Copy", on_click=update_text, args=[st.session_state.text]) - # if copy_btn: - # pyperclip.copy(transcription) - # st.success("Text copied successfully!") - # with col2: - # if not transcription: - # transcription = "" - # dl_btn = st.download_button( - # 
"Download", - # transcription, - # "text/plain", - # ) if __name__ == "__main__": From 3797b294e250a654b71598579e2cce23757b77fe Mon Sep 17 00:00:00 2001 From: Udit Manav <17214595+uditmanav17@users.noreply.github.com> Date: Thu, 23 Apr 2026 23:19:13 +0530 Subject: [PATCH 22/23] Update speech-to-text/README.md Co-authored-by: sourcery-ai[bot] <58596630+sourcery-ai[bot]@users.noreply.github.com> --- speech-to-text/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/speech-to-text/README.md b/speech-to-text/README.md index 8e6b76d..4ff6bb3 100644 --- a/speech-to-text/README.md +++ b/speech-to-text/README.md @@ -21,7 +21,7 @@ You can deploy it on your own easily and (possibly) free of charge on cloud. Scr ## Deployment - Local deployment - Install Docker. Instructions available [here](https://docs.docker.com/engine/install/). Make sure docker is up and running before proceeding. - - Install Git. Instruction [here](https://git-scm.com/book/en/v2/Getting-Started-Installing-Git). + - Install Git. Instructions [here](https://git-scm.com/book/en/v2/Getting-Started-Installing-Git). - Clone repo and run compose ``` git clone https://github.com/uditmanav17/ML-Projects.git && cd ./ML-Projects From 6aa46d306cb216bae5e7f069077cc638168f7841 Mon Sep 17 00:00:00 2001 From: Udit Manav <17214595+uditmanav17@users.noreply.github.com> Date: Thu, 23 Apr 2026 23:20:05 +0530 Subject: [PATCH 23/23] Update speech-to-text/README.md Co-authored-by: sourcery-ai[bot] <58596630+sourcery-ai[bot]@users.noreply.github.com> --- speech-to-text/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/speech-to-text/README.md b/speech-to-text/README.md index 4ff6bb3..13223e6 100644 --- a/speech-to-text/README.md +++ b/speech-to-text/README.md @@ -28,7 +28,7 @@ You can deploy it on your own easily and (possibly) free of charge on cloud. 
Scr git switch whisper && cd ./speech-to-text docker compose --profile app up ``` - - `--profile app` will start on `localhost:8501` and `localhost:8501` ports. + - `--profile app` will start on `localhost:8501` port. - Docker Playground Cloud Deployment - Navigate to [docker playground](https://labs.play-with-docker.com/).