From e178c6ba7f8b1fd8b0345ae849a1d13c4e67e0e9 Mon Sep 17 00:00:00 2001
From: Udit Manav <uditmanav17@gmail.com>
Date: Sun, 4 Aug 2024 16:26:03 +0530
Subject: [PATCH 01/23] whisper app v1

---
 speech-to-text/.gitignore         |   3 +
 speech-to-text/Dockerfile         |  21 ++++
 speech-to-text/docker-compose.yml |  19 ++++
 speech-to-text/requirements.txt   |   5 +
 speech-to-text/src/app.py         | 157 ++++++++++++++++++++++++++++++
 5 files changed, 205 insertions(+)
 create mode 100644 speech-to-text/.gitignore
 create mode 100644 speech-to-text/Dockerfile
 create mode 100644 speech-to-text/docker-compose.yml
 create mode 100644 speech-to-text/requirements.txt
 create mode 100644 speech-to-text/src/app.py

diff --git a/speech-to-text/.gitignore b/speech-to-text/.gitignore
new file mode 100644
index 0000000..7813aeb
--- /dev/null
+++ b/speech-to-text/.gitignore
@@ -0,0 +1,3 @@
+audio.mp3
+video.MKV
+/src/nb.ipynb
\ No newline at end of file
diff --git a/speech-to-text/Dockerfile b/speech-to-text/Dockerfile
new file mode 100644
index 0000000..5eebf52
--- /dev/null
+++ b/speech-to-text/Dockerfile
@@ -0,0 +1,21 @@
+ARG PYTHON_VERSION=3.11.4
+
+FROM python:${PYTHON_VERSION}-slim as base
+
+ENV PYTHONDONTWRITEBYTECODE=1
+ENV PYTHONUNBUFFERED=1
+
+WORKDIR /app
+
+COPY requirements.txt ./requirements.txt
+
+# Download dependencies using cache mount and bind mount
+RUN python -m pip install -r requirements.txt
+
+EXPOSE 8501
+
+# Copy the source code into the container.
+COPY . .
+
+# Run the application.
+ENTRYPOINT ["streamlit", "run", "app.py"]
\ No newline at end of file
diff --git a/speech-to-text/docker-compose.yml b/speech-to-text/docker-compose.yml
new file mode 100644
index 0000000..507fadb
--- /dev/null
+++ b/speech-to-text/docker-compose.yml
@@ -0,0 +1,19 @@
+
+services:
+
+  streamlit:
+    build:
+      context: .
+      dockerfile: Dockerfile
+    command: >
+      /app/app.py --server.port 8501
+    ports:
+      - "8501:8501"
+    volumes:
+      - ./:/app
+    profiles:
+      - app
+
+networks:
+  app:
+    driver: bridge
\ No newline at end of file
diff --git a/speech-to-text/requirements.txt b/speech-to-text/requirements.txt
new file mode 100644
index 0000000..edee4f9
--- /dev/null
+++ b/speech-to-text/requirements.txt
@@ -0,0 +1,5 @@
+pyperclip==1.9.0
+streamlit==1.26.0
+yt-dlp
+openai-whisper
+pytorch
diff --git a/speech-to-text/src/app.py b/speech-to-text/src/app.py
new file mode 100644
index 0000000..16b7d94
--- /dev/null
+++ b/speech-to-text/src/app.py
@@ -0,0 +1,157 @@
+# ruff: noqa: F401
+from datetime import timedelta
+from pathlib import Path
+
+import pyperclip
+import streamlit as st
+import torch
+import whisper
+import yt_dlp
+
+st.set_page_config(
+    page_title="Speech to Text",
+    page_icon="🔉",
+    layout="centered",
+    initial_sidebar_state="auto",
+)
+
+
+@st.cache_resource
+def load_model():
+    return whisper.load_model("base")
+
+
+def duration_check(info, *, incomplete):
+    """Download only videos less than 10 minute (or with unknown duration)"""
+    duration = info.get("duration")
+    if duration and duration > 601:  # 10 mins limit
+        return "The video is too long"
+
+
+def download_yt_audio(yt_url: str):
+    """Download audio from given youtube video URL"""
+    # convert cli to python args - https://github.com/yt-dlp/yt-dlp/blob/master/devscripts/cli_to_api.py
+    ydl_opts = {
+        "match_filter": duration_check,
+        "format": "m4a/bestaudio/best",
+        "outtmpl": {"default": "audio.%(ext)s"},
+        "postprocessors": [
+            {  # Extract audio using ffmpeg
+                "key": "FFmpegExtractAudio",
+                "preferredcodec": "m4a",
+            }
+        ],
+    }
+    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+        error_code = ydl.download(yt_url)
+    print(error_code)
+
+
+def postprocess_transcription(predictions: dict, include_timestamps: bool):
+    if not include_timestamps:
+        return predictions.get("text")
+    result = []
+    for segment in predictions.get("segments", {}):
+        startTime = str(0) + str(timedelta(seconds=int(segment["start"]))) + ",000"
+        endTime = str(0) + str(timedelta(seconds=int(segment["end"]))) + ",000"
+        text = segment["text"]
+        segmentId = segment["id"] + 1
+        segment = f"{segmentId}\n{startTime} --> {endTime}\n{text[1:] if text[0] == ' ' else text}\n\n"
+
+        result.append(segment)
+    return "".join(result)
+
+
+def main():
+    st.title("Audio|YouTube video Transcription")
+
+    st.sidebar.title("Settings")
+    with_timestamps = st.sidebar.selectbox("Include Timestamps", ["Yes", "No"])
+    language = st.sidebar.selectbox(
+        "Select Language", {"Auto-detect": None, "English": "en"}
+    )
+
+    # check GPU
+    cuda_available = torch.cuda.is_available()
+    print(cuda_available)
+    if cuda_available:
+        st.info("GPU available 🔥 - Transcriptions will be fast!")
+    else:
+        st.warning("GPU NOT available 🚨 - Transcriptions might take some time")
+
+    # load model
+    model = load_model()
+    transcription = None
+
+    # YT video link input
+    text_input = st.text_input(label="Enter valid Youtube video URL")
+
+    # audio upload
+    audio = st.file_uploader(
+        "Upload an audio or short video file",
+        type=["mp3", "m4a", "mkv", "mp4"],
+    )
+    submit_button = st.button(label="Transcribe")
+
+    # submit with video link or uploaded audio
+    if submit_button and (text_input or audio):
+        st.session_state.text = None
+        toast_msg = st.toast("Model is running!", icon="🏃")
+        # download audio from YT video url
+        if text_input:
+            try:
+                download_yt_audio(text_input)
+            except Exception:
+                pass
+        elif audio is not None:
+            # save uploaded audio
+            bytes_data = audio.getvalue()
+            with open("./audio.m4a", "wb") as f:
+                f.write(bytes_data)
+
+        audio_path = Path("./audio.m4a")
+        if not audio_path.exists():
+            st.error(
+                """Audio file generation failed! Please recheck YouTube URL or uploaded file.
+                YT videos only upto 10 mins are supported""",
+                icon="🚨",
+            )
+        else:
+            # start transcription
+            with st.spinner("Transcribing..."):
+                result = model.transcribe(
+                    str(audio_path),
+                    verbose=True,
+                    word_timestamps=True,
+                    language=language,
+                )
+            transcription = postprocess_transcription(result, with_timestamps == "Yes")
+            if transcription or st.session_state.text:
+                st.session_state.text = transcription
+                with st.expander("See Transcription"):
+                    st.write(st.session_state.text)
+                    # st.code(transcription)
+            audio_path.unlink()
+    else:
+        st.info("Please add YouTube URL or upload audio for transcription", icon="ℹ️")
+
+    # download and copy transcription
+    # col1, col2 = st.columns([1, 1])
+    # with col1:
+    #     copy_btn = st.button("Copy", on_click=update_text, args=[st.session_state.text])
+    #     if copy_btn:
+    #         pyperclip.copy(transcription)
+    #         st.success("Text copied successfully!")
+    # with col2:
+    #     if not transcription:
+    #         transcription = ""
+    #     dl_btn = st.download_button(
+    #         "Download",
+    #         transcription,
+    #         "text/plain",
+    #     )
+
+
+if __name__ == "__main__":
+    load_model()
+    main()

From abf9b39a0facc0174b749306ba263024931077ae Mon Sep 17 00:00:00 2001
From: Udit Manav <uditmanav17@gmail.com>
Date: Sun, 4 Aug 2024 16:49:47 +0530
Subject: [PATCH 02/23] path update

---
 speech-to-text/{src => }/app.py | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename speech-to-text/{src => }/app.py (100%)

diff --git a/speech-to-text/src/app.py b/speech-to-text/app.py
similarity index 100%
rename from speech-to-text/src/app.py
rename to speech-to-text/app.py

From 9dc9ea1a1b622de55cdf5c86c053ebb99e07b8ff Mon Sep 17 00:00:00 2001
From: Udit Manav <uditmanav17@gmail.com>
Date: Sun, 4 Aug 2024 17:27:36 +0530
Subject: [PATCH 03/23] update reqs

---
 speech-to-text/requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/speech-to-text/requirements.txt b/speech-to-text/requirements.txt
index edee4f9..d744f24 100644
--- a/speech-to-text/requirements.txt
+++ b/speech-to-text/requirements.txt
@@ -1,5 +1,5 @@
 pyperclip==1.9.0
 streamlit==1.26.0
+torch==2.2.0
 yt-dlp
 openai-whisper
-pytorch

From e3fd25ac2d94df8f1a03ddcbe042710124561f36 Mon Sep 17 00:00:00 2001
From: Udit Manav <uditmanav17@gmail.com>
Date: Sun, 4 Aug 2024 18:16:15 +0530
Subject: [PATCH 04/23] minor fix + package for streamlit

---
 speech-to-text/Dockerfile   | 2 +-
 speech-to-text/app.py       | 5 ++++-
 speech-to-text/packages.txt | 1 +
 3 files changed, 6 insertions(+), 2 deletions(-)
 create mode 100644 speech-to-text/packages.txt

diff --git a/speech-to-text/Dockerfile b/speech-to-text/Dockerfile
index 5eebf52..dbe2880 100644
--- a/speech-to-text/Dockerfile
+++ b/speech-to-text/Dockerfile
@@ -10,7 +10,7 @@ WORKDIR /app
 COPY requirements.txt ./requirements.txt
 
 # Download dependencies using cache mount and bind mount
-RUN python -m pip install -r requirements.txt
+RUN apt update && apt install ffmpeg -y && python -m pip install -r requirements.txt
 
 EXPOSE 8501
 
diff --git a/speech-to-text/app.py b/speech-to-text/app.py
index 16b7d94..735b0a0 100644
--- a/speech-to-text/app.py
+++ b/speech-to-text/app.py
@@ -67,8 +67,11 @@ def main():
 
     st.sidebar.title("Settings")
     with_timestamps = st.sidebar.selectbox("Include Timestamps", ["Yes", "No"])
+    language_options = {None: "Auto-detect", "en": "English"}
     language = st.sidebar.selectbox(
-        "Select Language", {"Auto-detect": None, "English": "en"}
+        "Select Language",
+        language_options.keys(),
+        format_func=lambda x: language_options.get(x),
     )
 
     # check GPU
diff --git a/speech-to-text/packages.txt b/speech-to-text/packages.txt
new file mode 100644
index 0000000..a9f1eea
--- /dev/null
+++ b/speech-to-text/packages.txt
@@ -0,0 +1 @@
+ffmpeg
\ No newline at end of file

From 0f033d61c0d911bbf86cbc7b29e422763b02375c Mon Sep 17 00:00:00 2001
From: Udit Manav <uditmanav17@gmail.com>
Date: Sun, 4 Aug 2024 18:57:05 +0530
Subject: [PATCH 05/23] readme update + streamlit config

---
 speech-to-text/.streamlit/config.toml | 33 +++++++++++++++++++
 speech-to-text/README.md              | 46 +++++++++++++++++++++++++++
 speech-to-text/app.py                 | 37 +++++++++++++--------
 3 files changed, 102 insertions(+), 14 deletions(-)
 create mode 100644 speech-to-text/.streamlit/config.toml
 create mode 100644 speech-to-text/README.md

diff --git a/speech-to-text/.streamlit/config.toml b/speech-to-text/.streamlit/config.toml
new file mode 100644
index 0000000..3161ea1
--- /dev/null
+++ b/speech-to-text/.streamlit/config.toml
@@ -0,0 +1,33 @@
+# https://docs.streamlit.io/develop/api-reference/configuration/config.toml
+
+[server]
+headless = true
+runOnSave = true
+allowRunOnSave = true
+# fastReruns = true
+# fileWatcherType = "auto"
+# fileWatcherType = "watchdog"
+fileWatcherType = "poll"
+
+
+[theme]
+base = "dark"
+
+# Used to style primary interface elements. It's the color displayed most frequently across your app's
+# screens and components. Examples of widgets using this color are st.slider and st.checkbox.
+# primaryColor = "#919e8b" # green
+# primaryColor = "purple"
+
+# Background color for the main container.
+# backgroundColor = "rgba(254,248,239,1)" # sepia yellow
+
+# Used as the background for most widgets. Examples of widgets with this background are st.sidebar,
+# st.text_input, st.date_input.
+# secondaryBackgroundColor =  "#ebd2b9" # wheat  "#d7ab82" # yellow <-- not enough contrast
+
+# Font color for the page.
+# textColor = "#6e7074" # grey
+
+# Font family (serif | sans serif | mono) for the page. Will not impact code areas.
+# Default: "sans serif"
+font = "sans serif"
\ No newline at end of file
diff --git a/speech-to-text/README.md b/speech-to-text/README.md
new file mode 100644
index 0000000..d5e854f
--- /dev/null
+++ b/speech-to-text/README.md
@@ -0,0 +1,46 @@
+# Speech to text transcription
+
+
+## Objective
+This project transcribes audio files using the [openai-whisper](https://github.com/openai/whisper) model. Users can also enter a valid YouTube URL to transcribe that video's audio.
+
+
+## Public Endpoints for Deployed App
+
+Application is deployed on streamlit cloud [here](https://transcribe-whisper.streamlit.app/).
+
+You can deploy it on your own easily and (possibly) free of charge on cloud. Scroll down to `Docker Playground Cloud Deployment` in `Deployment` section.
+
+
+## Code Structure / Services
+- `app` - Complete application code built in streamlit.
+- `packages` - List of linux dependencies required to deploy code on streamlit cloud.
+- `docker-compose` - Compose file which starts application.
+
+
+## Deployment
+- Local deployment
+    - Install Docker. Instructions available [here](https://docs.docker.com/engine/install/). Make sure docker is up and running before proceeding.
+    - Install Git. Instructions available [here](https://git-scm.com/book/en/v2/Getting-Started-Installing-Git).
+    - Clone repo and run compose
+    ```
+    git clone https://github.com/uditmanav17/ML-Projects.git && cd ./ML-Projects
+    git switch whisper && cd ./speech-to-text
+    docker compose --profile app up
+    ```
+    - `--profile app` starts the Streamlit application on `localhost:8501`.
+
+- Docker Playground Cloud Deployment
+    - Navigate to [docker playground](https://labs.play-with-docker.com/).
+    - Login using your docker account. Click Start. This will direct you to a new page.
+    - Click `Add New Instance` on left pane. Then run following commands in terminal -
+    ```
+    git clone https://github.com/uditmanav17/ML-Projects.git && cd ./ML-Projects
+    git switch whisper && cd ./speech-to-text
+    docker compose --profile app up
+    ```
+    - To access the application, click the port number shown next to the `OPEN PORT` button.
+
+
+## Future work/ Improvements
+- Add download transcription as subtitles `.srt` file.
diff --git a/speech-to-text/app.py b/speech-to-text/app.py
index 735b0a0..3141039 100644
--- a/speech-to-text/app.py
+++ b/speech-to-text/app.py
@@ -1,4 +1,5 @@
 # ruff: noqa: F401
+import contextlib
 from datetime import timedelta
 from pathlib import Path
 
@@ -18,7 +19,9 @@
 
 @st.cache_resource
 def load_model():
-    return whisper.load_model("base")
+    # model sizes
+    # https://github.com/openai/whisper?tab=readme-ov-file#available-models-and-languages
+    return whisper.load_model("tiny")
 
 
 def duration_check(info, *, incomplete):
@@ -48,6 +51,17 @@ def download_yt_audio(yt_url: str):
 
 
 def postprocess_transcription(predictions: dict, include_timestamps: bool):
+    """
+    Postprocesses the transcription predictions to include timestamps if specified.
+
+    Args:
+        predictions (dict): The transcription predictions.
+        include_timestamps (bool): Flag to indicate whether to include timestamps.
+
+    Returns:
+        str: The postprocessed transcription with or without timestamps.
+    """
+
     if not include_timestamps:
         return predictions.get("text")
     result = []
@@ -74,18 +88,13 @@ def main():
         format_func=lambda x: language_options.get(x),
     )
 
-    # check GPU
-    cuda_available = torch.cuda.is_available()
-    print(cuda_available)
-    if cuda_available:
+    if cuda_available := torch.cuda.is_available():
         st.info("GPU available 🔥 - Transcriptions will be fast!")
     else:
         st.warning("GPU NOT available 🚨 - Transcriptions might take some time")
 
     # load model
     model = load_model()
-    transcription = None
-
     # YT video link input
     text_input = st.text_input(label="Enter valid Youtube video URL")
 
@@ -98,14 +107,11 @@ def main():
 
     # submit with video link or uploaded audio
     if submit_button and (text_input or audio):
-        st.session_state.text = None
         toast_msg = st.toast("Model is running!", icon="🏃")
         # download audio from YT video url
         if text_input:
-            try:
+            with contextlib.suppress(Exception):
                 download_yt_audio(text_input)
-            except Exception:
-                pass
         elif audio is not None:
             # save uploaded audio
             bytes_data = audio.getvalue()
@@ -128,12 +134,15 @@ def main():
                     word_timestamps=True,
                     language=language,
                 )
-            transcription = postprocess_transcription(result, with_timestamps == "Yes")
-            if transcription or st.session_state.text:
+
+            if transcription := postprocess_transcription(
+                result,
+                with_timestamps == "Yes",
+            ):
                 st.session_state.text = transcription
                 with st.expander("See Transcription"):
                     st.write(st.session_state.text)
-                    # st.code(transcription)
+
             audio_path.unlink()
     else:
         st.info("Please add YouTube URL or upload audio for transcription", icon="ℹ️")

From 322ee4c8f4bb75b738f4668f162f61bac473ff10 Mon Sep 17 00:00:00 2001
From: Udit Manav <uditmanav17@gmail.com>
Date: Sun, 4 Aug 2024 19:24:55 +0530
Subject: [PATCH 06/23] minor fix

---
 speech-to-text/README.md | 1 +
 speech-to-text/app.py    | 8 +++++---
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/speech-to-text/README.md b/speech-to-text/README.md
index d5e854f..8e6b76d 100644
--- a/speech-to-text/README.md
+++ b/speech-to-text/README.md
@@ -44,3 +44,4 @@ You can deploy it on your own easily and (possibly) free of charge on cloud. Scr
 
 ## Future work/ Improvements
 - Add download transcription as subtitles `.srt` file.
+- Implement streaming transcription.
\ No newline at end of file
diff --git a/speech-to-text/app.py b/speech-to-text/app.py
index 3141039..c533aac 100644
--- a/speech-to-text/app.py
+++ b/speech-to-text/app.py
@@ -69,8 +69,10 @@ def postprocess_transcription(predictions: dict, include_timestamps: bool):
         startTime = str(0) + str(timedelta(seconds=int(segment["start"]))) + ",000"
         endTime = str(0) + str(timedelta(seconds=int(segment["end"]))) + ",000"
         text = segment["text"]
+        if not text.strip():
+            continue
         segmentId = segment["id"] + 1
-        segment = f"{segmentId}\n{startTime} --> {endTime}\n{text[1:] if text[0] == ' ' else text}\n\n"
+        segment = f"{segmentId}\n{startTime} --> {endTime}\n{text.strip()}\n\n"
 
         result.append(segment)
     return "".join(result)
@@ -80,7 +82,7 @@ def main():
     st.title("Audio|YouTube video Transcription")
 
     st.sidebar.title("Settings")
-    with_timestamps = st.sidebar.selectbox("Include Timestamps", ["Yes", "No"])
+    with_timestamps = st.sidebar.selectbox("Include Timestamps", ["Yes", "No"], index=1)
     language_options = {None: "Auto-detect", "en": "English"}
     language = st.sidebar.selectbox(
         "Select Language",
@@ -135,6 +137,7 @@ def main():
                     language=language,
                 )
 
+            st.success("🎉 Transcription completed successfully! 🎉")
             if transcription := postprocess_transcription(
                 result,
                 with_timestamps == "Yes",
@@ -142,7 +145,6 @@ def main():
                 st.session_state.text = transcription
                 with st.expander("See Transcription"):
                     st.write(st.session_state.text)
-
             audio_path.unlink()
     else:
         st.info("Please add YouTube URL or upload audio for transcription", icon="ℹ️")

From 8d0c9ca82fdcc418cce69ba20386f02784820ae6 Mon Sep 17 00:00:00 2001
From: Udit Manav <uditmanav17@gmail.com>
Date: Sun, 4 Aug 2024 22:10:58 +0530
Subject: [PATCH 07/23] play audio

---
 speech-to-text/app.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/speech-to-text/app.py b/speech-to-text/app.py
index c533aac..90d4c1d 100644
--- a/speech-to-text/app.py
+++ b/speech-to-text/app.py
@@ -129,6 +129,7 @@ def main():
             )
         else:
             # start transcription
+            st.audio("./audio.m4a", format="audio/mpeg")
             with st.spinner("Transcribing..."):
                 result = model.transcribe(
                     str(audio_path),

From b7ee5409896ab765a301f265519eefdf3433d428 Mon Sep 17 00:00:00 2001
From: Udit Manav <uditmanav17@gmail.com>
Date: Sun, 4 Aug 2024 22:14:37 +0530
Subject: [PATCH 08/23] revert audio player

---
 speech-to-text/app.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/speech-to-text/app.py b/speech-to-text/app.py
index 90d4c1d..75c14de 100644
--- a/speech-to-text/app.py
+++ b/speech-to-text/app.py
@@ -129,7 +129,7 @@ def main():
             )
         else:
             # start transcription
-            st.audio("./audio.m4a", format="audio/mpeg")
+            # st.audio("./audio.m4a", format="audio/mpeg")
             with st.spinner("Transcribing..."):
                 result = model.transcribe(
                     str(audio_path),

From ce6637480ffc197b322b87b2f506e52fc17c7a94 Mon Sep 17 00:00:00 2001
From: Udit Manav <uditmanav17@gmail.com>
Date: Sun, 4 Aug 2024 22:47:19 +0530
Subject: [PATCH 09/23] test

---
 speech-to-text/app.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/speech-to-text/app.py b/speech-to-text/app.py
index 75c14de..5eae243 100644
--- a/speech-to-text/app.py
+++ b/speech-to-text/app.py
@@ -21,7 +21,7 @@
 def load_model():
     # model sizes
     # https://github.com/openai/whisper?tab=readme-ov-file#available-models-and-languages
-    return whisper.load_model("tiny")
+    return whisper.load_model("base")
 
 
 def duration_check(info, *, incomplete):
@@ -129,7 +129,7 @@ def main():
             )
         else:
             # start transcription
-            # st.audio("./audio.m4a", format="audio/mpeg")
+            st.audio("./audio.m4a", format="audio/mpeg")
             with st.spinner("Transcribing..."):
                 result = model.transcribe(
                     str(audio_path),

From 247ddc161fdf6b56cba7428d47519408cbeff985 Mon Sep 17 00:00:00 2001
From: Udit Manav <uditmanav17@gmail.com>
Date: Sun, 4 Aug 2024 22:53:29 +0530
Subject: [PATCH 10/23] update reqs

---
 speech-to-text/requirements.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/speech-to-text/requirements.txt b/speech-to-text/requirements.txt
index d744f24..60bffe3 100644
--- a/speech-to-text/requirements.txt
+++ b/speech-to-text/requirements.txt
@@ -3,3 +3,4 @@ streamlit==1.26.0
 torch==2.2.0
 yt-dlp
 openai-whisper
+ffmpeg-python
\ No newline at end of file

From ef453062f4c19add5c0975f05026cdfd2c4e4d67 Mon Sep 17 00:00:00 2001
From: Udit Manav <uditmanav17@gmail.com>
Date: Sun, 4 Aug 2024 23:11:07 +0530
Subject: [PATCH 11/23] pkg update

---
 speech-to-text/packages.txt | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/speech-to-text/packages.txt b/speech-to-text/packages.txt
index a9f1eea..dcbef9b 100644
--- a/speech-to-text/packages.txt
+++ b/speech-to-text/packages.txt
@@ -1 +1,2 @@
-ffmpeg
\ No newline at end of file
+ffmpeg
+ffprobe
\ No newline at end of file

From 165145a52092fbbe941ddce72ce4a56a4e031f87 Mon Sep 17 00:00:00 2001
From: Udit Manav <uditmanav17@gmail.com>
Date: Sun, 4 Aug 2024 23:15:43 +0530
Subject: [PATCH 12/23] config update

---
 speech-to-text/.streamlit/config.toml | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/speech-to-text/.streamlit/config.toml b/speech-to-text/.streamlit/config.toml
index 3161ea1..489062c 100644
--- a/speech-to-text/.streamlit/config.toml
+++ b/speech-to-text/.streamlit/config.toml
@@ -1,13 +1,13 @@
 # https://docs.streamlit.io/develop/api-reference/configuration/config.toml
 
 [server]
-headless = true
-runOnSave = true
-allowRunOnSave = true
-# fastReruns = true
-# fileWatcherType = "auto"
-# fileWatcherType = "watchdog"
-fileWatcherType = "poll"
+# headless = true
+# runOnSave = true
+# allowRunOnSave = true
+# # fastReruns = true
+# # fileWatcherType = "auto"
+# # fileWatcherType = "watchdog"
+# fileWatcherType = "poll"
 
 
 [theme]

From 43697fc59d4cfeeb2fdd30323524fab23aa828fe Mon Sep 17 00:00:00 2001
From: Udit Manav <uditmanav17@gmail.com>
Date: Sun, 4 Aug 2024 23:25:38 +0530
Subject: [PATCH 13/23] test 2

---
 speech-to-text/app.py           | 8 ++++----
 speech-to-text/requirements.txt | 4 ++--
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/speech-to-text/app.py b/speech-to-text/app.py
index 5eae243..ea13135 100644
--- a/speech-to-text/app.py
+++ b/speech-to-text/app.py
@@ -90,10 +90,10 @@ def main():
         format_func=lambda x: language_options.get(x),
     )
 
-    if cuda_available := torch.cuda.is_available():
-        st.info("GPU available 🔥 - Transcriptions will be fast!")
-    else:
-        st.warning("GPU NOT available 🚨 - Transcriptions might take some time")
+    # if cuda_available := torch.cuda.is_available():
+    #     st.info("GPU available 🔥 - Transcriptions will be fast!")
+    # else:
+    #     st.warning("GPU NOT available 🚨 - Transcriptions might take some time")
 
     # load model
     model = load_model()
diff --git a/speech-to-text/requirements.txt b/speech-to-text/requirements.txt
index 60bffe3..f8a1ea7 100644
--- a/speech-to-text/requirements.txt
+++ b/speech-to-text/requirements.txt
@@ -1,6 +1,6 @@
 pyperclip==1.9.0
 streamlit==1.26.0
-torch==2.2.0
+# torch==2.2.0
 yt-dlp
 openai-whisper
-ffmpeg-python
\ No newline at end of file
+# ffmpeg-python
\ No newline at end of file

From 21ed027c32962a8a322ca8ffaa10c4a9f7bc03b6 Mon Sep 17 00:00:00 2001
From: Udit Manav <uditmanav17@gmail.com>
Date: Mon, 5 Aug 2024 11:10:52 +0530
Subject: [PATCH 14/23] pkgs update

---
 speech-to-text/packages.txt | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/speech-to-text/packages.txt b/speech-to-text/packages.txt
index dcbef9b..bedc20b 100644
--- a/speech-to-text/packages.txt
+++ b/speech-to-text/packages.txt
@@ -1,2 +1,3 @@
 ffmpeg
-ffprobe
\ No newline at end of file
+ffprobe
+setuptools-rust
\ No newline at end of file

From cbf75cb518b8f333bb5286a3c4890101be3a0582 Mon Sep 17 00:00:00 2001
From: Udit Manav <uditmanav17@gmail.com>
Date: Mon, 5 Aug 2024 11:24:31 +0530
Subject: [PATCH 15/23] update pkgs

---
 packages.txt                | 1 +
 speech-to-text/packages.txt | 3 ---
 2 files changed, 1 insertion(+), 3 deletions(-)
 create mode 100644 packages.txt
 delete mode 100644 speech-to-text/packages.txt

diff --git a/packages.txt b/packages.txt
new file mode 100644
index 0000000..20645e6
--- /dev/null
+++ b/packages.txt
@@ -0,0 +1 @@
+ffmpeg
diff --git a/speech-to-text/packages.txt b/speech-to-text/packages.txt
deleted file mode 100644
index bedc20b..0000000
--- a/speech-to-text/packages.txt
+++ /dev/null
@@ -1,3 +0,0 @@
-ffmpeg
-ffprobe
-setuptools-rust
\ No newline at end of file

From aadee4a4973d12078d554814034706f96545544e Mon Sep 17 00:00:00 2001
From: Udit Manav <uditmanav17@gmail.com>
Date: Mon, 5 Aug 2024 14:11:13 +0530
Subject: [PATCH 16/23] remove previous audio file

---
 speech-to-text/app.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/speech-to-text/app.py b/speech-to-text/app.py
index ea13135..6f029de 100644
--- a/speech-to-text/app.py
+++ b/speech-to-text/app.py
@@ -109,6 +109,9 @@ def main():
 
     # submit with video link or uploaded audio
     if submit_button and (text_input or audio):
+        # if previous audio exists, remove it
+        audio_path = Path("./audio.m4a")
+        audio_path.unlink(missing_ok=True)
         toast_msg = st.toast("Model is running!", icon="🏃")
         # download audio from YT video url
         if text_input:
@@ -120,7 +123,6 @@ def main():
             with open("./audio.m4a", "wb") as f:
                 f.write(bytes_data)
 
-        audio_path = Path("./audio.m4a")
         if not audio_path.exists():
             st.error(
                 """Audio file generation failed! Please recheck YouTube URL or uploaded file.

From 1896743f5b4d324e98c64d35ac29269292b7ea9f Mon Sep 17 00:00:00 2001
From: Udit Manav <uditmanav17@gmail.com>
Date: Mon, 5 Aug 2024 14:51:22 +0530
Subject: [PATCH 17/23] streamlit issue

---
 speech-to-text/app.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/speech-to-text/app.py b/speech-to-text/app.py
index 6f029de..b2ca515 100644
--- a/speech-to-text/app.py
+++ b/speech-to-text/app.py
@@ -126,7 +126,8 @@ def main():
         if not audio_path.exists():
             st.error(
                 """Audio file generation failed! Please recheck YouTube URL or uploaded file.
-                YT videos only upto 10 mins are supported""",
+                YT videos only upto 10 mins are supported.
+                YouTube may have rate limited due to large number of requests.""",
                 icon="🚨",
             )
         else:
@@ -152,6 +153,10 @@ def main():
     else:
         st.info("Please add YouTube URL or upload audio for transcription", icon="ℹ️")
 
+    if st.button("Remove previous results."):
+        audio_path = Path("./audio.m4a")
+        audio_path.unlink(missing_ok=True)
+
     # download and copy transcription
     # col1, col2 = st.columns([1, 1])
     # with col1:

From 706fd98ad1259fdd97ff05d1b94708c2d532fcea Mon Sep 17 00:00:00 2001
From: Udit Manav <uditmanav17@gmail.com>
Date: Mon, 5 Aug 2024 14:56:33 +0530
Subject: [PATCH 18/23] streamlit issue fix

---
 speech-to-text/app.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/speech-to-text/app.py b/speech-to-text/app.py
index b2ca515..a52459f 100644
--- a/speech-to-text/app.py
+++ b/speech-to-text/app.py
@@ -153,9 +153,11 @@ def main():
     else:
         st.info("Please add YouTube URL or upload audio for transcription", icon="ℹ️")
 
-    if st.button("Remove previous results."):
+    if st.button("Refresh App"):
         audio_path = Path("./audio.m4a")
         audio_path.unlink(missing_ok=True)
+        # re-load model
+        model = load_model()
 
     # download and copy transcription
     # col1, col2 = st.columns([1, 1])

From 262fc2074d013527fd59754fae05be7fb8d05947 Mon Sep 17 00:00:00 2001
From: Udit Manav <uditmanav17@gmail.com>
Date: Mon, 5 Aug 2024 21:03:08 +0530
Subject: [PATCH 19/23] yt-dlp rate limit fix

---
 speech-to-text/app.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/speech-to-text/app.py b/speech-to-text/app.py
index a52459f..eea1b04 100644
--- a/speech-to-text/app.py
+++ b/speech-to-text/app.py
@@ -38,6 +38,7 @@ def download_yt_audio(yt_url: str):
         "match_filter": duration_check,
         "format": "m4a/bestaudio/best",
         "outtmpl": {"default": "audio.%(ext)s"},
+        "cookiesfrombrowser": ("edge", None, None, None),
         "postprocessors": [
             {  # Extract audio using ffmpeg
                 "key": "FFmpegExtractAudio",

From f28aa041937a33789290491c272c707c1a83c59b Mon Sep 17 00:00:00 2001
From: Udit Manav <uditmanav17@gmail.com>
Date: Mon, 5 Aug 2024 21:54:01 +0530
Subject: [PATCH 20/23] disable cookies

---
 speech-to-text/app.py | 41 +++++++++++++++++++++++------------------
 1 file changed, 23 insertions(+), 18 deletions(-)

diff --git a/speech-to-text/app.py b/speech-to-text/app.py
index eea1b04..bfe6c93 100644
--- a/speech-to-text/app.py
+++ b/speech-to-text/app.py
@@ -38,7 +38,9 @@ def download_yt_audio(yt_url: str):
         "match_filter": duration_check,
         "format": "m4a/bestaudio/best",
         "outtmpl": {"default": "audio.%(ext)s"},
-        "cookiesfrombrowser": ("edge", None, None, None),
+        # "cookiesfrombrowser": (None, None, None, None),
+        "cachedir": False,
+        "verbose": True,
         "postprocessors": [
             {  # Extract audio using ffmpeg
                 "key": "FFmpegExtractAudio",
@@ -134,23 +136,26 @@ def main():
         else:
             # start transcription
             st.audio("./audio.m4a", format="audio/mpeg")
-            with st.spinner("Transcribing..."):
-                result = model.transcribe(
-                    str(audio_path),
-                    verbose=True,
-                    word_timestamps=True,
-                    language=language,
-                )
-
-            st.success("🎉 Transcription completed successfully! 🎉")
-            if transcription := postprocess_transcription(
-                result,
-                with_timestamps == "Yes",
-            ):
-                st.session_state.text = transcription
-                with st.expander("See Transcription"):
-                    st.write(st.session_state.text)
-            audio_path.unlink()
+            try:
+                with st.spinner("Transcribing..."):
+                    result = model.transcribe(
+                        str(audio_path),
+                        verbose=True,
+                        word_timestamps=True,
+                        language=language,
+                    )
+
+                st.success("🎉 Transcription completed successfully! 🎉")
+                if transcription := postprocess_transcription(
+                    result,
+                    with_timestamps == "Yes",
+                ):
+                    st.session_state.text = transcription
+                    with st.expander("See Transcription"):
+                        st.write(st.session_state.text)
+                audio_path.unlink()
+            except Exception:
+                st.error("Please refresh App")
     else:
         st.info("Please add YouTube URL or upload audio for transcription", icon="ℹ️")
 

From cd66ca42482198f5f2e57ece1d70a22baa47c487 Mon Sep 17 00:00:00 2001
From: Udit Manav <uditmanav17@gmail.com>
Date: Sun, 18 Aug 2024 10:30:00 +0530
Subject: [PATCH 21/23] removed stale code

---
 speech-to-text/app.py | 20 --------------------
 1 file changed, 20 deletions(-)

diff --git a/speech-to-text/app.py b/speech-to-text/app.py
index bfe6c93..1a1a02f 100644
--- a/speech-to-text/app.py
+++ b/speech-to-text/app.py
@@ -93,11 +93,6 @@ def main():
         format_func=lambda x: language_options.get(x),
     )
 
-    # if cuda_available := torch.cuda.is_available():
-    #     st.info("GPU available 🔥 - Transcriptions will be fast!")
-    # else:
-    #     st.warning("GPU NOT available 🚨 - Transcriptions might take some time")
-
     # load model
     model = load_model()
     # YT video link input
@@ -165,21 +160,6 @@ def main():
         # re-load model
         model = load_model()
 
-    # download and copy transcription
-    # col1, col2 = st.columns([1, 1])
-    # with col1:
-    #     copy_btn = st.button("Copy", on_click=update_text, args=[st.session_state.text])
-    #     if copy_btn:
-    #         pyperclip.copy(transcription)
-    #         st.success("Text copied successfully!")
-    # with col2:
-    #     if not transcription:
-    #         transcription = ""
-    #     dl_btn = st.download_button(
-    #         "Download",
-    #         transcription,
-    #         "text/plain",
-    #     )
 
 
 if __name__ == "__main__":

From 3797b294e250a654b71598579e2cce23757b77fe Mon Sep 17 00:00:00 2001
From: Udit Manav <17214595+uditmanav17@users.noreply.github.com>
Date: Thu, 23 Apr 2026 23:19:13 +0530
Subject: [PATCH 22/23] Update speech-to-text/README.md

Co-authored-by: sourcery-ai[bot] <58596630+sourcery-ai[bot]@users.noreply.github.com>
---
 speech-to-text/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/speech-to-text/README.md b/speech-to-text/README.md
index 8e6b76d..4ff6bb3 100644
--- a/speech-to-text/README.md
+++ b/speech-to-text/README.md
@@ -21,7 +21,7 @@ You can deploy it on your own easily and (possibly) free of charge on cloud. Scr
 ## Deployment
 - Local deployment
     - Install Docker. Instructions available [here](https://docs.docker.com/engine/install/). Make sure docker is up and running before proceeding.
-    - Install Git. Instruction [here](https://git-scm.com/book/en/v2/Getting-Started-Installing-Git).
+    - Install Git. Instructions [here](https://git-scm.com/book/en/v2/Getting-Started-Installing-Git).
     - Clone repo and run compose
     ```
     git clone https://github.com/uditmanav17/ML-Projects.git && cd ./ML-Projects

From 6aa46d306cb216bae5e7f069077cc638168f7841 Mon Sep 17 00:00:00 2001
From: Udit Manav <17214595+uditmanav17@users.noreply.github.com>
Date: Thu, 23 Apr 2026 23:20:05 +0530
Subject: [PATCH 23/23] Update speech-to-text/README.md

Co-authored-by: sourcery-ai[bot] <58596630+sourcery-ai[bot]@users.noreply.github.com>
---
 speech-to-text/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/speech-to-text/README.md b/speech-to-text/README.md
index 4ff6bb3..13223e6 100644
--- a/speech-to-text/README.md
+++ b/speech-to-text/README.md
@@ -28,7 +28,7 @@ You can deploy it on your own easily and (possibly) free of charge on cloud. Scr
     git switch whisper && cd ./speech-to-text
     docker compose --profile app up
     ```
-    - `--profile app` will start on `localhost:8501` and `localhost:8501` ports.
+    - `--profile app` will start the app on `localhost:8501`.
 
 - Docker Playground Cloud Deployment
     - Navigate to [docker playground](https://labs.play-with-docker.com/).