-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathapp.py
More file actions
183 lines (143 loc) · 6.11 KB
/
app.py
File metadata and controls
183 lines (143 loc) · 6.11 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
# import streamlit as st
# import os
# import sys
# from logger import logging
# from exception import CustomException
# from dotenv import load_dotenv
# import os
# from components.transcript_retriever import TranscriptRetriever
# from components.preprocessor import Preprocessor
# from components.qa_engine import QAEngine
# load_dotenv()
# def get_summary(video_url, question):
# api_key = os.getenv('GEMINI_API_KEY')
# if not api_key:
# raise RuntimeError("GEMINI_API_KEY not found in environment or .env file.")
# # Step 1: Retrieve transcript
# retriever = TranscriptRetriever(video_url)
# raw_transcript = retriever.fetch_transcript(video_url)
# preprocessor = Preprocessor()
# cleaned = preprocessor.clean_transcript(raw_transcript)
# def determine_chunk_size(transcript):
# length = len(transcript)
# if length < 2000:
# return length
# elif length < 10000:
# return 3000
# elif length < 40000:
# return 8000
# else:
# return 12000
# chunk_size = determine_chunk_size(cleaned)
# chunks = preprocessor.chunk_transcript(cleaned, chunk_size=chunk_size)
# engine = QAEngine(api_key=api_key)
# # Determine mode: summarize if question is blank or summarization requested
# if (not question) or "summary" in question.lower() or "summarize" in question.lower():
# # Map-reduce summarization
# summaries = [engine.answer_question(chunk, "Summarize this transcript section.") for chunk in chunks]
# combined = "\n\n".join(summaries)
# reduce_prompt = (
# f"Given these summaries/explanations from different parts of the video, "
# f"write a single, clear summary covering all key points:\n\n{combined}"
# )
# final_summary = engine.summarize_transcript(reduce_prompt)
# return final_summary
# else:
# # Per-chunk Q&A
# answers = [engine.answer_question(chunk, question) for chunk in chunks]
# return "\n\n".join([f"Chunk {i+1}:\n{ans}" for i, ans in enumerate(answers)])
# st.set_page_config(page_title="YouTube Transcript Q&A & Summarizer")
# ICON_PATH = "yt_logo.png"
# st.image(ICON_PATH, width=100)
# st.title(" YouTube Transcript Q&A & Summarizer")
# st.markdown("Enter YouTube video link and choose an action:")
# video_url = st.text_input("YouTube Video Link", placeholder="Paste the URL here...")
# option = st.radio("Select Action:", ("Summarize Video", "Ask a Question"))
# question = ""
# if option == "Ask a Question":
# question = st.text_input("Enter your question about this video:")
# if st.button("Go") and video_url:
# with st.spinner("Processing..."):
# try:
# output_text = get_summary(video_url, question)
# except Exception as e:
# output_text = f"Error: {str(e)}"
# st.subheader("Output:")
# st.write(output_text)
# st.download_button(
# label="Download Output",
# data=output_text,
# file_name="output.txt",
# mime="text/plain"
# )
# st.markdown("Created by **Siddham Jain** (Student of Shiv Nadar IOE).")
import streamlit as st
import os
import sys
from logger import logging
from exception import CustomException
from dotenv import load_dotenv
from components.transcript_retriever import TranscriptRetriever
from components.preprocessor import Preprocessor
from components.qa_engine import QAEngine
# Load environment variables
load_dotenv()
def _determine_chunk_size(transcript):
    """Return the chunk size to use for a transcript of the given length.

    Transcripts shorter than 2000 units are kept as a single chunk (the
    returned size equals the transcript length); longer transcripts get
    progressively larger fixed chunk sizes.

    NOTE(review): lengths are presumably character counts, assuming the
    cleaned transcript is a string -- confirm against Preprocessor.
    """
    length = len(transcript)
    if length < 2000:
        return length
    if length < 10000:
        return 3000
    if length < 40000:
        return 8000
    return 12000


def get_summary(video_url, question):
    """Answer a question about (or summarize) a YouTube video's transcript.

    The transcript is fetched, cleaned and split into chunks. Every chunk is
    queried separately, and the per-chunk answers are then merged into one
    final answer by a second model call.

    Args:
        video_url: URL handed to ``TranscriptRetriever`` to fetch the
            transcript.
        question: The user's question. ``None``, an empty string, or a
            whitespace-only string selects summarization instead.

    Returns:
        The value returned by ``QAEngine.summarize_transcript`` for the
        combined prompt (presumably the final answer text).

    Raises:
        RuntimeError: If ``GEMINI_API_KEY`` is not set.
        ValueError: If the cleaned transcript is empty or yields no chunks.
    """
    api_key = os.getenv('GEMINI_API_KEY')
    if not api_key:
        raise RuntimeError("GEMINI_API_KEY not found in environment or .env file.")

    # Normalize once so a blank or whitespace-only question consistently
    # falls back to summarization in both the per-chunk and the merge step.
    question = (question or "").strip()
    chunk_question = question or "Summarize this transcript section."
    overall_question = question or "Summarize the video"

    # Step 1: Retrieve transcript
    retriever = TranscriptRetriever(video_url)
    raw_transcript = retriever.fetch_transcript(video_url)

    # Step 2: Preprocess transcript
    preprocessor = Preprocessor()
    cleaned = preprocessor.clean_transcript(raw_transcript)
    if not cleaned:
        # An empty transcript would produce a chunk size of 0 below.
        raise ValueError("Transcript is empty after preprocessing; nothing to process.")

    chunk_size = _determine_chunk_size(cleaned)
    chunks = preprocessor.chunk_transcript(cleaned, chunk_size=chunk_size)
    engine = QAEngine(api_key=api_key)

    # Step 3 (map): always query every chunk, for summaries and questions alike.
    per_chunk_answers = [
        engine.answer_question(chunk, chunk_question) for chunk in chunks
    ]
    if not per_chunk_answers:
        # Without any per-chunk answer the merge prompt would have no content.
        raise ValueError("Transcript produced no chunks; nothing to process.")

    # Step 4 (reduce): merge the per-chunk answers into a single response.
    combined_text = "\n\n".join(per_chunk_answers)
    reduce_prompt = (
        f"Given these answers (from different parts of the video) to the question: '{overall_question}', "
        f"combine everything into ONE comprehensive, concise answer—removing repetition and contradictions, and giving the user the best possible response:\n\n"
        f"{combined_text}"
    )
    return engine.summarize_transcript(reduce_prompt)
# --- Streamlit UI ---
# Top-level Streamlit script: renders the input form and, when "Go" is
# clicked, runs get_summary and shows its result.
st.set_page_config(page_title="YouTube Transcript Q&A & Summarizer")

ICON_PATH = "yt_logo.png"
# Only render the logo when the file is present, so a missing asset cannot
# break the rest of the page.
if os.path.isfile(ICON_PATH):
    st.image(ICON_PATH, width=100)

st.title("YouTube Transcript Q&A & Summarizer")
st.markdown("Enter YouTube video link and choose an action:")

video_url = st.text_input("YouTube Video Link", placeholder="Paste the URL here...")
option = st.radio("Select Action:", ("Summarize Video", "Ask a Question"))

# An empty question makes get_summary fall back to summarization.
question = ""
if option == "Ask a Question":
    question = st.text_input("Enter your question about this video:")

if st.button("Go"):
    target_url = video_url.strip()
    if not target_url:
        # Give feedback instead of silently ignoring the click.
        st.warning("Please enter a YouTube video link first.")
    else:
        failure = None
        with st.spinner("Processing..."):
            try:
                output_text = get_summary(target_url, question)
            except Exception as e:  # top-level UI boundary: report, don't crash
                failure = e

        if failure is not None:
            st.error(f"Error: {failure}")
        else:
            st.subheader("Output:")
            st.write(output_text)
            # Offer the download only for real results, not error messages.
            st.download_button(
                label="Download Output",
                data=output_text,
                file_name="output.txt",
                mime="text/plain",
            )

st.markdown("Created by **Siddham Jain** (Student of Shiv Nadar IOE).")