pdf_chat/chatbot_core.py at main · rishishanbhag/pdf_chat · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
import os
import pickle
import logging
from typing import Optional, List
from dotenv import load_dotenv

from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain_google_genai import ChatGoogleGenerativeAI
import google.generativeai as genai

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Load environment variables
load_dotenv()

class PDFChatbot:
    def __init__(self):
        """Initialize the PDF chatbot with configuration"""
        self.google_api_key = os.getenv("GOOGLE_API_KEY")
        if not self.google_api_key:
            raise ValueError("GOOGLE_API_KEY not found in environment variables")

        genai.configure(api_key=self.google_api_key)
        self.conversation_chain = None
        self.vectorstore = None
        self.vectorstore_file = 'vectorstore.pkl'

    def get_pdf_text(self, pdf_docs) -> str:
        """Extract text from PDF documents"""
        text = ""
        for pdf in pdf_docs:
            try:
                pdf_reader = PdfReader(pdf)
                for page in pdf_reader.pages:
                    text += page.extract_text()
            except Exception as e:
                logger.error(f"Error reading PDF: {e}")
        return text

    def get_pdf_text_from_paths(self, pdf_paths: List[str]) -> str:
        """Extract text from PDF files given their paths"""
        text = ""
        for pdf_path in pdf_paths:
            try:
                with open(pdf_path, 'rb') as file:
                    pdf_reader = PdfReader(file)
                    for page in pdf_reader.pages:
                        text += page.extract_text()
            except Exception as e:
                logger.error(f"Error reading PDF {pdf_path}: {e}")
        return text

    def get_text_chunks(self, raw_text: str) -> List[str]:
        """Split text into chunks for processing"""
        text_splitter = CharacterTextSplitter(
            separator="\n",
            chunk_size=1000,
            chunk_overlap=200,
            length_function=len
        )
        return text_splitter.split_text(raw_text)

    def create_vectorstore(self, text_chunks: List[str]) -> FAISS:
        """Create FAISS vectorstore from text chunks"""
        embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
        vectorstore = FAISS.from_texts(text_chunks, embedding=embeddings)
        self.vectorstore = vectorstore
        return vectorstore

    def save_vectorstore(self, vectorstore: Optional[FAISS] = None):
        """Save vectorstore to file"""
        try:
            store_to_save = vectorstore or self.vectorstore
            if store_to_save is None:
                raise ValueError("No vectorstore to save")

            with open(self.vectorstore_file, 'wb') as f:
                pickle.dump(store_to_save, f)
            logger.info("Vectorstore saved successfully")
        except Exception as e:
            logger.error(f"Failed to save vectorstore: {e}")
            raise

    def load_vectorstore(self) -> Optional[FAISS]:
        """Load vectorstore from file"""
        try:
            with open(self.vectorstore_file, 'rb') as f:
                self.vectorstore = pickle.load(f)
            logger.info("Vectorstore loaded successfully")
            return self.vectorstore
        except FileNotFoundError:
            logger.warning("No vectorstore found. Please process documents first.")
            return None
        except Exception as e:
            logger.error(f"Error loading vectorstore: {e}")
            return None

    def create_conversation_chain(self, vectorstore: Optional[FAISS] = None) -> ConversationalRetrievalChain:
        """Create conversation chain with Gemini"""
        store_to_use = vectorstore or self.vectorstore
        if store_to_use is None:
            raise ValueError("No vectorstore available for conversation chain")

        llm = ChatGoogleGenerativeAI(
            model="gemini-1.5-flash",
            temperature=0.2,
            google_api_key=self.google_api_key,
            convert_system_message_to_human=True
        )

        memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

        conversation_chain = ConversationalRetrievalChain.from_llm(
            llm=llm,
            retriever=store_to_use.as_retriever(),
            memory=memory
        )

        self.conversation_chain = conversation_chain
        return conversation_chain

    def process_documents(self, pdf_docs) -> dict:
        """Process PDF documents and create vectorstore"""
        try:
            # Extract text
            raw_text = self.get_pdf_text(pdf_docs)
            if not raw_text.strip():
                raise ValueError("No text could be extracted from the PDFs")

            # Create chunks
            text_chunks = self.get_text_chunks(raw_text)

            # Create vectorstore
            vectorstore = self.create_vectorstore(text_chunks)

            # Save vectorstore
            self.save_vectorstore(vectorstore)

            # Create conversation chain
            self.create_conversation_chain(vectorstore)

            logger.info(f"Successfully processed documents: {len(text_chunks)} chunks created")

            return {
                "status": "success",
                "chunks_created": len(text_chunks),
                "text_length": len(raw_text)
            }

        except Exception as e:
            logger.error(f"Error processing documents: {e}")
            raise

    def process_documents_from_paths(self, pdf_paths: List[str]) -> dict:
        """Process PDF documents from file paths"""
        try:
            # Extract text
            raw_text = self.get_pdf_text_from_paths(pdf_paths)
            if not raw_text.strip():
                raise ValueError("No text could be extracted from the PDFs")

            # Create chunks
            text_chunks = self.get_text_chunks(raw_text)

            # Create vectorstore
            vectorstore = self.create_vectorstore(text_chunks)

            # Save vectorstore
            self.save_vectorstore(vectorstore)

            # Create conversation chain
            self.create_conversation_chain(vectorstore)

            logger.info(f"Successfully processed {len(pdf_paths)} documents: {len(text_chunks)} chunks created")

            return {
                "status": "success",
                "files_processed": len(pdf_paths),
                "chunks_created": len(text_chunks),
                "text_length": len(raw_text)
            }

        except Exception as e:
            logger.error(f"Error processing documents: {e}")
            raise

    def initialize_from_saved_vectorstore(self):
        """Initialize conversation chain from saved vectorstore"""
        vectorstore = self.load_vectorstore()
        if vectorstore:
            self.create_conversation_chain(vectorstore)
            return True
        return False

    def ask_question(self, question: str, conversation_id: Optional[str] = None) -> dict:
        """Ask a question and get response"""
        try:
            if not question.strip():
                raise ValueError("Question cannot be empty")

            if self.conversation_chain is None:
                # Try to initialize from saved vectorstore
                if not self.initialize_from_saved_vectorstore():
                    raise ValueError("No conversation chain available. Please process documents first.")

            logger.info(f"Processing question: '{question}' for conversation {conversation_id}")

            response = self.conversation_chain({"question": question})
            answer = response['answer']

            logger.info(f"Generated answer: {answer[:100]}...")

            return {
                "answer": answer,
                "status": "success",
                "conversation_id": conversation_id,
                "question": question
            }

        except Exception as e:
            logger.error(f"Error processing question: {e}")
            raise

    def get_status(self) -> dict:
        """Get current status of the chatbot"""
        return {
            "vectorstore_loaded": self.vectorstore is not None,
            "conversation_chain_initialized": self.conversation_chain is not None,
            "vectorstore_file_exists": os.path.exists(self.vectorstore_file),
            "google_api_configured": bool(self.google_api_key)
        }

# Global instance for easy access
chatbot = PDFChatbot()