-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathllm_memory.py
More file actions
65 lines (47 loc) · 2.08 KB
/
llm_memory.py
File metadata and controls
65 lines (47 loc) · 2.08 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import argparse
import sys
from pathlib import Path
from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from memory.vector_store import VectorStore
import config
def load_pdf_files(data_path: str) -> list:
    """Load the PDF files found directly inside ``data_path``.

    Args:
        data_path: Directory that is scanned with the ``*.pdf`` glob.

    Returns:
        The documents returned by ``DirectoryLoader.load()`` using
        ``PyPDFLoader`` (the log line below reports them as pages).

    Exits:
        Prints an ``[ERROR]`` line and terminates the process with status 1
        when ``data_path`` does not exist, is not a directory, or yields no
        documents.
    """
    path = Path(data_path)
    if not path.exists():
        print(f"[ERROR] Data directory '{data_path}' does not exist.")
        sys.exit(1)
    # An existing regular file passes the check above but cannot be scanned
    # as a directory; reject it here with the same clean error/exit path
    # instead of letting the loader raise.
    if not path.is_dir():
        print(f"[ERROR] Data path '{data_path}' is not a directory.")
        sys.exit(1)

    loader = DirectoryLoader(data_path, glob="*.pdf", loader_cls=PyPDFLoader)
    documents = loader.load()
    if not documents:
        print(f"[ERROR] No PDF files found in '{data_path}'.")
        sys.exit(1)

    print(f"[INFO] Loaded {len(documents)} page(s) from PDFs in '{data_path}'.")
    return documents
def create_chunks(documents: list) -> list:
    """Break the loaded documents into smaller text chunks.

    The chunk size and overlap are read from ``config.CHUNK_SIZE`` and
    ``config.CHUNK_OVERLAP``.

    Args:
        documents: Documents to pass to the splitter's ``split_documents``.

    Returns:
        The chunks produced by ``RecursiveCharacterTextSplitter``.
    """
    splitter_options = {
        "chunk_size": config.CHUNK_SIZE,
        "chunk_overlap": config.CHUNK_OVERLAP,
    }
    text_splitter = RecursiveCharacterTextSplitter(**splitter_options)
    pieces = text_splitter.split_documents(documents)
    print(f"[INFO] Created {len(pieces)} text chunk(s).")
    return pieces
def build_vectorstore(chunks: list, output_path: str) -> None:
    """Create a ``VectorStore`` for ``output_path`` and save the chunks to it.

    Progress is reported on stdout before and after each step.

    Args:
        chunks: Text chunks handed to ``VectorStore.save``.
        output_path: Used both as the store's ``db_path`` and as the save
            destination.
    """
    print(f"[INFO] Loading embedding model '{config.EMBEDDING_MODEL}'...")
    store = VectorStore(db_path=output_path)

    print("[INFO] Building FAISS index (this may take a while)...")
    store.save(chunks, output_path)

    print(f"[INFO] Vector store saved to '{output_path}'.")
def main() -> None:
    """Parse the command line and run the load -> chunk -> index pipeline.

    ``--data`` defaults to ``config.DATA_PATH`` and ``--output`` defaults to
    ``config.DB_FAISS_PATH``.
    """
    cli = argparse.ArgumentParser(description="MedInsight ingestion pipeline")
    cli.add_argument(
        "--data",
        default=config.DATA_PATH,
        help="Directory containing PDF files",
    )
    cli.add_argument(
        "--output",
        default=config.DB_FAISS_PATH,
        help="Output path for FAISS index",
    )
    options = cli.parse_args()

    # Each stage feeds the next: PDFs -> documents -> chunks -> vector store.
    pages = load_pdf_files(options.data)
    pieces = create_chunks(pages)
    build_vectorstore(pieces, options.output)

    print("[DONE] Ingestion complete. You can now run the chatbot.")


# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()