# Source: ResumeQueryAgent/json_resume_query.py at main · deypadma2020/ResumeQueryAgent · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
# resume_query_dir/json_resume_query.py
import json
from pathlib import Path
from dotenv import load_dotenv
from langchain_anthropic import ChatAnthropic
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEndpointEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.retrievers.contextual_compression import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain_core.output_parsers import StrOutputParser
from langchain.schema import Document
from collections import defaultdict

# Import centralized prompt
from PromptSchema.prompt_generator import prompt

# Import the function to auto-generate JSON from PDFs
from pdf_to_json import convert_pdfs_to_json

# Read a local .env file (if any) into the process environment
load_dotenv()

# === 1. Build resume.json from the raw PDFs when it is missing or empty ===
resume_json_path = Path("resume_query_dir/document/resume.json")

# `and` short-circuits, so stat() is only reached for a path that exists.
if not (resume_json_path.exists() and resume_json_path.stat().st_size > 0):
    print("Resume JSON not found or empty. Generating from PDFs...")
    convert_pdfs_to_json("resume_query_dir/raw_docs", str(resume_json_path))

# === 2. Parse the combined resumes file ===
# Expected to decode to a list of resume dicts.
resumes_list = json.loads(resume_json_path.read_text(encoding="utf-8"))

# === 3. Convert resumes into Documents, leveraging the `keywords` field ===
# Each resume becomes one Document: a "Keywords: ..." line followed by the whole
# resume as pretty-printed JSON. name / unique_id / designation are copied into
# the metadata so they stay attached to every chunk produced later.
documents = []
for resume in resumes_list:
    # `keywords` may be absent, null, a bare string, or hold non-string items.
    # Normalise to a list of strings so join() neither raises nor splits a
    # single keyword into individual characters.
    raw_keywords = resume.get("keywords") or []
    if not isinstance(raw_keywords, (list, tuple)):
        raw_keywords = [raw_keywords]
    keywords = ", ".join(str(keyword) for keyword in raw_keywords) or "None"

    # ensure_ascii=False keeps non-ASCII text (names, places, ...) as real
    # characters instead of \uXXXX escapes in the text that gets embedded.
    resume_json = json.dumps(resume, indent=2, ensure_ascii=False)
    text = f"Keywords: {keywords}\n\n{resume_json}"
    metadata = {
        "name": resume.get("name", ""),
        "unique_id": resume.get("unique_id", ""),
        "designation": resume.get("designation", ""),
    }
    documents.append(Document(page_content=text, metadata=metadata))

# === 4. Split into chunks for embeddings ===
splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=50)
documents = splitter.split_documents(documents)

# An index cannot be built from zero chunks. Stop here with a clear message
# rather than failing inside the embedding / FAISS call below.
if not documents:
    raise SystemExit(
        "No resume content to index; check the resume JSON and the source PDFs."
    )

# === 5. Build embeddings & FAISS vectorstore ===
embedding_model = HuggingFaceEndpointEmbeddings(
    repo_id="sentence-transformers/all-MiniLM-L6-v2"
)
vectorstore = FAISS.from_documents(documents, embedding_model)
# The index is written to disk on every run; the visible script never reads it
# back and always rebuilds it from the JSON instead.
vectorstore.save_local("resume_query_dir/vectorstore")

# === 6. Setup retrievers ===
# One chat model is shared by both retriever wrappers and the answer chain.
model = ChatAnthropic(model="claude-3-5-sonnet-20240620")

# Plain vector-store lookup returning 5 chunks per search.
base_retriever = vectorstore.as_retriever(search_kwargs={"k": 5})

# Wrap the lookup so the LLM can expand the user query into several searches.
multiquery_retriever = MultiQueryRetriever.from_llm(
    llm=model,
    retriever=base_retriever,
)

# Post-process retrieved chunks with an LLM-based extractor.
compressor = LLMChainExtractor.from_llm(model)
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=multiquery_retriever,
)

# === 7. Parser & Chain ===
# prompt -> model -> plain string output
parser = StrOutputParser()
chain = prompt | model | parser

# === 8. Take user query ===
user_query = input("\nEnter your query about the candidates: ").strip()
if not user_query:
    # A blank query would still trigger several LLM calls in the retrievers.
    raise SystemExit("No query entered; nothing to search for.")

# === 9. Retrieve top matching chunks ===
query_docs = compression_retriever.invoke(user_query)

# Merge retrieved chunks by candidate.
# The metadata header is written once per run of chunks that share it, not once
# per chunk, so the prompt is not padded with identical headers. It is emitted
# again whenever it changes within a group, which keeps chunks correctly
# attributed even if two resumes share the same (or a missing) unique_id.
merged_context = defaultdict(list)
last_header = {}
for doc in query_docs:
    uid = doc.metadata.get("unique_id", "unknown")
    header = (
        f"Candidate Name: {doc.metadata.get('name', 'N/A')}\n"
        f"Designation: {doc.metadata.get('designation', 'N/A')}\n"
        f"Resume ID: {uid}\n"
        "Resume Content:"
    )
    if last_header.get(uid) != header:
        last_header[uid] = header
        merged_context[uid].append(f"{header}\n{doc.page_content}")
    else:
        merged_context[uid].append(doc.page_content)

# Candidates are separated by a horizontal rule, chunks by a blank line.
# If nothing was retrieved this is an empty string.
context_text = "\n\n---\n\n".join(
    "\n\n".join(parts) for parts in merged_context.values()
)

# === 10. Generate final structured JSON output ===
response = chain.invoke({
    "query": user_query,
    "doc": context_text
})

# === 11. Display JSON result ===
def _strip_code_fence(text: str) -> str:
    """Return *text* without a surrounding Markdown code fence, if it has one.

    Handles both a bare fence and one with a language tag on its opening line.
    Text that is not fenced is returned unchanged.
    """
    lines = text.strip().splitlines()
    if len(lines) >= 2 and lines[0].startswith("```") and lines[-1].strip() == "```":
        return "\n".join(lines[1:-1])
    return text


try:
    # Chat models often wrap JSON answers in a code fence; unwrap it first so a
    # valid payload is not reported as invalid.
    parsed_json = json.loads(_strip_code_fence(response))
    print(json.dumps(parsed_json, indent=2))
except json.JSONDecodeError:
    # Fall back to showing exactly what the model produced.
    print("Model returned invalid JSON. Raw output:\n", response)


# Run from the directory that contains resume_query_dir/:
#   python -m resume_query_dir.json_resume_query