Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions api/db/models.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from sqlmodel import SQLModel, Field
from sqlalchemy import Column, JSON
from datetime import datetime
from typing import Optional

class Template(SQLModel, table=True):
id: int | None = Field(default=None, primary_key=True)
Expand All @@ -15,4 +16,5 @@ class FormSubmission(SQLModel, table=True):
template_id: int
input_text: str
output_pdf_path: str
needs_review: Optional[dict] = Field(default=None, sa_column=Column(JSON))
created_at: datetime = Field(default_factory=datetime.utcnow)
19 changes: 14 additions & 5 deletions api/routes/forms.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,15 +11,24 @@

@router.post("/fill", response_model=FormFillResponse)
def fill_form(form: FormFill, db: Session = Depends(get_db)):
    """Fill a PDF form from free-text input and persist the submission.

    Raises AppError(404) when the referenced template does not exist.
    Returns the created FormSubmission (serialized as FormFillResponse),
    including any low-confidence fields flagged for human review.
    """
    # Fetch the template once — eliminates the previous double-query TOCTOU bug
    fetched_template = get_template(db, form.template_id)
    if not fetched_template:
        raise AppError("Template not found", status_code=404)

    controller = Controller()
    # Controller.fill_form returns a tuple: (output_pdf_path, needs_review_dict)
    path, needs_review = controller.fill_form(
        user_input=form.input_text,
        fields=fetched_template.fields,
        pdf_form_path=fetched_template.pdf_path,
    )

    # Store None (not {}) when nothing needs review, so the column stays NULL
    submission = FormSubmission(
        **form.model_dump(),
        output_pdf_path=path,
        needs_review=needs_review or None,
    )
    return create_form(db, submission)


2 changes: 2 additions & 0 deletions api/schemas/forms.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from pydantic import BaseModel
from typing import Optional

class FormFill(BaseModel):
template_id: int
Expand All @@ -10,6 +11,7 @@ class FormFillResponse(BaseModel):
template_id: int
input_text: str
output_pdf_path: str
needs_review: Optional[dict] = None # Fields the LLM was not confident about; must be verified by a human

class Config:
from_attributes = True
13 changes: 8 additions & 5 deletions src/file_manipulator.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,28 +20,31 @@ def create_template(self, pdf_path: str):
def fill_form(self, user_input: str, fields: list, pdf_form_path: str):
    """
    Run the PDF-filling pipeline on raw user input.

    Returns:
        tuple: (output_pdf_path, needs_review) where needs_review is a dict
        of {field_name: {suggested_value, confidence}} for human verification.
        Returns (None, {}) when the PDF template path does not exist.

    Raises:
        Exception: any error from the underlying filler is re-raised so the
        frontend can surface it.
    """
    print("[1] Received request from frontend.")
    print(f"[2] PDF template path: {pdf_form_path}")

    if not os.path.exists(pdf_form_path):
        print(f"Error: PDF template not found at {pdf_form_path}")
        return None, {}  # Or raise an exception

    print("[3] Starting extraction and PDF filling process...")
    try:
        # The LLM reads these private attributes during its main loop.
        self.llm._target_fields = fields
        self.llm._transcript_text = user_input
        # filler.fill_form returns a tuple: (output_pdf_path, needs_review)
        output_name, needs_review = self.filler.fill_form(pdf_form=pdf_form_path, llm=self.llm)

        print("\n----------------------------------")
        print("✅ Process Complete.")
        print(f"Output saved to: {output_name}")
        if needs_review:
            print(f"⚠️ {len(needs_review)} field(s) flagged for human review: {list(needs_review.keys())}")

        return output_name, needs_review

    except Exception as e:
        print(f"An error occurred during PDF generation: {e}")
        # Bare raise preserves the original traceback (raise e would rewrite it)
        raise
44 changes: 24 additions & 20 deletions src/filler.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,14 @@ def __init__(self):

def fill_form(self, pdf_form: str, llm: LLM):
"""
Fill a PDF form with values from user_input using LLM.
Fields are filled in the visual order (top-to-bottom, left-to-right).
Fill a PDF form with confirmed high-confidence values from the LLM.
Fields flagged as low-confidence are written with a [REVIEW REQUIRED] placeholder
so reviewers can easily spot them in the document.

Returns:
tuple: (output_pdf_path: str, needs_review: dict)
needs_review is a dict of {field_name: {suggested_value, confidence}}
that must be presented to the user for manual verification.
"""
output_pdf = (
pdf_form[:-4]
Expand All @@ -19,34 +25,32 @@ def fill_form(self, pdf_form: str, llm: LLM):
+ "_filled.pdf"
)

# Generate dictionary of answers from your original function
# Run LLM extraction — populates both confirmed and needs_review buckets
t2j = llm.main_loop()
textbox_answers = t2j.get_data() # This is a dictionary
confirmed_answers = t2j.get_data() # high-confidence fields
needs_review = t2j.get_needs_review() # low-confidence fields

answers_list = list(textbox_answers.values())
# Merge all field names so we can look up values by name
all_answers = dict(confirmed_answers)
all_answers.update({
field: "[REVIEW REQUIRED]"
for field in needs_review
})

# Read PDF
pdf = PdfReader(pdf_form)

# Loop through pages
# Loop through pages and fill by field name (annot.T) not by position
for page in pdf.pages:
if page.Annots:
sorted_annots = sorted(
page.Annots, key=lambda a: (-float(a.Rect[1]), float(a.Rect[0]))
)

i = 0
for annot in sorted_annots:
for annot in page.Annots:
if annot.Subtype == "/Widget" and annot.T:
if i < len(answers_list):
annot.V = f"{answers_list[i]}"
# annot.T is a PDF string like "(FieldName)"; strip the parens
field_name = str(annot.T).strip("()")
if field_name in all_answers:
annot.V = str(all_answers[field_name])
annot.AP = None
i += 1
else:
# Stop if we run out of answers
break

PdfWriter().write(output_pdf, pdf)

# Your main.py expects this function to return the path
return output_pdf
return output_pdf, needs_review
92 changes: 65 additions & 27 deletions src/llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,15 @@


class LLM:
CONFIDENCE_THRESHOLD = 0.85

def __init__(self, transcript_text=None, target_fields=None, json=None):
    """Initialize the LLM extraction state.

    @params: transcript_text -> str, the transcribed voice recording.
    @params: target_fields -> list of template field names to extract.
    @params: json -> optional initial dict of confirmed fields. NOTE: the
             parameter name shadows the stdlib `json` module inside this method.
    """
    if json is None:
        json = {}
    self._transcript_text = transcript_text  # str
    self._target_fields = target_fields  # List, contains the template field.
    self._json = json  # dictionary: confirmed fields (confidence >= threshold)
    self._needs_review = {}  # fields with low confidence that a human must verify

def type_check_all(self):
if type(self._transcript_text) is not str:
Expand All @@ -26,19 +29,29 @@ def type_check_all(self):
def build_prompt(self, current_field):
"""
This method is in charge of the prompt engineering. It creates a specific prompt for each target field.
Returns a structured JSON object with value and confidence so that downstream logic can apply
a human-in-the-loop review for hallucinated or low-confidence fields.
@params: current_field -> represents the current element of the json that is being prompted.
"""
prompt = f"""
prompt = f"""
SYSTEM PROMPT:
You are an AI assistant designed to help fillout json files with information extracted from transcribed voice recordings.
You will receive the transcription, and the name of the JSON field whose value you have to identify in the context. Return
only a single string containing the identified value for the JSON field.
If the field name is plural, and you identify more than one possible value in the text, return both separated by a ";".
If you don't identify the value in the provided text, return "-1".
You are an AI assistant designed to help fill out JSON files with information extracted from transcribed voice recordings.
You will receive the transcription and the name of the JSON field whose value you must identify.

You MUST respond with a single valid JSON object and nothing else. The JSON must have exactly two keys:
- "value": the identified string value for the field, or null if not found.
- "confidence": a float between 0.0 and 1.0 representing how certain you are.

Rules:
- If the field is plural and you find multiple values, separate them with ";" in the value string.
- If you cannot find the value, set "value" to null and "confidence" to 0.0.
- Do NOT add any explanation or text outside the JSON object.

Example output: {{"value": "John Doe", "confidence": 0.95}}
---
DATA:
Target JSON field to find in text: {current_field}

TEXT: {self._transcript_text}
"""

Expand Down Expand Up @@ -70,11 +83,10 @@ def main_loop(self):
except requests.exceptions.HTTPError as e:
raise RuntimeError(f"Ollama returned an error: {e}")

# parse response
# parse raw Ollama response
json_data = response.json()
parsed_response = json_data["response"]
# print(parsed_response)
self.add_response_to_json(field, parsed_response)
raw_text = json_data["response"].strip()
self.add_response_to_json(field, raw_text)

print("----------------------------------")
print("\t[LOG] Resulting JSON created from the input text:")
Expand All @@ -83,24 +95,45 @@ def main_loop(self):

return self

def add_response_to_json(self, field, raw_text):
    """
    Parse the structured JSON response from the LLM for one field.

    Confirmed fields (confidence >= CONFIDENCE_THRESHOLD) are stored in
    self._json; low-confidence or unparseable responses are recorded in
    self._needs_review as {"suggested_value": ..., "confidence": ...} so a
    human can verify them.

    @params: field -> name of the target JSON field.
    @params: raw_text -> raw LLM output, expected to be a JSON object with
             "value" and "confidence" keys.
    """
    import json as json_lib

    value = None
    confidence = 0.0

    try:
        # The LLM is prompted to always return a JSON object.
        # AttributeError covers the case where it returns valid JSON that is
        # not an object (e.g. a bare number or list) — .get would blow up.
        parsed = json_lib.loads(raw_text)
        value = parsed.get("value")
        confidence = float(parsed.get("confidence", 0.0))
    except (json_lib.JSONDecodeError, ValueError, TypeError, AttributeError):
        # If the LLM failed to return valid JSON, treat the whole text as a
        # low-confidence raw string so it gets flagged for human review.
        print(f"\t[WARN]: LLM returned non-JSON for field '{field}'. Flagging for review.")
        value = raw_text if raw_text not in ("-1", "null", "") else None
        confidence = 0.0

    # Handle plural values (semicolon-separated)
    if value and ";" in str(value):
        value = self.handle_plural_values(value)

    if confidence >= self.CONFIDENCE_THRESHOLD:
        # High-confidence: write directly into the confirmed JSON
        if field in self._json:
            # A repeated field previously crashed when the stored value was a
            # plain string (str has no .append) — normalize to a list instead.
            existing = self._json[field]
            if isinstance(existing, list):
                existing.append(value)
            else:
                self._json[field] = [existing, value]
        else:
            self._json[field] = value
    else:
        # Low-confidence: flag for human-in-the-loop review
        print(f"\t[REVIEW REQUIRED]: Field '{field}' has confidence {confidence:.2f} (threshold: {self.CONFIDENCE_THRESHOLD}). Value: '{value}'")
        self._needs_review[field] = {
            "suggested_value": value,
            "confidence": confidence,
        }

Expand Down Expand Up @@ -132,4 +165,9 @@ def handle_plural_values(self, plural_value):
return values

def get_data(self):
    """Return the dict of confirmed field values (confidence >= CONFIDENCE_THRESHOLD)."""
    return self._json

def get_needs_review(self):
    """Return fields extracted with low confidence, as {field: {suggested_value, confidence}}, for human review."""
    return self._needs_review
Loading