diff --git a/api/db/models.py b/api/db/models.py
index f76c93b..e0f7b62 100644
--- a/api/db/models.py
+++ b/api/db/models.py
@@ -1,6 +1,7 @@
 from sqlmodel import SQLModel, Field
 from sqlalchemy import Column, JSON
 from datetime import datetime
+from typing import Optional
 
 class Template(SQLModel, table=True):
     id: int | None = Field(default=None, primary_key=True)
@@ -15,4 +16,5 @@ class FormSubmission(SQLModel, table=True):
     template_id: int
     input_text: str
     output_pdf_path: str
+    needs_review: Optional[dict] = Field(default=None, sa_column=Column(JSON))
     created_at: datetime = Field(default_factory=datetime.utcnow)
\ No newline at end of file
diff --git a/api/routes/forms.py b/api/routes/forms.py
index f3430ed..4f4d3f8 100644
--- a/api/routes/forms.py
+++ b/api/routes/forms.py
@@ -11,15 +11,24 @@
 
 @router.post("/fill", response_model=FormFillResponse)
 def fill_form(form: FormFill, db: Session = Depends(get_db)):
-    if not get_template(db, form.template_id):
-        raise AppError("Template not found", status_code=404)
-
+    # Fetch the template once — eliminates the previous double-query TOCTOU bug
     fetched_template = get_template(db, form.template_id)
+    if not fetched_template:
+        raise AppError("Template not found", status_code=404)
 
     controller = Controller()
-    path = controller.fill_form(user_input=form.input_text, fields=fetched_template.fields, pdf_form_path=fetched_template.pdf_path)
+    # fill_form now returns a tuple: (output_pdf_path, needs_review_dict)
+    path, needs_review = controller.fill_form(
+        user_input=form.input_text,
+        fields=fetched_template.fields,
+        pdf_form_path=fetched_template.pdf_path,
+    )
 
-    submission = FormSubmission(**form.model_dump(), output_pdf_path=path)
+    submission = FormSubmission(
+        **form.model_dump(),
+        output_pdf_path=path,
+        needs_review=needs_review or None,
+    )
     return create_form(db, submission)
 
 
diff --git a/api/schemas/forms.py b/api/schemas/forms.py
index 3cce650..5448757 100644
--- a/api/schemas/forms.py
+++ b/api/schemas/forms.py
@@ -1,4 +1,5 @@
 from pydantic import BaseModel
+from typing import Optional
 
 class FormFill(BaseModel):
     template_id: int
@@ -10,6 +11,7 @@ class FormFillResponse(BaseModel):
     template_id: int
     input_text: str
     output_pdf_path: str
+    needs_review: Optional[dict] = None  # Fields the LLM was not confident about; must be verified by a human
 
     class Config:
         from_attributes = True
\ No newline at end of file
diff --git a/src/file_manipulator.py b/src/file_manipulator.py
index b7815cc..baf46c1 100644
--- a/src/file_manipulator.py
+++ b/src/file_manipulator.py
@@ -20,28 +20,31 @@ def create_template(self, pdf_path: str):
     def fill_form(self, user_input: str, fields: list, pdf_form_path: str):
         """
         It receives the raw data, runs the PDF filling logic,
-        and returns the path to the newly created file.
+        and returns (output_pdf_path, needs_review) where needs_review is a dict
+        of {field_name: {suggested_value, confidence}} for human verification.
         """
         print("[1] Received request from frontend.")
         print(f"[2] PDF template path: {pdf_form_path}")
 
         if not os.path.exists(pdf_form_path):
             print(f"Error: PDF template not found at {pdf_form_path}")
-            return None  # Or raise an exception
+            return None, {}  # Or raise an exception
 
         print("[3] Starting extraction and PDF filling process...")
         try:
             self.llm._target_fields = fields
             self.llm._transcript_text = user_input
-            output_name = self.filler.fill_form(pdf_form=pdf_form_path, llm=self.llm)
+            # filler.fill_form now returns a tuple: (output_pdf_path, needs_review)
+            output_name, needs_review = self.filler.fill_form(pdf_form=pdf_form_path, llm=self.llm)
 
             print("\n----------------------------------")
             print("✅ Process Complete.")
             print(f"Output saved to: {output_name}")
+            if needs_review:
+                print(f"⚠️  {len(needs_review)} field(s) flagged for human review: {list(needs_review.keys())}")
 
-            return output_name
+            return output_name, needs_review
 
         except Exception as e:
             print(f"An error occurred during PDF generation: {e}")
-            # Re-raise the exception so the frontend can handle it
             raise e
diff --git a/src/filler.py b/src/filler.py
index e31e535..c968b37 100644
--- a/src/filler.py
+++ b/src/filler.py
@@ -9,8 +9,14 @@ def __init__(self):
 
     def fill_form(self, pdf_form: str, llm: LLM):
         """
-        Fill a PDF form with values from user_input using LLM.
-        Fields are filled in the visual order (top-to-bottom, left-to-right).
+        Fill a PDF form with confirmed high-confidence values from the LLM.
+        Fields flagged as low-confidence are written with a [REVIEW REQUIRED] placeholder
+        so reviewers can easily spot them in the document.
+
+        Returns:
+            tuple: (output_pdf_path: str, needs_review: dict)
+                   needs_review is a dict of {field_name: {suggested_value, confidence}}
+                   that must be presented to the user for manual verification.
         """
         output_pdf = (
             pdf_form[:-4]
@@ -19,34 +25,32 @@ def fill_form(self, pdf_form: str, llm: LLM):
             + "_filled.pdf"
         )
 
-        # Generate dictionary of answers from your original function
+        # Run LLM extraction — populates both confirmed and needs_review buckets
         t2j = llm.main_loop()
-        textbox_answers = t2j.get_data()  # This is a dictionary
+        confirmed_answers = t2j.get_data()      # high-confidence fields
+        needs_review = t2j.get_needs_review()   # low-confidence fields
 
-        answers_list = list(textbox_answers.values())
+        # Merge all field names so we can look up values by name
+        all_answers = dict(confirmed_answers)
+        all_answers.update({
+            field: "[REVIEW REQUIRED]"
+            for field in needs_review
+        })
 
         # Read PDF
         pdf = PdfReader(pdf_form)
 
-        # Loop through pages
+        # Loop through pages and fill by field name (annot.T) not by position
         for page in pdf.pages:
             if page.Annots:
-                sorted_annots = sorted(
-                    page.Annots, key=lambda a: (-float(a.Rect[1]), float(a.Rect[0]))
-                )
-
-                i = 0
-                for annot in sorted_annots:
+                for annot in page.Annots:
                     if annot.Subtype == "/Widget" and annot.T:
-                        if i < len(answers_list):
-                            annot.V = f"{answers_list[i]}"
+                        # annot.T is a PdfString ("(Name)" or "<hex>"); decode() removes only the delimiters/escapes
+                        field_name = annot.T.decode()
+                        if field_name in all_answers:
+                            annot.V = str(all_answers[field_name])
                             annot.AP = None
-                            i += 1
-                        else:
-                            # Stop if we run out of answers
-                            break
 
         PdfWriter().write(output_pdf, pdf)
 
-        # Your main.py expects this function to return the path
-        return output_pdf
+        return output_pdf, needs_review
diff --git a/src/llm.py b/src/llm.py
index 70937f9..011e1df 100644
--- a/src/llm.py
+++ b/src/llm.py
@@ -4,12 +4,15 @@
 
 
 class LLM:
+    CONFIDENCE_THRESHOLD = 0.85
+
     def __init__(self, transcript_text=None, target_fields=None, json=None):
         if json is None:
             json = {}
         self._transcript_text = transcript_text  # str
         self._target_fields = target_fields  # List, contains the template field.
-        self._json = json  # dictionary
+        self._json = json  # dictionary: confirmed fields (confidence >= threshold)
+        self._needs_review = {}  # fields with low confidence that a human must verify
 
     def type_check_all(self):
         if type(self._transcript_text) is not str:
@@ -26,19 +29,29 @@ def type_check_all(self):
     def build_prompt(self, current_field):
         """
         This method is in charge of the prompt engineering. It creates a specific prompt for each target field.
+        The prompt instructs the model to reply with a JSON object holding "value" and "confidence" so that
+        downstream logic can route low-confidence (possibly hallucinated) fields to human-in-the-loop review.
         @params: current_field -> represents the current element of the json that is being prompted.
         """
-        prompt = f""" 
+        prompt = f"""
             SYSTEM PROMPT:
-            You are an AI assistant designed to help fillout json files with information extracted from transcribed voice recordings. 
-            You will receive the transcription, and the name of the JSON field whose value you have to identify in the context. Return 
-            only a single string containing the identified value for the JSON field. 
-            If the field name is plural, and you identify more than one possible value in the text, return both separated by a ";".
-            If you don't identify the value in the provided text, return "-1".
+            You are an AI assistant designed to help fill out JSON files with information extracted from transcribed voice recordings.
+            You will receive the transcription and the name of the JSON field whose value you must identify.
+
+            You MUST respond with a single valid JSON object and nothing else. The JSON must have exactly two keys:
+            - "value": the identified string value for the field, or null if not found.
+            - "confidence": a float between 0.0 and 1.0 representing how certain you are.
+
+            Rules:
+            - If the field is plural and you find multiple values, separate them with ";" in the value string.
+            - If you cannot find the value, set "value" to null and "confidence" to 0.0.
+            - Do NOT add any explanation or text outside the JSON object.
+
+            Example output: {{"value": "John Doe", "confidence": 0.95}}
             ---
             DATA:
             Target JSON field to find in text: {current_field}
-            
+
             TEXT: {self._transcript_text}
             """
 
@@ -70,11 +83,10 @@ def main_loop(self):
             except requests.exceptions.HTTPError as e:
                 raise RuntimeError(f"Ollama returned an error: {e}")
 
-            # parse response
+            # parse raw Ollama response
             json_data = response.json()
-            parsed_response = json_data["response"]
-            # print(parsed_response)
-            self.add_response_to_json(field, parsed_response)
+            raw_text = json_data["response"].strip()
+            self.add_response_to_json(field, raw_text)
 
         print("----------------------------------")
         print("\t[LOG] Resulting JSON created from the input text:")
@@ -83,24 +95,45 @@ def main_loop(self):
 
         return self
 
-    def add_response_to_json(self, field, value):
+    def add_response_to_json(self, field, raw_text):
         """
-        this method adds the following value under the specified field,
-        or under a new field if the field doesn't exist, to the json dict
+        Parse the raw LLM reply, expected to be a JSON object with "value" and "confidence".
+        Non-null values with confidence >= CONFIDENCE_THRESHOLD go into self._json; everything
+        else (low confidence, null value, unparseable reply) goes into self._needs_review.
         """
-        value = value.strip().replace('"', "")
-        parsed_value = None
-
-        if value != "-1":
-            parsed_value = value
-
-        if ";" in value:
-            parsed_value = self.handle_plural_values(value)
-
-        if field in self._json.keys():
-            self._json[field].append(parsed_value)
+        import json as json_lib
+        value = None
+        confidence = 0.0
+
+        try:
+            # The LLM is prompted to always return a JSON object
+            parsed = json_lib.loads(raw_text)
+            value = parsed.get("value")
+            confidence = float(parsed.get("confidence", 0.0))
+        except (json_lib.JSONDecodeError, ValueError, TypeError, AttributeError):
+            # Invalid JSON, a non-object payload (AttributeError on .get) or a bad confidence:
+            # treat the whole text as a low-confidence raw string flagged for human review.
+            print(f"\t[WARN]: LLM returned non-JSON for field '{field}'. Flagging for review.")
+            value = raw_text if raw_text not in ("-1", "null", "") else None
+            confidence = 0.0
+
+        # Handle plural values (semicolon-separated); only strings can be split
+        if isinstance(value, str) and ";" in value:
+            value = self.handle_plural_values(value)
+
+        if value is not None and confidence >= self.CONFIDENCE_THRESHOLD:
+            # High-confidence: write directly into the confirmed JSON
+            if field in self._json:
+                self._json[field].append(value)
+            else:
+                self._json[field] = value
         else:
-            self._json[field] = parsed_value
+            # Low-confidence or missing value: flag for human-in-the-loop review
+            print(f"\t[REVIEW REQUIRED]: Field '{field}' has confidence {confidence:.2f} (threshold: {self.CONFIDENCE_THRESHOLD}). Value: '{value}'")
+            self._needs_review[field] = {
+                "suggested_value": value,
+                "confidence": confidence,
+            }
 
         return
 
@@ -132,4 +165,9 @@ def handle_plural_values(self, plural_value):
         return values
 
     def get_data(self):
+        """Returns confirmed high-confidence field values."""
         return self._json
+
+    def get_needs_review(self):
+        """Returns fields that could not be extracted with sufficient confidence and require human review."""
+        return self._needs_review
diff --git a/tests/test_llm_confidence.py b/tests/test_llm_confidence.py
new file mode 100644
index 0000000..c66760e
--- /dev/null
+++ b/tests/test_llm_confidence.py
@@ -0,0 +1,112 @@
+"""
+Tests for LLM confidence validation and human-in-the-loop review logic.
+
+These tests verify that:
+1. High-confidence fields are stored in the confirmed data returned by get_data().
+2. Low-confidence fields are separated into the needs_review bucket.
+3. LLM non-JSON responses are safely caught and flagged for review.
+4. The CONFIDENCE_THRESHOLD constant is applied correctly.
+"""
+
+import json
+import pytest
+from unittest.mock import patch, MagicMock
+from src.llm import LLM
+
+
+CONFIDENCE_THRESHOLD = LLM.CONFIDENCE_THRESHOLD  # 0.85
+
+
+def make_llm(fields: dict) -> LLM:
+    """Helper: Create an LLM instance with a dummy transcript and target fields."""
+    return LLM(
+        transcript_text="The employee is John Doe. His badge number is 12345.",
+        target_fields=fields,
+        json={},
+    )
+
+
+def mock_ollama_response(value, confidence):
+    """Helper: Build a mock requests.Response for the Ollama API."""
+    mock_resp = MagicMock()
+    mock_resp.raise_for_status = MagicMock()
+    mock_resp.json.return_value = {
+        "response": json.dumps({"value": value, "confidence": confidence})
+    }
+    return mock_resp
+
+
+class TestAddResponseToJson:
+    """Unit tests for LLM.add_response_to_json()"""
+
+    def test_high_confidence_field_goes_into_confirmed(self):
+        """A field with confidence >= threshold must end up in _json (confirmed)."""
+        llm = make_llm({"employee_name": None})
+        raw = json.dumps({"value": "John Doe", "confidence": 0.95})
+        llm.add_response_to_json("employee_name", raw)
+
+        assert llm.get_data()["employee_name"] == "John Doe"
+        assert "employee_name" not in llm.get_needs_review()
+
+    def test_low_confidence_field_goes_into_needs_review(self):
+        """A field with confidence < threshold must end up in needs_review, not _json."""
+        llm = make_llm({"badge_number": None})
+        raw = json.dumps({"value": "99999", "confidence": 0.50})
+        llm.add_response_to_json("badge_number", raw)
+
+        assert "badge_number" not in llm.get_data()
+        review = llm.get_needs_review()
+        assert "badge_number" in review
+        assert review["badge_number"]["suggested_value"] == "99999"
+        assert review["badge_number"]["confidence"] == pytest.approx(0.50)
+
+    def test_null_value_low_confidence_is_flagged(self):
+        """A field where LLM says it couldn't find the value should be flagged."""
+        llm = make_llm({"incident_code": None})
+        raw = json.dumps({"value": None, "confidence": 0.0})
+        llm.add_response_to_json("incident_code", raw)
+
+        assert "incident_code" not in llm.get_data()
+        assert "incident_code" in llm.get_needs_review()
+
+    def test_non_json_response_is_safely_caught_and_flagged(self):
+        """If the LLM returns garbage (not JSON), it must be caught and flagged — not crash."""
+        llm = make_llm({"address": None})
+        llm.add_response_to_json("address", "Sorry, I don't know the address.")
+
+        assert "address" not in llm.get_data()
+        assert "address" in llm.get_needs_review()
+
+    def test_exactly_at_threshold_is_confirmed(self):
+        """A field with confidence exactly equal to the threshold is confirmed (not flagged)."""
+        llm = make_llm({"date": None})
+        raw = json.dumps({"value": "01/02/2005", "confidence": CONFIDENCE_THRESHOLD})
+        llm.add_response_to_json("date", raw)
+
+        assert llm.get_data()["date"] == "01/02/2005"
+        assert "date" not in llm.get_needs_review()
+
+
+class TestMainLoop:
+    """Integration tests for LLM.main_loop() with a mocked Ollama API."""
+
+    @patch("src.llm.requests.post")
+    def test_main_loop_separates_confirmed_and_flagged(self, mock_post):
+        """main_loop must correctly separate high/low confidence fields from a real loop."""
+        fields = {"employee_name": None, "badge_number": None}
+        llm = make_llm(fields)
+
+        # employee_name returns high confidence; badge_number returns low
+        mock_post.side_effect = [
+            mock_ollama_response("John Doe", 0.97),
+            mock_ollama_response("???", 0.40),
+        ]
+
+        llm.main_loop()
+
+        assert llm.get_data().get("employee_name") == "John Doe"
+        assert "employee_name" not in llm.get_needs_review()
+
+        assert "badge_number" not in llm.get_data()
+        assert "badge_number" in llm.get_needs_review()
+        assert llm.get_needs_review()["badge_number"]["confidence"] == pytest.approx(0.40)