fix: include grounding metadata in rubric judge prompt

he-yufeng · he-yufeng · commit 954dc21ba884 · 2026-06-13T05:59:24.000+08:00
diff --git a/src/google/adk/evaluation/eval_case.py b/src/google/adk/evaluation/eval_case.py
@@ -66,6 +66,9 @@ class InvocationEvent(EvalBaseModel):
   content: Optional[genai_types.Content]
   """The content of the event."""
 
+  grounding_metadata: Optional[genai_types.GroundingMetadata] = None
+  """Grounding metadata emitted with the event."""
+
 
 class InvocationEvents(EvalBaseModel):
   """A container for events that occur during the course of an invocation."""
diff --git a/src/google/adk/evaluation/evaluation_generator.py b/src/google/adk/evaluation/evaluation_generator.py
@@ -653,21 +653,32 @@ def convert_events_to_eval_invocations(
             final_response = event.content
             final_event = event
 
+          should_add_event = event.grounding_metadata is not None
           for p in event.content.parts:
             if (
                 p.function_call
                 or p.function_response
                 or p.text
                 or p.inline_data
             ):
-              events_to_add.append(event)
+              should_add_event = True
               break
-
-      invocation_events = [
-          InvocationEvent(author=e.author, content=e.content)
-          for e in events_to_add
-          if e is not final_event
-      ]
+          if should_add_event:
+            events_to_add.append(event)
+        elif event.grounding_metadata is not None:
+          events_to_add.append(event)
+
+      invocation_events = []
+      for e in events_to_add:
+        if e is final_event and not e.grounding_metadata:
+          continue
+        invocation_events.append(
+            InvocationEvent(
+                author=e.author,
+                content=None if e is final_event else e.content,
+                grounding_metadata=e.grounding_metadata,
+            )
+        )
       invocations.append(
           Invocation(
               invocation_id=invocation_id,
diff --git a/src/google/adk/evaluation/llm_as_judge_utils.py b/src/google/adk/evaluation/llm_as_judge_utils.py
@@ -153,6 +153,20 @@ class _ToolCallsAndResponses(EvalBaseModel):
   tool_calls_and_response: list[_ToolCallAndResponse]
 
 
+class _GroundingMetadataEntry(EvalBaseModel):
+  """Internal data model for one event's grounding metadata and its origin."""
+
+  step: int  # Index of the source event in the invocation events list.
+  author: str  # Author of the event that carried the grounding metadata.
+  grounding_metadata: genai_types.GroundingMetadata
+
+
+class _GroundingMetadataEntries(EvalBaseModel):
+  """Internal wrapper model used to serialize grounding metadata entries."""
+
+  grounding_metadata: list[_GroundingMetadataEntry]
+
+
 def get_tool_calls_and_responses_as_json_str(
     intermediate_data: Optional[IntermediateDataType],
 ) -> str:
@@ -187,3 +201,34 @@ def get_tool_calls_and_responses_as_json_str(
       exclude_defaults=True,
       exclude_none=True,
   )
+
+
+def get_grounding_metadata_as_json_str(
+    intermediate_data: Optional[IntermediateDataType],
+) -> str:
+  """Serializes grounding metadata of invocation events to a JSON string.
+
+  Returns a fixed placeholder message when `intermediate_data` is not an
+  `InvocationEvents` or when no event carries grounding metadata.
+  """
+  no_metadata_message = "No grounding metadata was provided."
+  if not isinstance(intermediate_data, InvocationEvents):
+    return no_metadata_message
+
+  # `step` is the event's position within `invocation_events`.
+  entries = [
+      _GroundingMetadataEntry(
+          step=step,
+          author=event.author,
+          grounding_metadata=event.grounding_metadata,
+      )
+      for step, event in enumerate(intermediate_data.invocation_events)
+      if event.grounding_metadata
+  ]
+  if not entries:
+    return no_metadata_message
+
+  serialized = _GroundingMetadataEntries(grounding_metadata=entries)
+  return serialized.model_dump_json(
+      indent=2, exclude_unset=True, exclude_defaults=True, exclude_none=True
+  )
diff --git a/src/google/adk/evaluation/rubric_based_final_response_quality_v1.py b/src/google/adk/evaluation/rubric_based_final_response_quality_v1.py
@@ -25,6 +25,7 @@
 from .eval_case import InvocationEvents
 from .eval_metrics import EvalMetric
 from .eval_metrics import RubricsBasedCriterion
+from .llm_as_judge_utils import get_grounding_metadata_as_json_str
 from .llm_as_judge_utils import get_text_from_content
 from .llm_as_judge_utils import get_tool_calls_and_responses_as_json_str
 from .llm_as_judge_utils import get_tool_declarations_as_json_str
@@ -45,8 +46,9 @@
 
 # Key Evaluation Principles
 Your evaluation must follow a two-part process: first, collect trusted evidence from the agent's work, and second, judge the final answer against it.
-1. **Establish Trusted Evidence from Tool Calls**: You must first examine the agent's tool calls to determine if they are procedurally sound, meaning that the agent used the appropriate tools with logical parameters to address the user's prompt.
-  * Your ONLY sources of truth are the <user_prompt> and the direct output ('tool_response') from PROCEDURALLY SOUND tool calls found in the <response_steps>. Examples of procedural flaws include:
+1. **Establish Trusted Evidence from Tool Calls and Grounding**: You must first examine the agent's tool calls to determine if they are procedurally sound, meaning that the agent used the appropriate tools with logical parameters to address the user's prompt.
+  * Your ONLY sources of truth are the <user_prompt>, the direct output ('tool_response') from PROCEDURALLY SOUND tool calls found in the <response_steps>, and model-supplied grounding metadata found in <grounding_metadata>. Grounding metadata is trusted evidence for model-internal tools such as google_search whose raw search results may not appear as function tool responses.
+  * Examples of procedural flaws in tool calls include:
     * The agent failed to call a tool that will enable it to answer the user's prompt despite having all the necessary parameters to do so.
     * The agent called the tool with incorrect or missing parameters.
     * The agent called a tool that does not exist, or called a tool with a parameter that does not exist.
@@ -214,6 +216,9 @@
   <response_steps>
   {response_steps}
   </response_steps>
+  <grounding_metadata>
+  {grounding_metadata}
+  </grounding_metadata>
   <final_answer>
   {final_response}
   </final_answer>
@@ -296,6 +301,9 @@ def format_auto_rater_prompt(
     response_steps = get_tool_calls_and_responses_as_json_str(
         actual_invocation.intermediate_data
     )
+    grounding_metadata = get_grounding_metadata_as_json_str(
+        actual_invocation.intermediate_data
+    )
 
     app_details = actual_invocation.app_details
     if app_details:
@@ -315,6 +323,7 @@ def format_auto_rater_prompt(
         tool_declarations=tool_declarations,
         user_input=user_input,
         response_steps=response_steps,
+        grounding_metadata=grounding_metadata,
         final_response=final_response,
         rubrics=rubrics_text,
     )
diff --git a/tests/unittests/evaluation/test_evaluation_generator.py b/tests/unittests/evaluation/test_evaluation_generator.py
@@ -229,6 +229,31 @@ def test_convert_multi_agent_final_responses(
     assert intermediate_events[0].author == "agent1"
     assert intermediate_events[0].content.parts[0].text == "First response"
 
+  def test_convert_preserves_grounding_metadata_from_final_response(
+      self,
+  ):
+    """Tests the final event keeps grounding metadata but drops content."""
+    grounding_metadata = types.GroundingMetadata(
+        web_search_queries=["recent AI news"]
+    )
+    events = [
+        _build_event("user", [types.Part(text="What's new in AI?")], "inv1"),
+        Event(
+            author="agent",
+            content=types.Content(parts=[types.Part(text="Here are sources.")]),
+            invocation_id="inv1",
+            grounding_metadata=grounding_metadata,
+        ),
+    ]
+
+    invocations = EvaluationGenerator.convert_events_to_eval_invocations(events)
+
+    assert len(invocations) == 1
+    invocation_events = invocations[0].intermediate_data.invocation_events
+    assert len(invocation_events) == 1
+    assert invocation_events[0].content is None
+    assert invocation_events[0].grounding_metadata == grounding_metadata
+
 
 class TestGetAppDetailsByInvocationId:
   """Test cases for EvaluationGenerator._get_app_details_by_invocation_id method."""
diff --git a/tests/unittests/evaluation/test_llm_as_judge_utils.py b/tests/unittests/evaluation/test_llm_as_judge_utils.py
@@ -26,6 +26,7 @@
 from google.adk.evaluation.evaluator import EvalStatus
 from google.adk.evaluation.llm_as_judge_utils import get_average_rubric_score
 from google.adk.evaluation.llm_as_judge_utils import get_eval_status
+from google.adk.evaluation.llm_as_judge_utils import get_grounding_metadata_as_json_str
 from google.adk.evaluation.llm_as_judge_utils import get_text_from_content
 from google.adk.evaluation.llm_as_judge_utils import get_tool_calls_and_responses_as_json_str
 from google.adk.evaluation.llm_as_judge_utils import get_tool_declarations_as_json_str
@@ -362,3 +363,36 @@ def test_get_tool_calls_and_responses_as_json_str_with_invocation_events_multipl
       ]
   }
   assert json.loads(json_str) == expected_json
+
+
+def test_get_grounding_metadata_as_json_str_with_invocation_events():
+  """Tests entries serialize with step, author and grounding metadata."""
+  grounding_metadata = genai_types.GroundingMetadata(
+      web_search_queries=["recent AI news"]
+  )
+  intermediate_data = InvocationEvents(
+      invocation_events=[
+          InvocationEvent(
+              author="agent",
+              content=None,
+              grounding_metadata=grounding_metadata,
+          )
+      ]
+  )
+
+  json_str = get_grounding_metadata_as_json_str(intermediate_data)
+  parsed = json.loads(json_str)
+
+  assert parsed["grounding_metadata"][0]["step"] == 0
+  assert parsed["grounding_metadata"][0]["author"] == "agent"
+  assert parsed["grounding_metadata"][0]["grounding_metadata"][
+      "web_search_queries"
+  ] == ["recent AI news"]
+
+
+def test_get_grounding_metadata_as_json_str_without_metadata():
+  """Tests the placeholder text is returned for a default InvocationEvents."""
+  assert (
+      get_grounding_metadata_as_json_str(InvocationEvents())
+      == "No grounding metadata was provided."
+  )
diff --git a/tests/unittests/evaluation/test_rubric_based_final_response_quality_v1.py b/tests/unittests/evaluation/test_rubric_based_final_response_quality_v1.py
@@ -182,6 +182,37 @@ def test_format_auto_rater_prompt_with_intermediate_data(
   assert '"result": "ok"' in prompt
 
 
+def test_format_auto_rater_prompt_with_grounding_metadata(
+    evaluator: RubricBasedFinalResponseQualityV1Evaluator,
+):
+  """Tests the prompt embeds grounding metadata and its trust instructions."""
+  grounding_metadata = genai_types.GroundingMetadata(
+      web_search_queries=["recent AI news"]
+  )
+  invocation = Invocation(
+      user_content=genai_types.Content(
+          parts=[genai_types.Part(text="What's new in AI?")]
+      ),
+      final_response=genai_types.Content(
+          parts=[genai_types.Part(text="Here are sources.")]
+      ),
+      intermediate_data=InvocationEvents(
+          invocation_events=[
+              InvocationEvent(
+                  author="agent",
+                  content=None,
+                  grounding_metadata=grounding_metadata,
+              )
+          ]
+      ),
+  )
+  prompt = evaluator.format_auto_rater_prompt(invocation, None)
+
+  assert "<grounding_metadata>" in prompt
+  assert "recent AI news" in prompt
+  assert "model-supplied grounding metadata" in prompt
+
+
 def test_format_auto_rater_prompt_with_app_details_no_tools(
     evaluator: RubricBasedFinalResponseQualityV1Evaluator,
 ):