Skip to content

Commit 954dc21

Browse files
committed
fix: include grounding metadata in rubric judge prompt
1 parent 7d74a0a commit 954dc21

7 files changed

Lines changed: 167 additions & 9 deletions

src/google/adk/evaluation/eval_case.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,9 @@ class InvocationEvent(EvalBaseModel):
6666
content: Optional[genai_types.Content]
6767
"""The content of the event."""
6868

69+
grounding_metadata: Optional[genai_types.GroundingMetadata] = None
70+
"""Grounding metadata emitted with the event."""
71+
6972

7073
class InvocationEvents(EvalBaseModel):
7174
"""A container for events that occur during the course of an invocation."""

src/google/adk/evaluation/evaluation_generator.py

Lines changed: 18 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -653,21 +653,32 @@ def convert_events_to_eval_invocations(
653653
final_response = event.content
654654
final_event = event
655655

656+
should_add_event = event.grounding_metadata is not None
656657
for p in event.content.parts:
657658
if (
658659
p.function_call
659660
or p.function_response
660661
or p.text
661662
or p.inline_data
662663
):
663-
events_to_add.append(event)
664+
should_add_event = True
664665
break
665-
666-
invocation_events = [
667-
InvocationEvent(author=e.author, content=e.content)
668-
for e in events_to_add
669-
if e is not final_event
670-
]
666+
if should_add_event:
667+
events_to_add.append(event)
668+
elif event.grounding_metadata is not None:
669+
events_to_add.append(event)
670+
671+
invocation_events = []
672+
for e in events_to_add:
673+
if e is final_event and not e.grounding_metadata:
674+
continue
675+
invocation_events.append(
676+
InvocationEvent(
677+
author=e.author,
678+
content=None if e is final_event else e.content,
679+
grounding_metadata=e.grounding_metadata,
680+
)
681+
)
671682
invocations.append(
672683
Invocation(
673684
invocation_id=invocation_id,

src/google/adk/evaluation/llm_as_judge_utils.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -153,6 +153,20 @@ class _ToolCallsAndResponses(EvalBaseModel):
153153
tool_calls_and_response: list[_ToolCallAndResponse]
154154

155155

156+
class _GroundingMetadataEntry(EvalBaseModel):
  """Internal record pairing one invocation event with its grounding metadata.

  Attributes:
    step: Position of the originating event within the invocation's list of
      events, as assigned by `get_grounding_metadata_as_json_str`.
    author: Author of the event that carried the grounding metadata.
    grounding_metadata: The grounding metadata attached to that event.
  """

  step: int
  author: str
  grounding_metadata: genai_types.GroundingMetadata
162+
163+
164+
class _GroundingMetadataEntries(EvalBaseModel):
  """Internal top-level wrapper used when dumping grounding metadata to JSON.

  Attributes:
    grounding_metadata: The per-event grounding metadata entries, emitted
      under the `grounding_metadata` key of the serialized object.
  """

  grounding_metadata: list[_GroundingMetadataEntry]
168+
169+
156170
def get_tool_calls_and_responses_as_json_str(
157171
intermediate_data: Optional[IntermediateDataType],
158172
) -> str:
@@ -187,3 +201,34 @@ def get_tool_calls_and_responses_as_json_str(
187201
exclude_defaults=True,
188202
exclude_none=True,
189203
)
204+
205+
206+
def get_grounding_metadata_as_json_str(
    intermediate_data: Optional[IntermediateDataType],
) -> str:
  """Serializes the grounding metadata of invocation events to a JSON string.

  Args:
    intermediate_data: Intermediate data of an invocation. Only
      `InvocationEvents` instances are inspected; any other value (including
      None) is treated as carrying no grounding metadata.

  Returns:
    An indented JSON document listing, for every event that has grounding
    metadata, its index in the event list (`step`), its author and the
    metadata itself. When nothing is found, the fixed sentence
    "No grounding metadata was provided." is returned instead.
  """
  entries = (
      [
          _GroundingMetadataEntry(
              step=position,
              author=event.author,
              grounding_metadata=event.grounding_metadata,
          )
          for position, event in enumerate(intermediate_data.invocation_events)
          if event.grounding_metadata
      ]
      if isinstance(intermediate_data, InvocationEvents)
      else []
  )

  # Both "wrong input type" and "no event had metadata" share one fallback.
  if not entries:
    return "No grounding metadata was provided."

  payload = _GroundingMetadataEntries(grounding_metadata=entries)
  return payload.model_dump_json(
      indent=2,
      exclude_unset=True,
      exclude_defaults=True,
      exclude_none=True,
  )

src/google/adk/evaluation/rubric_based_final_response_quality_v1.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
from .eval_case import InvocationEvents
2626
from .eval_metrics import EvalMetric
2727
from .eval_metrics import RubricsBasedCriterion
28+
from .llm_as_judge_utils import get_grounding_metadata_as_json_str
2829
from .llm_as_judge_utils import get_text_from_content
2930
from .llm_as_judge_utils import get_tool_calls_and_responses_as_json_str
3031
from .llm_as_judge_utils import get_tool_declarations_as_json_str
@@ -45,8 +46,9 @@
4546
4647
# Key Evaluation Principles
4748
Your evaluation must follow a two-part process: first, collect trusted evidence from the agent's work, and second, judge the final answer against it.
48-
1. **Establish Trusted Evidence from Tool Calls**: You must first examine the agent's tool calls to determine if they are procedurally sound, meaning that the agent used the appropriate tools with logical parameters to address the user's prompt.
49-
* Your ONLY sources of truth are the <user_prompt> and the direct output ('tool_response') from PROCEDURALLY SOUND tool calls found in the <response_steps>. Examples of procedural flaws include:
49+
1. **Establish Trusted Evidence from Tool Calls and Grounding**: You must first examine the agent's tool calls to determine if they are procedurally sound, meaning that the agent used the appropriate tools with logical parameters to address the user's prompt.
50+
* Your ONLY sources of truth are the <user_prompt>, the direct output ('tool_response') from PROCEDURALLY SOUND tool calls found in the <response_steps>, and model-supplied grounding metadata found in <grounding_metadata>.
51+
* Grounding metadata is trusted evidence for model-internal tools such as google_search whose raw search results may not appear as function tool responses. Examples of procedural flaws include:
5052
* The agent failed to call a tool that will enable it to answer the user's prompt despite having all the necessary parameters to do so.
5153
* The agent called the tool with incorrect or missing parameters.
5254
* The agent called a tool that does not exist, or called a tool with a parameter that does not exist.
@@ -214,6 +216,9 @@
214216
<response_steps>
215217
{response_steps}
216218
</response_steps>
219+
<grounding_metadata>
220+
{grounding_metadata}
221+
</grounding_metadata>
217222
<final_answer>
218223
{final_response}
219224
</final_answer>
@@ -296,6 +301,9 @@ def format_auto_rater_prompt(
296301
response_steps = get_tool_calls_and_responses_as_json_str(
297302
actual_invocation.intermediate_data
298303
)
304+
grounding_metadata = get_grounding_metadata_as_json_str(
305+
actual_invocation.intermediate_data
306+
)
299307

300308
app_details = actual_invocation.app_details
301309
if app_details:
@@ -315,6 +323,7 @@ def format_auto_rater_prompt(
315323
tool_declarations=tool_declarations,
316324
user_input=user_input,
317325
response_steps=response_steps,
326+
grounding_metadata=grounding_metadata,
318327
final_response=final_response,
319328
rubrics=rubrics_text,
320329
)

tests/unittests/evaluation/test_evaluation_generator.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -229,6 +229,31 @@ def test_convert_multi_agent_final_responses(
229229
assert intermediate_events[0].author == "agent1"
230230
assert intermediate_events[0].content.parts[0].text == "First response"
231231

232+
def test_convert_preserves_grounding_metadata_from_final_response(
    self,
):
  """Checks grounding metadata on the final response reaches evaluators."""
  expected_metadata = types.GroundingMetadata(
      web_search_queries=["recent AI news"]
  )
  user_event = _build_event(
      "user", [types.Part(text="What's new in AI?")], "inv1"
  )
  agent_event = Event(
      author="agent",
      content=types.Content(parts=[types.Part(text="Here are sources.")]),
      invocation_id="inv1",
      grounding_metadata=expected_metadata,
  )

  invocations = EvaluationGenerator.convert_events_to_eval_invocations(
      [user_event, agent_event]
  )

  assert len(invocations) == 1
  recorded = invocations[0].intermediate_data.invocation_events
  assert len(recorded) == 1
  # The final event is kept only for its metadata, so its content is dropped.
  assert recorded[0].content is None
  assert recorded[0].grounding_metadata == expected_metadata
256+
232257

233258
class TestGetAppDetailsByInvocationId:
234259
"""Test cases for EvaluationGenerator._get_app_details_by_invocation_id method."""

tests/unittests/evaluation/test_llm_as_judge_utils.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
from google.adk.evaluation.evaluator import EvalStatus
2727
from google.adk.evaluation.llm_as_judge_utils import get_average_rubric_score
2828
from google.adk.evaluation.llm_as_judge_utils import get_eval_status
29+
from google.adk.evaluation.llm_as_judge_utils import get_grounding_metadata_as_json_str
2930
from google.adk.evaluation.llm_as_judge_utils import get_text_from_content
3031
from google.adk.evaluation.llm_as_judge_utils import get_tool_calls_and_responses_as_json_str
3132
from google.adk.evaluation.llm_as_judge_utils import get_tool_declarations_as_json_str
@@ -362,3 +363,36 @@ def test_get_tool_calls_and_responses_as_json_str_with_invocation_events_multipl
362363
]
363364
}
364365
assert json.loads(json_str) == expected_json
366+
367+
368+
def test_get_grounding_metadata_as_json_str_with_invocation_events():
  """Checks event grounding metadata is serialized for judge prompts."""
  metadata = genai_types.GroundingMetadata(
      web_search_queries=["recent AI news"]
  )
  event = InvocationEvent(
      author="agent",
      content=None,
      grounding_metadata=metadata,
  )
  events = InvocationEvents(invocation_events=[event])

  parsed = json.loads(get_grounding_metadata_as_json_str(events))

  first_entry = parsed["grounding_metadata"][0]
  assert first_entry["step"] == 0
  assert first_entry["author"] == "agent"
  assert first_entry["grounding_metadata"]["web_search_queries"] == [
      "recent AI news"
  ]
391+
392+
393+
def test_get_grounding_metadata_as_json_str_without_metadata():
  """Checks the fallback text is returned when no events carry metadata."""
  serialized = get_grounding_metadata_as_json_str(InvocationEvents())

  assert serialized == "No grounding metadata was provided."

tests/unittests/evaluation/test_rubric_based_final_response_quality_v1.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -182,6 +182,37 @@ def test_format_auto_rater_prompt_with_intermediate_data(
182182
assert '"result": "ok"' in prompt
183183

184184

185+
def test_format_auto_rater_prompt_with_grounding_metadata(
    evaluator: RubricBasedFinalResponseQualityV1Evaluator,
):
  """Checks the judge prompt embeds grounding metadata as trusted evidence."""
  metadata = genai_types.GroundingMetadata(
      web_search_queries=["recent AI news"]
  )
  grounded_event = InvocationEvent(
      author="agent",
      content=None,
      grounding_metadata=metadata,
  )
  user_content = genai_types.Content(
      parts=[genai_types.Part(text="What's new in AI?")]
  )
  final_response = genai_types.Content(
      parts=[genai_types.Part(text="Here are sources.")]
  )
  invocation = Invocation(
      user_content=user_content,
      final_response=final_response,
      intermediate_data=InvocationEvents(invocation_events=[grounded_event]),
  )

  prompt = evaluator.format_auto_rater_prompt(invocation, None)

  for expected_fragment in (
      "<grounding_metadata>",
      "recent AI news",
      "model-supplied grounding metadata",
  ):
    assert expected_fragment in prompt
215+
185216
def test_format_auto_rater_prompt_with_app_details_no_tools(
186217
evaluator: RubricBasedFinalResponseQualityV1Evaluator,
187218
):

0 commit comments

Comments
 (0)