|
12 | 12 | from typing import Any, Dict, List, Optional, Protocol, TYPE_CHECKING, cast |
13 | 13 |
|
14 | 14 | from langfuse.api.resources.commons.types.observations_view import ObservationsView |
15 | | -from eval_protocol.models import EvaluationRow, InputMetadata, Message |
| 15 | +from eval_protocol.models import EvaluationRow, InputMetadata, ExecutionMetadata, Message |
16 | 16 | from .base import BaseAdapter |
17 | 17 | from .utils import extract_messages_from_data |
18 | 18 |
|
@@ -82,14 +82,41 @@ def convert_trace_to_evaluation_row( |
82 | 82 | if not messages: |
83 | 83 | return None |
84 | 84 |
|
| 85 | + execution_metadata = ExecutionMetadata() |
| 86 | + row_id = None |
| 87 | + |
| 88 | + if trace.tags: |
| 89 | + for tag in trace.tags: |
| 90 | + if tag.startswith("invocation_id:"): |
| 91 | + execution_metadata.invocation_id = tag.split(":", 1)[1] |
| 92 | + elif tag.startswith("experiment_id:"): |
| 93 | + execution_metadata.experiment_id = tag.split(":", 1)[1] |
| 94 | + elif tag.startswith("rollout_id:"): |
| 95 | + execution_metadata.rollout_id = tag.split(":", 1)[1] |
| 96 | + elif tag.startswith("run_id:"): |
| 97 | + execution_metadata.run_id = tag.split(":", 1)[1] |
| 98 | + elif tag.startswith("row_id:"): |
| 99 | + row_id = tag.split(":", 1)[1] |
| 100 | + |
| 101 | + if ( |
| 102 | + execution_metadata.invocation_id |
| 103 | + and execution_metadata.experiment_id |
| 104 | + and execution_metadata.rollout_id |
| 105 | + and execution_metadata.run_id |
| 106 | + and row_id |
| 107 | + ): |
| 108 | + break # Break early if we've found all the metadata we need |
| 109 | + |
85 | 110 | return EvaluationRow( |
86 | 111 | messages=messages, |
87 | 112 | tools=tools, |
88 | 113 | input_metadata=InputMetadata( |
| 114 | + row_id=row_id, |
89 | 115 | session_data={ |
90 | 116 | "langfuse_trace_id": trace.id, # Store the trace ID here |
91 | | - } |
| 117 | + }, |
92 | 118 | ), |
| 119 | + execution_metadata=execution_metadata, |
93 | 120 | ) |
94 | 121 |
|
95 | 122 | except (AttributeError, ValueError, KeyError) as e: |
@@ -259,9 +286,6 @@ def get_evaluation_rows( |
259 | 286 | max_retries: int = 3, |
260 | 287 | span_name: Optional[str] = None, |
261 | 288 | converter: Optional[TraceConverter] = None, |
262 | | - metadata: Optional[Dict[str, Any]] = None, |
263 | | - requester_metadata: Optional[Dict[str, Any]] = None, |
264 | | - requester_metadata_contains: Optional[str] = None, |
265 | 289 | ) -> List[EvaluationRow]: |
266 | 290 | """Pull traces from Langfuse and convert to EvaluationRow format. |
267 | 291 |
|
@@ -296,10 +320,6 @@ def get_evaluation_rows( |
296 | 320 | to_timestamp = datetime.now() |
297 | 321 | from_timestamp = to_timestamp - timedelta(hours=hours_back) |
298 | 322 |
|
299 | | - # If filtering by metadata/requester_metadata, prefer fetching metadata fields |
300 | | - if (metadata is not None or requester_metadata is not None or requester_metadata_contains) and not fields: |
301 | | - fields = "core,metadata,observations" |
302 | | - |
303 | 323 | # Collect trace summaries via pagination (up to limit) |
304 | 324 | all_traces = [] |
305 | 325 | page = 1 |
@@ -332,16 +352,18 @@ def get_evaluation_rows( |
332 | 352 | to_timestamp=to_timestamp, |
333 | 353 | order_by="timestamp.desc", |
334 | 354 | ) |
| 355 | + |
| 356 | + # If no results, this is possibly due to an indexing delay — the remote rollout processor may have just finished pushing rows to Langfuse |
| 357 | + if traces and traces.meta and traces.meta.total_items == 0 and page == 1: |
| 358 | + raise Exception("Empty results - indexing delay") |
| 359 | + |
335 | 360 | break |
336 | 361 | except Exception as e: |
337 | 362 | list_retries += 1 |
338 | | - if "429" in str(e) and list_retries < max_retries: |
| 363 | + if list_retries < max_retries and ("429" in str(e) or "Empty results" in str(e)): |
339 | 364 | sleep_time = 2**list_retries # Exponential backoff |
340 | 365 | logger.warning( |
341 | | - "Rate limit hit on trace.list(), retrying in %ds (attempt %d/%d)", |
342 | | - sleep_time, |
343 | | - list_retries, |
344 | | - max_retries, |
| 366 | + "Retrying in %ds (attempt %d/%d): %s", sleep_time, list_retries, max_retries, str(e) |
345 | 367 | ) |
346 | 368 | time.sleep(sleep_time) |
347 | 369 | else: |
@@ -379,74 +401,6 @@ def get_evaluation_rows( |
379 | 401 | selected_traces = all_traces |
380 | 402 | logger.debug("Processing all %d collected traces (no sampling)", len(all_traces)) |
381 | 403 |
|
382 | | - # Helper to check if a trace matches provided metadata filters. We look in multiple places |
383 | | - # to account for Langfuse moving fields (e.g., metadata vs requester_metadata) and SDK shape. |
384 | | - def _trace_matches_metadata_filters(trace_obj: Any) -> bool: |
385 | | - if metadata is None and requester_metadata is None: |
386 | | - return True |
387 | | - |
388 | | - def _as_dict(val: Any) -> Dict[str, Any]: |
389 | | - if val is None: |
390 | | - return {} |
391 | | - if isinstance(val, dict): |
392 | | - return val |
393 | | - # Some SDK objects expose .model_dump() or behave like pydantic models |
394 | | - dump = getattr(val, "model_dump", None) |
395 | | - if callable(dump): |
396 | | - try: |
397 | | - return dump() # type: ignore[no-any-return] |
398 | | - except Exception: |
399 | | - return {} |
400 | | - return {} |
401 | | - |
402 | | - # Try common locations for metadata on full trace |
403 | | - trace_meta = _as_dict(getattr(trace_obj, "metadata", None)) |
404 | | - trace_req_meta = _as_dict(getattr(trace_obj, "requester_metadata", None)) |
405 | | - # Some Langfuse deployments nest requester_metadata inside metadata |
406 | | - nested_req_meta = {} |
407 | | - try: |
408 | | - if isinstance(trace_meta, dict) and isinstance(trace_meta.get("requester_metadata"), dict): |
409 | | - nested_req_meta = _as_dict(trace_meta.get("requester_metadata")) |
410 | | - except Exception: |
411 | | - nested_req_meta = {} |
412 | | - |
413 | | - # Fallbacks: sometimes metadata is embedded in input |
414 | | - input_meta = {} |
415 | | - try: |
416 | | - inp = getattr(trace_obj, "input", None) |
417 | | - if isinstance(inp, dict): |
418 | | - input_meta = _as_dict(inp.get("metadata")) |
419 | | - except Exception: |
420 | | - input_meta = {} |
421 | | - |
422 | | - # Combine for matching convenience (later keys override earlier for equality check only) |
423 | | - combined_meta = {**trace_meta, **input_meta} |
424 | | - combined_req_meta = {**trace_req_meta} |
425 | | - |
426 | | - # Also merge nested requester metadata when present |
427 | | - if nested_req_meta: |
428 | | - combined_req_meta = {**combined_req_meta, **nested_req_meta} |
429 | | - |
430 | | - def _is_subset(needle: Dict[str, Any], haystack: Dict[str, Any]) -> bool: |
431 | | - for k, v in needle.items(): |
432 | | - if haystack.get(k) != v: |
433 | | - return False |
434 | | - return True |
435 | | - |
436 | | - ok_meta = True |
437 | | - ok_req_meta = True |
438 | | - |
439 | | - if metadata is not None: |
440 | | - # Accept match if found either in metadata or requester_metadata buckets |
441 | | - ok_meta = _is_subset(metadata, combined_meta) or _is_subset(metadata, combined_req_meta) |
442 | | - |
443 | | - if requester_metadata is not None: |
444 | | - ok_req_meta = _is_subset(requester_metadata, combined_req_meta) or _is_subset( |
445 | | - requester_metadata, combined_meta |
446 | | - ) |
447 | | - |
448 | | - return ok_meta and ok_req_meta |
449 | | - |
450 | 404 | # Process each selected trace with sleep and retry logic |
451 | 405 | for trace_info in selected_traces: |
452 | 406 | # Sleep between gets to avoid rate limits |
@@ -483,39 +437,6 @@ def _is_subset(needle: Dict[str, Any], haystack: Dict[str, Any]) -> bool: |
483 | 437 | break # Skip this trace |
484 | 438 |
|
485 | 439 | if trace_full: |
486 | | - # If metadata filters are provided, skip non-matching traces early |
487 | | - try: |
488 | | - if not _trace_matches_metadata_filters(trace_full): |
489 | | - continue |
490 | | - except Exception: |
491 | | - # Be permissive on filter errors; treat as non-match |
492 | | - continue |
493 | | - |
494 | | - # If observations carry requester_metadata, allow substring filtering |
495 | | - if requester_metadata_contains: |
496 | | - contains_val = requester_metadata_contains |
497 | | - found_match = False |
498 | | - try: |
499 | | - for obs in getattr(trace_full, "observations", []) or []: |
500 | | - obs_rmd = getattr(obs, "requester_metadata", None) |
501 | | - if isinstance(obs_rmd, dict) and any( |
502 | | - (isinstance(v, str) and contains_val in v) for v in obs_rmd.values() |
503 | | - ): |
504 | | - found_match = True |
505 | | - break |
506 | | - obs_md = getattr(obs, "metadata", None) |
507 | | - if isinstance(obs_md, dict): |
508 | | - nested = obs_md.get("requester_metadata") |
509 | | - if isinstance(nested, dict) and any( |
510 | | - (isinstance(v, str) and contains_val in v) for v in nested.values() |
511 | | - ): |
512 | | - found_match = True |
513 | | - break |
514 | | - except Exception: |
515 | | - found_match = False |
516 | | - if not found_match: |
517 | | - continue |
518 | | - |
519 | 440 | try: |
520 | 441 | if converter: |
521 | 442 | eval_row = converter(trace_full, include_tool_calls, span_name) |
|
0 commit comments