datahub-project · shirshanka · May 31, 2026 · May 31, 2026 · May 31, 2026 · May 31, 2026
diff --git a/backend/src/analytics_agent/agent/analysis.py b/backend/src/analytics_agent/agent/analysis.py
@@ -34,11 +34,20 @@
 1 Very Poor — No useful context; agent expressed significant uncertainty, made \
 conflicting assumptions, or produced an answer that contradicts available definitions.
 
+**Important:** A `search_business_context` result that contains a `catalog_search` key \
+means ALL governance searches (documentation, glossary, domains, data products) returned \
+empty. No authoritative business definition exists. This caps the score at 3 (Fair) \
+regardless of what the catalog search found — scores of 4 or 5 require a governed \
+definition (glossary term, domain doc, or data-product entry). Within that 1–3 range, \
+use the dataset description from subsequent `get_entities` calls to judge how useful \
+the context actually was.
+
 Key signals that push the score DOWN:
 - Agent says "the definition doesn't cover this" or "I'll interpret this as…"
 - Agent switches columns, tables, or date anchors not mentioned in the definition
 - Agent produces a result that varies based on an undocumented assumption
 - Agent asks the user to clarify something the glossary/docs should have defined
+- `search_business_context` result contains `catalog_search` (no governed definition → max score 3)
 
 --- CONTEXT TOOL CALLS AND RESULTS ---
 {context_calls}

diff --git a/backend/src/analytics_agent/skills/datahub_skills.py b/backend/src/analytics_agent/skills/datahub_skills.py
@@ -356,6 +356,25 @@ def _save_correction_impl(
 # ---------------------------------------------------------------------------
 
 
+def _is_empty_search_result(result: object) -> bool:
+    """Return True when a result has no hits; None and error dicts count as empty."""
+    if result is None:
+        return True
+    if isinstance(result, list):
+        return len(result) == 0
+    if not isinstance(result, dict):
+        return False
+    if "error" in result:
+        return True
+    # datahub_agent_context tools return {"total": N, "entities": [...]} or {"results": [...]}
+    try:
+        return int(result["total"]) == 0
+    except (KeyError, TypeError, ValueError, OverflowError):
+        # "total" is missing, None, or not a number: judge by the hit lists instead of raising.
+        hits = result.get("entities") or result.get("results") or []
+        return len(hits) == 0
+
+
 def _search_business_context_impl(topic: str) -> dict:
     """Fan out to DataHub docs, glossary terms, domains, and data products for a topic."""
     from analytics_agent.context.datahub import get_datahub_client
@@ -394,6 +413,20 @@ def _search_business_context_impl(topic: str) -> dict:
             except Exception as e:
                 results[label] = {"error": str(e)}
 
+        # When no business documentation exists, fall back to a general catalog search so
+        # the agent can confirm the entity is present before telling the user it's missing.
+        if all(_is_empty_search_result(v) for v in results.values()):
+            try:
+                results["catalog_search"] = search(query=topic, num_results=10)
+                results["note"] = (
+                    "No governed documentation, glossary terms, domains, or data products "
+                    "were found for this topic. Catalog search results are included above — "
+                    "the entity may still exist in DataHub without governance metadata. "
+                    "Use get_entities on any matching URN to confirm existence and fetch schema."
+                )
+            except Exception as e:
+                results["catalog_search"] = {"error": str(e)}
+
     return results
 
 

diff --git a/tests/unit/test_search_business_context.py b/tests/unit/test_search_business_context.py
@@ -0,0 +1,138 @@
+"""
+Unit tests for _search_business_context_impl fallback behaviour (issue #61).
+
+When a dataset exists in DataHub but has no docs / glossary / domain / data-product
+entries, the four business-context sub-searches all return empty.  The impl must
+automatically fall back to a general catalog search and surface the result so the
+agent doesn't incorrectly tell the user the entity "doesn't exist".
+"""
+
+from __future__ import annotations
+
+from contextlib import ExitStack
+from unittest.mock import MagicMock, patch
+
+import pytest
+from analytics_agent.skills.datahub_skills import (
+    _is_empty_search_result,
+    _search_business_context_impl,
+)
+
+# ---------------------------------------------------------------------------
+# _is_empty_search_result
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.parametrize(
+    "result, expected",
+    [
+        (None, True),  # no result object at all
+        ([], True),  # bare list with no items
+        ({}, True),  # no recognised keys → no entities found
+        ({"total": 0, "entities": []}, True),  # explicit zero total
+        ({"total": 1, "entities": [{"urn": "urn:li:dataset:(x,y,PROD)"}]}, False),
+        ({"results": []}, True),  # no "total": falls back to the results list
+        ({"results": [{"urn": "x"}]}, False),
+        ({"error": "something went wrong"}, True),  # error payloads count as empty
+    ],
+)
+def test_is_empty_search_result(result, expected):
+    assert _is_empty_search_result(result) == expected
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+_EMPTY = {"total": 0, "entities": []}  # search payload with zero hits
+_HIT = {  # search payload with exactly one dataset entity
+    "total": 1,
+    "entities": [{"urn": "urn:li:dataset:(urn:li:dataPlatform:hive,SampleHiveDataset,PROD)"}],
+}
+
+
+def _mock_datahub_context():
+    """Return a patcher that swaps DataHubContext for a mock usable in a ``with`` block."""
+    fake_ctx_cls = MagicMock()
+    fake_ctx_cls.return_value.__enter__.return_value = None
+    fake_ctx_cls.return_value.__exit__.return_value = False
+    return patch("datahub_agent_context.context.DataHubContext", fake_ctx_cls)
+
+
+# ---------------------------------------------------------------------------
+# _search_business_context_impl — fallback to catalog search
+# ---------------------------------------------------------------------------
+
+
+def test_fallback_triggered_when_all_empty():
+    """With every governance sub-search empty, the catalog hit and a note are returned."""
+
+    def _fake_search(**call_kwargs):
+        # Calls carrying a filter (glossaryTerm, domain, dataProduct) come back empty;
+        # the un-filtered fallback call is the only one that yields a hit
+        return _EMPTY if call_kwargs.get("filter") else _HIT
+
+    fake_client = MagicMock()
+    # Stub the client factory, the DataHub context, and both search entry points.
+    patchers = (
+        patch(
+            "analytics_agent.context.datahub.get_datahub_client",
+            return_value=fake_client,
+        ),
+        _mock_datahub_context(),
+        patch(
+            "datahub_agent_context.mcp_tools.documents.search_documents",
+            return_value=_EMPTY,
+        ),
+        patch(
+            "datahub_agent_context.mcp_tools.search.search",
+            side_effect=_fake_search,
+        ),
+    )
+    with ExitStack() as stack:
+        for patcher in patchers:
+            stack.enter_context(patcher)
+        outcome = _search_business_context_impl("SampleHiveDataset")
+
+    assert "catalog_search" in outcome, "Fallback catalog_search key must be present"
+    assert outcome["catalog_search"] == _HIT
+    assert "note" in outcome, "A note explaining the fallback must be present"
+
+
+def test_no_fallback_when_business_context_found():
+    """A hit from at least one business-context search suppresses the catalog fallback."""
+
+    def _fake_search(**call_kwargs):
+        # Only the glossary-term lookup finds something; every other call is empty
+        glossary_lookup = "glossaryTerm" in call_kwargs.get("filter", "")
+        return _HIT if glossary_lookup else _EMPTY
+
+    patchers = (
+        patch(
+            "analytics_agent.context.datahub.get_datahub_client",
+            return_value=MagicMock(),
+        ),
+        _mock_datahub_context(),
+        patch(
+            "datahub_agent_context.mcp_tools.documents.search_documents",
+            return_value=_EMPTY,
+        ),
+        patch(
+            "datahub_agent_context.mcp_tools.search.search",
+            side_effect=_fake_search,
+        ),
+    )
+    with ExitStack() as stack:
+        for patcher in patchers:
+            stack.enter_context(patcher)
+        outcome = _search_business_context_impl("SomeMetric")
+
+    assert "catalog_search" not in outcome
+    assert "note" not in outcome
+
+
+def test_returns_error_when_no_client():
+    """Without a configured client the impl short-circuits to an error payload."""
+    with patch("analytics_agent.context.datahub.get_datahub_client", return_value=None):
+        outcome = _search_business_context_impl("anything")
+    assert outcome == {"error": "DataHub is not configured."}