Dhee/simulate_usecases.py at main · Sankhya-AI/Dhee · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
"""
Dhee HyperAgent Use Case Simulation
Tests all claims from README as a real user would experience them.
No pytest — direct Python simulation with clear pass/fail reporting.

Simulates real user scenarios:
  UC1: Dev agent remembers preferences across sessions
  UC2: Performance regression detection
  UC3: Insight synthesis ("what worked") transfers to future sessions
  UC4: Prospective memory fires at the right moment
  UC5: Cross-agent handoff (Claude Code → Cursor)
"""

import sys
import os
import json
import tempfile

os.chdir("/Users/chitranjanmalviya/Desktop/Dhee")
sys.path.insert(0, ".")

PASS = "  [PASS]"
FAIL = "  [FAIL]"
WARN = "  [WARN]"
INFO = "  [INFO]"

results = []


def report(label, status, detail=""):
    symbol = {"PASS": PASS, "FAIL": FAIL, "WARN": WARN, "INFO": INFO}[status]
    line = f"{symbol} {label}"
    if detail:
        line += f"\n         → {detail}"
    print(line)
    results.append((label, status))


print("=" * 65)
print(" DHEE HyperAgent — Real Use Case Simulation")
print("=" * 65)

# =========================================================================
# [A] IMPORT / SDK SURFACE
# =========================================================================
print("\n[A] SDK Surface — what does 'from dhee import ...' actually give you?")

try:
    from dhee import Dhee  # noqa: F401
    report("from dhee import Dhee (4-tool API from README)", "PASS")
except ImportError as e:
    report("from dhee import Dhee (4-tool API from README)", "FAIL",
           "README shows `from dhee import Dhee` but no 'Dhee' class exists. "
           "README must be updated → use Engram/FullMemory instead.")

try:
    from dhee import Engram, CoreMemory, FullMemory
    report("from dhee import Engram, CoreMemory, FullMemory", "PASS")
except ImportError as e:
    report("from dhee import Engram, CoreMemory, FullMemory", "FAIL", str(e))

# =========================================================================
# [B] USE CASE 1 — "My agent remembers I like dark mode across sessions"
# (using mock/offline mode, no API key)
# =========================================================================
print("\n[B] USE CASE 1 — Persistent memory (offline, hash embeddings)")
print("    Scenario: dev agent stores 5 facts about the user, then retrieves them.")

from dhee import Engram
eng = Engram(in_memory=True)

# Store memories with infer=False (bypasses LLM extraction)
facts = [
    "User prefers dark mode for all IDEs",
    "Project uses PostgreSQL 15 with pgvector extension",
    "User prefers FastAPI over Flask for Python web APIs",
    "Auth system uses JWT tokens with 15-minute expiry",
    "User codes in Python primarily, TypeScript for frontend",
]
mem_ids = []
for fact in facts:
    r = eng.add(fact, user_id="dev_user", infer=False)
    mid = None
    if isinstance(r, dict):
        rs = r.get("results", [])
        if rs:
            mid = rs[0].get("id")
    mem_ids.append(mid)

stored_count = sum(1 for mid in mem_ids if mid)
report(
    f"Store 5 facts with infer=False: {stored_count}/5 stored",
    "PASS" if stored_count == 5 else "FAIL",
    f"IDs: {[m[:8] if m else 'None' for m in mem_ids]}",
)

# Retrieve
ga_raw = eng.get_all(user_id="dev_user")
# get_all() returns dict {'results': [...]} despite being typed as List
all_mems = ga_raw.get("results", []) if isinstance(ga_raw, dict) else ga_raw
count = len(all_mems)
report(
    f"get_all() returns {count} memories (note: returns dict, not list as typed)",
    "PASS" if count >= 5 else "FAIL",
    f"Sample: {all_mems[0].get('memory', all_mems[0].get('content', ''))[:60] if all_mems else 'none'}",
)

# Exact-ish phrasing search (hash embeddings — similar words needed)
sr = eng.search("dark mode", user_id="dev_user", limit=3)
found = any("dark" in (r.get("memory", r.get("content", ""))).lower() for r in sr)
report(
    "recall('dark mode') → finds 'dark mode' memory",
    "PASS" if found else "FAIL",
    f"Top result: {sr[0].get('memory', sr[0].get('content',''))[:60] if sr else 'empty'}",
)

sr2 = eng.search("PostgreSQL database", user_id="dev_user", limit=3)
found2 = any("postgres" in (r.get("memory", r.get("content", ""))).lower() for r in sr2)
report(
    "recall('PostgreSQL database') → finds 'PostgreSQL' memory",
    "PASS" if found2 else "FAIL",
    f"Top result: {sr2[0].get('memory', sr2[0].get('content',''))[:60] if sr2 else 'empty'}",
)

# Cross-phrasing test (hash embeddings — will NOT match without real LLM echo)
sr3 = eng.search("what theme does the user like?", user_id="dev_user", limit=3)
found3 = any("dark" in (r.get("memory", r.get("content", ""))).lower() for r in sr3)
report(
    "recall('what theme does the user like?') → finds 'dark mode' (cross-phrasing)",
    "PASS" if found3 else "WARN",
    (
        "EXPECTED with real LLM + echo enrichment at checkpoint. "
        "Hash embeddings can't cross-phrase without LLM-generated paraphrases. "
        "This claim is VALID but only after checkpoint() runs echo enrichment."
        if not found3 else f"Found: {sr3[0].get('memory', '')[:60]}"
    ),
)

# =========================================================================
# [C] USE CASE 2 — "Agent notices its code review quality is regressing"
# =========================================================================
print("\n[C] USE CASE 2 — Performance regression detection (Buddhi)")
print("    Scenario: 8 code reviews over time, quality drops → warning fires")

from dhee.core.buddhi import Buddhi

buddhi = Buddhi(data_dir=tempfile.mkdtemp(prefix="dhee_b_"))

# Session 1-3: good performance
for i, score in enumerate([0.92, 0.88, 0.90], 1):
    buddhi.record_outcome("dev_user", "code_review", score)

# Session 4-6: gradual decline
for i, score in enumerate([0.75, 0.65, 0.50], 4):
    insight = buddhi.record_outcome("dev_user", "code_review", score)

# Session 7-8: clear regression
insight = buddhi.record_outcome("dev_user", "code_review", 0.35)
report(
    "record_outcome() tracks 7 code review sessions",
    "PASS",
    f"buddhi stats: {buddhi.get_stats()}",
)

# Now get context at session 8
ctx = buddhi.get_hyper_context(user_id="dev_user", task_description="reviewing PR #42")
warnings = ctx.warnings
has_regression_warning = any("code_review" in w.lower() or "declining" in w.lower() for w in warnings)
report(
    "get_hyper_context() warns about code_review regression",
    "PASS" if has_regression_warning else "FAIL",
    f"Warnings: {warnings[:2]}",
)

# Performance snapshot available
has_perf = len(ctx.performance) > 0
snap = ctx.performance[0] if has_perf else None
report(
    "Performance snapshot for 'code_review' available",
    "PASS" if has_perf else "FAIL",
    f"trend={snap.trend:.3f}, avg={snap.avg_score:.2f}, attempts={snap.total_attempts}" if snap else "none",
)

# =========================================================================
# [D] USE CASE 3 — "What worked last time transfers to this session"
# =========================================================================
print("\n[D] USE CASE 3 — Insight synthesis (reflect)")
print("    Scenario: agent learns 'git blame first' worked, surfaces it next time")

reflections = buddhi.reflect(
    user_id="dev_user",
    task_type="bug_fix",
    what_worked="git blame showed the exact commit that broke auth — always check blame first",
    what_failed="grep was too slow on the 500k-line monorepo — use ast-grep instead",
    key_decision="Switched to JWT with 15-min TTL after reviewing OWASP session guidance",
)
report(
    "reflect() stores 3 insights (worked, failed, decision)",
    "PASS" if len(reflections) == 3 else "FAIL",
    f"Got {len(reflections)} insights",
)
for ins in reflections:
    print(f"         → [{ins.insight_type}] {ins.content[:90]}")

# Next session: start on similar task
ctx2 = buddhi.get_hyper_context(user_id="dev_user", task_description="fixing authentication bug")
relevant_insights = [i for i in ctx2.insights if "bug_fix" in i.source_task_types]
report(
    "Next session 'auth bug' surfaces bug_fix insights",
    "PASS" if len(relevant_insights) >= 2 else "FAIL",
    f"{len(relevant_insights)} relevant insights surfaced",
)
if relevant_insights:
    print(f"         → Top insight: {relevant_insights[0].content[:90]}")

# =========================================================================
# [E] USE CASE 4 — "Remember to run auth tests when touching login.py"
# =========================================================================
print("\n[E] USE CASE 4 — Prospective memory (intentions)")
print("    Scenario: dev says 'remember to run auth tests after login.py changes'")

# Store intention via checkpoint (as a real user would)
intent = buddhi.store_intention(
    user_id="dev_user",
    description="run auth tests after any login.py change",
    trigger_keywords=["login", "auth"],
    action_payload="Run: pytest tests/test_auth.py -v",
)
report("store_intention() stores prospective trigger", "PASS", f"ID: {intent.id[:8]}...")

# Auto-detect from natural language
detected = buddhi.detect_intention_in_text(
    "Remember to invalidate sessions when changing JWT secret", "dev_user"
)
report(
    "Auto-detect: 'Remember to X when Y' → stored as intention",
    "PASS" if detected else "FAIL",
    f"Detected: {detected.description[:70]}" if detected else "None",
)

# Context with auth keywords → intention fires
ctx3 = buddhi.get_hyper_context(
    user_id="dev_user",
    task_description="fixing auth bug in login.py — need to update token validation",
)
fired = [i for i in ctx3.intentions if "auth" in i.description.lower() or "login" in i.description.lower()]
report(
    "Intention fires when task mentions 'auth' + 'login'",
    "PASS" if fired else "FAIL",
    f"Fired: {fired[0].description[:70]}" if fired else "No intentions fired",
)

# Context WITHOUT trigger keywords → intention should NOT fire
ctx4 = buddhi.get_hyper_context(
    user_id="dev_user",
    task_description="updating CSS styles for dashboard page",
)
not_fired = all("auth" not in i.description.lower() for i in ctx4.intentions)
report(
    "Intention does NOT fire for unrelated task (CSS styling)",
    "PASS" if not_fired else "FAIL",
    f"Intentions triggered for unrelated task: {[i.description[:40] for i in ctx4.intentions]}",
)

# =========================================================================
# [F] USE CASE 5 — "Claude Code crashes → Cursor picks up instantly"
# =========================================================================
print("\n[F] USE CASE 5 — Cross-agent handoff (session digest)")
print("    Scenario: Claude Code saves state → Cursor reads and continues")

try:
    from dhee.core.kernel import save_session_digest, get_last_session

    # Claude Code saves its session digest
    result = save_session_digest(
        task_summary="Refactoring auth middleware: extracted JWT validation to separate module",
        agent_id="claude-code",
        repo="/projects/my-saas",
        status="paused",
        decisions_made=[
            "JWT validation moved to auth/jwt.py",
            "Middleware now delegates to jwt.validate()",
        ],
        files_touched=["src/middleware/auth.py", "src/auth/jwt.py"],
        todos_remaining=[
            "Add refresh token endpoint",
            "Update integration tests",
        ],
    )
    report(
        "save_session_digest() saves state",
        "PASS" if result.get("status") == "saved" else "FAIL",
        f"session_id: {result.get('session_id', 'N/A')[:12]}...",
    )

    # Cursor (different agent) reads it
    last = get_last_session(agent_id="claude-code", repo="/projects/my-saas")
    if last:
        report(
            "get_last_session() retrieves Claude Code's session",
            "PASS",
            f"Summary: {last.get('task_summary', '')[:70]}",
        )
        todos = last.get("todos", [])
        report(
            f"Handoff includes {len(todos)} TODOs for next agent",
            "PASS" if len(todos) >= 2 else "FAIL",
            f"TODOs: {todos[:2]}",
        )
    else:
        report("get_last_session() retrieves saved session", "FAIL", f"Got: {last}")

except Exception as e:
    report("Cross-agent handoff", "FAIL", str(e))
    import traceback
    traceback.print_exc()

# =========================================================================
# [G] MCP 4-TOOL API — end-to-end simulation via handlers
# =========================================================================
print("\n[G] MCP 4-Tool API — end-to-end simulation (as Claude/Cursor uses it)")
print("    Simulating: context → remember → remember → recall → checkpoint")

import dhee.mcp_slim as slim
# Use the same in-memory Engram instance so vector search works offline
mcp_eng = Engram(in_memory=True)
slim._memory = mcp_eng._memory
slim._buddhi = Buddhi(data_dir=tempfile.mkdtemp(prefix="dhee_mcp_"))

# Step 1: context (session bootstrap)
ctx_result = slim._handle_context({"task_description": "implementing feature flags", "user_id": "mcp_user"})
report(
    "context() returns structured HyperContext",
    "PASS" if "meta" in ctx_result else "FAIL",
    f"Keys: {list(ctx_result.keys())}",
)

# Step 2: remember (store facts)
r1 = slim._handle_remember({"content": "User wants per-environment feature flags", "user_id": "mcp_user"})
r2 = slim._handle_remember({"content": "Feature flags stored in Redis with 5-min TTL", "user_id": "mcp_user"})
r3 = slim._handle_remember({"content": "Remember to invalidate cache when flipping flags in production", "user_id": "mcp_user"})

report(
    "remember() x3 — all stored with IDs",
    "PASS" if all(r.get("stored") for r in [r1, r2, r3]) else "FAIL",
    f"IDs: {[r.get('id', 'N/A')[:8] for r in [r1, r2, r3]]}",
)

# Check if 'remember to' was auto-detected as intention
detected_intent = r3.get("detected_intention")
report(
    "remember() auto-detects intention in 'remember to X when Y'",
    "PASS" if detected_intent else "FAIL",
    f"Detected: {detected_intent.get('description', '')[:60] if detected_intent else 'None'}",
)

# Step 3: recall
recall_result = slim._handle_recall({"query": "feature flag storage", "user_id": "mcp_user", "limit": 5})
has_results = len(recall_result.get("memories", [])) > 0
report(
    "recall('feature flag storage') returns memories",
    "PASS" if has_results else "FAIL",
    f"count={recall_result.get('count', 0)}, top={recall_result['memories'][0]['memory'][:60] if has_results else 'none'}",
)

# Step 4: checkpoint (save + outcome + insights + intention)
cp = slim._handle_checkpoint({
    "summary": "Implemented per-environment feature flags with Redis backend",
    "status": "completed",
    "task_type": "feature",
    "outcome_score": 0.85,
    "what_worked": "Redis TTL approach eliminated stale flag issues in staging",
    "what_failed": "Local env needed mock Redis — added docker-compose override",
    "key_decision": "Per-environment flags > global flags for zero-downtime deploys",
    "remember_to": "run feature flag integration tests before deploying to prod",
    "trigger_keywords": ["deploy", "production", "prod"],
    "decisions": ["Redis TTL=300s", "Per-env config", "Fallback to defaults on timeout"],
    "files_touched": ["src/flags.py", "redis_config.py", "tests/test_flags.py"],
    "user_id": "mcp_user",
})
report(
    "checkpoint() — session + outcome + insights + intention",
    "PASS" if not cp.get("error") else "FAIL",
    f"Keys: {list(cp.keys())}",
)
report(
    "checkpoint.outcome_recorded = True",
    "PASS" if cp.get("outcome_recorded") else "WARN",
)
report(
    "checkpoint.insights_created > 0",
    "PASS" if cp.get("insights_created", 0) > 0 else "FAIL",
    f"{cp.get('insights_created', 0)} insights",
)
report(
    "checkpoint.intention_stored (remember to run tests)",
    "PASS" if cp.get("intention_stored") else "FAIL",
    str(cp.get("intention_stored", {}).get("description", ""))[:60],
)

# =========================================================================
# [H] MEMORY DECAY — FadeMem
# =========================================================================
print("\n[H] Memory Decay — FadeMem / Ebbinghaus")

decay = eng.forget(user_id="dev_user")
required = {"forgotten", "promoted", "decayed"}
has_required = required.issubset(set(decay.keys()))
report(
    "forget() returns decay metrics (forgotten, promoted, decayed)",
    "PASS" if has_required else "FAIL",
    str({k: decay.get(k) for k in required}),
)

# Verify stats reflect decay
stats = eng.stats(user_id="dev_user")
report(
    "stats() returns memory stats",
    "PASS" if isinstance(stats, dict) and stats.get("total", 0) > 0 else "FAIL",
    f"total={stats.get('total', 0)}, sml={stats.get('sml_count', '?')}, lml={stats.get('lml_count', '?')}",
)

# =========================================================================
# FINAL SUMMARY
# =========================================================================
print("\n" + "=" * 65)
print(" FINAL REPORT")
print("=" * 65)

pass_count = sum(1 for _, s in results if s == "PASS")
fail_count = sum(1 for _, s in results if s == "FAIL")
warn_count = sum(1 for _, s in results if s == "WARN")

print(f"\n  Total checks: {len(results)}")
print(f"  PASS:  {pass_count}  (works as claimed)")
print(f"  WARN:  {warn_count}  (works but claim needs nuance)")
print(f"  FAIL:  {fail_count}  (broken)")

if fail_count:
    print("\n  FAILURES:")
    for label, status in results:
        if status == "FAIL":
            print(f"    - {label}")

if warn_count:
    print("\n  WARNINGS / NUANCES:")
    for label, status in results:
        if status == "WARN":
            print(f"    - {label}")

print()