Skip to content

Commit 9244040

Browse files
authored
feat: add Agent Mesh module - heartbeat, health, kill switch (#47)
New axme_sdk/mesh.py with MeshClient class, accessible via client.mesh: - heartbeat() / start_heartbeat() - agent liveness reporting - report_metric() - buffer metrics, flushed with heartbeat - list_agents() / get_agent() - agents with health status - kill() / resume() - instant agent isolation - list_events() - audit log start_heartbeat() runs a daemon thread sending heartbeats every 30s. stop_heartbeat() called automatically on client.close().
1 parent 7e2a789 commit 9244040

3 files changed

Lines changed: 202 additions & 0 deletions

File tree

axme_sdk/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
from .client import AxmeClient, AxmeClientConfig
2+
from .mesh import MeshClient
23
from .exceptions import (
34
AxmeAuthError,
45
AxmeError,
@@ -11,6 +12,7 @@
1112
__all__ = [
1213
"AxmeClient",
1314
"AxmeClientConfig",
15+
"MeshClient",
1416
"AxmeAuthError",
1517
"AxmeError",
1618
"AxmeHttpError",

axme_sdk/client.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,8 +56,19 @@ def __init__(self, config: AxmeClientConfig, *, http_client: httpx.Client | None
5656
headers=self._default_headers(),
5757
)
5858
self._mcp_tool_schemas: dict[str, dict[str, Any]] = {}
59+
self._mesh: "MeshClient | None" = None
60+
61+
@property
62+
def mesh(self) -> "MeshClient":
63+
"""Access Agent Mesh operations (heartbeat, health, kill switch)."""
64+
if self._mesh is None:
65+
from .mesh import MeshClient
66+
self._mesh = MeshClient(self)
67+
return self._mesh
5968

6069
def close(self) -> None:
70+
if self._mesh is not None:
71+
self._mesh.stop_heartbeat()
6172
if self._owns_http_client:
6273
self._http.close()
6374

axme_sdk/mesh.py

Lines changed: 189 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,189 @@
1+
"""Agent Mesh module - heartbeat, health monitoring, metrics reporting."""
2+
from __future__ import annotations
3+
4+
import threading
5+
import time
6+
from typing import Any, TYPE_CHECKING
7+
8+
if TYPE_CHECKING:
9+
from .client import AxmeClient
10+
11+
12+
class MeshClient:
13+
"""Mesh operations for an AxmeClient instance."""
14+
15+
def __init__(self, client: AxmeClient) -> None:
16+
self._client = client
17+
self._heartbeat_thread: threading.Thread | None = None
18+
self._heartbeat_stop = threading.Event()
19+
self._metrics_buffer: dict[str, Any] = {}
20+
21+
# ── Heartbeat ────────────────────────────────────────────────────
22+
23+
def heartbeat(
24+
self,
25+
*,
26+
metrics: dict[str, Any] | None = None,
27+
trace_id: str | None = None,
28+
) -> dict[str, Any]:
29+
"""Send a single heartbeat to the mesh. Optionally include metrics."""
30+
body: dict[str, Any] = {}
31+
if metrics:
32+
body["metrics"] = metrics
33+
return self._client._request_json(
34+
"POST",
35+
"/v1/mesh/heartbeat",
36+
json_body=body if body else None,
37+
retryable=True,
38+
trace_id=trace_id,
39+
)
40+
41+
def start_heartbeat(
42+
self,
43+
*,
44+
interval_seconds: float = 30.0,
45+
include_metrics: bool = True,
46+
) -> None:
47+
"""Start a background thread that sends heartbeats at regular intervals.
48+
49+
Args:
50+
interval_seconds: Seconds between heartbeats (default 30).
51+
include_metrics: Whether to include buffered metrics with each heartbeat.
52+
"""
53+
if self._heartbeat_thread is not None and self._heartbeat_thread.is_alive():
54+
return # Already running
55+
56+
self._heartbeat_stop.clear()
57+
58+
def _loop() -> None:
59+
while not self._heartbeat_stop.wait(timeout=interval_seconds):
60+
try:
61+
metrics = self._flush_metrics() if include_metrics else None
62+
self.heartbeat(metrics=metrics)
63+
except Exception:
64+
pass # Heartbeat failures are non-fatal
65+
66+
self._heartbeat_thread = threading.Thread(
67+
target=_loop, daemon=True, name="axme-mesh-heartbeat",
68+
)
69+
self._heartbeat_thread.start()
70+
71+
def stop_heartbeat(self) -> None:
72+
"""Stop the background heartbeat thread."""
73+
self._heartbeat_stop.set()
74+
if self._heartbeat_thread is not None:
75+
self._heartbeat_thread.join(timeout=5.0)
76+
self._heartbeat_thread = None
77+
78+
# ── Metrics ──────────────────────────────────────────────────────
79+
80+
def report_metric(
81+
self,
82+
*,
83+
success: bool = True,
84+
latency_ms: float | None = None,
85+
cost_usd: float | None = None,
86+
) -> None:
87+
"""Buffer a metric observation. Flushed with next heartbeat."""
88+
buf = self._metrics_buffer
89+
buf["intents_total"] = buf.get("intents_total", 0) + 1
90+
if success:
91+
buf["intents_succeeded"] = buf.get("intents_succeeded", 0) + 1
92+
else:
93+
buf["intents_failed"] = buf.get("intents_failed", 0) + 1
94+
if latency_ms is not None:
95+
# Running average
96+
count = buf["intents_total"]
97+
prev_avg = buf.get("avg_latency_ms", 0.0)
98+
buf["avg_latency_ms"] = prev_avg + (latency_ms - prev_avg) / count
99+
if cost_usd is not None:
100+
buf["cost_usd"] = buf.get("cost_usd", 0.0) + cost_usd
101+
102+
def _flush_metrics(self) -> dict[str, Any] | None:
103+
if not self._metrics_buffer:
104+
return None
105+
metrics = self._metrics_buffer.copy()
106+
self._metrics_buffer.clear()
107+
return metrics
108+
109+
# ── Agent management ─────────────────────────────────────────────
110+
111+
def list_agents(
112+
self,
113+
*,
114+
limit: int = 100,
115+
health: str | None = None,
116+
trace_id: str | None = None,
117+
) -> dict[str, Any]:
118+
"""List all agents in workspace with health status."""
119+
params: dict[str, str] = {"limit": str(limit)}
120+
if health:
121+
params["health"] = health
122+
return self._client._request_json(
123+
"GET",
124+
"/v1/mesh/agents",
125+
params=params,
126+
retryable=True,
127+
trace_id=trace_id,
128+
)
129+
130+
def get_agent(
131+
self,
132+
address_id: str,
133+
*,
134+
trace_id: str | None = None,
135+
) -> dict[str, Any]:
136+
"""Get single agent detail with metrics and events."""
137+
return self._client._request_json(
138+
"GET",
139+
f"/v1/mesh/agents/{address_id}",
140+
retryable=True,
141+
trace_id=trace_id,
142+
)
143+
144+
def kill(
145+
self,
146+
address_id: str,
147+
*,
148+
trace_id: str | None = None,
149+
) -> dict[str, Any]:
150+
"""Kill an agent - block all intents to and from it."""
151+
return self._client._request_json(
152+
"POST",
153+
f"/v1/mesh/agents/{address_id}/kill",
154+
retryable=False,
155+
trace_id=trace_id,
156+
)
157+
158+
def resume(
159+
self,
160+
address_id: str,
161+
*,
162+
trace_id: str | None = None,
163+
) -> dict[str, Any]:
164+
"""Resume a killed agent."""
165+
return self._client._request_json(
166+
"POST",
167+
f"/v1/mesh/agents/{address_id}/resume",
168+
retryable=False,
169+
trace_id=trace_id,
170+
)
171+
172+
def list_events(
173+
self,
174+
*,
175+
limit: int = 50,
176+
event_type: str | None = None,
177+
trace_id: str | None = None,
178+
) -> dict[str, Any]:
179+
"""List recent mesh events (kills, resumes, health changes)."""
180+
params: dict[str, str] = {"limit": str(limit)}
181+
if event_type:
182+
params["event_type"] = event_type
183+
return self._client._request_json(
184+
"GET",
185+
"/v1/mesh/events",
186+
params=params,
187+
retryable=True,
188+
trace_id=trace_id,
189+
)

0 commit comments

Comments
 (0)