Skip to content

Commit 6817830

Browse files
authored
Merge pull request #129 from agent-diff-bench/fixes-kdd
Annotated Test Suites [KDD 2026]
2 parents 446cd92 + e63faa0 commit 6817830

13 files changed

Lines changed: 1939 additions & 296 deletions

File tree

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -245,3 +245,6 @@ local_experiments/
245245
.claude/**
246246
!.claude/CLAUDE.md
247247
!.claude/settings.json
248+
249+
experiments/kdd 2026
250+

README.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,12 @@ client.delete_env(envId=env.environmentId)
149149
The SDK provides **code execution proxies** — tools for AI agents. You add them to your toolbox in Vercel AI SDK, LangChain, or OpenAI Agents, letting the LLM write Python or Bash code to talk to the Slack or Linear API. Requests are automatically intercepted and routed to isolated test environments. This enables agents to interact with service replicas without any code changes. See more in: **[Python SDK](sdk/agent-diff-python/README.md)**
150150

151151

152+
## Benchmark & Training
153+
154+
- **HuggingFace Dataset**: [hubertmarek/agent-diff-bench](https://huggingface.co/datasets/hubertmarek/agent-diff-bench) — 224 tasks across all 4 services (80/20 train/test split, stratified by service)
155+
- **Prime Intellect Environment**: [agent-diff-bench on Prime Lab](https://app.primeintellect.ai/dashboard/environments/hubert-marek/agent-diff-bench) — run evaluations or RL training via Hosted Training
156+
- **Paper**: [AgentDiff: Agentic API Evaluation via State Differencing (KDD 2026 pre-print)](https://drive.google.com/file/d/1BlmJTSMX7ohwvD1aYBByg7_Y815fgsxp/view?usp=sharing)
157+
152158
## Evaluations & Test Suites
153159

154160
Collections of test cases with assertions that you can run against agent runs using evaluations.

backend/Dockerfile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ RUN echo '#!/bin/sh\n\
2929
python utils/seed_slack_template.py\n\
3030
python utils/seed_linear_template.py\n\
3131
python utils/seed_box_template.py\n\
32+
python utils/seed_calendar_template.py\n\
3233
python utils/seed_tests.py\n\
3334
else\n\
3435
echo "=== Skipping seed (set SEED=true to enable) ==="\n\

backend/src/services/slack/api/methods.py

Lines changed: 25 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -153,6 +153,23 @@ def _slack_error(
153153
raise SlackAPIError(code, status_code, extra)
154154

155155

156+
def _parse_bool_param(value: Any, default: bool = False) -> bool:
157+
"""Safely parse a boolean parameter from JSON (bool) or form data (string).
158+
159+
Handles:
160+
- Boolean values: True/False
161+
- String values: "true"/"false" (case-insensitive)
162+
- None/missing: returns default
163+
"""
164+
if value is None:
165+
return default
166+
if isinstance(value, bool):
167+
return value
168+
if isinstance(value, str):
169+
return value.lower() == "true"
170+
return default
171+
172+
156173
def _resolve_channel_id(channel: str, session=None) -> str:
157174
"""Resolve channel name or ID to channel ID.
158175
@@ -1033,7 +1050,7 @@ async def conversations_list(request: Request) -> JSONResponse:
10331050
except ValueError:
10341051
_slack_error("invalid_arguments")
10351052

1036-
exclude_archived = params.get("exclude_archived", "false").lower() == "true"
1053+
exclude_archived = _parse_bool_param(params.get("exclude_archived"), default=False)
10371054
types_param = params.get("types", "public_channel") # Default: public_channel
10381055

10391056
session = _session(request)
@@ -1146,7 +1163,7 @@ async def conversations_history(request: Request) -> JSONResponse:
11461163
_slack_error("invalid_cursor")
11471164
oldest_param = params.get("oldest")
11481165
latest_param = params.get("latest")
1149-
inclusive = params.get("inclusive", "false").lower() == "true"
1166+
inclusive = _parse_bool_param(params.get("inclusive"), default=False)
11501167

11511168
# Validate channel (required)
11521169
if not channel:
@@ -1270,7 +1287,7 @@ async def conversations_replies(request: Request) -> JSONResponse:
12701287

12711288
oldest_param = params.get("oldest")
12721289
latest_param = params.get("latest")
1273-
inclusive = params.get("inclusive", "false").lower() == "true"
1290+
inclusive = _parse_bool_param(params.get("inclusive"), default=False)
12741291

12751292
oldest_dt = None
12761293
latest_dt = None
@@ -1710,8 +1727,8 @@ async def conversations_open(request: Request) -> JSONResponse:
17101727
async def conversations_info(request: Request) -> JSONResponse:
17111728
params = await _get_params_async(request)
17121729
channel = params.get("channel")
1713-
include_locale = params.get("include_locale", "false").lower() == "true"
1714-
include_num_members = params.get("include_num_members", "false").lower() == "true"
1730+
include_locale = _parse_bool_param(params.get("include_locale"), default=False)
1731+
include_num_members = _parse_bool_param(params.get("include_num_members"), default=False)
17151732

17161733
# Validate channel (required)
17171734
if not channel:
@@ -2283,7 +2300,7 @@ async def users_info(request: Request) -> JSONResponse:
22832300
if user is None:
22842301
_slack_error("user_not_found")
22852302

2286-
include_locale = params.get("include_locale", "false").lower() == "true"
2303+
include_locale = _parse_bool_param(params.get("include_locale"), default=False)
22872304

22882305
session = _session(request)
22892306

@@ -2317,7 +2334,7 @@ async def users_list(request: Request) -> JSONResponse:
23172334
except ValueError:
23182335
_slack_error("invalid_cursor")
23192336

2320-
include_locale = params.get("include_locale", "false").lower() == "true"
2337+
include_locale = _parse_bool_param(params.get("include_locale"), default=False)
23212338
session = _session(request)
23222339
actor = _principal_user_id(request)
23232340
team_id = _get_env_team_id(request, channel_id=None, actor_user_id=actor)
@@ -2857,7 +2874,7 @@ async def search_messages(request: Request) -> JSONResponse:
28572874
if not query_str:
28582875
_slack_error("No query passed")
28592876

2860-
highlight = str(params.get("highlight", "false")).lower() == "true"
2877+
highlight = _parse_bool_param(params.get("highlight"), default=False)
28612878
sort = (params.get("sort") or "score").lower()
28622879
sort_dir = (params.get("sort_dir") or "desc").lower()
28632880
count_param = params.get("count")

0 commit comments

Comments
 (0)