Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
e32939d
test(eval): GT-completeness pass (G3-1) — add 2 verified real bugs mi…
grauwolf32 Jun 7, 2026
6092730
fix(eval): correct vaultpay source_root (vault-pay/vault-pay -> vault…
grauwolf32 Jun 7, 2026
3d80ae0
feat(eval): dated per-run archive so eval results are never overwritten
grauwolf32 Jun 7, 2026
dbf7c8b
feat(agents): FS_HEAVY_KEEP_LAST_N env override for the elision count…
grauwolf32 Jun 7, 2026
3ede881
feat(eval): pass@N loop for the trace eval (CONTRACTOR_EVAL_TRACE_PAS…
grauwolf32 Jun 7, 2026
95f6611
feat(trace): converge prompt + task-prompt fix to stop annotate->rest…
grauwolf32 Jun 7, 2026
51f40bb
feat(trace): promote converge prompt to active (v7 -> converge)
grauwolf32 Jun 7, 2026
1f4bfd9
feat(trace): leaned planner-scoped task v3 + task-version env override
grauwolf32 Jun 7, 2026
e54d328
feat(trace): promote task v3 to active (v1 -> v3)
grauwolf32 Jun 7, 2026
63281d8
fix(eval): F4 — rebuild truncated xbow_exploit envelope from per-benc…
grauwolf32 Jun 7, 2026
7af0236
test(eval): F3 — wire held-out realvuln fixture (extremely-vulnerable…
grauwolf32 Jun 7, 2026
104b11d
test(eval): X1 — env-gated OAS-in-prompt knob (CONTRACTOR_EVAL_WITH_OAS)
grauwolf32 Jun 7, 2026
4618c6f
fix(likec4): bare {id} in builder prompt v3 crashed the workflow (ADK…
grauwolf32 Jun 7, 2026
e64acaf
test(eval): X2 — add generated LikeC4 model for vulnyapi (enables thr…
grauwolf32 Jun 7, 2026
777c636
fix(callbacks): guardrail no longer short-circuits the after-model chain
grauwolf32 Jun 10, 2026
e1841b2
fix(fs): correct ** glob semantics in MemoryOverlayFileSystem
grauwolf32 Jun 10, 2026
3e4f0c9
fix(runners): retry on iteration exceptions, surface artifact issues,…
grauwolf32 Jun 10, 2026
fcf7e96
fix(workflows): trace-verify probes all trace namespace prefixes; uni…
grauwolf32 Jun 10, 2026
de95fce
fix(runners): best-effort event delivery, metrics retry pairing, chec…
grauwolf32 Jun 10, 2026
287afa3
fix(workflows): dedup crash paths, stable refs, app_name, pathpar for…
grauwolf32 Jun 10, 2026
3b259fb
fix(tasks,callbacks): finish summarizer hygiene, planner message cont…
grauwolf32 Jun 10, 2026
2a95169
fix(eval,scripts): eval gate, envelope rebuild layouts, sink collisio…
grauwolf32 Jun 10, 2026
44c731c
fix(cli,http,oas_analyzer): keep UI alive past non-terminal failures,…
grauwolf32 Jun 10, 2026
08c30f9
fix(runners): fail-fast skills, once-per-task injection, state hygien…
grauwolf32 Jun 10, 2026
aa22fc7
fix(tools): honest skip errors, early input_schema check, lazy likec4…
grauwolf32 Jun 10, 2026
38390f9
fix(fs): walk symlink filtering, TOCTOU-safe paths, fork delete basel…
grauwolf32 Jun 10, 2026
d104874
fix(callbacks): RPM window roll, token history flush, real validate()…
grauwolf32 Jun 10, 2026
55c56d4
fix(agents,workflows): distinct tool descriptions, prompt corrections…
grauwolf32 Jun 10, 2026
d971e28
fix(eval,scripts): partial results on timeout, honest cost reporting
grauwolf32 Jun 10, 2026
1c3f9d4
docs: sync all docs/ with current code
grauwolf32 Jun 10, 2026
437d500
refactor(workflows): restore trace-direct baseline, drop dead knobs, …
grauwolf32 Jun 10, 2026
048e6c2
refactor(callbacks,agents): extract chain_after_model_callback + shar…
grauwolf32 Jun 10, 2026
eb071d4
refactor(workflows): extract shared YAML findings-artifact loaders
grauwolf32 Jun 10, 2026
43620ee
feat(trace): post-diff split — annotate-only trace stage + vuln_analy…
grauwolf32 Jun 10, 2026
c93170b
feat(trace): router-prefix path grouping for coverage budgeting
grauwolf32 Jun 10, 2026
720a99d
feat(vuln): vuln-sweep — recall-oriented two-pass BFS sweep + DFS trace
grauwolf32 Jun 10, 2026
047a489
docs: add planner.md — streamline planner & task runner deep dive
grauwolf32 Jun 10, 2026
4d96c13
docs(planner): add 'Variations worth testing' + 'Decisions after curr…
grauwolf32 Jun 10, 2026
fa279a9
docs(planner): fold in review — trust-mechanism axis, V8/V9/V10, meth…
grauwolf32 Jun 10, 2026
862f5e0
docs(planner): add §9.4 — execution / records / scheduler seams
grauwolf32 Jun 10, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
97 changes: 0 additions & 97 deletions RESUME.md

This file was deleted.

105 changes: 42 additions & 63 deletions cli/fs.py
Original file line number Diff line number Diff line change
@@ -1,68 +1,12 @@
import os
import re
from collections.abc import Iterator
from typing import Any

from fsspec.implementations.local import LocalFileSystem, stringify_path

from contractor.tools.fs.globmatch import glob_to_regex
from contractor.utils.formatting import norm_unicode


def _translate_glob_segment(seg: str) -> str:
"""Translate one glob path segment to regex, never crossing ``/``."""
out: list[str] = []
i, n = 0, len(seg)
while i < n:
c = seg[i]
if c == "*":
out.append("[^/]*")
elif c == "?":
out.append("[^/]")
elif c == "[":
j = i + 1
if j < n and seg[j] == "!":
j += 1
if j < n and seg[j] == "]":
j += 1
while j < n and seg[j] != "]":
j += 1
if j >= n: # no closing bracket: treat '[' literally
out.append(re.escape(c))
else:
inner = seg[i + 1 : j]
if inner.startswith("!"):
inner = "^" + inner[1:]
out.append("[" + inner + "]")
i = j + 1
continue
else:
out.append(re.escape(c))
i += 1
return "".join(out)


def _glob_to_regex(pattern: str) -> "re.Pattern[str]":
    """
    Build a compiled, path-aware regex from a glob pattern.

    The pattern is split on ``/`` and handled one segment at a time. A
    segment that is exactly ``**`` spans any number of path segments
    (including none); every other segment is translated so that
    ``*``/``?``/``[...]`` stay inside a single segment. The result is
    anchored at the end and is meant for relative paths without a leading
    ``/``.
    """
    pieces = pattern.split("/")
    final = len(pieces) - 1
    chunks: list[str] = []
    for pos, piece in enumerate(pieces):
        is_final = pos == final
        if piece != "**":
            chunks.append(_translate_glob_segment(piece))
            if not is_final:
                chunks.append("/")
        elif is_final:
            # A trailing ``**`` swallows everything beneath, at any depth.
            chunks.append(".*")
        else:
            # ``**/`` is zero or more whole segments; the separator is part
            # of the group, so no extra "/" is appended here.
            chunks.append("(?:[^/]*/)*")
    return re.compile("(?s:" + "".join(chunks) + r")\Z")
from contractor.utils.settings import get_settings


class RootedLocalFileSystem(LocalFileSystem):
Expand Down Expand Up @@ -130,7 +74,11 @@ def _strip_protocol(self, path: str) -> str:
resolved = os.path.realpath(candidate)

if self._is_within_sandbox(resolved):
return candidate
# Return the *resolved* path — the exact path that was validated —
# so the later open()/stat() cannot re-resolve a symlink component
# swapped in after this check (check-then-use TOCTOU). In-sandbox
# symlinks still work: they resolve to their (validated) target.
return resolved

return self._blocked_path

Expand Down Expand Up @@ -164,6 +112,11 @@ def walk(
# Prune symlinked directories so os.walk never descends into them.
dirs[:] = [d for d in dirs if self._is_safe_entry(current_root, d)]

# Hide symlinked files too (same policy as ls/glob): their content
# is already unreadable through the sandbox, so leaking the names
# would only disclose the existence of out-of-sandbox targets.
files = [f for f in files if self._is_safe_entry(current_root, f)]

yield self._to_virtual(real_root), dirs, files

def ls(
Expand Down Expand Up @@ -201,17 +154,35 @@ def glob(self, pattern: str, **kwargs: Any) -> list[str]:

Returns virtual paths such as ``/file.txt`` or ``/dir/inner.txt``.
"""
matches, _truncated = self.glob_scanned(pattern)
return matches

def glob_scanned(
self, pattern: str, max_files: int | None = None
) -> tuple[list[str], bool]:
"""``glob`` plus a truncation flag.

The tree walk is hard-bounded at *max_files* scanned files (default:
``Settings.fs_max_files_per_walk``) so a glob over a huge repo cannot
run away. The flag is ``True`` when the ceiling was hit, i.e. the
match list may be incomplete.
"""
if not pattern:
return []
return [], False

pattern = norm_unicode(pattern.lstrip("/")) or ""

# Reject obvious traversal attempts.
if ".." in pattern.split("/"):
return []
return [], False

if max_files is None:
max_files = get_settings().fs_max_files_per_walk

regex = _glob_to_regex(pattern)
regex = glob_to_regex(pattern)
matches: set[str] = set()
scanned = 0
truncated = False

# Always walk the full tree: a non-recursive pattern like ``sub/*.py``
# still needs to descend into ``sub``. The regex is path-aware, so a
Expand All @@ -225,6 +196,11 @@ def glob(self, pattern: str, **kwargs: Any) -> list[str]:
rel_root = ""

for name in files:
if scanned >= max_files:
truncated = True
break
scanned += 1

normalized_name = norm_unicode(name) or name
host_path = os.path.join(host_root, normalized_name)

Expand All @@ -239,4 +215,7 @@ def glob(self, pattern: str, **kwargs: Any) -> list[str]:
if regex.match(rel_path):
matches.add("/" + rel_path)

return sorted(matches)
if truncated:
break

return sorted(matches), truncated
18 changes: 16 additions & 2 deletions cli/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,15 @@ def _project_artifacts_dir(base: Path, project_path: Path) -> Path:
"opentelemetry",
)

_UI_STOP_EVENTS = frozenset({"run_finished", "task_failed", "workflow_finished"})
# Only the single, truly-terminal workflow event stops the live UI. Both
# ``run_finished`` (per TaskRunner.run(), fired once per finding in multi-run
# workflows) and ``task_failed`` (per-finding failure that the workflow catches
# and continues past) happen mid-workflow — stopping on them froze the UI and,
# because the handler returned early, suppressed every later event from both the
# live render and the print fallback. ``workflow_finished`` is emitted exactly
# once in ``Workflow.run()``'s finally block (even on abort), so it is the only
# safe place to tear the renderer down.
_UI_STOP_EVENTS = frozenset({"workflow_finished"})

# High-volume / non-user-facing events. Persisted to metrics.jsonl when they
# match, but never forwarded to the live UI (they would just flood it).
Expand Down Expand Up @@ -201,7 +209,13 @@ async def async_main(
checkpoint_path=checkpoint_path,
)

runner = workflow_cls(ctx)
try:
runner = workflow_cls(ctx)
except ValueError as exc:
# Some workflows (e.g. ExploitabilityWorkflow without a target URL)
# validate their context in __init__. Surface that as a clean CLI
# error instead of an uncaught traceback.
raise click.UsageError(str(exc)) from exc
handler = _build_event_handler(output_dir, workflow, enable_ui=enable_ui)

with observability.run_context(
Expand Down
2 changes: 2 additions & 0 deletions cli/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,8 @@ def _event_to_record(event: TaskRunnerEvent) -> dict[str, Any]:
"task_name": getattr(event, "task_name", None),
"task_id": getattr(event, "task_id", None),
}
# Intentional: setdefault means payload keys that shadow envelope keys
# ("type", "task_name", ...) are dropped — the envelope always wins.
for key, value in payload_dict.items():
record.setdefault(key, value)
return record
Expand Down
39 changes: 8 additions & 31 deletions contractor/agents/exploitability_agent/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

from contractor.agents.worker_factory import build_worker
from contractor.callbacks import default_tool
from contractor.callbacks.adapter import CallbackAdapter
from contractor.callbacks.adapter import chain_after_model_callback
from contractor.callbacks.guardrails import MandatoryToolCallback
from contractor.tools.caido import caido_tools
from contractor.tools.code import attach_graph_tools_if_local, code_tools
Expand All @@ -18,6 +18,8 @@
from contractor.tools.memory import MemoryFormat, memory_tools
from contractor.tools.podman import code_exec_tools
from contractor.tools.vuln import (
READ_ONLY_VULN_TOOL_NAMES,
VERDICT_TOOL_NAMES,
VerifiedFindingFormat,
VulnerabilityReportFormat,
verification_tools,
Expand All @@ -27,12 +29,6 @@

EXPLOIT_PROMPT: Final[str] = load_prompt("exploitability_agent")

_READ_ONLY_VULN_TOOL_NAMES: frozenset[str] = frozenset(
{"get_vulnerability", "list_vulnerabilities"}
)

_VERDICT_TOOL_NAMES: list[str] = ["submit_verdict", "report_verification"]

_ELIDE_TOOLS: list[str] = [
"read_file", "grep", "glob", "list_symbols",
"http_request", "http_read_body",
Expand Down Expand Up @@ -112,7 +108,7 @@ def build_exploitability_agent(
name=src_ns,
fmt=VulnerabilityReportFormat(_format=_format),
)
if t.__name__ in _READ_ONLY_VULN_TOOL_NAMES
if t.__name__ in READ_ONLY_VULN_TOOL_NAMES
]

verif_tools = verification_tools(
Expand Down Expand Up @@ -150,28 +146,9 @@ def build_exploitability_agent(
elide_keep_last_n=elide_keep_last_n,
)

mandatory_cb = MandatoryToolCallback(tool_names=_VERDICT_TOOL_NAMES, max_nudges=3)
adapter = CallbackAdapter(agent_name=name)
adapter.register(mandatory_cb)
extra_callbacks = adapter()
if "after_model_callback" in extra_callbacks:
existing = agent.after_model_callback
new_cb = extra_callbacks["after_model_callback"]
if existing is not None:
original = existing
def _chain(callback_context, llm_response, _orig=original, _new=new_cb):
result = _orig(
callback_context=callback_context,
llm_response=llm_response,
)
if result is not None:
return result
return _new(
callback_context=callback_context,
llm_response=llm_response,
)
agent.after_model_callback = _chain
else:
agent.after_model_callback = new_cb
chain_after_model_callback(
agent,
MandatoryToolCallback(tool_names=list(VERDICT_TOOL_NAMES), max_nudges=3),
)

return agent
9 changes: 8 additions & 1 deletion contractor/agents/http_agent/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,14 @@
HTTP_PROMPT: Final[str] = load_prompt("http_agent")

_SUMMARIZATION_BULLETS: Final[str] = (
"You have reached context limit. Summarize your progress and call report tool."
"You have reached the context limit. Summarize your progress:\n"
"1. Subtask objective as you understand it\n"
"2. Requests issued so far (method + URL) and the key responses observed\n"
"3. Findings worth keeping — persist them to memory before stopping\n"
"4. Open questions or blockers\n"
"5. Smallest concrete next step to resume the flow\n"
"Then return the structured result. Include only claims supported by "
"tool output; mark anything inferred as such.\n"
)

def build_http_agent(
Expand Down
2 changes: 1 addition & 1 deletion contractor/agents/likec4_builder_agent/prompts/v3.md
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,7 @@ Inbound entry points:
- In the relationship title, name the specific vulnerability if present:
"POST /notes/search (unauthenticated, SQL injection via q param)"
"GET /admin/users (no RBAC — any auth user can access)"
"DELETE /notes/{id} (no ownership check)"
"DELETE /notes/{note-id} (no ownership check)"

Outbound calls:
- Protocol, transport (TLS?), credential type in the title:
Expand Down
Loading
Loading