Skip to content

Commit a217c3e

Browse files
hiskudin and claude authored and committed
fix(defender): sync hasThreats blocking logic and tool rules precedence from JS package
- Add has_threats guard so base risk from tool rules alone does not block safe content when block_high_risk is enabled
- Custom config tool_rules now take precedence over the use_default_tool_rules flag
- Add TestUseDefaultToolRules integration tests to cover both behaviours

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent aec0c5b commit a217c3e

2 files changed

Lines changed: 53 additions & 2 deletions

File tree

src/stackone_defender/core/prompt_defense.py

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ def __init__(
5555
if block_high_risk:
5656
self._config.block_high_risk = True
5757

58-
tool_rules = self._config.tool_rules if use_default_tool_rules else []
58+
tool_rules = (config or {}).get("tool_rules") or (self._config.tool_rules if use_default_tool_rules else [])
5959

6060
self._tool_sanitizer: ToolResultSanitizer = create_tool_result_sanitizer(
6161
risky_fields=self._config.risky_fields,
@@ -120,7 +120,20 @@ def defend_tool_result(self, value: Any, tool_name: str) -> DefenseResult:
120120
tier2_idx = _RISK_LEVELS.index(tier2_risk)
121121
risk_level = _RISK_LEVELS[max(tier1_idx, tier2_idx)]
122122

123-
allowed = not self._config.block_high_risk or risk_level not in ("high", "critical")
123+
# Determine whether any threat signals were found (Tier 1 or Tier 2).
124+
# fields_sanitized captures sanitization methods (role stripping, encoding detection, etc.)
125+
# that may fire without adding named pattern detections, so we include it here.
126+
has_threats = (
127+
len(detections) > 0
128+
or len(fields_sanitized) > 0
129+
or (tier2_score is not None and tier2_score >= self._config.tier2.high_risk_threshold)
130+
)
131+
132+
# Three cases for allowed:
133+
# 1. block_high_risk is off -> always allow
134+
# 2. No threat signals found -> allow (base risk from tool rules alone does not block)
135+
# 3. Risk did not reach high/critical -> allow
136+
allowed = not self._config.block_high_risk or not has_threats or risk_level not in ("high", "critical")
124137

125138
return DefenseResult(
126139
allowed=allowed,

tests/test_integration.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,44 @@ def test_returns_latency(self):
147147
assert result.latency_ms > 0
148148

149149

150+
class TestUseDefaultToolRules:
151+
def test_does_not_apply_tool_rules_by_default(self):
152+
defense = create_prompt_defense()
153+
data = {"subject": "Weekly team update", "body": "Reminder about the meeting tomorrow at 10am.", "thread_id": "thread123"}
154+
result = defense.defend_tool_result(data, "gmail_get_message")
155+
# Without use_default_tool_rules, gmail tool rule should NOT seed risk_level to 'high'
156+
assert result.risk_level not in ("high", "critical")
157+
158+
def test_does_not_apply_tool_rules_when_explicitly_false(self):
159+
defense = create_prompt_defense(use_default_tool_rules=False)
160+
data = {"subject": "Weekly team update", "body": "Reminder about the meeting tomorrow at 10am.", "thread_id": "thread123"}
161+
result = defense.defend_tool_result(data, "gmail_get_message")
162+
assert result.risk_level not in ("high", "critical")
163+
164+
def test_applies_tool_rules_when_true(self):
165+
defense = create_prompt_defense(use_default_tool_rules=True, block_high_risk=True)
166+
data = {"subject": "Weekly team update", "body": "Reminder about the meeting tomorrow at 10am.", "thread_id": "thread123"}
167+
result = defense.defend_tool_result(data, "gmail_get_message")
168+
# With use_default_tool_rules, gmail tool rule seeds risk_level: 'high' as base risk,
169+
# but safe content with no detections should still be allowed through.
170+
assert result.risk_level == "high"
171+
assert result.allowed is True
172+
173+
def test_always_applies_custom_tool_rules_from_config(self):
174+
from stackone_defender.types import ToolSanitizationRule
175+
defense = create_prompt_defense(
176+
use_default_tool_rules=False,
177+
config={"tool_rules": [ToolSanitizationRule(tool_pattern="custom_*", sanitization_level="high")]},
178+
block_high_risk=True,
179+
)
180+
data = {"name": "Safe content"}
181+
result = defense.defend_tool_result(data, "custom_tool")
182+
# Custom rules set base risk_level: 'high', but safe content with no detections
183+
# should still be allowed through — base risk alone does not block.
184+
assert result.risk_level == "high"
185+
assert result.allowed is True
186+
187+
150188
class TestRealWorldScenarios:
151189
def setup_method(self):
152190
self.defense = create_prompt_defense()

0 commit comments

Comments (0)