diff --git a/.claude/agents/actor.md b/.claude/agents/actor.md index a95b39a7..c7c2c88d 100644 --- a/.claude/agents/actor.md +++ b/.claude/agents/actor.md @@ -844,252 +844,3 @@ Default: ``` - - - - -## Example 1: New Feature (Backend API) - -**Subtask**: "Create user registration endpoint" - -**Approach**: POST /api/register with email/password validation, bcrypt hashing, JWT response. Using standard password hashing and validation patterns. - -**Code Changes**: - -```python -# ===== File: api/auth.py ===== -from flask import request, jsonify -from werkzeug.security import generate_password_hash -from sqlalchemy.exc import IntegrityError -import jwt -import re - -EMAIL_REGEX = re.compile(r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$') - -@app.route('/api/register', methods=['POST']) -def register(): - """Register new user with email and password.""" - try: - data = request.get_json() - if not data: - return jsonify({'error': 'Request body required'}), 400 - - email = data.get('email', '').strip().lower() - password = data.get('password', '') - - # Validation (fail fast) - if not email or not password: - return jsonify({'error': 'Email and password required'}), 400 - if not EMAIL_REGEX.match(email): - return jsonify({'error': 'Invalid email format'}), 400 - if len(password) < 8: - return jsonify({'error': 'Password must be at least 8 characters'}), 400 - - # Create user with hashed password - hashed = generate_password_hash(password, method='pbkdf2:sha256') - user = User(email=email, password_hash=hashed) - db.session.add(user) - db.session.commit() - - # Generate JWT for immediate authentication - token = jwt.encode( - {'user_id': user.id, 'email': user.email}, - app.config['SECRET_KEY'], - algorithm='HS256' - ) - return jsonify({'token': token, 'user_id': user.id}), 201 - - except IntegrityError: - db.session.rollback() - return jsonify({'error': 'Email already registered'}), 409 - except Exception as e: - db.session.rollback() - app.logger.exception("Registration error") - return jsonify({'error': 'Internal server error'}), 500 -``` - -**Trade-offs**: -- **Decision**: pbkdf2:sha256 for hashing -- **Alternatives**: bcrypt (slower, more secure), argon2 (newest) -- **Rationale**: Good security/performance balance for current scale and threat model -- **Trade-off**: Slightly less secure than bcrypt, but adequate given expected user base and upgrade path documented - -**Testing Considerations**: -1. test_register_valid_input_returns_201 -2. test_register_duplicate_email_returns_409 -3. test_register_invalid_email_returns_400 -4. test_register_short_password_returns_400 -5. test_register_missing_fields_returns_400 -6. test_register_sql_injection_safe -7. test_register_token_decodes_correctly - - ---- - -## Example 2: Modification to Existing Code - -**Subtask**: "Add rate limiting to existing API endpoint" - -**Approach**: Token bucket rate limiting using existing Redis connection. 100 req/min per IP. Using standard Redis rate limiting patterns. - -**Code Changes**: - -```python -# ===== File: api/middleware.py ===== -# ... (existing imports unchanged) ... -from functools import wraps -import time - -# ADDED: Rate limiter class (after line 45) -class RateLimiter: - """Token bucket rate limiter using Redis.""" - - def __init__(self, redis_client, limit=100, window=60): - self.redis = redis_client - self.limit = limit - self.window = window - - def is_allowed(self, key: str) -> tuple[bool, int]: - """Check if request allowed. Returns (allowed, remaining).""" - pipe = self.redis.pipeline() - now = time.time() - window_start = now - self.window - - pipe.zremrangebyscore(key, 0, window_start) - pipe.zadd(key, {str(now): now}) - pipe.zcard(key) - pipe.expire(key, self.window) - - _, _, count, _ = pipe.execute() - remaining = max(0, self.limit - count) - return count <= self.limit, remaining - - -def rate_limit(limit=100, window=60): - """Decorator for rate-limited endpoints.""" - limiter = RateLimiter(redis_client, limit, window) - - def decorator(f): - @wraps(f) - def wrapped(*args, **kwargs): - key = f"ratelimit:{request.remote_addr}:{f.__name__}" - allowed, remaining = limiter.is_allowed(key) - - if not allowed: - return jsonify({ - 'error': 'Rate limit exceeded', - 'retry_after': window - }), 429 - - response = f(*args, **kwargs) - # Add rate limit headers - if hasattr(response, 'headers'): - response.headers['X-RateLimit-Remaining'] = str(remaining) - return response - return wrapped - return decorator - -# ===== File: api/endpoints.py ===== -# MODIFIED: Added rate limiting decorator (line 45) - -@app.route('/api/data', methods=['GET']) -@rate_limit(limit=100, window=60) # ← ADDED -def get_data(): - return jsonify(fetch_data()) -``` - -**Trade-offs**: -- **Decision**: Redis-based (not in-memory) -- **Alternatives**: In-memory dict, database counter -- **Rationale**: Shared across instances, survives restarts -- **Trade-off**: Redis dependency, but already in stack - -**Testing Considerations**: -1. test_rate_limit_under_limit_passes -2. test_rate_limit_exceeded_returns_429 -3. test_rate_limit_window_resets -4. test_rate_limit_per_ip_isolation -5. test_rate_limit_headers_present - - ---- - -## Example 3: Blocked Task - -**Subtask**: "Add caching to user service" - -**Output**: -```yaml -status: BLOCKED -reason: "Implementing caching requires modifying core/database.py outside {{allowed_scope}} (api/*)" - -attempted_approaches: - - "Decorator-based caching in api/ layer" - problem: "Cache invalidation requires hooks in database write methods" - - "Read-through caching only" - problem: "Stale data unacceptable per requirements" - -proposed_solutions: - 1: "Expand {{allowed_scope}} to include core/database.py" - pros: "Clean implementation, proper invalidation" - cons: "Broader change scope" - 2: "Reduce subtask to read-only caching (accepts staleness)" - pros: "Stays in scope" - cons: "Tech debt, data consistency issues" - -recommendation: "Option 1 - clean solution worth scope expansion" -``` - ---- - -## Example 4: MCP Tool Failure Handling - -**Subtask**: "Implement WebSocket reconnection logic" - - -**Code Changes**: -```typescript -// ===== File: lib/websocket.ts ===== - -export class ReconnectingWebSocket { - private ws: WebSocket | null = null; - private reconnectAttempts = 0; - private maxAttempts = 5; - private baseDelay = 1000; - - connect(url: string): void { - this.ws = new WebSocket(url); - - this.ws.onclose = () => { - if (this.reconnectAttempts < this.maxAttempts) { - const delay = this.baseDelay * Math.pow(2, this.reconnectAttempts); - this.reconnectAttempts++; - console.log(`Reconnecting in ${delay}ms (attempt ${this.reconnectAttempts})`); - setTimeout(() => this.connect(url), delay); - } else { - console.error('Max reconnection attempts reached'); - this.onMaxRetriesExceeded?.(); - } - }; - - this.ws.onopen = () => { - this.reconnectAttempts = 0; // Reset on successful connection - }; - } - - onMaxRetriesExceeded?: () => void; -} -``` - -**Trade-offs**: -- **Decision**: Standard exponential backoff pattern -- **Mitigation**: Added comprehensive tests, runtime version check -- **Risk**: May use outdated API - flagged for manual review - -**Testing Considerations**: -1. test_reconnect_success_resets_counter -2. test_reconnect_exponential_backoff_timing -3. test_reconnect_max_attempts_triggers_callback -4. test_reconnect_handles_immediate_disconnect - - diff --git a/.claude/agents/predictor.md b/.claude/agents/predictor.md index 1d9f7789..1cde7c0e 100644 --- a/.claude/agents/predictor.md +++ b/.claude/agents/predictor.md @@ -219,7 +219,6 @@ Before any analysis, classify the change to select appropriate depth: 2. Classify risk (usually "low") 3. Output JSON with confidence 0.9+ - ### Tier 2: STANDARD Analysis (1-2 minutes) **When to use**: - Internal function signature changes @@ -978,422 +977,6 @@ Classify dependencies to prioritize updates and avoid missing any category. - - -## Example 1: API Function Signature Change (Breaking) - -### Input (Actor Proposal) -```python -# Proposal: Add required 'region' parameter to get_weather() function - -# Current (in weather_service.py) -def get_weather(city: str) -> dict: - """Fetch weather data for a city.""" - return api_call(f"weather?city={city}") - -# Proposed change -def get_weather(city: str, region: str) -> dict: - """Fetch weather data for a city in a specific region.""" - return api_call(f"weather?city={city}®ion={region}") -``` - -### Analysis Process - -**Step 1: Dependency analysis** (Grep) -- Query: `grep -r "get_weather" --include="*.py"` -- Result: - ``` - src/services/weather_reporter.py:15: get_weather(user.city) - src/api/handlers.py:42: get_weather(request.params['city']) - tests/test_weather.py:8: get_weather("Seattle") - tests/integration/test_api.py:23: get_weather(city_name) - scripts/daily_report.py:56: get_weather(config.default_city) - ``` - -**Step 2: Manual verification** (Grep) -- Grep for `"get_weather"` in configs, docs: - ```bash - config/api_endpoints.yaml:12: - name: get_weather - docs/api.md:45: ## get_weather(city) - README.md:78: weather = get_weather("Boston") - ``` - -**Step 3: Breaking change classification** -- Function signature change: **BREAKING** (added required parameter) -- 5 direct call sites + 3 documentation references -- Risk: HIGH (5-10 usage sites, breaking change) - -**Step 4: Confidence assessment** -- Automated analysis: completed (all call sites found) -- Manual verification: completed (found doc references) -- Test coverage: verified (2 tests exist) -- Confidence: 0.85 (high -- complete picture) - -### Output (JSON) - -```json -{ - "analysis_metadata": { - "tier_selected": "2", - "tier_rationale": "Internal function change with 5-10 affected files; standard analysis appropriate", - "tools_used": ["grep"], - "analysis_duration_seconds": 75 - }, - "predicted_state": { - "modified_files": ["src/services/weather_service.py"], - "affected_components": [ - "src/services/weather_reporter.py", - "src/api/handlers.py", - "tests/test_weather.py", - "tests/integration/test_api.py", - "scripts/daily_report.py", - "config/api_endpoints.yaml", - "docs/api.md", - "README.md" - ], - "breaking_changes": [ - "Function signature change: get_weather() now requires 'region' parameter. All 5 call sites will break immediately without update.", - "API contract change: External clients may call this function—need versioning or deprecation period." - ], - "required_updates": [ - { - "type": "dependent_code", - "location": "src/services/weather_reporter.py:15", - "reason": "Call site missing required 'region' parameter. Must add region determination logic.", - "priority": "must" - }, - { - "type": "dependent_code", - "location": "src/api/handlers.py:42", - "reason": "Call site missing required 'region' parameter. Must extract from request or add default.", - "priority": "must" - }, - { - "type": "dependent_code", - "location": "scripts/daily_report.py:56", - "reason": "Script call missing required 'region' parameter. Add to config file.", - "priority": "must" - }, - { - "type": "test", - "location": "tests/test_weather.py:8", - "reason": "Test call missing required 'region' parameter. Update test to include region.", - "priority": "must" - }, - { - "type": "test", - "location": "tests/integration/test_api.py:23", - "reason": "Integration test missing required 'region' parameter. Add to test setup.", - "priority": "must" - }, - { - "type": "documentation", - "location": "docs/api.md:45", - "reason": "API documentation shows old signature. Update to include 'region' parameter.", - "priority": "should" - }, - { - "type": "documentation", - "location": "README.md:78", - "reason": "Example code uses old signature. Update example to include region.", - "priority": "should" - }, - { - "type": "configuration", - "location": "config/api_endpoints.yaml:12", - "reason": "Config file may reference function parameters. Verify and update if needed.", - "priority": "could" - } - ], - "edge_cases_detected": [] - }, - "risk_assessment": "high", - "confidence": { - "score": 0.85, - "tier_base": 0.50, - "adjustments": [ - {"category": "A", "factor": "grep found comprehensive usage data", "adjustment": 0.20}, - {"category": "B", "factor": "grep results clear and complete", "adjustment": 0.15}, - {"category": "C", "factor": "Static code (no flags)", "adjustment": 0.00}, - {"category": "D", "factor": "Tests exist for affected files", "adjustment": 0.00} - ], - "flags": [] - }, - "recommendation": "SUGGEST: Add 'region' parameter with default value first (e.g., region='US'), deploy, then make required in subsequent release. This allows graceful migration for external clients." -} -``` - -## Example 2: Internal Refactoring (Non-Breaking) - -### Input (Actor Proposal) -```python -# Proposal: Refactor email validation into separate class - -# Current (in validators.py) -def validate_email(email: str) -> bool: - """Validate email format.""" - import re - pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$' - return bool(re.match(pattern, email)) - -# Proposed change (validators.py) -class EmailValidator: - """Email validation with configurable rules.""" - - def __init__(self, allow_subdomains: bool = True): - self.allow_subdomains = allow_subdomains - self._pattern = self._build_pattern() - - def _build_pattern(self) -> str: - if self.allow_subdomains: - return r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$' - return r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9]+\.[a-zA-Z]{2,}$' - - def validate(self, email: str) -> bool: - import re - return bool(re.match(self._pattern, email)) - -# Wrapper for backward compatibility -def validate_email(email: str) -> bool: - """Validate email format (legacy function).""" - validator = EmailValidator() - return validator.validate(email) -``` - -### Analysis Process - -**Step 1: Dependency analysis** (Grep) -- Query: `grep -r "validate_email" --include="*.py"` -- Result: - ``` - src/auth/registration.py:12: if not validate_email(email): - src/api/validators.py:8: validate_email(user_input) - tests/test_validators.py:15: assert validate_email("test@example.com") - ``` - -**Step 2: Breaking change classification** -- Function signature: **SAME** (validate_email still exists with same signature) -- Behavior: **SAME** (returns same results for same inputs) -- Internal implementation: **CHANGED** (but consumers don't care) -- Risk: LOW (internal refactoring, backward compatible) - -**Step 3: Manual verification** -- Grep for `"validate_email"` in docs: - ``` - docs/validation.md:23: Use validate_email() to check email format - ``` -- No breaking changes in docs (function still exists) - -**Step 4: Test coverage check** -- Read tests/test_validators.py: - ```python - def test_validate_email(): - assert validate_email("test@example.com") == True - assert validate_email("invalid") == False - ``` -- Tests still pass (function behavior unchanged) - -### Output (JSON) - -```json -{ - "analysis_metadata": { - "tier_selected": "1", - "tier_rationale": "Internal refactoring with backward-compatible wrapper; minimal analysis sufficient", - "tools_used": ["grep"], - "analysis_duration_seconds": 25 - }, - "predicted_state": { - "modified_files": ["src/validators.py"], - "affected_components": [ - "src/auth/registration.py", - "src/api/validators.py", - "tests/test_validators.py" - ], - "breaking_changes": [], - "required_updates": [ - { - "type": "test", - "location": "tests/test_validators.py", - "reason": "OPTIONAL: Add tests for new EmailValidator class to validate configurable behavior. Legacy validate_email() tests still pass.", - "priority": "could" - }, - { - "type": "documentation", - "location": "docs/validation.md:23", - "reason": "OPTIONAL: Document new EmailValidator class for developers who want configurable validation. Legacy function docs still accurate.", - "priority": "could" - } - ], - "edge_cases_detected": [] - }, - "risk_assessment": "low", - "confidence": { - "score": 0.90, - "tier_base": 0.85, - "adjustments": [ - {"category": "B", "factor": "grep confirms all usages", "adjustment": 0.05}, - {"category": "C", "factor": "Static code (no dynamic patterns)", "adjustment": 0.00}, - {"category": "D", "factor": "Existing tests pass unchanged", "adjustment": 0.00} - ], - "flags": [] - }, - "recommendation": "Safe to proceed. Backward compatibility maintained via wrapper function. Consider adding tests for new class functionality." -} -``` - -## Example 3: Module Rename (High Impact) - -### Input (Actor Proposal) -``` -Proposal: Rename module src/utils/string_helpers.py → src/utils/text_utils.py -Reason: Better naming consistency with existing text_processing.py module -``` - -### Analysis Process - -**Step 1: Dependency analysis** (Grep) -- Query: `grep -r "string_helpers" --include="*.py"` -- Result: - ``` - src/api/formatting.py:3: from utils.string_helpers import sanitize_input - src/services/email_sender.py:5: from utils import string_helpers - src/models/user.py:2: from utils.string_helpers import normalize_name - tests/test_string_helpers.py:1: from utils.string_helpers import * - tests/integration/test_api.py:8: import utils.string_helpers as sh - ``` - -**Step 2: Manual verification** (Grep for string "string_helpers") -- Found in: - ``` - .github/workflows/test.yml:15: - pytest tests/test_string_helpers.py - docs/utilities.md:12: ## string_helpers module - scripts/lint.sh:8: pylint src/utils/string_helpers.py - setup.py:25: "utils.string_helpers", - ``` - -**Step 3: Breaking change classification** -- Module path change: **BREAKING** (all imports break immediately) -- 5 direct imports + 4 references in config/scripts -- Risk: HIGH (module rename breaks all imports) - -**Step 4: Confidence assessment** -- Automated analysis: ✓ (imports found) -- Manual verification: ✓ (found configs, CI, setup.py) -- Potential misses: dynamic imports, string references in unknown config files -- Confidence: 0.75 (medium-high—might miss edge cases) - -### Output (JSON) - -```json -{ - "analysis_metadata": { - "tier_selected": "3", - "tier_rationale": "Module rename affects >10 files; Phase 2 grep found many importers; deep analysis required", - "tools_used": ["grep"], - "analysis_duration_seconds": 180 - }, - "predicted_state": { - "modified_files": [ - "src/utils/string_helpers.py → src/utils/text_utils.py" - ], - "affected_components": [ - "src/api/formatting.py", - "src/services/email_sender.py", - "src/models/user.py", - "tests/test_string_helpers.py", - "tests/integration/test_api.py", - ".github/workflows/test.yml", - "docs/utilities.md", - "scripts/lint.sh", - "setup.py" - ], - "breaking_changes": [ - "Module path change: src/utils/string_helpers → src/utils/text_utils. All imports using old path will break immediately.", - "File rename breaks pytest discovery and explicit file references in CI/CD pipelines." - ], - "required_updates": [ - { - "type": "dependent_code", - "location": "src/api/formatting.py:3", - "reason": "Update import: from utils.string_helpers → from utils.text_utils", - "priority": "must" - }, - { - "type": "dependent_code", - "location": "src/services/email_sender.py:5", - "reason": "Update import: from utils import string_helpers → import utils.text_utils", - "priority": "must" - }, - { - "type": "dependent_code", - "location": "src/models/user.py:2", - "reason": "Update import: from utils.string_helpers → from utils.text_utils", - "priority": "must" - }, - { - "type": "test", - "location": "tests/test_string_helpers.py", - "reason": "Rename test file to tests/test_text_utils.py + update import statement", - "priority": "must" - }, - { - "type": "test", - "location": "tests/integration/test_api.py:8", - "reason": "Update import alias: import utils.string_helpers as sh → import utils.text_utils as sh", - "priority": "must" - }, - { - "type": "configuration", - "location": ".github/workflows/test.yml:15", - "reason": "Update test file path: test_string_helpers.py → test_text_utils.py", - "priority": "must" - }, - { - "type": "configuration", - "location": "scripts/lint.sh:8", - "reason": "Update file path: string_helpers.py → text_utils.py", - "priority": "must" - }, - { - "type": "configuration", - "location": "setup.py:25", - "reason": "Update package list: utils.string_helpers → utils.text_utils", - "priority": "must" - }, - { - "type": "documentation", - "location": "docs/utilities.md:12", - "reason": "Update module name in documentation: string_helpers → text_utils", - "priority": "should" - } - ], - "edge_cases_detected": [ - { - "type": "dynamic_code", - "description": "Potential dynamic imports not detectable via static analysis (importlib, __import__)", - "confidence_impact": -0.10, - "mitigation": "Search for 'importlib.*string_helpers' or '__import__.*string_helpers' patterns" - } - ] - }, - "risk_assessment": "high", - "confidence": { - "score": 0.75, - "tier_base": 0.50, - "adjustments": [ - {"category": "A", "factor": "grep found comprehensive import data", "adjustment": 0.20}, - {"category": "B", "factor": "grep results verified manually", "adjustment": 0.15}, - {"category": "C", "factor": "Potential dynamic imports (edge case)", "adjustment": -0.10}, - {"category": "D", "factor": "Config/CI files not fully verifiable", "adjustment": 0.00} - ], - "flags": [] - }, - "recommendation": "HIGH-RISK: Module rename requires coordinated updates across 9 files. Run full test suite after updates. Check for dynamic imports using Grep: 'importlib.*string_helpers' or '__import__.*string_helpers'. Consider deprecation path if external packages depend on this module." -} -``` - - diff --git a/.claude/agents/reflector.md b/.claude/agents/reflector.md index 5eab3d66..79206995 100644 --- a/.claude/agents/reflector.md +++ b/.claude/agents/reflector.md @@ -125,7 +125,6 @@ When multiple patterns detected, extract in order (max 3 per reflection): - ## Project Information @@ -652,145 +651,6 @@ Use {{language}}/{{framework}} syntax. Show specific library, configuration, exp -# COMPLETE EXAMPLES - - - -## Security Failure - SQL Injection - -**Input**: F-string query construction, Monitor flags injection vulnerability - -**Output**: -```json -{ - "reasoning": "F-string interpolation with user input creates SQL injection. Attacker can input ' OR '1'='1 to bypass auth or '; DROP TABLE to execute commands. Root: didn't understand difference between interpolation and parameterized queries, or assumed sanitization elsewhere. Violates defense-in-depth. Sequential-thinking reveals: developers learn SQL with concatenation (simpler) before parameterized queries (secure). Pattern: NEVER trust user input, ALWAYS use parameterized queries.", - - "error_identification": "get_user() line 2 uses f-string (f\"SELECT * FROM users WHERE username = '{username}'\") with user input. Allows SQL injection. Monitor flagged critical security vulnerability.", - - "root_cause_analysis": "Used string interpolation vs parameterized queries due to: 1) Not understanding SQL injection, 2) Assuming sanitization elsewhere, 3) Copying insecure pattern. Violated Trust Boundary - untrusted input crosses to trusted query without validation. #1 OWASP vulnerability.", - - "correct_approach": "Use parameterized queries:\n\n```python\n# ❌ INCORRECT - SQL injection\nquery = f\"SELECT * FROM users WHERE id = {user_id}\"\n\n# ✅ SECURE - parameterized\nquery = \"SELECT * FROM users WHERE id = ?\"\ncursor.execute(query, (user_id,))\n\n# ✅ SECURE - ORM\nuser = db.query(User).filter(User.id == user_id).first()\n```\n\nParameterized queries separate structure from data, preventing injection. Database handles escaping.", - - "key_insight": "When constructing SQL with user input, NEVER use string interpolation/concatenation. ALWAYS use parameterized queries (?, %s) or ORM. Prevents SQL injection by separating structure from data. Even 'sanitized' input can be exploited.", - - "suggested_new_bullets": [{ - "section": "SECURITY_PATTERNS", - "content": "SQL Injection Prevention: Always use parameterized queries (prepared statements) with user input. NEVER use f-strings, concatenation (+), or format(). Parameterized queries separate SQL structure from data. Even 'sanitized' input can be bypassed. ORMs (SQLAlchemy, Django) use parameterized queries automatically.", - "code_example": "```python\n# ❌ VULNERABLE\nquery = f\"SELECT * FROM users WHERE id = {user_id}\"\n\n# ✅ SECURE\nquery = \"SELECT * FROM users WHERE id = ?\"\ncursor.execute(query, (user_id,))\n```", - "related_to": ["sec-0008"] - }] -} -``` - - - - - -## Performance Success - Caching with High Hit Rate - -**Input**: Redis decorator implementation, Evaluator scores 10/10 performance - -**Output**: -```json -{ - "reasoning": "Achieved 9.0 overall (10/10 performance) via Redis caching decorator. Success factors: 1) Hierarchical keys (user:123:profile) enable pattern invalidation, 2) Decorator pattern = reusable, 3) TTL (1h) balances freshness/performance, 4) JSON serialization handles complex objects. Shows understanding: cache reads not writes, TTL prevents stale data, graceful failures. Follows Redis best practices. Reusable lesson: decorator pattern for cross-cutting concerns - separates caching from business logic (SRP).", - - "error_identification": "No errors. Decorator handles hits (return cached) and misses (fetch, populate). Hierarchical keys (user:{id}:profile) enable smart invalidation. TTL prevents indefinite stale data. 10/10 performance score.", - - "root_cause_analysis": "Success from caching fundamentals: 1) Cache read path not writes (writes invalidate), 2) TTL as safety net (prevents stale if invalidation fails), 3) Hierarchical keys enable partial clearing (user:123:*), 4) Decorator promotes reusability. Likely profiled first (identifying hot path) vs speculative caching. Data-driven approach = high hit rate.", - - "correct_approach": "Profile-first caching with decorator:\n\n```python\nimport redis\nimport json\nfrom functools import wraps\n\ndef cache_query(key_pattern, ttl=3600):\n def decorator(func):\n @wraps(func)\n def wrapper(*args, **kwargs):\n cache_key = key_pattern.format(*args, **kwargs)\n cached = redis_client.get(cache_key)\n if cached:\n return json.loads(cached)\n result = func(*args, **kwargs)\n redis_client.setex(cache_key, ttl, json.dumps(result))\n return result\n return wrapper\n return decorator\n\n@cache_query(\"user:{0}:profile\", ttl=3600)\ndef get_user_profile(user_id):\n return db.query(User).get(user_id)\n```\n\nReusable, testable, separates concerns.", - - "key_insight": "When implementing caching, profile first to identify hot paths (80/20 rule). Use decorator pattern for reusability and clean business logic. Design hierarchical cache keys (namespace:entity:id) for targeted invalidation. Include TTL as safety net against stale data.", - - "bullet_updates": [{"bullet_id": "perf-0023", "tag": "helpful", "reason": "Redis caching pattern correctly implemented, 10/10 performance"}], - - "suggested_new_bullets": [{ - "section": "PERFORMANCE_PATTERNS", - "content": "Decorator Pattern for Caching: Use Python decorators for caching as cross-cutting concern, keeping business logic clean. Profile first for hot paths. Hierarchical keys (namespace:entity:id:attribute) enable smart invalidation. Always include TTL to prevent indefinite stale data. Decorators = reusability without duplication.", - "code_example": "```python\ndef cache_query(key_pattern, ttl=3600):\n def decorator(func):\n @wraps(func)\n def wrapper(*args):\n key = key_pattern.format(*args)\n cached = redis_client.get(key)\n if cached: return json.loads(cached)\n result = func(*args)\n redis_client.setex(key, ttl, json.dumps(result))\n return result\n return wrapper\n return decorator\n```", - "related_to": ["perf-0023"] - }] -} -``` - - - - - -## Partial Success - Async Implementation with Race Condition - -**Input**: Async batch processor works in tests, intermittent failures in production. Evaluator: 6.5/10. Monitor: PASS. Predictor: "Medium risk - async complexity" - -**Output**: -```json -{ - "reasoning": "Partial success (6.5/10) reveals subtle async issue. Implementation works in isolated tests but fails under concurrent load. Sequential-thinking analysis: 1) Tests pass individually, 2) Production has concurrent requests, 3) Shared mutable state (self.results dict) accessed without locks, 4) Race condition: multiple coroutines modify dict simultaneously, 5) Root principle: async != thread-safe; shared state needs synchronization. Success factors: correct async/await usage, proper error handling. Failure factor: assumed async execution was sequential.", - - "error_identification": "BatchProcessor.process_items() lines 15-22: self.results[item.id] = result modifies shared dict from multiple coroutines. Works in tests (sequential) but races in production (concurrent). Monitor PASS (no security), Evaluator 6.5/10 (reliability issues).", - - "root_cause_analysis": "5 Whys: 1) Why intermittent? Race condition on shared state. 2) Why race? Multiple coroutines modify self.results simultaneously. 3) Why no lock? Assumed asyncio single-threaded means no concurrency. 4) Why that assumption? Conflated 'single thread' with 'no concurrency' - asyncio IS concurrent via cooperative scheduling. 5) Root principle: 'Shared mutable state requires synchronization regardless of concurrency model.'", - - "correct_approach": "Use asyncio-native synchronization:\n\n```python\nimport asyncio\n\nclass BatchProcessor:\n def __init__(self):\n self.results = {}\n self._lock = asyncio.Lock() # asyncio Lock, not threading\n \n async def process_items(self, items):\n # ❌ INCORRECT - race condition\n # for item in items:\n # result = await self.process_one(item)\n # self.results[item.id] = result # Unsafe!\n \n # ✅ CORRECT - synchronized access\n async def safe_process(item):\n result = await self.process_one(item)\n async with self._lock:\n self.results[item.id] = result\n return result\n \n return await asyncio.gather(*[safe_process(i) for i in items])\n```\n\nPrefer returning values over mutating shared state.", - - "key_insight": "When using asyncio with shared mutable state, ALWAYS use asyncio.Lock for synchronization. Asyncio is single-threaded but concurrent - race conditions occur at await points. Better pattern: design to return values rather than mutate shared state.", - - "contradiction_resolved": "BatchProcessor must aggregate results from concurrent coroutines AND NOT corrupt shared state via interleaved writes, where naive trade-off fails because dropping concurrency loses throughput while shared-state mutation without synchronization loses correctness.", - - "triz_principle": [24, 13], - - "bullet_updates": [ - {"bullet_id": "async-0023", "tag": "helpful", "reason": "Pattern correctly identified async concurrency risk, referenced for context"} - ], - - "suggested_new_bullets": [ - { - "section": "IMPLEMENTATION_PATTERNS", - "content": "Asyncio Shared State: asyncio is single-threaded but concurrent via cooperative scheduling. Race conditions occur when multiple coroutines modify shared state between await points. Use asyncio.Lock (not threading.Lock) for synchronization, or better, design functions to return values instead of mutating shared state. Common trap: assuming 'single thread' means 'no concurrency issues.'", - "code_example": "```python\n# ❌ RACE CONDITION\nself.results[id] = await process(item)\n\n# ✅ SYNCHRONIZED\nasync with self._lock:\n self.results[id] = await process(item)\n\n# ✅ BEST - No shared state\nreturn await asyncio.gather(*[process(i) for i in items])\n```", - "related_to": ["async-0023"] - } - ] -} -``` - -**Why This Example Matters**: Demonstrates multi-signal reconciliation (Monitor PASS + Evaluator partial), complex root cause requiring sequential-thinking, updating existing bullet while creating new one, and success+failure pattern extraction from single case. - - - - - -## Success - No New Bullet Needed (Patterns Validated) - -**Input**: Standard REST endpoint implementation, all validations pass, Evaluator: 9.0/10 - -**Output**: -```json -{ - "reasoning": "Successful REST implementation following established patterns. Actor correctly applied standard patterns for input validation, error responses, async handling, and authentication - no novel learning. Success validates existing pattern completeness for standard REST patterns.", - - "error_identification": "No errors. Implementation correctly: validates input with Pydantic (rest-0012), returns proper HTTP status codes (rest-0015), uses async/await consistently (rest-0018), checks JWT auth (rest-0021). All existing patterns applied correctly.", - - "root_cause_analysis": "Success root cause: Actor followed established REST patterns. Standard patterns provided comprehensive guidance. No novel decisions required - standard CRUD operation. This validates pattern coverage, not new learning opportunity.", - - "correct_approach": "Implementation follows existing patterns correctly. No correction needed.\n\n```python\n# Actor's implementation (correct)\n@router.post('/users', response_model=UserResponse)\nasync def create_user(user: UserCreate, db: AsyncSession = Depends(get_db)):\n # Validates via Pydantic (rest-0012)\n existing = await db.execute(select(User).where(User.email == user.email))\n if existing.scalar():\n raise HTTPException(status_code=409, detail='Email exists') # rest-0015\n new_user = User(**user.dict())\n db.add(new_user)\n await db.commit() # rest-0018\n return new_user\n```", - - "key_insight": "When existing patterns comprehensively cover a use case, successful application validates coverage rather than generating new patterns. Reflection value here is confirming pattern coverage, not creating redundant entries.", - - "bullet_updates": [ - {"bullet_id": "rest-0012", "tag": "helpful", "reason": "Pydantic validation pattern correctly applied"}, - {"bullet_id": "rest-0015", "tag": "helpful", "reason": "HTTP status code pattern correctly applied"}, - {"bullet_id": "rest-0018", "tag": "helpful", "reason": "Async pattern correctly applied"} - ], - - "suggested_new_bullets": [] -} -``` - -**Why This Example Matters**: Shows correct behavior when NO new bullet is needed - validates deduplication logic and demonstrates that empty suggested_new_bullets is valid output when patterns already exist. - - - # CONSTRAINTS diff --git a/.claude/agents/synthesizer.md b/.claude/agents/synthesizer.md index 62772d28..8a57fcbe 100644 --- a/.claude/agents/synthesizer.md +++ b/.claude/agents/synthesizer.md @@ -134,7 +134,6 @@ class SpecificationContract: performance_constraints: PerformanceConstraints | None = None security_requirements: list[str] | None = None - @dataclass class TypeConstraints: """Structured type constraints.""" @@ -142,7 +141,6 @@ class TypeConstraints: output_type: str # "ProcessResult" generic_params: list[str] | None = None # ["T", "U"] - @dataclass class SideEffectsPolicy: """Side effects policy with explicit allowed/forbidden.""" @@ -151,7 +149,6 @@ class SideEffectsPolicy: filesystem: Literal["allowed", "forbidden"] = "forbidden" database: Literal["allowed", "forbidden"] = "forbidden" - @dataclass class PerformanceConstraints: """Performance constraints.""" @@ -223,7 +220,6 @@ class MonitorAnalysis: # For synthesis recommended_as_base: bool # True if good as spine - @dataclass class CompatibilityFeatures: """Features used by orchestrator for deterministic compatibility scoring.""" @@ -246,7 +242,6 @@ class RetryContext: failed_decisions: list[str] # Decision IDs likely causing issues strategy_adjustments: list[str] # What to change in next attempt - @dataclass class ToolError: """Error from a validation tool.""" @@ -279,7 +274,6 @@ def is_variant_viable(m: MonitorAnalysis, specification_contract) -> bool: return getattr(m, "spec_contract_compliant", False) - viable_variants = [ (v, m) for v, m in zip(variants, monitor_results) if is_variant_viable(m, specification_contract) @@ -485,7 +479,6 @@ def resolve_conflict( winner = by_variant_score[0] return winner, f"From higher-scored variant: {winner.source_variant}" - def violates_contract(decision: Decision, contract: SpecificationContract) -> bool: """Check if decision violates contract constraints.""" # Check prohibited patterns @@ -506,7 +499,6 @@ def violates_contract(decision: Decision, contract: SpecificationContract) -> bo return False - # Apply resolution to all conflicts conflict_resolutions = [] for conflict_pair in all_conflicts: @@ -549,7 +541,6 @@ else: strategy = "fresh_generation" base_variant = None - def select_best_base( variants: list, monitor_results: list[MonitorAnalysis], @@ -596,31 +587,6 @@ def select_best_base( 5. Validate against contract constraints ``` -**Example**: -```python -# Base variant (v3) code: -def process_data(items): - results = [] - for item in items: - results.append(transform(item)) - return results - -# After applying decisions: -# - dec-001 (v1): "Use list comprehension for performance" -# - dec-005 (v2): "Add input validation" -# - dec-007 (v1): "Add type hints" - -def process_data(items: List[Item]) -> List[Result]: - """Process items with validation and transformation.""" - # Decision dec-005: Add input validation - if not items: - raise ValueError("Items list cannot be empty") - - # Decision dec-001: Use list comprehension for performance - # Decision dec-007: Add type hints (applied above) - return [transform(item) for item in items] -``` - #### Strategy: fresh_generation (compatibility < 0.7) ``` @@ -641,65 +607,6 @@ def process_data(items: List[Item]) -> List[Result]: 6. Validate against contract constraints ``` -**Example**: -```python -# Fresh generation from contract + decisions -# Contract: function_signature="def process(data: List[User]) -> ProcessResult" -# error_model="Result" -# concurrency_model="sync" -# Accepted decisions: -# - dec-002 (v1): "Return Result type for explicit error handling" -# - dec-003 (v2): "Validate all User fields before processing" -# - dec-009 (v3): "Log processing metrics for observability" - -from dataclasses import dataclass -from typing import List -import logging - -@dataclass -class ProcessResult: - """Result of processing operation.""" - success: bool - processed_count: int - error: str | None = None - -def process(data: List[User]) -> ProcessResult: - """ - Process user data with validation and observability. - - Implements: - - Decision dec-002: Result type for explicit error handling - - Decision dec-003: Validate all User fields - - Decision dec-009: Log processing metrics - """ - logger = logging.getLogger(__name__) - - # Decision dec-003: Validate all User fields before processing - try: - for user in data: - if not user.email or not user.name: - return ProcessResult( - success=False, - processed_count=0, - error=f"Invalid user: {user.id}" - ) - except Exception as e: - logger.error(f"Validation failed: {e}") - return ProcessResult(success=False, processed_count=0, error=str(e)) - - # Process validated data - processed = 0 - for user in data: - # ... processing logic ... - processed += 1 - - # Decision dec-009: Log processing metrics for observability - logger.info(f"Processed {processed} users successfully") - - # Decision dec-002: Return Result type - return ProcessResult(success=True, processed_count=processed) -``` - **Critical Rules for Code Generation**: 1. **NEVER copy code blocks directly** - always rewrite for coherence @@ -764,7 +671,6 @@ def validate_coherence(code: str, decisions: list[Decision], contract: Specifica return len(issues) == 0, issues - is_valid, validation_issues = validate_coherence( generated_code, [d for d in all_decisions if d.status == "accepted"], @@ -800,7 +706,6 @@ class SynthesizerOutput: conflict_resolutions: list[ConflictResolution] confidence: float # 0.0-1.0 - @dataclass class ConflictResolution: """Record of how a conflict was resolved.""" diff --git a/.claude/agents/task-decomposer.md b/.claude/agents/task-decomposer.md index 2f356d65..127be45d 100644 --- a/.claude/agents/task-decomposer.md +++ b/.claude/agents/task-decomposer.md @@ -914,157 +914,6 @@ For detailed examples and anti-patterns, see: `.claude/references/decomposition- -## REFERENCE EXAMPLES - -### Example A: Simple CRUD Feature - -**Goal**: "Add ability to archive projects" - -**Why this decomposition works**: Single domain, clear boundaries, well-known pattern - -**Full JSON Output**: -```json -{ - "schema_version": "2.0", - "analysis": { - "assumptions": ["Project model exists with standard CRUD operations"], - "open_questions": [], - "scope_vs_quality_decision": "Full feature scope implemented with non-negotiable quality standards. No scope reductions needed for this standard CRUD extension.", - "architecture_graph_summary": "Project -[add_field]-> archived_at; ProjectService -[calls]-> Project.update(); api/routes/projects.py -[uses]-> ProjectService; GET /projects -[filters_by]-> archived_at" - }, - "blueprint": { - "id": "project-archive", - "summary": "Add soft-delete archiving to projects via archived_at timestamp field with API endpoints and filtered listings", - "quality_requirements": { - "min_security_score": 7, - "min_functionality_score": 7, - "error_handling_required": true, - "rationale": "Standard CRUD operations require robust error handling and data validation" - }, - "subtasks": [ - { - "id": "ST-001", - "title": "Add archived_at field to Project model", - "description": "Add nullable DateTime 'archived_at' to Project model in models/project.py. Generate migration. null = active, non-null = archived.", - "dependencies": [], - "risk_level": "low", - "risks": [], - "security_critical": false, - "complexity_score": 3, - "complexity_rationale": "Score 3: Base(1) + Novelty(+0) + Deps(+0) + Scope(+2) + Risk(+0) = 3", - "aag_contract": "ProjectModel -> add_field(archived_at: DateTime?) -> migration passes, existing queries unaffected", - "validation_criteria": [ - "VC1 [AC-1]: Project model has archived_at field (nullable DateTime)", - "VC2 [INV-1]: Migration runs without errors on existing data", - "VC3 [INV-1]: SELECT count(*) FROM projects WHERE archived_at IS NOT NULL returns 0" - ], - "test_strategy": { - "unit": "Test field accepts timestamps, test default is null", - "integration": "Test migration applies cleanly", - "e2e": "N/A", - "scenario_dimensions": { - "happy_path": "Test archived_at stores valid timestamp", - "error": "Test migration rollback on failure", - "edge_case": "Test field with existing null values in table", - "security": "N/A" - } - }, - "affected_files": [ - "models/project.py", - "migrations/versions/add_archived_at_to_projects.py" - ] - }, - { - "id": "ST-002", - "title": "Add archive_project() and unarchive_project() to ProjectService", - "description": "Add methods to services/project_service.py. archive_project(id) sets archived_at=now(), unarchive_project(id) sets archived_at=null.", - "dependencies": ["ST-001"], - "risk_level": "low", - "risks": [], - "security_critical": false, - "complexity_score": 3, - "complexity_rationale": "Score 3: Base(1) + Novelty(+0) + Deps(+1) + Scope(+1) + Risk(+0) = 3", - "aag_contract": "ProjectService -> archive_project(id) + unarchive_project(id) -> sets/clears archived_at, raises ProjectNotFoundError for invalid IDs", - "validation_criteria": [ - "VC1 [AC-2]: archive_project(valid_id) sets archived_at to current UTC timestamp", - "VC2 [AC-2]: unarchive_project(valid_id) sets archived_at to null", - "VC3 [AC-2]: Both raise ProjectNotFoundError for invalid IDs" - ], - "test_strategy": { - "unit": "Test archive sets timestamp, test unarchive clears it, test invalid ID handling", - "integration": "Test database persistence", - "e2e": "N/A" - }, - "affected_files": [ - "services/project_service.py" - ] - }, - { - "id": "ST-003", - "title": "Add POST /projects/{id}/archive and /unarchive endpoints", - "description": "Create endpoints in api/routes/projects.py. Require project owner permission. Return updated project JSON.", - "dependencies": ["ST-002"], - "risk_level": "low", - "risks": [], - "security_critical": false, - "complexity_score": 4, - "complexity_rationale": "Score 4: Base(1) + Novelty(+0) + Deps(+1) + Scope(+2) + Risk(+0) = 4", - "aag_contract": "ProjectRoutes -> POST /projects/{id}/archive|unarchive -> 200+JSON for owner, 403 for non-owner, 404 for invalid ID", - "validation_criteria": [ - "VC1 [AC-3]: POST /projects/{id}/archive returns 200 + archived project JSON", - "VC2 [AC-3]: POST /projects/{id}/unarchive returns 200 + active project JSON", - "VC3 [SEC-1]: Non-owner receives 403 Forbidden", - "VC4 [AC-3]: Invalid ID returns 404 Not Found" - ], - "contracts": [ - {"type": "postcondition", "assertion": "response.status == 200 AND project.archived_at IS SET WHEN valid_owner", "scope": "endpoint"}, - {"type": "postcondition", "assertion": "response.status == 403 WHEN NOT project.owner_id == request.user_id", "scope": "endpoint"}, - {"type": "postcondition", "assertion": "response.status == 404 WHEN project NOT EXISTS", "scope": "endpoint"} - ], - "implementation_hint": "Use existing @require_project_owner decorator", - "test_strategy": { - "unit": "Test request validation, test permission decorator", - "integration": "Test service integration, test response format", - "e2e": "Full flow: auth → archive → verify response → verify DB" - }, - "affected_files": [ - "api/routes/projects.py", - "api/schemas/project.py" - ] - }, - { - "id": "ST-004", - "title": "Filter archived projects from GET /projects by default", - "description": "Modify listing in api/routes/projects.py to exclude archived_at IS NOT NULL. Add ?include_archived=true param.", - "dependencies": ["ST-001"], - "risk_level": "low", - "risks": [], - "security_critical": false, - "complexity_score": 3, - "complexity_rationale": "Score 3: Base(1) + Novelty(+0) + Deps(+1) + Scope(+1) + Risk(+0) = 3", - "aag_contract": "ProjectRoutes -> GET /projects(?include_archived=bool) -> excludes archived by default, includes when param=true", - "validation_criteria": [ - "VC1 [AC-4]: GET /projects excludes archived projects by default", - "VC2 [AC-4]: GET /projects?include_archived=true returns all projects", - "VC3 [AC-4]: Response includes is_archived boolean field" - ], - "test_strategy": { - "unit": "Test filter logic, test query param parsing", - "integration": "Test with mix of archived/active projects", - "e2e": "N/A" - }, - "affected_files": [ - "api/routes/projects.py", - "services/project_service.py" - ] - } - ] - } -} -``` - ---- - ## Additional Examples For complex decomposition scenarios, see: `.claude/references/decomposition-examples.md` diff --git a/.claude/skills/map-debug/SKILL.md b/.claude/skills/map-debug/SKILL.md index f3d05cb3..f733ad8b 100644 --- a/.claude/skills/map-debug/SKILL.md +++ b/.claude/skills/map-debug/SKILL.md @@ -1,7 +1,7 @@ --- name: map-debug -description: | - Structured MAP debugging via task-decomposer, actor, and monitor agents. Use when reproducing a bug, isolating a regression, or diagnosing an error with specialized agents. Do NOT use for greenfield features; use map-plan or map-efficient. +description: |- + Structured MAP debugging via task-decomposer, actor, and monitor agents. Use when reproducing a bug, isolating a regression, or diagnosing an error with specialized agents — including failing or flaky tests (pytest AssertionError), crashes and segmentation faults, memory-corruption or memory errors in native/C extensions, intermittent or load-dependent failures (e.g. 500s under load), data-corruption bugs that only appear in production, scripts or hooks that silently exit or produce no output, and any "find the root cause" / "walk me through diagnosing" / "help me investigate" request. Trigger on phrasing like "failing test", "mysterious error", "segfault", "memory error", "intermittently fails", "root cause", "isolate the cause", "debug why", "diagnose", or "investigate the error". Prefer this over generic investigation when the user wants a systematic decompose-reproduce-fix-verify workflow. Do NOT use for greenfield features; use map-plan or map-efficient. effort: medium disable-model-invocation: true argument-hint: "[bug description]" diff --git a/docs/model-tier-trigger-experiment.md b/docs/model-tier-trigger-experiment.md new file mode 100644 index 00000000..92385923 --- /dev/null +++ b/docs/model-tier-trigger-experiment.md @@ -0,0 +1,93 @@ +# Model-Tier Trigger-Accuracy Experiment + +> Empirical test of "does the model tier matter for skill **trigger routing**?" +> Motivated by Murin 2026 (arXiv:2606.05970), whose central finding is that +> **model choice dominates prompt phrasing** for structured extraction, and that +> a larger model **redistributes** agreement rather than uniformly raising it. +> Here we test the analog in the MAP domain: skill auto-activation accuracy. + +## Method + +- Tool: `mapify skill-eval run --eval-set --model [--runs N]` + (`--model` and `--runs` were added for this experiment; default omits `--model` + → CLI session model, preserving prior behaviour). +- Metric: trigger-routing accuracy — positives must fire the right skill (first + `Skill` tool_use in the transcript), negatives must NOT fire the skill. +- Each cell = one seeded `claude -p` with heavy tools disallowed (the body cannot + do slow work; we only need the activation decision). Trigger read from the + transcript, with timeout recovery. +- **Noise caveat:** `claude -p` exposes no temperature flag, so decoding is not + guaranteed deterministic (unlike Murin's temp=0). Single-pass n=9 is noisy; + the firm-up below uses 3 passes on the primary skill. + +> Two harness bugs were fixed *before* collecting data, or the numbers would be +> meaningless: (1) executing skills hit the per-call timeout and were recorded as +> false non-triggers; (2) the description patcher corrupted block-scalar YAML so +> the skill never registered (0 triggers). See commits 9a180ee, 20f70d7. + +## Results — firm-up (authoritative) + +map-check at **3 passes/model** (n=27); map-explain & map-task at 1 pass (n=9): + +| skill | haiku | sonnet | opus | +|---|---|---|---| +| map-check (n=27) | 16/27 (59%) | **24/27 (89%)** | 21/27 (78%) | +| map-explain (n=9) | 3/9 (33%) | 3/9 (33%) | **4/9 (44%)** | +| map-task (n=9) | 3/9 (33%) | 6/9 (67%) | **7/9 (78%)** | +| **overall** | **22/45 (49%)** | **33/45 (73%)** | **32/45 (71%)** | +| mean latency/cell | **23 s** | 47 s | 53 s | + +### Pilot (n=9, single pass) — kept only to show why firming up mattered + +map-check single-pass read haiku 67% / sonnet 78% / opus 67%, which made Haiku +look as good as Opus. That was **noise**: at n=27 Haiku drops to 59% and the gap +to Sonnet (89%) is real. **Do not trust single-pass n=9 model comparisons.** + +## Findings + +1. **Model tier DOES matter for routing — but "bigger is better" does NOT hold.** + Haiku is consistently the weakest (49% overall; −24pp vs Sonnet). Sonnet (73%) + and Opus (71%) are ~tied overall, but they **redistribute**: Sonnet wins + map-check (89 vs 78), Opus wins map-explain (44 vs 33) and map-task (78 vs 67). + No monotonic improvement with size — exactly Murin's per-field pattern. +2. **Opus buys nothing over Sonnet for routing** (71% vs 73%, +6s latency/cell). + Pay for Opus only where hard-reasoning EXECUTION earns it, not for routing. +3. **The description is the ceiling.** map-explain caps at 33–44% across ALL + tiers — no model rescues a weak `description:`. The lever for trigger accuracy + is the description (the optimizer sweep), not the model. Consistent with the + project's earlier "contract/prose is the lever, model competence is largely + fixed" lesson. +4. **Negatives are robust across tiers** — map-check never falsely fired on its + negatives at any tier; bigger models additionally route negatives to the + correct *other* skill. + +## Implications for model tiering in MAP + +- **Skill routing / session model / `skill-eval` dispatcher → Sonnet.** Best + accuracy/latency balance; never the worst; Opus adds latency without a routing + gain; Haiku costs ~24pp. (This REVISES the pilot's "Haiku suffices.") +- **Haiku** stays fine for non-discriminative retrieval/summarisation + (research-agent), but is weak at instruction-following *discrimination* — avoid + it where correct routing/judgment matters. +- **Opus** — reserve for hard reasoning / specificity in EXECUTION + (task-decomposer, final-verifier, debate-arbiter). Trigger routing ≠ execution + quality; the latter is untested here and Murin's "larger model categorises more + specifically" plausibly applies to it. +- **Weak-description skills (e.g. map-explain) → run the description optimizer.** + Model can't fix them; the description is the bottleneck. + +## Current framework model assignments (for reference) + +opus: task-decomposer, final-verifier, debate-arbiter · +sonnet: actor, monitor, evaluator, predictor, synthesizer, reflector, documentation-reviewer · +haiku: research-agent · +skill-eval dispatcher/proposer: CLI session default (no pin). + +## Reproduce + +```bash +# 3 passes, one model: +mapify skill-eval run map-check \ + --eval-set tests/skills_eval/fixtures/map_check_optimize_eval_set.json \ + --model sonnet --runs 3 +``` diff --git a/docs/whole-skill-optimization-notes.md b/docs/whole-skill-optimization-notes.md index d99498c9..3c71e7eb 100644 --- a/docs/whole-skill-optimization-notes.md +++ b/docs/whole-skill-optimization-notes.md @@ -333,6 +333,241 @@ report a blocker"), bounded by `StepState.progress_feedback_subtasks`. Reuses th machinery — no new diff logic. Test `test_false_progress_routes_feedback_when_nothing_changed` (incl. once-guard pass-through). `make check` green (2260 passed, check-render byte-identical). +## MODEL LEVER on OUTCOME quality — discriminating fixture (2026-06-05, user: "optimize all 3 levers") + +Built the missing pieces to measure the EXECUTION-model lever on outcome quality: +- `spike_runner.py` gained `--agent-model AGENT=MODEL` (rewrites a seeded agent's `model:` + frontmatter — the actor writes the code, so its model is the code-quality lever) and + `--orchestrator-model`. +- New fixture `map_task_semver`: a genuinely non-trivial task (semver 2.0.0 `compare()` with + pre-release precedence + numeric-vs-lexical identifiers + build-metadata) — unlike the + scope/blocker traps it DISCRIMINATES code quality (stub → 12 failed; correct → 12 passed). + +Sweep — full `/map-task` (RESEARCH→ACTOR→MONITOR→test-gate), `--agent-model actor=`, ×2: + +| actor model | task_pass | QUALITY | mean dur | +|---|---|---|---| +| haiku | 2/2 ✓ | 1.0, 1.0 | 404 s | +| sonnet | 2/2 ✓ | 1.0, 1.0 | 633 s | +| opus | 2/2 ✓ | 1.0, (0.5*) | 410 s | + +\* the one <1.0 is NOT a code failure — task_pass=True, 12/12; the JUDGE returned 0 from an +`API Error: 529 Overloaded` (infra noise). Trust the deterministic gate. + +**KEY FINDING — the model lever on OUTCOME is ABSORBED BY THE GATE.** For a well-gated task +all three tiers produce correct code (6/6 task_pass) because the test-gate + MONITOR retry loop +drive even haiku to passing. Haiku is not slower → paying Opus for the actor buys nothing on +gated tasks. This confirms, with execution data, the project's recurring conclusion: + +**Ranked outcome levers: CONTRACT/MECHANICAL GATES (#3) ≫ MODEL (#2) ≫ PROSE (#1).** +- #3 (test gate, mutation-boundary validator, false-progress gate, MONITOR loop) governs outcome + — the real place to invest (tighter validation_criteria, stricter validators, test coverage). +- #2 model: matters a little for ROUTING (Sonnet sweet spot; see the description experiment), + irrelevant for gated EXECUTION. May matter on WEAKLY-gated tasks (untested). +- #1 prose (body / actor / monitor prompts): low leverage (ablations in PR #160 + here). + +Method caveat: on a gated task QUALITY saturates (all pass), so it does NOT capture the model +effect that may live in RETRY COUNT / first-pass quality. To measure that: log `retry_count` +from step_state.json, and/or use a WEAKLY-gated fixture (few tests) where code quality is not +rescued by the gate. Full write-up: the Obsidian note +`model-tier-vs-prompt-for-llm-skill-routing-map-replication` + `docs/model-tier-trigger-experiment.md`. + +## OUTCOME lever isolation — weak gate + vague contract (2026-06-05, cont.) + +To check whether a strong gate was HIDING a model effect, ran two more outcome +configurations (hidden 8-edge-case suite scored post-run; `--agent-model actor=` ×2): + +| configuration (fixture) | haiku | sonnet | opus | +|---|---|---|---| +| strong gate (map_task_semver) | task_pass ✓ | ✓ | ✓ | +| weak gate, FULL contract (map_task_semver_weakgate) | hidden 8/8 | 8/8 | 8/8 | +| weak gate, VAGUE contract (map_task_semver_vague) | hidden 8/8 | 8/8 | 8/8 | + +All 18 runs: correct semver, retries=0. Even a VAGUE contract ("compare two semver strings, +return -1/0/1", no precedence rules) + a weak gate + **haiku** → fully correct first try, +because semver 2.0.0 is within every tier's competence. + +**Definitive OUTCOME finding:** for a task WITHIN the models' competence, outcome quality is +INVARIANT to model tier, test-gate strength, AND contract completeness. The execution-model +lever only bites on tasks ABOVE the weaker model's competence (semver isn't one — threshold +not reached). Combined with the routing experiment + PR #160 ablations, the evidence-based +ranking of OUTCOME levers is final: + + CONTRACT/MECHANICAL GATES (#3) ≫ MODEL (#2, only beyond-competence) ≫ PROSE (#1). + +Practical: to lift skill outcome quality, invest in the CONTRACT and mechanical gates +(precise validation_criteria, strict validators, test coverage) — those guarantee correctness +at/above competence. Reserve a bigger actor model only for genuinely-novel/hard subtasks; +prose tuning is lowest ROI. Open thread: a beyond-haiku-competence fixture to locate the model +threshold. Full write-up: Obsidian note `model-tier-vs-prompt-for-llm-skill-routing-map-replication`. + +## MODEL THRESHOLD probe — capability-discriminating task (2026-06-06) + +User pushed back: "haiku can't really match opus." semver was within all tiers' competence, +so per web research (EvalPlus/HumanEval-Pro/arXiv 2511.04355, 2510.13908) built the canonical +edge-case discriminator — an arithmetic expression evaluator (right-assoc `**`, `**` tighter +than unary minus, overloaded unary, negative exponent, error contract). A naive impl passes a +thin gate but fails the hidden traps (validated 9p/3f). + +Sweeps (hidden edge-case suite, actor=haiku|sonnet|opus): + +| contract | haiku | sonnet | opus | +|---|---|---|---| +| FULL (rules spelled out) | 12/12 (run0) | — | — | +| VAGUE ("standard precedence" only) | 11/11, 11/11 | 10/11, 11/11 | 11/11, 11/11 | + +**Even on the edge-case-hard task with a VAGUE contract, haiku 4.5 matched/edged the bigger +models** — no bigger-is-better ordering (the lone miss was sonnet's, i.e. noise). haiku knows +right-associative `**` and implements it without being told. + +**Why (benchmark-grounded):** the tier gap appears on MULTI-STEP agentic / MULTI-FILE / +subtle-concurrency tasks (Terminal-Bench, multi-file SWE-bench), NOT on single self-contained +functions where all tiers are competitive. map-task decomposes work into well-scoped one-file +subtasks — the regime where haiku 4.5 is competent. So within map-task's granularity the model +is not the outcome lever (cost insight: actor can run on haiku). The gap would surface only on +a multi-file/multi-step subtask — which the framework's decomposition deliberately avoids. + +FINAL outcome-lever ranking stands: CONTRACT/GATES ≫ MODEL (only on beyond-competence, +multi-step/multi-file tasks) ≫ PROSE. Total this investigation: 6 outcome sweeps, +~30 full /map-task runs across haiku/sonnet/opus and 5 fixtures. + +## CORRECTED model-outcome result — two confounds removed (2026-06-06) + +llm-council review prompted transcript verification, which exposed TWO confounds that +invalidated the earlier "model doesn't matter for outcome" claim: + +1. **`--agent-model` was a no-op.** In headless `claude -p`, map-task runs INLINE on the + orchestrator (session) model and spawns NO sub-agents (0 Task dispatches in the transcript). + So every "actor=haiku/sonnet/opus" sweep actually ran on claude-opus-4-8. The real lever is + `--orchestrator-model` (= `claude -p --model`), verified in the transcript model field. +2. **Permission stall.** Without auto-accept, headless edits prompt→deny; haiku hit 4 + permission denials and gave up (0/11) despite correctly DESCRIBING the right algorithm, while + opus wrote freely and even dispatched sub-agents. Fix: `--permission-mode acceptEdits`. + +**CLEAN result** (model truly varied + permissions neutral), calc_vague discriminator, hidden /11, n=4: + +| tier | runs | mean | +|---|---|---| +| haiku | 11, 8, 10, 10 | 9.8/11 (most variable) | +| sonnet | 11, 10, 11, 10 | 10.5/11 | +| opus | 10, 11, 11, (1 workflow-error run) | 10.7/11 | + +**Modest but consistent gradient opus ≥ sonnet > haiku; haiku noticeably more variable on hard +edge cases.** On a single well-scoped subtask the gap is ~1 test (~8%) — real, not a blowout. +Separate AGENCY gap (opus dispatches sub-agents, works around blockers, persists longer). Per +benchmarks the gap widens on multi-step/multi-file tasks. n=4 is small (council: 8-10) — direction +solid, magnitude needs more runs. + +REVISED ranking (corrected): on well-scoped subtasks CONTRACT/GATES still dominate, but MODEL is +NOT negligible — bigger tiers are more reliable on hard edge cases and far more agentic at driving +the workflow. Earlier "model irrelevant for outcome" is RETRACTED — it was opus measured 3x. +METHODOLOGY LESSON: always verify the manipulation reached the transcript (model field, tool_use, +permission results) before trusting an agentic-harness result. + +## DECISIVE: actor-direct, controlled model (2026-06-06) — model DOES matter for code + +After two confounds (no-op `--agent-model`; permission-stall), the cleanest measurement +(user's idea): bypass Claude Code sub-agent dispatch entirely and invoke the agent prompt +DIRECTLY — `claude -p "" --append-system-prompt "" --model +--permission-mode acceptEdits` in a seeded repo, score with the hidden suite. One variable +(actor model), ~3x cheaper than full map-task. + +calc_vague, hidden /11, n=3: + +| actor model | runs | mean | +|---|---|---| +| haiku | 8, 10, 10 | 9.3 — never perfect, consistently drops edge cases | +| sonnet | 11, 11, 11 | 11.0 — perfect, consistent | +| opus | (1*), 11, 11 | 11.0 (*one broken run) | + +**Clean conclusion: the ACTOR model materially affects code quality** — haiku systematically +worse on edge cases; sonnet/opus reliably correct. Use sonnet+ for actor, not haiku. This is the +signal the two confounds had hidden; "haiku ≈ opus" is RETRACTED for the actor role. + +**Two findings about headless dispatch (probes):** +1. Simple skills (map-task, map-plan) do NOT auto-dispatch sub-agents in `claude -p` — they run + inline; the `subagent_type=` code blocks are treated as illustrative. +2. Even FORCED Task dispatch runs the sub-agent on the SESSION model (no haiku sidechain when + research-agent=haiku was dispatched). So per-agent `model:` is INERT in headless. + +**Architectural fix (user's idea generalised):** orchestrate agents by direct invocation — +per phase, `claude -p --append-system-prompt --model ` — so per-agent +models actually take effect in headless/automation. Then "which agent/model in a skill" is +deterministic: actor=sonnet+, task-decomposer/final-verifier=opus, research-agent=haiku, +monitor=sonnet. Prototype harness: `.map/actor_probe.py` (renders the agent Handlebars template, +runs it as the system prompt with a chosen model, scores via the hidden suite). + +## AGENT PROMPT POLISH via A/B (2026-06-06) — examples cuttable for actor, NOT for monitor + +User mandate: polish the agents (prompts/models), every change A/B-gated (keep only if B>=A). +Built per-agent A/B harnesses that render the agent template, run it as the system prompt via +`claude -p --append-system-prompt --model`, and score outcomes. + +- **actor** (`.map/actor_probe.py`, calc_vague hidden /11): cut the 247-line + `` block. A/B (n=4): A uncut haiku 9.3/sonnet 9.5; B cut haiku + 10.0/sonnet 11.0 -> B>=A. KEPT, committed (d78acd5), -23% prompt size. +- **monitor** (`.map/monitor_probe.py`, 7 cases: 3 clean + 4 buggy across correctness+security; + forced FINAL_VERDICT line for robust parsing; baseline 7/7). Cut the 14 `` + blocks (134 lines). A/B: bug-recall preserved 4/4, BUT clean-pass on IDENTICAL correct code + A=6/6 vs B=4/6 -> B not >= A. REVERTED. The good/bad examples calibrate the monitor's ACCEPT + threshold; removing them trends toward more false-positives. The monitor is a GATE, so + false-positive calibration earns the examples' place. Monitor length is largely justified. + +LESSON: "agent prompts are bloated, cut examples" is NOT universally safe. For a generative agent +(actor) examples are low-leverage/cuttable; for a gate agent (monitor) examples calibrate the +accept/reject boundary and are load-bearing. Each agent needs its OWN A/B gate before cutting. +Harness coverage limits what's cuttable: the verdict-accuracy gate covers correctness+security +review, NOT JSON-format validity / severity values / MCP behavior — so those monitor sections +cannot be A/B-cut without broader harness coverage. + +## AGENT POLISH — validated rule + results (2026-06-06, updated) + +Two clean A/B-validated cuts (generative agents, pure worked-examples): +- actor: -247L worked examples (B>=A: haiku 9.3->10.0, sonnet 9.5->11.0). Committed d78acd5. +- synthesizer: -95L Step-7 strategy worked-examples (A 33/33 == B 33/33). Committed a17c2d0. +Two A/B-REJECTED cuts (reverted): +- monitor: good/bad examples calibrate the accept threshold (clean-pass A 6/6 vs B 4/6). Gate agent. +- actor: error-handling patterns + decision-tree (B 9.0 < A 10.0, incl. a broken haiku run). Guidance scaffolds. + +RULE: pure WORKED-EXAMPLES (illustrative full solutions/strategies) are safely cuttable in +GENERATIVE agents (actor, synthesizer). GUIDANCE/PATTERNS and GATE-CALIBRATION examples are +load-bearing (monitor, actor-patterns) — they scaffold weaker models / calibrate accept/reject. + +COVERAGE CONSTRAINT (why this can't trivially extend to all agents): a cut is only A/B-safe if a +harness covers its effect. Clean code-output gates exist for actor + synthesizer (calc hidden suite) +— both done. Gate agents (monitor/evaluator/final-verifier) — examples calibrate, don't cut. The +remaining generative agents output FUZZY artifacts (task-decomposer: plans; predictor: risk +analysis; reflector: lessons; research-agent: summaries) with no clean deterministic gate, so +cutting them would violate the "every change passes A/B" rule without building fuzzy/low-confidence +gates first. Harnesses: .map/{actor,monitor,synth}_probe.py. + +## COMPREHENSIVE agent-prompt polish — final tally (2026-06-06) + +Swept ALL generative agents for the proven-cuttable pattern (pure worked-EXAMPLES / +REFERENCE blocks). Every cut A/B-gated via a per-agent harness (.map/*_probe.py); kept +only if B>=A. + +KEPT (A/B B>=A), 5 generative agents, 1049 lines of worked-example bloat removed: +| agent | cut | A/B gate | result | +|---|---|---|---| +| actor | -247L () | calc hidden /11, n=4 | haiku 9.3->10.0, sonnet 9.5->11.0 | +| synthesizer | -95L (Step-7 strategy examples) | calc synth, n=3 | 33/33 == 33/33 | +| task-decomposer | -151L (## REFERENCE EXAMPLES) | blueprint schema+coverage+acyclic, opus n=3 | 3/3 == 3/3 | +| predictor | -417L () | breaking+affected detection, sonnet n=3 | 3/3 == 3/3 | +| reflector | -139L (# COMPLETE EXAMPLES) | lesson-extraction keyword, sonnet n=3 | 3/3 == 3/3 | + +REJECTED (A/B B reverted: +| monitor | good/bad dimension examples | 7-case verdict | clean-pass A 6/6 vs B 4/6 (calibration) | +| actor | error-patterns + decision-tree (guidance) | calc | B 9.0 < A 10.0 (scaffolding) | + +NOT TOUCHED: gate agents (monitor/evaluator/debate-arbiter/final-verifier) — examples +calibrate accept/reject (monitor proved this); research-agent (281L, lean). + +FINAL RULE: pure worked-EXAMPLES / REFERENCE blocks are safely removable from GENERATIVE +agents (their detection/algorithm/schema/output-format content stays); GUIDANCE/PATTERNS +and GATE-CALIBRATION examples are load-bearing. Models (haiku/sonnet/opus) per-agent are +evidence-consistent and unchanged. Harnesses are reusable for future re-validation. + ## llm-council consultation log - 2026-06-05 (conv `066898a9-b37f-436f-96ca-7ae1cbe4c83a`, standard): asked about the no-gap result. diff --git a/src/mapify_cli/__init__.py b/src/mapify_cli/__init__.py index 5d857cce..ef0eff88 100644 --- a/src/mapify_cli/__init__.py +++ b/src/mapify_cli/__init__.py @@ -1385,6 +1385,17 @@ def skill_eval_run( max_concurrency: int = typer.Option( 1, "--max-concurrency", min=1, help="Bounded parallel dispatch (default 1)" ), + model: Optional[str] = typer.Option( + None, + "--model", + help="Model alias for claude -p (e.g. haiku, sonnet, opus). " + "Default: the claude CLI session default. Pin to compare trigger " + "accuracy across model tiers.", + ), + runs: int = typer.Option( + 1, "--runs", min=1, help="Passes per prompt (default 1). Use >1 to average " + "out single-pass noise when comparing models.", + ), ) -> None: """Run a skill evaluation matrix. @@ -1416,8 +1427,8 @@ def skill_eval_run( # Dry-run path: zero quota, NO dispatcher construction, NO claude required. if dry_run: - # D10: variant_id fixed = 1, runs = 1. - planned = len(entries) * 1 * 1 + # D10: variant_id fixed = 1; runs is caller-controlled (default 1). + planned = len(entries) * 1 * runs console.print( f"[bold]Dry-run:[/bold] planned [cyan]{planned}[/cyan] invocation(s) " f"for skill [bold]{skill}[/bold] — spends 0 quota" @@ -1445,12 +1456,12 @@ def skill_eval_run( ) # Run the evaluation matrix. - disp = ClaudeSubprocessDispatcher() + disp = ClaudeSubprocessDispatcher(model=model) _aggregator.bounded_run( skill=skill, entries=entries, dispatcher=disp, - runs=1, + runs=runs, out_path=out_path, resume=resume, max_concurrency=max_concurrency, diff --git a/src/mapify_cli/skills_eval/description_optimizer.py b/src/mapify_cli/skills_eval/description_optimizer.py index 03f352c8..0077c03d 100644 --- a/src/mapify_cli/skills_eval/description_optimizer.py +++ b/src/mapify_cli/skills_eval/description_optimizer.py @@ -84,11 +84,20 @@ def _set_frontmatter_description(content: str, new_desc: str) -> str: Rules: - File MUST start with ``---\\n`` (YAML frontmatter). - Frontmatter MUST contain a line starting with ``description:``. - - Only that single line is replaced; all other keys and the body are + - The ``description:`` key AND any block-scalar / indented continuation + lines belonging to it are replaced; all other keys and the body are preserved unchanged. - ``new_desc`` is serialised as a double-quoted YAML scalar so that embedded colons, quotes, and newlines parse back correctly. + Block-scalar awareness is essential: most shipped skills declare + ``description: |`` followed by an indented paragraph. Replacing only the + ``description:`` line (the original behaviour) left the indented body + orphaned below a now-quoted scalar — invalid YAML that silently unregistered + the skill, so it never triggered and every eval cell mis-read as a + non-trigger. We therefore also consume the continuation lines (those indented + deeper than the key) before substituting the single replacement line. + Raises ``ValueError`` if the preconditions are not met (fail-loud). """ if not content.startswith("---\n"): @@ -117,10 +126,27 @@ def _set_frontmatter_description(content: str, new_desc: str) -> str: "SKILL.md frontmatter does not contain a 'description:' key" ) + # Consume the description value's continuation lines: a block scalar + # (``description: |`` / ``>``) or any plain multi-line value spans the + # following lines indented deeper than the key. Stop at the next same-or-less + # indented key (e.g. ``effort:``) or a blank line. + key_line = fm_lines[desc_line_idx] + key_indent = len(key_line) - len(key_line.lstrip()) + end_idx = desc_line_idx + 1 + while end_idx < len(fm_lines): + cont = fm_lines[end_idx] + if cont.strip() == "": + break + cont_indent = len(cont) - len(cont.lstrip()) + if cont_indent <= key_indent: + break + end_idx += 1 + # Serialise new_desc as a double-quoted YAML scalar so round-trip is safe. escaped = new_desc.replace("\\", "\\\\").replace('"', '\\"').replace("\n", "\\n") - new_line = f'description: "{escaped}"' - fm_lines[desc_line_idx] = new_line + new_line = f'{" " * key_indent}description: "{escaped}"' + # Replace the key line + its consumed continuation lines with the single line. + fm_lines[desc_line_idx:end_idx] = [new_line] new_frontmatter = "\n".join(fm_lines) return "---\n" + new_frontmatter + body_after diff --git a/src/mapify_cli/skills_eval/dispatcher.py b/src/mapify_cli/skills_eval/dispatcher.py index e656438a..6c5d4d32 100644 --- a/src/mapify_cli/skills_eval/dispatcher.py +++ b/src/mapify_cli/skills_eval/dispatcher.py @@ -20,6 +20,7 @@ import logging import os import random +import re import shutil import subprocess import tempfile @@ -102,6 +103,31 @@ def dispatch(self, prompt: str) -> DispatchResult: # plugin as its state dir — see _eval_subprocess_env. _NO_TELEGRAM_STATE_DIRNAME = ".map-eval-no-telegram" +# Tools the eval ``claude -p`` subprocess is NOT allowed to use. +# +# Trigger-accuracy eval only needs to observe whether the right skill *fires* +# (the first ``Skill`` tool_use in the transcript) — it does NOT need the skill +# *body* to execute. Letting the body run is actively harmful for EXECUTING +# skills: ``map-check`` would run the full ``make check`` test suite, and +# ``map-task`` / ``map-efficient`` would dispatch sub-agents — each blowing past +# the per-call timeout (recorded as a false non-trigger) and, for sub-agents, +# leaving orphaned child processes and burning real quota after the parent is +# killed. Disallowing the heavy/mutating/network tools lets the skill still +# TRIGGER (description-driven, recorded in the transcript) while the body cannot +# perform slow or side-effecting work. Read-only tools (Read/Grep/Glob) stay +# allowed so triggering behaviour is unaffected. The ``Skill`` tool itself is +# never disallowed — it is exactly the signal we measure. +_EVAL_DISALLOWED_TOOLS: tuple[str, ...] = ( + "Bash", + "Edit", + "Write", + "NotebookEdit", + "Task", + "Agent", + "WebFetch", + "WebSearch", +) + def _eval_subprocess_env(cwd: Path) -> dict[str, str]: """Build the environment for an eval ``claude -p`` subprocess. @@ -240,6 +266,19 @@ def _derive_triggered_skill(session_id: str, cwd: Path) -> str | None: return _parse_transcript_for_skill(transcript_path) +def _cwd_to_project_slug(path: Path) -> str: + """Replicate Claude Code's ``cwd -> ~/.claude/projects/`` transform. + + Every character that is NOT alphanumeric or ``-`` (so ``/``, ``.``, ``_``, + spaces, …) is replaced by ``-``. Verified against real project dirs: a + ``tempfile.mkdtemp()`` name such as ``mapeval-s_u5zv32`` — which contains an + underscore — is recorded under ``…-mapeval-s-u5zv32``. A naive + ``replace("/","-").replace(".","-")`` misses the ``_`` and silently fails to + locate the transcript (a false non-trigger on the affected dispatches). + """ + return re.sub(r"[^0-9A-Za-z-]", "-", str(path)) + + def _locate_transcript(session_id: str, cwd: Path) -> Path | None: """Return the path to the JSONL transcript or ``None`` if not found.""" projects_dir = Path.home() / ".claude" / "projects" @@ -250,8 +289,8 @@ def _locate_transcript(session_id: str, cwd: Path) -> Path | None: if matches: return matches[0] - # Fallback: reconstruct slug from cwd (``/`` and ``.`` → ``-``). - cwd_slug = str(cwd).replace("/", "-").replace(".", "-") + # Fallback: reconstruct slug from cwd (Claude Code's transform). + cwd_slug = _cwd_to_project_slug(cwd) fallback = projects_dir / cwd_slug / f"{session_id}.jsonl" if fallback.exists(): return fallback @@ -259,6 +298,50 @@ def _locate_transcript(session_id: str, cwd: Path) -> Path | None: return None +def _locate_transcript_by_cwd(cwd: Path) -> Path | None: + """Locate the transcript for a dispatch by cwd slug — no ``session_id`` needed. + + Used for timeout recovery: when ``claude -p`` is killed by the per-call + timeout we never receive the result envelope, so the ``session_id`` is + unknown. But Claude Code writes transcripts under + ``~/.claude/projects//.jsonl`` where ```` is + the cwd path with ``/`` and ``.`` replaced by ``-``. Each dispatch runs in a + unique throwaway temp cwd, so that slug dir holds exactly one session's + transcript(s); return the most-recently-modified ``*.jsonl`` (or ``None``). + + Claude Code derives the slug from the *resolved* cwd, so on macOS a + ``tempfile.mkdtemp()`` path under ``/var/folders/...`` (where ``/var`` is a + symlink to ``/private/var``) is recorded under the ``/private/var/...`` slug. + We therefore try the slug for BOTH the raw and the resolved cwd, and finally + fall back to globbing by the unique temp-dir name (the project dir name ends + with it) so a slug-derivation change cannot silently break recovery. + """ + projects_dir = Path.home() / ".claude" / "projects" + if not projects_dir.is_dir(): + return None + + candidates: list[Path] = [] + bases = {cwd} + try: + bases.add(cwd.resolve()) + except OSError: # pragma: no cover - resolve only fails on exotic FS errors + pass + for base in bases: + slug_dir = projects_dir / _cwd_to_project_slug(base) + if slug_dir.is_dir(): + candidates.extend(slug_dir.glob("*.jsonl")) + + if not candidates: + # Fallback: the project dir name ends with the (slugified) unique temp + # dir name — slugify so an underscore in the mkdtemp suffix still matches. + name_slug = _cwd_to_project_slug(Path(cwd.name)) + candidates.extend(projects_dir.glob(f"*{name_slug}/*.jsonl")) + + if not candidates: + return None + return max(candidates, key=lambda p: p.stat().st_mtime) + + def _parse_transcript_for_skill(path: Path) -> str | None: """Return the first ``Skill`` tool_use ``input.skill`` value, or ``None``.""" try: @@ -363,6 +446,23 @@ def _parse_envelope(stdout: str) -> tuple[str, TokenUsage | None, str]: _JITTER_MAX: float = 2.0 +class _TimeoutRecovery: + """Internal signal: the subprocess timed out but the trigger was recovered. + + Returned by ``_run_once`` when ``claude -p`` exceeds the per-call timeout yet + its transcript was already written (the ``Skill`` tool_use fires early, in the + first assistant turn, well before a slow skill BODY finishes). This is a VALID + trigger verdict — not a transient failure — so it is NOT retried. + ``triggered_skill`` is the fired skill name, or ``None`` if the transcript + exists but no skill fired by the time the process was killed. + """ + + __slots__ = ("triggered_skill",) + + def __init__(self, triggered_skill: str | None) -> None: + self.triggered_skill = triggered_skill + + class ClaudeSubprocessDispatcher(VariantDispatcher): """Real ``claude -p`` dispatcher for production/manual eval runs. @@ -396,9 +496,10 @@ def __init__( self, *, source_claude_dir: Path | None = None, - timeout: float = 120.0, + timeout: float = 90.0, max_retries: int = 2, backoff_base: float = 2.0, + model: str | None = None, ) -> None: """Initialise the dispatcher. @@ -407,8 +508,19 @@ def __init__( source_claude_dir: Path to the ``.claude/`` directory to seed from. Defaults to ``Path.cwd() / ".claude"`` at construction time. + model: + Optional model alias passed to ``claude -p --model`` (e.g. ``haiku``, + ``sonnet``, ``opus``). ``None`` (default) omits the flag, so the + ``claude`` CLI resolves the session default — preserving prior + behaviour. Pin this to measure how trigger accuracy varies by model + tier (model choice is known to dominate prompt phrasing). timeout: - Per-attempt timeout in seconds passed to ``subprocess.run``. + Per-attempt timeout in seconds passed to ``subprocess.run``. The + default (90 s) sits well above the observed trigger latency (the + first ``Skill`` tool_use lands in the transcript in ~30 s) so most + calls finish naturally; a slow EXECUTING skill that overruns is not + mis-recorded — ``_run_once`` recovers the trigger from the transcript + on timeout (see ``_TimeoutRecovery``). max_retries: Number of *additional* retry attempts after the first failure. Total attempts = 1 + max_retries. @@ -423,6 +535,7 @@ def __init__( self._timeout = timeout self._max_retries = max_retries self._backoff_base = backoff_base + self._model = model # Holds the error message from the latest _run_once call. Instance-scoped # (not class-level) so the safe-sequential-only assumption is explicit. self._last_error: str = "" @@ -472,7 +585,17 @@ def _dispatch_with_retry( ``max_retries=2`` means up to 3 total attempts (attempt 0, 1, 2). After all attempts are exhausted, returns an error ``DispatchResult``. """ - argv = ["claude", "-p", prompt, "--output-format", "json"] + argv = [ + "claude", + "-p", + prompt, + "--output-format", + "json", + "--disallowed-tools", + *_EVAL_DISALLOWED_TOOLS, + ] + if self._model: + argv += ["--model", self._model] last_error: str = "" for attempt in range(self._max_retries + 1): @@ -489,9 +612,20 @@ def _dispatch_with_retry( time.sleep(sleep_s) result = self._run_once(argv, tmp) - if result is not None: + if isinstance(result, subprocess.CompletedProcess): # Successful subprocess run — parse and return. return self._build_result(result, tmp, t_total_start) + if isinstance(result, _TimeoutRecovery): + # Timed out, but the trigger was recovered from the transcript — + # a valid verdict, not a transient failure. Do NOT retry. + duration_s = time.monotonic() - t_total_start + return DispatchResult( + raw_output="", + triggered_skill=result.triggered_skill, + token_usage=None, + duration_s=duration_s, + error=None, + ) # _run_once returned None => transient failure; last_error was set. last_error = self._last_error @@ -509,10 +643,14 @@ def _run_once( self, argv: list[str], cwd: Path, - ) -> subprocess.CompletedProcess[str] | None: - """Run ``argv`` once; return ``CompletedProcess`` on success, ``None`` on failure. - - Side-effect: sets ``self._last_error`` on failure. + ) -> subprocess.CompletedProcess[str] | _TimeoutRecovery | None: + """Run ``argv`` once. + + Returns: + - ``CompletedProcess`` on a normal (returncode 0) run, + - ``_TimeoutRecovery`` when the call timed out but its transcript was + already written (trigger recovered — a valid verdict, not a failure), + - ``None`` on a transient failure (retryable; ``self._last_error`` set). """ try: proc = subprocess.run( @@ -523,10 +661,22 @@ def _run_once( cwd=cwd, env=_eval_subprocess_env(cwd), ) - except subprocess.TimeoutExpired as exc: - self._last_error = f"timeout after {self._timeout}s: {exc}" - logger.warning("dispatch: subprocess timed out: %s", exc) - return None + except subprocess.TimeoutExpired: + # The trigger (first ``Skill`` tool_use) is written to the transcript + # early — before a slow EXECUTING skill body finishes. Recover it from + # the transcript (located by cwd slug, since the timeout gave us no + # result envelope / session_id) rather than mis-recording a false + # non-trigger. + # + # A timeout is TERMINAL — never retried. Retrying re-runs the same + # expensive call (another full ``self._timeout`` wait) with no reason + # to behave differently; the original design retried it 3x, turning a + # single overrun into ~3x the wall-clock for every executing-skill + # positive. The settle-poll below defeats the flush/visibility race + # where the just-killed process's transcript is not yet visible at the + # exact instant of the kill. + recovered = self._recover_trigger_after_timeout(cwd) + return _TimeoutRecovery(triggered_skill=recovered) except OSError as exc: self._last_error = f"OSError: {exc}" logger.warning("dispatch: OSError running claude: %s", exc) @@ -550,6 +700,36 @@ def _run_once( return proc + # Settle-poll for transcript recovery after a timeout kill: total ~1.5 s. + _RECOVERY_POLL_ATTEMPTS: int = 5 + _RECOVERY_POLL_INTERVAL_S: float = 0.3 + + def _recover_trigger_after_timeout(self, cwd: Path) -> str | None: + """Recover the fired-skill from the transcript after a timeout kill. + + Polls briefly because the killed process's transcript may not be visible + at the exact instant of the kill. Returns the fired skill name, or + ``None`` if no transcript appears (genuine non-trigger / startup hang) or + it contains no ``Skill`` tool_use. + """ + for attempt in range(self._RECOVERY_POLL_ATTEMPTS): + transcript = _locate_transcript_by_cwd(cwd) + if transcript is not None: + recovered = _parse_transcript_for_skill(transcript) + logger.warning( + "dispatch: timed out after %ss; recovered trigger=%r (transcript found)", + self._timeout, + recovered, + ) + return recovered + if attempt < self._RECOVERY_POLL_ATTEMPTS - 1: + time.sleep(self._RECOVERY_POLL_INTERVAL_S) + logger.warning( + "dispatch: timed out after %ss; no transcript located — recording non-trigger", + self._timeout, + ) + return None + def _build_result( self, proc: subprocess.CompletedProcess[str], diff --git a/src/mapify_cli/templates/agents/actor.md b/src/mapify_cli/templates/agents/actor.md index a95b39a7..c7c2c88d 100644 --- a/src/mapify_cli/templates/agents/actor.md +++ b/src/mapify_cli/templates/agents/actor.md @@ -844,252 +844,3 @@ Default: ``` - - - - -## Example 1: New Feature (Backend API) - -**Subtask**: "Create user registration endpoint" - -**Approach**: POST /api/register with email/password validation, bcrypt hashing, JWT response. Using standard password hashing and validation patterns. - -**Code Changes**: - -```python -# ===== File: api/auth.py ===== -from flask import request, jsonify -from werkzeug.security import generate_password_hash -from sqlalchemy.exc import IntegrityError -import jwt -import re - -EMAIL_REGEX = re.compile(r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$') - -@app.route('/api/register', methods=['POST']) -def register(): - """Register new user with email and password.""" - try: - data = request.get_json() - if not data: - return jsonify({'error': 'Request body required'}), 400 - - email = data.get('email', '').strip().lower() - password = data.get('password', '') - - # Validation (fail fast) - if not email or not password: - return jsonify({'error': 'Email and password required'}), 400 - if not EMAIL_REGEX.match(email): - return jsonify({'error': 'Invalid email format'}), 400 - if len(password) < 8: - return jsonify({'error': 'Password must be at least 8 characters'}), 400 - - # Create user with hashed password - hashed = generate_password_hash(password, method='pbkdf2:sha256') - user = User(email=email, password_hash=hashed) - db.session.add(user) - db.session.commit() - - # Generate JWT for immediate authentication - token = jwt.encode( - {'user_id': user.id, 'email': user.email}, - app.config['SECRET_KEY'], - algorithm='HS256' - ) - return jsonify({'token': token, 'user_id': user.id}), 201 - - except IntegrityError: - db.session.rollback() - return jsonify({'error': 'Email already registered'}), 409 - except Exception as e: - db.session.rollback() - app.logger.exception("Registration error") - return jsonify({'error': 'Internal server error'}), 500 -``` - -**Trade-offs**: -- **Decision**: pbkdf2:sha256 for hashing -- **Alternatives**: bcrypt (slower, more secure), argon2 (newest) -- **Rationale**: Good security/performance balance for current scale and threat model -- **Trade-off**: Slightly less secure than bcrypt, but adequate given expected user base and upgrade path documented - -**Testing Considerations**: -1. test_register_valid_input_returns_201 -2. test_register_duplicate_email_returns_409 -3. test_register_invalid_email_returns_400 -4. test_register_short_password_returns_400 -5. test_register_missing_fields_returns_400 -6. test_register_sql_injection_safe -7. test_register_token_decodes_correctly - - ---- - -## Example 2: Modification to Existing Code - -**Subtask**: "Add rate limiting to existing API endpoint" - -**Approach**: Token bucket rate limiting using existing Redis connection. 100 req/min per IP. Using standard Redis rate limiting patterns. - -**Code Changes**: - -```python -# ===== File: api/middleware.py ===== -# ... (existing imports unchanged) ... -from functools import wraps -import time - -# ADDED: Rate limiter class (after line 45) -class RateLimiter: - """Token bucket rate limiter using Redis.""" - - def __init__(self, redis_client, limit=100, window=60): - self.redis = redis_client - self.limit = limit - self.window = window - - def is_allowed(self, key: str) -> tuple[bool, int]: - """Check if request allowed. Returns (allowed, remaining).""" - pipe = self.redis.pipeline() - now = time.time() - window_start = now - self.window - - pipe.zremrangebyscore(key, 0, window_start) - pipe.zadd(key, {str(now): now}) - pipe.zcard(key) - pipe.expire(key, self.window) - - _, _, count, _ = pipe.execute() - remaining = max(0, self.limit - count) - return count <= self.limit, remaining - - -def rate_limit(limit=100, window=60): - """Decorator for rate-limited endpoints.""" - limiter = RateLimiter(redis_client, limit, window) - - def decorator(f): - @wraps(f) - def wrapped(*args, **kwargs): - key = f"ratelimit:{request.remote_addr}:{f.__name__}" - allowed, remaining = limiter.is_allowed(key) - - if not allowed: - return jsonify({ - 'error': 'Rate limit exceeded', - 'retry_after': window - }), 429 - - response = f(*args, **kwargs) - # Add rate limit headers - if hasattr(response, 'headers'): - response.headers['X-RateLimit-Remaining'] = str(remaining) - return response - return wrapped - return decorator - -# ===== File: api/endpoints.py ===== -# MODIFIED: Added rate limiting decorator (line 45) - -@app.route('/api/data', methods=['GET']) -@rate_limit(limit=100, window=60) # ← ADDED -def get_data(): - return jsonify(fetch_data()) -``` - -**Trade-offs**: -- **Decision**: Redis-based (not in-memory) -- **Alternatives**: In-memory dict, database counter -- **Rationale**: Shared across instances, survives restarts -- **Trade-off**: Redis dependency, but already in stack - -**Testing Considerations**: -1. test_rate_limit_under_limit_passes -2. test_rate_limit_exceeded_returns_429 -3. test_rate_limit_window_resets -4. test_rate_limit_per_ip_isolation -5. test_rate_limit_headers_present - - ---- - -## Example 3: Blocked Task - -**Subtask**: "Add caching to user service" - -**Output**: -```yaml -status: BLOCKED -reason: "Implementing caching requires modifying core/database.py outside {{allowed_scope}} (api/*)" - -attempted_approaches: - - "Decorator-based caching in api/ layer" - problem: "Cache invalidation requires hooks in database write methods" - - "Read-through caching only" - problem: "Stale data unacceptable per requirements" - -proposed_solutions: - 1: "Expand {{allowed_scope}} to include core/database.py" - pros: "Clean implementation, proper invalidation" - cons: "Broader change scope" - 2: "Reduce subtask to read-only caching (accepts staleness)" - pros: "Stays in scope" - cons: "Tech debt, data consistency issues" - -recommendation: "Option 1 - clean solution worth scope expansion" -``` - ---- - -## Example 4: MCP Tool Failure Handling - -**Subtask**: "Implement WebSocket reconnection logic" - - -**Code Changes**: -```typescript -// ===== File: lib/websocket.ts ===== - -export class ReconnectingWebSocket { - private ws: WebSocket | null = null; - private reconnectAttempts = 0; - private maxAttempts = 5; - private baseDelay = 1000; - - connect(url: string): void { - this.ws = new WebSocket(url); - - this.ws.onclose = () => { - if (this.reconnectAttempts < this.maxAttempts) { - const delay = this.baseDelay * Math.pow(2, this.reconnectAttempts); - this.reconnectAttempts++; - console.log(`Reconnecting in ${delay}ms (attempt ${this.reconnectAttempts})`); - setTimeout(() => this.connect(url), delay); - } else { - console.error('Max reconnection attempts reached'); - this.onMaxRetriesExceeded?.(); - } - }; - - this.ws.onopen = () => { - this.reconnectAttempts = 0; // Reset on successful connection - }; - } - - onMaxRetriesExceeded?: () => void; -} -``` - -**Trade-offs**: -- **Decision**: Standard exponential backoff pattern -- **Mitigation**: Added comprehensive tests, runtime version check -- **Risk**: May use outdated API - flagged for manual review - -**Testing Considerations**: -1. test_reconnect_success_resets_counter -2. test_reconnect_exponential_backoff_timing -3. test_reconnect_max_attempts_triggers_callback -4. test_reconnect_handles_immediate_disconnect - - diff --git a/src/mapify_cli/templates/agents/predictor.md b/src/mapify_cli/templates/agents/predictor.md index 1d9f7789..1cde7c0e 100644 --- a/src/mapify_cli/templates/agents/predictor.md +++ b/src/mapify_cli/templates/agents/predictor.md @@ -219,7 +219,6 @@ Before any analysis, classify the change to select appropriate depth: 2. Classify risk (usually "low") 3. Output JSON with confidence 0.9+ - ### Tier 2: STANDARD Analysis (1-2 minutes) **When to use**: - Internal function signature changes @@ -978,422 +977,6 @@ Classify dependencies to prioritize updates and avoid missing any category. - - -## Example 1: API Function Signature Change (Breaking) - -### Input (Actor Proposal) -```python -# Proposal: Add required 'region' parameter to get_weather() function - -# Current (in weather_service.py) -def get_weather(city: str) -> dict: - """Fetch weather data for a city.""" - return api_call(f"weather?city={city}") - -# Proposed change -def get_weather(city: str, region: str) -> dict: - """Fetch weather data for a city in a specific region.""" - return api_call(f"weather?city={city}®ion={region}") -``` - -### Analysis Process - -**Step 1: Dependency analysis** (Grep) -- Query: `grep -r "get_weather" --include="*.py"` -- Result: - ``` - src/services/weather_reporter.py:15: get_weather(user.city) - src/api/handlers.py:42: get_weather(request.params['city']) - tests/test_weather.py:8: get_weather("Seattle") - tests/integration/test_api.py:23: get_weather(city_name) - scripts/daily_report.py:56: get_weather(config.default_city) - ``` - -**Step 2: Manual verification** (Grep) -- Grep for `"get_weather"` in configs, docs: - ```bash - config/api_endpoints.yaml:12: - name: get_weather - docs/api.md:45: ## get_weather(city) - README.md:78: weather = get_weather("Boston") - ``` - -**Step 3: Breaking change classification** -- Function signature change: **BREAKING** (added required parameter) -- 5 direct call sites + 3 documentation references -- Risk: HIGH (5-10 usage sites, breaking change) - -**Step 4: Confidence assessment** -- Automated analysis: completed (all call sites found) -- Manual verification: completed (found doc references) -- Test coverage: verified (2 tests exist) -- Confidence: 0.85 (high -- complete picture) - -### Output (JSON) - -```json -{ - "analysis_metadata": { - "tier_selected": "2", - "tier_rationale": "Internal function change with 5-10 affected files; standard analysis appropriate", - "tools_used": ["grep"], - "analysis_duration_seconds": 75 - }, - "predicted_state": { - "modified_files": ["src/services/weather_service.py"], - "affected_components": [ - "src/services/weather_reporter.py", - "src/api/handlers.py", - "tests/test_weather.py", - "tests/integration/test_api.py", - "scripts/daily_report.py", - "config/api_endpoints.yaml", - "docs/api.md", - "README.md" - ], - "breaking_changes": [ - "Function signature change: get_weather() now requires 'region' parameter. All 5 call sites will break immediately without update.", - "API contract change: External clients may call this function—need versioning or deprecation period." - ], - "required_updates": [ - { - "type": "dependent_code", - "location": "src/services/weather_reporter.py:15", - "reason": "Call site missing required 'region' parameter. Must add region determination logic.", - "priority": "must" - }, - { - "type": "dependent_code", - "location": "src/api/handlers.py:42", - "reason": "Call site missing required 'region' parameter. Must extract from request or add default.", - "priority": "must" - }, - { - "type": "dependent_code", - "location": "scripts/daily_report.py:56", - "reason": "Script call missing required 'region' parameter. Add to config file.", - "priority": "must" - }, - { - "type": "test", - "location": "tests/test_weather.py:8", - "reason": "Test call missing required 'region' parameter. Update test to include region.", - "priority": "must" - }, - { - "type": "test", - "location": "tests/integration/test_api.py:23", - "reason": "Integration test missing required 'region' parameter. Add to test setup.", - "priority": "must" - }, - { - "type": "documentation", - "location": "docs/api.md:45", - "reason": "API documentation shows old signature. Update to include 'region' parameter.", - "priority": "should" - }, - { - "type": "documentation", - "location": "README.md:78", - "reason": "Example code uses old signature. Update example to include region.", - "priority": "should" - }, - { - "type": "configuration", - "location": "config/api_endpoints.yaml:12", - "reason": "Config file may reference function parameters. Verify and update if needed.", - "priority": "could" - } - ], - "edge_cases_detected": [] - }, - "risk_assessment": "high", - "confidence": { - "score": 0.85, - "tier_base": 0.50, - "adjustments": [ - {"category": "A", "factor": "grep found comprehensive usage data", "adjustment": 0.20}, - {"category": "B", "factor": "grep results clear and complete", "adjustment": 0.15}, - {"category": "C", "factor": "Static code (no flags)", "adjustment": 0.00}, - {"category": "D", "factor": "Tests exist for affected files", "adjustment": 0.00} - ], - "flags": [] - }, - "recommendation": "SUGGEST: Add 'region' parameter with default value first (e.g., region='US'), deploy, then make required in subsequent release. This allows graceful migration for external clients." -} -``` - -## Example 2: Internal Refactoring (Non-Breaking) - -### Input (Actor Proposal) -```python -# Proposal: Refactor email validation into separate class - -# Current (in validators.py) -def validate_email(email: str) -> bool: - """Validate email format.""" - import re - pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$' - return bool(re.match(pattern, email)) - -# Proposed change (validators.py) -class EmailValidator: - """Email validation with configurable rules.""" - - def __init__(self, allow_subdomains: bool = True): - self.allow_subdomains = allow_subdomains - self._pattern = self._build_pattern() - - def _build_pattern(self) -> str: - if self.allow_subdomains: - return r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$' - return r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9]+\.[a-zA-Z]{2,}$' - - def validate(self, email: str) -> bool: - import re - return bool(re.match(self._pattern, email)) - -# Wrapper for backward compatibility -def validate_email(email: str) -> bool: - """Validate email format (legacy function).""" - validator = EmailValidator() - return validator.validate(email) -``` - -### Analysis Process - -**Step 1: Dependency analysis** (Grep) -- Query: `grep -r "validate_email" --include="*.py"` -- Result: - ``` - src/auth/registration.py:12: if not validate_email(email): - src/api/validators.py:8: validate_email(user_input) - tests/test_validators.py:15: assert validate_email("test@example.com") - ``` - -**Step 2: Breaking change classification** -- Function signature: **SAME** (validate_email still exists with same signature) -- Behavior: **SAME** (returns same results for same inputs) -- Internal implementation: **CHANGED** (but consumers don't care) -- Risk: LOW (internal refactoring, backward compatible) - -**Step 3: Manual verification** -- Grep for `"validate_email"` in docs: - ``` - docs/validation.md:23: Use validate_email() to check email format - ``` -- No breaking changes in docs (function still exists) - -**Step 4: Test coverage check** -- Read tests/test_validators.py: - ```python - def test_validate_email(): - assert validate_email("test@example.com") == True - assert validate_email("invalid") == False - ``` -- Tests still pass (function behavior unchanged) - -### Output (JSON) - -```json -{ - "analysis_metadata": { - "tier_selected": "1", - "tier_rationale": "Internal refactoring with backward-compatible wrapper; minimal analysis sufficient", - "tools_used": ["grep"], - "analysis_duration_seconds": 25 - }, - "predicted_state": { - "modified_files": ["src/validators.py"], - "affected_components": [ - "src/auth/registration.py", - "src/api/validators.py", - "tests/test_validators.py" - ], - "breaking_changes": [], - "required_updates": [ - { - "type": "test", - "location": "tests/test_validators.py", - "reason": "OPTIONAL: Add tests for new EmailValidator class to validate configurable behavior. Legacy validate_email() tests still pass.", - "priority": "could" - }, - { - "type": "documentation", - "location": "docs/validation.md:23", - "reason": "OPTIONAL: Document new EmailValidator class for developers who want configurable validation. Legacy function docs still accurate.", - "priority": "could" - } - ], - "edge_cases_detected": [] - }, - "risk_assessment": "low", - "confidence": { - "score": 0.90, - "tier_base": 0.85, - "adjustments": [ - {"category": "B", "factor": "grep confirms all usages", "adjustment": 0.05}, - {"category": "C", "factor": "Static code (no dynamic patterns)", "adjustment": 0.00}, - {"category": "D", "factor": "Existing tests pass unchanged", "adjustment": 0.00} - ], - "flags": [] - }, - "recommendation": "Safe to proceed. Backward compatibility maintained via wrapper function. Consider adding tests for new class functionality." -} -``` - -## Example 3: Module Rename (High Impact) - -### Input (Actor Proposal) -``` -Proposal: Rename module src/utils/string_helpers.py → src/utils/text_utils.py -Reason: Better naming consistency with existing text_processing.py module -``` - -### Analysis Process - -**Step 1: Dependency analysis** (Grep) -- Query: `grep -r "string_helpers" --include="*.py"` -- Result: - ``` - src/api/formatting.py:3: from utils.string_helpers import sanitize_input - src/services/email_sender.py:5: from utils import string_helpers - src/models/user.py:2: from utils.string_helpers import normalize_name - tests/test_string_helpers.py:1: from utils.string_helpers import * - tests/integration/test_api.py:8: import utils.string_helpers as sh - ``` - -**Step 2: Manual verification** (Grep for string "string_helpers") -- Found in: - ``` - .github/workflows/test.yml:15: - pytest tests/test_string_helpers.py - docs/utilities.md:12: ## string_helpers module - scripts/lint.sh:8: pylint src/utils/string_helpers.py - setup.py:25: "utils.string_helpers", - ``` - -**Step 3: Breaking change classification** -- Module path change: **BREAKING** (all imports break immediately) -- 5 direct imports + 4 references in config/scripts -- Risk: HIGH (module rename breaks all imports) - -**Step 4: Confidence assessment** -- Automated analysis: ✓ (imports found) -- Manual verification: ✓ (found configs, CI, setup.py) -- Potential misses: dynamic imports, string references in unknown config files -- Confidence: 0.75 (medium-high—might miss edge cases) - -### Output (JSON) - -```json -{ - "analysis_metadata": { - "tier_selected": "3", - "tier_rationale": "Module rename affects >10 files; Phase 2 grep found many importers; deep analysis required", - "tools_used": ["grep"], - "analysis_duration_seconds": 180 - }, - "predicted_state": { - "modified_files": [ - "src/utils/string_helpers.py → src/utils/text_utils.py" - ], - "affected_components": [ - "src/api/formatting.py", - "src/services/email_sender.py", - "src/models/user.py", - "tests/test_string_helpers.py", - "tests/integration/test_api.py", - ".github/workflows/test.yml", - "docs/utilities.md", - "scripts/lint.sh", - "setup.py" - ], - "breaking_changes": [ - "Module path change: src/utils/string_helpers → src/utils/text_utils. All imports using old path will break immediately.", - "File rename breaks pytest discovery and explicit file references in CI/CD pipelines." - ], - "required_updates": [ - { - "type": "dependent_code", - "location": "src/api/formatting.py:3", - "reason": "Update import: from utils.string_helpers → from utils.text_utils", - "priority": "must" - }, - { - "type": "dependent_code", - "location": "src/services/email_sender.py:5", - "reason": "Update import: from utils import string_helpers → import utils.text_utils", - "priority": "must" - }, - { - "type": "dependent_code", - "location": "src/models/user.py:2", - "reason": "Update import: from utils.string_helpers → from utils.text_utils", - "priority": "must" - }, - { - "type": "test", - "location": "tests/test_string_helpers.py", - "reason": "Rename test file to tests/test_text_utils.py + update import statement", - "priority": "must" - }, - { - "type": "test", - "location": "tests/integration/test_api.py:8", - "reason": "Update import alias: import utils.string_helpers as sh → import utils.text_utils as sh", - "priority": "must" - }, - { - "type": "configuration", - "location": ".github/workflows/test.yml:15", - "reason": "Update test file path: test_string_helpers.py → test_text_utils.py", - "priority": "must" - }, - { - "type": "configuration", - "location": "scripts/lint.sh:8", - "reason": "Update file path: string_helpers.py → text_utils.py", - "priority": "must" - }, - { - "type": "configuration", - "location": "setup.py:25", - "reason": "Update package list: utils.string_helpers → utils.text_utils", - "priority": "must" - }, - { - "type": "documentation", - "location": "docs/utilities.md:12", - "reason": "Update module name in documentation: string_helpers → text_utils", - "priority": "should" - } - ], - "edge_cases_detected": [ - { - "type": "dynamic_code", - "description": "Potential dynamic imports not detectable via static analysis (importlib, __import__)", - "confidence_impact": -0.10, - "mitigation": "Search for 'importlib.*string_helpers' or '__import__.*string_helpers' patterns" - } - ] - }, - "risk_assessment": "high", - "confidence": { - "score": 0.75, - "tier_base": 0.50, - "adjustments": [ - {"category": "A", "factor": "grep found comprehensive import data", "adjustment": 0.20}, - {"category": "B", "factor": "grep results verified manually", "adjustment": 0.15}, - {"category": "C", "factor": "Potential dynamic imports (edge case)", "adjustment": -0.10}, - {"category": "D", "factor": "Config/CI files not fully verifiable", "adjustment": 0.00} - ], - "flags": [] - }, - "recommendation": "HIGH-RISK: Module rename requires coordinated updates across 9 files. Run full test suite after updates. Check for dynamic imports using Grep: 'importlib.*string_helpers' or '__import__.*string_helpers'. Consider deprecation path if external packages depend on this module." -} -``` - - diff --git a/src/mapify_cli/templates/agents/reflector.md b/src/mapify_cli/templates/agents/reflector.md index 5eab3d66..79206995 100644 --- a/src/mapify_cli/templates/agents/reflector.md +++ b/src/mapify_cli/templates/agents/reflector.md @@ -125,7 +125,6 @@ When multiple patterns detected, extract in order (max 3 per reflection): - ## Project Information @@ -652,145 +651,6 @@ Use {{language}}/{{framework}} syntax. Show specific library, configuration, exp -# COMPLETE EXAMPLES - - - -## Security Failure - SQL Injection - -**Input**: F-string query construction, Monitor flags injection vulnerability - -**Output**: -```json -{ - "reasoning": "F-string interpolation with user input creates SQL injection. Attacker can input ' OR '1'='1 to bypass auth or '; DROP TABLE to execute commands. Root: didn't understand difference between interpolation and parameterized queries, or assumed sanitization elsewhere. Violates defense-in-depth. Sequential-thinking reveals: developers learn SQL with concatenation (simpler) before parameterized queries (secure). Pattern: NEVER trust user input, ALWAYS use parameterized queries.", - - "error_identification": "get_user() line 2 uses f-string (f\"SELECT * FROM users WHERE username = '{username}'\") with user input. Allows SQL injection. Monitor flagged critical security vulnerability.", - - "root_cause_analysis": "Used string interpolation vs parameterized queries due to: 1) Not understanding SQL injection, 2) Assuming sanitization elsewhere, 3) Copying insecure pattern. Violated Trust Boundary - untrusted input crosses to trusted query without validation. #1 OWASP vulnerability.", - - "correct_approach": "Use parameterized queries:\n\n```python\n# ❌ INCORRECT - SQL injection\nquery = f\"SELECT * FROM users WHERE id = {user_id}\"\n\n# ✅ SECURE - parameterized\nquery = \"SELECT * FROM users WHERE id = ?\"\ncursor.execute(query, (user_id,))\n\n# ✅ SECURE - ORM\nuser = db.query(User).filter(User.id == user_id).first()\n```\n\nParameterized queries separate structure from data, preventing injection. Database handles escaping.", - - "key_insight": "When constructing SQL with user input, NEVER use string interpolation/concatenation. ALWAYS use parameterized queries (?, %s) or ORM. Prevents SQL injection by separating structure from data. Even 'sanitized' input can be exploited.", - - "suggested_new_bullets": [{ - "section": "SECURITY_PATTERNS", - "content": "SQL Injection Prevention: Always use parameterized queries (prepared statements) with user input. NEVER use f-strings, concatenation (+), or format(). Parameterized queries separate SQL structure from data. Even 'sanitized' input can be bypassed. ORMs (SQLAlchemy, Django) use parameterized queries automatically.", - "code_example": "```python\n# ❌ VULNERABLE\nquery = f\"SELECT * FROM users WHERE id = {user_id}\"\n\n# ✅ SECURE\nquery = \"SELECT * FROM users WHERE id = ?\"\ncursor.execute(query, (user_id,))\n```", - "related_to": ["sec-0008"] - }] -} -``` - - - - - -## Performance Success - Caching with High Hit Rate - -**Input**: Redis decorator implementation, Evaluator scores 10/10 performance - -**Output**: -```json -{ - "reasoning": "Achieved 9.0 overall (10/10 performance) via Redis caching decorator. Success factors: 1) Hierarchical keys (user:123:profile) enable pattern invalidation, 2) Decorator pattern = reusable, 3) TTL (1h) balances freshness/performance, 4) JSON serialization handles complex objects. Shows understanding: cache reads not writes, TTL prevents stale data, graceful failures. Follows Redis best practices. Reusable lesson: decorator pattern for cross-cutting concerns - separates caching from business logic (SRP).", - - "error_identification": "No errors. Decorator handles hits (return cached) and misses (fetch, populate). Hierarchical keys (user:{id}:profile) enable smart invalidation. TTL prevents indefinite stale data. 10/10 performance score.", - - "root_cause_analysis": "Success from caching fundamentals: 1) Cache read path not writes (writes invalidate), 2) TTL as safety net (prevents stale if invalidation fails), 3) Hierarchical keys enable partial clearing (user:123:*), 4) Decorator promotes reusability. Likely profiled first (identifying hot path) vs speculative caching. Data-driven approach = high hit rate.", - - "correct_approach": "Profile-first caching with decorator:\n\n```python\nimport redis\nimport json\nfrom functools import wraps\n\ndef cache_query(key_pattern, ttl=3600):\n def decorator(func):\n @wraps(func)\n def wrapper(*args, **kwargs):\n cache_key = key_pattern.format(*args, **kwargs)\n cached = redis_client.get(cache_key)\n if cached:\n return json.loads(cached)\n result = func(*args, **kwargs)\n redis_client.setex(cache_key, ttl, json.dumps(result))\n return result\n return wrapper\n return decorator\n\n@cache_query(\"user:{0}:profile\", ttl=3600)\ndef get_user_profile(user_id):\n return db.query(User).get(user_id)\n```\n\nReusable, testable, separates concerns.", - - "key_insight": "When implementing caching, profile first to identify hot paths (80/20 rule). Use decorator pattern for reusability and clean business logic. Design hierarchical cache keys (namespace:entity:id) for targeted invalidation. Include TTL as safety net against stale data.", - - "bullet_updates": [{"bullet_id": "perf-0023", "tag": "helpful", "reason": "Redis caching pattern correctly implemented, 10/10 performance"}], - - "suggested_new_bullets": [{ - "section": "PERFORMANCE_PATTERNS", - "content": "Decorator Pattern for Caching: Use Python decorators for caching as cross-cutting concern, keeping business logic clean. Profile first for hot paths. Hierarchical keys (namespace:entity:id:attribute) enable smart invalidation. Always include TTL to prevent indefinite stale data. Decorators = reusability without duplication.", - "code_example": "```python\ndef cache_query(key_pattern, ttl=3600):\n def decorator(func):\n @wraps(func)\n def wrapper(*args):\n key = key_pattern.format(*args)\n cached = redis_client.get(key)\n if cached: return json.loads(cached)\n result = func(*args)\n redis_client.setex(key, ttl, json.dumps(result))\n return result\n return wrapper\n return decorator\n```", - "related_to": ["perf-0023"] - }] -} -``` - - - - - -## Partial Success - Async Implementation with Race Condition - -**Input**: Async batch processor works in tests, intermittent failures in production. Evaluator: 6.5/10. Monitor: PASS. Predictor: "Medium risk - async complexity" - -**Output**: -```json -{ - "reasoning": "Partial success (6.5/10) reveals subtle async issue. Implementation works in isolated tests but fails under concurrent load. Sequential-thinking analysis: 1) Tests pass individually, 2) Production has concurrent requests, 3) Shared mutable state (self.results dict) accessed without locks, 4) Race condition: multiple coroutines modify dict simultaneously, 5) Root principle: async != thread-safe; shared state needs synchronization. Success factors: correct async/await usage, proper error handling. Failure factor: assumed async execution was sequential.", - - "error_identification": "BatchProcessor.process_items() lines 15-22: self.results[item.id] = result modifies shared dict from multiple coroutines. Works in tests (sequential) but races in production (concurrent). Monitor PASS (no security), Evaluator 6.5/10 (reliability issues).", - - "root_cause_analysis": "5 Whys: 1) Why intermittent? Race condition on shared state. 2) Why race? Multiple coroutines modify self.results simultaneously. 3) Why no lock? Assumed asyncio single-threaded means no concurrency. 4) Why that assumption? Conflated 'single thread' with 'no concurrency' - asyncio IS concurrent via cooperative scheduling. 5) Root principle: 'Shared mutable state requires synchronization regardless of concurrency model.'", - - "correct_approach": "Use asyncio-native synchronization:\n\n```python\nimport asyncio\n\nclass BatchProcessor:\n def __init__(self):\n self.results = {}\n self._lock = asyncio.Lock() # asyncio Lock, not threading\n \n async def process_items(self, items):\n # ❌ INCORRECT - race condition\n # for item in items:\n # result = await self.process_one(item)\n # self.results[item.id] = result # Unsafe!\n \n # ✅ CORRECT - synchronized access\n async def safe_process(item):\n result = await self.process_one(item)\n async with self._lock:\n self.results[item.id] = result\n return result\n \n return await asyncio.gather(*[safe_process(i) for i in items])\n```\n\nPrefer returning values over mutating shared state.", - - "key_insight": "When using asyncio with shared mutable state, ALWAYS use asyncio.Lock for synchronization. Asyncio is single-threaded but concurrent - race conditions occur at await points. Better pattern: design to return values rather than mutate shared state.", - - "contradiction_resolved": "BatchProcessor must aggregate results from concurrent coroutines AND NOT corrupt shared state via interleaved writes, where naive trade-off fails because dropping concurrency loses throughput while shared-state mutation without synchronization loses correctness.", - - "triz_principle": [24, 13], - - "bullet_updates": [ - {"bullet_id": "async-0023", "tag": "helpful", "reason": "Pattern correctly identified async concurrency risk, referenced for context"} - ], - - "suggested_new_bullets": [ - { - "section": "IMPLEMENTATION_PATTERNS", - "content": "Asyncio Shared State: asyncio is single-threaded but concurrent via cooperative scheduling. Race conditions occur when multiple coroutines modify shared state between await points. Use asyncio.Lock (not threading.Lock) for synchronization, or better, design functions to return values instead of mutating shared state. Common trap: assuming 'single thread' means 'no concurrency issues.'", - "code_example": "```python\n# ❌ RACE CONDITION\nself.results[id] = await process(item)\n\n# ✅ SYNCHRONIZED\nasync with self._lock:\n self.results[id] = await process(item)\n\n# ✅ BEST - No shared state\nreturn await asyncio.gather(*[process(i) for i in items])\n```", - "related_to": ["async-0023"] - } - ] -} -``` - -**Why This Example Matters**: Demonstrates multi-signal reconciliation (Monitor PASS + Evaluator partial), complex root cause requiring sequential-thinking, updating existing bullet while creating new one, and success+failure pattern extraction from single case. - - - - - -## Success - No New Bullet Needed (Patterns Validated) - -**Input**: Standard REST endpoint implementation, all validations pass, Evaluator: 9.0/10 - -**Output**: -```json -{ - "reasoning": "Successful REST implementation following established patterns. Actor correctly applied standard patterns for input validation, error responses, async handling, and authentication - no novel learning. Success validates existing pattern completeness for standard REST patterns.", - - "error_identification": "No errors. Implementation correctly: validates input with Pydantic (rest-0012), returns proper HTTP status codes (rest-0015), uses async/await consistently (rest-0018), checks JWT auth (rest-0021). All existing patterns applied correctly.", - - "root_cause_analysis": "Success root cause: Actor followed established REST patterns. Standard patterns provided comprehensive guidance. No novel decisions required - standard CRUD operation. This validates pattern coverage, not new learning opportunity.", - - "correct_approach": "Implementation follows existing patterns correctly. No correction needed.\n\n```python\n# Actor's implementation (correct)\n@router.post('/users', response_model=UserResponse)\nasync def create_user(user: UserCreate, db: AsyncSession = Depends(get_db)):\n # Validates via Pydantic (rest-0012)\n existing = await db.execute(select(User).where(User.email == user.email))\n if existing.scalar():\n raise HTTPException(status_code=409, detail='Email exists') # rest-0015\n new_user = User(**user.dict())\n db.add(new_user)\n await db.commit() # rest-0018\n return new_user\n```", - - "key_insight": "When existing patterns comprehensively cover a use case, successful application validates coverage rather than generating new patterns. Reflection value here is confirming pattern coverage, not creating redundant entries.", - - "bullet_updates": [ - {"bullet_id": "rest-0012", "tag": "helpful", "reason": "Pydantic validation pattern correctly applied"}, - {"bullet_id": "rest-0015", "tag": "helpful", "reason": "HTTP status code pattern correctly applied"}, - {"bullet_id": "rest-0018", "tag": "helpful", "reason": "Async pattern correctly applied"} - ], - - "suggested_new_bullets": [] -} -``` - -**Why This Example Matters**: Shows correct behavior when NO new bullet is needed - validates deduplication logic and demonstrates that empty suggested_new_bullets is valid output when patterns already exist. - - - # CONSTRAINTS diff --git a/src/mapify_cli/templates/agents/synthesizer.md b/src/mapify_cli/templates/agents/synthesizer.md index 62772d28..8a57fcbe 100644 --- a/src/mapify_cli/templates/agents/synthesizer.md +++ b/src/mapify_cli/templates/agents/synthesizer.md @@ -134,7 +134,6 @@ class SpecificationContract: performance_constraints: PerformanceConstraints | None = None security_requirements: list[str] | None = None - @dataclass class TypeConstraints: """Structured type constraints.""" @@ -142,7 +141,6 @@ class TypeConstraints: output_type: str # "ProcessResult" generic_params: list[str] | None = None # ["T", "U"] - @dataclass class SideEffectsPolicy: """Side effects policy with explicit allowed/forbidden.""" @@ -151,7 +149,6 @@ class SideEffectsPolicy: filesystem: Literal["allowed", "forbidden"] = "forbidden" database: Literal["allowed", "forbidden"] = "forbidden" - @dataclass class PerformanceConstraints: """Performance constraints.""" @@ -223,7 +220,6 @@ class MonitorAnalysis: # For synthesis recommended_as_base: bool # True if good as spine - @dataclass class CompatibilityFeatures: """Features used by orchestrator for deterministic compatibility scoring.""" @@ -246,7 +242,6 @@ class RetryContext: failed_decisions: list[str] # Decision IDs likely causing issues strategy_adjustments: list[str] # What to change in next attempt - @dataclass class ToolError: """Error from a validation tool.""" @@ -279,7 +274,6 @@ def is_variant_viable(m: MonitorAnalysis, specification_contract) -> bool: return getattr(m, "spec_contract_compliant", False) - viable_variants = [ (v, m) for v, m in zip(variants, monitor_results) if is_variant_viable(m, specification_contract) @@ -485,7 +479,6 @@ def resolve_conflict( winner = by_variant_score[0] return winner, f"From higher-scored variant: {winner.source_variant}" - def violates_contract(decision: Decision, contract: SpecificationContract) -> bool: """Check if decision violates contract constraints.""" # Check prohibited patterns @@ -506,7 +499,6 @@ def violates_contract(decision: Decision, contract: SpecificationContract) -> bo return False - # Apply resolution to all conflicts conflict_resolutions = [] for conflict_pair in all_conflicts: @@ -549,7 +541,6 @@ else: strategy = "fresh_generation" base_variant = None - def select_best_base( variants: list, monitor_results: list[MonitorAnalysis], @@ -596,31 +587,6 @@ def select_best_base( 5. Validate against contract constraints ``` -**Example**: -```python -# Base variant (v3) code: -def process_data(items): - results = [] - for item in items: - results.append(transform(item)) - return results - -# After applying decisions: -# - dec-001 (v1): "Use list comprehension for performance" -# - dec-005 (v2): "Add input validation" -# - dec-007 (v1): "Add type hints" - -def process_data(items: List[Item]) -> List[Result]: - """Process items with validation and transformation.""" - # Decision dec-005: Add input validation - if not items: - raise ValueError("Items list cannot be empty") - - # Decision dec-001: Use list comprehension for performance - # Decision dec-007: Add type hints (applied above) - return [transform(item) for item in items] -``` - #### Strategy: fresh_generation (compatibility < 0.7) ``` @@ -641,65 +607,6 @@ def process_data(items: List[Item]) -> List[Result]: 6. Validate against contract constraints ``` -**Example**: -```python -# Fresh generation from contract + decisions -# Contract: function_signature="def process(data: List[User]) -> ProcessResult" -# error_model="Result" -# concurrency_model="sync" -# Accepted decisions: -# - dec-002 (v1): "Return Result type for explicit error handling" -# - dec-003 (v2): "Validate all User fields before processing" -# - dec-009 (v3): "Log processing metrics for observability" - -from dataclasses import dataclass -from typing import List -import logging - -@dataclass -class ProcessResult: - """Result of processing operation.""" - success: bool - processed_count: int - error: str | None = None - -def process(data: List[User]) -> ProcessResult: - """ - Process user data with validation and observability. - - Implements: - - Decision dec-002: Result type for explicit error handling - - Decision dec-003: Validate all User fields - - Decision dec-009: Log processing metrics - """ - logger = logging.getLogger(__name__) - - # Decision dec-003: Validate all User fields before processing - try: - for user in data: - if not user.email or not user.name: - return ProcessResult( - success=False, - processed_count=0, - error=f"Invalid user: {user.id}" - ) - except Exception as e: - logger.error(f"Validation failed: {e}") - return ProcessResult(success=False, processed_count=0, error=str(e)) - - # Process validated data - processed = 0 - for user in data: - # ... processing logic ... - processed += 1 - - # Decision dec-009: Log processing metrics for observability - logger.info(f"Processed {processed} users successfully") - - # Decision dec-002: Return Result type - return ProcessResult(success=True, processed_count=processed) -``` - **Critical Rules for Code Generation**: 1. **NEVER copy code blocks directly** - always rewrite for coherence @@ -764,7 +671,6 @@ def validate_coherence(code: str, decisions: list[Decision], contract: Specifica return len(issues) == 0, issues - is_valid, validation_issues = validate_coherence( generated_code, [d for d in all_decisions if d.status == "accepted"], @@ -800,7 +706,6 @@ class SynthesizerOutput: conflict_resolutions: list[ConflictResolution] confidence: float # 0.0-1.0 - @dataclass class ConflictResolution: """Record of how a conflict was resolved.""" diff --git a/src/mapify_cli/templates/agents/task-decomposer.md b/src/mapify_cli/templates/agents/task-decomposer.md index 2f356d65..127be45d 100644 --- a/src/mapify_cli/templates/agents/task-decomposer.md +++ b/src/mapify_cli/templates/agents/task-decomposer.md @@ -914,157 +914,6 @@ For detailed examples and anti-patterns, see: `.claude/references/decomposition- -## REFERENCE EXAMPLES - -### Example A: Simple CRUD Feature - -**Goal**: "Add ability to archive projects" - -**Why this decomposition works**: Single domain, clear boundaries, well-known pattern - -**Full JSON Output**: -```json -{ - "schema_version": "2.0", - "analysis": { - "assumptions": ["Project model exists with standard CRUD operations"], - "open_questions": [], - "scope_vs_quality_decision": "Full feature scope implemented with non-negotiable quality standards. No scope reductions needed for this standard CRUD extension.", - "architecture_graph_summary": "Project -[add_field]-> archived_at; ProjectService -[calls]-> Project.update(); api/routes/projects.py -[uses]-> ProjectService; GET /projects -[filters_by]-> archived_at" - }, - "blueprint": { - "id": "project-archive", - "summary": "Add soft-delete archiving to projects via archived_at timestamp field with API endpoints and filtered listings", - "quality_requirements": { - "min_security_score": 7, - "min_functionality_score": 7, - "error_handling_required": true, - "rationale": "Standard CRUD operations require robust error handling and data validation" - }, - "subtasks": [ - { - "id": "ST-001", - "title": "Add archived_at field to Project model", - "description": "Add nullable DateTime 'archived_at' to Project model in models/project.py. Generate migration. null = active, non-null = archived.", - "dependencies": [], - "risk_level": "low", - "risks": [], - "security_critical": false, - "complexity_score": 3, - "complexity_rationale": "Score 3: Base(1) + Novelty(+0) + Deps(+0) + Scope(+2) + Risk(+0) = 3", - "aag_contract": "ProjectModel -> add_field(archived_at: DateTime?) -> migration passes, existing queries unaffected", - "validation_criteria": [ - "VC1 [AC-1]: Project model has archived_at field (nullable DateTime)", - "VC2 [INV-1]: Migration runs without errors on existing data", - "VC3 [INV-1]: SELECT count(*) FROM projects WHERE archived_at IS NOT NULL returns 0" - ], - "test_strategy": { - "unit": "Test field accepts timestamps, test default is null", - "integration": "Test migration applies cleanly", - "e2e": "N/A", - "scenario_dimensions": { - "happy_path": "Test archived_at stores valid timestamp", - "error": "Test migration rollback on failure", - "edge_case": "Test field with existing null values in table", - "security": "N/A" - } - }, - "affected_files": [ - "models/project.py", - "migrations/versions/add_archived_at_to_projects.py" - ] - }, - { - "id": "ST-002", - "title": "Add archive_project() and unarchive_project() to ProjectService", - "description": "Add methods to services/project_service.py. archive_project(id) sets archived_at=now(), unarchive_project(id) sets archived_at=null.", - "dependencies": ["ST-001"], - "risk_level": "low", - "risks": [], - "security_critical": false, - "complexity_score": 3, - "complexity_rationale": "Score 3: Base(1) + Novelty(+0) + Deps(+1) + Scope(+1) + Risk(+0) = 3", - "aag_contract": "ProjectService -> archive_project(id) + unarchive_project(id) -> sets/clears archived_at, raises ProjectNotFoundError for invalid IDs", - "validation_criteria": [ - "VC1 [AC-2]: archive_project(valid_id) sets archived_at to current UTC timestamp", - "VC2 [AC-2]: unarchive_project(valid_id) sets archived_at to null", - "VC3 [AC-2]: Both raise ProjectNotFoundError for invalid IDs" - ], - "test_strategy": { - "unit": "Test archive sets timestamp, test unarchive clears it, test invalid ID handling", - "integration": "Test database persistence", - "e2e": "N/A" - }, - "affected_files": [ - "services/project_service.py" - ] - }, - { - "id": "ST-003", - "title": "Add POST /projects/{id}/archive and /unarchive endpoints", - "description": "Create endpoints in api/routes/projects.py. Require project owner permission. Return updated project JSON.", - "dependencies": ["ST-002"], - "risk_level": "low", - "risks": [], - "security_critical": false, - "complexity_score": 4, - "complexity_rationale": "Score 4: Base(1) + Novelty(+0) + Deps(+1) + Scope(+2) + Risk(+0) = 4", - "aag_contract": "ProjectRoutes -> POST /projects/{id}/archive|unarchive -> 200+JSON for owner, 403 for non-owner, 404 for invalid ID", - "validation_criteria": [ - "VC1 [AC-3]: POST /projects/{id}/archive returns 200 + archived project JSON", - "VC2 [AC-3]: POST /projects/{id}/unarchive returns 200 + active project JSON", - "VC3 [SEC-1]: Non-owner receives 403 Forbidden", - "VC4 [AC-3]: Invalid ID returns 404 Not Found" - ], - "contracts": [ - {"type": "postcondition", "assertion": "response.status == 200 AND project.archived_at IS SET WHEN valid_owner", "scope": "endpoint"}, - {"type": "postcondition", "assertion": "response.status == 403 WHEN NOT project.owner_id == request.user_id", "scope": "endpoint"}, - {"type": "postcondition", "assertion": "response.status == 404 WHEN project NOT EXISTS", "scope": "endpoint"} - ], - "implementation_hint": "Use existing @require_project_owner decorator", - "test_strategy": { - "unit": "Test request validation, test permission decorator", - "integration": "Test service integration, test response format", - "e2e": "Full flow: auth → archive → verify response → verify DB" - }, - "affected_files": [ - "api/routes/projects.py", - "api/schemas/project.py" - ] - }, - { - "id": "ST-004", - "title": "Filter archived projects from GET /projects by default", - "description": "Modify listing in api/routes/projects.py to exclude archived_at IS NOT NULL. Add ?include_archived=true param.", - "dependencies": ["ST-001"], - "risk_level": "low", - "risks": [], - "security_critical": false, - "complexity_score": 3, - "complexity_rationale": "Score 3: Base(1) + Novelty(+0) + Deps(+1) + Scope(+1) + Risk(+0) = 3", - "aag_contract": "ProjectRoutes -> GET /projects(?include_archived=bool) -> excludes archived by default, includes when param=true", - "validation_criteria": [ - "VC1 [AC-4]: GET /projects excludes archived projects by default", - "VC2 [AC-4]: GET /projects?include_archived=true returns all projects", - "VC3 [AC-4]: Response includes is_archived boolean field" - ], - "test_strategy": { - "unit": "Test filter logic, test query param parsing", - "integration": "Test with mix of archived/active projects", - "e2e": "N/A" - }, - "affected_files": [ - "api/routes/projects.py", - "services/project_service.py" - ] - } - ] - } -} -``` - ---- - ## Additional Examples For complex decomposition scenarios, see: `.claude/references/decomposition-examples.md` diff --git a/src/mapify_cli/templates/skills/map-debug/SKILL.md b/src/mapify_cli/templates/skills/map-debug/SKILL.md index f3d05cb3..f733ad8b 100644 --- a/src/mapify_cli/templates/skills/map-debug/SKILL.md +++ b/src/mapify_cli/templates/skills/map-debug/SKILL.md @@ -1,7 +1,7 @@ --- name: map-debug -description: | - Structured MAP debugging via task-decomposer, actor, and monitor agents. Use when reproducing a bug, isolating a regression, or diagnosing an error with specialized agents. Do NOT use for greenfield features; use map-plan or map-efficient. +description: |- + Structured MAP debugging via task-decomposer, actor, and monitor agents. Use when reproducing a bug, isolating a regression, or diagnosing an error with specialized agents — including failing or flaky tests (pytest AssertionError), crashes and segmentation faults, memory-corruption or memory errors in native/C extensions, intermittent or load-dependent failures (e.g. 500s under load), data-corruption bugs that only appear in production, scripts or hooks that silently exit or produce no output, and any "find the root cause" / "walk me through diagnosing" / "help me investigate" request. Trigger on phrasing like "failing test", "mysterious error", "segfault", "memory error", "intermittently fails", "root cause", "isolate the cause", "debug why", "diagnose", or "investigate the error". Prefer this over generic investigation when the user wants a systematic decompose-reproduce-fix-verify workflow. Do NOT use for greenfield features; use map-plan or map-efficient. effort: medium disable-model-invocation: true argument-hint: "[bug description]" diff --git a/src/mapify_cli/templates_src/agents/actor.md.jinja b/src/mapify_cli/templates_src/agents/actor.md.jinja index a95b39a7..c7c2c88d 100644 --- a/src/mapify_cli/templates_src/agents/actor.md.jinja +++ b/src/mapify_cli/templates_src/agents/actor.md.jinja @@ -844,252 +844,3 @@ Default: ``` - - - - -## Example 1: New Feature (Backend API) - -**Subtask**: "Create user registration endpoint" - -**Approach**: POST /api/register with email/password validation, bcrypt hashing, JWT response. Using standard password hashing and validation patterns. - -**Code Changes**: - -```python -# ===== File: api/auth.py ===== -from flask import request, jsonify -from werkzeug.security import generate_password_hash -from sqlalchemy.exc import IntegrityError -import jwt -import re - -EMAIL_REGEX = re.compile(r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$') - -@app.route('/api/register', methods=['POST']) -def register(): - """Register new user with email and password.""" - try: - data = request.get_json() - if not data: - return jsonify({'error': 'Request body required'}), 400 - - email = data.get('email', '').strip().lower() - password = data.get('password', '') - - # Validation (fail fast) - if not email or not password: - return jsonify({'error': 'Email and password required'}), 400 - if not EMAIL_REGEX.match(email): - return jsonify({'error': 'Invalid email format'}), 400 - if len(password) < 8: - return jsonify({'error': 'Password must be at least 8 characters'}), 400 - - # Create user with hashed password - hashed = generate_password_hash(password, method='pbkdf2:sha256') - user = User(email=email, password_hash=hashed) - db.session.add(user) - db.session.commit() - - # Generate JWT for immediate authentication - token = jwt.encode( - {'user_id': user.id, 'email': user.email}, - app.config['SECRET_KEY'], - algorithm='HS256' - ) - return jsonify({'token': token, 'user_id': user.id}), 201 - - except IntegrityError: - db.session.rollback() - return jsonify({'error': 'Email already registered'}), 409 - except Exception as e: - db.session.rollback() - app.logger.exception("Registration error") - return jsonify({'error': 'Internal server error'}), 500 -``` - -**Trade-offs**: -- **Decision**: pbkdf2:sha256 for hashing -- **Alternatives**: bcrypt (slower, more secure), argon2 (newest) -- **Rationale**: Good security/performance balance for current scale and threat model -- **Trade-off**: Slightly less secure than bcrypt, but adequate given expected user base and upgrade path documented - -**Testing Considerations**: -1. test_register_valid_input_returns_201 -2. test_register_duplicate_email_returns_409 -3. test_register_invalid_email_returns_400 -4. test_register_short_password_returns_400 -5. test_register_missing_fields_returns_400 -6. test_register_sql_injection_safe -7. test_register_token_decodes_correctly - - ---- - -## Example 2: Modification to Existing Code - -**Subtask**: "Add rate limiting to existing API endpoint" - -**Approach**: Token bucket rate limiting using existing Redis connection. 100 req/min per IP. Using standard Redis rate limiting patterns. - -**Code Changes**: - -```python -# ===== File: api/middleware.py ===== -# ... (existing imports unchanged) ... -from functools import wraps -import time - -# ADDED: Rate limiter class (after line 45) -class RateLimiter: - """Token bucket rate limiter using Redis.""" - - def __init__(self, redis_client, limit=100, window=60): - self.redis = redis_client - self.limit = limit - self.window = window - - def is_allowed(self, key: str) -> tuple[bool, int]: - """Check if request allowed. Returns (allowed, remaining).""" - pipe = self.redis.pipeline() - now = time.time() - window_start = now - self.window - - pipe.zremrangebyscore(key, 0, window_start) - pipe.zadd(key, {str(now): now}) - pipe.zcard(key) - pipe.expire(key, self.window) - - _, _, count, _ = pipe.execute() - remaining = max(0, self.limit - count) - return count <= self.limit, remaining - - -def rate_limit(limit=100, window=60): - """Decorator for rate-limited endpoints.""" - limiter = RateLimiter(redis_client, limit, window) - - def decorator(f): - @wraps(f) - def wrapped(*args, **kwargs): - key = f"ratelimit:{request.remote_addr}:{f.__name__}" - allowed, remaining = limiter.is_allowed(key) - - if not allowed: - return jsonify({ - 'error': 'Rate limit exceeded', - 'retry_after': window - }), 429 - - response = f(*args, **kwargs) - # Add rate limit headers - if hasattr(response, 'headers'): - response.headers['X-RateLimit-Remaining'] = str(remaining) - return response - return wrapped - return decorator - -# ===== File: api/endpoints.py ===== -# MODIFIED: Added rate limiting decorator (line 45) - -@app.route('/api/data', methods=['GET']) -@rate_limit(limit=100, window=60) # ← ADDED -def get_data(): - return jsonify(fetch_data()) -``` - -**Trade-offs**: -- **Decision**: Redis-based (not in-memory) -- **Alternatives**: In-memory dict, database counter -- **Rationale**: Shared across instances, survives restarts -- **Trade-off**: Redis dependency, but already in stack - -**Testing Considerations**: -1. test_rate_limit_under_limit_passes -2. test_rate_limit_exceeded_returns_429 -3. test_rate_limit_window_resets -4. test_rate_limit_per_ip_isolation -5. test_rate_limit_headers_present - - ---- - -## Example 3: Blocked Task - -**Subtask**: "Add caching to user service" - -**Output**: -```yaml -status: BLOCKED -reason: "Implementing caching requires modifying core/database.py outside {{allowed_scope}} (api/*)" - -attempted_approaches: - - "Decorator-based caching in api/ layer" - problem: "Cache invalidation requires hooks in database write methods" - - "Read-through caching only" - problem: "Stale data unacceptable per requirements" - -proposed_solutions: - 1: "Expand {{allowed_scope}} to include core/database.py" - pros: "Clean implementation, proper invalidation" - cons: "Broader change scope" - 2: "Reduce subtask to read-only caching (accepts staleness)" - pros: "Stays in scope" - cons: "Tech debt, data consistency issues" - -recommendation: "Option 1 - clean solution worth scope expansion" -``` - ---- - -## Example 4: MCP Tool Failure Handling - -**Subtask**: "Implement WebSocket reconnection logic" - - -**Code Changes**: -```typescript -// ===== File: lib/websocket.ts ===== - -export class ReconnectingWebSocket { - private ws: WebSocket | null = null; - private reconnectAttempts = 0; - private maxAttempts = 5; - private baseDelay = 1000; - - connect(url: string): void { - this.ws = new WebSocket(url); - - this.ws.onclose = () => { - if (this.reconnectAttempts < this.maxAttempts) { - const delay = this.baseDelay * Math.pow(2, this.reconnectAttempts); - this.reconnectAttempts++; - console.log(`Reconnecting in ${delay}ms (attempt ${this.reconnectAttempts})`); - setTimeout(() => this.connect(url), delay); - } else { - console.error('Max reconnection attempts reached'); - this.onMaxRetriesExceeded?.(); - } - }; - - this.ws.onopen = () => { - this.reconnectAttempts = 0; // Reset on successful connection - }; - } - - onMaxRetriesExceeded?: () => void; -} -``` - -**Trade-offs**: -- **Decision**: Standard exponential backoff pattern -- **Mitigation**: Added comprehensive tests, runtime version check -- **Risk**: May use outdated API - flagged for manual review - -**Testing Considerations**: -1. test_reconnect_success_resets_counter -2. test_reconnect_exponential_backoff_timing -3. test_reconnect_max_attempts_triggers_callback -4. test_reconnect_handles_immediate_disconnect - - diff --git a/src/mapify_cli/templates_src/agents/predictor.md.jinja b/src/mapify_cli/templates_src/agents/predictor.md.jinja index 1d9f7789..1cde7c0e 100644 --- a/src/mapify_cli/templates_src/agents/predictor.md.jinja +++ b/src/mapify_cli/templates_src/agents/predictor.md.jinja @@ -219,7 +219,6 @@ Before any analysis, classify the change to select appropriate depth: 2. Classify risk (usually "low") 3. Output JSON with confidence 0.9+ - ### Tier 2: STANDARD Analysis (1-2 minutes) **When to use**: - Internal function signature changes @@ -978,422 +977,6 @@ Classify dependencies to prioritize updates and avoid missing any category. - - -## Example 1: API Function Signature Change (Breaking) - -### Input (Actor Proposal) -```python -# Proposal: Add required 'region' parameter to get_weather() function - -# Current (in weather_service.py) -def get_weather(city: str) -> dict: - """Fetch weather data for a city.""" - return api_call(f"weather?city={city}") - -# Proposed change -def get_weather(city: str, region: str) -> dict: - """Fetch weather data for a city in a specific region.""" - return api_call(f"weather?city={city}®ion={region}") -``` - -### Analysis Process - -**Step 1: Dependency analysis** (Grep) -- Query: `grep -r "get_weather" --include="*.py"` -- Result: - ``` - src/services/weather_reporter.py:15: get_weather(user.city) - src/api/handlers.py:42: get_weather(request.params['city']) - tests/test_weather.py:8: get_weather("Seattle") - tests/integration/test_api.py:23: get_weather(city_name) - scripts/daily_report.py:56: get_weather(config.default_city) - ``` - -**Step 2: Manual verification** (Grep) -- Grep for `"get_weather"` in configs, docs: - ```bash - config/api_endpoints.yaml:12: - name: get_weather - docs/api.md:45: ## get_weather(city) - README.md:78: weather = get_weather("Boston") - ``` - -**Step 3: Breaking change classification** -- Function signature change: **BREAKING** (added required parameter) -- 5 direct call sites + 3 documentation references -- Risk: HIGH (5-10 usage sites, breaking change) - -**Step 4: Confidence assessment** -- Automated analysis: completed (all call sites found) -- Manual verification: completed (found doc references) -- Test coverage: verified (2 tests exist) -- Confidence: 0.85 (high -- complete picture) - -### Output (JSON) - -```json -{ - "analysis_metadata": { - "tier_selected": "2", - "tier_rationale": "Internal function change with 5-10 affected files; standard analysis appropriate", - "tools_used": ["grep"], - "analysis_duration_seconds": 75 - }, - "predicted_state": { - "modified_files": ["src/services/weather_service.py"], - "affected_components": [ - "src/services/weather_reporter.py", - "src/api/handlers.py", - "tests/test_weather.py", - "tests/integration/test_api.py", - "scripts/daily_report.py", - "config/api_endpoints.yaml", - "docs/api.md", - "README.md" - ], - "breaking_changes": [ - "Function signature change: get_weather() now requires 'region' parameter. All 5 call sites will break immediately without update.", - "API contract change: External clients may call this function—need versioning or deprecation period." - ], - "required_updates": [ - { - "type": "dependent_code", - "location": "src/services/weather_reporter.py:15", - "reason": "Call site missing required 'region' parameter. Must add region determination logic.", - "priority": "must" - }, - { - "type": "dependent_code", - "location": "src/api/handlers.py:42", - "reason": "Call site missing required 'region' parameter. Must extract from request or add default.", - "priority": "must" - }, - { - "type": "dependent_code", - "location": "scripts/daily_report.py:56", - "reason": "Script call missing required 'region' parameter. Add to config file.", - "priority": "must" - }, - { - "type": "test", - "location": "tests/test_weather.py:8", - "reason": "Test call missing required 'region' parameter. Update test to include region.", - "priority": "must" - }, - { - "type": "test", - "location": "tests/integration/test_api.py:23", - "reason": "Integration test missing required 'region' parameter. Add to test setup.", - "priority": "must" - }, - { - "type": "documentation", - "location": "docs/api.md:45", - "reason": "API documentation shows old signature. Update to include 'region' parameter.", - "priority": "should" - }, - { - "type": "documentation", - "location": "README.md:78", - "reason": "Example code uses old signature. Update example to include region.", - "priority": "should" - }, - { - "type": "configuration", - "location": "config/api_endpoints.yaml:12", - "reason": "Config file may reference function parameters. Verify and update if needed.", - "priority": "could" - } - ], - "edge_cases_detected": [] - }, - "risk_assessment": "high", - "confidence": { - "score": 0.85, - "tier_base": 0.50, - "adjustments": [ - {"category": "A", "factor": "grep found comprehensive usage data", "adjustment": 0.20}, - {"category": "B", "factor": "grep results clear and complete", "adjustment": 0.15}, - {"category": "C", "factor": "Static code (no flags)", "adjustment": 0.00}, - {"category": "D", "factor": "Tests exist for affected files", "adjustment": 0.00} - ], - "flags": [] - }, - "recommendation": "SUGGEST: Add 'region' parameter with default value first (e.g., region='US'), deploy, then make required in subsequent release. This allows graceful migration for external clients." -} -``` - -## Example 2: Internal Refactoring (Non-Breaking) - -### Input (Actor Proposal) -```python -# Proposal: Refactor email validation into separate class - -# Current (in validators.py) -def validate_email(email: str) -> bool: - """Validate email format.""" - import re - pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$' - return bool(re.match(pattern, email)) - -# Proposed change (validators.py) -class EmailValidator: - """Email validation with configurable rules.""" - - def __init__(self, allow_subdomains: bool = True): - self.allow_subdomains = allow_subdomains - self._pattern = self._build_pattern() - - def _build_pattern(self) -> str: - if self.allow_subdomains: - return r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$' - return r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9]+\.[a-zA-Z]{2,}$' - - def validate(self, email: str) -> bool: - import re - return bool(re.match(self._pattern, email)) - -# Wrapper for backward compatibility -def validate_email(email: str) -> bool: - """Validate email format (legacy function).""" - validator = EmailValidator() - return validator.validate(email) -``` - -### Analysis Process - -**Step 1: Dependency analysis** (Grep) -- Query: `grep -r "validate_email" --include="*.py"` -- Result: - ``` - src/auth/registration.py:12: if not validate_email(email): - src/api/validators.py:8: validate_email(user_input) - tests/test_validators.py:15: assert validate_email("test@example.com") - ``` - -**Step 2: Breaking change classification** -- Function signature: **SAME** (validate_email still exists with same signature) -- Behavior: **SAME** (returns same results for same inputs) -- Internal implementation: **CHANGED** (but consumers don't care) -- Risk: LOW (internal refactoring, backward compatible) - -**Step 3: Manual verification** -- Grep for `"validate_email"` in docs: - ``` - docs/validation.md:23: Use validate_email() to check email format - ``` -- No breaking changes in docs (function still exists) - -**Step 4: Test coverage check** -- Read tests/test_validators.py: - ```python - def test_validate_email(): - assert validate_email("test@example.com") == True - assert validate_email("invalid") == False - ``` -- Tests still pass (function behavior unchanged) - -### Output (JSON) - -```json -{ - "analysis_metadata": { - "tier_selected": "1", - "tier_rationale": "Internal refactoring with backward-compatible wrapper; minimal analysis sufficient", - "tools_used": ["grep"], - "analysis_duration_seconds": 25 - }, - "predicted_state": { - "modified_files": ["src/validators.py"], - "affected_components": [ - "src/auth/registration.py", - "src/api/validators.py", - "tests/test_validators.py" - ], - "breaking_changes": [], - "required_updates": [ - { - "type": "test", - "location": "tests/test_validators.py", - "reason": "OPTIONAL: Add tests for new EmailValidator class to validate configurable behavior. Legacy validate_email() tests still pass.", - "priority": "could" - }, - { - "type": "documentation", - "location": "docs/validation.md:23", - "reason": "OPTIONAL: Document new EmailValidator class for developers who want configurable validation. Legacy function docs still accurate.", - "priority": "could" - } - ], - "edge_cases_detected": [] - }, - "risk_assessment": "low", - "confidence": { - "score": 0.90, - "tier_base": 0.85, - "adjustments": [ - {"category": "B", "factor": "grep confirms all usages", "adjustment": 0.05}, - {"category": "C", "factor": "Static code (no dynamic patterns)", "adjustment": 0.00}, - {"category": "D", "factor": "Existing tests pass unchanged", "adjustment": 0.00} - ], - "flags": [] - }, - "recommendation": "Safe to proceed. Backward compatibility maintained via wrapper function. Consider adding tests for new class functionality." -} -``` - -## Example 3: Module Rename (High Impact) - -### Input (Actor Proposal) -``` -Proposal: Rename module src/utils/string_helpers.py → src/utils/text_utils.py -Reason: Better naming consistency with existing text_processing.py module -``` - -### Analysis Process - -**Step 1: Dependency analysis** (Grep) -- Query: `grep -r "string_helpers" --include="*.py"` -- Result: - ``` - src/api/formatting.py:3: from utils.string_helpers import sanitize_input - src/services/email_sender.py:5: from utils import string_helpers - src/models/user.py:2: from utils.string_helpers import normalize_name - tests/test_string_helpers.py:1: from utils.string_helpers import * - tests/integration/test_api.py:8: import utils.string_helpers as sh - ``` - -**Step 2: Manual verification** (Grep for string "string_helpers") -- Found in: - ``` - .github/workflows/test.yml:15: - pytest tests/test_string_helpers.py - docs/utilities.md:12: ## string_helpers module - scripts/lint.sh:8: pylint src/utils/string_helpers.py - setup.py:25: "utils.string_helpers", - ``` - -**Step 3: Breaking change classification** -- Module path change: **BREAKING** (all imports break immediately) -- 5 direct imports + 4 references in config/scripts -- Risk: HIGH (module rename breaks all imports) - -**Step 4: Confidence assessment** -- Automated analysis: ✓ (imports found) -- Manual verification: ✓ (found configs, CI, setup.py) -- Potential misses: dynamic imports, string references in unknown config files -- Confidence: 0.75 (medium-high—might miss edge cases) - -### Output (JSON) - -```json -{ - "analysis_metadata": { - "tier_selected": "3", - "tier_rationale": "Module rename affects >10 files; Phase 2 grep found many importers; deep analysis required", - "tools_used": ["grep"], - "analysis_duration_seconds": 180 - }, - "predicted_state": { - "modified_files": [ - "src/utils/string_helpers.py → src/utils/text_utils.py" - ], - "affected_components": [ - "src/api/formatting.py", - "src/services/email_sender.py", - "src/models/user.py", - "tests/test_string_helpers.py", - "tests/integration/test_api.py", - ".github/workflows/test.yml", - "docs/utilities.md", - "scripts/lint.sh", - "setup.py" - ], - "breaking_changes": [ - "Module path change: src/utils/string_helpers → src/utils/text_utils. All imports using old path will break immediately.", - "File rename breaks pytest discovery and explicit file references in CI/CD pipelines." - ], - "required_updates": [ - { - "type": "dependent_code", - "location": "src/api/formatting.py:3", - "reason": "Update import: from utils.string_helpers → from utils.text_utils", - "priority": "must" - }, - { - "type": "dependent_code", - "location": "src/services/email_sender.py:5", - "reason": "Update import: from utils import string_helpers → import utils.text_utils", - "priority": "must" - }, - { - "type": "dependent_code", - "location": "src/models/user.py:2", - "reason": "Update import: from utils.string_helpers → from utils.text_utils", - "priority": "must" - }, - { - "type": "test", - "location": "tests/test_string_helpers.py", - "reason": "Rename test file to tests/test_text_utils.py + update import statement", - "priority": "must" - }, - { - "type": "test", - "location": "tests/integration/test_api.py:8", - "reason": "Update import alias: import utils.string_helpers as sh → import utils.text_utils as sh", - "priority": "must" - }, - { - "type": "configuration", - "location": ".github/workflows/test.yml:15", - "reason": "Update test file path: test_string_helpers.py → test_text_utils.py", - "priority": "must" - }, - { - "type": "configuration", - "location": "scripts/lint.sh:8", - "reason": "Update file path: string_helpers.py → text_utils.py", - "priority": "must" - }, - { - "type": "configuration", - "location": "setup.py:25", - "reason": "Update package list: utils.string_helpers → utils.text_utils", - "priority": "must" - }, - { - "type": "documentation", - "location": "docs/utilities.md:12", - "reason": "Update module name in documentation: string_helpers → text_utils", - "priority": "should" - } - ], - "edge_cases_detected": [ - { - "type": "dynamic_code", - "description": "Potential dynamic imports not detectable via static analysis (importlib, __import__)", - "confidence_impact": -0.10, - "mitigation": "Search for 'importlib.*string_helpers' or '__import__.*string_helpers' patterns" - } - ] - }, - "risk_assessment": "high", - "confidence": { - "score": 0.75, - "tier_base": 0.50, - "adjustments": [ - {"category": "A", "factor": "grep found comprehensive import data", "adjustment": 0.20}, - {"category": "B", "factor": "grep results verified manually", "adjustment": 0.15}, - {"category": "C", "factor": "Potential dynamic imports (edge case)", "adjustment": -0.10}, - {"category": "D", "factor": "Config/CI files not fully verifiable", "adjustment": 0.00} - ], - "flags": [] - }, - "recommendation": "HIGH-RISK: Module rename requires coordinated updates across 9 files. Run full test suite after updates. Check for dynamic imports using Grep: 'importlib.*string_helpers' or '__import__.*string_helpers'. Consider deprecation path if external packages depend on this module." -} -``` - - diff --git a/src/mapify_cli/templates_src/agents/reflector.md.jinja b/src/mapify_cli/templates_src/agents/reflector.md.jinja index 5eab3d66..79206995 100644 --- a/src/mapify_cli/templates_src/agents/reflector.md.jinja +++ b/src/mapify_cli/templates_src/agents/reflector.md.jinja @@ -125,7 +125,6 @@ When multiple patterns detected, extract in order (max 3 per reflection): - ## Project Information @@ -652,145 +651,6 @@ Use {{language}}/{{framework}} syntax. Show specific library, configuration, exp -# COMPLETE EXAMPLES - - - -## Security Failure - SQL Injection - -**Input**: F-string query construction, Monitor flags injection vulnerability - -**Output**: -```json -{ - "reasoning": "F-string interpolation with user input creates SQL injection. Attacker can input ' OR '1'='1 to bypass auth or '; DROP TABLE to execute commands. Root: didn't understand difference between interpolation and parameterized queries, or assumed sanitization elsewhere. Violates defense-in-depth. Sequential-thinking reveals: developers learn SQL with concatenation (simpler) before parameterized queries (secure). Pattern: NEVER trust user input, ALWAYS use parameterized queries.", - - "error_identification": "get_user() line 2 uses f-string (f\"SELECT * FROM users WHERE username = '{username}'\") with user input. Allows SQL injection. Monitor flagged critical security vulnerability.", - - "root_cause_analysis": "Used string interpolation vs parameterized queries due to: 1) Not understanding SQL injection, 2) Assuming sanitization elsewhere, 3) Copying insecure pattern. Violated Trust Boundary - untrusted input crosses to trusted query without validation. #1 OWASP vulnerability.", - - "correct_approach": "Use parameterized queries:\n\n```python\n# ❌ INCORRECT - SQL injection\nquery = f\"SELECT * FROM users WHERE id = {user_id}\"\n\n# ✅ SECURE - parameterized\nquery = \"SELECT * FROM users WHERE id = ?\"\ncursor.execute(query, (user_id,))\n\n# ✅ SECURE - ORM\nuser = db.query(User).filter(User.id == user_id).first()\n```\n\nParameterized queries separate structure from data, preventing injection. Database handles escaping.", - - "key_insight": "When constructing SQL with user input, NEVER use string interpolation/concatenation. ALWAYS use parameterized queries (?, %s) or ORM. Prevents SQL injection by separating structure from data. Even 'sanitized' input can be exploited.", - - "suggested_new_bullets": [{ - "section": "SECURITY_PATTERNS", - "content": "SQL Injection Prevention: Always use parameterized queries (prepared statements) with user input. NEVER use f-strings, concatenation (+), or format(). Parameterized queries separate SQL structure from data. Even 'sanitized' input can be bypassed. ORMs (SQLAlchemy, Django) use parameterized queries automatically.", - "code_example": "```python\n# ❌ VULNERABLE\nquery = f\"SELECT * FROM users WHERE id = {user_id}\"\n\n# ✅ SECURE\nquery = \"SELECT * FROM users WHERE id = ?\"\ncursor.execute(query, (user_id,))\n```", - "related_to": ["sec-0008"] - }] -} -``` - - - - - -## Performance Success - Caching with High Hit Rate - -**Input**: Redis decorator implementation, Evaluator scores 10/10 performance - -**Output**: -```json -{ - "reasoning": "Achieved 9.0 overall (10/10 performance) via Redis caching decorator. Success factors: 1) Hierarchical keys (user:123:profile) enable pattern invalidation, 2) Decorator pattern = reusable, 3) TTL (1h) balances freshness/performance, 4) JSON serialization handles complex objects. Shows understanding: cache reads not writes, TTL prevents stale data, graceful failures. Follows Redis best practices. Reusable lesson: decorator pattern for cross-cutting concerns - separates caching from business logic (SRP).", - - "error_identification": "No errors. Decorator handles hits (return cached) and misses (fetch, populate). Hierarchical keys (user:{id}:profile) enable smart invalidation. TTL prevents indefinite stale data. 10/10 performance score.", - - "root_cause_analysis": "Success from caching fundamentals: 1) Cache read path not writes (writes invalidate), 2) TTL as safety net (prevents stale if invalidation fails), 3) Hierarchical keys enable partial clearing (user:123:*), 4) Decorator promotes reusability. Likely profiled first (identifying hot path) vs speculative caching. Data-driven approach = high hit rate.", - - "correct_approach": "Profile-first caching with decorator:\n\n```python\nimport redis\nimport json\nfrom functools import wraps\n\ndef cache_query(key_pattern, ttl=3600):\n def decorator(func):\n @wraps(func)\n def wrapper(*args, **kwargs):\n cache_key = key_pattern.format(*args, **kwargs)\n cached = redis_client.get(cache_key)\n if cached:\n return json.loads(cached)\n result = func(*args, **kwargs)\n redis_client.setex(cache_key, ttl, json.dumps(result))\n return result\n return wrapper\n return decorator\n\n@cache_query(\"user:{0}:profile\", ttl=3600)\ndef get_user_profile(user_id):\n return db.query(User).get(user_id)\n```\n\nReusable, testable, separates concerns.", - - "key_insight": "When implementing caching, profile first to identify hot paths (80/20 rule). Use decorator pattern for reusability and clean business logic. Design hierarchical cache keys (namespace:entity:id) for targeted invalidation. Include TTL as safety net against stale data.", - - "bullet_updates": [{"bullet_id": "perf-0023", "tag": "helpful", "reason": "Redis caching pattern correctly implemented, 10/10 performance"}], - - "suggested_new_bullets": [{ - "section": "PERFORMANCE_PATTERNS", - "content": "Decorator Pattern for Caching: Use Python decorators for caching as cross-cutting concern, keeping business logic clean. Profile first for hot paths. Hierarchical keys (namespace:entity:id:attribute) enable smart invalidation. Always include TTL to prevent indefinite stale data. Decorators = reusability without duplication.", - "code_example": "```python\ndef cache_query(key_pattern, ttl=3600):\n def decorator(func):\n @wraps(func)\n def wrapper(*args):\n key = key_pattern.format(*args)\n cached = redis_client.get(key)\n if cached: return json.loads(cached)\n result = func(*args)\n redis_client.setex(key, ttl, json.dumps(result))\n return result\n return wrapper\n return decorator\n```", - "related_to": ["perf-0023"] - }] -} -``` - - - - - -## Partial Success - Async Implementation with Race Condition - -**Input**: Async batch processor works in tests, intermittent failures in production. Evaluator: 6.5/10. Monitor: PASS. Predictor: "Medium risk - async complexity" - -**Output**: -```json -{ - "reasoning": "Partial success (6.5/10) reveals subtle async issue. Implementation works in isolated tests but fails under concurrent load. Sequential-thinking analysis: 1) Tests pass individually, 2) Production has concurrent requests, 3) Shared mutable state (self.results dict) accessed without locks, 4) Race condition: multiple coroutines modify dict simultaneously, 5) Root principle: async != thread-safe; shared state needs synchronization. Success factors: correct async/await usage, proper error handling. Failure factor: assumed async execution was sequential.", - - "error_identification": "BatchProcessor.process_items() lines 15-22: self.results[item.id] = result modifies shared dict from multiple coroutines. Works in tests (sequential) but races in production (concurrent). Monitor PASS (no security), Evaluator 6.5/10 (reliability issues).", - - "root_cause_analysis": "5 Whys: 1) Why intermittent? Race condition on shared state. 2) Why race? Multiple coroutines modify self.results simultaneously. 3) Why no lock? Assumed asyncio single-threaded means no concurrency. 4) Why that assumption? Conflated 'single thread' with 'no concurrency' - asyncio IS concurrent via cooperative scheduling. 5) Root principle: 'Shared mutable state requires synchronization regardless of concurrency model.'", - - "correct_approach": "Use asyncio-native synchronization:\n\n```python\nimport asyncio\n\nclass BatchProcessor:\n def __init__(self):\n self.results = {}\n self._lock = asyncio.Lock() # asyncio Lock, not threading\n \n async def process_items(self, items):\n # ❌ INCORRECT - race condition\n # for item in items:\n # result = await self.process_one(item)\n # self.results[item.id] = result # Unsafe!\n \n # ✅ CORRECT - synchronized access\n async def safe_process(item):\n result = await self.process_one(item)\n async with self._lock:\n self.results[item.id] = result\n return result\n \n return await asyncio.gather(*[safe_process(i) for i in items])\n```\n\nPrefer returning values over mutating shared state.", - - "key_insight": "When using asyncio with shared mutable state, ALWAYS use asyncio.Lock for synchronization. Asyncio is single-threaded but concurrent - race conditions occur at await points. Better pattern: design to return values rather than mutate shared state.", - - "contradiction_resolved": "BatchProcessor must aggregate results from concurrent coroutines AND NOT corrupt shared state via interleaved writes, where naive trade-off fails because dropping concurrency loses throughput while shared-state mutation without synchronization loses correctness.", - - "triz_principle": [24, 13], - - "bullet_updates": [ - {"bullet_id": "async-0023", "tag": "helpful", "reason": "Pattern correctly identified async concurrency risk, referenced for context"} - ], - - "suggested_new_bullets": [ - { - "section": "IMPLEMENTATION_PATTERNS", - "content": "Asyncio Shared State: asyncio is single-threaded but concurrent via cooperative scheduling. Race conditions occur when multiple coroutines modify shared state between await points. Use asyncio.Lock (not threading.Lock) for synchronization, or better, design functions to return values instead of mutating shared state. Common trap: assuming 'single thread' means 'no concurrency issues.'", - "code_example": "```python\n# ❌ RACE CONDITION\nself.results[id] = await process(item)\n\n# ✅ SYNCHRONIZED\nasync with self._lock:\n self.results[id] = await process(item)\n\n# ✅ BEST - No shared state\nreturn await asyncio.gather(*[process(i) for i in items])\n```", - "related_to": ["async-0023"] - } - ] -} -``` - -**Why This Example Matters**: Demonstrates multi-signal reconciliation (Monitor PASS + Evaluator partial), complex root cause requiring sequential-thinking, updating existing bullet while creating new one, and success+failure pattern extraction from single case. - - - - - -## Success - No New Bullet Needed (Patterns Validated) - -**Input**: Standard REST endpoint implementation, all validations pass, Evaluator: 9.0/10 - -**Output**: -```json -{ - "reasoning": "Successful REST implementation following established patterns. Actor correctly applied standard patterns for input validation, error responses, async handling, and authentication - no novel learning. Success validates existing pattern completeness for standard REST patterns.", - - "error_identification": "No errors. Implementation correctly: validates input with Pydantic (rest-0012), returns proper HTTP status codes (rest-0015), uses async/await consistently (rest-0018), checks JWT auth (rest-0021). All existing patterns applied correctly.", - - "root_cause_analysis": "Success root cause: Actor followed established REST patterns. Standard patterns provided comprehensive guidance. No novel decisions required - standard CRUD operation. This validates pattern coverage, not new learning opportunity.", - - "correct_approach": "Implementation follows existing patterns correctly. No correction needed.\n\n```python\n# Actor's implementation (correct)\n@router.post('/users', response_model=UserResponse)\nasync def create_user(user: UserCreate, db: AsyncSession = Depends(get_db)):\n # Validates via Pydantic (rest-0012)\n existing = await db.execute(select(User).where(User.email == user.email))\n if existing.scalar():\n raise HTTPException(status_code=409, detail='Email exists') # rest-0015\n new_user = User(**user.dict())\n db.add(new_user)\n await db.commit() # rest-0018\n return new_user\n```", - - "key_insight": "When existing patterns comprehensively cover a use case, successful application validates coverage rather than generating new patterns. Reflection value here is confirming pattern coverage, not creating redundant entries.", - - "bullet_updates": [ - {"bullet_id": "rest-0012", "tag": "helpful", "reason": "Pydantic validation pattern correctly applied"}, - {"bullet_id": "rest-0015", "tag": "helpful", "reason": "HTTP status code pattern correctly applied"}, - {"bullet_id": "rest-0018", "tag": "helpful", "reason": "Async pattern correctly applied"} - ], - - "suggested_new_bullets": [] -} -``` - -**Why This Example Matters**: Shows correct behavior when NO new bullet is needed - validates deduplication logic and demonstrates that empty suggested_new_bullets is valid output when patterns already exist. - - - # CONSTRAINTS diff --git a/src/mapify_cli/templates_src/agents/synthesizer.md.jinja b/src/mapify_cli/templates_src/agents/synthesizer.md.jinja index 62772d28..8a57fcbe 100644 --- a/src/mapify_cli/templates_src/agents/synthesizer.md.jinja +++ b/src/mapify_cli/templates_src/agents/synthesizer.md.jinja @@ -134,7 +134,6 @@ class SpecificationContract: performance_constraints: PerformanceConstraints | None = None security_requirements: list[str] | None = None - @dataclass class TypeConstraints: """Structured type constraints.""" @@ -142,7 +141,6 @@ class TypeConstraints: output_type: str # "ProcessResult" generic_params: list[str] | None = None # ["T", "U"] - @dataclass class SideEffectsPolicy: """Side effects policy with explicit allowed/forbidden.""" @@ -151,7 +149,6 @@ class SideEffectsPolicy: filesystem: Literal["allowed", "forbidden"] = "forbidden" database: Literal["allowed", "forbidden"] = "forbidden" - @dataclass class PerformanceConstraints: """Performance constraints.""" @@ -223,7 +220,6 @@ class MonitorAnalysis: # For synthesis recommended_as_base: bool # True if good as spine - @dataclass class CompatibilityFeatures: """Features used by orchestrator for deterministic compatibility scoring.""" @@ -246,7 +242,6 @@ class RetryContext: failed_decisions: list[str] # Decision IDs likely causing issues strategy_adjustments: list[str] # What to change in next attempt - @dataclass class ToolError: """Error from a validation tool.""" @@ -279,7 +274,6 @@ def is_variant_viable(m: MonitorAnalysis, specification_contract) -> bool: return getattr(m, "spec_contract_compliant", False) - viable_variants = [ (v, m) for v, m in zip(variants, monitor_results) if is_variant_viable(m, specification_contract) @@ -485,7 +479,6 @@ def resolve_conflict( winner = by_variant_score[0] return winner, f"From higher-scored variant: {winner.source_variant}" - def violates_contract(decision: Decision, contract: SpecificationContract) -> bool: """Check if decision violates contract constraints.""" # Check prohibited patterns @@ -506,7 +499,6 @@ def violates_contract(decision: Decision, contract: SpecificationContract) -> bo return False - # Apply resolution to all conflicts conflict_resolutions = [] for conflict_pair in all_conflicts: @@ -549,7 +541,6 @@ else: strategy = "fresh_generation" base_variant = None - def select_best_base( variants: list, monitor_results: list[MonitorAnalysis], @@ -596,31 +587,6 @@ def select_best_base( 5. Validate against contract constraints ``` -**Example**: -```python -# Base variant (v3) code: -def process_data(items): - results = [] - for item in items: - results.append(transform(item)) - return results - -# After applying decisions: -# - dec-001 (v1): "Use list comprehension for performance" -# - dec-005 (v2): "Add input validation" -# - dec-007 (v1): "Add type hints" - -def process_data(items: List[Item]) -> List[Result]: - """Process items with validation and transformation.""" - # Decision dec-005: Add input validation - if not items: - raise ValueError("Items list cannot be empty") - - # Decision dec-001: Use list comprehension for performance - # Decision dec-007: Add type hints (applied above) - return [transform(item) for item in items] -``` - #### Strategy: fresh_generation (compatibility < 0.7) ``` @@ -641,65 +607,6 @@ def process_data(items: List[Item]) -> List[Result]: 6. Validate against contract constraints ``` -**Example**: -```python -# Fresh generation from contract + decisions -# Contract: function_signature="def process(data: List[User]) -> ProcessResult" -# error_model="Result" -# concurrency_model="sync" -# Accepted decisions: -# - dec-002 (v1): "Return Result type for explicit error handling" -# - dec-003 (v2): "Validate all User fields before processing" -# - dec-009 (v3): "Log processing metrics for observability" - -from dataclasses import dataclass -from typing import List -import logging - -@dataclass -class ProcessResult: - """Result of processing operation.""" - success: bool - processed_count: int - error: str | None = None - -def process(data: List[User]) -> ProcessResult: - """ - Process user data with validation and observability. - - Implements: - - Decision dec-002: Result type for explicit error handling - - Decision dec-003: Validate all User fields - - Decision dec-009: Log processing metrics - """ - logger = logging.getLogger(__name__) - - # Decision dec-003: Validate all User fields before processing - try: - for user in data: - if not user.email or not user.name: - return ProcessResult( - success=False, - processed_count=0, - error=f"Invalid user: {user.id}" - ) - except Exception as e: - logger.error(f"Validation failed: {e}") - return ProcessResult(success=False, processed_count=0, error=str(e)) - - # Process validated data - processed = 0 - for user in data: - # ... processing logic ... - processed += 1 - - # Decision dec-009: Log processing metrics for observability - logger.info(f"Processed {processed} users successfully") - - # Decision dec-002: Return Result type - return ProcessResult(success=True, processed_count=processed) -``` - **Critical Rules for Code Generation**: 1. **NEVER copy code blocks directly** - always rewrite for coherence @@ -764,7 +671,6 @@ def validate_coherence(code: str, decisions: list[Decision], contract: Specifica return len(issues) == 0, issues - is_valid, validation_issues = validate_coherence( generated_code, [d for d in all_decisions if d.status == "accepted"], @@ -800,7 +706,6 @@ class SynthesizerOutput: conflict_resolutions: list[ConflictResolution] confidence: float # 0.0-1.0 - @dataclass class ConflictResolution: """Record of how a conflict was resolved.""" diff --git a/src/mapify_cli/templates_src/agents/task-decomposer.md.jinja b/src/mapify_cli/templates_src/agents/task-decomposer.md.jinja index 2f356d65..127be45d 100644 --- a/src/mapify_cli/templates_src/agents/task-decomposer.md.jinja +++ b/src/mapify_cli/templates_src/agents/task-decomposer.md.jinja @@ -914,157 +914,6 @@ For detailed examples and anti-patterns, see: `.claude/references/decomposition- -## REFERENCE EXAMPLES - -### Example A: Simple CRUD Feature - -**Goal**: "Add ability to archive projects" - -**Why this decomposition works**: Single domain, clear boundaries, well-known pattern - -**Full JSON Output**: -```json -{ - "schema_version": "2.0", - "analysis": { - "assumptions": ["Project model exists with standard CRUD operations"], - "open_questions": [], - "scope_vs_quality_decision": "Full feature scope implemented with non-negotiable quality standards. No scope reductions needed for this standard CRUD extension.", - "architecture_graph_summary": "Project -[add_field]-> archived_at; ProjectService -[calls]-> Project.update(); api/routes/projects.py -[uses]-> ProjectService; GET /projects -[filters_by]-> archived_at" - }, - "blueprint": { - "id": "project-archive", - "summary": "Add soft-delete archiving to projects via archived_at timestamp field with API endpoints and filtered listings", - "quality_requirements": { - "min_security_score": 7, - "min_functionality_score": 7, - "error_handling_required": true, - "rationale": "Standard CRUD operations require robust error handling and data validation" - }, - "subtasks": [ - { - "id": "ST-001", - "title": "Add archived_at field to Project model", - "description": "Add nullable DateTime 'archived_at' to Project model in models/project.py. Generate migration. null = active, non-null = archived.", - "dependencies": [], - "risk_level": "low", - "risks": [], - "security_critical": false, - "complexity_score": 3, - "complexity_rationale": "Score 3: Base(1) + Novelty(+0) + Deps(+0) + Scope(+2) + Risk(+0) = 3", - "aag_contract": "ProjectModel -> add_field(archived_at: DateTime?) -> migration passes, existing queries unaffected", - "validation_criteria": [ - "VC1 [AC-1]: Project model has archived_at field (nullable DateTime)", - "VC2 [INV-1]: Migration runs without errors on existing data", - "VC3 [INV-1]: SELECT count(*) FROM projects WHERE archived_at IS NOT NULL returns 0" - ], - "test_strategy": { - "unit": "Test field accepts timestamps, test default is null", - "integration": "Test migration applies cleanly", - "e2e": "N/A", - "scenario_dimensions": { - "happy_path": "Test archived_at stores valid timestamp", - "error": "Test migration rollback on failure", - "edge_case": "Test field with existing null values in table", - "security": "N/A" - } - }, - "affected_files": [ - "models/project.py", - "migrations/versions/add_archived_at_to_projects.py" - ] - }, - { - "id": "ST-002", - "title": "Add archive_project() and unarchive_project() to ProjectService", - "description": "Add methods to services/project_service.py. archive_project(id) sets archived_at=now(), unarchive_project(id) sets archived_at=null.", - "dependencies": ["ST-001"], - "risk_level": "low", - "risks": [], - "security_critical": false, - "complexity_score": 3, - "complexity_rationale": "Score 3: Base(1) + Novelty(+0) + Deps(+1) + Scope(+1) + Risk(+0) = 3", - "aag_contract": "ProjectService -> archive_project(id) + unarchive_project(id) -> sets/clears archived_at, raises ProjectNotFoundError for invalid IDs", - "validation_criteria": [ - "VC1 [AC-2]: archive_project(valid_id) sets archived_at to current UTC timestamp", - "VC2 [AC-2]: unarchive_project(valid_id) sets archived_at to null", - "VC3 [AC-2]: Both raise ProjectNotFoundError for invalid IDs" - ], - "test_strategy": { - "unit": "Test archive sets timestamp, test unarchive clears it, test invalid ID handling", - "integration": "Test database persistence", - "e2e": "N/A" - }, - "affected_files": [ - "services/project_service.py" - ] - }, - { - "id": "ST-003", - "title": "Add POST /projects/{id}/archive and /unarchive endpoints", - "description": "Create endpoints in api/routes/projects.py. Require project owner permission. Return updated project JSON.", - "dependencies": ["ST-002"], - "risk_level": "low", - "risks": [], - "security_critical": false, - "complexity_score": 4, - "complexity_rationale": "Score 4: Base(1) + Novelty(+0) + Deps(+1) + Scope(+2) + Risk(+0) = 4", - "aag_contract": "ProjectRoutes -> POST /projects/{id}/archive|unarchive -> 200+JSON for owner, 403 for non-owner, 404 for invalid ID", - "validation_criteria": [ - "VC1 [AC-3]: POST /projects/{id}/archive returns 200 + archived project JSON", - "VC2 [AC-3]: POST /projects/{id}/unarchive returns 200 + active project JSON", - "VC3 [SEC-1]: Non-owner receives 403 Forbidden", - "VC4 [AC-3]: Invalid ID returns 404 Not Found" - ], - "contracts": [ - {"type": "postcondition", "assertion": "response.status == 200 AND project.archived_at IS SET WHEN valid_owner", "scope": "endpoint"}, - {"type": "postcondition", "assertion": "response.status == 403 WHEN NOT project.owner_id == request.user_id", "scope": "endpoint"}, - {"type": "postcondition", "assertion": "response.status == 404 WHEN project NOT EXISTS", "scope": "endpoint"} - ], - "implementation_hint": "Use existing @require_project_owner decorator", - "test_strategy": { - "unit": "Test request validation, test permission decorator", - "integration": "Test service integration, test response format", - "e2e": "Full flow: auth → archive → verify response → verify DB" - }, - "affected_files": [ - "api/routes/projects.py", - "api/schemas/project.py" - ] - }, - { - "id": "ST-004", - "title": "Filter archived projects from GET /projects by default", - "description": "Modify listing in api/routes/projects.py to exclude archived_at IS NOT NULL. Add ?include_archived=true param.", - "dependencies": ["ST-001"], - "risk_level": "low", - "risks": [], - "security_critical": false, - "complexity_score": 3, - "complexity_rationale": "Score 3: Base(1) + Novelty(+0) + Deps(+1) + Scope(+1) + Risk(+0) = 3", - "aag_contract": "ProjectRoutes -> GET /projects(?include_archived=bool) -> excludes archived by default, includes when param=true", - "validation_criteria": [ - "VC1 [AC-4]: GET /projects excludes archived projects by default", - "VC2 [AC-4]: GET /projects?include_archived=true returns all projects", - "VC3 [AC-4]: Response includes is_archived boolean field" - ], - "test_strategy": { - "unit": "Test filter logic, test query param parsing", - "integration": "Test with mix of archived/active projects", - "e2e": "N/A" - }, - "affected_files": [ - "api/routes/projects.py", - "services/project_service.py" - ] - } - ] - } -} -``` - ---- - ## Additional Examples For complex decomposition scenarios, see: `.claude/references/decomposition-examples.md` diff --git a/src/mapify_cli/templates_src/skills/map-debug/SKILL.md.jinja b/src/mapify_cli/templates_src/skills/map-debug/SKILL.md.jinja index f3d05cb3..f733ad8b 100644 --- a/src/mapify_cli/templates_src/skills/map-debug/SKILL.md.jinja +++ b/src/mapify_cli/templates_src/skills/map-debug/SKILL.md.jinja @@ -1,7 +1,7 @@ --- name: map-debug -description: | - Structured MAP debugging via task-decomposer, actor, and monitor agents. Use when reproducing a bug, isolating a regression, or diagnosing an error with specialized agents. Do NOT use for greenfield features; use map-plan or map-efficient. +description: |- + Structured MAP debugging via task-decomposer, actor, and monitor agents. Use when reproducing a bug, isolating a regression, or diagnosing an error with specialized agents — including failing or flaky tests (pytest AssertionError), crashes and segmentation faults, memory-corruption or memory errors in native/C extensions, intermittent or load-dependent failures (e.g. 500s under load), data-corruption bugs that only appear in production, scripts or hooks that silently exit or produce no output, and any "find the root cause" / "walk me through diagnosing" / "help me investigate" request. Trigger on phrasing like "failing test", "mysterious error", "segfault", "memory error", "intermittently fails", "root cause", "isolate the cause", "debug why", "diagnose", or "investigate the error". Prefer this over generic investigation when the user wants a systematic decompose-reproduce-fix-verify workflow. Do NOT use for greenfield features; use map-plan or map-efficient. effort: medium disable-model-invocation: true argument-hint: "[bug description]" diff --git a/tests/skills_eval/fixtures/whole_skill/map_task_calc/hidden/test_calc_full.py b/tests/skills_eval/fixtures/whole_skill/map_task_calc/hidden/test_calc_full.py new file mode 100644 index 00000000..0c313b11 --- /dev/null +++ b/tests/skills_eval/fixtures/whole_skill/map_task_calc/hidden/test_calc_full.py @@ -0,0 +1,64 @@ +"""HIDDEN comprehensive suite — the workflow never sees this file. + +Injected by the harness AFTER the run to score the produced evaluator against the +classic precedence/associativity traps that violate left-to-right intuition. +Expected values follow Python operator semantics. +""" +import pytest + +from src.calc import evaluate + + +def test_precedence_mul_over_add(): + assert evaluate("2+3*4") == 14 + + +def test_exponent_is_right_associative(): + # 2**(3**2) = 2**9 = 512, NOT (2**3)**2 = 64 + assert evaluate("2**3**2") == 512 + + +def test_exponent_binds_tighter_than_unary_minus(): + # Python: -2**2 == -(2**2) == -4, NOT (-2)**2 == 4 + assert evaluate("-2**2") == -4 + + +def test_parenthesized_negation_then_power(): + assert evaluate("(-2)**2") == 4 + + +def test_unary_minus_after_binary_op(): + assert evaluate("2*-3") == -6 + assert evaluate("3+-2") == 1 + + +def test_double_unary_minus(): + assert evaluate("--2") == 2 + + +def test_negative_exponent(): + # 2**(-1) == 0.5 + assert evaluate("2**-1") == 0.5 + + +def test_nested_parentheses(): + assert evaluate("((1+2)*(3+4))") == 21 + + +def test_true_division_is_float(): + assert evaluate("10/4") == 2.5 + + +def test_whitespace_tolerant(): + assert evaluate(" 2 + 3 ") == 5 + + +def test_division_by_zero_raises(): + with pytest.raises(ZeroDivisionError): + evaluate("1/0") + + +def test_malformed_raises_value_error(): + for bad in ("2+", "(1+2", "", "1 2"): + with pytest.raises(ValueError): + evaluate(bad) diff --git a/tests/skills_eval/fixtures/whole_skill/map_task_calc/manifest.json b/tests/skills_eval/fixtures/whole_skill/map_task_calc/manifest.json new file mode 100644 index 00000000..d0025e99 --- /dev/null +++ b/tests/skills_eval/fixtures/whole_skill/map_task_calc/manifest.json @@ -0,0 +1,15 @@ +{ + "fixture": "map_task_calc", + "skill": "map-task", + "invocation": "/map-task ST-001", + "branch": "main", + "subtask_id": "ST-001", + "allowed_files": ["src/calc.py"], + "trap_files": [], + "test_cmd": "python -m pytest tests/test_calc_basic.py -q", + "hidden_test_src": "hidden/test_calc_full.py", + "hidden_test_dest": "tests/test_calc_full.py", + "hidden_test_cmd": "python -m pytest tests/test_calc_full.py -q", + "expected_outcome": "complete", + "notes": "CAPABILITY-DISCRIMINATING fixture (web-research-chosen). An arithmetic expression evaluator is the canonical task where weaker models slip on edge cases that violate left-to-right intuition: right-associative ** (2**3**2==512), ** binding tighter than unary minus (-2**2==-4 in Python), overloaded unary/binary minus, negative exponents, and the error contract. Weak gate = 3 trivial expressions (a naive left-assoc evaluator passes); hidden 13-case suite probes the traps. If hidden_pass varies by actor model, this LOCATES the execution-model threshold the semver fixtures were too easy to reach." +} diff --git a/tests/skills_eval/fixtures/whole_skill/map_task_calc/repo/.map/main/blueprint.json b/tests/skills_eval/fixtures/whole_skill/map_task_calc/repo/.map/main/blueprint.json new file mode 100644 index 00000000..2762172f --- /dev/null +++ b/tests/skills_eval/fixtures/whole_skill/map_task_calc/repo/.map/main/blueprint.json @@ -0,0 +1,17 @@ +{ + "subtasks": [ + { + "id": "ST-001", + "title": "Implement arithmetic expression evaluator (Python operator semantics)", + "dependencies": [], + "affected_files": ["src/calc.py"], + "complexity": "medium", + "risk": "low", + "validation_criteria": [ + "all of tests/test_calc_basic.py passes" + ], + "test_strategy": "unit", + "aag_contract": "evaluate(expr: str) -> float parses and evaluates an arithmetic expression over integer/float literals with operators + - * / ** , unary + and -, and parentheses, using PYTHON operator semantics. Precedence highest to lowest: parentheses; ** (RIGHT-associative); unary + / - ; * and / (left-associative); + and - (left-associative). Note ** binds tighter than unary minus (as in Python). / is true (float) division. Raise ZeroDivisionError on division by zero. Raise ValueError on malformed input (trailing operator, unbalanced parentheses, empty string, adjacent operands). Whitespace between tokens is ignored." + } + ] +} diff --git a/tests/skills_eval/fixtures/whole_skill/map_task_calc/repo/.map/main/task_plan_main.md b/tests/skills_eval/fixtures/whole_skill/map_task_calc/repo/.map/main/task_plan_main.md new file mode 100644 index 00000000..60026e3e --- /dev/null +++ b/tests/skills_eval/fixtures/whole_skill/map_task_calc/repo/.map/main/task_plan_main.md @@ -0,0 +1,14 @@ +# Task Plan (branch: main) + +## Subtasks + +### ST-001 — Implement arithmetic evaluate() + +Implement `evaluate(expr)` in `src/calc.py` per the full operator-semantics +contract (see blueprint aag_contract). The shipped gate +`tests/test_calc_basic.py` checks only trivial expressions; a correct +implementation must still honour precedence, right-associative `**`, `**` +binding tighter than unary minus, float division, and the error contract. + +- affected_files: `src/calc.py` +- validation: `python -m pytest tests/test_calc_basic.py -q` passes diff --git a/tests/skills_eval/fixtures/whole_skill/map_task_calc/repo/src/__init__.py b/tests/skills_eval/fixtures/whole_skill/map_task_calc/repo/src/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/skills_eval/fixtures/whole_skill/map_task_calc/repo/src/calc.py b/tests/skills_eval/fixtures/whole_skill/map_task_calc/repo/src/calc.py new file mode 100644 index 00000000..61283812 --- /dev/null +++ b/tests/skills_eval/fixtures/whole_skill/map_task_calc/repo/src/calc.py @@ -0,0 +1,14 @@ +"""Arithmetic expression evaluator (Python operator semantics). + +ST-001: implement ``evaluate`` per the contract. The shipped gate +(tests/test_calc_basic.py) checks only trivial expressions — a correct +implementation must still honour the FULL precedence/associativity rules, +especially right-associative ``**`` and ``**`` binding tighter than unary minus. +""" +from __future__ import annotations + + +def evaluate(expr: str) -> float: + """Evaluate an arithmetic expression and return its numeric value.""" + del expr # stub: ST-001 must implement + raise NotImplementedError diff --git a/tests/skills_eval/fixtures/whole_skill/map_task_calc/repo/tests/test_calc_basic.py b/tests/skills_eval/fixtures/whole_skill/map_task_calc/repo/tests/test_calc_basic.py new file mode 100644 index 00000000..3a4f2e9b --- /dev/null +++ b/tests/skills_eval/fixtures/whole_skill/map_task_calc/repo/tests/test_calc_basic.py @@ -0,0 +1,20 @@ +"""WEAK test gate — only trivial expressions. The workflow runs ONLY this file. + +A naive left-to-right / left-associative evaluator passes all of these. The +precedence/associativity edge cases are NOT enforced here — they are measured +post-run by the hidden suite (hidden/test_calc_full.py). +""" +from src.calc import evaluate + + +def test_addition(): + assert evaluate("2+2") == 4 + + +def test_mul_then_add_with_parens(): + assert evaluate("2*3+1") == 7 + assert evaluate("(1+2)*3") == 9 + + +def test_returns_number(): + assert evaluate("1+1") in (2, 2.0) diff --git a/tests/skills_eval/fixtures/whole_skill/map_task_calc_vague/hidden/test_calc_full.py b/tests/skills_eval/fixtures/whole_skill/map_task_calc_vague/hidden/test_calc_full.py new file mode 100644 index 00000000..6bb301eb --- /dev/null +++ b/tests/skills_eval/fixtures/whole_skill/map_task_calc_vague/hidden/test_calc_full.py @@ -0,0 +1,59 @@ +"""HIDDEN suite (vague-contract variant) — the workflow never sees this file. + +Drops the language-AMBIGUOUS -2**2 case; keeps only UNAMBIGUOUS-but-hard cases. +The contract does NOT spell out associativity, so passing right-associative ** +(2**3**2 == 512, standard math) requires the model to KNOW it — the model- +competence probe. Expected values are unambiguous across mainstream languages. +""" +import pytest + +from src.calc import evaluate + + +def test_precedence_mul_over_add(): + assert evaluate("2+3*4") == 14 + + +def test_exponent_is_right_associative(): + # standard math: 2**(3**2) = 512, NOT (2**3)**2 = 64 + assert evaluate("2**3**2") == 512 + + +def test_parenthesized_negation_then_power(): + assert evaluate("(-2)**2") == 4 + + +def test_unary_minus_after_binary_op(): + assert evaluate("2*-3") == -6 + assert evaluate("3+-2") == 1 + + +def test_double_unary_minus(): + assert evaluate("--2") == 2 + + +def test_negative_exponent(): + assert evaluate("2**-1") == 0.5 + + +def test_nested_parentheses(): + assert evaluate("((1+2)*(3+4))") == 21 + + +def test_true_division_is_float(): + assert evaluate("10/4") == 2.5 + + +def test_whitespace_tolerant(): + assert evaluate(" 2 + 3 ") == 5 + + +def test_division_by_zero_raises(): + with pytest.raises(ZeroDivisionError): + evaluate("1/0") + + +def test_malformed_raises_value_error(): + for bad in ("2+", "(1+2", "", "1 2"): + with pytest.raises(ValueError): + evaluate(bad) diff --git a/tests/skills_eval/fixtures/whole_skill/map_task_calc_vague/manifest.json b/tests/skills_eval/fixtures/whole_skill/map_task_calc_vague/manifest.json new file mode 100644 index 00000000..84c5994c --- /dev/null +++ b/tests/skills_eval/fixtures/whole_skill/map_task_calc_vague/manifest.json @@ -0,0 +1,15 @@ +{ + "fixture": "map_task_calc_vague", + "skill": "map-task", + "invocation": "/map-task ST-001", + "branch": "main", + "subtask_id": "ST-001", + "allowed_files": ["src/calc.py"], + "trap_files": [], + "test_cmd": "python -m pytest tests/test_calc_basic.py -q", + "hidden_test_src": "hidden/test_calc_full.py", + "hidden_test_dest": "tests/test_calc_full.py", + "hidden_test_cmd": "python -m pytest tests/test_calc_full.py -q", + "expected_outcome": "complete", + "notes": "VAGUE-contract variant of map_task_calc: the contract does NOT spell out associativity (only 'standard precedence'), so passing right-associative ** (2**3**2==512) requires the model to KNOW it from training, not read it from the spec. Hidden suite keeps only language-UNAMBIGUOUS cases (drops -2**2). Strongest probe for the execution-model competence threshold: if haiku produces left-associative ** (fails 2**3**2) while sonnet/opus get it right, the model lever is finally located." +} diff --git a/tests/skills_eval/fixtures/whole_skill/map_task_calc_vague/repo/.map/main/blueprint.json b/tests/skills_eval/fixtures/whole_skill/map_task_calc_vague/repo/.map/main/blueprint.json new file mode 100644 index 00000000..93c60c98 --- /dev/null +++ b/tests/skills_eval/fixtures/whole_skill/map_task_calc_vague/repo/.map/main/blueprint.json @@ -0,0 +1,17 @@ +{ + "subtasks": [ + { + "id": "ST-001", + "title": "Implement arithmetic expression evaluator", + "dependencies": [], + "affected_files": ["src/calc.py"], + "complexity": "medium", + "risk": "low", + "validation_criteria": [ + "all of tests/test_calc_basic.py passes" + ], + "test_strategy": "unit", + "aag_contract": "evaluate(expr: str) -> float evaluates an arithmetic expression with operators + - * / ** , unary minus/plus, and parentheses, using standard mathematical precedence and Python numeric semantics. / is float division. Raise ZeroDivisionError on division by zero and ValueError on malformed input." + } + ] +} diff --git a/tests/skills_eval/fixtures/whole_skill/map_task_calc_vague/repo/.map/main/task_plan_main.md b/tests/skills_eval/fixtures/whole_skill/map_task_calc_vague/repo/.map/main/task_plan_main.md new file mode 100644 index 00000000..7577a1cc --- /dev/null +++ b/tests/skills_eval/fixtures/whole_skill/map_task_calc_vague/repo/.map/main/task_plan_main.md @@ -0,0 +1,11 @@ +# Task Plan (branch: main) + +## Subtasks + +### ST-001 — Implement arithmetic evaluate() + +Implement `evaluate(expr)` in `src/calc.py` so it correctly evaluates arithmetic +expressions with standard precedence. Make `tests/test_calc_basic.py` pass. + +- affected_files: `src/calc.py` +- validation: `python -m pytest tests/test_calc_basic.py -q` passes diff --git a/tests/skills_eval/fixtures/whole_skill/map_task_calc_vague/repo/src/__init__.py b/tests/skills_eval/fixtures/whole_skill/map_task_calc_vague/repo/src/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/skills_eval/fixtures/whole_skill/map_task_calc_vague/repo/src/calc.py b/tests/skills_eval/fixtures/whole_skill/map_task_calc_vague/repo/src/calc.py new file mode 100644 index 00000000..61283812 --- /dev/null +++ b/tests/skills_eval/fixtures/whole_skill/map_task_calc_vague/repo/src/calc.py @@ -0,0 +1,14 @@ +"""Arithmetic expression evaluator (Python operator semantics). + +ST-001: implement ``evaluate`` per the contract. The shipped gate +(tests/test_calc_basic.py) checks only trivial expressions — a correct +implementation must still honour the FULL precedence/associativity rules, +especially right-associative ``**`` and ``**`` binding tighter than unary minus. +""" +from __future__ import annotations + + +def evaluate(expr: str) -> float: + """Evaluate an arithmetic expression and return its numeric value.""" + del expr # stub: ST-001 must implement + raise NotImplementedError diff --git a/tests/skills_eval/fixtures/whole_skill/map_task_calc_vague/repo/tests/test_calc_basic.py b/tests/skills_eval/fixtures/whole_skill/map_task_calc_vague/repo/tests/test_calc_basic.py new file mode 100644 index 00000000..3a4f2e9b --- /dev/null +++ b/tests/skills_eval/fixtures/whole_skill/map_task_calc_vague/repo/tests/test_calc_basic.py @@ -0,0 +1,20 @@ +"""WEAK test gate — only trivial expressions. The workflow runs ONLY this file. + +A naive left-to-right / left-associative evaluator passes all of these. The +precedence/associativity edge cases are NOT enforced here — they are measured +post-run by the hidden suite (hidden/test_calc_full.py). +""" +from src.calc import evaluate + + +def test_addition(): + assert evaluate("2+2") == 4 + + +def test_mul_then_add_with_parens(): + assert evaluate("2*3+1") == 7 + assert evaluate("(1+2)*3") == 9 + + +def test_returns_number(): + assert evaluate("1+1") in (2, 2.0) diff --git a/tests/skills_eval/fixtures/whole_skill/map_task_semver/manifest.json b/tests/skills_eval/fixtures/whole_skill/map_task_semver/manifest.json new file mode 100644 index 00000000..7bc9654e --- /dev/null +++ b/tests/skills_eval/fixtures/whole_skill/map_task_semver/manifest.json @@ -0,0 +1,16 @@ +{ + "fixture": "map_task_semver", + "skill": "map-task", + "invocation": "/map-task ST-001", + "branch": "main", + "subtask_id": "ST-001", + "allowed_files": ["src/semver.py"], + "trap_files": [], + "test_cmd": "python -m pytest tests/test_semver.py -q", + "expected_outcome": "complete", + "expected": { + "plan_status": "complete", + "tests_pass": true + }, + "notes": "DISCRIMINATING outcome fixture (unlike the scope/blocker traps which all score QUALITY=1.0). The code task is genuinely non-trivial: semver 2.0.0 compare() with pre-release precedence, numeric-vs-lexical identifier comparison, and build-metadata handling. A naive/weak implementation passes the easy tests but fails the edge cases, so task_pass (the test gate) and QUALITY VARY with the execution lever (model tier / agent prompt). No scope trap — this fixture isolates CODE-QUALITY, not scope discipline. Judge dimension used is scope_discipline (complete path); the binding signal here is the deterministic test gate." +} diff --git a/tests/skills_eval/fixtures/whole_skill/map_task_semver/repo/.map/main/blueprint.json b/tests/skills_eval/fixtures/whole_skill/map_task_semver/repo/.map/main/blueprint.json new file mode 100644 index 00000000..cf48594a --- /dev/null +++ b/tests/skills_eval/fixtures/whole_skill/map_task_semver/repo/.map/main/blueprint.json @@ -0,0 +1,22 @@ +{ + "subtasks": [ + { + "id": "ST-001", + "title": "Implement semver compare() per semver 2.0.0 precedence", + "dependencies": [], + "affected_files": ["src/semver.py"], + "complexity": "medium", + "risk": "low", + "validation_criteria": [ + "tests/test_semver.py::test_numeric_fields_compare_numerically_not_lexically passes", + "tests/test_semver.py::test_prerelease_has_lower_precedence_than_release passes", + "tests/test_semver.py::test_canonical_prerelease_chain passes", + "tests/test_semver.py::test_numeric_identifiers_compare_numerically passes", + "tests/test_semver.py::test_build_metadata_ignored passes", + "all of tests/test_semver.py passes" + ], + "test_strategy": "unit", + "aag_contract": "compare(a: str, b: str) -> int returns -1/0/1 by semver 2.0.0 precedence: numeric major/minor/patch compared numerically; a pre-release version is lower than its release; pre-release identifiers compared field-by-field (numeric numerically, numeric < alphanumeric, alphanumeric by ASCII, more fields > fewer when prefix equal); build metadata ignored." + } + ] +} diff --git a/tests/skills_eval/fixtures/whole_skill/map_task_semver/repo/.map/main/task_plan_main.md b/tests/skills_eval/fixtures/whole_skill/map_task_semver/repo/.map/main/task_plan_main.md new file mode 100644 index 00000000..90a1c606 --- /dev/null +++ b/tests/skills_eval/fixtures/whole_skill/map_task_semver/repo/.map/main/task_plan_main.md @@ -0,0 +1,12 @@ +# Task Plan (branch: main) + +## Subtasks + +### ST-001 — Implement semver compare() + +Implement `compare(a, b)` in `src/semver.py` to satisfy the full contract in +`tests/test_semver.py` (semver 2.0.0 precedence: numeric field comparison, +pre-release < release, pre-release identifier ordering, build-metadata ignored). + +- affected_files: `src/semver.py` +- validation: `python -m pytest tests/test_semver.py -q` passes diff --git a/tests/skills_eval/fixtures/whole_skill/map_task_semver/repo/src/__init__.py b/tests/skills_eval/fixtures/whole_skill/map_task_semver/repo/src/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/skills_eval/fixtures/whole_skill/map_task_semver/repo/src/semver.py b/tests/skills_eval/fixtures/whole_skill/map_task_semver/repo/src/semver.py new file mode 100644 index 00000000..87d15d43 --- /dev/null +++ b/tests/skills_eval/fixtures/whole_skill/map_task_semver/repo/src/semver.py @@ -0,0 +1,16 @@ +"""Semantic-version comparison (semver 2.0.0 precedence). + +ST-001: implement ``compare`` so the full suite in tests/test_semver.py passes. +The precedence rules are non-trivial — read the tests for the exact contract. +""" +from __future__ import annotations + + +def compare(a: str, b: str) -> int: + """Return -1 if a < b, 0 if a and b have equal precedence, 1 if a > b. + + Versions follow ``MAJOR.MINOR.PATCH[-prerelease][+build]`` (semver 2.0.0). + Not yet implemented. + """ + del a, b # stub: ST-001 must implement; params unused until then + raise NotImplementedError diff --git a/tests/skills_eval/fixtures/whole_skill/map_task_semver/repo/tests/test_semver.py b/tests/skills_eval/fixtures/whole_skill/map_task_semver/repo/tests/test_semver.py new file mode 100644 index 00000000..c6f0a04b --- /dev/null +++ b/tests/skills_eval/fixtures/whole_skill/map_task_semver/repo/tests/test_semver.py @@ -0,0 +1,85 @@ +"""Full contract for src/semver.compare (semver 2.0.0 precedence). + +These tests ARE the spec. A naive implementation (string compare, or ignoring +pre-release rules) passes the easy cases but fails the edge cases below. +""" +from src.semver import compare + + +# --- core numeric precedence ------------------------------------------------ +def test_equal_versions(): + assert compare("1.0.0", "1.0.0") == 0 + + +def test_major_minor_patch_ordering(): + assert compare("1.0.0", "2.0.0") == -1 + assert compare("2.1.0", "2.0.9") == 1 + assert compare("1.0.1", "1.0.0") == 1 + + +def test_numeric_fields_compare_numerically_not_lexically(): + # 1.10.0 > 1.9.0 (a naive string/char compare gets this WRONG: "10" < "9") + assert compare("1.10.0", "1.9.0") == 1 + assert compare("1.0.10", "1.0.9") == 1 + assert compare("2.0.0", "10.0.0") == -1 + + +def test_returns_only_minus_one_zero_one(): + for r in (compare("1.2.3", "1.2.4"), compare("1.2.3", "1.2.3"), compare("1.2.4", "1.2.3")): + assert r in (-1, 0, 1) + + +# --- pre-release vs release ------------------------------------------------- +def test_prerelease_has_lower_precedence_than_release(): + # 1.0.0-alpha < 1.0.0 (a naive impl that ignores pre-release gets 0 here) + assert compare("1.0.0-alpha", "1.0.0") == -1 + assert compare("1.0.0", "1.0.0-alpha") == 1 + + +# --- pre-release identifier ordering (the canonical semver example) --------- +def test_canonical_prerelease_chain(): + chain = [ + "1.0.0-alpha", + "1.0.0-alpha.1", + "1.0.0-alpha.beta", + "1.0.0-beta", + "1.0.0-beta.2", + "1.0.0-beta.11", + "1.0.0-rc.1", + "1.0.0", + ] + for lo, hi in zip(chain, chain[1:]): + assert compare(lo, hi) == -1, f"{lo} should be < {hi}" + assert compare(hi, lo) == 1, f"{hi} should be > {lo}" + + +def test_numeric_identifiers_compare_numerically(): + # beta.2 < beta.11 numerically (naive lexical compare gets "11" < "2") + assert compare("1.0.0-beta.2", "1.0.0-beta.11") == -1 + + +def test_numeric_identifier_lower_than_alphanumeric(): + # a field of only digits has lower precedence than one with letters + assert compare("1.0.0-alpha.1", "1.0.0-alpha.beta") == -1 + + +def test_more_identifiers_wins_when_prefix_equal(): + # a larger set of pre-release fields > a smaller set, if all preceding equal + assert compare("1.0.0-alpha", "1.0.0-alpha.1") == -1 + assert compare("1.0.0-alpha.1.1", "1.0.0-alpha.1") == 1 + + +def test_equal_prereleases(): + assert compare("1.0.0-beta.11", "1.0.0-beta.11") == 0 + + +# --- build metadata is ignored for precedence ------------------------------- +def test_build_metadata_ignored(): + assert compare("1.0.0+build.1", "1.0.0+build.2") == 0 + assert compare("1.0.0+meta", "1.0.0") == 0 + assert compare("1.0.0-alpha+x", "1.0.0-alpha+y") == 0 + + +def test_build_metadata_does_not_override_prerelease(): + # pre-release rule still applies even with build metadata present + assert compare("1.0.0-alpha+build", "1.0.0+build") == -1 diff --git a/tests/skills_eval/fixtures/whole_skill/map_task_semver_vague/hidden/test_semver_full.py b/tests/skills_eval/fixtures/whole_skill/map_task_semver_vague/hidden/test_semver_full.py new file mode 100644 index 00000000..499766c6 --- /dev/null +++ b/tests/skills_eval/fixtures/whole_skill/map_task_semver_vague/hidden/test_semver_full.py @@ -0,0 +1,55 @@ +"""HIDDEN comprehensive suite — the workflow never sees this file. + +Injected by the harness AFTER the run to score the produced code against the +FULL semver 2.0.0 contract (the edge cases the weak gate did not enforce). +""" +from src.semver import compare + + +def test_numeric_fields_compare_numerically_not_lexically(): + assert compare("1.10.0", "1.9.0") == 1 + assert compare("1.0.10", "1.0.9") == 1 + assert compare("2.0.0", "10.0.0") == -1 + + +def test_prerelease_has_lower_precedence_than_release(): + assert compare("1.0.0-alpha", "1.0.0") == -1 + assert compare("1.0.0", "1.0.0-alpha") == 1 + + +def test_canonical_prerelease_chain(): + chain = [ + "1.0.0-alpha", + "1.0.0-alpha.1", + "1.0.0-alpha.beta", + "1.0.0-beta", + "1.0.0-beta.2", + "1.0.0-beta.11", + "1.0.0-rc.1", + "1.0.0", + ] + for lo, hi in zip(chain, chain[1:]): + assert compare(lo, hi) == -1, f"{lo} should be < {hi}" + assert compare(hi, lo) == 1, f"{hi} should be > {lo}" + + +def test_numeric_identifiers_compare_numerically(): + assert compare("1.0.0-beta.2", "1.0.0-beta.11") == -1 + + +def test_numeric_identifier_lower_than_alphanumeric(): + assert compare("1.0.0-alpha.1", "1.0.0-alpha.beta") == -1 + + +def test_more_identifiers_wins_when_prefix_equal(): + assert compare("1.0.0-alpha", "1.0.0-alpha.1") == -1 + assert compare("1.0.0-alpha.1.1", "1.0.0-alpha.1") == 1 + + +def test_build_metadata_ignored(): + assert compare("1.0.0+build.1", "1.0.0+build.2") == 0 + assert compare("1.0.0+meta", "1.0.0") == 0 + + +def test_build_metadata_does_not_override_prerelease(): + assert compare("1.0.0-alpha+build", "1.0.0+build") == -1 diff --git a/tests/skills_eval/fixtures/whole_skill/map_task_semver_vague/manifest.json b/tests/skills_eval/fixtures/whole_skill/map_task_semver_vague/manifest.json new file mode 100644 index 00000000..f0a51c84 --- /dev/null +++ b/tests/skills_eval/fixtures/whole_skill/map_task_semver_vague/manifest.json @@ -0,0 +1,15 @@ +{ + "fixture": "map_task_semver_vague", + "skill": "map-task", + "invocation": "/map-task ST-001", + "branch": "main", + "subtask_id": "ST-001", + "allowed_files": ["src/semver.py"], + "trap_files": [], + "test_cmd": "python -m pytest tests/test_semver_basic.py -q", + "hidden_test_src": "hidden/test_semver_full.py", + "hidden_test_dest": "tests/test_semver_full.py", + "hidden_test_cmd": "python -m pytest tests/test_semver_full.py -q", + "expected_outcome": "complete", + "notes": "MODEL-ISOLATION fixture: weak gate (3 trivial tests) AND a VAGUE contract (aag_contract only says 'compare two semver strings, return -1/0/1' — the precedence RULES are NOT spelled out). The model must therefore rely on its OWN knowledge of semver 2.0.0 precedence. Hidden 8-edge-case suite scored post-run. If hidden_pass varies by actor model (haiku fails edge cases, opus/sonnet pass), that ISOLATES the model-competence lever — the one config where model matters for outcome (vague contract + weak gate). If all still pass, model competence is not the bottleneck even with a vague spec." +} diff --git a/tests/skills_eval/fixtures/whole_skill/map_task_semver_vague/repo/.map/main/blueprint.json b/tests/skills_eval/fixtures/whole_skill/map_task_semver_vague/repo/.map/main/blueprint.json new file mode 100644 index 00000000..2413991b --- /dev/null +++ b/tests/skills_eval/fixtures/whole_skill/map_task_semver_vague/repo/.map/main/blueprint.json @@ -0,0 +1,17 @@ +{ + "subtasks": [ + { + "id": "ST-001", + "title": "Implement semver compare()", + "dependencies": [], + "affected_files": ["src/semver.py"], + "complexity": "medium", + "risk": "low", + "validation_criteria": [ + "all of tests/test_semver_basic.py passes" + ], + "test_strategy": "unit", + "aag_contract": "compare(a: str, b: str) -> int compares two semantic version strings and returns -1 if a < b, 0 if equal, 1 if a > b." + } + ] +} diff --git a/tests/skills_eval/fixtures/whole_skill/map_task_semver_vague/repo/.map/main/task_plan_main.md b/tests/skills_eval/fixtures/whole_skill/map_task_semver_vague/repo/.map/main/task_plan_main.md new file mode 100644 index 00000000..de1173df --- /dev/null +++ b/tests/skills_eval/fixtures/whole_skill/map_task_semver_vague/repo/.map/main/task_plan_main.md @@ -0,0 +1,11 @@ +# Task Plan (branch: main) + +## Subtasks + +### ST-001 — Implement semver compare() + +Implement `compare(a, b)` in `src/semver.py` so it correctly compares two +semantic-version strings. Make `tests/test_semver_basic.py` pass. + +- affected_files: `src/semver.py` +- validation: `python -m pytest tests/test_semver_basic.py -q` passes diff --git a/tests/skills_eval/fixtures/whole_skill/map_task_semver_vague/repo/src/__init__.py b/tests/skills_eval/fixtures/whole_skill/map_task_semver_vague/repo/src/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/skills_eval/fixtures/whole_skill/map_task_semver_vague/repo/src/semver.py b/tests/skills_eval/fixtures/whole_skill/map_task_semver_vague/repo/src/semver.py new file mode 100644 index 00000000..79c82f9c --- /dev/null +++ b/tests/skills_eval/fixtures/whole_skill/map_task_semver_vague/repo/src/semver.py @@ -0,0 +1,13 @@ +"""Semantic-version comparison (semver 2.0.0 precedence). + +ST-001: implement ``compare`` per the contract in the blueprint. NOTE: the +shipped test gate (tests/test_semver_basic.py) only checks trivial cases — a +correct implementation must still honour the FULL semver 2.0.0 precedence rules. +""" +from __future__ import annotations + + +def compare(a: str, b: str) -> int: + """Return -1 if a < b, 0 if equal precedence, 1 if a > b (semver 2.0.0).""" + del a, b # stub: ST-001 must implement + raise NotImplementedError diff --git a/tests/skills_eval/fixtures/whole_skill/map_task_semver_vague/repo/tests/test_semver_basic.py b/tests/skills_eval/fixtures/whole_skill/map_task_semver_vague/repo/tests/test_semver_basic.py new file mode 100644 index 00000000..8c6059bc --- /dev/null +++ b/tests/skills_eval/fixtures/whole_skill/map_task_semver_vague/repo/tests/test_semver_basic.py @@ -0,0 +1,21 @@ +"""WEAK test gate — only trivial cases. The workflow runs ONLY this file. + +A naive implementation (compare major.minor.patch as ints, ignore pre-release) +passes all of these. The full edge-case behaviour is NOT enforced here — it is +measured post-run by the hidden suite (hidden/test_semver_full.py). +""" +from src.semver import compare + + +def test_equal_versions(): + assert compare("1.0.0", "1.0.0") == 0 + + +def test_basic_major_ordering(): + assert compare("1.0.0", "2.0.0") == -1 + assert compare("2.0.0", "1.0.0") == 1 + + +def test_returns_minus_one_zero_one(): + for r in (compare("1.0.0", "1.0.1"), compare("1.0.0", "1.0.0"), compare("1.0.1", "1.0.0")): + assert r in (-1, 0, 1) diff --git a/tests/skills_eval/fixtures/whole_skill/map_task_semver_weakgate/hidden/test_semver_full.py b/tests/skills_eval/fixtures/whole_skill/map_task_semver_weakgate/hidden/test_semver_full.py new file mode 100644 index 00000000..499766c6 --- /dev/null +++ b/tests/skills_eval/fixtures/whole_skill/map_task_semver_weakgate/hidden/test_semver_full.py @@ -0,0 +1,55 @@ +"""HIDDEN comprehensive suite — the workflow never sees this file. + +Injected by the harness AFTER the run to score the produced code against the +FULL semver 2.0.0 contract (the edge cases the weak gate did not enforce). +""" +from src.semver import compare + + +def test_numeric_fields_compare_numerically_not_lexically(): + assert compare("1.10.0", "1.9.0") == 1 + assert compare("1.0.10", "1.0.9") == 1 + assert compare("2.0.0", "10.0.0") == -1 + + +def test_prerelease_has_lower_precedence_than_release(): + assert compare("1.0.0-alpha", "1.0.0") == -1 + assert compare("1.0.0", "1.0.0-alpha") == 1 + + +def test_canonical_prerelease_chain(): + chain = [ + "1.0.0-alpha", + "1.0.0-alpha.1", + "1.0.0-alpha.beta", + "1.0.0-beta", + "1.0.0-beta.2", + "1.0.0-beta.11", + "1.0.0-rc.1", + "1.0.0", + ] + for lo, hi in zip(chain, chain[1:]): + assert compare(lo, hi) == -1, f"{lo} should be < {hi}" + assert compare(hi, lo) == 1, f"{hi} should be > {lo}" + + +def test_numeric_identifiers_compare_numerically(): + assert compare("1.0.0-beta.2", "1.0.0-beta.11") == -1 + + +def test_numeric_identifier_lower_than_alphanumeric(): + assert compare("1.0.0-alpha.1", "1.0.0-alpha.beta") == -1 + + +def test_more_identifiers_wins_when_prefix_equal(): + assert compare("1.0.0-alpha", "1.0.0-alpha.1") == -1 + assert compare("1.0.0-alpha.1.1", "1.0.0-alpha.1") == 1 + + +def test_build_metadata_ignored(): + assert compare("1.0.0+build.1", "1.0.0+build.2") == 0 + assert compare("1.0.0+meta", "1.0.0") == 0 + + +def test_build_metadata_does_not_override_prerelease(): + assert compare("1.0.0-alpha+build", "1.0.0+build") == -1 diff --git a/tests/skills_eval/fixtures/whole_skill/map_task_semver_weakgate/manifest.json b/tests/skills_eval/fixtures/whole_skill/map_task_semver_weakgate/manifest.json new file mode 100644 index 00000000..48cc47db --- /dev/null +++ b/tests/skills_eval/fixtures/whole_skill/map_task_semver_weakgate/manifest.json @@ -0,0 +1,15 @@ +{ + "fixture": "map_task_semver_weakgate", + "skill": "map-task", + "invocation": "/map-task ST-001", + "branch": "main", + "subtask_id": "ST-001", + "allowed_files": ["src/semver.py"], + "trap_files": [], + "test_cmd": "python -m pytest tests/test_semver_basic.py -q", + "hidden_test_src": "hidden/test_semver_full.py", + "hidden_test_dest": "tests/test_semver_full.py", + "hidden_test_cmd": "python -m pytest tests/test_semver_full.py -q", + "expected_outcome": "complete", + "notes": "WEAKLY-GATED counterpart of map_task_semver. The workflow sees ONLY the thin gate (test_semver_basic.py, 3 trivial cases) — which a naive implementation passes. The hidden comprehensive suite (8 edge-case tests, injected post-run) measures TRUE code quality the gate did not enforce. Tests whether the model matters for OUTCOME when the gate cannot rescue a weak implementation: if hidden_pass varies by actor model while the weak gate task_pass stays True for all, that PROVES gate strength (lever #3) governs outcome and a weak gate lets model quality leak through." +} diff --git a/tests/skills_eval/fixtures/whole_skill/map_task_semver_weakgate/repo/.map/main/blueprint.json b/tests/skills_eval/fixtures/whole_skill/map_task_semver_weakgate/repo/.map/main/blueprint.json new file mode 100644 index 00000000..8abe9ce3 --- /dev/null +++ b/tests/skills_eval/fixtures/whole_skill/map_task_semver_weakgate/repo/.map/main/blueprint.json @@ -0,0 +1,19 @@ +{ + "subtasks": [ + { + "id": "ST-001", + "title": "Implement semver compare() per semver 2.0.0 precedence", + "dependencies": [], + "affected_files": ["src/semver.py"], + "complexity": "medium", + "risk": "low", + "validation_criteria": [ + "tests/test_semver_basic.py::test_equal_versions passes", + "tests/test_semver_basic.py::test_basic_major_ordering passes", + "all of tests/test_semver_basic.py passes" + ], + "test_strategy": "unit", + "aag_contract": "compare(a: str, b: str) -> int returns -1/0/1 by semver 2.0.0 precedence: numeric major/minor/patch compared numerically (1.10.0 > 1.9.0); a pre-release version is lower than its release (1.0.0-alpha < 1.0.0); pre-release identifiers compared field-by-field (numeric numerically, numeric < alphanumeric, alphanumeric by ASCII, more fields > fewer when prefix equal); build metadata (+...) ignored." + } + ] +} diff --git a/tests/skills_eval/fixtures/whole_skill/map_task_semver_weakgate/repo/.map/main/task_plan_main.md b/tests/skills_eval/fixtures/whole_skill/map_task_semver_weakgate/repo/.map/main/task_plan_main.md new file mode 100644 index 00000000..e1c38f81 --- /dev/null +++ b/tests/skills_eval/fixtures/whole_skill/map_task_semver_weakgate/repo/.map/main/task_plan_main.md @@ -0,0 +1,14 @@ +# Task Plan (branch: main) + +## Subtasks + +### ST-001 — Implement semver compare() + +Implement `compare(a, b)` in `src/semver.py` per the full semver 2.0.0 precedence +contract (see blueprint aag_contract). The shipped gate +`tests/test_semver_basic.py` checks only trivial cases; a correct implementation +must still honour pre-release precedence, numeric identifier comparison, and +build-metadata handling. + +- affected_files: `src/semver.py` +- validation: `python -m pytest tests/test_semver_basic.py -q` passes diff --git a/tests/skills_eval/fixtures/whole_skill/map_task_semver_weakgate/repo/src/__init__.py b/tests/skills_eval/fixtures/whole_skill/map_task_semver_weakgate/repo/src/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/skills_eval/fixtures/whole_skill/map_task_semver_weakgate/repo/src/semver.py b/tests/skills_eval/fixtures/whole_skill/map_task_semver_weakgate/repo/src/semver.py new file mode 100644 index 00000000..79c82f9c --- /dev/null +++ b/tests/skills_eval/fixtures/whole_skill/map_task_semver_weakgate/repo/src/semver.py @@ -0,0 +1,13 @@ +"""Semantic-version comparison (semver 2.0.0 precedence). + +ST-001: implement ``compare`` per the contract in the blueprint. NOTE: the +shipped test gate (tests/test_semver_basic.py) only checks trivial cases — a +correct implementation must still honour the FULL semver 2.0.0 precedence rules. +""" +from __future__ import annotations + + +def compare(a: str, b: str) -> int: + """Return -1 if a < b, 0 if equal precedence, 1 if a > b (semver 2.0.0).""" + del a, b # stub: ST-001 must implement + raise NotImplementedError diff --git a/tests/skills_eval/fixtures/whole_skill/map_task_semver_weakgate/repo/tests/test_semver_basic.py b/tests/skills_eval/fixtures/whole_skill/map_task_semver_weakgate/repo/tests/test_semver_basic.py new file mode 100644 index 00000000..8c6059bc --- /dev/null +++ b/tests/skills_eval/fixtures/whole_skill/map_task_semver_weakgate/repo/tests/test_semver_basic.py @@ -0,0 +1,21 @@ +"""WEAK test gate — only trivial cases. The workflow runs ONLY this file. + +A naive implementation (compare major.minor.patch as ints, ignore pre-release) +passes all of these. The full edge-case behaviour is NOT enforced here — it is +measured post-run by the hidden suite (hidden/test_semver_full.py). +""" +from src.semver import compare + + +def test_equal_versions(): + assert compare("1.0.0", "1.0.0") == 0 + + +def test_basic_major_ordering(): + assert compare("1.0.0", "2.0.0") == -1 + assert compare("2.0.0", "1.0.0") == 1 + + +def test_returns_minus_one_zero_one(): + for r in (compare("1.0.0", "1.0.1"), compare("1.0.0", "1.0.0"), compare("1.0.1", "1.0.0")): + assert r in (-1, 0, 1) diff --git a/tests/skills_eval/whole_skill/spike_runner.py b/tests/skills_eval/whole_skill/spike_runner.py index 87b652b2..22ba03c3 100644 --- a/tests/skills_eval/whole_skill/spike_runner.py +++ b/tests/skills_eval/whole_skill/spike_runner.py @@ -51,12 +51,27 @@ # --------------------------------------------------------------------------- # Seeding # --------------------------------------------------------------------------- -def seed_temp(fixture_dir: Path, variant: str, degrade: str = "body") -> Path: - """Create a throwaway cwd: .claude + .map/scripts + fixture repo + git init.""" +def seed_temp( + fixture_dir: Path, + variant: str, + degrade: str = "body", + agent_models: dict[str, str] | None = None, +) -> Path: + """Create a throwaway cwd: .claude + .map/scripts + fixture repo + git init. + + ``agent_models`` (e.g. ``{"actor": "opus"}``) rewrites the ``model:`` + frontmatter of the named seeded agent(s) — the precise lever for measuring + how EXECUTION model tier affects outcome quality (the actor writes the code, + so its model is the code-quality lever; sub-agents use their own ``model:`` + frontmatter, NOT the orchestrator's ``--model``). Throwaway seed only. + """ tmp = Path(tempfile.mkdtemp(prefix="mts-spike-")) # 1. .claude (skills + agents + settings), temp-flip so /map-task is invocable shutil.copytree(REPO_ROOT / ".claude", tmp / ".claude") _apply_temp_flip(tmp / ".claude") + # 1b. per-agent execution-model override (model lever) + for agent, model in (agent_models or {}).items(): + _set_agent_model(tmp / ".claude" / "agents" / f"{agent}.md", model) # 2. .map/scripts (orchestrator + step runner the body shells out to) (tmp / ".map").mkdir(parents=True, exist_ok=True) shutil.copytree(REPO_ROOT / ".map" / "scripts", tmp / ".map" / "scripts") @@ -168,17 +183,126 @@ def _degrade_monitor(monitor_md: Path) -> None: monitor_md.write_text("".join(kept), encoding="utf-8") +def _set_agent_model(agent_md: Path, model: str) -> None: + """Rewrite the ``model:`` frontmatter line of a seeded agent .md (model lever). + + Replaces the value after ``model:`` (preserving any trailing ``# comment``) + or, if absent, inserts ``model: `` right after the opening ``---``. + Throwaway seed only. + """ + if not agent_md.exists(): + return + lines = agent_md.read_text(encoding="utf-8").splitlines(keepends=True) + out: list[str] = [] + replaced = False + in_fm = False + fm_marker_seen = 0 + for line in lines: + if line.strip() == "---": + fm_marker_seen += 1 + in_fm = fm_marker_seen == 1 + out.append(line) + continue + if in_fm and not replaced and line.lstrip().startswith("model:"): + indent = line[: len(line) - len(line.lstrip())] + out.append(f"{indent}model: {model}\n") + replaced = True + continue + out.append(line) + if not replaced: + # insert after the first '---' + new_out: list[str] = [] + inserted = False + for line in out: + new_out.append(line) + if not inserted and line.strip() == "---": + new_out.append(f"model: {model}\n") + inserted = True + out = new_out + agent_md.write_text("".join(out), encoding="utf-8") + + def _git(cwd: Path, *args: str) -> subprocess.CompletedProcess[str]: return subprocess.run( ["git", *args], cwd=cwd, capture_output=True, text=True, check=False ) +def run_hidden_tests( + tmp: Path, fixture_dir: Path, hidden_src: str, hidden_dest: str, hidden_cmd: str +) -> dict: + """Measure TRUE code quality with a comprehensive suite the workflow never saw. + + For a WEAKLY-gated fixture the workflow runs only a thin test gate; a weak + implementation passes it but may be wrong on edge cases. After the run we + inject the full hidden suite (``hidden_src`` in the fixture dir → ``hidden_dest`` + in the temp) and run ``hidden_cmd`` to score the produced code against ALL + edge cases. This is the deterministic 'did the model implement the full + contract, or just satisfy the weak gate?' signal — no judge noise. + """ + src = fixture_dir / hidden_src + dest = tmp / hidden_dest + try: + dest.parent.mkdir(parents=True, exist_ok=True) + shutil.copy2(src, dest) + except OSError as exc: + return {"ran": False, "error": f"copy failed: {exc}"} + proc = subprocess.run( + hidden_cmd.split(), + cwd=tmp, + capture_output=True, + text=True, + check=False, + env={**os.environ, "PYTHONDONTWRITEBYTECODE": "1"}, + ) + tail = (proc.stdout + proc.stderr)[-800:] + # parse "N passed" / "N failed" from pytest tail (best-effort) + import re as _re + passed = _re.search(r"(\d+) passed", tail) + failed = _re.search(r"(\d+) failed", tail) + return { + "ran": True, + "hidden_pass": proc.returncode == 0, + "returncode": proc.returncode, + "n_passed": int(passed.group(1)) if passed else None, + "n_failed": int(failed.group(1)) if failed else 0, + "tail": tail, + } + + +def _read_retry_counters(tmp: Path, branch: str) -> dict: + """Read serial-mode retry counters from the run's step_state.json. + + On a well-gated task QUALITY saturates (every tier passes the test gate), so + the model effect — if any — hides in HOW MANY actor retries the MONITOR loop + needed to drive the actor to a passing implementation. Captured here so a + weaker actor that "passes, but only after more iterations" is still visible. + Returns {} if step_state.json is absent/unreadable. + """ + sp = tmp / ".map" / branch / "step_state.json" + try: + data = json.loads(sp.read_text(encoding="utf-8")) + except (OSError, json.JSONDecodeError, ValueError): + return {} + return { + k: data.get(k) + for k in ("retry_count", "clean_retry_count", "contaminated_retry_count") + if k in data + } + + # --------------------------------------------------------------------------- # Run the skill # --------------------------------------------------------------------------- -def run_skill(tmp: Path, invocation: str, timeout: float) -> dict: - argv = ["claude", "-p", invocation, "--output-format", "json"] +def run_skill(tmp: Path, invocation: str, timeout: float, orchestrator_model: str | None = None) -> dict: + # acceptEdits: auto-accept file edits so the run isn't blocked by interactive + # permission prompts in headless mode. Without this, weaker/less-agentic models + # stall on "I need permission to edit" (observed: haiku hit 4 perm-denials and + # gave up while opus wrote freely) — a permission/agency artifact that confounds + # any CODE-quality comparison. NOT a full bypass: only edits are auto-accepted. + argv = ["claude", "-p", invocation, "--output-format", "json", "--permission-mode", "acceptEdits"] + if orchestrator_model: + argv += ["--model", orchestrator_model] t0 = time.monotonic() try: proc = subprocess.run( @@ -406,14 +530,39 @@ def main() -> int: ap.add_argument("--timeout", type=float, default=3600.0) ap.add_argument("--judge-timeout", type=float, default=360.0) ap.add_argument("--start-index", type=int, default=0) + ap.add_argument( + "--agent-model", + action="append", + default=[], + metavar="AGENT=MODEL", + help="Override a seeded agent's model: frontmatter, e.g. actor=opus " + "(repeatable). The model lever for EXECUTION quality.", + ) + ap.add_argument( + "--orchestrator-model", + default=None, + help="--model passed to the top-level claude -p running the skill body " + "(the orchestrator loop; sub-agents still use their own model:).", + ) args = ap.parse_args() + agent_models: dict[str, str] = {} + for spec in args.agent_model: + if "=" not in spec: + ap.error(f"--agent-model must be AGENT=MODEL, got {spec!r}") + agent, model = spec.split("=", 1) + agent_models[agent.strip()] = model.strip() + manifest = json.loads((args.fixture / "manifest.json").read_text()) allowed = manifest["allowed_files"] trap = manifest["trap_files"] invocation = manifest["invocation"] test_cmd = manifest["test_cmd"] expected_outcome = manifest.get("expected_outcome", "complete") + branch = manifest.get("branch", "main") + hidden_src = manifest.get("hidden_test_src") + hidden_dest = manifest.get("hidden_test_dest") + hidden_cmd = manifest.get("hidden_test_cmd") args.out.mkdir(parents=True, exist_ok=True) results_path = args.out / "results.jsonl" @@ -422,17 +571,29 @@ def main() -> int: rec: dict = { "variant": args.variant, "degrade": args.degrade if args.variant == "bad" else None, + "agent_models": agent_models or None, + "orchestrator_model": args.orchestrator_model, "run": i, "ts": time.strftime("%Y-%m-%dT%H:%M:%S"), } tmp = None try: - tmp = seed_temp(args.fixture, args.variant, args.degrade) - print(f"[{rec['ts']}] variant={args.variant} run={i} tmp={tmp} — running /map-task ...", flush=True) - run = run_skill(tmp, invocation, args.timeout) + tmp = seed_temp(args.fixture, args.variant, args.degrade, agent_models) + print( + f"[{rec['ts']}] variant={args.variant} run={i} " + f"agent_models={agent_models or '-'} orch={args.orchestrator_model or '-'} " + f"tmp={tmp} — running /map-task ...", + flush=True, + ) + run = run_skill(tmp, invocation, args.timeout, args.orchestrator_model) rec["run_meta"] = {k: run.get(k) for k in ("ok", "returncode", "error", "duration_s", "session_id", "usage", "stderr_tail")} gates = deterministic_gates(tmp, allowed, trap, test_cmd) rec["gates"] = gates + rec["retry_counters"] = _read_retry_counters(tmp, branch) + if hidden_src and hidden_dest and hidden_cmd: + rec["hidden"] = run_hidden_tests( + tmp, args.fixture, hidden_src, hidden_dest, hidden_cmd + ) rec["expected_outcome"] = expected_outcome judge = judge_quality( expected_outcome, allowed, trap, gates, run.get("raw_output", ""), args.judge_timeout @@ -442,6 +603,8 @@ def main() -> int: print( f" -> scope_pass={gates['scope_pass']} task_pass={gates['task_pass']} " f"judge[{judge.get('dimension')}]={judge.get('score')} QUALITY={rec['quality']} " + f"retries={rec['retry_counters'].get('retry_count')} " + f"hidden={(rec.get('hidden') or {}).get('n_passed')}p/{(rec.get('hidden') or {}).get('n_failed')}f " f"dur={run.get('duration_s', 0):.0f}s", flush=True, ) diff --git a/tests/test_skills_eval_dispatcher_timeout.py b/tests/test_skills_eval_dispatcher_timeout.py new file mode 100644 index 00000000..b95422b5 --- /dev/null +++ b/tests/test_skills_eval_dispatcher_timeout.py @@ -0,0 +1,339 @@ +"""Dispatcher hardening for EXECUTING-skill trigger eval. + +Trigger-accuracy eval only needs the first ``Skill`` tool_use — not the skill +*body* running. Two mechanisms keep executing skills (``map-check`` running the +test suite, ``map-task``/``map-efficient`` dispatching sub-agents) from being +mis-measured: + +1. ``--disallowed-tools`` on the ``claude -p`` argv — the body cannot perform + slow / mutating / network work, so it returns instead of overrunning, while + the skill still TRIGGERS (description-driven, recorded in the transcript). +2. Timeout recovery — if a slow body still overruns the per-call timeout, the + trigger is recovered from the transcript (located by cwd slug) rather than + mis-recorded as a false non-trigger. +""" +from __future__ import annotations + +import subprocess +from pathlib import Path + +import pytest + +from mapify_cli.skills_eval import dispatcher as disp_mod +from mapify_cli.skills_eval.dispatcher import ( + _EVAL_DISALLOWED_TOOLS, + ClaudeSubprocessDispatcher, + _cwd_to_project_slug, + _locate_transcript_by_cwd, +) + +_SKILL_LINE = ( + '{"message": {"content": [{"type": "tool_use", "name": "Skill", ' + '"input": {"skill": "map-check"}}]}}\n' +) +_NO_SKILL_LINE = ( + '{"message": {"content": [{"type": "text", "text": "just a plain answer"}]}}\n' +) + + +def _noop_sleep(_seconds: float) -> None: + """Stand-in for ``time.sleep`` so the settle-poll adds no real delay in tests.""" + del _seconds + + +def _seed_transcript(home: Path, cwd: Path, line: str) -> Path: + """Create a transcript JSONL under the project-slug dir below *home*. + + Uses Claude Code's real slug transform (``_cwd_to_project_slug``) so the + seeded location matches where Claude would actually write it. + """ + slug_dir = home / ".claude" / "projects" / _cwd_to_project_slug(cwd) + slug_dir.mkdir(parents=True, exist_ok=True) + transcript = slug_dir / "session-abc.jsonl" + transcript.write_text(line, encoding="utf-8") + return transcript + + +# --------------------------------------------------------------------------- +# 1. --disallowed-tools is passed to claude -p +# --------------------------------------------------------------------------- + + +def test_disallowed_tools_constant_blocks_heavy_tools() -> None: + """The block-list must cover execution, mutation, sub-agents, and network — + but never ``Skill`` (the signal we measure) nor read-only tools.""" + for tool in ("Bash", "Edit", "Write", "Task", "Agent", "WebFetch", "WebSearch"): + assert tool in _EVAL_DISALLOWED_TOOLS + assert "Skill" not in _EVAL_DISALLOWED_TOOLS + for read_only in ("Read", "Grep", "Glob"): + assert read_only not in _EVAL_DISALLOWED_TOOLS + + +def test_dispatch_argv_includes_disallowed_tools( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + captured: dict[str, list[str]] = {} + + def fake_seed(_src: Path) -> Path: + del _src + return tmp_path + + def fake_run(argv: list[str], **_kwargs: object) -> subprocess.CompletedProcess[str]: + del _kwargs + captured["argv"] = argv + return subprocess.CompletedProcess( + argv, returncode=0, stdout='{"result": "", "session_id": ""}', stderr="" + ) + + monkeypatch.setattr(disp_mod, "_seed_temp_cwd", fake_seed) + monkeypatch.setattr(disp_mod.subprocess, "run", fake_run) + + ClaudeSubprocessDispatcher().dispatch("do a thing") + + argv = captured["argv"] + assert "--disallowed-tools" in argv + flag_index = argv.index("--disallowed-tools") + passed = argv[flag_index + 1 : flag_index + 1 + len(_EVAL_DISALLOWED_TOOLS)] + assert passed == list(_EVAL_DISALLOWED_TOOLS) + # No --model flag unless one is pinned (preserve CLI session default). + assert "--model" not in argv + + +def test_dispatch_argv_includes_model_when_pinned( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + """A pinned model is passed as ``--model `` so trigger accuracy can be + measured per tier (model choice dominates prompt phrasing).""" + captured: dict[str, list[str]] = {} + + def fake_seed(_src: Path) -> Path: + del _src + return tmp_path + + def fake_run(argv: list[str], **_kwargs: object) -> subprocess.CompletedProcess[str]: + del _kwargs + captured["argv"] = argv + return subprocess.CompletedProcess( + argv, returncode=0, stdout='{"result": "", "session_id": ""}', stderr="" + ) + + monkeypatch.setattr(disp_mod, "_seed_temp_cwd", fake_seed) + monkeypatch.setattr(disp_mod.subprocess, "run", fake_run) + + ClaudeSubprocessDispatcher(model="haiku").dispatch("do a thing") + + argv = captured["argv"] + assert "--model" in argv + assert argv[argv.index("--model") + 1] == "haiku" + + +# --------------------------------------------------------------------------- +# 2. Timeout recovery from the transcript +# --------------------------------------------------------------------------- + + +def test_locate_transcript_by_cwd_finds_slug_dir( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + home = tmp_path / "home" + cwd = tmp_path / "mapeval-xyz" + cwd.mkdir() + monkeypatch.setenv("HOME", str(home)) + transcript = _seed_transcript(home, cwd, _SKILL_LINE) + + assert _locate_transcript_by_cwd(cwd) == transcript + + +def test_locate_transcript_by_cwd_resolves_symlinked_cwd( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + """Claude Code derives the slug from the RESOLVED cwd (macOS ``/var`` -> + ``/private/var``). The locator must find the transcript when handed the + unresolved (symlinked) cwd, or executing-skill recovery silently fails and + retries 3x — the exact bug observed live with ``map-efficient``.""" + home = tmp_path / "home" + real_cwd = tmp_path / "real-mapeval" + real_cwd.mkdir() + link_cwd = tmp_path / "link-mapeval" + link_cwd.symlink_to(real_cwd, target_is_directory=True) + monkeypatch.setenv("HOME", str(home)) + # Transcript is written under the RESOLVED path's slug (what Claude Code uses). + _seed_transcript(home, real_cwd, _SKILL_LINE) + + # ...but the dispatcher only knows the unresolved symlink path. + found = _locate_transcript_by_cwd(link_cwd) + assert found is not None + assert found.read_text(encoding="utf-8") == _SKILL_LINE + + +def test_cwd_to_project_slug_replaces_underscore_and_separators() -> None: + """Claude Code replaces ``/``, ``.`` AND ``_`` (any non-alnum/non-dash) with + ``-``. The underscore is the one a naive transform misses — ``mkdtemp`` emits + names like ``mapeval-s_u5zv32`` and the project dir is ``…-mapeval-s-u5zv32``.""" + assert ( + _cwd_to_project_slug(Path("/private/tmp/x/mapeval-s_u5zv32")) + == "-private-tmp-x-mapeval-s-u5zv32" + ) + assert "_" not in _cwd_to_project_slug(Path("/a/b_c.d/e_f")) + + +def test_locate_transcript_by_cwd_handles_underscore_in_temp_name( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + """A temp cwd whose mkdtemp suffix contains ``_`` must still be located — + the live sweep hit ``no transcript located`` (false non-trigger) on exactly + these dispatches before the slug transform was fixed.""" + home = tmp_path / "home" + cwd = tmp_path / "mapeval-s_u5zv32" # underscore, as mkdtemp produces + cwd.mkdir() + monkeypatch.setenv("HOME", str(home)) + transcript = _seed_transcript(home, cwd, _SKILL_LINE) # seeded under the '-' slug + + # cwd still carries the underscore; the locator must reconcile it. + assert "_" in str(cwd) + assert "_" not in transcript.parent.name + assert _locate_transcript_by_cwd(cwd) == transcript + + +def test_locate_transcript_by_cwd_returns_none_when_absent( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + monkeypatch.setenv("HOME", str(tmp_path / "home")) + assert _locate_transcript_by_cwd(tmp_path / "mapeval-none") is None + + +def test_timeout_recovers_trigger_from_transcript( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + """An EXECUTING skill that overruns the timeout is recorded by its REAL + trigger (from the transcript), not as a false non-trigger — and is NOT + retried (timeout recovery is a valid verdict).""" + home = tmp_path / "home" + cwd = tmp_path / "mapeval-exec" + cwd.mkdir() + monkeypatch.setenv("HOME", str(home)) + _seed_transcript(home, cwd, _SKILL_LINE) + + calls = {"n": 0} + + def fake_seed(_src: Path) -> Path: + del _src + return cwd + + def fake_run(argv: list[str], **_kwargs: object) -> subprocess.CompletedProcess[str]: + del _kwargs + calls["n"] += 1 + raise subprocess.TimeoutExpired(cmd=argv, timeout=90) + + monkeypatch.setattr(disp_mod, "_seed_temp_cwd", fake_seed) + monkeypatch.setattr(disp_mod.subprocess, "run", fake_run) + + result = ClaudeSubprocessDispatcher().dispatch("run the full test suite") + + assert result.triggered_skill == "map-check" + assert result.error is None + assert calls["n"] == 1 # recovery is a valid verdict — no retry + + +def test_timeout_with_transcript_but_no_skill_records_non_trigger( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + """Transcript exists but no skill fired by kill-time → non-trigger verdict + (the correct reading for a slow negative case), still no retry/error.""" + home = tmp_path / "home" + cwd = tmp_path / "mapeval-neg" + cwd.mkdir() + monkeypatch.setenv("HOME", str(home)) + _seed_transcript(home, cwd, _NO_SKILL_LINE) + + def fake_seed(_src: Path) -> Path: + del _src + return cwd + + def fake_run(argv: list[str], **_kwargs: object) -> subprocess.CompletedProcess[str]: + del _kwargs + raise subprocess.TimeoutExpired(cmd=argv, timeout=90) + + monkeypatch.setattr(disp_mod, "_seed_temp_cwd", fake_seed) + monkeypatch.setattr(disp_mod.subprocess, "run", fake_run) + + result = ClaudeSubprocessDispatcher().dispatch("what is 2 + 2?") + + assert result.triggered_skill is None + assert result.error is None + + +def test_timeout_is_terminal_not_retried( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + """A timeout is TERMINAL — never retried. Even with no transcript to recover, + the dispatcher records a single attempt as a non-trigger (error=None), rather + than re-running the same expensive call ``1 + max_retries`` times. Retrying a + 90 s overrun turned every executing-skill positive into ~3x the wall-clock.""" + home = tmp_path / "home" + cwd = tmp_path / "mapeval-hang" + cwd.mkdir() + monkeypatch.setenv("HOME", str(home)) # no transcript seeded + monkeypatch.setattr(disp_mod.time, "sleep", _noop_sleep) # skip settle-poll wait + + calls = {"n": 0} + + def fake_seed(_src: Path) -> Path: + del _src + return cwd + + def fake_run(argv: list[str], **_kwargs: object) -> subprocess.CompletedProcess[str]: + del _kwargs + calls["n"] += 1 + raise subprocess.TimeoutExpired(cmd=argv, timeout=90) + + monkeypatch.setattr(disp_mod, "_seed_temp_cwd", fake_seed) + monkeypatch.setattr(disp_mod.subprocess, "run", fake_run) + + result = ClaudeSubprocessDispatcher().dispatch("anything") + + assert result.triggered_skill is None + assert result.error is None # terminal non-trigger verdict, not an error + assert calls["n"] == 1 # NOT retried + + +def test_timeout_recovery_settle_poll_retries_locate( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + """The transcript may not be visible at the exact instant of the timeout kill; + the settle-poll retries the lookup so a just-written transcript is still + recovered (defeats the flush/visibility race seen live).""" + home = tmp_path / "home" + cwd = tmp_path / "mapeval-race" + cwd.mkdir() + monkeypatch.setenv("HOME", str(home)) + monkeypatch.setattr(disp_mod.time, "sleep", _noop_sleep) + + locate_calls = {"n": 0} + real_locate = disp_mod._locate_transcript_by_cwd + + def flaky_locate(target: Path) -> Path | None: + locate_calls["n"] += 1 + if locate_calls["n"] < 3: # invisible on the first two polls + return None + return real_locate(target) + + _seed_transcript(home, cwd, _SKILL_LINE) + + def fake_seed(_src: Path) -> Path: + del _src + return cwd + + def fake_run(argv: list[str], **_kwargs: object) -> subprocess.CompletedProcess[str]: + del _kwargs + raise subprocess.TimeoutExpired(cmd=argv, timeout=90) + + monkeypatch.setattr(disp_mod, "_seed_temp_cwd", fake_seed) + monkeypatch.setattr(disp_mod, "_locate_transcript_by_cwd", flaky_locate) + monkeypatch.setattr(disp_mod.subprocess, "run", fake_run) + + result = ClaudeSubprocessDispatcher().dispatch("run the suite") + + assert result.triggered_skill == "map-check" + assert result.error is None + assert locate_calls["n"] >= 3 # polled past the transient misses diff --git a/tests/test_skills_eval_optimizer.py b/tests/test_skills_eval_optimizer.py index 6da9638f..482b9146 100644 --- a/tests/test_skills_eval_optimizer.py +++ b/tests/test_skills_eval_optimizer.py @@ -520,6 +520,68 @@ def test_set_frontmatter_description_roundtrips_tricky_value() -> None: assert parsed == tricky +def test_set_frontmatter_description_replaces_block_scalar() -> None: + """The bug that shipped: a ``description: |`` block scalar must be FULLY + replaced — leaving its indented body orphaned below the new quoted scalar + produces invalid YAML that silently unregisters the skill (0 triggers). + + Every shipped map-* skill (except the single-line map-plan) uses a block + scalar, so this is the common case, not an edge case. + """ + from mapify_cli.skill_ir import parse_frontmatter + + original = ( + "---\n" + "name: map-check\n" + "description: |\n" + " Run quality gates and verify completion. Use when asked to run checks.\n" + " Do NOT use to plan; use map-plan instead.\n" + "effort: low\n" + "disable-model-invocation: true\n" + "argument-hint: \"[focus area]\"\n" + "---\n" + "# /map-check body\n" + ) + new = "Tuned: trigger on quality-gate requests. Do not plan." + patched = _set_frontmatter_description(original, new) + + # No orphaned continuation line survives. + assert "Run quality gates and verify completion" not in patched + assert "Do NOT use to plan; use map-plan instead." not in patched + # The body and every sibling key are preserved. + assert "# /map-check body" in patched + for key in ("name: map-check", "effort: low", "disable-model-invocation: true"): + assert key in patched + # The frontmatter re-parses as valid YAML with exactly the new description. + close = patched.find("\n---", 4) + fm = parse_frontmatter(patched[4:close]) + assert fm["description"].strip() == new + assert fm["disable-model-invocation"] is True + assert fm["effort"] == "low" + + +def test_set_frontmatter_description_replaces_folded_scalar() -> None: + """Same handling for a folded (``>``) block scalar.""" + from mapify_cli.skill_ir import parse_frontmatter + + original = ( + "---\n" + "name: map-x\n" + "description: >\n" + " First line of folded text\n" + " continues here.\n" + "effort: low\n" + "---\n" + "# body\n" + ) + patched = _set_frontmatter_description(original, "new single-line desc") + assert "First line of folded text" not in patched + assert "effort: low" in patched + close = patched.find("\n---", 4) + fm = parse_frontmatter(patched[4:close]) + assert fm["description"].strip() == "new single-line desc" + + def test_optimize_rejects_zero_iterations(tmp_path: Path) -> None: """Latent-crash guard: iterations < 1 raises ValueError, not IndexError.""" source_claude = _make_source_tree(tmp_path / "src")