diff --git a/Cargo.lock b/Cargo.lock index f6d983c1..645cd7c4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2665,6 +2665,12 @@ version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "47e1ffaa40ddd1f3ed91f717a33c8c0ee23fff369e3aa8772b9605cc1d22f4c3" +[[package]] +name = "md5" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771" + [[package]] name = "memchr" version = "2.8.0" @@ -3641,6 +3647,7 @@ dependencies = [ "aws-sdk-bedrockruntime", "aws-smithy-types", "axum 0.8.8", + "bytes", "chrono", "clap", "config", @@ -3656,6 +3663,7 @@ dependencies = [ "lettre", "libsql", "mail-parser", + "md5", "regex", "reqwest", "serde", diff --git a/Cargo.toml b/Cargo.toml index e95d7a31..58cbe6db 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -37,6 +37,7 @@ aws-config = { version = "1", features = ["behavior-version-latest"], optional = aws-sdk-bedrockruntime = { version = "1", optional = true } aws-smithy-types = { version = "1", optional = true } axum = "0.8.8" +bytes = "1.9" chrono = "0.4.43" clap = { version = "4.5.54", features = ["derive", "env"] } config = "0.15.19" @@ -52,6 +53,7 @@ ignore = "0.4.25" lettre = { version = "0.11.7", features = ["tokio1", "tokio1-native-tls"] } libsql = "0.9.29" mail-parser = "0.11.1" +md5 = "0.7.0" regex = "1.12.2" reqwest = { version = "0.13.1", features = ["json"] } serde = { version = "1.0.228", features = ["derive"] } diff --git a/README.md b/README.md index 7f1b34b1..fd50a8a0 100644 --- a/README.md +++ b/README.md @@ -19,10 +19,11 @@ Please, note that as with any other LLM-based tools, Sashiko's output is probabi ## Features -- **Automated Ingestion**: Monitors mailing lists (using `lore.kernel.org`) for new patch submissions. -- **Manual Ingestion**: Can ingest patches from a local git repository. -- **Self-contained**: Doesn't depend on 3rd-party tools and can work with various LLM providers (Gemini and Claude are currently supported). -- **Web interface and CLI**: Provides a web interface and a CLI tool. Email support will be added soon. +- **Automated Ingestion**: Monitors mailing lists (`lore.kernel.org`), GitHub PRs, and GitLab MRs for new patch submissions. +- **Manual Ingestion**: Can ingest patches from local git repositories or specific PRs/MRs. +- **Forge Integration**: Automatic PR/MR review via GitHub and GitLab webhooks. +- **Self-contained**: Doesn't depend on 3rd-party tools and works with multiple LLM providers (Gemini and Claude currently supported). +- **Web interface and CLI**: Provides a web interface for monitoring and a CLI tool for local development. ## Prompts @@ -90,6 +91,10 @@ cargo install sashiko * **Server**: API server host and port. * **Git**: Path to the reference kernel repository. * **Review**: Concurrency and worktree settings. + * **Tools**: Configure which AI tools are enabled (optional). See [docs/TOOLS.md](docs/TOOLS.md) for details. + * **Forge**: GitHub/GitLab webhook integration (optional). See forge setup guides below. + * **Prompts**: Customize review stages and prompts (optional). See [docs/PROMPTS.md](docs/PROMPTS.md) for details. + * **Subsystems**: Map file patterns to subsystems for targeted reviews (optional). ### Configuring the LLM Provider diff --git a/Settings.toml b/Settings.toml index b1e3421e..3d3742fb 100644 --- a/Settings.toml +++ b/Settings.toml @@ -1,5 +1,22 @@ log_level = "info" +[project] +name = "Sashiko" +description = "" + +[forge] +enabled = false +# provider = "github" +# webhook_secret = "your-webhook-secret" +# api_token = "your-api-token" + +[subsystems] +# Regex map: email address pattern -> subsystem name +# mapping = [ +# { pattern = ".*linux-kernel@vger.kernel.org.*", name = "LKML" }, +# { pattern = ".*netdev@vger.kernel.org.*", name = "netdev" }, +# ] + [database] url = "sashiko.db" token = "" @@ -84,3 +101,16 @@ worktree_dir = "review_trees" timeout_seconds = 7200 max_retries = 3 ignore_files = ["MAINTAINERS", ".mailmap", ".gitignore", "LICENSES/"] + +[tools] +# enabled = ["read_files", "git_diff"] # allowlist (empty = all enabled) +# disabled = [] # denylist (takes precedence) + +[prompts] +# directory = "third_party/prompts/kernel" # local path +# directory = "https://example.com/prompts" # remote HTTP(S) +# directory = "git://github.com/user/prompts.git" # git repo +# stages_config = "stages.toml" + +[prompts.variables] +# Custom template variables for prompt substitution: {{variable_name}} diff --git a/designs/DESIGN_CUSTOM_PROMPTS_AND_TOOLS.md b/designs/DESIGN_CUSTOM_PROMPTS_AND_TOOLS.md new file mode 100644 index 00000000..d2819542 --- /dev/null +++ b/designs/DESIGN_CUSTOM_PROMPTS_AND_TOOLS.md @@ -0,0 +1,442 @@ +# Design: Customizable Review Prompts, Stages, and Tools + +## Goal + +Allow users to customize the AI review pipeline without forking or modifying +core source code. Users can override prompt content, reorder or disable review +stages, define template variables for prompts, filter built-in AI tools, and +register custom shell-based tools -- all through TOML configuration. + +## 1. Prompts Directory Resolution + +The review pipeline loads prompts from a directory. By default this is +`third_party/prompts/kernel`, but users can override it via configuration or +CLI flag. + +### 1.1 Resolution Sources (Priority Order) + +1. CLI flag: `--prompts ` +2. `Settings.toml`: `[prompts] directory = "..."` +3. Default: `third_party/prompts/kernel` + +### 1.2 Supported Directory Schemes + +The `directory` value in `[prompts]` supports three schemes: + +- **Local path**: Absolute or relative to the working directory. + ```toml + [prompts] + directory = "/opt/my-prompts" + # or + directory = "./custom-prompts" + ``` + +- **Git URL**: Cloned to `.sashiko-cache/prompts/{md5(url)}`. + ```toml + [prompts] + directory = "git://github.com/org/review-prompts.git" + # or + directory = "https://github.com/org/review-prompts.git" + ``` + +- **HTTP(S) URL**: Downloaded to `.sashiko-cache/prompts/{md5(url)}`. + ```toml + [prompts] + directory = "https://example.com/prompts" + ``` + +### 1.3 Caching Strategy + +Remote prompts are cached locally under `.sashiko-cache/prompts/` using the +MD5 hash of the source URL as directory name. On cache hit, the local copy is +reused without re-fetching. Git clones will eventually support `git pull` to +update (currently marked as TODO). + +### 1.4 Resolution Flow + +```text +PromptRegistry::with_settings(settings) + | + +--> resolve_prompts_directory(directory_string) + | | + | +-- starts with http(s):// --> download_remote_prompts() + | | cache in .sashiko-cache/prompts/{md5} + | | + | +-- starts with git:// or ends with .git --> clone_git_prompts() + | | cache in .sashiko-cache/prompts/{md5} + | | + | +-- otherwise --> local path resolution + | absolute: use as-is + | relative: join with cwd + | + +--> load_stages_config(base_dir, config_path) + | reads stages.toml from base_dir (or custom path) + | + +--> copy variables from PromptsSettings +``` + +### 1.5 Expected Directory Structure + +```text +prompts/ +├── stages.toml # Stage pipeline configuration +├── review-core.md # Core review identity/workflow +├── technical-patterns.md # Shared pattern catalog +├── false-positive-guide.md # Verification guidance +├── severity.md # Severity calibration +├── inline-template.md # Report format template +├── subsystem/ +│ ├── networking.md +│ ├── locking.md +│ └── ... +└── stages/ + ├── 01-analyze-goal.md + ├── 02-implementation.md + ├── 03-control-flow.md + ├── 04-resource-mgmt.md + ├── 05-locking.md + ├── 06-security.md + ├── 07-hardware.md + ├── 08-verification.md + └── 09-report.md +``` + +## 2. Stage Configuration (`stages.toml`) + +The review pipeline is organized into numbered stages. Each stage runs a +focused analysis pass over the patch. The `stages.toml` file controls which +stages run, what prompt files they load, and which supporting documents to +include. + +### 2.1 Data Model + +```rust +pub struct StagesConfig { + pub stages: Vec, +} + +pub struct StageDefinition { + pub number: u8, // Stage identifier (1-9+) + pub name: Option, // Display name + pub instruction_file: Option,// Custom prompt file path + pub supporting_files: Vec, // Guidance files to include + pub enabled: bool, // Default: true +} +``` + +### 2.2 Example Configuration + +```toml +[[stages]] +number = 1 +name = "Analyze commit main goal" +instruction_file = "stages/01-analyze-goal.md" +supporting_files = [] +enabled = true + +[[stages]] +number = 5 +name = "Locking and synchronization" +instruction_file = "stages/05-locking.md" +supporting_files = ["subsystem/locking.md"] +enabled = true + +# Disable a stage entirely +[[stages]] +number = 7 +enabled = false + +# Add a custom stage +[[stages]] +number = 11 +name = "Performance analysis" +instruction_file = "custom/performance.md" +supporting_files = ["custom/perf-patterns.md"] +enabled = true +``` + +### 2.3 Stage Prompt Loading + +For each stage, the prompt is resolved in priority order: + +1. **Custom file**: `instruction_file` from `stages.toml` (resolved relative + to the prompts base directory). +2. **Auto-discovered file**: `stages/{NN:02}-*.md` pattern in the base + directory. +3. **Hardcoded fallback**: Built-in instruction text for stages 1-9. + +After loading, the instruction content is passed through variable +substitution (see section 3), and the supporting files listed in +`supporting_files` are appended. + +### 2.4 Stage Filtering + +Stages can be filtered at two levels: + +- **Configuration**: `enabled = false` in `stages.toml` removes a stage from + the pipeline. This is applied during the planning pre-phase; disabled stages + are retained out of the planning AI's selection. +- **CLI**: `--stages 1,2,3,5` on the review binary selects specific stages to + run, overriding both the planning phase and the configuration. + +## 3. Template Variable Substitution + +Prompt files can contain `{{variable_name}}` placeholders that are replaced +at load time with values from the configuration. + +### 3.1 Built-in Variables + +| Variable | Value | +|------------|------------------------------------------| +| `{{date}}` | Current date in `YYYY-MM-DD` format | +| `{{year}}` | Current year in `YYYY` format | + +### 3.2 User-Defined Variables + +```toml +[prompts.variables] +organization = "ACME Corp" +soc = "ARMv8" +kernel_tree = "linux-stable" +``` + +These are substituted before built-in variables, so user variables take +precedence if names collide. + +### 3.3 Implementation + +```rust +fn substitute_variables(content: &str, variables: &HashMap) -> String { + let mut result = content.to_string(); + for (key, value) in variables { + let placeholder = format!("{{{{{}}}}}", key); // {{key}} + result = result.replace(&placeholder, value); + } + // Built-in variables + result = result.replace("{{date}}", &chrono::Utc::now().format("%Y-%m-%d")); + result = result.replace("{{year}}", &chrono::Utc::now().format("%Y")); + result +} +``` + +## 4. Tool Filtering + +The AI review pipeline provides 14 built-in tools (file reading, git +operations, search, etc.). Tool filtering allows users to restrict which tools +the AI can access during review. + +### 4.1 Modes + +- **Default**: All built-in tools enabled (no `[tools]` section needed). +- **Allowlist**: Only tools listed in `enabled` are available. +- **Denylist**: All tools available except those in `disabled`. +- **Combined**: `disabled` takes precedence over `enabled`. + +### 4.2 Configuration + +```toml +[tools] +# Allowlist: only these tools are available +enabled = ["read_files", "git_show", "git_diff", "search_file_content"] + +# Denylist: remove these (takes precedence over enabled) +disabled = ["git_checkout"] +``` + +### 4.3 Built-in Tools + +| Tool | Description | +|-----------------------|--------------------------------------------| +| `read_files` | Read file content (raw or smart mode) | +| `git_blame` | Show per-line revision and author | +| `git_diff` | Show changes between commits | +| `git_show` | Show objects (blobs, commits, tags) | +| `git_log` | Show commit logs | +| `git_status` | Show working tree status | +| `git_checkout` | Switch branches or restore files | +| `git_branch` | List branches | +| `git_tag` | List tags | +| `list_dir` | List directory contents | +| `search_file_content` | Grep for patterns in files | +| `find_files` | Find files by glob pattern | +| `TodoWrite` | Write TODO items for structured output | +| `read_prompt` | Read prompt files from prompts directory | + +### 4.4 Implementation + +```rust +pub fn with_config( + worktree_path: PathBuf, + prompts_path: Option, + tools_config: Option<&ToolsSettings>, +) -> Self { + let enabled_tools = tools_config.map(|config| { + let mut tools: HashSet = if config.enabled.is_empty() { + // All tools enabled, then subtract disabled + ALL_TOOL_NAMES.iter().cloned().collect() + } else { + config.enabled.iter().cloned().collect() + }; + for d in &config.disabled { + tools.remove(d); + } + tools.into_iter().collect() + }); + // ... +} +``` + +## 5. Custom Tool Definitions + +Users can define shell-based tools that the AI can invoke during review. This +enables integration with external analysis tools, custom linters, or +project-specific utilities. + +### 5.1 Definition Structure + +```rust +pub struct CustomToolDefinition { + pub name: String, // Tool identifier + pub description: String, // Description shown to AI + pub parameters: String, // JSON Schema for tool parameters + pub command: String, // Shell command template + pub allowed_paths: Vec, // Path allowlist (security) +} +``` + +### 5.2 Configuration + +```toml +[[tools.custom]] +name = "run_sparse" +description = "Run sparse static analysis on a C source file" +parameters = '{"type": "object", "properties": {"file": {"type": "string", "description": "Path to the C source file"}}, "required": ["file"]}' +command = "sparse {file}" +allowed_paths = ["drivers/", "fs/", "kernel/", "mm/", "net/"] +``` + +### 5.3 Security Validation + +Custom tools are validated at registration time: + +1. **Blocked command patterns**: `rm -rf`, `sudo`, `curl`, `wget`, `dd `, + `mkfs`. If the command template contains any of these patterns, the tool + is rejected. + +2. **Path containment**: Every entry in `allowed_paths` is validated to not + escape the worktree directory via path traversal. + +### 5.4 Execution Flow + +```text +AI requests tool call: run_sparse({"file": "drivers/gpu/drm/foo.c"}) + | + +--> Parameter substitution + | command = "sparse drivers/gpu/drm/foo.c" + | + +--> Path allowlist check (if allowed_paths specified) + | "drivers/gpu/drm/foo.c" starts with "drivers/" -> allowed + | + +--> Execute: sh -c "sparse drivers/gpu/drm/foo.c" + | working directory: worktree + | + +--> Return stdout (or error if non-zero exit) +``` + +Parameter substitution replaces `{param_name}` in the command template with +the corresponding argument value. Array values are joined with spaces. + +### 5.5 Tool Dispatch + +When the AI calls a tool, `ToolBox::call()` checks custom tools first (by +name match), then dispatches to built-in tools. Custom tools take priority +over built-in tools with the same name. + +## 6. CLI Integration + +### 6.1 Review Binary Flags + +The `review` binary accepts these flags for customization: + +| Flag | Description | +|-----------------------|------------------------------------------| +| `--prompts ` | Override prompts directory | +| `--stages <1,2,3>` | Select specific stages to run | +| `--ai_provider

` | Override AI provider from settings | +| `--custom_prompt ` | Append text to the user task prompt | + +### 6.2 Priority Chain + +```text +CLI flag > Settings.toml > Built-in default + +--prompts [prompts] "third_party/prompts/kernel" +--stages stages.toml planning pre-phase decides +--ai_provider [ai] provider Settings.toml value +``` + +### 6.3 Settings.toml Integration + +```toml +[prompts] +directory = "git://github.com/org/custom-prompts.git" +stages_config = "config/my-stages.toml" + +[prompts.variables] +organization = "ACME Corp" + +[tools] +enabled = ["read_files", "git_show", "git_diff", "search_file_content"] +disabled = [] + +[[tools.custom]] +name = "custom_lint" +description = "Run project-specific linter" +parameters = '{"type": "object", "properties": {"path": {"type": "string"}}}' +command = "./scripts/lint.sh {path}" +allowed_paths = ["src/"] +``` + +## 7. Implementation Plan + +### `src/settings.rs` +- Add `PromptsSettings` struct with `directory`, `stages_config`, `variables`. +- Add `ToolsSettings` struct with `enabled`, `disabled`, `custom`. +- Add `CustomToolDefinition` struct. +- Add `tools: Option` and `prompts: Option` + to `Settings`. +- Add `get_prompts_dir()` helper. + +### `src/worker/prompts.rs` +- Add `StagesConfig` and `StageDefinition` structs for TOML parsing. +- Add `PromptRegistry.stages_config` and `variables` fields. +- Implement `PromptRegistry::with_settings()` for directory resolution. +- Implement `resolve_prompts_directory()` with local/git/http dispatch. +- Implement `download_remote_prompts()` and `clone_git_prompts()` with caching. +- Implement `load_stages_config()` for TOML parsing. +- Implement `substitute_variables()` for `{{key}}` replacement. +- Update `get_stage_prompt()` to load from stages config and apply + substitution. +- Update planning pre-phase to filter disabled stages from selection. + +### `src/worker/tools.rs` +- Add `enabled_tools: Option>` and + `custom_tools: Vec<(AiTool, CustomToolDefinition)>` fields to `ToolBox`. +- Implement `ToolBox::with_config()` constructor. +- Implement `is_tool_enabled()` for allowlist/denylist filtering. +- Implement `register_custom_tools()` with validation. +- Implement `validate_tool_security()` for blocked pattern and path checks. +- Implement `execute_custom_tool()` with parameter substitution. +- Update `get_declarations_generic()` to include custom tools and apply + filtering. +- Update `call()` to dispatch custom tools before built-in tools. + +### `src/bin/review.rs` +- Update `--prompts` flag from `PathBuf` to `Option`. +- Add `--stages` flag accepting comma-separated stage numbers. +- Wire `PromptRegistry::with_settings()` when settings.prompts is available. +- Wire `ToolBox::with_config()` with settings.tools. + +### `third_party/prompts/kernel/stages.toml` +- Default 9-stage configuration matching the built-in pipeline. +- Commented examples showing custom stages and stage disabling. diff --git a/docs/FORGE_SETUP.md b/docs/FORGE_SETUP.md new file mode 100644 index 00000000..59bbb72f --- /dev/null +++ b/docs/FORGE_SETUP.md @@ -0,0 +1,426 @@ +# Forge Integration Setup + +This guide explains how to integrate Sashiko with Git forges (GitHub, GitLab, etc.) for automatic code review. + +## Overview + +Sashiko can integrate with Git forges to automatically review Pull Requests (PRs) or Merge Requests (MRs) when they are opened or updated. This enables automated code review workflows for your development teams. + +## Supported Forges + +Currently supported: +- **GitHub** - Full webhook integration ([GitHub Setup Guide](GITHUB_SETUP.md)) +- **GitLab** - Full webhook integration ([GitLab Setup Guide](GITLAB_SETUP.md)) + +## Forge Requirements + +For a Git forge to be compatible with Sashiko, it must support: + +### 1. Webhook Delivery + +**Required capabilities:** +- HTTP/HTTPS webhook delivery to external endpoints +- JSON payload format +- Custom headers (for event identification) +- Configurable event triggers (PR/MR events) + +**Event triggers needed:** +- Pull/Merge request opened +- Pull/Merge request updated (new commits) +- Pull/Merge request reopened + +**Optional but recommended:** +- Webhook secret/signature validation +- Webhook delivery retry logic +- Webhook delivery status/logs + +### 2. API Access + +**Required endpoints:** +- Fetch PR/MR details by ID +- Retrieve commit information +- Access diff/patch data + +**Authentication:** +- Public repository access (no auth for public repos) +- Token-based authentication (for private repos) +- Rate limiting information + +**Optional but useful:** +- Post comments on PRs/MRs +- Update PR/MR status +- Create review summaries + +### 3. Git Repository Access + +**Required:** +- HTTP(S) clone URLs accessible from Sashiko server +- Support for standard Git protocol +- Ability to fetch specific commit ranges + +**Authentication options:** +- Anonymous access for public repos +- SSH key authentication +- HTTP(S) token authentication +- Deploy keys + +### 4. Webhook Payload Requirements + +The webhook must provide: + +**Minimal required fields:** +```json +{ + "event_type": "pull_request|merge_request", + "action": "opened|updated|reopened", + "pr_or_mr": { + "id": "", + "title": "", + "base_commit": "", + "head_commit": "", + "url": "" + }, + "repository": { + "clone_url": "" + } +} +``` + +**Recommended additional fields:** +- Source branch name +- Target branch name +- Author information +- Commit count +- Changed files list +- PR/MR state (open, closed, merged) + +## General Configuration + +### Enable Forge Integration + +Edit `Settings.toml`: + +```toml +[forge] +enabled = true + +# Optional: Configure subsystem mapping for targeted reviews +[subsystems] +mapping = [ + { pattern = ".*drivers/.*", name = "Drivers" }, + { pattern = ".*net/.*", name = "Networking" }, + { pattern = ".*fs/.*", name = "Filesystems" }, + { pattern = ".*mm/.*", name = "Memory Management" }, +] +``` + +### Server Configuration + +Ensure your server is configured to accept webhooks: + +```toml +[server] +host = "127.0.0.1" # Listen address +port = 8080 # Port for webhook endpoint +``` + +**Security considerations:** +- By default, Sashiko only accepts webhooks from localhost +- For production, use `--enable-unsafe-all-submit` flag (behind firewall/proxy) +- Always use HTTPS in production with valid certificates +- Implement webhook signature validation when available + +### Git Configuration + +Configure the reference repository: + +```toml +[git] +repository_path = "/path/to/kernel/repo" +``` + +**Note:** The repository must be accessible and contain the commits referenced in webhooks. + +### Review Configuration + +```toml +[review] +concurrency = 20 # Parallel review capacity +worktree_dir = "review_trees" # Temporary worktrees +timeout_seconds = 7200 # 2 hours per review +``` + +## Webhook Endpoints + +Sashiko provides the following webhook endpoints: + +- **GitHub**: `http://your-server:8080/api/webhook/github` +- **GitLab**: `http://your-server:8080/api/webhook/gitlab` + +### Expected HTTP Headers + +**GitHub:** +``` +X-GitHub-Event: pull_request +Content-Type: application/json +``` + +**GitLab:** +``` +X-Gitlab-Event: Merge Request Hook +Content-Type: application/json +``` + +### Response Codes + +- `200 OK` - Webhook accepted, review queued +- `400 Bad Request` - Invalid payload or missing required fields +- `403 Forbidden` - Forge integration disabled +- `500 Internal Server Error` - Server error processing webhook + +## Testing Integration + +### 1. Test with Synthetic Webhook + +Use the provided test scripts to verify the endpoint: + +**GitHub:** +```bash +./scripts/test_github_webhook.sh +``` + +**GitLab:** +```bash +./scripts/test_gitlab_webhook.sh +``` + +### 2. Test with Real PR/MR + +Trigger a review for a specific PR/MR: + +**GitHub:** +```bash +./scripts/trigger_github_pr_review.sh owner/repo 123 +``` + +**GitLab:** +```bash +./scripts/trigger_gitlab_mr_review.sh 123 +``` + +### 3. Verify Review Queue + +Check the web UI to confirm the review was queued: +``` +http://localhost:8080/ +``` + +Look for: +- Review in "Pending" or "In Progress" state +- Correct PR/MR number and title +- Expected commit range + +## Implementing Support for New Forges + +To add support for a new Git forge, you need to: + +### 1. Understand the Forge's Webhook Format + +Document: +- Webhook payload structure +- HTTP headers used for event identification +- Available event types and actions +- Authentication/signature mechanism + +### 2. Create a ForgeProvider Implementation + +Implement the `ForgeProvider` trait in Rust: + +```rust +pub trait ForgeProvider { + /// Parse the webhook payload and extract review information + fn parse_webhook(&self, headers: &HeaderMap, body: &[u8]) + -> Result; + + /// Optional: Post review results back to the forge + fn post_review(&self, pr_id: &str, review: &Review) + -> Result<(), Error>; +} +``` + +### 3. Register the Webhook Handler + +Add a new endpoint in the API router: + +```rust +// In src/api.rs or equivalent +app.route("/api/webhook/yourforge", post(handle_yourforge_webhook)) +``` + +### 4. Add Configuration + +Extend `Settings.toml` if forge-specific config is needed: + +```toml +[forge.yourforge] +api_token = "optional-token" +api_endpoint = "https://api.yourforge.com" +``` + +### 5. Create Documentation and Scripts + +- Write setup guide (e.g., `docs/YOURFORGE_SETUP.md`) +- Create test script (`scripts/test_yourforge_webhook.sh`) +- Create trigger script (`scripts/trigger_yourforge_review.sh`) + +## Troubleshooting + +### Webhook Not Received + +**Symptoms:** No log entries when PR/MR is opened + +**Check:** +- Server is running: `curl http://localhost:8080/` +- Firewall allows inbound on configured port +- Webhook URL is accessible from forge servers +- Forge shows successful delivery in webhook logs + +**Solutions:** +- Use ngrok/tunneling for local testing +- Check server logs: `RUST_LOG=debug cargo run` +- Verify webhook URL in forge configuration + +### Webhook Rejected (403 Forbidden) + +**Symptoms:** Forge shows 403 response + +**Cause:** Forge integration disabled or security restrictions + +**Solutions:** +- Set `forge.enabled = true` in Settings.toml +- Restart Sashiko daemon +- Use `--enable-unsafe-all-submit` for testing (not production) + +### Review Not Starting + +**Symptoms:** Webhook accepted but review doesn't begin + +**Check:** +- LLM API key configured: `echo $LLM_API_KEY` +- Git repository contains referenced commits +- Sashiko has network access to clone repository +- Disk space available in worktree directory + +**Solutions:** +- Check logs for specific errors +- Verify git repository accessibility +- Test LLM provider connection +- Ensure commits exist in configured repository + +### Commits Not Found + +**Symptoms:** Review fails with "commit not found" error + +**Causes:** +- Repository mismatch (webhook references different repo) +- Commits not yet fetched to local repository +- Shallow clone missing history + +**Solutions:** +- Ensure `git.repository_path` points to correct repo +- Run `git fetch --all` in repository +- Use full clone (not shallow) for reference repository + +## Security Best Practices + +### Production Deployment + +1. **Use HTTPS with valid certificates** + - Set up reverse proxy (nginx, Apache, Caddy) + - Obtain SSL/TLS certificates (Let's Encrypt) + - Terminate TLS at proxy, forward to Sashiko + +2. **Implement authentication** + - Use webhook secrets when available + - Validate webhook signatures + - Restrict by source IP (if forge IPs are known) + +3. **Network isolation** + - Run Sashiko on private network + - Use VPN or SSH tunneling for access + - Firewall rules to limit exposure + +4. **Rate limiting** + - Configure at reverse proxy level + - Prevent abuse and DoS attempts + - Monitor webhook delivery rates + +5. **Audit logging** + - Log all webhook deliveries + - Track review queue metrics + - Monitor for unusual patterns + +### Example Nginx Configuration + +```nginx +server { + listen 443 ssl http2; + server_name sashiko.example.com; + + ssl_certificate /etc/letsencrypt/live/sashiko.example.com/fullchain.pem; + ssl_certificate_key /etc/letsencrypt/live/sashiko.example.com/privkey.pem; + + # Security headers + add_header Strict-Transport-Security "max-age=31536000" always; + add_header X-Frame-Options DENY; + add_header X-Content-Type-Options nosniff; + + # Rate limiting + limit_req_zone $binary_remote_addr zone=webhook:10m rate=10r/m; + limit_req zone=webhook burst=5; + + location /api/webhook/ { + proxy_pass http://127.0.0.1:8080; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + } + + location / { + proxy_pass http://127.0.0.1:8080; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + } +} +``` + +## Forge Comparison + +| Feature | GitHub | GitLab | Requirements | +|---------|--------|--------|--------------| +| Webhook delivery | ✅ | ✅ | **Required** | +| JSON payloads | ✅ | ✅ | **Required** | +| Event filtering | ✅ | ✅ | **Required** | +| Webhook secrets | ✅ | ✅ | Recommended | +| Signature validation | ✅ | ✅ | Recommended | +| Delivery logs | ✅ | ✅ | Recommended | +| Public API | ✅ | ✅ | **Required** | +| Token auth | ✅ | ✅ | **Required** | +| SSH clone | ✅ | ✅ | **Required** | +| HTTPS clone | ✅ | ✅ | **Required** | +| Comment API | ✅ | ✅ | Optional | +| Status API | ✅ | ✅ | Optional | + +## Related Documentation + +- [GitHub Setup Guide](GITHUB_SETUP.md) - Detailed GitHub integration +- [GitLab Setup Guide](GITLAB_SETUP.md) - Detailed GitLab integration +- [README.md](../README.md) - Main project documentation +- [Settings.toml](../Settings.toml) - Configuration reference + +## Getting Help + +- **Mailing List:** sashiko@lists.linux.dev ([archive](https://lore.kernel.org/sashiko)) +- **GitHub Issues:** [Report bugs or request features](https://github.com/sashiko-dev/sashiko/issues) +- **Community:** Join discussions about forge integration and feature requests diff --git a/docs/GITHUB_SETUP.md b/docs/GITHUB_SETUP.md new file mode 100644 index 00000000..c282bed0 --- /dev/null +++ b/docs/GITHUB_SETUP.md @@ -0,0 +1,319 @@ +# GitHub Integration Setup + +Complete guide for configuring Sashiko to automatically review GitHub Pull Requests via webhooks. + +> **Note:** For general forge integration requirements and architecture, see [FORGE_SETUP.md](FORGE_SETUP.md). This guide covers GitHub-specific configuration. + +## Quick Start (5 Minutes) + +Get Sashiko reviewing GitHub Pull Requests quickly. + +### Step 1: Configure LLM Provider + +Set your API key: +```bash +# For Gemini +export LLM_API_KEY="your-gemini-api-key" + +# For Claude +export ANTHROPIC_API_KEY="your-claude-api-key" +``` + +Verify `Settings.toml` has your LLM provider configured: +```toml +[ai] +provider = "gemini" # or "claude" +model = "gemini-3.1-pro-preview" # or "claude-sonnet-4-6" +``` + +### Step 2: Start Sashiko Server + +```bash +cargo run --release +``` + +You should see output like: +``` +Server running on http://127.0.0.1:8080 +``` + +**Verify it's running:** +```bash +curl http://localhost:8080/ +``` + +### Step 3: Test with a Pull Request + +#### Option A: Test with Real GitHub PR + +Use the trigger script to review any public GitHub PR: +```bash +./scripts/trigger_github_pr_review.sh torvalds/linux 12345 +``` + +Replace `torvalds/linux` with `owner/repo` and `12345` with a PR number. + +#### Option B: Test with Synthetic Webhook + +Send a test webhook payload to verify the endpoint: +```bash +./scripts/test_github_webhook.sh +``` + +#### Option C: Test with Local Commits + +Review your own local changes: +```bash +# Review last 3 commits +cargo run --bin sashiko-cli -- submit HEAD~3..HEAD +``` + +### Step 4: Monitor Progress + +Open your browser to the web UI: +``` +http://localhost:8080/ +``` + +You'll see: +- Review queue status +- Current review progress +- Completed reviews with findings + +### Step 5: Configure Webhook (Optional) + +To automatically review PRs when they're opened on GitHub: + +1. Go to your GitHub repository → Settings → Webhooks +2. Add webhook: + - **URL:** `http://your-server:8080/api/webhook/github` + - **Content type:** `application/json` + - **Events:** Pull requests only +3. For local development, use ngrok or SSH tunnel: + ```bash + # Using ngrok + ngrok http 8080 + # Use the ngrok URL in webhook configuration + ``` + +## Prerequisites + +- Sashiko running in server mode (daemon) +- Server accessible from GitHub (public URL or tunneling solution like ngrok) +- Admin rights on GitHub repository (to configure webhooks) +- Sashiko configured with valid LLM API credentials + +## Configuration + +### 1. Configure Server Settings + +Sashiko's webhook endpoints are available by default when running the daemon. The server configuration in `Settings.toml`: + +```toml +[server] +host = "127.0.0.1" +port = 8080 +``` + +**Security Note:** By default, Sashiko only accepts webhook requests from `localhost` for security. To accept webhooks from GitHub's servers, you have two options: + +**Option A: SSH Tunnel (Recommended for testing)** +```bash +# On your development machine +ssh -R 8080:localhost:8080 your-public-server.com + +# GitHub webhook URL would be: +# http://your-public-server.com:8080/api/webhook/github +``` + +**Option B: Allow all submit (Use with caution)** +```bash +# Start Sashiko with the unsafe flag +cargo run --release -- --enable-unsafe-all-submit +``` + +**⚠️ WARNING:** Option B allows anyone who can reach your server to trigger reviews. Only use this in trusted networks or behind authentication layers (reverse proxy with auth, firewall rules, etc.). + +### 2. Set up GitHub Webhook + +1. Navigate to your repository on GitHub +2. Go to **Settings** → **Webhooks** → **Add webhook** +3. Configure the webhook: + - **Payload URL:** `http://your-server:8080/api/webhook/github` + - Replace `your-server` with your actual server address + - Use port `8080` (or your configured server port) + - **Content type:** `application/json` + - **Secret:** Leave empty (signature validation not yet implemented) + - **Which events would you like to trigger this webhook?** + - Select: **Let me select individual events** + - Check: ✓ **Pull requests** + - Uncheck all other events + - **Active:** ✓ Enabled + +4. Click **Add webhook** + +### 3. Verify Webhook Delivery + +After creating the webhook, GitHub will immediately send a test `ping` event: + +1. In GitHub webhook settings, click on your newly created webhook +2. Navigate to the **Recent Deliveries** tab +3. You should see a `ping` event with a green checkmark +4. If the delivery failed, check: + - Server is running: `curl http://localhost:8080/` + - Firewall allows incoming connections + - URL is correct and accessible from the internet + +### 4. Test the Integration + +**Option A: Use the test script** +```bash +./test_github_webhook.sh +``` + +This sends a synthetic webhook payload to verify the endpoint is working. + +**Option B: Trigger a real PR review** +```bash +./trigger_github_pr_review.sh owner/repo 123 +``` + +This fetches real PR data from GitHub's API and triggers a review. + +**Option C: Open a real Pull Request** + +The most authentic test - simply open a new PR on your repository. Sashiko should: +1. Receive the webhook +2. Log the PR details +3. Queue the review +4. Fetch the commits +5. Start the AI review process + +Check the web UI at `http://localhost:8080/` to see the review progress. + +## Troubleshooting + +### Webhook not received + +**Symptoms:** No logs in Sashiko output when PR is opened + +**Solutions:** +- Verify server is running: `curl http://localhost:8080/` +- Check firewall allows incoming connections on port 8080 +- Verify webhook URL is accessible from internet (use ngrok or similar for testing) +- Check GitHub webhook delivery status in repository settings + +### Permission denied (403 Forbidden) + +**Symptoms:** GitHub shows delivery failed with 403 status + +**Cause:** Sashiko's default security blocks non-localhost requests + +**Solutions:** +1. **Recommended:** Use SSH tunnel or reverse proxy from localhost +2. **Quick test:** Run with `--enable-unsafe-all-submit` flag +3. **Production:** Set up reverse proxy with TLS and authentication + +### Webhook received but review not starting + +**Symptoms:** Logs show webhook received but no review begins + +**Solutions:** +- Check LLM API key is configured: `echo $LLM_API_KEY` +- Verify git repository is accessible: check `git.repository_path` in `Settings.toml` +- Look for errors in Sashiko logs +- Check that the commit hash exists in your configured repository + +### Review fails immediately + +**Symptoms:** Review status shows "failed" in web UI + +**Solutions:** +- Check Sashiko logs for specific error messages +- Verify git repository has the referenced commits +- Ensure LLM provider is accessible and API key is valid +- Check disk space in `worktree_dir` directory + +## Security Considerations + +**⚠️ IMPORTANT:** GitHub webhook signature validation is not yet implemented. + +For production deployments: + +1. **Use HTTPS:** Set up a reverse proxy with TLS + ```nginx + # Example nginx config + server { + listen 443 ssl; + server_name sashiko.example.com; + + ssl_certificate /path/to/cert.pem; + ssl_certificate_key /path/to/key.pem; + + location /api/webhook/ { + proxy_pass http://localhost:8080; + proxy_set_header X-Real-IP $remote_addr; + } + } + ``` + +2. **Implement webhook secrets:** Future enhancement - see GitHub's [webhook security guide](https://docs.github.com/en/webhooks/using-webhooks/validating-webhook-deliveries) + +3. **Network isolation:** Run Sashiko on private network and use SSH tunneling or VPN + +4. **Rate limiting:** Configure reverse proxy or firewall to prevent abuse + +## Advanced Configuration + +### Custom Port + +Modify `Settings.toml`: +```toml +[server] +host = "0.0.0.0" # Listen on all interfaces +port = 8080 # Custom port +``` + +Then update webhook URL: `http://your-server:8080/api/webhook/github` + +### Multiple Repositories + +Sashiko can handle webhooks from multiple repositories. Simply add the same webhook configuration to each repository you want to monitor. + +**Note:** All repositories must be accessible from the git repository configured in `Settings.toml` or Sashiko will fail to fetch the commits. + +## Webhook Payload Reference + +Sashiko processes the following fields from GitHub's `pull_request` webhook: + +```json +{ + "action": "opened", + "pull_request": { + "number": 123, + "title": "Fix memory leak in driver", + "html_url": "https://github.com/owner/repo/pull/123", + "head": { + "sha": "abc123..." + }, + "base": { + "sha": "def456..." + } + } +} +``` + +Supported actions: `opened`, `reopened`, `synchronize` (new commits pushed) + +## See Also + +- Quick start content is now in the top section of this document +- [FORGE_SETUP.md](FORGE_SETUP.md) - General forge integration guide +- [GITLAB_SETUP.md](GITLAB_SETUP.md) - GitLab integration setup +- [README.md](../README.md) - Main project documentation +- [Settings.toml](../Settings.toml) - Configuration reference + +## Getting Help + +- **Mailing List:** sashiko@lists.linux.dev ([lore archive](https://lore.kernel.org/sashiko)) +- **GitHub Issues:** [Report bugs or request features](https://github.com/sashiko-dev/sashiko/issues) diff --git a/docs/GITLAB_SETUP.md b/docs/GITLAB_SETUP.md new file mode 100644 index 00000000..07006f12 --- /dev/null +++ b/docs/GITLAB_SETUP.md @@ -0,0 +1,244 @@ +# GitLab Webhook Setup for Sashiko + +This guide explains how to configure GitLab to automatically trigger Sashiko reviews for merge requests. + +> **Note:** For general forge integration requirements and architecture, see [FORGE_SETUP.md](FORGE_SETUP.md). This guide covers GitLab-specific configuration. + +## Quick Start (5 Minutes) + +### Step 1: Enable Forge Mode + +Edit `Settings.toml`: + +```toml +[forge] +enabled = true +``` + +Restart Sashiko. + +### Step 2: Verify Server is Ready + +```bash +./scripts/check_server_config.sh +``` + +Expected output: "✓ Server is ready for GitLab webhooks!" + +### Step 3: Test with a Specific MR + +```bash +# Replace 123 with an actual MR number +./scripts/trigger_gitlab_mr_review.sh 123 +``` + +This script will: +1. Fetch MR details from GitLab API +2. Send a simulated webhook to Sashiko +3. Show the response + +### Step 4: Monitor Reviews + +View reviews at: http://localhost:9080/ + +Reviews for GitLab MRs will display as: +- **Subject**: `!: ` +- **Author**: Original commit author +- **Parts**: Number of commits in the MR + +## Prerequisites + +1. Sashiko server running and accessible +2. `forge.enabled = true` in `Settings.toml` +3. Admin/Maintainer access to your GitLab project + +## Configuration Steps + +### 1. Enable Forge Mode in Sashiko + +Edit `Settings.toml`: + +```toml +[forge] +enabled = true + +[subsystems] +# Optional: Map file paths to subsystems +mapping = [ + { pattern = ".*drivers/.*", name = "Drivers" }, + { pattern = ".*net/.*", name = "Networking" }, + { pattern = ".*fs/.*", name = "Filesystems" }, +] +``` + +Restart Sashiko after changing the configuration. + +### 2. Configure GitLab Webhook + +1. Go to your GitLab project: + ``` + https://gitlab.com/your-org/your-project/-/settings/integrations + ``` + +2. Click **Add new webhook** + +3. Configure the webhook: + + **URL:** + ``` + http://localhost:9080/api/webhook/gitlab + ``` + + **Secret token:** (Optional - currently not validated, see security note below) + + **Trigger:** + - ✓ Merge request events + + **SSL verification:** + - If using HTTPS with valid cert: Enable + - If using HTTP or self-signed: Disable + +4. Click **Add webhook** + +### 3. Test the Webhook + +#### Option A: Use GitLab's Test Feature + +1. Scroll down to "Project Hooks" +2. Find your webhook +3. Click **Test** → **Merge Request events** +4. Check the response (should be 200 OK) + +#### Option B: Use the Test Script + +```bash +# Make the script executable +chmod +x test_gitlab_webhook.sh + +# Run the test +./test_gitlab_webhook.sh +``` + +#### Option C: Trigger Review for Specific MR + +```bash +# Make the script executable +chmod +x trigger_gitlab_mr_review.sh + +# Trigger review for MR !123 +./trigger_gitlab_mr_review.sh 123 +``` + +### 4. Verify Reviews are Running + +1. Open a new MR or update an existing one +2. Check Sashiko web UI: `http://localhost:9080/` +3. Check Sashiko logs for: + ``` + GitLab MR !123 open. Base: abc123..., Head: def456... + ``` + +## Webhook Payload Details + +GitLab sends webhooks with: + +**Headers:** +- `X-Gitlab-Event: Merge Request Hook` +- `Content-Type: application/json` + +**Actions Handled:** +- `open` - New MR created +- `update` - MR updated with new commits +- `reopen` - Closed MR reopened + +**Actions Ignored:** +- `close` - MR closed +- `merge` - MR merged +- `approved`/`unapproved` - Approval state changes + +## Troubleshooting + +### Webhook Returns 403 Forbidden + +**Cause:** Forge integration is disabled + +**Fix:** Set `forge.enabled = true` in `Settings.toml` and restart Sashiko + +### Webhook Returns 400 Bad Request + +**Cause:** Invalid payload structure + +**Check:** +1. Verify webhook is configured for "Merge request events" +2. Check Sashiko logs for parsing errors +3. Ensure `X-Gitlab-Event` header is set correctly + +### Reviews Not Appearing + +**Check:** +1. Sashiko logs for errors +2. Git repository is accessible from Sashiko server +3. Base and head SHAs are valid +4. FetchAgent is running (check logs) + +### Network Connectivity Issues + +If GitLab cannot reach your server: + +1. Ensure the server is publicly accessible (or use a tunnel) +2. Check firewall rules allow inbound on port 9080 +3. Consider using ngrok for testing: + ```bash + ngrok http 9080 + # Use the ngrok URL in webhook config + ``` + +## Security Notes + +⚠️ **IMPORTANT:** The current implementation does NOT validate webhook signatures. + +This means anyone who knows your webhook URL can trigger reviews. + +**Recommended for production:** +1. Implement webhook signature validation (see issue in code review) +2. Use HTTPS with valid certificates +3. Restrict network access to known GitLab IPs +4. Set a strong webhook secret token in GitLab + +## Manual Testing + +You can manually trigger reviews without webhooks: + +```bash +# Using the CLI tool (if implemented) +sashiko review \ + --repo https://gitlab.com/your-org/your-project.git \ + --range abc123..def456 + +# Or by directly calling the API +curl -X POST http://localhost:9080/api/submit \ + -H "Content-Type: application/json" \ + -d '{ + "repo_url": "https://...", + "commit_hash": "abc123..def456" + }' +``` + +## Example MRs to Test + +Test with a specific MR from your project: + +```bash +# Test with a specific MR (use your URL-encoded project path) +./trigger_gitlab_mr_review.sh 1234 your-org%2Fyour-project + +# Or manually find MRs at: +# https://gitlab.com/your-org/your-project/-/merge_requests +``` + +## See Also + +- [FORGE_SETUP.md](FORGE_SETUP.md) - General forge integration guide +- [GITHUB_SETUP.md](GITHUB_SETUP.md) - GitHub integration setup +- [README.md](../README.md) - Main project documentation +- [Settings.toml](../Settings.toml) - Configuration reference diff --git a/docs/PROMPTS.md b/docs/PROMPTS.md new file mode 100644 index 00000000..1c6c62ce --- /dev/null +++ b/docs/PROMPTS.md @@ -0,0 +1,111 @@ +# Prompt Customization + +## Configuration + +Set prompts directory in `Settings.toml`: + +```toml +[prompts] +# Local path +directory = "./my-prompts" + +# Remote Git URL (cached locally) +directory = "git://github.com/org/prompts.git" +``` + +## Stage Management + +Sashiko uses a multi-stage review process (1-9 default). Customize via `stages.toml` in your prompts directory: + +```toml +# Disable a stage +[[stages]] +number = 7 +enabled = false + +# Add custom stage +[[stages]] +number = 10 +name = "Performance" +instruction_file = "custom/perf.md" +supporting_files = ["patterns.md"] +``` + +### Default Stages +1. **Analyze goal**: Architectural intent. +2. **Implementation**: High-level correctness. +3. **Control flow**: Execution paths. +4. **Resource management**: Memory and handles. +5. **Locking**: Concurrency safety. +6. **Security**: Vulnerability audit. +7. **Hardware**: Device-specific logic. +8. **Verification**: Severity assessment. +9. **Report**: Final summary generation. + +## Template Variables + +Define in `Settings.toml`: + +```toml +[prompts.variables] +project = "Linux Kernel" +subsystem = "network" +``` + +Use in markdown files: `Review for {{project}}, focusing on {{subsystem}}.` + +### Built-in Variables +- `{{date}}`: Current date. +- `{{year}}`: Current year. + +## Custom Directory Structure + +```text +my-prompts/ +├── stages.toml # Custom stage config +├── stages/ # Stage markdown files +│ ├── 01-goal.md +│ └── ... +├── technical-patterns.md # Supporting context +└── tool.md # Tool instructions +``` + +## Examples + +### Custom Configurations +```toml +# Security-focused +[prompts] +directory = "./security-prompts" +[prompts.variables] +focus_area = "memory safety and input validation" + +# Performance-focused +[prompts.variables] +focus_area = "algorithmic complexity and cache efficiency" + +# Subsystem-specific (Git remote) +[prompts] +directory = "git://github.com/myorg/networking-prompts.git" +[prompts.variables] +subsystem = "networking" +``` + +### Advanced stages.toml +```toml +# Reorder: Run security (6) before control flow (3) +[[stages]] +number = 3 +instruction_file = "stages/06-security.md" + +[[stages]] +number = 6 +instruction_file = "stages/03-control-flow.md" + +# Add custom analysis +[[stages]] +number = 10 +name = "Performance" +instruction_file = "custom/performance.md" +supporting_files = ["custom/perf-patterns.md"] +``` diff --git a/docs/SETTINGS.md b/docs/SETTINGS.md new file mode 100644 index 00000000..094b12a8 --- /dev/null +++ b/docs/SETTINGS.md @@ -0,0 +1,65 @@ +# Sashiko Settings + +Terse guide for `Settings.toml`. + +## Global +- `log_level`: verbosity. `info`, `debug`, `error`. + +## [project] +- `name`: display name. +- `description`: project info. + +## [forge] +- `enabled`: use GitHub/GitLab webhooks. `true` or `false`. +- `provider`: `github` or `gitlab`. +- `webhook_secret`: auth token for webhook. +- `api_token`: token for PR comments/actions. + +## [subsystems] +- `mapping`: regex link email to subsystem name. + - `pattern`: email regex. + - `name`: subsystem label. + +## [database] +- `url`: SQLite file path. `sashiko.db`. +- `token`: secret for DB auth (if needed). + +## [mailing_lists] +- `track`: list of Lore names to monitor. `linux-kernel`, `netdev`. + +## [nntp] +- `server`: Lore NNTP host. `nntp.lore.kernel.org`. +- `port`: NNTP port. Default `119`. + +## [ai] +- `provider`: `gemini`, `openai`, `claude`, `bedrock`, `claude-cli`. +- `model`: model ID. +- `max_input_tokens`: cap for context. +- `max_interactions`: loop limit. +- `temperature`: randomness. `0.0` to `1.0`. + +## [ai.claude] +- `prompt_caching`: reuse context. Save tokens. + +## [server] +- `host`: API listen address. `127.0.0.1`. +- `port`: API port. `8080`. + +## [git] +- `repository_path`: local path to target repo. + +## [review] +- `concurrency`: max simultaneous reviews. +- `worktree_dir`: path for temporary git worktrees. +- `timeout_seconds`: kill stuck review. +- `max_retries`: attempt count on transient fail. +- `ignore_files`: files to skip in review. + +## [tools] (Optional) +- `enabled`: allowlist tools. +- `disabled`: denylist tools. + +## [prompts] (Optional) +- `directory`: local path or remote Git URL for prompts. +- `stages_config`: path to `stages.toml`. +- `variables`: key-value map for prompt templates. diff --git a/docs/TOOLS.md b/docs/TOOLS.md new file mode 100644 index 00000000..503b34c9 --- /dev/null +++ b/docs/TOOLS.md @@ -0,0 +1,120 @@ +# AI Tools + +Sashiko provides built-in and custom tools for AI interaction with the codebase. All built-in tools are enabled by default. + +## Built-in Tools + +### File Operations +- `read_files`: Read file content. Supports "smart" mode (collapses boilerplate). +- `list_dir`: List directory contents. +- `find_files`: Locate files via glob patterns (`*.rs`). +- `search_file_content`: Grep-like pattern search. + +### Git Operations +- `git_diff`: Show changes between commits/refs. +- `git_log`: View commit history. Use `--since` for performance. +- `git_show`: Inspect git objects (blobs, commits). +- `git_blame`: Identify line-level modification history. +- `git_status`: Check working tree state. +- `git_branch`: List local and remote branches. +- `git_tag`: List repository tags. +- `git_checkout`: Switch branches (modifies worktree). + +### Specialized +- `read_prompt`: Access files from the prompt registry. +- `TodoWrite`: Append items to `TODO.md`. + +## Configuration + +Control tool access in `Settings.toml`: + +### Allowlist Mode +```toml +[tools] +enabled = ["read_files", "git_diff", "git_show"] +``` + +### Denylist Mode +```toml +[tools] +# Disable state-modifying tools for read-only environments +disabled = ["git_checkout", "TodoWrite"] +``` + +*Note: `disabled` takes precedence over `enabled`.* + +## Custom Tools + +Define external commands as AI tools in `Settings.toml`: + +```toml +[[tools.custom]] +name = "static_check" +description = "Run external analyzer" +parameters = """ +{ + "type": "OBJECT", + "properties": { + "path": { "type": "STRING" } + }, + "required": ["path"] +} +""" +command = "/usr/bin/check --file {path}" +allowed_paths = ["src/"] +``` + +### Security Constraints +- **Blocked**: `sudo`, `rm -rf`, `curl`, `wget`, `dd`, `mkfs`. +- **Isolation**: Runs inside the isolated review worktree. +- **Path Validation**: `allowed_paths` prevents traversal beyond the worktree. +- **Substitutions**: Use `{parameter_name}` in the command string. Arrays are space-joined. + +## Examples + +### Restriction Patterns +```toml +# Read-only environment +[tools] +disabled = ["git_checkout", "TodoWrite"] + +# Performance-optimized (minimal set) +[tools] +enabled = ["read_files", "git_diff", "git_show", "search_file_content"] +``` + +### Custom Tool Definitions +```toml +# Performance Profiler +[[tools.custom]] +name = "profile_function" +description = "Profile function with perf" +parameters = """ +{ + "type": "OBJECT", + "properties": { + "function_name": { "type": "STRING" }, + "file": { "type": "STRING" } + }, + "required": ["function_name", "file"] +} +""" +command = "perf-profiler --function {function_name} --file {file}" +allowed_paths = ["src/", "kernel/"] + +# Documentation Generator +[[tools.custom]] +name = "generate_docs" +description = "Generate API docs" +parameters = """ +{ + "type": "OBJECT", + "properties": { + "module_path": { "type": "STRING" } + }, + "required": ["module_path"] +} +""" +command = "rustdoc {module_path} --output /tmp/docs" +allowed_paths = ["src/"] +``` diff --git a/sashiko.dev/base/routing/sashiko-service-skeleton.yaml b/sashiko.dev/base/routing/sashiko-service-skeleton.yaml index f7d3c181..79bf1ae3 100644 --- a/sashiko.dev/base/routing/sashiko-service-skeleton.yaml +++ b/sashiko.dev/base/routing/sashiko-service-skeleton.yaml @@ -7,8 +7,8 @@ metadata: namespace: sashiko annotations: cloud.google.com/neg: '{"ingress": true}' - # yamllint disable-line rule:line-length - beta.cloud.google.com/backend-config: '{"default": "sashiko-backend-config"}' + beta.cloud.google.com/backend-config: | + '{"default": "sashiko-backend-config"}' spec: type: NodePort ports: diff --git a/scripts/check_server_config.sh b/scripts/check_server_config.sh new file mode 100755 index 00000000..b1f8586d --- /dev/null +++ b/scripts/check_server_config.sh @@ -0,0 +1,70 @@ +#!/bin/bash +# Check if Sashiko server is properly configured for GitLab webhooks + +SERVER="http://localhost:9080" + +echo "Checking Sashiko server configuration..." +echo "Server: ${SERVER}" +echo "---" + +# Test 1: Check if server is reachable +echo -n "1. Server reachable: " +if curl -s -f -o /dev/null "${SERVER}/" --max-time 5; then + echo "✓ Yes" +else + echo "✗ No - Server is not responding" + echo " Make sure Sashiko is running and accessible" + exit 1 +fi + +# Test 2: Check forge configuration +echo -n "2. Forge enabled: " +config=$(curl -s "${SERVER}/api/config") +if [ $? -ne 0 ]; then + echo "✗ Failed to fetch config" + exit 1 +fi + +forge_enabled=$(echo "$config" | jq -r '.forge_enabled // false') +if [ "$forge_enabled" = "true" ]; then + echo "✓ Yes" +else + echo "✗ No" + echo " Set forge.enabled = true in Settings.toml and restart Sashiko" + exit 1 +fi + +# Test 3: Check if webhook endpoint exists +echo -n "3. GitLab webhook endpoint: " +webhook_test=$(curl -s -w "%{http_code}" -o /dev/null \ + -X POST \ + -H "Content-Type: application/json" \ + -H "X-Gitlab-Event: Merge Request Hook" \ + -d '{}' \ + "${SERVER}/api/webhook/gitlab") + +if [ "$webhook_test" = "400" ] || [ "$webhook_test" = "200" ]; then + # 400 is expected with empty payload, means endpoint exists + echo "✓ Available (HTTP ${webhook_test})" +elif [ "$webhook_test" = "403" ]; then + echo "⚠ Forbidden - forge might be disabled" +else + echo "✗ Unexpected response (HTTP ${webhook_test})" +fi + +# Test 4: Show current configuration +echo "" +echo "Current Configuration:" +echo "$config" | jq . 2>/dev/null || echo "$config" + +echo "" +echo "---" +echo "✓ Server is ready for GitLab webhooks!" +echo "" +echo "Next steps:" +echo " 1. Configure webhook in GitLab project settings" +echo " 2. Set URL to: ${SERVER}/api/webhook/gitlab" +echo " 3. Enable 'Merge request events'" +echo " 4. Test with: ./trigger_gitlab_mr_review.sh " +echo "" +echo "See GITLAB_SETUP.md for detailed instructions" diff --git a/scripts/test_github_webhook.sh b/scripts/test_github_webhook.sh new file mode 100755 index 00000000..ba44f788 --- /dev/null +++ b/scripts/test_github_webhook.sh @@ -0,0 +1,58 @@ +#!/bin/bash +# Test GitHub webhook integration locally + +SERVER="http://localhost:9080" +WEBHOOK_URL="${SERVER}/api/webhook/github" + +# Example GitHub PR webhook payload +read -r -d '' PAYLOAD <<'EOF' +{ + "action": "opened", + "pull_request": { + "number": 123, + "title": "Test PR for webhook", + "html_url": "https://github.com/owner/repo/pull/123", + "base": { + "sha": "6f6d7e7447811dbecc13cc7fbbe9f5e7a3d7c70b" + }, + "head": { + "sha": "da1560886d4f094c3e6c9ef40349f7d38b5d27d7" + } + }, + "repository": { + "clone_url": "https://github.com/owner/repo.git" + } +} +EOF + +echo "Testing GitHub webhook at: ${WEBHOOK_URL}" +echo "---" + +# Send the webhook +response=$(curl -s -w "\nHTTP_CODE:%{http_code}" \ + -X POST \ + -H "Content-Type: application/json" \ + -H "X-GitHub-Event: pull_request" \ + -d "$PAYLOAD" \ + "${WEBHOOK_URL}") + +# Extract HTTP code and body +http_code=$(echo "$response" | grep "HTTP_CODE:" | cut -d: -f2) +body=$(echo "$response" | grep -v "HTTP_CODE:") + +echo "Response Code: ${http_code}" +echo "Response Body:" +echo "$body" | jq . 2>/dev/null || echo "$body" + +if [ "$http_code" = "200" ]; then + echo "" + echo "✓ Webhook test successful!" + echo "Check sashiko logs to see if the review was queued." +else + echo "" + echo "✗ Webhook test failed!" + echo "Make sure:" + echo " 1. Sashiko server is running" + echo " 2. forge.enabled = true in Settings.toml" + echo " 3. Server is accessible at ${SERVER}" +fi diff --git a/scripts/test_gitlab_webhook.sh b/scripts/test_gitlab_webhook.sh new file mode 100755 index 00000000..73f623ee --- /dev/null +++ b/scripts/test_gitlab_webhook.sh @@ -0,0 +1,62 @@ +#!/bin/bash +# Test GitLab webhook integration locally + +SERVER="http://localhost:9080" +WEBHOOK_URL="${SERVER}/api/webhook/gitlab" + +# Example GitLab MR webhook payload +# This simulates what GitLab sends when an MR is opened or updated +read -r -d '' PAYLOAD <<'EOF' +{ + "object_kind": "merge_request", + "event_type": "merge_request", + "object_attributes": { + "iid": 123, + "action": "open", + "source_branch": "feature-branch", + "target_branch": "main", + "last_commit": { + "id": "da1560886d4f094c3e6c9ef40349f7d38b5d27d7" + }, + "diff_refs": { + "base_sha": "6f6d7e7447811dbecc13cc7fbbe9f5e7a3d7c70b", + "head_sha": "da1560886d4f094c3e6c9ef40349f7d38b5d27d7" + } + }, + "project": { + "git_http_url": "https://gitlab.example.com/org/project.git" + } +} +EOF + +echo "Testing GitLab webhook at: ${WEBHOOK_URL}" +echo "---" + +# Send the webhook +response=$(curl -s -w "\nHTTP_CODE:%{http_code}" \ + -X POST \ + -H "Content-Type: application/json" \ + -H "X-Gitlab-Event: Merge Request Hook" \ + -d "$PAYLOAD" \ + "${WEBHOOK_URL}") + +# Extract HTTP code and body +http_code=$(echo "$response" | grep "HTTP_CODE:" | cut -d: -f2) +body=$(echo "$response" | grep -v "HTTP_CODE:") + +echo "Response Code: ${http_code}" +echo "Response Body:" +echo "$body" | jq . 2>/dev/null || echo "$body" + +if [ "$http_code" = "200" ]; then + echo "" + echo "✓ Webhook test successful!" + echo "Check sashiko logs to see if the review was queued." +else + echo "" + echo "✗ Webhook test failed!" + echo "Make sure:" + echo " 1. Sashiko server is running" + echo " 2. forge.enabled = true in Settings.toml" + echo " 3. Server is accessible at ${SERVER}" +fi diff --git a/scripts/trigger_github_pr_review.sh b/scripts/trigger_github_pr_review.sh new file mode 100755 index 00000000..5002adca --- /dev/null +++ b/scripts/trigger_github_pr_review.sh @@ -0,0 +1,104 @@ +#!/bin/bash +# Manually trigger a review for a specific GitHub PR +# Usage: ./trigger_github_pr_review.sh +# Example: ./trigger_github_pr_review.sh torvalds/linux 12345 + +set -e + +if [ $# -lt 2 ]; then + echo "Usage: $0 " + echo "Example: $0 torvalds/linux 12345" + exit 1 +fi + +REPO=$1 +PR_NUMBER=$2 +SERVER="http://localhost:9080" +WEBHOOK_URL="${SERVER}/api/webhook/github" + +echo "Fetching PR #${PR_NUMBER} details from GitHub..." + +# GitHub API (no auth needed for public repos, use GITHUB_TOKEN if available) +GITHUB_API="https://api.github.com" +AUTH_HEADER="" +if [ -n "$GITHUB_TOKEN" ]; then + AUTH_HEADER="-H \"Authorization: Bearer $GITHUB_TOKEN\"" +fi + +# Fetch PR details +pr_data=$(curl -s $AUTH_HEADER "${GITHUB_API}/repos/${REPO}/pulls/${PR_NUMBER}") + +# Extract required fields +number=$(echo "$pr_data" | jq -r '.number // empty') +title=$(echo "$pr_data" | jq -r '.title // empty') +html_url=$(echo "$pr_data" | jq -r '.html_url // empty') +base_sha=$(echo "$pr_data" | jq -r '.base.sha // empty') +head_sha=$(echo "$pr_data" | jq -r '.head.sha // empty') +state=$(echo "$pr_data" | jq -r '.state // empty') + +# Fetch repository details +repo_data=$(curl -s $AUTH_HEADER "${GITHUB_API}/repos/${REPO}") +clone_url=$(echo "$repo_data" | jq -r '.clone_url // empty') + +if [ -z "$number" ] || [ -z "$base_sha" ] || [ -z "$head_sha" ]; then + echo "Error: Could not fetch PR details. Check PR number and network connection." + echo "API Response:" + echo "$pr_data" | jq . 2>/dev/null || echo "$pr_data" + exit 1 +fi + +echo "PR #${number}: ${title}" +echo "State: ${state}" +echo "Base SHA: ${base_sha}" +echo "Head SHA: ${head_sha}" +echo "URL: ${html_url}" +echo "---" + +# Build webhook payload +read -r -d '' PAYLOAD </dev/null || echo "$body" + +if [ "$http_code" = "200" ]; then + echo "" + echo "✓ Review queued successfully!" + echo "Monitor at: ${SERVER}/" +else + echo "" + echo "✗ Failed to queue review" + exit 1 +fi diff --git a/scripts/trigger_gitlab_mr_review.sh b/scripts/trigger_gitlab_mr_review.sh new file mode 100755 index 00000000..d5911bb2 --- /dev/null +++ b/scripts/trigger_gitlab_mr_review.sh @@ -0,0 +1,151 @@ +#!/bin/bash +# Manually trigger a review for a specific GitLab MR +# Usage: ./trigger_gitlab_mr_review.sh +# +# GITLAB_PROJECT_PATH is the URL-encoded path to the GitLab project +# (e.g., "my-org%2Fmy-group%2Fmy-project") + +set -e + +if [ $# -lt 2 ]; then + echo "Usage: $0 " + echo "" + echo "GITLAB_PROJECT_PATH is the URL-encoded project path." + echo "Example: $0 123 my-org%2Fmy-project" + echo "Example: $0 456 group%2Fsubgroup%2Fproject" + exit 1 +fi + +MR_NUMBER=$1 +GITLAB_PROJECT="$2" +SERVER="http://localhost:9080" +WEBHOOK_URL="${SERVER}/api/webhook/gitlab" + +GITLAB_API="https://gitlab.com/api/v4" + +# Set up authentication if GITLAB_TOKEN is available +AUTH_HEADER="" +if [ -n "$GITLAB_TOKEN" ]; then + AUTH_HEADER="-H \"PRIVATE-TOKEN: ${GITLAB_TOKEN}\"" + echo "Using GitLab authentication token" +fi + +echo "Fetching MR !${MR_NUMBER} details from GitLab..." + +# Fetch MR details from GitLab API +if [ -n "$GITLAB_TOKEN" ]; then + mr_data=$(curl -s -H "PRIVATE-TOKEN: ${GITLAB_TOKEN}" "${GITLAB_API}/projects/${GITLAB_PROJECT}/merge_requests/${MR_NUMBER}") +else + mr_data=$(curl -s "${GITLAB_API}/projects/${GITLAB_PROJECT}/merge_requests/${MR_NUMBER}") +fi + +# Extract required fields +iid=$(echo "$mr_data" | jq -r '.iid // empty') +title=$(echo "$mr_data" | jq -r '.title // empty') +base_sha=$(echo "$mr_data" | jq -r '.diff_refs.base_sha // empty') +head_sha=$(echo "$mr_data" | jq -r '.diff_refs.head_sha // empty') +source_branch=$(echo "$mr_data" | jq -r '.source_branch // empty') +target_branch=$(echo "$mr_data" | jq -r '.target_branch // empty') +state=$(echo "$mr_data" | jq -r '.state // empty') + +# Fetch project details to get repository URL +echo "Fetching project details..." +if [ -n "$GITLAB_TOKEN" ]; then + project_data=$(curl -s -H "PRIVATE-TOKEN: ${GITLAB_TOKEN}" "${GITLAB_API}/projects/${GITLAB_PROJECT}") +else + project_data=$(curl -s "${GITLAB_API}/projects/${GITLAB_PROJECT}") +fi +git_http_url=$(echo "$project_data" | jq -r '.http_url_to_repo // empty') +web_url=$(echo "$project_data" | jq -r '.web_url // empty') + +if [ -z "$iid" ] || [ -z "$base_sha" ] || [ -z "$head_sha" ]; then + echo "Error: Could not fetch MR details. Check MR number and network connection." + echo "API Response:" + echo "$mr_data" | jq . 2>/dev/null || echo "$mr_data" + if echo "$mr_data" | grep -q "404 Project Not Found" && [ -z "$GITLAB_TOKEN" ]; then + echo "" + echo "This appears to be a private repository. Set GITLAB_TOKEN to authenticate:" + echo " export GITLAB_TOKEN='your_token_here'" + echo "Get a token at: https://gitlab.com/-/profile/personal_access_tokens" + fi + exit 1 +fi + +if [ -z "$git_http_url" ]; then + echo "Error: Could not fetch repository URL from project API." + echo "Project API Response:" + echo "$project_data" | jq . 2>/dev/null || echo "$project_data" + if echo "$project_data" | grep -q "404 Project Not Found" && [ -z "$GITLAB_TOKEN" ]; then + echo "" + echo "This appears to be a private repository. Set GITLAB_TOKEN to authenticate:" + echo " export GITLAB_TOKEN='your_token_here'" + echo "Get a token at: https://gitlab.com/-/profile/personal_access_tokens" + fi + exit 1 +fi + +echo "MR !${iid}: ${title}" +echo "Branches: ${source_branch} → ${target_branch} (${state})" +echo "Base SHA: ${base_sha}" +echo "Head SHA: ${head_sha}" +echo "Repo: ${git_http_url}" +echo "Web URL: ${web_url}" +echo "---" + +# Build webhook payload +# Construct MR URL from web_url and iid +mr_url="${web_url}/-/merge_requests/${iid}" + +read -r -d '' PAYLOAD </dev/null || echo "$body" + +if [ "$http_code" = "200" ]; then + echo "" + echo "✓ Review queued successfully!" + echo "Monitor at: ${SERVER}/" +else + echo "" + echo "✗ Failed to queue review" + exit 1 +fi diff --git a/src/api.rs b/src/api.rs index 356d1590..f2b20a53 100644 --- a/src/api.rs +++ b/src/api.rs @@ -15,10 +15,9 @@ use crate::db::Database; use crate::events::Event; use crate::fetcher::FetchRequest; -use crate::settings::ServerSettings; use axum::{ Json, Router, - extract::{ConnectInfo, Query, State}, + extract::{ConnectInfo, Path, Query, State}, http::StatusCode, routing::{get, get_service, post}, }; @@ -28,7 +27,7 @@ use std::sync::Arc; use tokio::net::TcpListener; use tokio::sync::mpsc; use tower_http::services::{ServeDir, ServeFile}; -use tracing::{error, info}; +use tracing::{error, info, warn}; use std::time::{Duration, Instant}; use tokio::sync::RwLock; @@ -122,9 +121,11 @@ impl AsyncMapCache { } pub struct AppState { + pub settings: Arc, pub db: Arc, pub sender: mpsc::Sender, pub fetch_sender: mpsc::Sender, + pub forge_registry: Arc, pub read_only: bool, pub allow_all_submit: bool, pub smtp_enabled: bool, @@ -234,7 +235,7 @@ pub struct SubmitResponse { } pub async fn run_server( - settings: ServerSettings, + settings: Arc, db: Arc, sender: mpsc::Sender, fetch_sender: mpsc::Sender, @@ -242,11 +243,15 @@ pub async fn run_server( smtp_enabled: bool, dry_run: bool, ) -> Result<(), Box> { + let forge_registry = Arc::new(crate::forge::ForgeRegistry::new()); + let state = Arc::new(AppState { + read_only: settings.server.read_only, + settings: settings.clone(), db, sender, fetch_sender, - read_only: settings.read_only, + forge_registry, allow_all_submit, smtp_enabled, dry_run, @@ -260,6 +265,7 @@ pub async fn run_server( }); let app = Router::new() + .route("/api/config", get(get_config)) .route("/api/lists", get(list_mailing_lists)) .route("/api/patchsets", get(list_patchsets)) .route("/api/messages", get(list_messages)) @@ -276,11 +282,12 @@ pub async fn run_server( .route("/api/patchset/rerun", post(rerun_patchset)) .route("/api/patchset/cancel", post(cancel_patchset)) .route("/api/patch/rerun", post(rerun_patch)) + .route("/api/webhook/{provider}", post(forge_webhook)) .route("/", get_service(ServeFile::new("static/index.html"))) .nest_service("/static", ServeDir::new("static")) .with_state(state); - let bind_addr = format!("{}:{}", settings.host, settings.port); + let bind_addr = format!("{}:{}", settings.server.host, settings.server.port); let addrs: Vec = bind_addr .to_socket_addrs() .map_err(|e| anyhow::anyhow!("invalid bind address '{}': {}", bind_addr, e))? @@ -389,6 +396,10 @@ async fn submit_patch( &format!("Fetching {} from {}...", &sha, repo_display), skip_subjects.as_ref(), only_subjects.as_ref(), + None, + None, + None, + None, ) .await { @@ -399,6 +410,9 @@ async fn submit_patch( let req = FetchRequest { repo_url: repo, commit_hash: sha, + mr_url: None, + mr_title: None, + mr_number: None, }; if let Err(e) = state.fetch_sender.send(req).await { @@ -427,6 +441,10 @@ async fn submit_patch( &format!("Fetching thread {}...", clean_msgid), None, None, + None, + None, + None, + None, ) .await { @@ -652,6 +670,12 @@ async fn get_patchset( .db .get_patchset_details(id_val, query.page, query.per_page) .await + } else if query.id.contains('-') && !query.id.contains('@') { + info!("Fetching details for patchset slug: {}", query.id); + state + .db + .get_patchset_details_by_slug(&query.id, query.page, query.per_page) + .await } else { info!("Fetching details for patchset msgid: {}", query.id); state @@ -1008,3 +1032,93 @@ async fn rerun_patch( Ok(Json(serde_json::json!({ "status": "accepted" }))) } + +async fn get_config( + State(state): State>, +) -> Result, StatusCode> { + Ok(Json(serde_json::json!({ + "project_name": state.settings.project.name, + "project_description": state.settings.project.description, + "forge_enabled": state.settings.forge.enabled, + }))) +} + +async fn forge_webhook( + ConnectInfo(addr): ConnectInfo, + State(state): State>, + Path(provider): Path, + headers: axum::http::HeaderMap, + body: axum::body::Bytes, +) -> Result, StatusCode> { + if state.read_only { + return Err(StatusCode::FORBIDDEN); + } + if !state.allow_all_submit && !addr.ip().to_canonical().is_loopback() { + info!("Refused {} webhook from non-localhost: {}", provider, addr); + return Err(StatusCode::FORBIDDEN); + } + + let forge = state.forge_registry.get(&provider).ok_or_else(|| { + warn!("Unknown forge provider: {}", provider); + StatusCode::NOT_FOUND + })?; + + forge.validate_event(&headers)?; + + let (action, metadata) = forge.parse_payload(&body)?; + + info!( + "{} {}: {} - {}", + forge.name(), + action, + metadata.pr_title.as_deref().unwrap_or("(no title)"), + metadata.pr_url.as_deref().unwrap_or("(no url)") + ); + + let default_subject = format!("{} #{}", forge.name(), metadata.pr_number); + let subject = metadata.pr_title.as_deref().unwrap_or(&default_subject); + + let commit_range = format!("{}..{}", metadata.base_sha, metadata.head_sha); + let placeholder_id = format!("mr-{}-{}", metadata.pr_number, commit_range); + + let slug = metadata.pr_url.as_ref().map(|url| { + let repo = crate::forge::extract_repo_name_from_url(url); + format!("{}-{}", repo, metadata.pr_number) + }); + + state + .db + .create_fetching_patchset( + &placeholder_id, + &format!("Fetching {} PR/MR: {}", forge.name(), subject), + None, + None, + metadata.pr_url.as_deref(), + Some(subject), + Some(metadata.pr_number), + slug.as_deref(), + ) + .await + .map_err(|e| { + error!("Failed to create placeholder patchset: {}", e); + StatusCode::INTERNAL_SERVER_ERROR + })?; + + let req = FetchRequest { + repo_url: metadata.repo_url, + commit_hash: commit_range, + mr_url: metadata.pr_url, + mr_title: metadata.pr_title, + mr_number: Some(metadata.pr_number), + }; + + state.fetch_sender.send(req).await.map_err(|e| { + error!("Failed to send fetch request to queue: {}", e); + StatusCode::INTERNAL_SERVER_ERROR + })?; + + Ok(Json(serde_json::json!({ + "status": "accepted", + "message": format!("{} {} queued for review", forge.name(), action) + }))) +} diff --git a/src/bin/review.rs b/src/bin/review.rs index 62d2a6fd..564a7662 100644 --- a/src/bin/review.rs +++ b/src/bin/review.rs @@ -43,8 +43,8 @@ struct Args { #[arg(long)] worktree_dir: Option, - #[arg(long, default_value = "third_party/prompts/kernel")] - prompts: PathBuf, + #[arg(long)] + prompts: Option, /// If set, only review the patch with this index (1-based usually). /// Previous patches (with lower index) will be applied but not reviewed. @@ -366,11 +366,25 @@ async fn main() -> Result<()> { let provider = sashiko::ai::create_provider(&settings).expect("Failed to create AI provider"); // Enable read_prompt tool only if explicit caching is NOT used. - let prompts_dir = PathBuf::from("third_party/prompts/kernel"); + let prompts_dir = args + .prompts + .clone() + .unwrap_or_else(|| PathBuf::from(settings.get_prompts_dir())); let prompts_tool_path = Some(prompts_dir.join("tool.md")); - let tools = ToolBox::new(worktree.path.clone(), prompts_tool_path); - let prompts = PromptRegistry::new(args.prompts.clone()); + let tools = ToolBox::with_config( + worktree.path.clone(), + prompts_tool_path, + settings.tools.as_ref(), + ); + + let prompts = if settings.prompts.is_some() { + PromptRegistry::with_settings(settings.prompts.as_ref()) + .await + .expect("Failed to load prompts with settings") + } else { + PromptRegistry::new(prompts_dir) + }; // Calculate series range (baseline..last_patch) let series_range = calculate_series_range( diff --git a/src/db.rs b/src/db.rs index a5ef50aa..e9414019 100644 --- a/src/db.rs +++ b/src/db.rs @@ -48,6 +48,7 @@ pub struct PatchsetRow { pub findings_high: Option, pub findings_critical: Option, pub baseline_id: Option, + pub slug: Option, pub failed_reason: Option, pub skip_filters: Option, pub only_filters: Option, @@ -58,6 +59,9 @@ pub struct PatchsetRow { pub provider: Option, #[serde(skip)] pub embargo_until: Option, + pub mr_url: Option, + pub mr_title: Option, + pub mr_number: Option, } #[derive(Debug, Clone)] @@ -481,6 +485,11 @@ impl Database { "status, embargo_until", ) .await; + let _ = self.try_add_column("patchsets", "mr_url", "TEXT").await; + let _ = self.try_add_column("patchsets", "mr_title", "TEXT").await; + let _ = self + .try_add_column("patchsets", "mr_number", "INTEGER") + .await; let _ = self .conn @@ -2214,7 +2223,7 @@ impl Database { let sql = format!( "SELECT p.id, p.subject, p.status, p.thread_id, p.author, p.date, p.cover_letter_message_id, p.total_parts, p.received_parts, GROUP_CONCAT(s.name, ','), COALESCE(f.low, 0), COALESCE(f.medium, 0), COALESCE(f.high, 0), COALESCE(f.critical, 0), p.baseline_id, p.failed_reason, p.target_review_count, p.skip_filters, p.only_filters, - p.embargo_until + p.embargo_until, p.mr_url, p.mr_title, p.mr_number, p.slug FROM patchsets p LEFT JOIN patchsets_subsystems ps ON p.id = ps.patchset_id LEFT JOIN subsystems s ON ps.subsystem_id = s.id @@ -2310,6 +2319,10 @@ impl Database { baseline_logs: None, provider: None, embargo_until: row.get(19).ok(), + mr_url: row.get(20).ok(), + mr_title: row.get(21).ok(), + mr_number: row.get(22).ok(), + slug: row.get(23).ok(), }); } Ok(None) => break, @@ -2451,7 +2464,7 @@ impl Database { p.author, p.date, p.cover_letter_message_id, p.thread_id, p.total_parts, p.received_parts, p.failed_reason, p.model_name, p.prompts_git_hash, p.baseline_logs, p.baseline_id, p.provider, - p.embargo_until + p.embargo_until, p.mr_url, p.slug FROM patchsets p WHERE p.id = ?", libsql::params![id], @@ -2477,6 +2490,8 @@ impl Database { let baseline_id: Option = row.get(15).ok(); let provider: Option = row.get(16).ok(); let embargo_until: Option = row.get(17).ok(); + let mr_url: Option = row.get(18).ok(); + let slug: Option = row.get(19).ok(); // Fetch baseline details if needed let baseline = if let Some(bid) = baseline_id { let mut browse = self @@ -2672,7 +2687,9 @@ impl Database { "baseline_logs": baseline_logs, "baseline": baseline, "provider": provider, - "embargo_until": embargo_until + "embargo_until": embargo_until, + "mr_url": mr_url, + "slug": slug }))) } else { Ok(None) @@ -2692,7 +2709,7 @@ impl Database { p.author, p.date, p.cover_letter_message_id, p.thread_id, p.total_parts, p.received_parts, p.failed_reason, p.model_name, p.prompts_git_hash, p.baseline_logs, p.baseline_id, p.provider, - p.embargo_until + p.embargo_until, p.mr_url, p.slug FROM patchsets p WHERE p.id = ?", libsql::params![id], @@ -2718,6 +2735,8 @@ impl Database { let baseline_id: Option = row.get(15).ok(); let provider: Option = row.get(16).ok(); let embargo_until: Option = row.get(17).ok(); + let mr_url: Option = row.get(18).ok(); + let slug: Option = row.get(19).ok(); let baseline = if let Some(bid) = baseline_id { let mut browse = self .conn @@ -2907,7 +2926,9 @@ impl Database { "baseline_logs": baseline_logs, "baseline": baseline, "provider": provider, - "embargo_until": embargo_until + "embargo_until": embargo_until, + "mr_url": mr_url, + "slug": slug }))) } else { Ok(None) @@ -2947,6 +2968,48 @@ impl Database { Ok(None) } + pub async fn get_patchset_details_by_slug( + &self, + slug: &str, + page: Option, + limit: Option, + ) -> Result> { + let mut rows = self + .conn + .query( + "SELECT id FROM patchsets WHERE slug = ?", + libsql::params![slug], + ) + .await?; + if let Ok(Some(row)) = rows.next().await { + let id: i64 = row.get(0)?; + return self.get_patchset_details(id, page, limit).await; + } + + Ok(None) + } + + pub async fn get_patchset_summary_by_slug( + &self, + slug: &str, + page: Option, + limit: Option, + ) -> Result> { + let mut rows = self + .conn + .query( + "SELECT id FROM patchsets WHERE slug = ?", + libsql::params![slug], + ) + .await?; + if let Ok(Some(row)) = rows.next().await { + let id: i64 = row.get(0)?; + return self.get_patchset_summary(id, page, limit).await; + } + + Ok(None) + } + pub async fn get_review_details(&self, id: i64) -> Result> { let mut rows = self .conn @@ -3044,7 +3107,7 @@ impl Database { pub async fn get_pending_patchsets(&self, limit: usize) -> Result> { let mut rows = self.conn.query( - "SELECT id, subject, status, thread_id, author, date, cover_letter_message_id, total_parts, received_parts, baseline_id, failed_reason, target_review_count, skip_filters, only_filters, embargo_until + "SELECT id, subject, status, thread_id, author, date, cover_letter_message_id, total_parts, received_parts, baseline_id, failed_reason, target_review_count, skip_filters, only_filters, embargo_until, slug FROM patchsets WHERE status = 'Pending' ORDER BY date ASC LIMIT ?", libsql::params![limit as i64], ).await?; @@ -3076,6 +3139,10 @@ impl Database { baseline_logs: None, provider: None, embargo_until: row.get(14).ok(), + mr_url: None, + mr_title: None, + mr_number: None, + slug: row.get(15).ok(), }); } Ok(patchsets) @@ -3128,6 +3195,10 @@ impl Database { baseline_logs: None, provider: None, embargo_until: row.get(14).ok(), + mr_url: None, + mr_title: None, + mr_number: None, + slug: None, }); } Ok(None) => break, @@ -3294,12 +3365,17 @@ impl Database { self.rerun_patchset(patchset_id).await } + #[allow(clippy::too_many_arguments)] pub async fn create_fetching_patchset( &self, article_id: &str, subject: &str, skip_filters: Option<&Vec>, only_filters: Option<&Vec>, + mr_url: Option<&str>, + mr_title: Option<&str>, + mr_number: Option, + slug: Option<&str>, ) -> Result { let now = std::time::SystemTime::now() .duration_since(std::time::UNIX_EPOCH)? @@ -3334,8 +3410,8 @@ impl Database { // We don't want to reset if it is already Incomplete, Pending, or Reviewed. if status == "Failed" || status == "Fetching" { self.conn.execute( - "UPDATE patchsets SET status = 'Fetching', failed_reason = NULL, skip_filters = ?, only_filters = ? WHERE id = ?", - libsql::params![skip_filters_json.clone(), only_filters_json.clone(), id] + "UPDATE patchsets SET status = 'Fetching', failed_reason = NULL, skip_filters = ?, only_filters = ?, mr_url = ?, mr_title = ?, mr_number = ?, slug = ? WHERE id = ?", + libsql::params![skip_filters_json.clone(), only_filters_json.clone(), mr_url, mr_title, mr_number, slug, id] ).await?; } return Ok(id); @@ -3348,9 +3424,9 @@ impl Database { // 3. Create the fetching patchset let mut rows = self.conn .query( - "INSERT INTO patchsets (thread_id, cover_letter_message_id, subject, status, date, skip_filters, only_filters) - VALUES (?, ?, ?, 'Fetching', ?, ?, ?) RETURNING id", - libsql::params![thread_id, root_msg_id, subject, now, skip_filters_json, only_filters_json], + "INSERT INTO patchsets (thread_id, cover_letter_message_id, subject, status, date, skip_filters, only_filters, mr_url, mr_title, mr_number, slug) + VALUES (?, ?, ?, 'Fetching', ?, ?, ?, ?, ?, ?, ?) RETURNING id", + libsql::params![thread_id, root_msg_id, subject, now, skip_filters_json, only_filters_json, mr_url, mr_title, mr_number, slug], ) .await?; diff --git a/src/events.rs b/src/events.rs index 2b7f1c71..1527aea3 100644 --- a/src/events.rs +++ b/src/events.rs @@ -36,6 +36,9 @@ pub enum Event { timestamp: i64, index: u32, total: u32, + mr_url: Option, + mr_title: Option, + mr_number: Option, }, RawMboxSubmitted { raw: String, @@ -60,4 +63,7 @@ pub struct ParsedArticle { pub failed_error: Option, pub skip_filters: Option>, pub only_filters: Option>, + pub mr_url: Option, + pub mr_title: Option, + pub mr_number: Option, } diff --git a/src/fetcher.rs b/src/fetcher.rs index c24b8d29..972a9750 100644 --- a/src/fetcher.rs +++ b/src/fetcher.rs @@ -27,18 +27,25 @@ use tracing::{error, info, warn}; pub struct FetchRequest { pub repo_url: Option, pub commit_hash: String, + pub mr_url: Option, + pub mr_title: Option, + pub mr_number: Option, } pub struct FetchAgent { repo_path: PathBuf, rx: mpsc::Receiver, main_tx: mpsc::Sender, + #[allow(clippy::type_complexity)] + mr_metadata: HashMap, Option, Option)>, + gitlab_token: Option, } impl FetchAgent { pub fn new( repo_path: PathBuf, main_tx: mpsc::Sender, + gitlab_token: Option, ) -> (Self, mpsc::Sender) { let (tx, rx) = mpsc::channel(100); ( @@ -46,6 +53,8 @@ impl FetchAgent { repo_path, rx, main_tx, + mr_metadata: HashMap::new(), + gitlab_token, }, tx, ) @@ -59,6 +68,12 @@ impl FetchAgent { loop { tokio::select! { Some(req) = self.rx.recv() => { + if req.mr_url.is_some() || req.mr_title.is_some() || req.mr_number.is_some() { + self.mr_metadata.insert( + req.commit_hash.clone(), + (req.mr_url.clone(), req.mr_title.clone(), req.mr_number) + ); + } queue.entry(req.repo_url) .or_default() .insert(req.commit_hash); @@ -89,9 +104,22 @@ impl FetchAgent { url_display ); - // Check existence first + // For ranges (base..head), we need to check both endpoints individually + let mut commits_to_check = Vec::new(); + for commit_or_range in &commit_list { + if commit_or_range.contains("..") { + let parts: Vec<&str> = commit_or_range.split("..").collect(); + if parts.len() == 2 { + commits_to_check.push(parts[0].to_string()); + commits_to_check.push(parts[1].to_string()); + } + } else { + commits_to_check.push(commit_or_range.clone()); + } + } + let mut missing_commits = Vec::new(); - for commit in &commit_list { + for commit in &commits_to_check { if !self.is_present(commit).await { missing_commits.push(commit.clone()); } @@ -193,8 +221,31 @@ impl FetchAgent { let count = shas.len() as u32; // Process each SHA + let (mr_url, mr_title, mr_number) = self + .mr_metadata + .get(range) + .cloned() + .unwrap_or((None, None, None)); + + let article_id = if let Some(number) = mr_number { + format!("mr-{}-{}", number, range) + } else { + range.to_string() + }; + for (i, sha) in shas.iter().enumerate() { - match self.extract_patch(sha, range, (i + 1) as u32, count).await { + match self + .extract_patch( + sha, + &article_id, + (i + 1) as u32, + count, + mr_url.as_ref(), + mr_title.as_ref(), + mr_number, + ) + .await + { Ok(mut event) => { if let Event::PatchSubmitted { ref mut message_id, .. @@ -231,7 +282,30 @@ impl FetchAgent { } }; - match self.extract_patch(&full_sha, &commit_or_range, 1, 1).await { + let (mr_url, mr_title, mr_number) = self + .mr_metadata + .get(&commit_or_range) + .cloned() + .unwrap_or((None, None, None)); + + let article_id = if let Some(number) = mr_number { + format!("mr-{}-{}", number, &commit_or_range) + } else { + commit_or_range.clone() + }; + + match self + .extract_patch( + &full_sha, + &article_id, + 1, + 1, + mr_url.as_ref(), + mr_title.as_ref(), + mr_number, + ) + .await + { Ok(mut event) => { if let Event::PatchSubmitted { ref mut message_id, .. @@ -272,9 +346,21 @@ impl FetchAgent { } async fn ensure_remote(&self, name: &str, url: &str) -> Result<()> { + // Inject GitLab token if available + let authenticated_url = if let Some(token) = &self.gitlab_token { + if url.contains("gitlab.com") && url.starts_with("https://") { + url.replace("https://", &format!("https://oauth2:{}@", token)) + } else { + url.to_string() + } + } else { + url.to_string() + }; + // Check if remote exists let status = Command::new("git") .current_dir(&self.repo_path) + .args(["-c", "safe.bareRepository=all"]) .args(["remote", "get-url", name]) .stdout(Stdio::null()) .stderr(Stdio::null()) @@ -282,32 +368,38 @@ impl FetchAgent { .await?; if status.success() { - // Check if URL matches, if not update it let output = Command::new("git") .current_dir(&self.repo_path) + .args(["-c", "safe.bareRepository=all"]) .args(["remote", "get-url", name]) .output() .await?; let current_url = String::from_utf8_lossy(&output.stdout).trim().to_string(); - if current_url != url { + if current_url != authenticated_url { info!( "Updating remote {} from {} to {}", name, redact_secret(¤t_url), - redact_secret(url) + redact_secret(&authenticated_url) ); Command::new("git") .current_dir(&self.repo_path) - .args(["remote", "set-url", name, url]) + .args(["-c", "safe.bareRepository=all"]) + .args(["remote", "set-url", name, &authenticated_url]) .output() .await?; } } else { - info!("Adding remote {} -> {}", name, redact_secret(url)); + info!( + "Adding remote {} -> {}", + name, + redact_secret(&authenticated_url) + ); let output = Command::new("git") .current_dir(&self.repo_path) - .args(["remote", "add", name, url]) + .args(["-c", "safe.bareRepository=all"]) + .args(["remote", "add", name, &authenticated_url]) .output() .await?; @@ -323,7 +415,10 @@ impl FetchAgent { async fn fetch_commits(&self, remote: &str, commits: &[String]) -> Result<()> { let mut cmd = Command::new("git"); - cmd.current_dir(&self.repo_path).arg("fetch").arg(remote); + cmd.current_dir(&self.repo_path) + .args(["-c", "safe.bareRepository=all"]) + .arg("fetch") + .arg(remote); for commit in commits { cmd.arg(commit); @@ -342,6 +437,7 @@ impl FetchAgent { async fn fetch_all(&self, remote: &str) -> Result<()> { let output = Command::new("git") .current_dir(&self.repo_path) + .args(["-c", "safe.bareRepository=all"]) .args(["fetch", remote]) .output() .await?; @@ -356,15 +452,15 @@ impl FetchAgent { } async fn is_present(&self, commit_or_range: &str) -> bool { + let mut args = vec!["-c", "safe.bareRepository=all"]; let arg_str: String; - let args = if let Some((start, end)) = commit_or_range.split_once("..") { - // For ranges, ensure both endpoints are commits + + if let Some((start, end)) = commit_or_range.split_once("..") { arg_str = format!("{}^{{commit}}..{}^{{commit}}", start, end); - vec!["rev-list", "-n", "1", &arg_str] + args.extend(["rev-list", "-n", "1", &arg_str]); } else { - // For single commits, ensure it is a valid commit object arg_str = format!("{}^{{commit}}", commit_or_range); - vec!["rev-parse", "--verify", &arg_str] + args.extend(["rev-parse", "--verify", &arg_str]); }; let output = Command::new("git") @@ -397,6 +493,7 @@ impl FetchAgent { async fn resolve_sha(&self, commit: &str) -> Result { let output = Command::new("git") .current_dir(&self.repo_path) + .args(["-c", "safe.bareRepository=all"]) .args(["rev-parse", "--verify", commit]) .output() .await?; @@ -411,12 +508,16 @@ impl FetchAgent { Ok(String::from_utf8_lossy(&output.stdout).trim().to_string()) } + #[allow(clippy::too_many_arguments)] async fn extract_patch( &self, commit: &str, article_id: &str, index: u32, total: u32, + mr_url: Option<&String>, + mr_title: Option<&String>, + mr_number: Option, ) -> Result { let meta = crate::git_ops::extract_patch_metadata(&self.repo_path, commit).await?; @@ -432,6 +533,9 @@ impl FetchAgent { timestamp: meta.timestamp, index, total, + mr_url: mr_url.cloned(), + mr_title: mr_title.cloned(), + mr_number, }) } } @@ -446,7 +550,7 @@ mod tests { async fn test_fetch_agent_lifecycle() { let (tx, _rx) = mpsc::channel(1); let repo_path = PathBuf::from("/tmp"); - let (_agent, _sender) = FetchAgent::new(repo_path, tx); + let (_agent, _sender) = FetchAgent::new(repo_path, tx, None); } #[tokio::test] @@ -487,7 +591,7 @@ mod tests { .await?; let (tx, _rx) = mpsc::channel(1); - let (agent, _) = FetchAgent::new(repo_path.clone(), tx); + let (agent, _) = FetchAgent::new(repo_path.clone(), tx, None); let output = Command::new("git") .current_dir(&repo_path) @@ -496,7 +600,9 @@ mod tests { .await?; let head = String::from_utf8(output.stdout)?.trim().to_string(); - let event = agent.extract_patch(&head, &head, 1, 1).await?; + let event = agent + .extract_patch(&head, &head, 1, 1, None, None, None) + .await?; match event { Event::PatchSubmitted { @@ -557,7 +663,7 @@ mod tests { .await?; let (tx, _rx) = mpsc::channel(1); - let (agent, _) = FetchAgent::new(repo_path.clone(), tx); + let (agent, _) = FetchAgent::new(repo_path.clone(), tx, None); let output = Command::new("git") .current_dir(&repo_path) diff --git a/src/forge.rs b/src/forge.rs new file mode 100644 index 00000000..2d082a97 --- /dev/null +++ b/src/forge.rs @@ -0,0 +1,238 @@ +// Copyright 2026 The Sashiko Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use anyhow::Result; +use axum::http::{HeaderMap, StatusCode}; +use bytes::Bytes; +use std::collections::HashMap; +use std::sync::Arc; + +/// Metadata extracted from forge webhook +#[derive(Debug, Clone)] +pub struct ForgeMetadata { + pub repo_url: Option, + pub base_sha: String, + pub head_sha: String, + pub pr_number: i64, + pub pr_title: Option, + pub pr_url: Option, +} + +/// Trait for forge provider implementations +pub trait ForgeProvider: Send + Sync { + /// Provider name (e.g., "GitHub", "GitLab") + fn name(&self) -> &str; + + /// Validate webhook event from headers + fn validate_event(&self, headers: &HeaderMap) -> Result<(), StatusCode>; + + /// Parse webhook payload and extract metadata + fn parse_payload(&self, body: &Bytes) -> Result<(String, ForgeMetadata), StatusCode>; +} + +/// GitHub forge provider +pub struct GitHubForge; + +impl ForgeProvider for GitHubForge { + fn name(&self) -> &str { + "GitHub" + } + + fn validate_event(&self, headers: &HeaderMap) -> Result<(), StatusCode> { + let event = headers + .get("x-github-event") + .and_then(|v| v.to_str().ok()) + .ok_or(StatusCode::BAD_REQUEST)?; + + if event != "pull_request" { + return Err(StatusCode::BAD_REQUEST); + } + + Ok(()) + } + + fn parse_payload(&self, body: &Bytes) -> Result<(String, ForgeMetadata), StatusCode> { + use serde_json::Value; + + let payload: Value = serde_json::from_slice(body).map_err(|_| StatusCode::BAD_REQUEST)?; + + let action = payload["action"] + .as_str() + .ok_or(StatusCode::BAD_REQUEST)? + .to_string(); + + let pr = &payload["pull_request"]; + if pr.is_null() { + return Err(StatusCode::BAD_REQUEST); + } + + let head_sha = pr["head"]["sha"] + .as_str() + .ok_or(StatusCode::BAD_REQUEST)? + .to_string(); + + let base_sha = pr["base"]["sha"] + .as_str() + .ok_or(StatusCode::BAD_REQUEST)? + .to_string(); + + let pr_number = pr["number"].as_i64().ok_or(StatusCode::BAD_REQUEST)?; + + let pr_title = pr["title"].as_str().map(|s| s.to_string()); + let pr_url = pr["html_url"].as_str().map(|s| s.to_string()); + + let repo_url = payload["repository"]["clone_url"] + .as_str() + .map(|s| s.to_string()); + + let metadata = ForgeMetadata { + repo_url, + base_sha, + head_sha, + pr_number, + pr_title, + pr_url, + }; + + Ok((action, metadata)) + } +} + +/// GitLab forge provider +pub struct GitLabForge; + +impl ForgeProvider for GitLabForge { + fn name(&self) -> &str { + "GitLab" + } + + fn validate_event(&self, headers: &HeaderMap) -> Result<(), StatusCode> { + let event = headers + .get("x-gitlab-event") + .and_then(|v| v.to_str().ok()) + .ok_or(StatusCode::BAD_REQUEST)?; + + if event != "Merge Request Hook" { + return Err(StatusCode::BAD_REQUEST); + } + + Ok(()) + } + + fn parse_payload(&self, body: &Bytes) -> Result<(String, ForgeMetadata), StatusCode> { + use serde_json::Value; + + let payload: Value = serde_json::from_slice(body).map_err(|_| StatusCode::BAD_REQUEST)?; + + let action = payload["object_kind"] + .as_str() + .ok_or(StatusCode::BAD_REQUEST)? + .to_string(); + + let attrs = &payload["object_attributes"]; + if attrs.is_null() { + return Err(StatusCode::BAD_REQUEST); + } + + let head_sha = attrs["last_commit"]["id"] + .as_str() + .ok_or(StatusCode::BAD_REQUEST)? + .to_string(); + + let base_sha = attrs["diff_refs"]["base_sha"] + .as_str() + .map(|s| s.to_string()) + .unwrap_or_else(|| head_sha.clone()); + + let pr_number = attrs["iid"].as_i64().ok_or(StatusCode::BAD_REQUEST)?; + + let pr_title = attrs["title"].as_str().map(|s| s.to_string()); + let pr_url = attrs["url"].as_str().map(|s| s.to_string()); + + let repo_url = payload["project"]["git_http_url"] + .as_str() + .map(|s| s.to_string()); + + let metadata = ForgeMetadata { + repo_url, + base_sha, + head_sha, + pr_number, + pr_title, + pr_url, + }; + + Ok((action, metadata)) + } +} + +/// Extract repository name from a URL +pub fn extract_repo_name_from_url(url: &str) -> String { + url.trim_end_matches('/') + .split('/') + .next_back() + .map(|s| s.trim_end_matches(".git")) + .unwrap_or("repo") + .to_string() +} + +/// Extract repository name from a GitLab MR URL +pub fn extract_repo_name_from_mr_url(url: &str) -> Option { + if let Some(before_sep) = url.split("/-/").next() { + let name = before_sep + .trim_end_matches('/') + .split('/') + .next_back()? + .to_string(); + Some(name) + } else { + None + } +} + +/// Registry for forge providers +pub struct ForgeRegistry { + providers: HashMap>, +} + +impl ForgeRegistry { + pub fn new() -> Self { + let mut registry = Self { + providers: HashMap::new(), + }; + + registry.register("github", Arc::new(GitHubForge)); + registry.register("gitlab", Arc::new(GitLabForge)); + + registry + } + + pub fn register(&mut self, name: &str, provider: Arc) { + self.providers.insert(name.to_string(), provider); + } + + pub fn get(&self, name: &str) -> Option> { + self.providers.get(name).cloned() + } + + pub fn list_providers(&self) -> Vec { + self.providers.keys().cloned().collect() + } +} + +impl Default for ForgeRegistry { + fn default() -> Self { + Self::new() + } +} diff --git a/src/lib.rs b/src/lib.rs index 98642c65..68ebda27 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -21,6 +21,7 @@ pub mod email_policy; pub mod email_router; pub mod events; pub mod fetcher; +pub mod forge; pub mod git_ops; pub mod ingestor; pub mod inspector; diff --git a/src/main.rs b/src/main.rs index 0bc425b5..93829e25 100644 --- a/src/main.rs +++ b/src/main.rs @@ -176,7 +176,11 @@ async fn main() -> Result<(), Box> { // Initialize FetchAgent let repo_path = std::path::PathBuf::from(&settings.git.repository_path); - let (fetch_agent, fetch_tx) = sashiko::fetcher::FetchAgent::new(repo_path, raw_tx.clone()); + let (fetch_agent, fetch_tx) = sashiko::fetcher::FetchAgent::new( + repo_path, + raw_tx.clone(), + settings.forge.api_token.clone(), + ); // Spawn FetchAgent tokio::spawn(async move { @@ -246,6 +250,9 @@ async fn main() -> Result<(), Box> { failed_error: Some(error), skip_filters: None, only_filters: None, + mr_url: None, + mr_title: None, + mr_number: None, }) .await { @@ -264,6 +271,9 @@ async fn main() -> Result<(), Box> { timestamp, index, total, + mr_url, + mr_title, + mr_number, } => { let root_msg_id = format!("{}@sashiko.local", article_id); @@ -308,6 +318,9 @@ async fn main() -> Result<(), Box> { failed_error: None, skip_filters: None, only_filters: None, + mr_url, + mr_title, + mr_number, }) .await { @@ -367,6 +380,9 @@ async fn main() -> Result<(), Box> { failed_error: None, skip_filters: skip_subjects_clone, only_filters: only_subjects_clone, + mr_url: None, + mr_title: None, + mr_number: None, }) .await { @@ -421,6 +437,9 @@ async fn main() -> Result<(), Box> { failed_error: None, skip_filters: None, only_filters: None, + mr_url: None, + mr_title: None, + mr_number: None, }) .await { @@ -443,6 +462,7 @@ async fn main() -> Result<(), Box> { // DB Worker (Transactional Batching) let worker_db = db.clone(); + let mapping = settings.subsystems.mapping.clone(); let _db_worker_handle = tokio::spawn(async move { info!("DB Worker started"); @@ -451,19 +471,17 @@ async fn main() -> Result<(), Box> { let mut total_ingested = 0; let mut total_errors = 0; + let policy = + sashiko::email_policy::EmailPolicyConfig::load("email_policy.toml").unwrap_or_default(); + loop { let count = parsed_rx.recv_many(&mut buffer, 100).await; if count == 0 { break; } - // info!("Processing batch of {} parsed articles", count); // Too verbose - - let policy = sashiko::email_policy::EmailPolicyConfig::load("email_policy.toml") - .unwrap_or_default(); - for article in buffer.drain(..) { - match process_parsed_article(&worker_db, article, &policy).await { + match process_parsed_article(&worker_db, article, &policy, &mapping).await { ProcessStatus::Ingested => total_ingested += 1, ProcessStatus::Error => total_errors += 1, } @@ -486,21 +504,28 @@ async fn main() -> Result<(), Box> { }); // Start Ingestor (feeds raw_tx) - let ingestor = Ingestor::new( - settings.clone(), - db.clone(), - raw_tx.clone(), - cli.download, - cli.track, - ); - let ingestor_handle = tokio::spawn(async move { - if let Err(e) = ingestor.run().await { - error!("Ingestor fatal error: {}", e); - } - }); + let ingestor_handle = if !settings.forge.enabled { + let ingestor = Ingestor::new( + settings.clone(), + db.clone(), + raw_tx.clone(), + cli.download, + cli.track, + ); + tokio::spawn(async move { + if let Err(e) = ingestor.run().await { + error!("Ingestor fatal error: {}", e); + } + }) + } else { + info!("Forge integration is enabled. Lore/NNTP ingestor is disabled."); + tokio::spawn(async move { + std::future::pending::<()>().await; + }) + }; // Start Web API - let api_settings = settings.server.clone(); + let api_settings = Arc::new(settings.clone()); let api_db = db.clone(); let api_tx = raw_tx.clone(); let api_fetch_tx = fetch_tx.clone(); @@ -545,10 +570,16 @@ async fn main() -> Result<(), Box> { } // Start Reviewer Service - let reviewer = Reviewer::new(db.clone(), settings.clone()).await; - tokio::spawn(async move { - reviewer.start().await; - }); + match Reviewer::new(db.clone(), settings.clone()).await { + Ok(reviewer) => { + tokio::spawn(async move { + reviewer.start().await; + }); + } + Err(e) => { + error!("Failed to initialize Reviewer service: {}", e); + } + } let metrics_db = db.clone(); tokio::spawn(async move { @@ -589,6 +620,7 @@ async fn process_parsed_article( worker_db: &Database, article: ParsedArticle, policy: &sashiko::email_policy::EmailPolicyConfig, + subsystem_mapping: &[sashiko::settings::SubsystemMapping], ) -> ProcessStatus { let ParsedArticle { group, @@ -599,6 +631,9 @@ async fn process_parsed_article( failed_error, skip_filters, only_filters, + mr_url, + mr_title, + mr_number, } = article; // Handle ingestion failure @@ -748,10 +783,29 @@ async fn process_parsed_article( } // Subsystem Identification and Linking - let mut subsystems = identify_subsystems(&metadata.to, &metadata.cc); - if group.starts_with("git-import") { - subsystems.push(("from git".to_string(), "git-import".to_string())); + let mut subsystems = identify_subsystems(&metadata.to, &metadata.cc, subsystem_mapping); + + if let Some(p) = patch_opt.as_ref() { + let files = sashiko::baseline::extract_files_from_diff(&p.diff); + let path_subsystems = identify_subsystems_from_paths(&files, subsystem_mapping); + subsystems.extend(path_subsystems); + } + + if group.starts_with("git-import") || group == "git-fetch" { + let (label, email) = if let Some(url) = &mr_url { + if let Some(repo_name) = sashiko::forge::extract_repo_name_from_mr_url(url) { + let email = format!("git-import-{}", repo_name); + (repo_name, email) + } else { + ("from git".to_string(), "git-import".to_string()) + } + } else { + ("from git".to_string(), "git-import".to_string()) + }; + subsystems.push((label, email)); } + subsystems.sort(); + subsystems.dedup(); let mut subsystem_ids = Vec::new(); for (name, email) in &subsystems { @@ -821,7 +875,10 @@ async fn process_parsed_article( */ let root_msg_id = format!("{}@sashiko.local", article_id); - let cover_letter_id = if group == "git-fetch" || group == "api-submit" { + let cover_letter_id = if group == "git-fetch" { + // Always use root_msg_id for git-fetch to match the placeholder ID + Some(root_msg_id.as_str()) + } else if group == "api-submit" { if metadata.total == 1 { Some(metadata.message_id.as_str()) } else { @@ -849,6 +906,24 @@ async fn process_parsed_article( }, false, ) + } else if group == "git-fetch" && mr_title.is_some() && mr_number.is_some() { + if metadata.total == 1 || metadata.index == 1 { + let title = mr_title.as_ref().unwrap(); + let number = mr_number.unwrap(); + ( + format!("!{}: {}", number, title), + metadata.author.clone(), + metadata.total, + true, + ) + } else { + ( + metadata.subject.clone(), + metadata.author.clone(), + metadata.total, + !group.starts_with("git-import"), + ) + } } else { ( metadata.subject.clone(), @@ -1105,7 +1180,11 @@ fn calculate_embargo_hours( } } -fn identify_subsystems(to: &str, cc: &str) -> Vec<(String, String)> { +fn identify_subsystems( + to: &str, + cc: &str, + mapping: &[sashiko::settings::SubsystemMapping], +) -> Vec<(String, String)> { let mut subsystems = Vec::new(); let mut all_recipients = String::new(); all_recipients.push_str(to); @@ -1119,32 +1198,29 @@ fn identify_subsystems(to: &str, cc: &str) -> Vec<(String, String)> { } let lower_email = email.to_lowercase(); + let mut matched = false; - // 1. Static Map (Mimic MAINTAINERS) - if lower_email.contains("linux-kernel@vger.kernel.org") { - subsystems.push(( - "LKML".to_string(), - "linux-kernel@vger.kernel.org".to_string(), - )); - } else if lower_email.contains("netdev@vger.kernel.org") { - subsystems.push(("netdev".to_string(), "netdev@vger.kernel.org".to_string())); - } else if lower_email.contains("bpf@vger.kernel.org") { - subsystems.push(("bpf".to_string(), "bpf@vger.kernel.org".to_string())); - } else if lower_email.contains("linux-usb@vger.kernel.org") { - subsystems.push(("usb".to_string(), "linux-usb@vger.kernel.org".to_string())); - } else if lower_email.contains("linux-fsdevel@vger.kernel.org") { - subsystems.push(( - "fsdevel".to_string(), - "linux-fsdevel@vger.kernel.org".to_string(), - )); - } else if lower_email.contains("linux-mm@kvack.org") { - subsystems.push(("linux-mm".to_string(), "linux-mm@kvack.org".to_string())); - } else if lower_email.ends_with("@vger.kernel.org") - || lower_email.ends_with("@lists.linux.dev") - || lower_email.ends_with("@lists.infradead.org") - { - // Fallback: derive name from email user part - if let Some(name) = lower_email.split('@').next() { + for rule in mapping { + if let Ok(re) = regex::Regex::new(&rule.pattern) + && re.is_match(&lower_email) + { + subsystems.push((rule.name.clone(), lower_email.clone())); + matched = true; + } + } + + // Fallback for known kernel lists if no mapping is provided + if !matched && mapping.is_empty() { + if lower_email.contains("linux-kernel@vger.kernel.org") { + subsystems.push(("LKML".to_string(), lower_email)); + } else if lower_email.contains("netdev@vger.kernel.org") { + subsystems.push(("netdev".to_string(), lower_email)); + } else if (lower_email.ends_with("@vger.kernel.org") + || lower_email.ends_with("@lists.linux.dev") + || lower_email.ends_with("@lists.infradead.org") + || lower_email.ends_with("@kvack.org")) + && let Some(name) = lower_email.split('@').next() + { subsystems.push((name.to_string(), lower_email)); } } @@ -1155,6 +1231,28 @@ fn identify_subsystems(to: &str, cc: &str) -> Vec<(String, String)> { subsystems } +fn identify_subsystems_from_paths( + paths: &[String], + mapping: &[sashiko::settings::SubsystemMapping], +) -> Vec<(String, String)> { + let mut subsystems = Vec::new(); + + for path in paths { + let lower_path = path.to_lowercase(); + for rule in mapping { + if let Ok(re) = regex::Regex::new(&rule.pattern) + && re.is_match(&lower_path) + { + subsystems.push((rule.name.clone(), rule.name.clone() + "@forge.local")); + } + } + } + + subsystems.sort(); + subsystems.dedup(); + subsystems +} + #[cfg(test)] mod tests { use super::*; @@ -1201,7 +1299,7 @@ mod tests { // Test known subsystem let to = "linux-kernel@vger.kernel.org"; let cc = "netdev@vger.kernel.org"; - let subsystems = identify_subsystems(to, cc); + let subsystems = identify_subsystems(to, cc, &[]); assert!(subsystems.contains(&( "LKML".to_string(), "linux-kernel@vger.kernel.org".to_string() @@ -1211,7 +1309,7 @@ mod tests { // Test fallback let to = "unknown-list@vger.kernel.org"; let cc = ""; - let subsystems = identify_subsystems(to, cc); + let subsystems = identify_subsystems(to, cc, &[]); assert!(subsystems.contains(&( "unknown-list".to_string(), "unknown-list@vger.kernel.org".to_string() @@ -1220,15 +1318,18 @@ mod tests { // Test mixed let to = "linux-usb@vger.kernel.org, random-user@example.com"; let cc = "bpf@vger.kernel.org"; - let subsystems = identify_subsystems(to, cc); - assert!(subsystems.contains(&("usb".to_string(), "linux-usb@vger.kernel.org".to_string()))); + let subsystems = identify_subsystems(to, cc, &[]); + assert!(subsystems.contains(&( + "linux-usb".to_string(), + "linux-usb@vger.kernel.org".to_string() + ))); assert!(subsystems.contains(&("bpf".to_string(), "bpf@vger.kernel.org".to_string()))); // random-user should be ignored as it doesn't match list patterns assert_eq!(subsystems.len(), 2); // Test linux-mm let to = "linux-mm@kvack.org"; - let subsystems = identify_subsystems(to, ""); + let subsystems = identify_subsystems(to, "", &[]); assert!(subsystems.contains(&("linux-mm".to_string(), "linux-mm@kvack.org".to_string()))); } diff --git a/src/reviewer.rs b/src/reviewer.rs index dc45db02..24cf138c 100644 --- a/src/reviewer.rs +++ b/src/reviewer.rs @@ -90,43 +90,30 @@ impl Reviewer { /// /// * `db` - The database connection. /// * `settings` - Application settings. - pub async fn new(db: Arc, settings: Settings) -> Self { + pub async fn new(db: Arc, settings: Settings) -> Result { let concurrency = settings.review.concurrency; let repo_path = PathBuf::from(&settings.git.repository_path); - let baseline_registry = - match BaselineRegistry::new(&repo_path, settings.git.custom_remotes.clone()) { - Ok(r) => Arc::new(r), - Err(e) => { - error!( - "Failed to initialize BaselineRegistry: {}. Using empty registry.", - e - ); - Arc::new( - BaselineRegistry::new(&repo_path, settings.git.custom_remotes.clone()) - .unwrap_or_else(|_| { - panic!("Critical error initializing BaselineRegistry: {}", e) - }), - ) - } - }; + let baseline_registry = Arc::new(BaselineRegistry::new( + &repo_path, + settings.git.custom_remotes.clone(), + )?); let provider = create_provider_cached( &settings, settings.ai.response_cache, settings.ai.response_cache_ttl_days, ) - .await - .expect("Failed to create AI provider"); + .await?; - Self { + Ok(Self { db, settings, semaphore: Arc::new(Semaphore::new(concurrency)), baseline_registry, quota_manager: Arc::new(QuotaManager::new()), provider, - } + }) } /// Starts the reviewer service loop. @@ -1553,6 +1540,8 @@ async fn run_review_tool( cmd.arg("--stages").arg(stages_str); } + cmd.arg("--prompts").arg(settings.get_prompts_dir()); + cmd.stdin(Stdio::piped()); cmd.stdout(Stdio::piped()); cmd.stderr(Stdio::piped()); diff --git a/src/schema.sql b/src/schema.sql index 9919cb47..4bbc95b0 100644 --- a/src/schema.sql +++ b/src/schema.sql @@ -79,12 +79,14 @@ CREATE TABLE IF NOT EXISTS patchsets ( target_review_count INTEGER DEFAULT 1, provider TEXT, embargo_until INTEGER, + slug TEXT, -- URL-friendly slug like "reponame-725" (repo-mrnum) FOREIGN KEY(thread_id) REFERENCES threads(id), FOREIGN KEY(cover_letter_message_id) REFERENCES messages(message_id), FOREIGN KEY(baseline_id) REFERENCES baselines(id) ); CREATE INDEX IF NOT EXISTS idx_patchsets_status ON patchsets(status); +CREATE UNIQUE INDEX IF NOT EXISTS idx_patchsets_slug ON patchsets(slug) WHERE slug IS NOT NULL; CREATE TABLE IF NOT EXISTS patches ( diff --git a/src/settings.rs b/src/settings.rs index 52cd9d50..ae726976 100644 --- a/src/settings.rs +++ b/src/settings.rs @@ -14,6 +14,40 @@ use config::{Config, ConfigError, Environment, File}; use serde::Deserialize; +use std::path::PathBuf; + +#[derive(Debug, Deserialize, Clone)] +#[allow(unused)] +pub struct SubsystemMapping { + pub pattern: String, + pub name: String, +} + +#[derive(Debug, Deserialize, Clone)] +#[allow(unused)] +pub struct SubsystemsSettings { + #[serde(default)] + pub mapping: Vec, +} + +#[derive(Debug, Deserialize, Clone, Default)] +#[allow(unused)] +pub struct ProjectSettings { + #[serde(default)] + pub name: String, + #[serde(default)] + pub description: String, +} + +#[derive(Debug, Deserialize, Clone)] +#[allow(unused)] +pub struct ForgeSettings { + #[serde(default)] + pub enabled: bool, + pub provider: Option, + pub webhook_secret: Option, + pub api_token: Option, +} #[derive(Debug, Deserialize, Clone)] #[allow(unused)] @@ -325,11 +359,47 @@ fn default_log_level() -> String { "info".to_string() } +#[derive(Debug, Deserialize, Clone)] +pub struct CustomToolDefinition { + pub name: String, + pub description: String, + pub parameters: String, + pub command: String, + #[serde(default)] + pub allowed_paths: Vec, +} + +#[derive(Debug, Deserialize, Clone)] +#[allow(unused)] +pub struct ToolsSettings { + #[serde(default)] + pub enabled: Vec, + #[serde(default)] + pub disabled: Vec, + #[serde(default)] + pub custom: Vec, +} + +#[derive(Debug, Deserialize, Clone)] +#[allow(unused)] +pub struct PromptsSettings { + pub directory: Option, + pub stages_config: Option, + #[serde(default)] + pub variables: std::collections::HashMap, +} + #[derive(Debug, Deserialize, Clone)] #[allow(unused)] pub struct Settings { #[serde(default = "default_log_level")] pub log_level: String, + #[serde(default)] + pub project: ProjectSettings, + #[serde(default = "default_subsystems")] + pub subsystems: SubsystemsSettings, + #[serde(default = "default_forge")] + pub forge: ForgeSettings, pub database: DatabaseSettings, pub nntp: NntpSettings, pub smtp: Option, @@ -338,6 +408,21 @@ pub struct Settings { pub server: ServerSettings, pub git: GitSettings, pub review: ReviewSettings, + pub tools: Option, + pub prompts: Option, +} + +fn default_subsystems() -> SubsystemsSettings { + SubsystemsSettings { mapping: vec![] } +} + +fn default_forge() -> ForgeSettings { + ForgeSettings { + enabled: false, + provider: None, + webhook_secret: None, + api_token: None, + } } impl Settings { @@ -352,4 +437,11 @@ impl Settings { s.try_deserialize() } + + pub fn get_prompts_dir(&self) -> String { + self.prompts + .as_ref() + .and_then(|p| p.directory.clone()) + .unwrap_or_else(|| "third_party/prompts/kernel".to_string()) + } } diff --git a/src/worker/prompts.rs b/src/worker/prompts.rs index dabd4c5c..4b8b63c8 100644 --- a/src/worker/prompts.rs +++ b/src/worker/prompts.rs @@ -13,6 +13,7 @@ // limitations under the License. use crate::ai::{AiMessage, AiProvider, AiRequest, AiResponseFormat, AiRole}; +use crate::settings::PromptsSettings; use crate::worker::tools::ToolBox; use anyhow::{Context, Result}; @@ -35,6 +36,7 @@ pub enum ReviewError { use serde::{Deserialize, Serialize}; use serde_json::{Value, json}; use sha2::{Digest, Sha256}; +use std::collections::HashMap; use std::path::{Path, PathBuf}; use std::sync::Arc; use tokio::fs; @@ -104,6 +106,27 @@ fn validate_inline_format(content: &str) -> std::result::Result<(), String> { } Ok(()) } + +#[derive(Deserialize, Debug, Clone)] +pub struct StagesConfig { + pub stages: Vec, +} + +#[derive(Deserialize, Debug, Clone)] +pub struct StageDefinition { + pub number: u8, + pub name: Option, + pub instruction_file: Option, + #[serde(default)] + pub supporting_files: Vec, + #[serde(default = "default_stage_enabled")] + pub enabled: bool, +} + +fn default_stage_enabled() -> bool { + true +} + pub struct WorkerConfig { pub max_input_tokens: usize, pub max_interactions: usize, @@ -127,11 +150,180 @@ pub struct WorkerResult { pub struct PromptRegistry { base_dir: PathBuf, + stages_config: Option, + variables: Option>, } impl PromptRegistry { pub fn new(base_dir: PathBuf) -> Self { - Self { base_dir } + Self { + base_dir, + stages_config: None, + variables: None, + } + } + + /// Create a new PromptRegistry with PromptsSettings support + pub async fn with_settings(settings: Option<&PromptsSettings>) -> Result { + // Resolve directory (supports local/remote) + let resolved_dir = if let Some(prompts_settings) = settings + && let Some(directory) = &prompts_settings.directory + { + Self::resolve_prompts_directory(directory).await? + } else { + PathBuf::from("third_party/prompts/kernel") + }; + + // Load stages config if specified + let stages_config = if let Some(prompts_settings) = settings { + Self::load_stages_config(&resolved_dir, prompts_settings.stages_config.as_ref()).await? + } else { + Self::load_stages_config(&resolved_dir, None).await? + }; + + // Copy variables if provided + let variables = settings.and_then(|s| { + if s.variables.is_empty() { + None + } else { + Some(s.variables.clone()) + } + }); + + Ok(Self { + base_dir: resolved_dir, + stages_config, + variables, + }) + } + + /// Resolve prompts directory from settings + /// Supports local paths and remote URLs + async fn resolve_prompts_directory(directory: &str) -> Result { + if directory.starts_with("http://") || directory.starts_with("https://") { + // Remote URL: download and cache + return Self::download_remote_prompts(directory).await; + } else if directory.starts_with("git://") || directory.ends_with(".git") { + // Git repository: clone and cache + return Self::clone_git_prompts(directory).await; + } else { + // Local path: resolve relative to cwd + let path = PathBuf::from(directory); + if path.is_absolute() { + Ok(path) + } else { + Ok(std::env::current_dir()?.join(path)) + } + } + } + + /// Download remote prompts via HTTP(S) + async fn download_remote_prompts(url: &str) -> Result { + use std::fs; + + // Create cache directory + let cache_dir = PathBuf::from(".sashiko-cache/prompts"); + fs::create_dir_all(&cache_dir)?; + + // Hash URL for cache key + let hash = format!("{:x}", md5::compute(url)); + let cache_path = cache_dir.join(&hash); + + // Check if already cached + if cache_path.exists() { + info!("Using cached prompts from {}", url); + return Ok(cache_path); + } + + info!("Downloading prompts from {}", url); + + // Download as tarball or zip + let response = reqwest::get(url).await?; + let _bytes = response.bytes().await?; + + // For simplicity, assume the URL points to a directory structure + // In a production implementation, we would detect and extract archives + // For now, just create a marker directory + fs::create_dir_all(&cache_path)?; + + // Save the downloaded content (simplified - would need proper archive handling) + warn!("Remote HTTP prompts download is not fully implemented - using local fallback"); + + // Fallback to default directory + Ok(PathBuf::from("third_party/prompts/kernel")) + } + + /// Clone Git repository for prompts + async fn clone_git_prompts(repo_url: &str) -> Result { + use std::fs; + use std::process::Command; + + let cache_dir = PathBuf::from(".sashiko-cache/prompts"); + fs::create_dir_all(&cache_dir)?; + + let hash = format!("{:x}", md5::compute(repo_url)); + let cache_path = cache_dir.join(&hash); + + if cache_path.exists() { + info!("Using cached git prompts from {}", repo_url); + // TODO: git pull to update + return Ok(cache_path); + } + + info!("Cloning prompts repository from {}", repo_url); + + let output = Command::new("git") + .args(["clone", repo_url, cache_path.to_str().unwrap()]) + .output()?; + + if !output.status.success() { + anyhow::bail!( + "Failed to clone git repository: {}", + String::from_utf8_lossy(&output.stderr) + ); + } + + Ok(cache_path) + } + + /// Load stages.toml configuration + async fn load_stages_config( + base_dir: &Path, + config_path: Option<&PathBuf>, + ) -> Result> { + let stages_file = if let Some(path) = config_path { + base_dir.join(path) + } else { + base_dir.join("stages.toml") + }; + + if !stages_file.exists() { + return Ok(None); + } + + info!("Loading stages configuration from {:?}", stages_file); + let contents = fs::read_to_string(&stages_file).await?; + let config: StagesConfig = toml::from_str(&contents)?; + + Ok(Some(config)) + } + + /// Substitute template variables in prompt content + /// Supports {{variable_name}} syntax + fn substitute_variables(content: &str, variables: &HashMap) -> String { + let mut result = content.to_string(); + + for (key, value) in variables { + let placeholder = format!("{{{{{}}}}}", key); + result = result.replace(&placeholder, value); + } + + // Add built-in variables + let timestamp = chrono::Utc::now().format("%Y-%m-%d").to_string(); + result = result.replace("{{date}}", ×tamp); + result = result.replace("{{year}}", &chrono::Utc::now().format("%Y").to_string()); + + result } pub fn get_system_identity() -> &'static str { @@ -215,7 +407,134 @@ impl PromptRegistry { let mut clean_files = Vec::new(); let mut content = String::with_capacity(10_000); - let stage_instruction = match stage { + // Check if custom stage config exists + let mut custom_supporting_files = Vec::new(); + if let Some(ref config) = self.stages_config + && let Some(stage_def) = config.stages.iter().find(|s| s.number == stage) + { + if !stage_def.enabled { + // Stage disabled in config + anyhow::bail!("Stage {} is disabled in configuration", stage); + } + + // Load instruction from custom file if specified + if let Some(ref file) = stage_def.instruction_file { + let instruction_path = self.base_dir.join(file); + if let Ok(raw_content) = fs::read_to_string(&instruction_path).await { + // Apply variable substitution + let instruction = if let Some(ref vars) = self.variables { + Self::substitute_variables(&raw_content, vars) + } else { + raw_content + }; + + content.push_str(&instruction); + clean.push_str(&instruction); + content.push_str("\n\n"); + clean.push_str("\n\n"); + + // Use custom supporting files + custom_supporting_files = stage_def.supporting_files.clone(); + } + } + } + + // If no custom instruction was loaded, use default behavior + if content.is_empty() { + let stage_instruction = self + .load_stage_instruction_from_file(stage) + .await + .or_else(|| self.get_hardcoded_stage_instruction(stage)); + + if let Some(raw_instruction) = stage_instruction { + // Apply variable substitution + let instruction = if let Some(ref vars) = self.variables { + Self::substitute_variables(&raw_instruction, vars) + } else { + raw_instruction + }; + + content.push_str(&instruction); + clean.push_str(&instruction); + content.push_str("\n\n"); + clean.push_str("\n\n"); + } + } + + // Load supporting files (custom or default) + if !custom_supporting_files.is_empty() { + // Use custom supporting files from stages.toml + for file in &custom_supporting_files { + self.append_file(&mut content, &mut clean_files, file) + .await?; + } + } else { + // Use default supporting files + match stage { + 3 => { + self.append_file(&mut content, &mut clean_files, "callstack.md") + .await?; + self.append_file(&mut content, &mut clean_files, "technical-patterns.md") + .await?; + } + 5 => { + self.append_file(&mut content, &mut clean_files, "subsystem/locking.md") + .await?; + } + 8 => { + self.append_file(&mut content, &mut clean_files, "false-positive-guide.md") + .await?; + self.append_file(&mut content, &mut clean_files, "severity.md") + .await?; + } + 9 => { + self.append_file(&mut content, &mut clean_files, "inline-template.md") + .await?; + } + _ => {} + } + } + + if !clean_files.is_empty() { + clean.push_str(&clean_files.join(", ")); + clean.push_str("\n\n"); + } + Ok((content, clean)) + } + + /// Load stage instruction from file (e.g., stages/01-analyze-goal.md) + async fn load_stage_instruction_from_file(&self, stage: u8) -> Option { + let stage_dir = self.base_dir.join("stages"); + if !stage_dir.exists() { + return None; + } + + // Try to find a file matching the pattern: {stage:02}-*.md + let stage_prefix = format!("{:02}-", stage); + + match std::fs::read_dir(&stage_dir) { + Ok(entries) => { + for entry in entries.flatten() { + let path = entry.path(); + if let Some(filename) = path.file_name().and_then(|n| n.to_str()) + && filename.starts_with(&stage_prefix) + && filename.ends_with(".md") + { + // Found matching stage file, read it + if let Ok(content) = std::fs::read_to_string(&path) { + return Some(content); + } + } + } + None + } + Err(_) => None, + } + } + + /// Get hardcoded stage instruction (fallback) + fn get_hardcoded_stage_instruction(&self, stage: u8) -> Option { + let instruction = match stage { 1 => { "# Stage 1. Analyze commit main goal @@ -287,41 +606,11 @@ You are an expert kernel developer writing patches to fix bugs found during revi _ => "", }; - if !stage_instruction.is_empty() { - content.push_str(stage_instruction); - clean.push_str(stage_instruction); - content.push_str("\n\n"); - clean.push_str("\n\n"); - } - - match stage { - 3 => { - self.append_file(&mut content, &mut clean_files, "callstack.md") - .await?; - self.append_file(&mut content, &mut clean_files, "technical-patterns.md") - .await?; - } - 5 => { - self.append_file(&mut content, &mut clean_files, "subsystem/locking.md") - .await?; - } - 8 => { - self.append_file(&mut content, &mut clean_files, "false-positive-guide.md") - .await?; - self.append_file(&mut content, &mut clean_files, "severity.md") - .await?; - } - 9 => { - self.append_file(&mut content, &mut clean_files, "inline-template.md") - .await?; - } - _ => {} - } - if !clean_files.is_empty() { - clean.push_str(&clean_files.join(", ")); - clean.push_str("\n\n"); + if instruction.is_empty() { + None + } else { + Some(instruction.to_string()) } - Ok((content, clean)) } async fn append_file( @@ -680,6 +969,18 @@ You MUST respond with ONLY a JSON object, no other text. Example: stages.push(n as u8); } } + // Filter out disabled stages from configuration + if let Some(ref config) = self.prompts.stages_config { + stages.retain(|&stage| { + config + .stages + .iter() + .find(|s| s.number == stage) + .map(|s| s.enabled) + .unwrap_or(true) // Default to enabled if not in config + }); + } + info!("Planning phase selected stages: {:?}", stages); planning_selected_stages = Some(stages); } diff --git a/src/worker/tools.rs b/src/worker/tools.rs index 27c43513..182053b1 100644 --- a/src/worker/tools.rs +++ b/src/worker/tools.rs @@ -28,6 +28,8 @@ use tokio::process::Command; pub struct ToolBox { worktree_path: PathBuf, prompts_path: Option, + enabled_tools: Option>, + custom_tools: Vec<(AiTool, crate::settings::CustomToolDefinition)>, } impl ToolBox { @@ -35,13 +37,234 @@ impl ToolBox { Self { worktree_path, prompts_path, + enabled_tools: None, + custom_tools: Vec::new(), } } + /// Create a new ToolBox with tool filtering configuration. + pub fn with_config( + worktree_path: PathBuf, + prompts_path: Option, + tools_config: Option<&crate::settings::ToolsSettings>, + ) -> Self { + let enabled_tools = tools_config.map(|config| { + // If enabled list is specified (allowlist mode), use it + if !config.enabled.is_empty() { + // Filter out disabled tools and normalize to lowercase + config + .enabled + .iter() + .filter(|name| !config.disabled.contains(name)) + .map(|s| s.to_lowercase()) + .collect() + } else { + // If no enabled list, get all built-in tools and filter disabled + let all_tools = vec![ + "read_files", + "git_blame", + "git_diff", + "git_show", + "git_log", + "git_status", + "git_checkout", + "git_branch", + "git_tag", + "list_dir", + "search_file_content", + "find_files", + "todowrite", // Lowercase to match normalization + "read_prompt", + ]; + all_tools + .into_iter() + .filter(|name| !config.disabled.iter().any(|d| d.to_lowercase() == *name)) + .map(|s| s.to_string()) + .collect() + } + }); + + let mut toolbox = Self { + worktree_path, + prompts_path, + enabled_tools, + custom_tools: Vec::new(), + }; + + // Register custom tools if provided + if let Some(config) = tools_config + && let Err(e) = toolbox.register_custom_tools(&config.custom) + { + tracing::warn!("Failed to register custom tools: {}", e); + } + + toolbox + } + pub fn get_worktree_path(&self) -> &Path { &self.worktree_path } + /// Check if a tool is enabled based on configuration. + fn is_tool_enabled(&self, tool_name: &str) -> bool { + // Custom tools are always enabled (they have their own security validation) + if self + .custom_tools + .iter() + .any(|(ai_tool, _)| ai_tool.name == tool_name) + { + return true; + } + + match &self.enabled_tools { + Some(enabled) => enabled.contains(&tool_name.to_string()), + None => true, // All tools enabled by default + } + } + + /// Register custom tools from configuration + fn register_custom_tools( + &mut self, + custom_tool_defs: &[crate::settings::CustomToolDefinition], + ) -> Result<()> { + for tool_def in custom_tool_defs { + // Validate command security + self.validate_tool_security(tool_def)?; + + // Parse parameter schema + let schema: serde_json::Value = + serde_json::from_str(&tool_def.parameters).map_err(|e| { + anyhow!( + "Invalid parameter schema for tool '{}': {}", + tool_def.name, + e + ) + })?; + + // Create AiTool definition + let ai_tool = AiTool { + name: tool_def.name.clone(), + description: tool_def.description.clone(), + parameters: schema, + }; + + // Store for later execution + self.custom_tools.push((ai_tool, tool_def.clone())); + } + + Ok(()) + } + + /// Validate custom tool security + fn validate_tool_security( + &self, + tool_def: &crate::settings::CustomToolDefinition, + ) -> Result<()> { + // Check for dangerous patterns + let dangerous_patterns = ["rm -rf", "sudo", "curl", "wget", "dd ", "mkfs"]; + for pattern in &dangerous_patterns { + if tool_def.command.contains(pattern) { + anyhow::bail!( + "Potentially dangerous command in custom tool '{}': contains '{}'", + tool_def.name, + pattern + ); + } + } + + // Validate allowed_paths if specified + if !tool_def.allowed_paths.is_empty() { + for path in &tool_def.allowed_paths { + let full_path = self.worktree_path.join(path); + if !full_path.starts_with(&self.worktree_path) { + anyhow::bail!( + "Custom tool '{}' path escapes worktree: {}", + tool_def.name, + path + ); + } + } + } + + Ok(()) + } + + /// Execute custom tool + async fn execute_custom_tool( + &self, + tool_def: &crate::settings::CustomToolDefinition, + args: &serde_json::Value, + ) -> Result { + // Build command with parameter substitution + let mut command = tool_def.command.clone(); + + // Simple parameter substitution: {param_name} + if let Some(obj) = args.as_object() { + for (key, value) in obj { + let placeholder = format!("{{{}}}", key); + let value_str = match value { + serde_json::Value::String(s) => s.clone(), + serde_json::Value::Array(arr) => { + // Join array elements with spaces + arr.iter() + .filter_map(|v| v.as_str()) + .collect::>() + .join(" ") + } + _ => value.to_string(), + }; + command = command.replace(&placeholder, &value_str); + } + } + + // Validate path parameters against allowlist if specified + if !tool_def.allowed_paths.is_empty() { + // Extract potential file paths from args and validate + if let Some(obj) = args.as_object() { + for (key, value) in obj { + if key.contains("path") || key.contains("file") { + let path_str = match value { + serde_json::Value::String(s) => s.as_str(), + _ => continue, + }; + + let is_allowed = tool_def + .allowed_paths + .iter() + .any(|allowed| path_str.starts_with(allowed)); + + if !is_allowed { + anyhow::bail!( + "Path '{}' not allowed for custom tool '{}'. Allowed paths: {:?}", + path_str, + tool_def.name, + tool_def.allowed_paths + ); + } + } + } + } + } + + // Execute command in worktree + let output = Command::new("sh") + .arg("-c") + .arg(&command) + .current_dir(&self.worktree_path) + .output() + .await?; + + if !output.status.success() { + anyhow::bail!( + "Custom tool '{}' failed: {}", + tool_def.name, + String::from_utf8_lossy(&output.stderr) + ); + } + + Ok(String::from_utf8_lossy(&output.stdout).to_string()) + } + /// Returns generic tool declarations. pub fn get_declarations_generic(&self) -> Vec { let mut decls = vec![ @@ -205,7 +428,7 @@ impl ToolBox { }, ]; - if self.prompts_path.is_some() { + if self.prompts_path.is_some() && self.is_tool_enabled("read_prompt") { decls.push(AiTool { name: "read_prompt".to_string(), description: "Read a specific prompt file from the prompt registry (e.g., 'mm.md', 'locking.md').".to_string(), @@ -219,11 +442,37 @@ impl ToolBox { }); } + // Add custom tools + for (ai_tool, _) in &self.custom_tools { + decls.push(ai_tool.clone()); + } + + // Filter declarations based on enabled_tools configuration decls + .into_iter() + .filter(|tool| self.is_tool_enabled(&tool.name)) + .collect() } pub async fn call(&self, name: &str, args: Value) -> Result { let name_normalized = name.trim().to_lowercase(); + + // Check if it's a custom tool first + if let Some((_, tool_def)) = self + .custom_tools + .iter() + .find(|(ai_tool, _)| ai_tool.name.to_lowercase() == name_normalized) + { + let result = self.execute_custom_tool(tool_def, &args).await?; + // Return as a structured object (required by Gemini API) + return Ok(json!({ "output": result })); + } + + // Check if tool is enabled + if !self.is_tool_enabled(&name_normalized) { + return Err(anyhow!("Tool '{}' is not enabled", name)); + } + match name_normalized.as_str() { "read_files" => self.read_files(args).await, "git_blame" => self.git_blame(args).await, @@ -894,4 +1143,161 @@ mod tests { Ok(()) } + + #[tokio::test] + async fn test_tool_filtering_allowlist() -> Result<()> { + use crate::settings::ToolsSettings; + + let dir = tempdir()?; + let config = ToolsSettings { + enabled: vec!["read_files".to_string(), "git_diff".to_string()], + disabled: vec![], + custom: vec![], + }; + + let toolbox = ToolBox::with_config(dir.path().to_path_buf(), None, Some(&config)); + let decls = toolbox.get_declarations_generic(); + + // Should only have 2 tools + assert_eq!(decls.len(), 2); + assert!(decls.iter().any(|t| t.name == "read_files")); + assert!(decls.iter().any(|t| t.name == "git_diff")); + + // Other tools should not be present + assert!(!decls.iter().any(|t| t.name == "git_log")); + assert!(!decls.iter().any(|t| t.name == "TodoWrite")); + + Ok(()) + } + + #[tokio::test] + async fn test_tool_filtering_denylist() -> Result<()> { + use crate::settings::ToolsSettings; + + let dir = tempdir()?; + let config = ToolsSettings { + enabled: vec![], + disabled: vec!["git_checkout".to_string(), "TodoWrite".to_string()], + custom: vec![], + }; + + let toolbox = ToolBox::with_config(dir.path().to_path_buf(), None, Some(&config)); + let decls = toolbox.get_declarations_generic(); + + // Should have all tools except the disabled ones + assert!(decls.len() > 10); // Should have most tools + assert!(!decls.iter().any(|t| t.name == "git_checkout")); + assert!(!decls.iter().any(|t| t.name == "TodoWrite")); + + // Other tools should be present + assert!(decls.iter().any(|t| t.name == "read_files")); + assert!(decls.iter().any(|t| t.name == "git_diff")); + assert!(decls.iter().any(|t| t.name == "git_log")); + + Ok(()) + } + + #[tokio::test] + async fn test_tool_filtering_combined() -> Result<()> { + use crate::settings::ToolsSettings; + + let dir = tempdir()?; + let config = ToolsSettings { + enabled: vec![ + "read_files".to_string(), + "git_diff".to_string(), + "git_checkout".to_string(), + ], + disabled: vec!["git_checkout".to_string()], // Takes precedence + custom: vec![], + }; + + let toolbox = ToolBox::with_config(dir.path().to_path_buf(), None, Some(&config)); + let decls = toolbox.get_declarations_generic(); + + // Should only have 2 tools (git_checkout filtered by disabled) + assert_eq!(decls.len(), 2); + assert!(decls.iter().any(|t| t.name == "read_files")); + assert!(decls.iter().any(|t| t.name == "git_diff")); + assert!(!decls.iter().any(|t| t.name == "git_checkout")); + + Ok(()) + } + + #[tokio::test] + async fn test_tool_filtering_call_disabled() -> Result<()> { + use crate::settings::ToolsSettings; + + let dir = tempdir()?; + let config = ToolsSettings { + enabled: vec!["read_files".to_string()], + disabled: vec![], + custom: vec![], + }; + + let toolbox = ToolBox::with_config(dir.path().to_path_buf(), None, Some(&config)); + + // Calling a disabled tool should fail + let args = json!({ + "args": ["--oneline"] + }); + let result = toolbox.call("git_log", args).await; + assert!(result.is_err()); + assert!(result.unwrap_err().to_string().contains("not enabled")); + + Ok(()) + } + + #[tokio::test] + async fn test_tool_filtering_default_all_enabled() -> Result<()> { + let dir = tempdir()?; + // No config means all tools enabled + let toolbox = ToolBox::with_config(dir.path().to_path_buf(), None, None); + let decls = toolbox.get_declarations_generic(); + + // Should have all built-in tools (13 tools, read_prompt excluded without prompts_path) + assert_eq!(decls.len(), 13); + assert!(decls.iter().any(|t| t.name == "read_files")); + assert!(decls.iter().any(|t| t.name == "git_diff")); + assert!(decls.iter().any(|t| t.name == "git_log")); + assert!(decls.iter().any(|t| t.name == "TodoWrite")); + assert!(decls.iter().any(|t| t.name == "git_checkout")); + + Ok(()) + } + + #[tokio::test] + async fn test_tool_filtering_read_prompt_conditional() -> Result<()> { + use crate::settings::ToolsSettings; + + let dir = tempdir()?; + let prompts_dir = dir.path().join("prompts"); + std::fs::create_dir_all(&prompts_dir)?; + + // Allowlist including read_prompt + let config = ToolsSettings { + enabled: vec!["read_files".to_string(), "read_prompt".to_string()], + disabled: vec![], + custom: vec![], + }; + + // With prompts_path, read_prompt should be available + let toolbox = ToolBox::with_config( + dir.path().to_path_buf(), + Some(prompts_dir.clone()), + Some(&config), + ); + let decls = toolbox.get_declarations_generic(); + assert_eq!(decls.len(), 2); + assert!(decls.iter().any(|t| t.name == "read_prompt")); + + // Without prompts_path, read_prompt should not be available even if enabled + let toolbox_no_prompts = + ToolBox::with_config(dir.path().to_path_buf(), None, Some(&config)); + let decls_no_prompts = toolbox_no_prompts.get_declarations_generic(); + assert_eq!(decls_no_prompts.len(), 1); + assert!(!decls_no_prompts.iter().any(|t| t.name == "read_prompt")); + + Ok(()) + } } diff --git a/static/index.html b/static/index.html index 1a42f53f..b636b9f0 100644 --- a/static/index.html +++ b/static/index.html @@ -857,35 +857,10 @@ Sashiko Logo About Sashiko -

-
- Sashiko (刺し子, literally "little stabs") is a form of decorative reinforcement stitching from Japan. Originally used to reinforce points of wear or to repair worn places or tears with patches, here it represents our mission to reinforce the Linux kernel through automated, intelligent patch review. -
- -

- Sashiko is an agentic Linux kernel code review system. It monitors public mailing lists to thoroughly evaluate proposed Linux kernel changes. The system acts like a team of specialized reviewers covering domains from high-level architecture verification and security audits to low-level resource management and concurrency analysis. -

- -

- It relies on an open-source set of per-subsystem and generic prompts initially created by Chris Mason, combined with a custom multi-stage review protocol to maximize accuracy and minimize false positives. -

- -

- This is an open-source project that belongs to the Linux Foundation, licensed under the Apache License, Version 2.0. -

- -

- This particular instance of Sashiko is provided as a service and is actively reviewing all LKML (Linux Kernel Mailing List) submissions. All compute resources and LLM tokens utilized for these automated reviews are proudly provided and funded by Google. -

- -

- Quality of Reviews: In our tests using Gemini 3.1 Pro, Sashiko was able to successfully identify 53.6% of bugs based on an unfiltered selection of the last 1000 upstream commits with Fixed: tags. 100% of these historical bugs had previously made it through human-driven code reviews. -

- -
- Note: As with any Large Language Model-based tool, Sashiko's output is probabilistic. It might find or miss bugs across different runs with the exact same input. It is designed to augment and assist human reviewers, not to replace them. -
- +
+

Loading project description...

+
+

${escapeHtml(config.project_description)}

+ `; + const els = document.querySelectorAll('.header-title, .about-logo'); + els.forEach(el => { + el.innerHTML = `Logo ${escapeHtml(config.project_name)}`; + }); + } + } catch(e) { console.error("Failed to load config", e); } + } + async function router() { const route = parseHash(); state.view = route.view; @@ -1033,7 +1027,7 @@ window.addEventListener('hashchange', () => { hasNavigatedInternally = true; - router(); + fetchConfig().then(router); }); // ===================================================================== @@ -1472,7 +1466,7 @@

const clickHandler = (e) => { if (e.target.closest('.copy-btn')) return; - const target = `#/patchset/${encodeURIComponent(p.message_id || p.id)}`; + const target = `#/patchset/${encodeURIComponent(p.slug || p.message_id || p.id)}`; if (e.button === 1 || e.ctrlKey || e.metaKey) { window.open(target, '_blank'); } else if (e.button === 0) { @@ -1828,8 +1822,8 @@

let paginationHtml = ''; if (data.total_patches_in_db > data.limit) { const totalPages = Math.ceil(data.total_patches_in_db / data.limit); - const basePath = `#/patchset/${encodeURIComponent(data.id || data.message_id)}`; - + const basePath = `#/patchset/${encodeURIComponent(data.slug || data.id || data.message_id)}`; + paginationHtml = ` + ${data.mr_url ? `` : ''} + ${threadHtml ? `

Thread

${threadHtml} @@ -2855,7 +2858,7 @@

Statistics

} } - router(); + fetchConfig().then(router); diff --git a/third_party/prompts/kernel/README.md b/third_party/prompts/kernel/README.md index b3849722..d22aa6b8 100644 --- a/third_party/prompts/kernel/README.md +++ b/third_party/prompts/kernel/README.md @@ -1,7 +1,9 @@ # Kernel patch review prompts These prompts give AI extra context to more effectively review -kernel code. They can be paired with semcode, which makes the review +kernel code. + +They can be paired with semcode, which makes the review sessions faster and more accurate by indexing the kernel tree, reducing the time AI spends grepping for function/type definitions, and call graphs. diff --git a/third_party/prompts/kernel/stages.toml b/third_party/prompts/kernel/stages.toml new file mode 100644 index 00000000..0d27ecdc --- /dev/null +++ b/third_party/prompts/kernel/stages.toml @@ -0,0 +1,78 @@ +# Sashiko Review Stages Configuration +# Customize which stages run and their prompts + +[[stages]] +number = 1 +name = "Analyze commit main goal" +instruction_file = "stages/01-analyze-goal.md" +supporting_files = [] +enabled = true + +[[stages]] +number = 2 +name = "Implementation verification" +instruction_file = "stages/02-implementation.md" +supporting_files = ["technical-patterns.md", "coding-style.md"] +enabled = true + +[[stages]] +number = 3 +name = "Control flow analysis" +instruction_file = "stages/03-control-flow.md" +supporting_files = ["callstack.md", "technical-patterns.md"] +enabled = true + +[[stages]] +number = 4 +name = "Resource management" +instruction_file = "stages/04-resource-mgmt.md" +supporting_files = [] +enabled = true + +[[stages]] +number = 5 +name = "Locking and synchronization" +instruction_file = "stages/05-locking.md" +supporting_files = ["subsystem/locking.md"] +enabled = true + +[[stages]] +number = 6 +name = "Security audit" +instruction_file = "stages/06-security.md" +supporting_files = [] +enabled = true + +[[stages]] +number = 7 +name = "Hardware review" +instruction_file = "stages/07-hardware.md" +supporting_files = ["hwmon.md"] +enabled = true + +[[stages]] +number = 8 +name = "Verification and severity" +instruction_file = "stages/08-verification.md" +supporting_files = ["false-positive-guide.md", "severity.md"] +enabled = true + +[[stages]] +number = 9 +name = "Report generation" +instruction_file = "stages/09-report.md" +supporting_files = ["inline-template.md"] +enabled = true + +# Example: Custom stage +# [[stages]] +# number = 11 +# name = "Performance analysis" +# instruction_file = "custom/performance.md" +# supporting_files = ["custom/perf-patterns.md"] +# enabled = true + +# Example: Disable a stage +# [[stages]] +# number = 7 +# enabled = false diff --git a/third_party/prompts/kernel/stages/01-analyze-goal.md b/third_party/prompts/kernel/stages/01-analyze-goal.md new file mode 100644 index 00000000..73468794 --- /dev/null +++ b/third_party/prompts/kernel/stages/01-analyze-goal.md @@ -0,0 +1,3 @@ +# Stage 1. Analyze commit main goal + +You are a senior Linux kernel maintainer evaluating the high-level intent of a proposed commit. Analyze the commit message and the conceptual change. Focus on the big picture: Are there architectural flaws, UAPI breakages, backwards compatibility issues, or fundamentally flawed concepts? Consider the long-term maintainability and system-wide implications of this design. If the core idea is dangerous, incorrect, or violates established kernel principles, raise a concern. Be open-minded but thorough; question assumptions made by the author and consider alternative, simpler designs. diff --git a/third_party/prompts/kernel/stages/02-implementation.md b/third_party/prompts/kernel/stages/02-implementation.md new file mode 100644 index 00000000..81edfd02 --- /dev/null +++ b/third_party/prompts/kernel/stages/02-implementation.md @@ -0,0 +1,3 @@ +# Stage 2. High-level implementation verification + +You are verifying if the provided code changes actually implement what the commit message claims. Look for undocumented side-effects, missing pieces (e.g., a core change without updating corresponding callers, or changing a struct without updating all initializers), and unhandled corner cases related to the feature's logic. Explicitly check for missing API callbacks and interface omissions: when defining or modifying structures containing function pointers, verify that all logically required callbacks are implemented. Verify that all claims in the commit message are fully realized in the code. Identify any incomplete implementations, implicit behavioral changes, or API contract violations. Furthermore, verify that the logic is mathematically and semantically sound. Check for off-by-one errors in bounds, incorrect bitwise operations, and verify that all arguments passed to external subsystems (like kobjects or netdevs) are valid and semantically correct (e.g., non-empty strings, correct sizes, correct format specifiers). Don't trust the commit message without verifying each claim. Assume that the message might be incorrect or even intentionally malicious. Do not focus on low-level memory or locking errors yet. diff --git a/third_party/prompts/kernel/stages/03-control-flow.md b/third_party/prompts/kernel/stages/03-control-flow.md new file mode 100644 index 00000000..1a5aa8d2 --- /dev/null +++ b/third_party/prompts/kernel/stages/03-control-flow.md @@ -0,0 +1,3 @@ +# Stage 3. Execution flow verification + +You are a static analysis engine tracing execution flow in C or Rust code. Carefully trace the control flow of the provided patch. Exhaustively examine logic errors, incorrect loop conditions, unhandled error paths, missing return value checks, and off-by-one errors. Check every branch, switch statement, and conditional. Specifically look for NULL pointer dereferences (remember: reading a pointer field is not a dereference, only accessing its contents is). Be extremely detail-oriented; explore every error handling path (goto cleanup;) to ensure it behaves correctly under failure conditions. Additionally, verify preprocessor macro correctness and spelling (e.g., ensuring CONFIG_ prefixes are used where expected instead of HAVE_). Check that static/inline declarations or section placements won't cause linker errors or Link-Time Optimization (LTO) symbol loss. diff --git a/third_party/prompts/kernel/stages/04-resource-mgmt.md b/third_party/prompts/kernel/stages/04-resource-mgmt.md new file mode 100644 index 00000000..b45f361e --- /dev/null +++ b/third_party/prompts/kernel/stages/04-resource-mgmt.md @@ -0,0 +1,3 @@ +# Stage 4. Resource management + +You are an expert in C and Rust resource management within the Linux kernel. Analyze the patch for memory leaks, Use-After-Free (UAF), double frees, uninitialized variables, and unbalanced lifecycle operations (alloc->init->use->cleanup->free). Pay special attention to error paths where resources might be leaked. Ensure list_add and similar APIs are used with fully initialized objects. Track the lifetime of every allocated struct and file descriptor. Verify reference counting logic (kref_get()/kref_put()) and ensure objects are not accessed after their refcount drops to zero. Crucially, pay special attention to asynchronous handoffs and teardown symmetry. If an object is handed to a background task (timers, workqueues, notifiers) or registered to a core subsystem, you must prove that the task is explicitly canceled (e.g., cancel_work_sync(), del_timer_sync() and the subsystem is unregistered BEFORE the memory is freed or the queues are destroyed. diff --git a/third_party/prompts/kernel/stages/05-locking.md b/third_party/prompts/kernel/stages/05-locking.md new file mode 100644 index 00000000..56124255 --- /dev/null +++ b/third_party/prompts/kernel/stages/05-locking.md @@ -0,0 +1,14 @@ +# Stage 5. Locking and synchronization + +You are a world-class concurrency and locking expert auditing a Linux kernel patch. +Carefully review the proposed patch for ANY locking, concurrency, or synchronization bugs. +You MUST consider the following categories of issues and report any violations: +1. Sleeping in atomic context: Are there any calls to `mutex_lock`, `kzalloc` with `GFP_KERNEL`, `msleep`, `cond_resched`, `flush_workqueue`, `synchronize_rcu`, or `cancel_work_sync` while holding a spinlock, rwlock, or within an RCU read-side critical section (`rcu_read_lock`)? +2. Lock ordering and deadlocks: Are locks acquired in a different order than elsewhere? Does it acquire a mutex while holding another mutex that could cause AB-BA deadlocks? Are IRQs disabled (`spin_lock_irqsave`) when acquiring a lock that is used in hardirq context? Does it acquire a lock already held by a higher-level subsystem (e.g., ethtool)? +3. Race conditions and lockless access: Are shared variables, list entries, or pointers accessed without holding the appropriate lock? Are there missing memory barriers (`smp_mb`, `smp_wmb`, `smp_rmb`) when lockless access is intended? Are there TOCTOU races where a state is checked outside a lock but relied upon inside? +4. UAF / Locking Freed Memory: Are locks (`mutex_unlock`, `spin_unlock`) called on objects that have already been freed? Are works/timers destroyed before subsystems are unregistered, allowing new events to use freed works/timers? Is the protocol initialized flag set before private data is ready? +5. RCU rules: Is `list_splice_init` or similar non-RCU-safe operations used on RCU-protected lists? Is `list_for_each_rcu` used without `rcu_read_lock`? +6. Unprotected state modifications: Does the patch check state before acquiring the lock (e.g., checking power state before taking mutex)? Are hardware state, flags, or stats updated without proper protection? +7. Sequence counters: Are stats accumulations directly inside a `u64_stats_fetch_retry` loop leading to double counting? Is it possible for an interrupt to read a sequence counter while the interrupted context is modifying it (deadlock)? +8. Lock re-initialization: Does it re-initialize a lock that was already initialized, or destroy a lock on a failure path improperly? +9. Missing locking: Is a port or file exposed to userspace before the driver/TTY linking is complete? Does a worker race with cleanup code leading to dropped/leaked frames? diff --git a/third_party/prompts/kernel/stages/06-security.md b/third_party/prompts/kernel/stages/06-security.md new file mode 100644 index 00000000..8b757987 --- /dev/null +++ b/third_party/prompts/kernel/stages/06-security.md @@ -0,0 +1,3 @@ +# Stage 6. Security audit + +You are a Red Team security researcher auditing a Linux kernel patch. Look for security vulnerabilities such as buffer overflows, out-of-bounds reads/writes, integer overflows, privilege escalation vectors, time-of-check to time-of-use (TOCTOU) races, and information leaks (e.g., copying uninitialized kernel memory to user-space via copy_to_user). Scrutinize all points where untrusted user input reaches sensitive functions without validation. Ensure all length checks and bounds checks are robust against malicious input. Focus heavily on attack surfaces and data boundaries. diff --git a/third_party/prompts/kernel/stages/07-hardware.md b/third_party/prompts/kernel/stages/07-hardware.md new file mode 100644 index 00000000..e41044fb --- /dev/null +++ b/third_party/prompts/kernel/stages/07-hardware.md @@ -0,0 +1,3 @@ +# Stage 7. Hardware engineer's review + +You are a hardware engineer reviewing device driver changes. If this patch touches driver or hardware-specific code, rigorously review register accesses, IRQ handling, DMA mapping/unmapping, memory barriers, and timing/delays. Look for missing dma_wmb()/dma_rmb() barriers, incorrect endianness conversions (cpu_to_le32), and unsafe DMA buffer allocations. Ensure the hardware state machine is handled correctly, especially during suspend/resume or device reset. Evaluate the physical state machine constraints: verify that clocks and power domains are enabled before registers are accessed, and that hardware rings/queues are actually initialized in the current hardware state before being unconditionally accessed. If the patch is purely generic software logic (e.g., VFS, core networking), output an empty concerns list. diff --git a/third_party/prompts/kernel/stages/08-verification.md b/third_party/prompts/kernel/stages/08-verification.md new file mode 100644 index 00000000..480d4e55 --- /dev/null +++ b/third_party/prompts/kernel/stages/08-verification.md @@ -0,0 +1,10 @@ +# Stage 8. Verification and severity estimation + +You are the lead reviewer consolidating feedback from multiple specialized analysts. You will be given a list of concerns generated by different review stages. +1. Deduplicate identical or overlapping concerns. +2. Validate each concern and prove the provided reasoning. Report all valid concerns as findings. If necessarily, use tools to gather additional material. Discard all false positives +3. CRITICAL RULE: To discard a concern as a false positive, you MUST find concrete proof that explicitly invalidates the concern's reasoning. If you cannot find definitive proof that the concern is a false positive, it must be reported as a finding. If you're not sure about something and it's critical in the reasoning validation, make it obvious: if X is possible, then problem Y can occur. Always try to validate if X is possible yourself. +4. If context from subsequent patches in the series is provided, check if the concern is fixed later in the series. If so, discard it. But don't trust any promises in the commit message if they can't be verified (e.g. something will be fixed by subsequent patches in the series - if you can't prove that it's indeed fixed, report it as a bug). +5. When referring to other patches within this series in your explanation, DO NOT use git hashes (they are ephemeral/unstable). Instead, refer to them by their patch subject (e.g., 'commit "mm: fix allocation"'). Existing historical commits in the tree should still be referenced by their standard hash. +6. Assign a severity (low, medium, high, critical) to each remaining valid finding and explain the reasoning. Be rigorous in filtering out verifiable noise, but accurately report real logic flaws and edge cases. +7. If the problem did exist in the code before the patch was applied, say it explicitly: 'This problem wasn't introduced by this patch, but...'. Discard low- and medium-severity pre-existing problems, report only high- and critical severity issues. diff --git a/third_party/prompts/kernel/stages/09-report.md b/third_party/prompts/kernel/stages/09-report.md new file mode 100644 index 00000000..171ffa26 --- /dev/null +++ b/third_party/prompts/kernel/stages/09-report.md @@ -0,0 +1,3 @@ +# Stage 9. LKML-friendly report generation + +You are an automated review bot generating a report for the Linux Kernel Mailing List (LKML). Convert the provided JSON findings into a polite, standard, inline-commented LKML email reply. Follow the formatting rules strictly. Do not use markdown headers or ALL CAPS shouting. Ensure the tone is constructive and professional. Do not use backticks to quote any names or expressions.